Diffstat (limited to 'tensorflow/core')
-rw-r--r--  tensorflow/core/BUILD | 5
-rw-r--r--  tensorflow/core/api_def/base_api/api_def_MatchingFiles.pbtxt | 1
-rw-r--r--  tensorflow/core/api_def/base_api/api_def_Roll.pbtxt | 52
-rw-r--r--  tensorflow/core/api_def/base_api/api_def_UnravelIndex.pbtxt | 32
-rw-r--r--  tensorflow/core/common_runtime/gpu/gpu_device.cc | 3
-rw-r--r--  tensorflow/core/distributed_runtime/BUILD | 1
-rw-r--r--  tensorflow/core/distributed_runtime/master_session.cc | 2
-rw-r--r--  tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc | 18
-rw-r--r--  tensorflow/core/distributed_runtime/rpc/grpc_worker_service.h | 3
-rw-r--r--  tensorflow/core/distributed_runtime/session_mgr.cc | 78
-rw-r--r--  tensorflow/core/distributed_runtime/session_mgr.h | 9
-rw-r--r--  tensorflow/core/framework/register_types.h | 2
-rw-r--r--  tensorflow/core/framework/variant_op_registry.cc | 24
-rw-r--r--  tensorflow/core/framework/variant_op_registry.h | 41
-rw-r--r--  tensorflow/core/graph/mkl_layout_pass.cc | 17
-rw-r--r--  tensorflow/core/graph/mkl_layout_pass_test.cc | 6
-rw-r--r--  tensorflow/core/graph/testlib.cc | 10
-rw-r--r--  tensorflow/core/graph/testlib.h | 4
-rw-r--r--  tensorflow/core/kernels/BUILD | 46
-rw-r--r--  tensorflow/core/kernels/compare_and_bitpack_op.cc | 15
-rw-r--r--  tensorflow/core/kernels/decode_bmp_op.cc | 19
-rw-r--r--  tensorflow/core/kernels/fractional_pool_common.h | 2
-rw-r--r--  tensorflow/core/kernels/mkl_aggregate_ops.cc | 13
-rw-r--r--  tensorflow/core/kernels/mkl_avgpooling_op.cc | 31
-rw-r--r--  tensorflow/core/kernels/mkl_concat_op.cc | 6
-rw-r--r--  tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc | 6
-rw-r--r--  tensorflow/core/kernels/mkl_conv_grad_input_ops.cc | 6
-rw-r--r--  tensorflow/core/kernels/mkl_conv_ops.cc | 7
-rw-r--r--  tensorflow/core/kernels/mkl_conv_ops.h | 6
-rw-r--r--  tensorflow/core/kernels/mkl_cwise_ops_common.cc | 2
-rw-r--r--  tensorflow/core/kernels/mkl_fused_batch_norm_op.cc | 6
-rw-r--r--  tensorflow/core/kernels/mkl_identity_op.cc | 4
-rw-r--r--  tensorflow/core/kernels/mkl_input_conversion_op.cc | 62
-rw-r--r--  tensorflow/core/kernels/mkl_lrn_op.cc | 6
-rw-r--r--  tensorflow/core/kernels/mkl_maxpooling_op.cc | 10
-rw-r--r--  tensorflow/core/kernels/mkl_pooling_ops_common.cc | 6
-rw-r--r--  tensorflow/core/kernels/mkl_pooling_ops_common.h | 8
-rw-r--r--  tensorflow/core/kernels/mkl_relu_op.cc | 8
-rw-r--r--  tensorflow/core/kernels/mkl_reshape_op.cc | 6
-rw-r--r--  tensorflow/core/kernels/mkl_softmax_op.cc | 4
-rw-r--r--  tensorflow/core/kernels/mkl_tfconv_op.h | 4
-rw-r--r--  tensorflow/core/kernels/roll_op.cc | 334
-rw-r--r--  tensorflow/core/kernels/roll_op_test.cc | 484
-rw-r--r--  tensorflow/core/kernels/unravel_index_op.cc | 122
-rw-r--r--  tensorflow/core/lib/io/random_inputstream.cc | 37
-rw-r--r--  tensorflow/core/lib/io/random_inputstream.h | 2
-rw-r--r--  tensorflow/core/ops/array_ops.cc | 7
-rw-r--r--  tensorflow/core/ops/image_ops.cc | 24
-rw-r--r--  tensorflow/core/ops/manip_ops.cc | 33
-rw-r--r--  tensorflow/core/ops/nn_ops.cc | 8
-rw-r--r--  tensorflow/core/platform/cpu_feature_guard.cc | 9
-rw-r--r--  tensorflow/core/platform/profile_utils/cpu_utils.h | 4
-rw-r--r--  tensorflow/core/platform/s3/s3_file_system.cc | 122
-rw-r--r--  tensorflow/core/platform/s3/s3_file_system.h | 22
-rw-r--r--  tensorflow/core/platform/s3/s3_file_system_test.cc | 2
-rw-r--r--  tensorflow/core/platform/windows/cpu_info.h | 2
-rw-r--r--  tensorflow/core/profiler/README.md | 5
-rw-r--r--  tensorflow/core/profiler/internal/tfprof_stats.h | 4
-rw-r--r--  tensorflow/core/profiler/profiler.cc | 8
-rw-r--r--  tensorflow/core/public/version.h | 2
-rw-r--r--  tensorflow/core/util/mkl_util.h | 32
-rw-r--r--  tensorflow/core/util/mkl_util_test.cc | 4
62 files changed, 1668 insertions, 190 deletions
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index c25aac3acf..7fa0b79766 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -454,6 +454,7 @@ tf_cuda_library(
"framework/reader_interface.h",
"framework/reader_op_kernel.h",
"framework/register_types.h",
+ "framework/register_types_traits.h",
"framework/resource_mgr.h",
"framework/resource_op_kernel.h",
"framework/selective_registration.h",
@@ -611,6 +612,7 @@ tf_gen_op_libs(
"list_ops",
"lookup_ops",
"logging_ops",
+ "manip_ops",
"math_ops",
"nn_ops",
"no_op",
@@ -693,6 +695,7 @@ cc_library(
":list_ops_op_lib",
":logging_ops_op_lib",
":lookup_ops_op_lib",
+ ":manip_ops_op_lib",
":math_ops_op_lib",
":nn_ops_op_lib",
":no_op_op_lib",
@@ -831,6 +834,7 @@ cc_library(
"//tensorflow/core/kernels:list_kernels",
"//tensorflow/core/kernels:lookup",
"//tensorflow/core/kernels:logging",
+ "//tensorflow/core/kernels:manip",
"//tensorflow/core/kernels:math",
"//tensorflow/core/kernels:multinomial_op",
"//tensorflow/core/kernels:nn",
@@ -1153,6 +1157,7 @@ cc_library(
deps = [
":protos_all_cc_impl",
"//third_party/eigen3",
+ "@nsync//:nsync_cpp",
"@protobuf_archive//:protobuf",
],
alwayslink = 1,
diff --git a/tensorflow/core/api_def/base_api/api_def_MatchingFiles.pbtxt b/tensorflow/core/api_def/base_api/api_def_MatchingFiles.pbtxt
index 8da76684e5..97fd39f647 100644
--- a/tensorflow/core/api_def/base_api/api_def_MatchingFiles.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_MatchingFiles.pbtxt
@@ -16,5 +16,6 @@ END
description: <<END
Note that this routine only supports wildcard characters in the
basename portion of the pattern, not in the directory portion.
+Note also that the order of filenames returned can be non-deterministic.
END
}
diff --git a/tensorflow/core/api_def/base_api/api_def_Roll.pbtxt b/tensorflow/core/api_def/base_api/api_def_Roll.pbtxt
new file mode 100644
index 0000000000..b308ad1f9d
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Roll.pbtxt
@@ -0,0 +1,52 @@
+op {
+ graph_op_name: "Roll"
+ in_arg {
+ name: "shift"
+ description: <<END
+Dimension must be 0-D or 1-D. `shift[i]` specifies the number of places by which
+elements are shifted positively (towards larger indices) along the dimension
+specified by `axis[i]`. Negative shifts will roll the elements in the opposite
+direction.
+END
+ }
+ in_arg {
+ name: "axis"
+ description: <<END
+Dimension must be 0-D or 1-D. `axis[i]` specifies the dimension along which the
+shift `shift[i]` should occur. If the same axis is referenced more than once, the
+total shift for that axis will be the sum of all the shifts that belong to that
+axis.
+END
+ }
+ out_arg {
+ name: "output"
+ description: <<END
+Has the same shape and size as the input. The elements are shifted
+positively (towards larger indices) by the offsets of `shift` along the
+dimensions of `axis`.
+END
+ }
+ summary: "Rolls the elements of a tensor along an axis."
+ description: <<END
+The elements are shifted positively (towards larger indices) by the offset of
+`shift` along the dimension of `axis`. Negative `shift` values will shift
+elements in the opposite direction. Elements that roll past the last position
+will wrap around to the first and vice versa. Multiple shifts along multiple
+axes may be specified.
+
+For example:
+
+```
+# 't' is [0, 1, 2, 3, 4]
+roll(t, shift=2, axis=0) ==> [3, 4, 0, 1, 2]
+
+# shifting along multiple dimensions
+# 't' is [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]]
+roll(t, shift=[1, -2], axis=[0, 1]) ==> [[7, 8, 9, 5, 6], [2, 3, 4, 0, 1]]
+
+# shifting along the same axis multiple times
+# 't' is [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]]
+roll(t, shift=[2, -3], axis=[1, 1]) ==> [[1, 2, 3, 4, 0], [6, 7, 8, 9, 5]]
+```
+END
+}
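The wrap-around semantics above can be reproduced with a few lines of ordinary C++; the following is an illustrative sketch of the 1-D case only, not the kernel added in this change (see roll_op.cc below).

```
#include <vector>

// Rolls a 1-D vector by `shift` positions towards larger indices; negative
// shifts roll towards smaller indices, and elements wrap around the ends.
std::vector<int> Roll1D(const std::vector<int>& t, long long shift) {
  const long long n = static_cast<long long>(t.size());
  std::vector<int> out(t.size());
  for (long long i = 0; i < n; ++i) {
    // ((i + shift) % n + n) % n keeps the destination index in [0, n).
    out[((i + shift) % n + n) % n] = t[i];
  }
  return out;
}

// Roll1D({0, 1, 2, 3, 4}, 2) == {3, 4, 0, 1, 2}, matching the example above.
```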
diff --git a/tensorflow/core/api_def/base_api/api_def_UnravelIndex.pbtxt b/tensorflow/core/api_def/base_api/api_def_UnravelIndex.pbtxt
new file mode 100644
index 0000000000..97c380700a
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_UnravelIndex.pbtxt
@@ -0,0 +1,32 @@
+op {
+ graph_op_name: "UnravelIndex"
+ in_arg {
+ name: "indices"
+ description: <<END
+A 0-D or 1-D `int` Tensor whose elements are indices into the
+flattened version of an array of dimensions `dims`.
+END
+ }
+ in_arg {
+ name: "dims"
+ description: <<END
+A 1-D `int` Tensor. The shape of the array to use for unraveling
+indices.
+END
+ }
+ out_arg {
+ name: "output"
+ description: <<END
+A 2-D (or 1-D if `indices` is 0-D) tensor where each row has the
+same shape as the `indices` array.
+END
+ }
+ summary: "Converts a flat index or array of flat indices into a tuple of"
+ description: <<END
+coordinate arrays.
+
+@compatibility(numpy)
+Equivalent to np.unravel_index
+@end_compatibility
+END
+}
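For a single flat index, the unraveling described here is a sequence of mod/div steps; a minimal sketch (plain C++, assuming row-major layout as in `np.unravel_index`, not the kernel in unravel_index_op.cc):

```
#include <cstdint>
#include <vector>

// Converts a flat index into per-dimension coordinates of a row-major array
// with the given dims, e.g. UnravelIndex(7, {3, 4}) == {1, 3}.
std::vector<int64_t> UnravelIndex(int64_t index,
                                  const std::vector<int64_t>& dims) {
  std::vector<int64_t> coords(dims.size());
  for (int i = static_cast<int>(dims.size()) - 1; i >= 0; --i) {
    coords[i] = index % dims[i];  // coordinate along dimension i
    index /= dims[i];             // remaining flat index for the outer dims
  }
  return coords;
}
```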
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.cc b/tensorflow/core/common_runtime/gpu/gpu_device.cc
index 04b5541863..a9485a835e 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_device.cc
@@ -762,7 +762,8 @@ int64 MinSystemMemory(int64 available_memory) {
// is necessary.
min_system_memory *= 2;
#endif
-#if defined(NVIDIA_TEGRA)
+
+#if defined(ANDROID_TEGRA)
// 1GB system mem for NVIDIA Tegra devices since they use the same mem for RAM
// and Video RAM
min_system_memory = 1 << 30;
diff --git a/tensorflow/core/distributed_runtime/BUILD b/tensorflow/core/distributed_runtime/BUILD
index f4ee841032..9e152aa082 100644
--- a/tensorflow/core/distributed_runtime/BUILD
+++ b/tensorflow/core/distributed_runtime/BUILD
@@ -145,6 +145,7 @@ cc_library(
"//tensorflow/core:core_cpu_internal",
"//tensorflow/core:lib",
"//tensorflow/core:protos_all_cc",
+ "//tensorflow/core:worker_proto_cc",
],
)
diff --git a/tensorflow/core/distributed_runtime/master_session.cc b/tensorflow/core/distributed_runtime/master_session.cc
index dcc25e4426..878a1398c9 100644
--- a/tensorflow/core/distributed_runtime/master_session.cc
+++ b/tensorflow/core/distributed_runtime/master_session.cc
@@ -1448,6 +1448,7 @@ Status MasterSession::DoPartialRun(CallOptions* opts,
const auto count = run_state->count;
pss.collect_timeline =
req.options().trace_level() == RunOptions::FULL_TRACE;
+ pss.collect_rpcs = req.options().trace_level() == RunOptions::FULL_TRACE;
pss.report_tensor_allocations_upon_oom =
req.options().report_tensor_allocations_upon_oom();
@@ -1610,6 +1611,7 @@ Status MasterSession::DoRunWithLocalExecution(
TRACEPRINTF("stepid %llu", step_id);
pss.collect_timeline = req.options().trace_level() == RunOptions::FULL_TRACE;
+ pss.collect_rpcs = req.options().trace_level() == RunOptions::FULL_TRACE;
pss.report_tensor_allocations_upon_oom =
req.options().report_tensor_allocations_upon_oom();
// Build the cost model every 'build_cost_model_every' steps after skipping an
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc
index 95811476f7..b20e744a97 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc
@@ -444,6 +444,24 @@ void GrpcWorker::GrpcRecvTensorAsync(CallOptions* opts,
});
}
+void GrpcWorker::LoggingAsync(const LoggingRequest* request,
+ LoggingResponse* response, StatusCallback done) {
+ auto env = this->env();
+ if (env) {
+ auto session_mgr = (SessionMgr*)env->session_mgr;
+ if (session_mgr) {
+ session_mgr->SetLogging(request->rpc_logging());
+ for (const auto& step_id : request->fetch_step_id()) {
+ session_mgr->RetrieveLogs(step_id, response);
+ }
+ if (request->clear()) {
+ session_mgr->ClearLogs();
+ }
+ }
+ }
+ done(Status::OK());
+}
+
WorkerEnv* GrpcWorker::env() { return env_; }
std::unique_ptr<GrpcWorker> NewGrpcWorker(WorkerEnv* env) {
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.h b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.h
index 78a21fd9f6..fbddbda9e6 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.h
@@ -40,6 +40,9 @@ class GrpcWorker : public Worker {
::grpc::ByteBuffer* response,
StatusCallback done);
+ virtual void LoggingAsync(const LoggingRequest* request,
+ LoggingResponse* response, StatusCallback done);
+
WorkerEnv* env();
private:
diff --git a/tensorflow/core/distributed_runtime/session_mgr.cc b/tensorflow/core/distributed_runtime/session_mgr.cc
index 8db49e7f15..90664c3612 100644
--- a/tensorflow/core/distributed_runtime/session_mgr.cc
+++ b/tensorflow/core/distributed_runtime/session_mgr.cc
@@ -64,8 +64,13 @@ Status SessionMgr::CreateSession(const string& session,
TF_RETURN_IF_ERROR(worker_cache_factory_(server_def, &worker_cache));
}
+  if (worker_cache != nullptr && default_worker_cache_.get() != nullptr) {
+ worker_cache->SetLogging(this->is_logging_active_);
+ }
+
CHECK(!worker_env_->local_devices.empty())
<< "The WorkerEnv must have at least one device in `local_devices`.";
+
std::vector<Device*> renamed_devices;
for (Device* d : worker_env_->local_devices) {
renamed_devices.push_back(RenamedDevice::NewRenamedDevice(
@@ -113,4 +118,77 @@ std::shared_ptr<WorkerSession> SessionMgr::LegacySession() {
return legacy_session_;
}
+void SessionMgr::SetLogging(bool active) {
+ mutex_lock l(mu_);
+ this->is_logging_active_ = active;
+ // Legacy Session
+ if (legacy_session_) {
+ auto* worker_cache = legacy_session_->worker_cache.get();
+ if (worker_cache) {
+ worker_cache->SetLogging(active);
+ }
+ }
+
+ for (const auto& session_kv : sessions_) {
+ auto session = session_kv.second.get();
+ if (session) {
+ auto* worker_cache = session->worker_cache.get();
+ if (worker_cache) {
+ worker_cache->SetLogging(active);
+ }
+ }
+ }
+}
+
+void SessionMgr::RetrieveLogs(tensorflow::int64 step_id,
+ LoggingResponse* response) {
+ mutex_lock l(mu_);
+ // Legacy Session
+ if (legacy_session_) {
+ auto* worker_cache = legacy_session_->worker_cache.get();
+ if (worker_cache) {
+ auto step_stats = StepStats();
+ if (worker_cache->RetrieveLogs(step_id, &step_stats)) {
+ auto* labeled_step_stats = response->add_step();
+ labeled_step_stats->set_step_id(step_id);
+ labeled_step_stats->mutable_step_stats()->Swap(&step_stats);
+ }
+ }
+ }
+ for (const auto& session_kv : sessions_) {
+ auto session = session_kv.second.get();
+ if (session) {
+ auto* worker_cache = session->worker_cache.get();
+ if (worker_cache) {
+ auto step_stats = StepStats();
+ if (worker_cache->RetrieveLogs(step_id, &step_stats)) {
+ auto* labeled_step_stats = response->add_step();
+ labeled_step_stats->set_step_id(step_id);
+ labeled_step_stats->mutable_step_stats()->Swap(&step_stats);
+ }
+ }
+ }
+ }
+}
+
+void SessionMgr::ClearLogs() {
+ mutex_lock l(mu_);
+ // Legacy Session
+ if (legacy_session_) {
+ auto* worker_cache = legacy_session_->worker_cache.get();
+ if (worker_cache) {
+ worker_cache->ClearLogs();
+ }
+ }
+
+ for (const auto& session_kv : sessions_) {
+ auto session = session_kv.second.get();
+ if (session) {
+ auto* worker_cache = session->worker_cache.get();
+ if (worker_cache) {
+ worker_cache->ClearLogs();
+ }
+ }
+ }
+}
} // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/session_mgr.h b/tensorflow/core/distributed_runtime/session_mgr.h
index 3ce260d12e..4c9702d522 100644
--- a/tensorflow/core/distributed_runtime/session_mgr.h
+++ b/tensorflow/core/distributed_runtime/session_mgr.h
@@ -22,6 +22,7 @@ limitations under the License.
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/protobuf/tensorflow_server.pb.h"
+#include "tensorflow/core/protobuf/worker.pb.h"
namespace tensorflow {
@@ -56,6 +57,12 @@ class SessionMgr {
static string WorkerNameFromServerDef(const ServerDef& server_def);
+ void SetLogging(bool active);
+
+ void RetrieveLogs(tensorflow::int64 step_id, LoggingResponse* response);
+
+ void ClearLogs();
+
private:
const WorkerEnv* const worker_env_; // Not owned.
@@ -75,6 +82,8 @@ class SessionMgr {
std::unique_ptr<WorkerCacheInterface> default_worker_cache_;
std::shared_ptr<WorkerSession> legacy_session_;
+ bool is_logging_active_ = false;
+
const WorkerCacheFactory worker_cache_factory_;
std::shared_ptr<WorkerSession> WorkerSessionForSessionUnlocked(
diff --git a/tensorflow/core/framework/register_types.h b/tensorflow/core/framework/register_types.h
index e448a60f5e..e90596980f 100644
--- a/tensorflow/core/framework/register_types.h
+++ b/tensorflow/core/framework/register_types.h
@@ -53,7 +53,7 @@ limitations under the License.
*/
#if !defined(IS_MOBILE_PLATFORM) || defined(SUPPORT_SELECTIVE_REGISTRATION) || \
- defined(NVIDIA_TEGRA)
+ defined(ANDROID_TEGRA)
// All types are supported, so all macros are invoked.
//
diff --git a/tensorflow/core/framework/variant_op_registry.cc b/tensorflow/core/framework/variant_op_registry.cc
index 395329da3b..ee07db1aee 100644
--- a/tensorflow/core/framework/variant_op_registry.cc
+++ b/tensorflow/core/framework/variant_op_registry.cc
@@ -182,7 +182,7 @@ Status VariantDeviceCopy(
// Special casing UnaryOpFn per op and per device.
UnaryVariantOpRegistry::VariantUnaryOpFn* UnaryVariantOpRegistry::GetUnaryOpFn(
VariantUnaryOp op, StringPiece device, StringPiece type_name) {
- auto found = unary_op_fns.find(std::make_tuple(op, device, type_name));
+ auto found = unary_op_fns.find({op, device, type_name});
if (found == unary_op_fns.end()) return nullptr;
return &found->second;
}
@@ -195,12 +195,10 @@ void UnaryVariantOpRegistry::RegisterUnaryOpFn(
CHECK_EQ(existing, nullptr)
<< "Unary VariantUnaryOpFn for type_name: " << type_name
<< " already registered for device type: " << device;
- unary_op_fns.insert(
- std::pair<std::tuple<VariantUnaryOp, StringPiece, StringPiece>,
- VariantUnaryOpFn>(
- std::make_tuple(op, GetPersistentStringPiece(device),
- GetPersistentStringPiece(type_name)),
- unary_op_fn));
+ unary_op_fns.insert(std::pair<FuncTuple<VariantUnaryOp>, VariantUnaryOpFn>(
+ {op, GetPersistentStringPiece(device),
+ GetPersistentStringPiece(type_name)},
+ unary_op_fn));
}
namespace {
@@ -229,7 +227,7 @@ REGISTER_VARIANT_ZEROS_LIKE_TYPE(bool);
UnaryVariantOpRegistry::VariantBinaryOpFn*
UnaryVariantOpRegistry::GetBinaryOpFn(VariantBinaryOp op, StringPiece device,
StringPiece type_name) {
- auto found = binary_op_fns.find(std::make_tuple(op, device, type_name));
+ auto found = binary_op_fns.find({op, device, type_name});
if (found == binary_op_fns.end()) return nullptr;
return &found->second;
}
@@ -242,12 +240,10 @@ void UnaryVariantOpRegistry::RegisterBinaryOpFn(
CHECK_EQ(existing, nullptr)
<< "Unary VariantBinaryOpFn for type_name: " << type_name
<< " already registered for device type: " << device;
- binary_op_fns.insert(
- std::pair<std::tuple<VariantBinaryOp, StringPiece, StringPiece>,
- VariantBinaryOpFn>(
- std::make_tuple(op, GetPersistentStringPiece(device),
- GetPersistentStringPiece(type_name)),
- add_fn));
+ binary_op_fns.insert(std::pair<FuncTuple<VariantBinaryOp>, VariantBinaryOpFn>(
+ {op, GetPersistentStringPiece(device),
+ GetPersistentStringPiece(type_name)},
+ add_fn));
}
namespace {
diff --git a/tensorflow/core/framework/variant_op_registry.h b/tensorflow/core/framework/variant_op_registry.h
index 13f6908cae..e94100e994 100644
--- a/tensorflow/core/framework/variant_op_registry.h
+++ b/tensorflow/core/framework/variant_op_registry.h
@@ -166,6 +166,21 @@ class UnaryVariantOpRegistry {
device_copy_fns;
// Map std::tuple<Op, device, type_name> to function.
+
+  // Using std::tuple as the map key breaks by falling victim to
+  // "too perfect forwarding"; see
+  // https://stackoverflow.com/questions/44475317/variadic-template-issue
+  // and the references therein.
+ template <typename Op>
+ struct FuncTuple {
+ FuncTuple(const Op& op, const StringPiece& dev, const StringPiece& tname)
+ : op_type_(op), device_(dev), typename_(tname){};
+ Op op_type_;
+ StringPiece device_, typename_;
+ };
+ // friend declaration for operator==
+ // needed for clang
+ template <typename Op>
+ friend bool operator==(const FuncTuple<Op>& l, const FuncTuple<Op>& r);
struct TupleHash {
template <typename Op>
std::size_t operator()(
@@ -176,18 +191,25 @@ class UnaryVariantOpRegistry {
ret = Hash64Combine(ret, sp_hasher_(std::get<2>(x)));
return ret;
}
+
+ template <typename Op>
+ std::size_t operator()(const FuncTuple<Op>& x) const {
+ // The hash of an enum is just its value as a std::size_t.
+ std::size_t ret = static_cast<std::size_t>(x.op_type_);
+ ret = Hash64Combine(ret, sp_hasher_(x.device_));
+ ret = Hash64Combine(ret, sp_hasher_(x.typename_));
+ return ret;
+ }
StringPieceHasher sp_hasher_;
};
- std::unordered_map<std::tuple<VariantUnaryOp, StringPiece, StringPiece>,
- VariantUnaryOpFn, TupleHash>
+ std::unordered_map<FuncTuple<VariantUnaryOp>, VariantUnaryOpFn, TupleHash>
unary_op_fns;
- std::unordered_map<std::tuple<VariantBinaryOp, StringPiece, StringPiece>,
- VariantBinaryOpFn, TupleHash>
+ std::unordered_map<FuncTuple<VariantBinaryOp>, VariantBinaryOpFn, TupleHash>
binary_op_fns;
// Find or insert a string into a persistent string storage
- // container; return the StringPiece pointing to the permanent
- // string location.
+ // container; return the StringPiece pointing to the permanent string
+ // location.
static StringPiece GetPersistentStringPiece(const string& str) {
const auto string_storage = PersistentStringStorage();
auto found = string_storage->find(str);
@@ -199,7 +221,12 @@ class UnaryVariantOpRegistry {
}
}
};
-
+template <typename Op>
+inline bool operator==(const UnaryVariantOpRegistry::FuncTuple<Op>& lhs,
+ const UnaryVariantOpRegistry::FuncTuple<Op>& rhs) {
+ return (lhs.op_type_ == rhs.op_type_) && (lhs.device_ == rhs.device_) &&
+ (lhs.typename_ == rhs.typename_);
+}
// Gets a TensorShape from a Tensor containing a scalar Variant.
// Returns an Internal error if the Variant does not have a registered shape
// function, or if it's a serialized Variant that cannot be decoded.
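The FuncTuple change follows the standard recipe for using a custom struct as an `std::unordered_map` key: define `operator==` for the key and pass a hash functor as the map's third template argument. A self-contained sketch of that recipe (illustrative only; the names below are not from the registry code):

```
#include <cstddef>
#include <functional>
#include <string>
#include <unordered_map>

struct Key {
  int op;              // stands in for the op enum
  std::string device;  // stands in for the StringPiece fields
};

inline bool operator==(const Key& a, const Key& b) {
  return a.op == b.op && a.device == b.device;
}

struct KeyHash {
  std::size_t operator()(const Key& k) const {
    // Combine the enum value with the string hash, as TupleHash does above.
    return std::hash<int>()(k.op) ^ (std::hash<std::string>()(k.device) << 1);
  }
};

std::unordered_map<Key, int, KeyHash> table;  // table[{1, "CPU"}] = 42;
```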
diff --git a/tensorflow/core/graph/mkl_layout_pass.cc b/tensorflow/core/graph/mkl_layout_pass.cc
index 68c3136019..7d3be15299 100644
--- a/tensorflow/core/graph/mkl_layout_pass.cc
+++ b/tensorflow/core/graph/mkl_layout_pass.cc
@@ -42,7 +42,7 @@ limitations under the License.
namespace tensorflow {
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
// This pass implements rewriting of graph to support following scenarios:
// (A) Merging nodes in the graph
@@ -2211,7 +2211,7 @@ Status MklLayoutRewritePass::Run(const GraphOptimizationPassOptions& options) {
return Status::OK();
}
-#else // INTEL_MKL_DNN
+#else // INTEL_MKL_ML
// This pass implements rewriting of graph to support following scenarios:
// (A) Merging nodes in the graph
@@ -2452,9 +2452,8 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
// NOTE: names are alphabetically sorted.
rinfo_.push_back({csinfo_.addn, mkl_op_registry::GetMklOpName(csinfo_.addn),
CopyAttrsAddN, AddNRewrite});
- /* rinfo_.push_back({csinfo_.add,
- mkl_op_registry::GetMklOpName(csinfo_.add),
- CopyAttrsDataType, AlwaysRewrite}); */
+ rinfo_.push_back({csinfo_.add, mkl_op_registry::GetMklOpName(csinfo_.add),
+ CopyAttrsDataType, AlwaysRewrite});
rinfo_.push_back({csinfo_.avg_pool,
mkl_op_registry::GetMklOpName(csinfo_.avg_pool),
CopyAttrsPooling, AlwaysRewrite});
@@ -2502,14 +2501,13 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
rinfo_.push_back({csinfo_.max_pool_grad,
mkl_op_registry::GetMklOpName(csinfo_.max_pool_grad),
CopyAttrsPooling, AlwaysRewrite});
- /*
+
rinfo_.push_back({csinfo_.maximum,
mkl_op_registry::GetMklOpName(csinfo_.maximum),
CopyAttrsDataType, AlwaysRewrite});
rinfo_.push_back({csinfo_.mul,
mkl_op_registry::GetMklOpName(csinfo_.mul),
CopyAttrsDataType, AlwaysRewrite});
- */
rinfo_.push_back({csinfo_.relu, mkl_op_registry::GetMklOpName(csinfo_.relu),
CopyAttrsDataType, AlwaysRewrite});
rinfo_.push_back({csinfo_.relu_grad,
@@ -2529,14 +2527,13 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
rinfo_.push_back({csinfo_.softmax,
mkl_op_registry::GetMklOpName(csinfo_.softmax),
CopyAttrsDataType, AlwaysRewrite});
- /*
+
rinfo_.push_back({csinfo_.squared_difference,
mkl_op_registry::GetMklOpName(csinfo_.squared_difference),
CopyAttrsDataType, AlwaysRewrite});
rinfo_.push_back({csinfo_.sub,
mkl_op_registry::GetMklOpName(csinfo_.sub),
CopyAttrsDataType, AlwaysRewrite});
- */
// Add info about which ops to add workspace edge to and the slots.
wsinfo_.push_back({csinfo_.lrn, csinfo_.lrn_grad, 0, 2, 1, 3});
@@ -4317,7 +4314,7 @@ Status MklLayoutRewritePass::Run(const GraphOptimizationPassOptions& options) {
return Status::OK();
}
-#endif // INTEL_MKL_DNN
+#endif // INTEL_MKL_ML
} // namespace tensorflow
#endif
diff --git a/tensorflow/core/graph/mkl_layout_pass_test.cc b/tensorflow/core/graph/mkl_layout_pass_test.cc
index 320d5a48c7..5e2a465e22 100644
--- a/tensorflow/core/graph/mkl_layout_pass_test.cc
+++ b/tensorflow/core/graph/mkl_layout_pass_test.cc
@@ -38,7 +38,7 @@ limitations under the License.
namespace tensorflow {
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
namespace {
@@ -1899,7 +1899,7 @@ BENCHMARK(BM_MklLayoutRewritePass)->Arg(1000)->Arg(10000);
} // namespace
-#else // INTEL_MKL_DNN
+#else // INTEL_MKL_ML
namespace {
@@ -3532,7 +3532,7 @@ BENCHMARK(BM_MklLayoutRewritePass)->Arg(1000)->Arg(10000);
} // namespace
-#endif // INTEL_MKL_DNN
+#endif // INTEL_MKL_ML
} // namespace tensorflow
diff --git a/tensorflow/core/graph/testlib.cc b/tensorflow/core/graph/testlib.cc
index d5b026eae3..0d88d1ff72 100644
--- a/tensorflow/core/graph/testlib.cc
+++ b/tensorflow/core/graph/testlib.cc
@@ -273,6 +273,16 @@ Node* Reverse(Graph* g, Node* tensor, Node* axis) {
return Binary(g, "ReverseV2", tensor, axis);
}
+Node* Roll(Graph* g, Node* input, Node* shift, Node* axis) {
+ Node* ret;
+ TF_CHECK_OK(NodeBuilder(g->NewName("n"), "Roll", g->op_registry())
+ .Input(input)
+ .Input(shift)
+ .Input(axis)
+ .Finalize(g, &ret));
+ return ret;
+}
+
Node* Error(Graph* g, Node* input, const string& errmsg) {
Node* ret;
TF_CHECK_OK(NodeBuilder(g->NewName("n"), "Error")
diff --git a/tensorflow/core/graph/testlib.h b/tensorflow/core/graph/testlib.h
index 06597778bb..eb9038d619 100644
--- a/tensorflow/core/graph/testlib.h
+++ b/tensorflow/core/graph/testlib.h
@@ -117,6 +117,10 @@ Node* RandomGamma(Graph* g, Node* shape, Node* alpha);
// Output dtype determined by lam.
Node* RandomPoisson(Graph* g, Node* shape, Node* lam);
+// Rolls tensor by an offset of <shift> along the corresponding
+// <axis> dimensions.
+Node* Roll(Graph* g, Node* input, Node* shift, Node* axis);
+
// Generates random parameters from the truncated standard normal distribution
// of the nput shape
Node* TruncatedNormal(Graph* g, Node* input, DataType dtype);
diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index fd99409c9b..e7192ec42f 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -629,6 +629,7 @@ cc_library(
":transpose_op",
":unique_op",
":unpack_op",
+ ":unravel_index_op",
":where_op",
],
)
@@ -884,6 +885,12 @@ tf_kernel_library(
)
tf_kernel_library(
+ name = "unravel_index_op",
+ prefix = "unravel_index_op",
+ deps = ARRAY_DEPS,
+)
+
+tf_kernel_library(
name = "where_op",
srcs = ["where_op.cc"],
hdrs = ["where_op.h"],
@@ -2582,6 +2589,45 @@ tf_cc_tests(
],
)
+cc_library(
+ name = "manip",
+ deps = [
+ ":roll_op",
+ ],
+)
+
+MANIP_DEPS = [
+ "//tensorflow/core:framework",
+ "//tensorflow/core:lib",
+ "//tensorflow/core:manip_ops_op_lib",
+ "//third_party/eigen3",
+]
+
+tf_kernel_library(
+ name = "roll_op",
+ prefix = "roll_op",
+ deps = MANIP_DEPS,
+)
+
+tf_cc_test(
+ name = "roll_op_test",
+ size = "small",
+ srcs = ["roll_op_test.cc"],
+ deps = [
+ ":ops_testutil",
+ ":ops_util",
+ ":roll_op",
+ "//tensorflow/core:core_cpu",
+ "//tensorflow/core:core_cpu_internal",
+ "//tensorflow/core:framework",
+ "//tensorflow/core:lib",
+ "//tensorflow/core:protos_all_cc",
+ "//tensorflow/core:test",
+ "//tensorflow/core:test_main",
+ "//tensorflow/core:testlib",
+ ],
+)
+
MATH_DEPS = [
":bounds_check",
":fill_functor",
diff --git a/tensorflow/core/kernels/compare_and_bitpack_op.cc b/tensorflow/core/kernels/compare_and_bitpack_op.cc
index 9f626a274a..224fe534e3 100644
--- a/tensorflow/core/kernels/compare_and_bitpack_op.cc
+++ b/tensorflow/core/kernels/compare_and_bitpack_op.cc
@@ -110,7 +110,19 @@ struct ComputeShard<T,
typename TTypes<bool>::ConstMatrix input,
typename TTypes<uint8>::Matrix output, bool /*thresh*/, int64 start,
int64 limit) {
- // NOTE(ebrevdo): This assumes memory is little-endian.
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+ for (int64 i = start; i < limit; ++i) {
+ uint8* out = output.data() + i;
+ const int64 block = *reinterpret_cast<const int64*>(input.data() + 8 * i);
+ *out = ((((block & (1LL << (7 * 8))) >> (7 * 8 - 7))) |
+ (((block & (1LL << (6 * 8))) >> (6 * 8 - 6))) |
+ (((block & (1LL << (5 * 8))) >> (5 * 8 - 5))) |
+ (((block & (1LL << (4 * 8))) >> (4 * 8 - 4))) |
+ (((block & (1LL << (3 * 8))) >> (3 * 8 - 3))) |
+ (((block & (1LL << (2 * 8))) >> (2 * 8 - 2))) |
+ (((block & (1LL << 8)) >> (1 * 8 - 1))) | (((block & (1LL)))));
+ }
+#else
for (int64 i = start; i < limit; ++i) {
uint8* out = output.data() + i;
const int64 block = *reinterpret_cast<const int64*>(input.data() + 8 * i);
@@ -123,6 +135,7 @@ struct ComputeShard<T,
(((block & (1LL << (2 * 8))) >> (2 * 8 - 5))) |
(((block & (1LL << 8)) >> (1 * 8 - 6))) | (((block & (1LL)) << 7)));
}
+#endif
}
};
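The new `#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__` branch exists because the eight `bool` bytes are loaded as a single `int64`, so which input byte lands in which bits of `block` depends on host byte order; both branches map input byte 0 to the most significant output bit. An endian-independent (but byte-at-a-time) sketch of the same packing, for illustration only:

```
#include <cstdint>

// Packs 8 bools (stored as 0/1 bytes) into one uint8, with in[0] becoming
// the most significant bit, regardless of host endianness.
inline uint8_t PackEightBools(const bool* in) {
  uint8_t out = 0;
  for (int b = 0; b < 8; ++b) {
    out = static_cast<uint8_t>((out << 1) | (in[b] ? 1 : 0));
  }
  return out;
}
```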
diff --git a/tensorflow/core/kernels/decode_bmp_op.cc b/tensorflow/core/kernels/decode_bmp_op.cc
index c778278e8f..b7d120a617 100644
--- a/tensorflow/core/kernels/decode_bmp_op.cc
+++ b/tensorflow/core/kernels/decode_bmp_op.cc
@@ -39,6 +39,13 @@ class DecodeBmpOp : public OpKernel {
errors::InvalidArgument("channels must be 0, 1, 3 or 4, got ",
channels_));
}
+ inline int32 ByteSwapInt32ForBigEndian(int32 x) {
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ return le32toh(x);
+#else
+ return x;
+#endif
+ }
void Compute(OpKernelContext* context) override {
const Tensor& contents = context->input(0);
@@ -56,14 +63,18 @@ class DecodeBmpOp : public OpKernel {
input.size(), " bytes"));
const uint8* img_bytes = reinterpret_cast<const uint8*>(input.data());
- const int32 header_size = internal::SubtleMustCopy(
+ int32 header_size_ = internal::SubtleMustCopy(
*(reinterpret_cast<const int32*>(img_bytes + 10)));
- const int32 width = internal::SubtleMustCopy(
+ const int32 header_size = ByteSwapInt32ForBigEndian(header_size_);
+ int32 width_ = internal::SubtleMustCopy(
*(reinterpret_cast<const int32*>(img_bytes + 18)));
- const int32 height = internal::SubtleMustCopy(
+ const int32 width = ByteSwapInt32ForBigEndian(width_);
+ int32 height_ = internal::SubtleMustCopy(
*(reinterpret_cast<const int32*>(img_bytes + 22)));
- const int32 bpp = internal::SubtleMustCopy(
+ const int32 height = ByteSwapInt32ForBigEndian(height_);
+ int32 bpp_ = internal::SubtleMustCopy(
*(reinterpret_cast<const int32*>(img_bytes + 28)));
+ const int32 bpp = ByteSwapInt32ForBigEndian(bpp_);
if (channels_) {
OP_REQUIRES(context, (channels_ == bpp / 8),
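`ByteSwapInt32ForBigEndian` is needed because BMP stores its header fields little-endian, while the code above reinterprets the raw header bytes as a host-order `int32`. An alternative, byte-order-independent way to read such a field (a sketch of the general technique, not the kernel's code):

```
#include <cstdint>

// Reads a little-endian 32-bit header field from a byte buffer without
// relying on host byte order or aligned loads.
inline int32_t ReadLittleEndian32(const uint8_t* p) {
  return static_cast<int32_t>(static_cast<uint32_t>(p[0]) |
                              (static_cast<uint32_t>(p[1]) << 8) |
                              (static_cast<uint32_t>(p[2]) << 16) |
                              (static_cast<uint32_t>(p[3]) << 24));
}

// e.g. const int32_t width = ReadLittleEndian32(img_bytes + 18);
```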
diff --git a/tensorflow/core/kernels/fractional_pool_common.h b/tensorflow/core/kernels/fractional_pool_common.h
index df0bbbfa06..2d7a230fc0 100644
--- a/tensorflow/core/kernels/fractional_pool_common.h
+++ b/tensorflow/core/kernels/fractional_pool_common.h
@@ -57,7 +57,7 @@ static inline void RandomShuffle(Iter first, Iter last, const Random& uniform) {
// * sum(generated_diff_pooling_sequence) = input_length
// * Let's define floor(input_length / output_length) = K, then
// K <= generated_diff_pooling_sequence[i] <= K+1
-// For example, when input_length = 10, output_length = 6, the followings are
+// For example, when input_length = 10, output_length = 6, the following are
// valid pooling sequence:
// * [1, 2, 2, 1, 2, 2]
// * [1, 1, 2, 2, 2, 2]
diff --git a/tensorflow/core/kernels/mkl_aggregate_ops.cc b/tensorflow/core/kernels/mkl_aggregate_ops.cc
index 89d37d2f87..b539b00009 100644
--- a/tensorflow/core/kernels/mkl_aggregate_ops.cc
+++ b/tensorflow/core/kernels/mkl_aggregate_ops.cc
@@ -28,7 +28,7 @@ limitations under the License.
#include "mkl_dnn_types.h"
#include "tensorflow/core/util/mkl_util.h"
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
#include "mkldnn.hpp"
using mkldnn::stream;
using mkldnn::sum;
@@ -37,7 +37,7 @@ using mkldnn::sum;
namespace tensorflow {
typedef Eigen::ThreadPoolDevice CPUDevice;
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
template <typename Device, typename T>
class MklAddNOp : public OpKernel {
@@ -285,7 +285,7 @@ class MklAddNOp : public OpKernel {
} MklAddNOpContext;
};
-#else // INTEL_MKL_DNN
+#else // INTEL_MKL_ML
template <typename Device, typename T>
class MklAddNOp : public OpKernel {
public:
@@ -317,8 +317,11 @@ class MklAddNOp : public OpKernel {
: src2_tensor.dims();
// if the shapes of two tensors are not same raise op error
TensorShape src1_shape, src2_shape;
- src1_shape = src1_tensor.shape();
- src2_shape = src2_tensor.shape();
+ src1_shape = input1_in_mkl_format ? src1_mkl_shape.GetTfShape()
+ : src1_tensor.shape();
+ src2_shape = input2_in_mkl_format ? src2_mkl_shape.GetTfShape()
+ : src2_tensor.shape();
+
if (!src1_shape.IsSameSize(src2_shape)) {
ctx->SetStatus(errors::InvalidArgument(
"Inputs to operation ", this->name(), " of type ",
diff --git a/tensorflow/core/kernels/mkl_avgpooling_op.cc b/tensorflow/core/kernels/mkl_avgpooling_op.cc
index a7c569ee05..d545d34fdf 100644
--- a/tensorflow/core/kernels/mkl_avgpooling_op.cc
+++ b/tensorflow/core/kernels/mkl_avgpooling_op.cc
@@ -24,7 +24,7 @@
#include "tensorflow/core/kernels/mkl_pooling_ops_common.h"
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
#include "mkldnn.hpp"
using mkldnn::algorithm;
using mkldnn::engine;
@@ -40,8 +40,7 @@ namespace tensorflow {
typedef Eigen::ThreadPoolDevice CPUDevice;
-// For now, MKL-ML is default. So making MKL-DNN not a default choice.
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
template <typename Device, typename T>
class MklAvgPoolingOp : public OpKernel {
@@ -429,7 +428,7 @@ class MklAvgPoolingGradOp : public OpKernel {
TensorFormat data_format_;
}; // MklAvgPoolingGradOp
-#else // INTEL_MKL_DNN is defined
+#else
template <typename Device, typename T>
class MklAvgPoolingOp : public MklPoolingForwardOpBase<T> {
@@ -466,6 +465,28 @@ class MklAvgPoolingOp : public MklPoolingForwardOpBase<T> {
memory::dims output_dims_mkl_order;
this->GetOutputDims(pool_params, &output_dims_mkl_order);
+ // If input is an empty tensor, allocate an empty output tensor and return
+ if (input_tensor.NumElements() == 0) {
+ MklDnnShape output_mkl_shape;
+ output_mkl_shape.SetMklTensor(false);
+ TensorShape output_tf_shape;
+ if (pool_params.data_format == TensorFormat::FORMAT_NCHW) {
+ output_tf_shape = MklDnnDimsToTFShape(output_dims_mkl_order);
+ } else {
+ memory::dims output_dims_NHWC_order;
+ output_dims_NHWC_order = {pool_params.tensor_in_batch,
+ static_cast<int>(pool_params.out_height),
+ static_cast<int>(pool_params.out_width),
+ pool_params.out_depth};
+ output_tf_shape = MklDnnDimsToTFShape(output_dims_NHWC_order);
+ }
+ const int kOutputIndex = 0;
+ AllocateOutputSetMklShape(context, kOutputIndex, &output_tensor,
+ output_tf_shape, output_mkl_shape);
+ CHECK_NOTNULL(output_tensor);
+ return;
+ }
+
// If input is in Mkl layout, then just get the memory format from it
// directly, instead of using input data_format to AvgPool.
if (dnn_shape_input.IsMklTensor()) {
@@ -678,7 +699,7 @@ class MklAvgPoolingGradOp : public MklPoolingBackwardOpBase<T> {
}
}; // MklAvgPoolingGradOp
-#endif // INTEL_MKL_DNN
+#endif // INTEL_MKL_ML
REGISTER_KERNEL_BUILDER(Name("_MklAvgPool")
.Device(DEVICE_CPU)
diff --git a/tensorflow/core/kernels/mkl_concat_op.cc b/tensorflow/core/kernels/mkl_concat_op.cc
index 7da63604d2..f1f267e849 100644
--- a/tensorflow/core/kernels/mkl_concat_op.cc
+++ b/tensorflow/core/kernels/mkl_concat_op.cc
@@ -30,7 +30,7 @@ limitations under the License.
#include "mkl_dnn_types.h"
#include "tensorflow/core/util/mkl_util.h"
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
#include "mkldnn.hpp"
using mkldnn::concat;
@@ -62,7 +62,7 @@ class EigenConcatBaseOp : public OpKernel {
// we need to have empty Compute because Compute is pure virtual function.
void Compute(OpKernelContext* c) {}
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
void Compute(OpKernelContext* c, const std::vector<Tensor>& values) {
const Tensor* concat_dim_tensor;
@@ -230,7 +230,7 @@ class EigenConcatBaseOp : public OpKernel {
#endif
};
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
// --------------------------------------------------------------------------
// Mkl Concat Op
diff --git a/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc b/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc
index ef3f8cfec1..1401bc65a4 100644
--- a/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc
@@ -42,7 +42,7 @@ limitations under the License.
#include "mkl_dnn_types.h"
#include "tensorflow/core/util/mkl_util.h"
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
#include "mkldnn.hpp"
using mkldnn::convolution_backward_weights;
@@ -55,7 +55,7 @@ namespace tensorflow {
typedef Eigen::ThreadPoolDevice CPUDevice;
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
template <typename Device, class T>
class MklConv2DCustomBackpropFilterOp : public OpKernel {
@@ -655,7 +655,7 @@ class MklConv2DCustomBackpropFilterOp
TF_CALL_float(REGISTER_MKL_FILTER_KERNELS);
#undef REGISTER_MKL_FILTER_KERNELS
-#endif // INTEL_MKL_DNN
+#endif // INTEL_MKL_ML
} // namespace tensorflow
diff --git a/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc b/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc
index a6745489f4..eeed009531 100644
--- a/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc
@@ -44,7 +44,7 @@ limitations under the License.
#include "tensorflow/core/util/use_cudnn.h"
#include "tensorflow/core/util/work_sharder.h"
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
#include "mkldnn.hpp"
using mkldnn::convolution_backward_data;
@@ -56,7 +56,7 @@ namespace tensorflow {
typedef Eigen::ThreadPoolDevice CPUDevice;
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
template <typename Device, class T>
class MklConv2DCustomBackpropInputOp : public OpKernel {
@@ -493,7 +493,7 @@ class MklConv2DCustomBackpropInputOp
}
};
-#endif // INTEL_MKL_DNN
+#endif // INTEL_MKL_ML
#define REGISTER_MKL_CPU_KERNELS(T) \
REGISTER_KERNEL_BUILDER(Name("_MklConv2DBackpropInput") \
diff --git a/tensorflow/core/kernels/mkl_conv_ops.cc b/tensorflow/core/kernels/mkl_conv_ops.cc
index e44fba754b..2953426d58 100644
--- a/tensorflow/core/kernels/mkl_conv_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_ops.cc
@@ -41,7 +41,8 @@ limitations under the License.
#include "tensorflow/core/util/mkl_util.h"
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
+
#include "mkldnn.hpp"
using mkldnn::prop_kind;
@@ -58,8 +59,8 @@ namespace tensorflow {
typedef Eigen::ThreadPoolDevice CPUDevice;
-// For now, MKL-ML is default. So making MKL-DNN not a default choice.
-#ifndef INTEL_MKL_DNN
+// MKL-DNN is now default. MKL-ML must be specified explicitly.
+#ifdef INTEL_MKL_ML
template <typename Device, typename T, bool biasEnabled>
class MklConv2DOp : public OpKernel {
diff --git a/tensorflow/core/kernels/mkl_conv_ops.h b/tensorflow/core/kernels/mkl_conv_ops.h
index 8b65eaea0d..9dd88221a8 100644
--- a/tensorflow/core/kernels/mkl_conv_ops.h
+++ b/tensorflow/core/kernels/mkl_conv_ops.h
@@ -40,7 +40,7 @@ limitations under the License.
#include "tensorflow/core/util/mkl_util.h"
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
#include "mkldnn.hpp"
using mkldnn::prop_kind;
@@ -52,7 +52,7 @@ using mkldnn::convolution_forward;
namespace tensorflow {
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
class MklDnnConvUtil {
protected:
@@ -553,7 +553,7 @@ class MklConv2DBackpropCommonOp : public OpKernel {
Padding padding_;
TensorFormat data_format_;
};
-#endif // INTEL_MKL_DNN
+#endif // INTEL_MKL_ML
/////////////////////////////////////////////////////////////////////
/// Dummy Mkl op that is just used for operators that are intermediate
diff --git a/tensorflow/core/kernels/mkl_cwise_ops_common.cc b/tensorflow/core/kernels/mkl_cwise_ops_common.cc
index c065724e0d..58f0c30f32 100644
--- a/tensorflow/core/kernels/mkl_cwise_ops_common.cc
+++ b/tensorflow/core/kernels/mkl_cwise_ops_common.cc
@@ -1,4 +1,4 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0(the "License");
you may not use this file except in compliance with the License.
diff --git a/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc b/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc
index 0b6d838e09..8313224d7f 100644
--- a/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc
+++ b/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc
@@ -25,7 +25,7 @@ limitations under the License.
#include "mkl_dnn_types.h"
#include "tensorflow/core/util/mkl_util.h"
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
#include "mkldnn.hpp"
using mkldnn::batch_normalization_backward;
@@ -41,7 +41,7 @@ using mkldnn::use_scale_shift;
namespace tensorflow {
using CPUDevice = Eigen::ThreadPoolDevice;
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
template <typename Device, typename T>
class MklFusedBatchNormOp : public OpKernel {
@@ -683,7 +683,7 @@ class MklFusedBatchNormGradOp : public OpKernel {
};
#endif
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
template <typename Device, typename T>
class MklFusedBatchNormOp : public OpKernel {
diff --git a/tensorflow/core/kernels/mkl_identity_op.cc b/tensorflow/core/kernels/mkl_identity_op.cc
index 9ee27ee21c..6c027f8e72 100644
--- a/tensorflow/core/kernels/mkl_identity_op.cc
+++ b/tensorflow/core/kernels/mkl_identity_op.cc
@@ -28,14 +28,14 @@ limitations under the License.
#include "mkl_dnn_types.h"
#include "tensorflow/core/util/mkl_util.h"
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
#include "mkldnn.hpp"
#endif
namespace tensorflow {
typedef Eigen::ThreadPoolDevice CPUDevice;
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
template <typename Device, typename T>
class MklIdentityOp : public OpKernel {
diff --git a/tensorflow/core/kernels/mkl_input_conversion_op.cc b/tensorflow/core/kernels/mkl_input_conversion_op.cc
index 73d41efce1..5a8799ae93 100644
--- a/tensorflow/core/kernels/mkl_input_conversion_op.cc
+++ b/tensorflow/core/kernels/mkl_input_conversion_op.cc
@@ -31,7 +31,7 @@ limitations under the License.
#include "tensorflow/core/kernels/mkl_tfconv_op.h"
#include "tensorflow/core/util/mkl_util.h"
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
#include "mkldnn.hpp"
using mkldnn::stream;
@@ -59,7 +59,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice;
// convert the TF format input to MKL format
///////////////////////////////////////////////////////////
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
template <typename Device, typename T>
class MklInputConversionOp : public OpKernel {
public:
@@ -293,14 +293,58 @@ class MklInputConversionOp : public OpKernel {
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
// If both inputs are in MKL format
if (input_shape_0.IsMklTensor() && input_shape_1.IsMklTensor()) {
- // If both have the same shape, pass them through
if (tf_shapes_are_same) {
- VLOG(1) << "MklInputConversionOp: No conversion needed, "
- << "copying MKL inputs with identical shapes to output";
-
- ForwardMklTensorInToOut(context, 0, 0);
- ForwardMklTensorInToOut(context, 1, 1);
- return;
+ auto input0_md = input_shape_0.GetMklLayout();
+ auto input1_md = input_shape_1.GetMklLayout();
+
+ // If both have the same shape and same format, pass them through
+ if (input0_md.data.format == input1_md.data.format) {
+ VLOG(1) << "MklInputConversionOp: No conversion needed, "
+ << "copying MKL inputs with identical shapes to output";
+
+ ForwardMklTensorInToOut(context, 0, 0);
+ ForwardMklTensorInToOut(context, 1, 1);
+ return;
+ } else {
+          VLOG(1) << "MklInputConversionOp: Shapes are the same, but "
+                     "formats differ; converting to the same format";
+
+ // Convert input0, and keep input1 unchanged
+ // Create MklDnnShape for output mkl tensor based on input0
+ Tensor* tensor_out;
+ MklDnnShape mkl_output_mkl_shape;
+ mkl_output_mkl_shape.SetMklTensor(true);
+ mkl_output_mkl_shape.SetElemType(MklDnnType<T>());
+ mkl_output_mkl_shape.SetTfLayout(input_shape_0.GetDimension(),
+ input_shape_0.GetSizesAsMklDnnDims(),
+ input_shape_0.GetTfDataFormat());
+
+ // Get MKL layout from input1 as destination layout
+ mkl_output_mkl_shape.SetMklLayout(&input1_md);
+
+ // Create output Mkl tensor for index 0
+ AllocateOutputSetMklShape(context, 0, &tensor_out,
+ input_tensor_0.shape(),
+ mkl_output_mkl_shape);
+
+              // Create MklDnnData object for input0 tensor
+ auto cpu_engine = engine(engine::cpu, 0);
+ MklDnnData<T> input(&cpu_engine);
+ input.SetUsrMem(input0_md, &input_tensor_0);
+
+ // Create reorder from input0's layout to input1's layout
+ std::vector<primitive> net;
+ CHECK_EQ(input.CheckReorderToOpMem(
+ memory::primitive_desc(input1_md, cpu_engine),
+ tensor_out, &net),
+ true);
+ stream(stream::kind::eager).submit(net).wait();
+
+ // Input1 will be passed through
+ ForwardMklTensorInToOut(context, 1, 1);
+ return;
+ }
}
// Sanity check
diff --git a/tensorflow/core/kernels/mkl_lrn_op.cc b/tensorflow/core/kernels/mkl_lrn_op.cc
index a8b45004b7..5f0a12a1fb 100644
--- a/tensorflow/core/kernels/mkl_lrn_op.cc
+++ b/tensorflow/core/kernels/mkl_lrn_op.cc
@@ -38,7 +38,7 @@ limitations under the License.
#include "tensorflow/core/util/work_sharder.h"
#endif
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
#include "mkldnn.hpp"
using mkldnn::lrn_across_channels;
using mkldnn::lrn_backward;
@@ -67,7 +67,7 @@ void GetBandMatrix(int depth, int depth_radius,
} // namespace
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
template <typename T>
class MklLRNOp : public OpKernel {
@@ -1343,7 +1343,7 @@ class MklLRNGradOp : public OpKernel {
float beta_;
};
-#endif // INTEL_MKL_DNN
+#endif // INTEL_MKL_ML
#define REGISTER_MKL_LRN_CPU(T) \
REGISTER_KERNEL_BUILDER(Name("_MklLRN") \
diff --git a/tensorflow/core/kernels/mkl_maxpooling_op.cc b/tensorflow/core/kernels/mkl_maxpooling_op.cc
index 0de27ccd60..14607f26e0 100644
--- a/tensorflow/core/kernels/mkl_maxpooling_op.cc
+++ b/tensorflow/core/kernels/mkl_maxpooling_op.cc
@@ -22,7 +22,7 @@ limitations under the License.
#include "tensorflow/core/util/mkl_util.h"
#include "tensorflow/core/util/padding.h"
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
#include <algorithm>
#include "mkldnn.hpp"
using mkldnn::algorithm;
@@ -39,8 +39,8 @@ namespace tensorflow {
typedef Eigen::ThreadPoolDevice CPUDevice;
-// For now, MKL-ML is default. So making MKL-DNN not a default choice.
-#ifndef INTEL_MKL_DNN
+// MKL-DNN is now default. MKL-ML must be specified explicitly.
+#ifdef INTEL_MKL_ML
// An implementation of MaxPooling (forward).
template <typename Device, typename T>
@@ -494,7 +494,7 @@ class MklMaxPoolingGradOp : public OpKernel {
bool workspace_enabled_;
}; // MklMaxPoolingGradOp
-#else // INTEL_MKL_DNN is defined
+#else
// An implementation of MaxPooling (forward).
template <typename Device, typename T>
@@ -793,7 +793,7 @@ class MklMaxPoolingGradOp : public MklPoolingBackwardOpBase<T> {
}
}; // MklMaxPoolingGradOp
-#endif // INTEL_MKL_DNN
+#endif // INTEL_MKL_ML
REGISTER_KERNEL_BUILDER(Name("_MklMaxPool")
.Device(DEVICE_CPU)
diff --git a/tensorflow/core/kernels/mkl_pooling_ops_common.cc b/tensorflow/core/kernels/mkl_pooling_ops_common.cc
index ef8597b057..5ef6ce2a57 100644
--- a/tensorflow/core/kernels/mkl_pooling_ops_common.cc
+++ b/tensorflow/core/kernels/mkl_pooling_ops_common.cc
@@ -42,7 +42,7 @@ void MklPoolParameters::Init(OpKernelContext* context,
Init(context, ksize, stride, padding, data_format);
}
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
// Initialization for MKL format
void MklPoolParameters::Init(OpKernelContext* context,
const std::vector<int32>& ksize,
@@ -72,7 +72,7 @@ void MklPoolParameters::Init(OpKernelContext* context,
Init(context, ksize, stride, padding, data_format);
}
-#endif // INTEL_MKL_DNN
+#endif // INTEL_MKL_ML
// Common Initialization for TensorFlow and MKL formats
void MklPoolParameters::Init(OpKernelContext* context,
const std::vector<int32>& ksize,
@@ -107,7 +107,7 @@ void MklPoolParameters::Init(OpKernelContext* context,
OP_REQUIRES_OK(context, GetWindowedOutputSizeVerbose(
tensor_in_cols, window_cols, col_stride,
padding, &out_width, &pad_left, &pad_right));
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
// TF can work with int64, but mkldnn only supports int32
// Fail if the height or width are greater than MAX_INT
diff --git a/tensorflow/core/kernels/mkl_pooling_ops_common.h b/tensorflow/core/kernels/mkl_pooling_ops_common.h
index 880e45ab1e..279167aba2 100644
--- a/tensorflow/core/kernels/mkl_pooling_ops_common.h
+++ b/tensorflow/core/kernels/mkl_pooling_ops_common.h
@@ -22,7 +22,7 @@ limitations under the License.
#include "tensorflow/core/util/mkl_util.h"
#include "tensorflow/core/util/padding.h"
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
#include "mkldnn.hpp"
using mkldnn::memory;
using mkldnn::pooling_backward;
@@ -85,7 +85,7 @@ struct MklPoolParameters {
void Init(OpKernelContext* context, const std::vector<int32>& ksize,
const std::vector<int32>& stride, Padding padding,
TensorFormat data_format, const TensorShape& tensor_in_shape);
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
void Init(OpKernelContext* context, const std::vector<int32>& ksize,
const std::vector<int32>& stride, Padding padding,
TensorFormat data_format, const MklShape* mkl_in_shape);
@@ -102,7 +102,7 @@ struct MklPoolParameters {
TensorFormat data_format);
};
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
template <class T>
class MklPoolingOpBase : public OpKernel {
@@ -395,7 +395,7 @@ class MklPoolingBackwardOpBase : public MklPoolingOpBase<T> {
return grad_reorder_needed ? target_diff_dst_md : original_input_grad_md;
}
};
-#endif // INTEL_MKL_DNN
+#endif // INTEL_MKL_ML
//-------------------------------------------------------------------
// Utility functions
diff --git a/tensorflow/core/kernels/mkl_relu_op.cc b/tensorflow/core/kernels/mkl_relu_op.cc
index 873aca30ca..51db3991e2 100644
--- a/tensorflow/core/kernels/mkl_relu_op.cc
+++ b/tensorflow/core/kernels/mkl_relu_op.cc
@@ -28,7 +28,7 @@ limitations under the License.
#include "tensorflow/core/platform/default/logging.h"
#include "tensorflow/core/util/mkl_util.h"
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
#include "mkldnn.hpp"
using mkldnn::algorithm;
@@ -58,7 +58,7 @@ struct MklReluHelpers {
}
};
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
template <typename Device, typename T>
class MklReluOp : public OpKernel {
@@ -368,7 +368,7 @@ void MklReluGradOp<Device, T>::Compute(OpKernelContext* context) {
mkl_context.MklCleanup();
}
-#else // INTEL_MKL_DNN
+#else // INTEL_MKL_ML
template <typename Device, typename T, algorithm alg_kind>
class MklReluOpBase : public OpKernel {
@@ -849,7 +849,7 @@ class MklTanhGradOp : public MklReluGradOpBase<Device, T, eltwise_tanh> {
MklReluGradOp<CPUDevice, type>);
TF_CALL_float(REGISTER_RELU_MKL_SUPPORTED_KERNELS_TYPES);
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
// register dnn kernels for supported operations and supported types
#define REGISTER_ELU_MKL_SUPPORTED_KERNELS_TYPES(type) \
diff --git a/tensorflow/core/kernels/mkl_reshape_op.cc b/tensorflow/core/kernels/mkl_reshape_op.cc
index 7d471e1e4c..5dbc4a2709 100644
--- a/tensorflow/core/kernels/mkl_reshape_op.cc
+++ b/tensorflow/core/kernels/mkl_reshape_op.cc
@@ -28,7 +28,7 @@ limitations under the License.
#include "mkl_dnn_types.h"
#include "tensorflow/core/util/mkl_util.h"
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
#include "mkldnn.hpp"
using mkldnn::stream;
#endif
@@ -40,7 +40,7 @@ class MklReshapeOp : public OpKernel {
public:
explicit MklReshapeOp(OpKernelConstruction* context) : OpKernel(context) {}
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
void Compute(OpKernelContext* context) override {
const Tensor& input = MklGetInput(context, 0);
const Tensor& sizes = MklGetInput(context, 1);
@@ -312,7 +312,7 @@ class MklReshapeOp : public OpKernel {
}
}
-#endif // INTEL_MKL_DNN
+#endif // INTEL_MKL_ML
private:
const int kInputSlotIdx = 0;
diff --git a/tensorflow/core/kernels/mkl_softmax_op.cc b/tensorflow/core/kernels/mkl_softmax_op.cc
index c46eabdde1..aceef1e234 100644
--- a/tensorflow/core/kernels/mkl_softmax_op.cc
+++ b/tensorflow/core/kernels/mkl_softmax_op.cc
@@ -15,7 +15,7 @@ limitations under the License.
// See docs in ../ops/nn_ops.cc.
#ifdef INTEL_MKL
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/numeric_op.h"
@@ -156,5 +156,5 @@ TF_CALL_float(REGISTER_SOFTMAX_MKL_SUPPORTED_KERNELS_TYPES);
} // namespace tensorflow
-#endif // INTEL_MKL_DNN
+#endif // INTEL_MKL_ML
#endif // INTEL_MKL
diff --git a/tensorflow/core/kernels/mkl_tfconv_op.h b/tensorflow/core/kernels/mkl_tfconv_op.h
index c4d5a45d3c..5fafa14b5d 100644
--- a/tensorflow/core/kernels/mkl_tfconv_op.h
+++ b/tensorflow/core/kernels/mkl_tfconv_op.h
@@ -35,7 +35,7 @@ limitations under the License.
#include "mkl_dnn_types.h"
#include "tensorflow/core/util/mkl_util.h"
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
using mkldnn::stream;
#endif
@@ -61,7 +61,7 @@ class MklToTfOp : public OpKernel {
VLOG(1) << "MKLToTFConversion complete successfully.";
}
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
static void ConvertMklToTf(OpKernel* op_kernel, OpKernelContext* context,
string data_format_str, DataType op_data_type,
bool has_avx512f, uint input_number) {
diff --git a/tensorflow/core/kernels/roll_op.cc b/tensorflow/core/kernels/roll_op.cc
new file mode 100644
index 0000000000..bcbdbee058
--- /dev/null
+++ b/tensorflow/core/kernels/roll_op.cc
@@ -0,0 +1,334 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/register_types_traits.h"
+#include "tensorflow/core/framework/shape_inference.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/util/work_sharder.h"
+
+namespace tensorflow {
+
+#define EIGEN_USE_THREADS
+using CPUDevice = Eigen::ThreadPoolDevice;
+
+// dim_size - the size of each dimension
+// dim_range - the number of indices over in the flattened tensor
+// you need to skip in order to make it over from one side of a dimension
+// to the other. Used to make the shifts wrap around after a threshold.
+// threshold - the index for each dimension that the roll starts to wrap
+// back to the front
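+//
+// Example: for a 2x3 tensor rolled by shift=[1, 2] along axes [0, 1],
+// dim_size=[2, 3] and dim_range=[6, 3]; since the shift along dimension i
+// is dim_size[i] - threshold[i], threshold=[1, 1] here, and the element at
+// flat index 0 is written to flat index 3 + 2 = 5 of the output.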
+template <typename T>
+void DoRoll(OpKernelContext* context, const int64 num_elements,
+ const int num_dims, const gtl::ArraySlice<int>& dim_size,
+ const T* input, T* output, const gtl::ArraySlice<int>& threshold,
+ const gtl::ArraySlice<int64>& dim_range) {
+ auto work = [input, output, num_dims, &dim_size, &threshold, &dim_range](
+ int64 start, int64 end) {
+ // array of indices for each dimension
+ gtl::InlinedVector<int, 4> indices(num_dims);
+ int offset = 0; // the shift along the flattened tensor for current element
+ // initialize indices and offset
+ for (int i = 0; i < num_dims; i++) {
+ // stride is the number of indices over in the flattened tensor
+ // you need to skip in order to make it over to an adjacent element
+ // along a dimension. dim_size[i] != 0 because we set it to max(dim, 1)
+ const int64 stride = dim_range[i] / dim_size[i];
+ const int shift = dim_size[i] - threshold[i];
+ const int indx = (start / stride) % dim_size[i];
+ indices[i] = indx;
+ // calculate dimension index after the shift
+ const int shifted_indx = (indx + shift) % dim_size[i];
+ offset += (shifted_indx - indx) * stride;
+ }
+
+ for (int64 i = start; i < end; i++) {
+ output[i + offset] = input[i];
+ // create next combination of indices
+ // while at it adjust offset if needed
+ for (int j = num_dims - 1; j >= 0; j--) {
+ const int indx = (indices[j] + 1) % dim_size[j];
+ indices[j] = indx;
+ if (indx != 0) {
+ if (indx == threshold[j]) { // we've reached the threshold
+ // dim_range[j] = threshold[j] + shift[j]
+ // offset = shift[j] + ... other offsets
+ // offset - dim_range[j] = -threshold[j] + ... other offsets
+ // thus we undo our previous offset as well as add a new offset of
+ // -threshold[j] in one operation
+ offset -= dim_range[j]; // now wraps around
+ }
+ break; // indx != 0 don't need to carry
+ } else if (threshold[j] != 0) { // if threshold is 0 shift is 0
+ offset += dim_range[j]; // indx became 0 so reverse wrap around
+ }
+ }
+ }
+ };
+ // Shard
+ auto worker_threads = context->device()->tensorflow_cpu_worker_threads();
+ // 15 - experimentally determined with float and bool types
+ const int cost_per_element = 15 * sizeof(T); // rough estimate
+ Shard(worker_threads->num_threads, worker_threads->workers, num_elements,
+ cost_per_element, std::move(work));
+}
+
+// dim_size - the size of each dimension
+// dim_range - the number of indices over in the flattened tensor
+// you need to skip in order to make it over from one side of a dimension
+// to the other. Used to make the shifts wrap around after a threshold.
+// threshold - the index for each dimension at which the roll starts to wrap
+// back to the front
+// isd - inner shift dimension
+template <typename T>
+// Use memcpy to copy memory in groups when the data type supports memcpy
+void DoRollWithMemcpy(OpKernelContext* context, const int64 num_elements,
+ const int num_dims, const gtl::ArraySlice<int>& dim_size,
+ const T* input, T* output,
+ const gtl::ArraySlice<int>& threshold,
+ const gtl::ArraySlice<int64>& dim_range,
+ const int64 isd) {
+ auto work = [input, output, num_dims, &dim_size, &threshold, &dim_range, isd](
+ int64 start, int64 end) {
+ // the number of indices over in the flattened tensor you need to skip in
+ // order to make it over from one side of the isd to the other
+ const int64 isd_range = std::max<int>(dim_range[isd], 1);
+ // the distance along the flattened tensor to the next element in the isd
+ const int64 isd_stride = isd_range / std::max<int>(dim_size[isd], 1);
+
+ // start and end initially index groups, so convert them into element
+ // positions within the flattened tensor.
+ // there are 2 groups per isd: one for all elements before threshold[isd]
+ // and another for all elements after threshold[isd].
+ const int64 start_remainder = (start % 2) * threshold[isd] * isd_stride;
+ const int64 end_remainder = (end % 2) * threshold[isd] * isd_stride;
+ start = (start / 2) * isd_range + start_remainder;
+ end = (end / 2) * isd_range + end_remainder;
+
+ const T* in_ptr = &input[0];
+ T* out_ptr = &output[0];
+ in_ptr += start;
+ out_ptr += start;
+
+ // array of indices for each dimension
+ // indices = [i, j, k, l, m, n]
+ gtl::InlinedVector<int, 4> indices(num_dims);
+ // the offset needed to make all inner non-shifting dimensions become 0
+ int64 remainder_offset = 0;
+ // initialize indices
+ for (int i = 0; i < num_dims; i++) {
+ // stride is the number of indices over in the flattened tensor
+ // you need to skip in order to make it over to an adjacent element
+ // along a dimension. dim_size[i] != 0 because we set it to max(dim, 1)
+ const int64 stride = dim_range[i] / dim_size[i];
+ const int shift = dim_size[i] - threshold[i];
+ const int indx = (start / stride) % dim_size[i];
+ indices[i] = indx;
+ // calculate dimension index after the shift
+ int out_indx = (indx + shift) % dim_size[i];
+ if (i > isd) {
+ // trailing zeroes for indices after the inner shifted dimension
+ out_indx = 0;
+ remainder_offset += (out_indx - indx) * stride;
+ }
+ out_ptr += (out_indx - indx) * stride;
+ }
+ // set trailing zeroes for indices after the inner shifted dimension
+ for (int i = num_dims - 1; i > isd; i--) indices[i] = 0;
+
+ // the number of indices in the isd dimension the next group will skip
+ // to make it to the next threshold or end point
+ int isd_indx_skip = 0;
+ // the size of the next group
+ int64 group_size = 0;
+ // initialize isd_indx_skip and group_size
+ if (indices[isd] < threshold[isd]) {
+ isd_indx_skip = threshold[isd] - indices[isd];
+ group_size = isd_indx_skip * isd_stride + remainder_offset;
+ } else {
+ isd_indx_skip = dim_size[isd] - indices[isd];
+ group_size = isd_indx_skip * isd_stride + remainder_offset;
+ }
+
+ int64 i = start;
+ while (i < end) {
+ // copy group of elements
+ memcpy(out_ptr, in_ptr, group_size * sizeof(T));
+
+ // shift i and the pointers over to the next group position
+ i += group_size;
+ out_ptr += group_size;
+ in_ptr += group_size;
+
+ // produce next combination of indices and adjust the out_ptr position
+ // to fix the offset if necessary
+ // the isd (inner shift dim) should skip to next threshold or endpoint
+ // all dimensions to the left increment by 1 when a digit is carried
+ // all dimensions to the right remain set to 0
+ // +1 +1 +1 +isd_indx_skip
+ // indices = [i, j, k, l, 0, 0]
+ // ^isd
+ for (int j = isd; j >= 0; j--) {
+ int inc = 1;
+ if (j == isd) inc = isd_indx_skip;
+ const int indx = (indices[j] + inc) % dim_size[j];
+ indices[j] = indx;
+ if (indx != 0) {
+ if (indx == threshold[j]) {
+ out_ptr -= dim_range[j]; // now wraps around
+ }
+ break; // indx != 0 don't need to carry
+ } else if (threshold[j] != 0) { // if threshold is 0 shift is 0
+ out_ptr += dim_range[j]; // indx became 0 so reverse wrap around
+ }
+ }
+
+ // set isd_indx_skip and group_size for next iteration
+ if (indices[isd] < threshold[isd]) {
+ isd_indx_skip = threshold[isd] - indices[isd];
+ group_size = isd_indx_skip * isd_stride;
+ } else {
+ isd_indx_skip = dim_size[isd] - indices[isd];
+ group_size = isd_indx_skip * isd_stride;
+ }
+ }
+ };
+ // Shard
+ auto worker_threads = context->device()->tensorflow_cpu_worker_threads();
+ const int64 ave_group_size = dim_range[isd] / 2;
+ const int total_work = 2 * num_elements / std::max<int>(dim_range[isd], 1);
+ // 25000 - experimentally determined with float and bool types
+ const int cost_per_group = 25000 * sizeof(T) * ave_group_size;
+ Shard(worker_threads->num_threads, worker_threads->workers, total_work,
+ cost_per_group, std::move(work));
+}
+
+template <typename Device, typename T, typename Tshift, typename Taxis>
+class RollOp : public OpKernel {
+ public:
+ explicit RollOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+ void Compute(OpKernelContext* context) override {
+ // Grab the input tensor
+ const Tensor& input = context->input(0);
+ const Tensor& shift = context->input(1);
+ const Tensor& axis = context->input(2);
+
+ auto shift_flat = shift.flat<Tshift>();
+ auto axis_flat = axis.flat<Taxis>();
+
+ OP_REQUIRES(context, TensorShapeUtils::IsVectorOrHigher(input.shape()),
+ errors::InvalidArgument("input must be 1-D or higher"));
+ OP_REQUIRES(context, shift.shape().dims() <= 1,
+ errors::InvalidArgument(
+ "shift must be a scalar or a 1-D vector. Found: ",
+ shift.shape().DebugString()));
+ OP_REQUIRES(context, axis.shape().dims() <= 1,
+ errors::InvalidArgument(
+ "axis must be a scalar or a 1-D vector. Found: ",
+ axis.shape().DebugString()));
+ OP_REQUIRES(
+ context, shift.shape() == axis.shape(),
+ errors::InvalidArgument("shift and axis must have the same size"));
+ const int64 num_elements = input.NumElements();
+ const int num_shifts = static_cast<int>(shift_flat.size());
+ const int num_dims = input.dims();
+
+ // if there are any duplicate axes, shift_mod_sum will have the
+ // total modulo sum of shifts for each dimension
+ gtl::InlinedVector<int, 4> shift_mod_sum(num_dims, 0);
+ for (int i = 0; i < num_shifts; i++) {
+ const int axis = axis_flat(i);
+ OP_REQUIRES(context, axis < num_dims,
+ errors::InvalidArgument("axis ", axis, " is out of range"));
+ const int ds = std::max<int>(static_cast<int>(input.dim_size(axis)), 1);
+ const int sum = shift_mod_sum[axis] + static_cast<int>(shift_flat(i));
+ // modulo that works with negatives: ((x % y) + y) % y
+ shift_mod_sum[axis] = (sum % ds + ds) % ds;
+ }
+ // the size of each dimension
+ gtl::InlinedVector<int, 4> dim_size(num_dims);
+ // threshold[i] is the index at which the roll starts to wrap back to the front
+ gtl::InlinedVector<int, 4> threshold(num_dims);
+ // dim_range is the number of indices over in the flattened tensor
+ // you need to skip in order to make it over from one side of a dimension
+ // to the other. Used to make the shifts wrap around after a threshold.
+ gtl::InlinedVector<int64, 4> dim_range(num_dims);
+ int64 dim_size_prod = 1; // dimension size product
+ // inner shift dimension (innermost shifted dimension)
+ int64 isd = 0;
+ for (int i = num_dims - 1; i >= 0; i--) {
+ if (isd == 0 && shift_mod_sum[i] != 0) isd = i;
+ const int ds = std::max<int>(static_cast<int>(input.dim_size(i)), 1);
+ dim_size[i] = ds;
+ threshold[i] = (ds - shift_mod_sum[i]) % ds;
+ dim_size_prod *= static_cast<int64>(input.dim_size(i));
+ dim_range[i] = dim_size_prod;
+ }
+
+ Tensor* output = NULL;
+ OP_REQUIRES_OK(context,
+ context->allocate_output(0, input.shape(), &output));
+ auto input_flat = input.flat<T>().data();
+ auto output_flat = output->flat<T>().data();
+
+ if (std::is_same<Device, CPUDevice>::value) {
+ if (DataTypeCanUseMemcpy(DataTypeToEnum<T>::v())) {
+ // V2 copies memory in groups instead of element by element
+ DoRollWithMemcpy<T>(context, num_elements, num_dims, dim_size,
+ input_flat, output_flat, threshold, dim_range, isd);
+ } else {
+ // in case memcpy does not work for the current data type
+ DoRoll<T>(context, num_elements, num_dims, dim_size, input_flat,
+ output_flat, threshold, dim_range);
+ }
+ }
+ }
+};
+
+// Register the CPU kernels.
+#define REGISTER_CPU(type) \
+ REGISTER_KERNEL_BUILDER(Name("Roll") \
+ .Device(DEVICE_CPU) \
+ .TypeConstraint<type>("T") \
+ .TypeConstraint<int32>("Tshift") \
+ .TypeConstraint<int32>("Taxis"), \
+ RollOp<CPUDevice, type, int32, int32>) \
+ REGISTER_KERNEL_BUILDER(Name("Roll") \
+ .Device(DEVICE_CPU) \
+ .TypeConstraint<type>("T") \
+ .TypeConstraint<int64>("Tshift") \
+ .TypeConstraint<int32>("Taxis"), \
+ RollOp<CPUDevice, type, int64, int32>) \
+ REGISTER_KERNEL_BUILDER(Name("Roll") \
+ .Device(DEVICE_CPU) \
+ .TypeConstraint<type>("T") \
+ .TypeConstraint<int32>("Tshift") \
+ .TypeConstraint<int64>("Taxis"), \
+ RollOp<CPUDevice, type, int32, int64>) \
+ REGISTER_KERNEL_BUILDER(Name("Roll") \
+ .Device(DEVICE_CPU) \
+ .TypeConstraint<type>("T") \
+ .TypeConstraint<int64>("Tshift") \
+ .TypeConstraint<int64>("Taxis"), \
+ RollOp<CPUDevice, type, int64, int64>)
+
+TF_CALL_ALL_TYPES(REGISTER_CPU);
+#undef REGISTER_CPU
+} // namespace tensorflow
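The mapping the kernels above implement (the negative-safe modulo ((x % y) + y) % y and, per axis, output[(i + shift) mod dim] = input[i]) can be checked outside the kernel. The following standalone sketch is not part of the patch; RollReference is a hypothetical helper that reproduces the 3x5 case from roll_op_test.cc with plain loops instead of the sharded, memcpy-based kernels.

#include <cassert>
#include <vector>

// Rolls a row-major tensor of shape `dims` by `shift[d]` along each dimension
// d, i.e. output[(i + shift[d]) mod dims[d]] = input[i] for every axis.
std::vector<float> RollReference(const std::vector<float>& input,
                                 const std::vector<int>& dims,
                                 const std::vector<int>& shift) {
  std::vector<float> output(input.size());
  for (size_t i = 0; i < input.size(); ++i) {
    size_t rem = i;
    size_t out = 0;
    size_t stride = input.size();
    for (size_t d = 0; d < dims.size(); ++d) {
      stride /= dims[d];
      const int idx = static_cast<int>(rem / stride);  // index along dim d
      rem %= stride;
      // Modulo that also works for negative shifts: ((x % y) + y) % y.
      const int shifted = ((idx + shift[d]) % dims[d] + dims[d]) % dims[d];
      out += static_cast<size_t>(shifted) * stride;
    }
    output[out] = input[i];
  }
  return output;
}

int main() {
  std::vector<float> in(15);
  for (int i = 0; i < 15; ++i) in[i] = static_cast<float>(i);
  // Same case as RollOpTest.Simple_TwoD32: shape {3, 5}, shifts {2, -1}.
  const std::vector<float> out = RollReference(in, {3, 5}, {2, -1});
  const std::vector<float> expected = {6, 7, 8, 9, 5, 11, 12, 13, 14, 10,
                                       1, 2, 3, 4, 0};
  assert(out == expected);
  return 0;
}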
diff --git a/tensorflow/core/kernels/roll_op_test.cc b/tensorflow/core/kernels/roll_op_test.cc
new file mode 100644
index 0000000000..90b6f8d0f3
--- /dev/null
+++ b/tensorflow/core/kernels/roll_op_test.cc
@@ -0,0 +1,484 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <functional>
+#include <memory>
+
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/device_factory.h"
+#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+
+namespace tensorflow {
+namespace {
+
+class RollOpTest : public OpsTestBase {
+ protected:
+ void MakeOp(DataType data_type, DataType index_type) {
+ TF_ASSERT_OK(NodeDefBuilder("myop", "Roll")
+ .Input(FakeInput(data_type))
+ .Input(FakeInput(index_type))
+ .Input(FakeInput(index_type))
+ .Finalize(node_def()));
+ TF_ASSERT_OK(InitOp());
+ }
+};
+
+TEST_F(RollOpTest, ScalarIndices) {
+ MakeOp(DT_FLOAT, DT_INT32);
+
+ // Feed and run
+ AddInputFromArray<float>(TensorShape({5}), {0, 1, 2, 3, 4});
+ AddInputFromArray<int32>(TensorShape({}), {3});
+ AddInputFromArray<int32>(TensorShape({}), {0});
+ TF_ASSERT_OK(RunOpKernel());
+
+ // Check the output.
+ Tensor expected(allocator(), DT_FLOAT, TensorShape({5}));
+ test::FillValues<float>(&expected, {2, 3, 4, 0, 1});
+ test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(RollOpTest, ScalarIndices_NoMemcpy) {
+ MakeOp(DT_STRING, DT_INT32);
+
+ // Feed and run
+ AddInputFromArray<string>(TensorShape({5}), {"a", "b", "c", "d", "e"});
+ AddInputFromArray<int32>(TensorShape({}), {3});
+ AddInputFromArray<int32>(TensorShape({}), {0});
+ TF_ASSERT_OK(RunOpKernel());
+
+ // Check the output.
+ Tensor expected(allocator(), DT_STRING, TensorShape({5}));
+ test::FillValues<string>(&expected, {"c", "d", "e", "a", "b"});
+ test::ExpectTensorEqual<string>(expected, *GetOutput(0));
+}
+
+TEST_F(RollOpTest, ScalarIndices_Complex) {
+ MakeOp(DT_COMPLEX64, DT_INT32);
+
+ // Feed and run
+ AddInputFromArray<std::complex<float>>(
+ TensorShape({5}), {std::complex<float>(0, 10), std::complex<float>(1, 11),
+ std::complex<float>(2, 12), std::complex<float>(3, 13),
+ std::complex<float>(4, 14)});
+ AddInputFromArray<int32>(TensorShape({}), {3});
+ AddInputFromArray<int32>(TensorShape({}), {0});
+ TF_ASSERT_OK(RunOpKernel());
+
+ // Check the output.
+ Tensor expected(allocator(), DT_COMPLEX64, TensorShape({5}));
+ test::FillValues<std::complex<float>>(
+ &expected, {std::complex<float>(2, 12), std::complex<float>(3, 13),
+ std::complex<float>(4, 14), std::complex<float>(0, 10),
+ std::complex<float>(1, 11)});
+ test::ExpectTensorEqual<std::complex<float>>(expected, *GetOutput(0));
+}
+
+TEST_F(RollOpTest, Simple_TwoD32) {
+ MakeOp(DT_FLOAT, DT_INT32);
+
+ // Feed and run
+ AddInputFromArray<float>(TensorShape({3, 5}),
+ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14});
+ AddInputFromArray<int32>(TensorShape({2}), {2, -1});
+ AddInputFromArray<int32>(TensorShape({2}), {0, 1});
+ TF_ASSERT_OK(RunOpKernel());
+
+ // Check the output.
+ Tensor expected(allocator(), DT_FLOAT, TensorShape({3, 5}));
+ test::FillValues<float>(&expected,
+ {6, 7, 8, 9, 5, 11, 12, 13, 14, 10, 1, 2, 3, 4, 0});
+ test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(RollOpTest, Simple_TwoD32_NoMemcpy) {
+ MakeOp(DT_STRING, DT_INT32);
+
+ // Feed and run
+ AddInputFromArray<string>(TensorShape({3, 5}),
+ {"a", "b", "c", "d", "e", "f", "g", "h", "i", "j",
+ "k", "l", "m", "n", "o"});
+ AddInputFromArray<int32>(TensorShape({2}), {2, -1});
+ AddInputFromArray<int32>(TensorShape({2}), {0, 1});
+ TF_ASSERT_OK(RunOpKernel());
+
+ // Check the output.
+ Tensor expected(allocator(), DT_STRING, TensorShape({3, 5}));
+ test::FillValues<string>(&expected, {"g", "h", "i", "j", "f", "l", "m", "n",
+ "o", "k", "b", "c", "d", "e", "a"});
+ test::ExpectTensorEqual<string>(expected, *GetOutput(0));
+}
+
+TEST_F(RollOpTest, Simple_ThreeD32) {
+ MakeOp(DT_FLOAT, DT_INT32);
+
+ // Feed and run
+ AddInputFromArray<float>(TensorShape({2, 2, 3}),
+ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11});
+ AddInputFromArray<int32>(TensorShape({3}), {1, -1, -1});
+ AddInputFromArray<int32>(TensorShape({3}), {0, 1, 2});
+ TF_ASSERT_OK(RunOpKernel());
+
+ // Check the output.
+ Tensor expected(allocator(), DT_FLOAT, TensorShape({2, 2, 3}));
+ test::FillValues<float>(&expected, {10, 11, 9, 7, 8, 6, 4, 5, 3, 1, 2, 0});
+ test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(RollOpTest, Simple_ThreeD32_NoMemcpy) {
+ MakeOp(DT_STRING, DT_INT32);
+
+ // Feed and run
+ AddInputFromArray<string>(
+ TensorShape({2, 2, 3}),
+ {"a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l"});
+ AddInputFromArray<int32>(TensorShape({3}), {1, -1, -1});
+ AddInputFromArray<int32>(TensorShape({3}), {0, 1, 2});
+ TF_ASSERT_OK(RunOpKernel());
+
+ // Check the output.
+ Tensor expected(allocator(), DT_STRING, TensorShape({2, 2, 3}));
+ test::FillValues<string>(
+ &expected, {"k", "l", "j", "h", "i", "g", "e", "f", "d", "b", "c", "a"});
+ test::ExpectTensorEqual<string>(expected, *GetOutput(0));
+}
+
+TEST_F(RollOpTest, Simple_TwoD64) {
+ MakeOp(DT_FLOAT, DT_INT64);
+
+ // Feed and run
+ AddInputFromArray<float>(TensorShape({5, 3}),
+ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14});
+ AddInputFromArray<int64>(TensorShape({2}), {-1, 4});
+ AddInputFromArray<int64>(TensorShape({2}), {0, 1});
+ TF_ASSERT_OK(RunOpKernel());
+
+ // Check the output.
+ Tensor expected(allocator(), DT_FLOAT, TensorShape({5, 3}));
+ test::FillValues<float>(&expected,
+ {5, 3, 4, 8, 6, 7, 11, 9, 10, 14, 12, 13, 2, 0, 1});
+ test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(RollOpTest, Simple_TwoD64_NoMemcpy) {
+ MakeOp(DT_STRING, DT_INT64);
+
+ // Feed and run
+ AddInputFromArray<string>(TensorShape({5, 3}),
+ {"a", "b", "c", "d", "e", "f", "g", "h", "i", "j",
+ "k", "l", "m", "n", "o"});
+ AddInputFromArray<int64>(TensorShape({2}), {-1, 4});
+ AddInputFromArray<int64>(TensorShape({2}), {0, 1});
+ TF_ASSERT_OK(RunOpKernel());
+
+ // Check the output.
+ Tensor expected(allocator(), DT_STRING, TensorShape({5, 3}));
+ test::FillValues<string>(&expected, {"f", "d", "e", "i", "g", "h", "l", "j",
+ "k", "o", "m", "n", "c", "a", "b"});
+ test::ExpectTensorEqual<string>(expected, *GetOutput(0));
+}
+
+TEST_F(RollOpTest, Simple_ThreeD64) {
+ MakeOp(DT_FLOAT, DT_INT64);
+
+ // Feed and run
+ AddInputFromArray<float>(TensorShape({4, 1, 3}),
+ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11});
+ AddInputFromArray<int64>(TensorShape({3}), {4, 3, 2});
+ AddInputFromArray<int64>(TensorShape({3}), {0, 1, 2});
+ TF_ASSERT_OK(RunOpKernel());
+
+ // Check the output.
+ Tensor expected(allocator(), DT_FLOAT, TensorShape({4, 1, 3}));
+ test::FillValues<float>(&expected, {1, 2, 0, 4, 5, 3, 7, 8, 6, 10, 11, 9});
+ test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(RollOpTest, Simple_ThreeD64_NoMemcpy) {
+ MakeOp(DT_STRING, DT_INT64);
+
+ // Feed and run
+ AddInputFromArray<string>(
+ TensorShape({4, 1, 3}),
+ {"a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l"});
+ AddInputFromArray<int64>(TensorShape({3}), {4, 3, 2});
+ AddInputFromArray<int64>(TensorShape({3}), {0, 1, 2});
+ TF_ASSERT_OK(RunOpKernel());
+
+ // Check the output.
+ Tensor expected(allocator(), DT_STRING, TensorShape({4, 1, 3}));
+ test::FillValues<string>(
+ &expected, {"b", "c", "a", "e", "f", "d", "h", "i", "g", "k", "l", "j"});
+ test::ExpectTensorEqual<string>(expected, *GetOutput(0));
+}
+
+TEST_F(RollOpTest, ZeroShift_ThreeD32) {
+ MakeOp(DT_FLOAT, DT_INT32);
+
+ // Feed and run
+ AddInputFromArray<float>(TensorShape({2, 2, 3}),
+ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11});
+ AddInputFromArray<int32>(TensorShape({3}), {0, 0, 0});
+ AddInputFromArray<int32>(TensorShape({3}), {0, 1, 2});
+ TF_ASSERT_OK(RunOpKernel());
+
+ // Check the output.
+ Tensor expected(allocator(), DT_FLOAT, TensorShape({2, 2, 3}));
+ test::FillValues<float>(&expected, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11});
+ test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(RollOpTest, ZeroShift_ThreeD32_NoMemcpy) {
+ MakeOp(DT_STRING, DT_INT32);
+
+ // Feed and run
+ AddInputFromArray<string>(
+ TensorShape({2, 2, 3}),
+ {"a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l"});
+ AddInputFromArray<int32>(TensorShape({3}), {0, 0, 0});
+ AddInputFromArray<int32>(TensorShape({3}), {0, 1, 2});
+ TF_ASSERT_OK(RunOpKernel());
+
+ // Check the output.
+ Tensor expected(allocator(), DT_STRING, TensorShape({2, 2, 3}));
+ test::FillValues<string>(
+ &expected, {"a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l"});
+ test::ExpectTensorEqual<string>(expected, *GetOutput(0));
+}
+
+TEST_F(RollOpTest, ZeroSize_ThreeD32) {
+ MakeOp(DT_FLOAT, DT_INT32);
+
+ // Feed and run
+ AddInputFromArray<float>(TensorShape({5, 0, 0}), {});
+ AddInputFromArray<int32>(TensorShape({}), {1});
+ AddInputFromArray<int32>(TensorShape({}), {0});
+ TF_ASSERT_OK(RunOpKernel());
+
+ // Check the output.
+ Tensor expected(allocator(), DT_FLOAT, TensorShape({5, 0, 0}));
+ test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(RollOpTest, ZeroSize_ThreeD32_NoMemcpy) {
+ MakeOp(DT_STRING, DT_INT32);
+
+ // Feed and run
+ AddInputFromArray<string>(TensorShape({5, 0, 0}), {});
+ AddInputFromArray<int32>(TensorShape({}), {1});
+ AddInputFromArray<int32>(TensorShape({}), {0});
+ TF_ASSERT_OK(RunOpKernel());
+
+ // Check the output.
+ Tensor expected(allocator(), DT_STRING, TensorShape({5, 0, 0}));
+ test::ExpectTensorEqual<string>(expected, *GetOutput(0));
+}
+
+TEST_F(RollOpTest, OneSize_ThreeD32) {
+ MakeOp(DT_FLOAT, DT_INT32);
+
+ // Feed and run
+ AddInputFromArray<float>(TensorShape({1, 1, 1}), {5});
+ AddInputFromArray<int32>(TensorShape({}), {1});
+ AddInputFromArray<int32>(TensorShape({}), {0});
+ TF_ASSERT_OK(RunOpKernel());
+
+ // Check the output.
+ Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 1, 1}));
+ test::FillValues<float>(&expected, {5});
+ test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(RollOpTest, OneSize_ThreeD32_NoMemcpy) {
+ MakeOp(DT_STRING, DT_INT32);
+
+ // Feed and run
+ AddInputFromArray<string>(TensorShape({1, 1, 1}), {"a"});
+ AddInputFromArray<int32>(TensorShape({}), {1});
+ AddInputFromArray<int32>(TensorShape({}), {0});
+ TF_ASSERT_OK(RunOpKernel());
+
+ // Check the output.
+ Tensor expected(allocator(), DT_STRING, TensorShape({1, 1, 1}));
+ test::FillValues<string>(&expected, {"a"});
+ test::ExpectTensorEqual<string>(expected, *GetOutput(0));
+}
+
+TEST_F(RollOpTest, MultiShifts_TwoD32) {
+ MakeOp(DT_FLOAT, DT_INT32);
+
+ // Feed and run
+ AddInputFromArray<float>(TensorShape({3, 5}),
+ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14});
+ AddInputFromArray<int32>(TensorShape({4}), {-2, 2, -1, 1});
+ AddInputFromArray<int32>(TensorShape({4}), {1, 0, 0, 1});
+ TF_ASSERT_OK(RunOpKernel());
+
+ // Check the output.
+ Tensor expected(allocator(), DT_FLOAT, TensorShape({3, 5}));
+ test::FillValues<float>(&expected,
+ {11, 12, 13, 14, 10, 1, 2, 3, 4, 0, 6, 7, 8, 9, 5});
+ test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(RollOpTest, MultiShifts_TwoD32_NoMemcpy) {
+ MakeOp(DT_STRING, DT_INT32);
+
+ // Feed and run
+ AddInputFromArray<string>(TensorShape({3, 5}),
+ {"a", "b", "c", "d", "e", "f", "g", "h", "i", "j",
+ "k", "l", "m", "n", "o"});
+ AddInputFromArray<int32>(TensorShape({4}), {-2, 2, -1, 1});
+ AddInputFromArray<int32>(TensorShape({4}), {1, 0, 0, 1});
+ TF_ASSERT_OK(RunOpKernel());
+
+ // Check the output.
+ Tensor expected(allocator(), DT_STRING, TensorShape({3, 5}));
+ test::FillValues<string>(&expected, {"l", "m", "n", "o", "k", "b", "c", "d",
+ "e", "a", "g", "h", "i", "j", "f"});
+ test::ExpectTensorEqual<string>(expected, *GetOutput(0));
+}
+
+TEST_F(RollOpTest, Error_InputMustBeVectorOrHigher) {
+ MakeOp(DT_FLOAT, DT_INT32);
+
+ // Feed and run
+ AddInputFromArray<float>(TensorShape({}), {7});
+ AddInputFromArray<int32>(TensorShape({}), {1});
+ AddInputFromArray<int32>(TensorShape({}), {0});
+ Status s = RunOpKernel();
+ EXPECT_TRUE(StringPiece(s.ToString()).contains("input must be 1-D or higher"))
+ << s;
+}
+
+TEST_F(RollOpTest, Error_AxisMustBeScalarOrVector) {
+ MakeOp(DT_FLOAT, DT_INT32);
+
+ // Feed and run
+ AddInputFromArray<float>(TensorShape({2, 2}), {1, 2, 3, 4});
+ AddInputFromArray<int32>(TensorShape({}), {1});
+ AddInputFromArray<int32>(TensorShape({1, 2}), {0, 1});
+ Status s = RunOpKernel();
+ EXPECT_TRUE(StringPiece(s.ToString())
+ .contains("axis must be a scalar or a 1-D vector"))
+ << s;
+}
+
+TEST_F(RollOpTest, Error_ShiftMustBeScalarOrVector) {
+ MakeOp(DT_FLOAT, DT_INT32);
+
+ // Feed and run
+ AddInputFromArray<float>(TensorShape({2, 2}), {1, 2, 3, 4});
+ AddInputFromArray<int32>(TensorShape({1, 2}), {0, 1});
+ AddInputFromArray<int32>(TensorShape({}), {1});
+ Status s = RunOpKernel();
+ EXPECT_TRUE(StringPiece(s.ToString())
+ .contains("shift must be a scalar or a 1-D vector"))
+ << s;
+}
+
+TEST_F(RollOpTest, Error_ShiftAndAxisMustBeSameSize) {
+ MakeOp(DT_FLOAT, DT_INT32);
+
+ // Feed and run
+ AddInputFromArray<float>(TensorShape({2, 2}), {1, 2, 3, 4});
+ AddInputFromArray<int32>(TensorShape({1}), {1});
+ AddInputFromArray<int32>(TensorShape({2}), {0, 1});
+ Status s = RunOpKernel();
+ EXPECT_TRUE(StringPiece(s.ToString())
+ .contains("shift and axis must have the same size"))
+ << s;
+}
+
+TEST_F(RollOpTest, Error_AxisOutOfRange) {
+ MakeOp(DT_FLOAT, DT_INT32);
+
+ // Feed and run
+ AddInputFromArray<float>(TensorShape({4}), {1, 2, 3, 4});
+ AddInputFromArray<int32>(TensorShape({}), {1});
+ AddInputFromArray<int32>(TensorShape({}), {1});
+ Status s = RunOpKernel();
+ EXPECT_TRUE(StringPiece(s.ToString()).contains("is out of range")) << s;
+}
+
+// isd - (inner shift dimension) The innermost dimension to be shifted.
+// All outer dimensions will also be shifted for testing.
+static Graph* RollGraph(const TensorShape& shape, int isd) {
+ Graph* g = new Graph(OpRegistry::Global());
+ Tensor input(DT_FLOAT, shape);
+ input.flat<float>().setRandom();
+ const int dims = static_cast<int>(input.dims());
+ Tensor shift(DT_INT32, TensorShape({dims}));
+ for (int i = 0; i < dims; i++) {
+ // shift the inner shift dimension and all outer dimensions
+ shift.flat<int32>()(i) = (i <= isd) ? 2 : 0;
+ }
+ Tensor axis(DT_INT32, TensorShape({dims}));
+ for (int i = 0; i < dims; i++) {
+ axis.flat<int32>()(i) = i;
+ }
+ test::graph::Roll(g, test::graph::Constant(g, input),
+ test::graph::Constant(g, shift),
+ test::graph::Constant(g, axis));
+ return g;
+}
+
+#define BM_ROLL_OUTER(DEVICE) \
+ static void BM_##DEVICE##_roll_outer(int iters, int rows, int columns) { \
+ TensorShape shape{rows, columns}; \
+ const int64 num_items = static_cast<int64>(iters) * shape.num_elements(); \
+ testing::ItemsProcessed(num_items); \
+ testing::BytesProcessed(num_items * sizeof(float)); \
+ testing::UseRealTime(); \
+ test::Benchmark(#DEVICE, RollGraph(shape, 0)).Run(iters); \
+ } \
+ BENCHMARK(BM_##DEVICE##_roll_outer) \
+ ->ArgPair(256, 256) \
+ ->ArgPair(512, 512) \
+ ->ArgPair(1024, 1024) \
+ ->ArgPair(2048, 2048)
+
+#define BM_ROLL_ALL(DEVICE) \
+ static void BM_##DEVICE##_roll_all(int iters, int rows, int columns) { \
+ TensorShape shape{rows, columns}; \
+ const int64 num_items = static_cast<int64>(iters) * shape.num_elements(); \
+ testing::ItemsProcessed(num_items); \
+ testing::BytesProcessed(num_items * sizeof(float)); \
+ testing::UseRealTime(); \
+ test::Benchmark(#DEVICE, RollGraph(shape, 1)).Run(iters); \
+ } \
+ BENCHMARK(BM_##DEVICE##_roll_all) \
+ ->ArgPair(256, 256) \
+ ->ArgPair(512, 512) \
+ ->ArgPair(1024, 1024) \
+ ->ArgPair(2048, 2048)
+
+BM_ROLL_OUTER(cpu);
+BM_ROLL_ALL(cpu);
+} // namespace
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/unravel_index_op.cc b/tensorflow/core/kernels/unravel_index_op.cc
new file mode 100644
index 0000000000..a61272675b
--- /dev/null
+++ b/tensorflow/core/kernels/unravel_index_op.cc
@@ -0,0 +1,122 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#define EIGEN_USE_THREADS
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/types.h"
+
+namespace tensorflow {
+
+namespace {
+template <typename T>
+struct mod_op {
+ const T operator()(const T& a, const T& b) const { return a % b; }
+};
+} // namespace
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+
+template <typename Tidx>
+class UnravelIndexOp : public OpKernel {
+ public:
+ explicit UnravelIndexOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+ void Compute(OpKernelContext* ctx) override {
+ const Tensor& indices_tensor = ctx->input(0);
+ OP_REQUIRES(ctx,
+ TensorShapeUtils::IsVector(indices_tensor.shape()) ||
+ TensorShapeUtils::IsScalar(indices_tensor.shape()),
+ errors::InvalidArgument(
+ "The indices can only be scalar or vector, got \"",
+ indices_tensor.shape().DebugString(), "\""));
+
+ const Tensor& dims_tensor = ctx->input(1);
+ OP_REQUIRES(
+ ctx, TensorShapeUtils::IsVector(dims_tensor.shape()),
+ errors::InvalidArgument("The indices can only be 1-D, got \"",
+ dims_tensor.shape().DebugString(), "\""));
+
+ auto dims = dims_tensor.vec<Tidx>();
+
+ Eigen::array<bool, 1> reverse({true});
+
+ Tensor strides_tensor;
+ OP_REQUIRES_OK(ctx,
+ ctx->allocate_temp(DataTypeToEnum<Tidx>::value,
+ TensorShape({dims_tensor.NumElements()}),
+ &strides_tensor));
+
+ auto strides = strides_tensor.vec<Tidx>();
+ strides = dims.reverse(reverse)
+ .scan(0, Eigen::internal::ProdReducer<Tidx>(), false)
+ .reverse(reverse);
+
+ Tensor strides_shifted_tensor;
+ OP_REQUIRES_OK(ctx,
+ ctx->allocate_temp(DataTypeToEnum<Tidx>::value,
+ TensorShape({dims_tensor.NumElements()}),
+ &strides_shifted_tensor));
+
+ auto strides_shifted = strides_shifted_tensor.vec<Tidx>();
+ strides_shifted = dims.reverse(reverse)
+ .scan(0, Eigen::internal::ProdReducer<Tidx>(), true)
+ .reverse(reverse);
+
+ Tensor* output_tensor = nullptr;
+ if (TensorShapeUtils::IsScalar(indices_tensor.shape())) {
+ OP_REQUIRES_OK(
+ ctx, ctx->allocate_output(0, TensorShape({dims_tensor.NumElements()}),
+ &output_tensor));
+
+ auto output = output_tensor->vec<Tidx>();
+
+ output = output.constant(indices_tensor.scalar<Tidx>()());
+ output = output.binaryExpr(strides, mod_op<Tidx>()) / strides_shifted;
+ } else {
+ OP_REQUIRES_OK(
+ ctx, ctx->allocate_output(0,
+ TensorShape({dims_tensor.NumElements(),
+ indices_tensor.NumElements()}),
+ &output_tensor));
+
+ auto output = output_tensor->matrix<Tidx>();
+
+ Eigen::array<int64, 2> reshape{{dims_tensor.NumElements(), 1}};
+ Eigen::array<int64, 2> bcast({1, indices_tensor.NumElements()});
+ Eigen::array<int64, 2> indices_reshape{{1, indices_tensor.NumElements()}};
+ Eigen::array<int64, 2> indices_bcast({dims_tensor.NumElements(), 1});
+
+ output = indices_tensor.vec<Tidx>()
+ .reshape(indices_reshape)
+ .broadcast(indices_bcast);
+ output = output.binaryExpr(strides.reshape(reshape).broadcast(bcast),
+ mod_op<Tidx>()) /
+ strides_shifted.reshape(reshape).broadcast(bcast);
+ }
+ }
+};
+
+#define REGISTER_KERNEL(type) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("UnravelIndex").Device(DEVICE_CPU).TypeConstraint<type>("Tidx"), \
+ UnravelIndexOp<type>);
+TF_CALL_int32(REGISTER_KERNEL) TF_CALL_int64(REGISTER_KERNEL)
+#undef REGISTER_KERNEL
+
+} // namespace tensorflow
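The Eigen expressions above amount to output = (indices mod strides) / strides_shifted, where strides is the inclusive and strides_shifted the exclusive cumulative product of dims taken from the right. A minimal sketch of the same arithmetic with plain loops, not part of the patch and using the hypothetical name UnravelIndexReference, makes that easier to verify.

#include <cassert>
#include <cstdint>
#include <vector>

std::vector<int64_t> UnravelIndexReference(int64_t index,
                                           const std::vector<int64_t>& dims) {
  const int n = static_cast<int>(dims.size());
  std::vector<int64_t> strides(n), strides_shifted(n), out(n);
  int64_t prod = 1;
  for (int i = n - 1; i >= 0; --i) {
    strides_shifted[i] = prod;  // product of the dims to the right (exclusive)
    prod *= dims[i];
    strides[i] = prod;          // product including dims[i] (inclusive)
  }
  for (int i = 0; i < n; ++i) {
    out[i] = (index % strides[i]) / strides_shifted[i];
  }
  return out;
}

int main() {
  // Flat index 22 in a {3, 4, 5} tensor corresponds to coordinates (1, 0, 2).
  const std::vector<int64_t> coords = UnravelIndexReference(22, {3, 4, 5});
  assert((coords == std::vector<int64_t>{1, 0, 2}));
  return 0;
}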
diff --git a/tensorflow/core/lib/io/random_inputstream.cc b/tensorflow/core/lib/io/random_inputstream.cc
index 8b8c1392a1..09336e79cd 100644
--- a/tensorflow/core/lib/io/random_inputstream.cc
+++ b/tensorflow/core/lib/io/random_inputstream.cc
@@ -57,6 +57,43 @@ Status RandomAccessInputStream::ReadNBytes(int64 bytes_to_read,
return Status::OK();
}
+// To limit memory usage, this implementation of SkipNBytes() reads at most
+// 8MB at a time.
+static constexpr int64 kMaxSkipSize = 8 * 1024 * 1024;
+
+Status RandomAccessInputStream::SkipNBytes(int64 bytes_to_skip) {
+ if (bytes_to_skip < 0) {
+ return errors::InvalidArgument("Can't skip a negative number of bytes");
+ }
+ std::unique_ptr<char[]> scratch(new char[kMaxSkipSize]);
+ // Try to read 1 byte at the target position first; if the read succeeds,
+ // EOF has not been reached yet and we can simply advance the position.
+ if (bytes_to_skip > 0) {
+ StringPiece data;
+ Status s = file_->Read(pos_ + bytes_to_skip - 1, 1, &data, scratch.get());
+ if ((s.ok() || errors::IsOutOfRange(s)) && data.size() == 1) {
+ pos_ += bytes_to_skip;
+ return Status::OK();
+ }
+ }
+ // Read kMaxSkipSize bytes at a time until bytes_to_skip bytes are skipped.
+ while (bytes_to_skip > 0) {
+ int64 bytes_to_read = std::min<int64>(kMaxSkipSize, bytes_to_skip);
+ StringPiece data;
+ Status s = file_->Read(pos_, bytes_to_read, &data, scratch.get());
+ if (s.ok() || errors::IsOutOfRange(s)) {
+ pos_ += data.size();
+ } else {
+ return s;
+ }
+ if (data.size() < bytes_to_read) {
+ return errors::OutOfRange("reached end of file");
+ }
+ bytes_to_skip -= bytes_to_read;
+ }
+ return Status::OK();
+}
+
int64 RandomAccessInputStream::Tell() const { return pos_; }
} // namespace io
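SkipNBytes() above first probes a single byte at the target position; if that read succeeds, end of file has not been reached and the stream position can jump directly. Only otherwise does it fall back to reading kMaxSkipSize-sized chunks until the distance is covered or the file ends. The sketch below is not part of the patch; it mimics that control flow over an in-memory string, with a hypothetical SkipNBytes stand-in and kChunk playing the role of kMaxSkipSize.

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <string>

constexpr int64_t kChunk = 4;  // small stand-in for kMaxSkipSize

// Advances *pos by bytes_to_skip within `file`; returns false if end of file
// is reached first (mirroring the OutOfRange status in the real code).
bool SkipNBytes(const std::string& file, int64_t* pos, int64_t bytes_to_skip) {
  const int64_t size = static_cast<int64_t>(file.size());
  // Probe one byte at the target position; if it exists, jump directly.
  if (bytes_to_skip > 0 && *pos + bytes_to_skip - 1 < size) {
    *pos += bytes_to_skip;
    return true;
  }
  // Otherwise advance chunk by chunk until the data runs out.
  while (bytes_to_skip > 0) {
    const int64_t want = std::min<int64_t>(kChunk, bytes_to_skip);
    const int64_t got = std::min<int64_t>(want, size - *pos);
    *pos += got;
    if (got < want) return false;  // reached end of file
    bytes_to_skip -= want;
  }
  return true;
}

int main() {
  const std::string data = "0123456789";
  int64_t pos = 0;
  assert(SkipNBytes(data, &pos, 7) && pos == 7);    // within the file: jump
  assert(!SkipNBytes(data, &pos, 7) && pos == 10);  // only 3 bytes remained
  return 0;
}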
diff --git a/tensorflow/core/lib/io/random_inputstream.h b/tensorflow/core/lib/io/random_inputstream.h
index 09ebe9ba49..bdbdbd71ff 100644
--- a/tensorflow/core/lib/io/random_inputstream.h
+++ b/tensorflow/core/lib/io/random_inputstream.h
@@ -34,6 +34,8 @@ class RandomAccessInputStream : public InputStreamInterface {
Status ReadNBytes(int64 bytes_to_read, string* result) override;
+ Status SkipNBytes(int64 bytes_to_skip) override;
+
int64 Tell() const override;
Status Seek(int64 position) {
diff --git a/tensorflow/core/ops/array_ops.cc b/tensorflow/core/ops/array_ops.cc
index 87dfa77689..267ce88440 100644
--- a/tensorflow/core/ops/array_ops.cc
+++ b/tensorflow/core/ops/array_ops.cc
@@ -335,6 +335,13 @@ REGISTER_OP("Unpack")
return Status::OK();
});
+REGISTER_OP("UnravelIndex")
+ .Input("indices: Tidx")
+ .Input("dims: Tidx")
+ .Output("output: Tidx")
+ .Attr("Tidx: {int32, int64} = DT_INT32")
+ .SetShapeFn([](InferenceContext* c) { return Status::OK(); });
+
// --------------------------------------------------------------------------
// TODO(josh11b): Remove the >= 2 constraint, once we can rewrite the graph
// in the N == 1 case to remove the node.
diff --git a/tensorflow/core/ops/image_ops.cc b/tensorflow/core/ops/image_ops.cc
index ef2ac267cc..a62e2d782b 100644
--- a/tensorflow/core/ops/image_ops.cc
+++ b/tensorflow/core/ops/image_ops.cc
@@ -586,6 +586,17 @@ REGISTER_OP("NonMaxSuppression")
.Output("selected_indices: int32")
.Attr("iou_threshold: float = 0.5")
.SetShapeFn([](InferenceContext* c) {
+ // Get inputs and validate ranks.
+ ShapeHandle boxes;
+ TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 2, &boxes));
+ ShapeHandle scores;
+ TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &scores));
+ ShapeHandle max_output_size;
+ TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &max_output_size));
+ // The boxes input is a 2-D float Tensor of shape [num_boxes, 4].
+ DimensionHandle unused;
+ TF_RETURN_IF_ERROR(c->WithValue(c->Dim(boxes, 1), 4, &unused));
+
c->set_output(0, c->Vector(c->UnknownDim()));
return Status::OK();
});
@@ -597,6 +608,19 @@ REGISTER_OP("NonMaxSuppressionV2")
.Input("iou_threshold: float")
.Output("selected_indices: int32")
.SetShapeFn([](InferenceContext* c) {
+ // Get inputs and validate ranks.
+ ShapeHandle boxes;
+ TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 2, &boxes));
+ ShapeHandle scores;
+ TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &scores));
+ ShapeHandle max_output_size;
+ TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &max_output_size));
+ ShapeHandle iou_threshold;
+ TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &iou_threshold));
+ // The boxes input is a 2-D float Tensor of shape [num_boxes, 4].
+ DimensionHandle unused;
+ TF_RETURN_IF_ERROR(c->WithValue(c->Dim(boxes, 1), 4, &unused));
+
c->set_output(0, c->Vector(c->UnknownDim()));
return Status::OK();
});
diff --git a/tensorflow/core/ops/manip_ops.cc b/tensorflow/core/ops/manip_ops.cc
new file mode 100644
index 0000000000..95b4774fe6
--- /dev/null
+++ b/tensorflow/core/ops/manip_ops.cc
@@ -0,0 +1,33 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference.h"
+
+namespace tensorflow {
+
+// --------------------------------------------------------------------------
+REGISTER_OP("Roll")
+ .Input("input: T")
+ .Input("shift: Tshift")
+ .Input("axis: Taxis")
+ .Output("output: T")
+ .Attr("T: type")
+ .Attr("Tshift: {int32,int64}")
+ .Attr("Taxis: {int32,int64}")
+ .SetShapeFn(shape_inference::UnchangedShape);
+
+} // namespace tensorflow
diff --git a/tensorflow/core/ops/nn_ops.cc b/tensorflow/core/ops/nn_ops.cc
index 62661fe4bd..67481fd202 100644
--- a/tensorflow/core/ops/nn_ops.cc
+++ b/tensorflow/core/ops/nn_ops.cc
@@ -1818,7 +1818,7 @@ REGISTER_OP("_MklMaxPool")
.Input("input: T")
.Input("mkl_input: uint8")
.Output("output: T")
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
.Output("workspace: T")
#else
.Output("workspace: uint8")
@@ -1844,7 +1844,7 @@ REGISTER_OP("_MklMaxPoolGrad")
.Input("orig_input: T")
.Input("orig_output: T")
.Input("grad: T")
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
.Input("workspace: T")
#else
.Input("workspace: uint8")
@@ -1916,7 +1916,7 @@ REGISTER_OP("_MklLRN")
.Input("input: T")
.Input("mkl_input: uint8")
.Output("output: T")
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
.Output("workspace: T")
#else
.Output("workspace: uint8")
@@ -1944,7 +1944,7 @@ REGISTER_OP("_MklLRNGrad")
.Input("input_grads: T")
.Input("input_image: T")
.Input("output_image: T")
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
.Input("workspace: T")
#else
.Input("workspace: uint8")
diff --git a/tensorflow/core/platform/cpu_feature_guard.cc b/tensorflow/core/platform/cpu_feature_guard.cc
index b0d7b3a67a..b570658158 100644
--- a/tensorflow/core/platform/cpu_feature_guard.cc
+++ b/tensorflow/core/platform/cpu_feature_guard.cc
@@ -97,14 +97,17 @@ std::once_flag g_cpu_feature_guard_warn_once_flag;
void InfoAboutUnusedCPUFeatures() {
std::call_once(g_cpu_feature_guard_warn_once_flag, [] {
string missing_instructions;
-#ifdef PLATFORM_WINDOWS
+#if defined(_MSC_VER) && !defined(__clang__)
+
#ifndef __AVX__
CheckIfFeatureUnused(CPUFeature::AVX, "AVX", missing_instructions);
#endif // __AVX__
#ifndef __AVX2__
CheckIfFeatureUnused(CPUFeature::AVX2, "AVX2", missing_instructions);
#endif // __AVX2__
-#else // ifdef platform windows
+
+#else // if defined(_MSC_VER) && !defined(__clang__)
+
#ifndef __SSE__
CheckIfFeatureUnused(CPUFeature::SSE, "SSE", missing_instructions);
#endif // __SSE__
@@ -132,7 +135,7 @@ void InfoAboutUnusedCPUFeatures() {
#ifndef __FMA__
CheckIfFeatureUnused(CPUFeature::FMA, "FMA", missing_instructions);
#endif // __FMA__
-#endif // else of ifdef platform windows
+#endif // else of if defined(_MSC_VER) && !defined(__clang__)
if (!missing_instructions.empty()) {
LOG(INFO) << "Your CPU supports instructions that this TensorFlow "
<< "binary was not compiled to use:" << missing_instructions;
diff --git a/tensorflow/core/platform/profile_utils/cpu_utils.h b/tensorflow/core/platform/profile_utils/cpu_utils.h
index 2da20bb1b8..7b580c8bf6 100644
--- a/tensorflow/core/platform/profile_utils/cpu_utils.h
+++ b/tensorflow/core/platform/profile_utils/cpu_utils.h
@@ -42,7 +42,7 @@ namespace profile_utils {
class CpuUtils {
public:
// Constant for invalid frequency.
- // This value is returned when the furequency is not obtained somehow.
+ // This value is returned when the frequency cannot be obtained.
static constexpr int64 INVALID_FREQUENCY = -1;
static constexpr uint64 DUMMY_CYCLE_CLOCK = 1;
@@ -105,7 +105,7 @@ class CpuUtils {
static int64 GetCycleCounterFrequency();
#endif
- // Return micro secound per each clock
+ // Return microseconds per clock cycle.
// As this method caches the cpu frequency internally,
// the first call will incur overhead, but not subsequent calls.
static double GetMicroSecPerClock();
diff --git a/tensorflow/core/platform/s3/s3_file_system.cc b/tensorflow/core/platform/s3/s3_file_system.cc
index ebda3a2065..52bf0d4694 100644
--- a/tensorflow/core/platform/s3/s3_file_system.cc
+++ b/tensorflow/core/platform/s3/s3_file_system.cc
@@ -22,6 +22,7 @@ limitations under the License.
#include <aws/core/Aws.h>
#include <aws/core/config/AWSProfileConfigLoader.h>
#include <aws/core/utils/FileSystemUtils.h>
+#include <aws/core/utils/StringUtils.h>
#include <aws/core/utils/logging/AWSLogging.h>
#include <aws/core/utils/logging/LogSystemInterface.h>
#include <aws/s3/S3Client.h>
@@ -128,6 +129,15 @@ Aws::Client::ClientConfiguration& GetDefaultClientConfig() {
return cfg;
};
+void ShutdownClient(Aws::S3::S3Client* s3_client) {
+ if (s3_client != nullptr) {
+ delete s3_client;
+ Aws::SDKOptions options;
+ Aws::ShutdownAPI(options);
+ AWSLogSystem::ShutdownAWSLogging();
+ }
+}
+
Status ParseS3Path(const string& fname, bool empty_object_ok, string* bucket,
string* object) {
if (!bucket || !object) {
@@ -155,12 +165,12 @@ Status ParseS3Path(const string& fname, bool empty_object_ok, string* bucket,
class S3RandomAccessFile : public RandomAccessFile {
public:
- S3RandomAccessFile(const string& bucket, const string& object)
- : bucket_(bucket), object_(object) {}
+ S3RandomAccessFile(const string& bucket, const string& object,
+ std::shared_ptr<Aws::S3::S3Client> s3_client)
+ : bucket_(bucket), object_(object), s3_client_(s3_client) {}
Status Read(uint64 offset, size_t n, StringPiece* result,
char* scratch) const override {
- Aws::S3::S3Client s3Client(GetDefaultClientConfig());
Aws::S3::Model::GetObjectRequest getObjectRequest;
getObjectRequest.WithBucket(bucket_.c_str()).WithKey(object_.c_str());
string bytes = strings::StrCat("bytes=", offset, "-", offset + n - 1);
@@ -168,7 +178,7 @@ class S3RandomAccessFile : public RandomAccessFile {
getObjectRequest.SetResponseStreamFactory([]() {
return Aws::New<Aws::StringStream>(kS3FileSystemAllocationTag);
});
- auto getObjectOutcome = s3Client.GetObject(getObjectRequest);
+ auto getObjectOutcome = this->s3_client_->GetObject(getObjectRequest);
if (!getObjectOutcome.IsSuccess()) {
n = 0;
*result = StringPiece(scratch, n);
@@ -186,13 +196,16 @@ class S3RandomAccessFile : public RandomAccessFile {
private:
string bucket_;
string object_;
+ std::shared_ptr<Aws::S3::S3Client> s3_client_;
};
class S3WritableFile : public WritableFile {
public:
- S3WritableFile(const string& bucket, const string& object)
+ S3WritableFile(const string& bucket, const string& object,
+ std::shared_ptr<Aws::S3::S3Client> s3_client)
: bucket_(bucket),
object_(object),
+ s3_client_(s3_client),
sync_needed_(true),
outfile_(Aws::MakeShared<Aws::Utils::TempFile>(
kS3FileSystemAllocationTag, "/tmp/s3_filesystem_XXXXXX",
@@ -231,17 +244,13 @@ class S3WritableFile : public WritableFile {
if (!sync_needed_) {
return Status::OK();
}
- Aws::Client::ClientConfiguration clientConfig = GetDefaultClientConfig();
- clientConfig.connectTimeoutMs = 300000;
- clientConfig.requestTimeoutMs = 600000;
- Aws::S3::S3Client s3Client(clientConfig);
Aws::S3::Model::PutObjectRequest putObjectRequest;
putObjectRequest.WithBucket(bucket_.c_str()).WithKey(object_.c_str());
long offset = outfile_->tellp();
outfile_->seekg(0);
putObjectRequest.SetBody(outfile_);
putObjectRequest.SetContentLength(offset);
- auto putObjectOutcome = s3Client.PutObject(putObjectRequest);
+ auto putObjectOutcome = this->s3_client_->PutObject(putObjectRequest);
outfile_->clear();
outfile_->seekp(offset);
if (!putObjectOutcome.IsSuccess()) {
@@ -256,6 +265,7 @@ class S3WritableFile : public WritableFile {
private:
string bucket_;
string object_;
+ std::shared_ptr<Aws::S3::S3Client> s3_client_;
bool sync_needed_;
std::shared_ptr<Aws::Utils::TempFile> outfile_;
};
@@ -274,31 +284,46 @@ class S3ReadOnlyMemoryRegion : public ReadOnlyMemoryRegion {
} // namespace
-S3FileSystem::S3FileSystem() {
- AWSLogSystem::InitializeAWSLogging();
-
- Aws::SDKOptions options;
- options.cryptoOptions.sha256Factory_create_fn = []() {
- return Aws::MakeShared<S3SHA256Factory>(S3CryptoAllocationTag);
- };
- options.cryptoOptions.sha256HMACFactory_create_fn = []() {
- return Aws::MakeShared<S3SHA256HmacFactory>(S3CryptoAllocationTag);
- };
- Aws::InitAPI(options);
-}
-
-S3FileSystem::~S3FileSystem() {
- Aws::SDKOptions options;
- Aws::ShutdownAPI(options);
+S3FileSystem::S3FileSystem()
+ : s3_client_(nullptr, ShutdownClient), client_lock_() {}
+
+S3FileSystem::~S3FileSystem() {}
+
+// Initializes s3_client_, if needed, and returns it.
+std::shared_ptr<Aws::S3::S3Client> S3FileSystem::GetS3Client() {
+ std::lock_guard<mutex> lock(this->client_lock_);
+
+ if (this->s3_client_.get() == nullptr) {
+ AWSLogSystem::InitializeAWSLogging();
+
+ Aws::SDKOptions options;
+ options.cryptoOptions.sha256Factory_create_fn = []() {
+ return Aws::MakeShared<S3SHA256Factory>(S3CryptoAllocationTag);
+ };
+ options.cryptoOptions.sha256HMACFactory_create_fn = []() {
+ return Aws::MakeShared<S3SHA256HmacFactory>(S3CryptoAllocationTag);
+ };
+ Aws::InitAPI(options);
+
+ // The creation of S3Client disables virtual addressing:
+ // S3Client(clientConfiguration, signPayloads, useVirtualAdressing = true)
+ // The purpose is to address the issue encountered when there is an `.`
+ // in the bucket name. Due to TLS hostname validation or DNS rules,
+ // the bucket may not be resolved. Disabling of virtual addressing
+ // should address the issue. See GitHub issue 16397 for details.
+ this->s3_client_ = std::shared_ptr<Aws::S3::S3Client>(new Aws::S3::S3Client(
+ GetDefaultClientConfig(),
+ Aws::Client::AWSAuthV4Signer::PayloadSigningPolicy::Never, false));
+ }
- AWSLogSystem::ShutdownAWSLogging();
+ return this->s3_client_;
}
Status S3FileSystem::NewRandomAccessFile(
const string& fname, std::unique_ptr<RandomAccessFile>* result) {
string bucket, object;
TF_RETURN_IF_ERROR(ParseS3Path(fname, false, &bucket, &object));
- result->reset(new S3RandomAccessFile(bucket, object));
+ result->reset(new S3RandomAccessFile(bucket, object, this->GetS3Client()));
return Status::OK();
}
@@ -306,7 +331,7 @@ Status S3FileSystem::NewWritableFile(const string& fname,
std::unique_ptr<WritableFile>* result) {
string bucket, object;
TF_RETURN_IF_ERROR(ParseS3Path(fname, false, &bucket, &object));
- result->reset(new S3WritableFile(bucket, object));
+ result->reset(new S3WritableFile(bucket, object, this->GetS3Client()));
return Status::OK();
}
@@ -321,7 +346,7 @@ Status S3FileSystem::NewAppendableFile(const string& fname,
string bucket, object;
TF_RETURN_IF_ERROR(ParseS3Path(fname, false, &bucket, &object));
- result->reset(new S3WritableFile(bucket, object));
+ result->reset(new S3WritableFile(bucket, object, this->GetS3Client()));
while (true) {
status = reader->Read(offset, kS3ReadAppendableFileBufferSize, &read_chunk,
@@ -372,7 +397,6 @@ Status S3FileSystem::GetChildren(const string& dir,
prefix.push_back('/');
}
- Aws::S3::S3Client s3Client(GetDefaultClientConfig());
Aws::S3::Model::ListObjectsRequest listObjectsRequest;
listObjectsRequest.WithBucket(bucket.c_str())
.WithPrefix(prefix.c_str())
@@ -383,7 +407,8 @@ Status S3FileSystem::GetChildren(const string& dir,
Aws::S3::Model::ListObjectsResult listObjectsResult;
do {
- auto listObjectsOutcome = s3Client.ListObjects(listObjectsRequest);
+ auto listObjectsOutcome =
+ this->GetS3Client()->ListObjects(listObjectsRequest);
if (!listObjectsOutcome.IsSuccess()) {
string error = strings::StrCat(
listObjectsOutcome.GetError().GetExceptionName().c_str(), ": ",
@@ -417,11 +442,10 @@ Status S3FileSystem::Stat(const string& fname, FileStatistics* stats) {
string bucket, object;
TF_RETURN_IF_ERROR(ParseS3Path(fname, true, &bucket, &object));
- Aws::S3::S3Client s3Client(GetDefaultClientConfig());
if (object.empty()) {
Aws::S3::Model::HeadBucketRequest headBucketRequest;
headBucketRequest.WithBucket(bucket.c_str());
- auto headBucketOutcome = s3Client.HeadBucket(headBucketRequest);
+ auto headBucketOutcome = this->GetS3Client()->HeadBucket(headBucketRequest);
if (!headBucketOutcome.IsSuccess()) {
string error = strings::StrCat(
headBucketOutcome.GetError().GetExceptionName().c_str(), ": ",
@@ -439,7 +463,7 @@ Status S3FileSystem::Stat(const string& fname, FileStatistics* stats) {
headObjectRequest.WithBucket(bucket.c_str()).WithKey(object.c_str());
headObjectRequest.SetResponseStreamFactory(
[]() { return Aws::New<Aws::StringStream>(kS3FileSystemAllocationTag); });
- auto headObjectOutcome = s3Client.HeadObject(headObjectRequest);
+ auto headObjectOutcome = this->GetS3Client()->HeadObject(headObjectRequest);
if (headObjectOutcome.IsSuccess()) {
stats->length = headObjectOutcome.GetResult().GetContentLength();
stats->is_directory = 0;
@@ -457,7 +481,8 @@ Status S3FileSystem::Stat(const string& fname, FileStatistics* stats) {
.WithMaxKeys(1);
listObjectsRequest.SetResponseStreamFactory(
[]() { return Aws::New<Aws::StringStream>(kS3FileSystemAllocationTag); });
- auto listObjectsOutcome = s3Client.ListObjects(listObjectsRequest);
+ auto listObjectsOutcome =
+ this->GetS3Client()->ListObjects(listObjectsRequest);
if (listObjectsOutcome.IsSuccess()) {
if (listObjectsOutcome.GetResult().GetContents().size() > 0) {
stats->length = 0;
@@ -475,11 +500,11 @@ Status S3FileSystem::DeleteFile(const string& fname) {
string bucket, object;
TF_RETURN_IF_ERROR(ParseS3Path(fname, false, &bucket, &object));
- Aws::S3::S3Client s3Client(GetDefaultClientConfig());
Aws::S3::Model::DeleteObjectRequest deleteObjectRequest;
deleteObjectRequest.WithBucket(bucket.c_str()).WithKey(object.c_str());
- auto deleteObjectOutcome = s3Client.DeleteObject(deleteObjectRequest);
+ auto deleteObjectOutcome =
+ this->GetS3Client()->DeleteObject(deleteObjectRequest);
if (!deleteObjectOutcome.IsSuccess()) {
string error = strings::StrCat(
deleteObjectOutcome.GetError().GetExceptionName().c_str(), ": ",
@@ -494,10 +519,9 @@ Status S3FileSystem::CreateDir(const string& dirname) {
TF_RETURN_IF_ERROR(ParseS3Path(dirname, true, &bucket, &object));
if (object.empty()) {
- Aws::S3::S3Client s3Client(GetDefaultClientConfig());
Aws::S3::Model::HeadBucketRequest headBucketRequest;
headBucketRequest.WithBucket(bucket.c_str());
- auto headBucketOutcome = s3Client.HeadBucket(headBucketRequest);
+ auto headBucketOutcome = this->GetS3Client()->HeadBucket(headBucketRequest);
if (!headBucketOutcome.IsSuccess()) {
return errors::NotFound("The bucket ", bucket, " was not found.");
}
@@ -517,7 +541,6 @@ Status S3FileSystem::DeleteDir(const string& dirname) {
string bucket, object;
TF_RETURN_IF_ERROR(ParseS3Path(dirname, false, &bucket, &object));
- Aws::S3::S3Client s3Client(GetDefaultClientConfig());
string prefix = object;
if (prefix.back() != '/') {
prefix.push_back('/');
@@ -528,7 +551,8 @@ Status S3FileSystem::DeleteDir(const string& dirname) {
.WithMaxKeys(2);
listObjectsRequest.SetResponseStreamFactory(
[]() { return Aws::New<Aws::StringStream>(kS3FileSystemAllocationTag); });
- auto listObjectsOutcome = s3Client.ListObjects(listObjectsRequest);
+ auto listObjectsOutcome =
+ this->GetS3Client()->ListObjects(listObjectsRequest);
if (listObjectsOutcome.IsSuccess()) {
auto contents = listObjectsOutcome.GetResult().GetContents();
if (contents.size() > 1 ||
@@ -568,8 +592,6 @@ Status S3FileSystem::RenameFile(const string& src, const string& target) {
}
}
- Aws::S3::S3Client s3Client(GetDefaultClientConfig());
-
Aws::S3::Model::CopyObjectRequest copyObjectRequest;
Aws::S3::Model::DeleteObjectRequest deleteObjectRequest;
@@ -582,7 +604,8 @@ Status S3FileSystem::RenameFile(const string& src, const string& target) {
Aws::S3::Model::ListObjectsResult listObjectsResult;
do {
- auto listObjectsOutcome = s3Client.ListObjects(listObjectsRequest);
+ auto listObjectsOutcome =
+ this->GetS3Client()->ListObjects(listObjectsRequest);
if (!listObjectsOutcome.IsSuccess()) {
string error = strings::StrCat(
listObjectsOutcome.GetError().GetExceptionName().c_str(), ": ",
@@ -595,13 +618,15 @@ Status S3FileSystem::RenameFile(const string& src, const string& target) {
Aws::String src_key = object.GetKey();
Aws::String target_key = src_key;
target_key.replace(0, src_object.length(), target_object.c_str());
- Aws::String source = Aws::String(src_bucket.c_str()) + "/" + src_key;
+ Aws::String source = Aws::String(src_bucket.c_str()) + "/" +
+ Aws::Utils::StringUtils::URLEncode(src_key.c_str());
copyObjectRequest.SetBucket(target_bucket.c_str());
copyObjectRequest.SetKey(target_key);
copyObjectRequest.SetCopySource(source);
- auto copyObjectOutcome = s3Client.CopyObject(copyObjectRequest);
+ auto copyObjectOutcome =
+ this->GetS3Client()->CopyObject(copyObjectRequest);
if (!copyObjectOutcome.IsSuccess()) {
string error = strings::StrCat(
copyObjectOutcome.GetError().GetExceptionName().c_str(), ": ",
@@ -612,7 +637,8 @@ Status S3FileSystem::RenameFile(const string& src, const string& target) {
deleteObjectRequest.SetBucket(src_bucket.c_str());
deleteObjectRequest.SetKey(src_key.c_str());
- auto deleteObjectOutcome = s3Client.DeleteObject(deleteObjectRequest);
+ auto deleteObjectOutcome =
+ this->GetS3Client()->DeleteObject(deleteObjectRequest);
if (!deleteObjectOutcome.IsSuccess()) {
string error = strings::StrCat(
deleteObjectOutcome.GetError().GetExceptionName().c_str(), ": ",
diff --git a/tensorflow/core/platform/s3/s3_file_system.h b/tensorflow/core/platform/s3/s3_file_system.h
index 31ba3cecc5..31264be621 100644
--- a/tensorflow/core/platform/s3/s3_file_system.h
+++ b/tensorflow/core/platform/s3/s3_file_system.h
@@ -16,7 +16,9 @@ limitations under the License.
#ifndef TENSORFLOW_CONTRIB_S3_S3_FILE_SYSTEM_H_
#define TENSORFLOW_CONTRIB_S3_S3_FILE_SYSTEM_H_
+#include <aws/s3/S3Client.h>
#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/mutex.h"
namespace tensorflow {
@@ -53,6 +55,26 @@ class S3FileSystem : public FileSystem {
Status GetFileSize(const string& fname, uint64* size) override;
Status RenameFile(const string& src, const string& target) override;
+
+ private:
+ // Returns the member S3 client, initializing it if needed.
+ // When the client accesses an object in S3, e.g.,
+ // s3://bucket-name/path/to/object
+ // its behavior can be controlled by several environment
+ // variables.
+ // By default S3 uses the regional endpoint, with the region
+ // controlled by `AWS_REGION`. The endpoint can be overridden
+ // explicitly with `S3_ENDPOINT`. S3 uses HTTPS by default;
+ // if S3_USE_HTTPS=0 is specified, HTTP is used instead. Also,
+ // S3_VERIFY_SSL=0 disables SSL verification when
+ // HTTPS is used.
+ // This S3 client does not support virtual hosted-style addressing
+ // for a bucket.
+ std::shared_ptr<Aws::S3::S3Client> GetS3Client();
+
+ std::shared_ptr<Aws::S3::S3Client> s3_client_;
+ // Lock held when checking for s3_client_ initialization.
+ mutex client_lock_;
};
} // namespace tensorflow
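The change above replaces eager construction of the AWS client in the S3FileSystem constructor with lazy, mutex-guarded initialization in GetS3Client(), so SDK setup only happens the first time the filesystem is used. A generic sketch of that pattern, not part of the patch and using a stand-in Client type instead of Aws::S3::S3Client, looks like this:

#include <memory>
#include <mutex>

struct Client {};  // stand-in for an expensive-to-construct SDK client

class LazyClientHolder {
 public:
  std::shared_ptr<Client> GetClient() {
    // Serialize the initialization check, like client_lock_ above.
    std::lock_guard<std::mutex> lock(mu_);
    if (client_ == nullptr) {
      // One-time SDK setup would run here, before the first client is built.
      client_ = std::make_shared<Client>();
    }
    return client_;
  }

 private:
  std::mutex mu_;
  std::shared_ptr<Client> client_;
};

Returning a shared_ptr rather than a raw pointer lets file objects such as S3RandomAccessFile and S3WritableFile keep the client alive for as long as they need it, independent of the filesystem object's lifetime.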
diff --git a/tensorflow/core/platform/s3/s3_file_system_test.cc b/tensorflow/core/platform/s3/s3_file_system_test.cc
index 0b42f5fcec..d4411d9865 100644
--- a/tensorflow/core/platform/s3/s3_file_system_test.cc
+++ b/tensorflow/core/platform/s3/s3_file_system_test.cc
@@ -130,6 +130,8 @@ TEST_F(S3FileSystemTest, NewReadOnlyMemoryRegionFromFile) {
TEST_F(S3FileSystemTest, FileExists) {
const string fname = TmpDir("FileExists");
+ // Ensure the file doesn't yet exist.
+ TF_ASSERT_OK(s3fs.DeleteFile(fname));
EXPECT_EQ(error::Code::NOT_FOUND, s3fs.FileExists(fname).code());
TF_ASSERT_OK(WriteString(fname, "test"));
TF_EXPECT_OK(s3fs.FileExists(fname));
diff --git a/tensorflow/core/platform/windows/cpu_info.h b/tensorflow/core/platform/windows/cpu_info.h
index d6e78dbc8f..f20939d3c0 100644
--- a/tensorflow/core/platform/windows/cpu_info.h
+++ b/tensorflow/core/platform/windows/cpu_info.h
@@ -22,8 +22,10 @@ limitations under the License.
// Byte order defines provided by gcc. MSVC doesn't define those so
// we define them here.
// We assume that all windows platform out there are little endian.
+#if defined(_MSC_VER) && !defined(__clang__)
#define __ORDER_LITTLE_ENDIAN__ 0x4d2
#define __ORDER_BIG_ENDIAN__ 0x10e1
#define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__
+#endif
#endif // TENSORFLOW_PLATFORM_WINDOWS_CPU_INFO_H_
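As a side note, a small hedged example (the helper `ToLittleEndian32` is hypothetical, not part of the patch) of code that can branch on these macros regardless of whether they come from gcc/clang predefines or from the MSVC fallback guarded above:

```
// Sketch only: downstream code can test __BYTE_ORDER__ uniformly.
#include <cstdint>

static inline uint32_t ToLittleEndian32(uint32_t v) {
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
  return v;  // Windows targets are assumed little endian, so this is a no-op.
#else
  return ((v & 0x000000FFu) << 24) | ((v & 0x0000FF00u) << 8) |
         ((v & 0x00FF0000u) >> 8) | ((v & 0xFF000000u) >> 24);
#endif
}
```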
diff --git a/tensorflow/core/profiler/README.md b/tensorflow/core/profiler/README.md
index 460f935e4a..57d76eb4cb 100644
--- a/tensorflow/core/profiler/README.md
+++ b/tensorflow/core/profiler/README.md
@@ -240,8 +240,9 @@ Open a Chrome browser, enter URL chrome://tracing and load the timeline file.
# can also generate memory profile using `-select bytes`
tfprof> code -select accelerator_micros -max_depth 100000 -output pprof:outfile=<filename> -trim_name_regexes .*apply_op.*
-# Use pprof to visualize the generated file.
-pprof -png --nodecount=100 --sample_index=1 <filename>
+# Use google-pprof, from the google-perftools package, to visualize the generated file.
+# On Ubuntu you can install it with `apt-get install google-perftools`.
+google-pprof --pdf --nodecount=100 <filename>
```
![PprofGraph](g3doc/pprof.jpg)
diff --git a/tensorflow/core/profiler/internal/tfprof_stats.h b/tensorflow/core/profiler/internal/tfprof_stats.h
index 0790cb0ca6..db148c936c 100644
--- a/tensorflow/core/profiler/internal/tfprof_stats.h
+++ b/tensorflow/core/profiler/internal/tfprof_stats.h
@@ -83,7 +83,7 @@ class TFStats {
const MultiGraphNodeProto& ShowMultiGraphNode(const string& cmd,
const Options& opts) const;
- // A a (partial) graph to existing graph.
+ // Add a (partial) graph to existing graph.
void AddGraph(std::unique_ptr<GraphDef> graph);
// Add a step of run time meta data.
@@ -118,7 +118,7 @@ class TFStats {
MultiGraphNodeProto empty_multi_graph_node_;
std::map<int64, string> id_to_string_;
- // Graph nodes covered by RunMetdata, that is traced with run time stats.
+ // Graph nodes covered by RunMetadata, that is traced with run time stats.
std::set<int64> covered_nodes_;
};
diff --git a/tensorflow/core/profiler/profiler.cc b/tensorflow/core/profiler/profiler.cc
index 2cc212d589..808e3c853b 100644
--- a/tensorflow/core/profiler/profiler.cc
+++ b/tensorflow/core/profiler/profiler.cc
@@ -206,8 +206,12 @@ int Run(int argc, char** argv) {
"graph_path,op_log_path,run_meta_path\n");
std::unique_ptr<GraphDef> graph(new GraphDef());
if (!FLAGS_graph_path.empty()) {
- TF_CHECK_OK(
- ReadProtoFile(Env::Default(), FLAGS_graph_path, graph.get(), false));
+ s = ReadProtoFile(Env::Default(), FLAGS_graph_path, graph.get(), false);
+ if (!s.ok()) {
+ fprintf(stderr, "Failed to read graph_path: %s\n",
+ s.ToString().c_str());
+ return 1;
+ }
}
std::unique_ptr<OpLogProto> op_log(new OpLogProto());
diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h
index 67da7bf452..b02f899b87 100644
--- a/tensorflow/core/public/version.h
+++ b/tensorflow/core/public/version.h
@@ -24,7 +24,7 @@ limitations under the License.
// TF_VERSION_SUFFIX is non-empty for pre-releases (e.g. "-alpha", "-alpha.1",
// "-beta", "-rc", "-rc.1")
-#define TF_VERSION_SUFFIX "-rc1"
+#define TF_VERSION_SUFFIX ""
#define TF_STR_HELPER(x) #x
#define TF_STR(x) TF_STR_HELPER(x)
diff --git a/tensorflow/core/util/mkl_util.h b/tensorflow/core/util/mkl_util.h
index 864e7e39c2..db4c5c35e3 100644
--- a/tensorflow/core/util/mkl_util.h
+++ b/tensorflow/core/util/mkl_util.h
@@ -35,7 +35,7 @@ limitations under the License.
#include "tensorflow/core/util/padding.h"
#include "tensorflow/core/util/tensor_format.h"
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
#include "mkldnn.hpp"
using mkldnn::engine;
@@ -325,7 +325,7 @@ class MklShape {
nullptr; // TF dimension corresponding to this MKL dimension
};
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
// Forward decl
TensorFormat MklDnnDataFormatToTFDataFormat(memory::format format);
@@ -660,7 +660,7 @@ class MklDnnShape {
typedef std::vector<MklShape> MklShapeList;
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
typedef std::vector<MklDnnShape> MklDnnShapeList;
#endif
@@ -674,7 +674,7 @@ inline bool AreAllMklTensors(const MklShapeList& shapes) {
return true;
}
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
template <typename T>
inline Tensor ConvertMklToTF(OpKernelContext* context, const Tensor& mkl_tensor,
const MklShape& mkl_shape) {
@@ -725,7 +725,7 @@ inline void GetMklShape(OpKernelContext* ctext, int n, MklShape* mklshape) {
sizeof(uint8));
}
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
inline void GetMklShape(OpKernelContext* ctext, int n, MklDnnShape* mklshape) {
mklshape->DeSerializeMklDnnShape(
ctext->input(GetTensorMetaDataIndex(n, ctext->num_inputs()))
@@ -749,7 +749,7 @@ inline void GetMklInputList(OpKernelContext* ctext, StringPiece name,
ctext->input_list(name, input_tensors);
}
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
inline void GetMklShapeList(OpKernelContext* ctext, StringPiece name,
MklShapeList* mkl_shapes) {
@@ -779,7 +779,7 @@ inline void GetMklShapeList(OpKernelContext* ctext, StringPiece name,
#endif
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
/// Get shape of input tensor pointed by 'input_idx' in TensorShape format.
/// If the input tensor is in MKL layout, then obtains TensorShape from
/// MklShape.
@@ -814,7 +814,7 @@ inline void AllocateOutputSetMklShape(OpKernelContext* ctext, int n,
second_tensor->flat<uint8>().size() * sizeof(uint8));
}
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
// Allocate the second output tensor that will contain
// the MKL shape serialized
inline void AllocateOutputSetMklShape(OpKernelContext* ctext, int n,
@@ -851,7 +851,7 @@ inline void AllocateOutputSetMklShape(OpKernelContext* ctext, int n,
second_tensor->flat<uint8>().size() * sizeof(uint8));
}
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
// Allocate the output tensor, create a second output tensor that will contain
// the MKL shape serialized
inline void AllocateOutputSetMklShape(OpKernelContext* ctext, int n,
@@ -875,7 +875,7 @@ inline void AllocateOutputSetMklShape(OpKernelContext* ctext, int n,
// Allocates a temp tensor and returns the data buffer for temporary storage.
// Currently
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
template <typename T>
inline void AllocTmpBuffer(OpKernelContext* context, Tensor* tensor_out,
const memory::primitive_desc& pd, void** buf_out) {
@@ -994,7 +994,7 @@ inline void CopyMklTensorInToOut(OpKernelContext* context, int idx_in,
context->set_output(idx_meta_out, meta_output);
}
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
inline void CopyTfTensorInToOutWithShape(OpKernelContext* context, int idx_in,
int idx_out,
const TensorShape& shape) {
@@ -1032,7 +1032,7 @@ inline void CopyTfTensorInToOutWithShape(OpKernelContext* context, int idx_in,
}
#endif
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
inline void ForwardTfTensorInToOut(OpKernelContext* context, int idx_in,
int idx_out) {
@@ -1090,7 +1090,7 @@ inline void ForwardMklTensorInToOut(OpKernelContext* context, int idx_in,
}
}
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
inline void ForwardMklTensorInToOutWithMklShape(OpKernelContext* context,
int idx_in, int idx_out,
const MklDnnShape& mkl_shape) {
@@ -1132,7 +1132,7 @@ inline void SetDummyMklShapeOutput(OpKernelContext* context,
AllocateOutputSetMklShape(context, idx_data_out, mkl_shape_output);
}
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
// We don't need these functions in MKLDNN. We have defined equality operator
// on MklDnnShape class directly.
@@ -1242,7 +1242,7 @@ inline void MklNCHWToNHWC(const Tensor& input, Tensor** output) {
// -------------------------------------------------------------------
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
/// Return MKL-DNN data type (memory::data_type) for input type T
///
@@ -1753,7 +1753,7 @@ class MklDnnData {
}
};
-#endif // INTEL_MKL_DNN
+#endif // INTEL_MKL_ML
} // namespace tensorflow
#endif // INTEL_MKL
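Reading the hunks above, the macro flip inverts the default: code that previously required `INTEL_MKL_DNN` now compiles unless the legacy `INTEL_MKL_ML` build is requested, and the old MKL-ML-only paths become opt-in. A minimal sketch of the resulting convention (illustrative only, not taken from the diff):

```
// Illustrative convention after this change: inside an INTEL_MKL build,
// MKL-DNN is the default code path and the legacy MKL-ML path must be
// requested explicitly by defining INTEL_MKL_ML.
#ifdef INTEL_MKL

#ifndef INTEL_MKL_ML
// MKL-DNN (mkldnn.hpp) implementations go here -- the default.
#else
// Legacy MKL-ML implementations go here -- opt-in via -DINTEL_MKL_ML.
#endif

#endif  // INTEL_MKL
```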
diff --git a/tensorflow/core/util/mkl_util_test.cc b/tensorflow/core/util/mkl_util_test.cc
index 8b73eadb40..cd1d0713ad 100644
--- a/tensorflow/core/util/mkl_util_test.cc
+++ b/tensorflow/core/util/mkl_util_test.cc
@@ -22,7 +22,7 @@ limitations under the License.
namespace tensorflow {
namespace {
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
TEST(MklUtilTest, MklDnnTfShape) {
auto cpu_engine = engine(engine::cpu, 0);
@@ -84,7 +84,7 @@ TEST(MklUtilTest, MklDnnBlockedFormatTest) {
EXPECT_EQ(b_md2.data.format, mkldnn_blocked);
}
-#endif // INTEL_MKL_DNN
+#endif // INTEL_MKL_ML
} // namespace
} // namespace tensorflow