Diffstat (limited to 'tensorflow')
-rw-r--r-- tensorflow/BUILD | 1
-rw-r--r-- tensorflow/c/version_script.lds | 4
-rw-r--r-- tensorflow/cc/BUILD | 1
-rw-r--r-- tensorflow/cc/gradients/math_grad.cc | 12
-rw-r--r-- tensorflow/cc/gradients/math_grad_test.cc | 12
-rw-r--r-- tensorflow/cc/gradients/nn_grad.cc | 82
-rw-r--r-- tensorflow/cc/gradients/nn_grad_test.cc | 27
-rw-r--r-- tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc | 2
-rw-r--r-- tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc | 24
-rw-r--r-- tensorflow/compiler/xla/service/hlo_verifier.cc | 9
-rw-r--r-- tensorflow/contrib/android/java/org/tensorflow/contrib/android/TensorFlowInferenceInterface.java | 81
-rw-r--r-- tensorflow/contrib/estimator/python/estimator/extenders.py | 2
-rw-r--r-- tensorflow/contrib/layers/__init__.py | 1
-rw-r--r-- tensorflow/contrib/layers/python/layers/feature_column.py | 5
-rw-r--r-- tensorflow/contrib/layers/python/layers/feature_column_test.py | 30
-rw-r--r-- tensorflow/contrib/layers/python/layers/layers.py | 7
-rw-r--r-- tensorflow/contrib/layers/python/layers/layers_test.py | 7
-rw-r--r-- tensorflow/contrib/learn/python/learn/datasets/mnist.py | 22
-rwxr-xr-x tensorflow/contrib/makefile/compile_nsync.sh | 7
-rw-r--r-- tensorflow/contrib/makefile/sub_makefiles/android/Makefile.in | 4
-rw-r--r-- tensorflow/contrib/metrics/python/ops/metric_ops.py | 9
-rw-r--r-- tensorflow/contrib/slim/python/slim/data/tfexample_decoder_test.py | 5
-rw-r--r-- tensorflow/contrib/timeseries/README.md | 2
-rw-r--r-- tensorflow/contrib/verbs/rdma.cc | 254
-rw-r--r-- tensorflow/contrib/verbs/rdma.h | 50
-rw-r--r-- tensorflow/core/BUILD | 6
-rw-r--r-- tensorflow/core/common_runtime/mkl_cpu_allocator.h | 4
-rw-r--r-- tensorflow/core/distributed_runtime/rpc/grpc_worker_cache.cc | 5
-rw-r--r-- tensorflow/core/distributed_runtime/worker_cache_partial.cc | 18
-rw-r--r-- tensorflow/core/graph/mkl_layout_pass.cc | 167
-rw-r--r-- tensorflow/core/graph/mkl_layout_pass_test.cc | 288
-rw-r--r-- tensorflow/core/graph/mkl_tfconversion_pass.cc | 159
-rw-r--r-- tensorflow/core/kernels/BUILD | 45
-rw-r--r-- tensorflow/core/kernels/bias_op_gpu.cu.cc | 23
-rw-r--r-- tensorflow/core/kernels/cuda_solvers.cc | 38
-rw-r--r-- tensorflow/core/kernels/cuda_solvers.h | 16
-rw-r--r-- tensorflow/core/kernels/cwise_ops.h | 4
-rw-r--r-- tensorflow/core/kernels/cwise_ops_common.cc | 2
-rw-r--r-- tensorflow/core/kernels/decode_raw_op.cc | 1
-rw-r--r-- tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc | 30
-rw-r--r-- tensorflow/core/kernels/fill_functor.cc | 2
-rw-r--r-- tensorflow/core/kernels/mkl_aggregate_ops.cc | 273
-rw-r--r-- tensorflow/core/kernels/mkl_conv_ops.cc | 2
-rw-r--r-- tensorflow/core/kernels/mkl_cwise_ops_common.cc | 88
-rw-r--r-- tensorflow/core/kernels/mkl_identity_op.cc | 4
-rw-r--r-- tensorflow/core/kernels/mkl_input_conversion_op.cc | 259
-rw-r--r-- tensorflow/core/kernels/mkl_tfconv_op.h | 136
-rw-r--r-- tensorflow/core/kernels/svd_op_gpu.cu.cc | 413
-rw-r--r-- tensorflow/core/kernels/tensor_array_ops.cc | 2
-rw-r--r-- tensorflow/core/ops/math_ops.cc | 103
-rw-r--r-- tensorflow/core/ops/nn_ops.cc | 23
-rw-r--r-- tensorflow/core/ops/ops.pbtxt | 19
-rw-r--r-- tensorflow/core/ops/parsing_ops.cc | 2
-rw-r--r-- tensorflow/core/ops/string_ops.cc | 2
-rw-r--r-- tensorflow/core/platform/cuda_libdevice_path_test.cc | 2
-rw-r--r-- tensorflow/core/public/version.h | 4
-rw-r--r-- tensorflow/core/util/cuda_kernel_helper.h | 76
-rw-r--r-- tensorflow/core/util/mkl_util.h | 131
-rw-r--r-- tensorflow/docs_src/about/bib.md | 2
-rw-r--r-- tensorflow/docs_src/extend/estimators.md | 2
-rw-r--r-- tensorflow/docs_src/get_started/get_started.md | 2
-rw-r--r-- tensorflow/docs_src/install/install_c.md | 2
-rw-r--r-- tensorflow/docs_src/install/install_go.md | 2
-rw-r--r-- tensorflow/docs_src/install/install_java.md | 18
-rw-r--r-- tensorflow/docs_src/install/install_linux.md | 22
-rw-r--r-- tensorflow/docs_src/install/install_mac.md | 10
-rw-r--r-- tensorflow/docs_src/install/install_sources.md | 4
-rw-r--r-- tensorflow/docs_src/install/install_windows.md | 3
-rw-r--r-- tensorflow/docs_src/programmers_guide/graphs.md | 2
-rw-r--r-- tensorflow/docs_src/programmers_guide/tensors.md | 2
-rw-r--r-- tensorflow/docs_src/tutorials/layers.md | 2
-rw-r--r-- tensorflow/examples/android/README.md | 2
-rw-r--r-- tensorflow/examples/android/src/org/tensorflow/demo/CameraActivity.java | 2
-rw-r--r-- tensorflow/examples/android/src/org/tensorflow/demo/env/ImageUtils.java | 128
-rw-r--r-- tensorflow/examples/ios/README.md | 2
-rw-r--r-- tensorflow/examples/speech_commands/train.py | 7
-rw-r--r-- tensorflow/java/src/main/java/org/tensorflow/Tensor.java | 3
-rw-r--r-- tensorflow/java/src/main/native/tensor_jni.cc | 6
-rw-r--r-- tensorflow/python/BUILD | 17
-rw-r--r-- tensorflow/python/kernel_tests/decode_raw_op_test.py | 16
-rw-r--r-- tensorflow/python/kernel_tests/metrics_test.py | 4
-rw-r--r-- tensorflow/python/kernel_tests/segment_reduction_ops_test.py | 1
-rw-r--r-- tensorflow/python/kernel_tests/svd_op_test.py | 47
-rw-r--r-- tensorflow/python/layers/base.py | 10
-rw-r--r-- tensorflow/python/layers/convolutional.py | 16
-rw-r--r-- tensorflow/python/layers/core.py | 7
-rw-r--r-- tensorflow/python/layers/maxout.py | 108
-rw-r--r-- tensorflow/python/layers/maxout_test.py | 61
-rw-r--r-- tensorflow/python/layers/normalization.py | 12
-rw-r--r-- tensorflow/python/layers/pooling.py | 14
-rw-r--r-- tensorflow/python/layers/utils.py | 5
-rw-r--r-- tensorflow/python/ops/metrics_impl.py | 5
-rw-r--r-- tensorflow/python/training/moving_averages.py | 12
-rw-r--r-- tensorflow/tensorflow.bzl | 1
-rw-r--r-- tensorflow/tf_exported_symbols.lds | 4
-rw-r--r-- tensorflow/tf_version_script.lds | 4
-rw-r--r-- tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh | 3
-rw-r--r-- tensorflow/tools/ci_build/windows/gpu/pip/build_tf_windows.sh | 3
-rw-r--r-- tensorflow/tools/docker/Dockerfile.devel | 2
-rw-r--r-- tensorflow/tools/docker/Dockerfile.devel-gpu | 2
-rwxr-xr-x tensorflow/tools/pip_package/build_pip_package.sh | 26
-rw-r--r-- tensorflow/tools/pip_package/setup.py | 8
-rw-r--r-- tensorflow/workspace.bzl | 11
103 files changed, 3001 insertions, 628 deletions
diff --git a/tensorflow/BUILD b/tensorflow/BUILD
index 315f0bee5d..eef327d690 100644
--- a/tensorflow/BUILD
+++ b/tensorflow/BUILD
@@ -472,6 +472,7 @@ cc_binary(
"//tensorflow:darwin": [
"-Wl,-exported_symbols_list", # This line must be directly followed by the exported_symbols.lds file
"//tensorflow/c:exported_symbols.lds",
+ "-Wl,-install_name,@rpath/libtensorflow.so",
],
"//tensorflow:windows": [],
"//tensorflow:windows_msvc": [],
diff --git a/tensorflow/c/version_script.lds b/tensorflow/c/version_script.lds
index 5855782003..c352a1440d 100644
--- a/tensorflow/c/version_script.lds
+++ b/tensorflow/c/version_script.lds
@@ -1,8 +1,8 @@
VERS_1.0 {
# Export symbols in c_api.h.
global:
- TF_*;
- TFE_*;
+ *TF_*;
+ *TFE_*;
# Hide everything else.
local:
diff --git a/tensorflow/cc/BUILD b/tensorflow/cc/BUILD
index 7606e193a9..1eebc8f6a6 100644
--- a/tensorflow/cc/BUILD
+++ b/tensorflow/cc/BUILD
@@ -336,6 +336,7 @@ cc_library(
":cc_ops",
":cc_ops_internal",
":grad_op_registry",
+ ":gradients",
],
alwayslink = 1,
)
diff --git a/tensorflow/cc/gradients/math_grad.cc b/tensorflow/cc/gradients/math_grad.cc
index aba17cfe0c..ac288b1d83 100644
--- a/tensorflow/cc/gradients/math_grad.cc
+++ b/tensorflow/cc/gradients/math_grad.cc
@@ -696,6 +696,18 @@ Status MeanGrad(const Scope& scope, const Operation& op,
}
REGISTER_GRADIENT_OP("Mean", MeanGrad);
+Status LgammaGrad(const Scope& scope, const Operation& op,
+ const std::vector<Output>& grad_inputs,
+ std::vector<Output>* grad_outputs) {
+ auto grad = grad_inputs[0];
+ Scope grad_scope = scope.WithControlDependencies(grad);
+ auto x = ConjugateHelper(grad_scope, op.input(0));
+ auto dx = Mul(scope, grad, Digamma(scope, x));
+ grad_outputs->push_back(dx);
+ return scope.status();
+}
+REGISTER_GRADIENT_OP("Lgamma", LgammaGrad);
+
Status MinOrMaxGrad(const Scope& scope, const Operation& op,
const std::vector<Output>& grad_inputs,
std::vector<Output>* grad_outputs) {
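
The LgammaGrad registered above relies on the identity d/dx lgamma(x) = digamma(x), which is what the Digamma call computes. A standalone C++ sanity check of that identity, independent of this diff and of TensorFlow (the step size and reference value are illustrative choices):

    #include <cmath>
    #include <cstdio>

    int main() {
      // Central finite difference of lgamma approximates its derivative,
      // which should match digamma. digamma(1) = -gamma (Euler-Mascheroni
      // constant), a well-known reference value.
      const double x = 1.0;
      const double h = 1e-5;
      const double fd = (std::lgamma(x + h) - std::lgamma(x - h)) / (2.0 * h);
      const double digamma_at_1 = -0.5772156649015329;
      std::printf("finite diff %.10f vs digamma(1) %.10f (|err| %.2e)\n",
                  fd, digamma_at_1, std::fabs(fd - digamma_at_1));
    }
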
diff --git a/tensorflow/cc/gradients/math_grad_test.cc b/tensorflow/cc/gradients/math_grad_test.cc
index 3534f16e8f..a174f223ad 100644
--- a/tensorflow/cc/gradients/math_grad_test.cc
+++ b/tensorflow/cc/gradients/math_grad_test.cc
@@ -821,5 +821,17 @@ TEST_F(NaryGradTest, Minimum) {
RunTest(x, x_init_value, y, shape);
}
+TEST_F(NaryGradTest, Lgamma) {
+ TensorShape shape({3, 2});
+ auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(shape));
+ auto y = Lgamma(scope_, x);
+ // Select values to avoid instability when computing finite differences.
+ // Ref: https://en.wikipedia.org/wiki/File:Gamma_plot.svg
+ Tensor x_init_value =
+ test::AsTensor<float>({-3.5f, -2.5f, -1.5f, 1.0f, 2.0f, 3.5f}, {3, 2});
+ RunTest(x, x_init_value, y, shape);
+ // TODO(suharshs): add test case for complex values
+}
+
} // namespace
} // namespace tensorflow
diff --git a/tensorflow/cc/gradients/nn_grad.cc b/tensorflow/cc/gradients/nn_grad.cc
index ccb58e7f91..fcc3fc9dae 100644
--- a/tensorflow/cc/gradients/nn_grad.cc
+++ b/tensorflow/cc/gradients/nn_grad.cc
@@ -18,6 +18,7 @@ limitations under the License.
#include "tensorflow/cc/ops/standard_ops.h"
#include "tensorflow/cc/framework/grad_op_registry.h"
+#include "tensorflow/cc/framework/gradients.h"
namespace tensorflow {
namespace ops {
@@ -118,6 +119,87 @@ Status BiasAddGradHelper(const Scope& scope, const Operation& op,
}
REGISTER_GRADIENT_OP("BiasAdd", BiasAddGradHelper);
+Status Conv2DGrad(const Scope& scope, const Operation& op,
+ const std::vector<Output>& grad_inputs,
+ std::vector<Output>* grad_outputs) {
+ string data_format;
+ string padding;
+ std::vector<int32> strides;
+ bool use_cudnn_on_gpu;
+ auto attrs = op.output(0).node()->attrs();
+ TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "data_format", &data_format));
+ TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "padding", &padding));
+ TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "strides", &strides));
+ TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "use_cudnn_on_gpu",
+ &use_cudnn_on_gpu));
+ Conv2DBackpropInput::Attrs input_attrs;
+ input_attrs.DataFormat(data_format);
+ input_attrs.UseCudnnOnGpu(use_cudnn_on_gpu);
+ auto dx_1 = Conv2DBackpropInput(scope, Shape(scope, op.input(0)),
+ op.input(1), grad_inputs[0],
+ strides, padding, input_attrs);
+ grad_outputs->push_back(dx_1);
+ Conv2DBackpropFilter::Attrs filter_attrs;
+ filter_attrs.DataFormat(data_format);
+ filter_attrs.UseCudnnOnGpu(use_cudnn_on_gpu);
+ auto dx_2 = Conv2DBackpropFilter(scope, op.input(0),
+ Shape(scope, op.input(1)), grad_inputs[0],
+ strides, padding, filter_attrs);
+ grad_outputs->push_back(dx_2);
+ return scope.status();
+}
+REGISTER_GRADIENT_OP("Conv2D", Conv2DGrad);
+
+Status MaxPoolGradHelper(const Scope& scope, const Operation& op,
+ const std::vector<Output>& grad_inputs,
+ std::vector<Output>* grad_outputs) {
+ string data_format;
+ string padding;
+ std::vector<int32> strides;
+ std::vector<int32> ksize;
+ auto attrs = op.output(0).node()->attrs();
+ TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "data_format", &data_format));
+ TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "ksize", &ksize));
+ TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "padding", &padding));
+ TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "strides", &strides));
+ internal::MaxPoolGrad::Attrs grad_attrs;
+ grad_attrs.DataFormat(data_format);
+ auto dx = internal::MaxPoolGrad(scope, op.input(0),
+ op.output(0),
+ grad_inputs[0],
+ ksize, strides,
+ padding, grad_attrs);
+ grad_outputs->push_back(dx);
+ return scope.status();
+}
+REGISTER_GRADIENT_OP("MaxPool", MaxPoolGradHelper);
+
+Status MaxPoolGradV2Helper(const Scope& scope, const Operation& op,
+ const std::vector<Output>& grad_inputs,
+ std::vector<Output>* grad_outputs) {
+ string data_format;
+ string padding;
+ auto attrs = op.output(0).node()->attrs();
+ TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "data_format", &data_format));
+ TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "padding", &padding));
+ MaxPoolGradV2::Attrs grad_attrs;
+ grad_attrs.DataFormat(data_format);
+ auto dx = MaxPoolGradV2(scope, op.input(0),
+ op.output(0),
+ grad_inputs[0],
+ op.input(1),
+ op.input(2),
+ padding,
+ grad_attrs);
+ grad_outputs->push_back(dx);
+ grad_outputs->push_back(NoGradient());
+ grad_outputs->push_back(NoGradient());
+ return scope.status();
+}
+REGISTER_GRADIENT_OP("MaxPoolV2", MaxPoolGradV2Helper);
+
+
+
} // anonymous namespace
} // namespace ops
} // namespace tensorflow
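
REGISTER_GRADIENT_OP wires each of the new functions above into the C++ gradient registry, keyed by op name, so the gradient builder can look them up while walking the graph. A minimal sketch of that registration-and-dispatch pattern; the types here are simplified stand-ins, not TensorFlow's actual GradOpRegistry API:

    #include <functional>
    #include <iostream>
    #include <map>
    #include <string>

    // Simplified stand-in for the Scope/Operation machinery.
    struct Op { std::string type; };
    using GradFn = std::function<void(const Op&)>;

    std::map<std::string, GradFn>& Registry() {
      static auto* registry = new std::map<std::string, GradFn>;
      return *registry;
    }

    // Runs at static-initialization time, like REGISTER_GRADIENT_OP.
    struct GradRegistrar {
      GradRegistrar(const std::string& op, GradFn fn) {
        Registry()[op] = std::move(fn);
      }
    };

    static GradRegistrar conv2d_grad("Conv2D", [](const Op& op) {
      // The real version reads attrs off the node and emits
      // Conv2DBackpropInput / Conv2DBackpropFilter.
      std::cout << "building gradient subgraph for " << op.type << "\n";
    });

    int main() {
      Op op{"Conv2D"};
      auto it = Registry().find(op.type);  // dispatch by op type
      if (it != Registry().end()) it->second(op);
    }
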
diff --git a/tensorflow/cc/gradients/nn_grad_test.cc b/tensorflow/cc/gradients/nn_grad_test.cc
index 64f1f76066..23545f75ac 100644
--- a/tensorflow/cc/gradients/nn_grad_test.cc
+++ b/tensorflow/cc/gradients/nn_grad_test.cc
@@ -138,5 +138,32 @@ TEST_F(NNGradTest, BiasAddGradHelper) {
RunTest({x, bias}, {shape, bias_shape}, {y}, {shape});
}
+TEST_F(NNGradTest, Conv2DGrad) {
+ TensorShape shape({1, 2, 2, 1});
+ auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(shape));
+ Tensor filter = test::AsTensor<float>({0.5f}, {1, 1, 1, 1});
+ const std::vector<int> strides{1, 1, 1, 1};
+ auto y = Conv2D(scope_, x, filter, strides, "SAME");
+ RunTest(x, shape, y, shape);
+}
+
+TEST_F(NNGradTest, MaxPoolGradHelper) {
+ TensorShape shape({1, 2, 2, 1});
+ auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(shape));
+ const std::vector<int> ksize{1, 2, 2, 1};
+ const std::vector<int> strides{1, 1, 1, 1};
+ auto y = MaxPool(scope_, x, ksize, strides, "SAME");
+ RunTest(x, shape, y, shape);
+}
+
+TEST_F(NNGradTest, MaxPoolGradV2Helper) {
+ TensorShape shape({1, 2, 2, 1});
+ auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(shape));
+ Tensor ksize = test::AsTensor<int>({1, 2, 2, 1}, {4});
+ Tensor strides = test::AsTensor<int>({1, 1, 1, 1}, {4});
+ auto y = MaxPoolV2(scope_, x, ksize, strides, "SAME");
+ RunTest(x, shape, y, shape);
+}
+
} // namespace
} // namespace tensorflow
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
index f7ddee7b61..b8bdd78da8 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
@@ -999,7 +999,7 @@ Status IrEmitterUnnested::EmitRowReduction(
// for (shuffle_distance = 16; shuffle_distance > 0; shuffle_distance /= 2)
// partial_result = Reducer(
// partial_result,
- // __shfl_down(partial_result, shuffle_distance));
+ // __shfl_down_sync(CUDA_WARP_ALL, partial_result, shuffle_distance));
// if (lane_id == 0)
// AtomicReducer(&output[y], partial_result);
// }
diff --git a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc
index 2e7765c4c6..b24fe417ff 100644
--- a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc
+++ b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc
@@ -71,7 +71,18 @@ const int kDefaultInlineThreshold = 1100;
// Gets the libdevice filename for a particular compute capability. When
// presented with a GPU we don't recognize, we just return the libdevice from
// compute_20.
-static string GetLibdeviceFilename(std::pair<int, int> compute_capability) {
+static string GetLibdeviceFilename(const string& libdevice_dir_path,
+ std::pair<int, int> compute_capability) {
+ // Since CUDA 9.0, all GPU versions are included in a single file
+ const char* unified_libdevice_filename = "libdevice.10.bc";
+ std::vector<string> unified_libdevice_files;
+ const tensorflow::Status status =
+ tensorflow::Env::Default()->GetMatchingPaths(
+ tensorflow::io::JoinPath(libdevice_dir_path, unified_libdevice_filename),
+ &unified_libdevice_files);
+ if (status.ok() && unified_libdevice_files.size() == 1) {
+ return unified_libdevice_filename;
+ }
// There are only four libdevice files: compute_{20,30,35,50}. Each GPU
// version gets mapped to one of these. Note in particular that sm_60 and
// sm_61 map to libdevice.compute_30.
@@ -101,7 +112,7 @@ static string GetLibdeviceFilename(std::pair<int, int> compute_capability) {
}
// Gets the GPU name as it's known to LLVM for a given compute capability. If
-// we see an unrecognized compute capability, we return "sm_20".
+// we see an unrecognized compute capability, we return "sm_30".
static string GetSmName(std::pair<int, int> compute_capability) {
static auto* m = new std::map<std::pair<int, int>, int>({{{2, 0}, 20},
{{2, 1}, 21},
@@ -114,8 +125,10 @@ static string GetSmName(std::pair<int, int> compute_capability) {
{{5, 3}, 53},
{{6, 0}, 60},
{{6, 1}, 61},
- {{6, 2}, 62}});
- int sm_version = 20;
+ {{6, 2}, 62},
+ // TODO: Change this to 70 once LLVM NVPTX supports it
+ {{7, 0}, 60}});
+ int sm_version = 30;
auto it = m->find(compute_capability);
if (it != m->end()) {
sm_version = it->second;
@@ -306,7 +319,8 @@ tensorflow::Status LinkLibdeviceIfNecessary(
llvm::Linker linker(*module);
string libdevice_path = tensorflow::io::JoinPath(
- libdevice_dir_path, GetLibdeviceFilename(compute_capability));
+ libdevice_dir_path, GetLibdeviceFilename(libdevice_dir_path,
+ compute_capability));
TF_RETURN_IF_ERROR(tensorflow::Env::Default()->FileExists(libdevice_path));
VLOG(1) << "Linking with libdevice from: " << libdevice_path;
std::unique_ptr<llvm::Module> libdevice_module =
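
The libdevice change above probes for the unified CUDA 9 bitcode file before falling back to the older per-compute-capability files. The probe-then-fallback shape of that logic, sketched standalone with std::filesystem (the helper and the hard-coded fallback are illustrative; the real mapping is the table in GetLibdeviceFilename):

    #include <filesystem>
    #include <iostream>
    #include <string>

    namespace fs = std::filesystem;

    // Returns the libdevice file to link against.
    std::string LibdeviceFilename(const fs::path& libdevice_dir) {
      // CUDA 9.0+ ships a single unified file covering all GPU versions.
      const std::string unified = "libdevice.10.bc";
      if (fs::exists(libdevice_dir / unified)) return unified;
      // Pre-9.0 toolkits: fall back to one of the four
      // libdevice.compute_{20,30,35,50} files; compute_20 is the catch-all.
      return "libdevice.compute_20.10.bc";
    }

    int main() {
      std::cout << LibdeviceFilename("/usr/local/cuda/nvvm/libdevice") << "\n";
    }
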
diff --git a/tensorflow/compiler/xla/service/hlo_verifier.cc b/tensorflow/compiler/xla/service/hlo_verifier.cc
index c85214e9a4..54cd26502a 100644
--- a/tensorflow/compiler/xla/service/hlo_verifier.cc
+++ b/tensorflow/compiler/xla/service/hlo_verifier.cc
@@ -531,6 +531,15 @@ StatusOr<bool> HloVerifier::Run(HloModule* module) {
<< " computation: " << computation.get();
}
}
+ if (instruction->opcode() == HloOpcode::kBroadcast) {
+ // If you see this failure then someone has confused the difference
+ // between the HLO broadcast op, and the UserComputation broadcast
+ // op. See https://groups.google.com/forum/#!topic/xla-dev/9LqijHmTt_I
+ // or ComputationLowerer::Visit()
+ TF_RET_CHECK(instruction->dimensions().size() ==
+ ShapeUtil::Rank(instruction->operand(0)->shape()))
+ << "Broadcast HLO has invalid number of dimensions.";
+ }
auto previous = instructions.find(instruction->name());
TF_RET_CHECK(previous == instructions.end())
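
The new verifier check enforces that an HLO broadcast carries exactly one dimensions() entry per operand dimension; each entry names the output dimension that operand dimension maps to. A standalone sketch of that invariant (BroadcastDimsOk is a hypothetical helper, not XLA code):

    #include <cstdio>
    #include <vector>

    // HLO broadcast semantics: dims[i] names the output dimension that
    // operand dimension i maps to, so dims.size() must equal operand rank.
    bool BroadcastDimsOk(const std::vector<int>& operand_shape,
                         const std::vector<int>& output_shape,
                         const std::vector<int>& dims) {
      if (dims.size() != operand_shape.size()) return false;  // the new check
      for (size_t i = 0; i < dims.size(); ++i) {
        if (dims[i] < 0 || dims[i] >= static_cast<int>(output_shape.size()))
          return false;
        if (output_shape[dims[i]] != operand_shape[i]) return false;
      }
      return true;
    }

    int main() {
      // Broadcasting a [3] operand into a [2,3] result along output dim 1.
      std::printf("%d\n", BroadcastDimsOk({3}, {2, 3}, {1}));  // 1: valid
      std::printf("%d\n", BroadcastDimsOk({3}, {2, 3}, {}));   // 0: rank mismatch
    }
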
diff --git a/tensorflow/contrib/android/java/org/tensorflow/contrib/android/TensorFlowInferenceInterface.java b/tensorflow/contrib/android/java/org/tensorflow/contrib/android/TensorFlowInferenceInterface.java
index 6389ef1f5d..f60bd8282c 100644
--- a/tensorflow/contrib/android/java/org/tensorflow/contrib/android/TensorFlowInferenceInterface.java
+++ b/tensorflow/contrib/android/java/org/tensorflow/contrib/android/TensorFlowInferenceInterface.java
@@ -20,6 +20,7 @@ import android.os.Build.VERSION;
import android.os.Trace;
import android.text.TextUtils;
import android.util.Log;
+import java.io.ByteArrayOutputStream;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
@@ -78,10 +79,35 @@ public class TensorFlowInferenceInterface {
throw new RuntimeException("Failed to load model from '" + model + "'", e);
}
}
+
try {
- loadGraph(is, g);
+ if (VERSION.SDK_INT >= 18) {
+ Trace.beginSection("initializeTensorFlow");
+ Trace.beginSection("readGraphDef");
+ }
+
+ // TODO(ashankar): Can we somehow mmap the contents instead of copying them?
+ byte[] graphDef = new byte[is.available()];
+ final int numBytesRead = is.read(graphDef);
+ if (numBytesRead != graphDef.length) {
+ throw new IOException(
+ "read error: read only "
+ + numBytesRead
+ + " of the graph, expected to read "
+ + graphDef.length);
+ }
+
+ if (VERSION.SDK_INT >= 18) {
+ Trace.endSection(); // readGraphDef.
+ }
+
+ loadGraph(graphDef, g);
is.close();
Log.i(TAG, "Successfully loaded model from '" + model + "'");
+
+ if (VERSION.SDK_INT >= 18) {
+ Trace.endSection(); // initializeTensorFlow.
+ }
} catch (IOException e) {
throw new RuntimeException("Failed to load model from '" + model + "'", e);
}
@@ -105,8 +131,30 @@ public class TensorFlowInferenceInterface {
this.runner = sess.runner();
try {
- loadGraph(is, g);
+ if (VERSION.SDK_INT >= 18) {
+ Trace.beginSection("initializeTensorFlow");
+ Trace.beginSection("readGraphDef");
+ }
+
+ int baosInitSize = is.available() > 16384 ? is.available() : 16384;
+ ByteArrayOutputStream baos = new ByteArrayOutputStream(baosInitSize);
+ int numBytesRead;
+ byte[] buf = new byte[16384];
+ while ((numBytesRead = is.read(buf, 0, buf.length)) != -1) {
+ baos.write(buf, 0, numBytesRead);
+ }
+ byte[] graphDef = baos.toByteArray();
+
+ if (VERSION.SDK_INT >= 18) {
+ Trace.endSection(); // readGraphDef.
+ }
+
+ loadGraph(graphDef, g);
Log.i(TAG, "Successfully loaded model from the input stream");
+
+ if (VERSION.SDK_INT >= 18) {
+ Trace.endSection(); // initializeTensorFlow.
+ }
} catch (IOException e) {
throw new RuntimeException("Failed to load model from the input stream", e);
}
@@ -269,8 +317,8 @@ public class TensorFlowInferenceInterface {
/**
* Copy a byte sequence into the input Tensor with name {@link inputName} as a string-valued
- * scalar tensor. In the TensorFlow type system, a "string" is an arbitrary sequence of
- * bytes, not a Java {@code String} (which is a sequence of characters).
+ * scalar tensor. In the TensorFlow type system, a "string" is an arbitrary sequence of bytes, not
+ * a Java {@code String} (which is a sequence of characters).
*/
public void feedString(String inputName, byte[] src) {
addFeed(inputName, Tensor.create(src));
@@ -278,9 +326,8 @@ public class TensorFlowInferenceInterface {
/**
* Copy an array of byte sequences into the input Tensor with name {@link inputName} as a
- * string-valued one-dimensional tensor (vector). In the TensorFlow type system, a "string"
- * is an arbitrary sequence of bytes, not a Java {@code String} (which is a sequence of
- * characters).
+ * string-valued one-dimensional tensor (vector). In the TensorFlow type system, a "string" is an
+ * arbitrary sequence of bytes, not a Java {@code String} (which is a sequence of characters).
*/
public void feedString(String inputName, byte[][] src) {
addFeed(inputName, Tensor.create(src));
@@ -458,27 +505,10 @@ public class TensorFlowInferenceInterface {
}
}
- private void loadGraph(InputStream is, Graph g) throws IOException {
+ private void loadGraph(byte[] graphDef, Graph g) throws IOException {
final long startMs = System.currentTimeMillis();
if (VERSION.SDK_INT >= 18) {
- Trace.beginSection("loadGraph");
- Trace.beginSection("readGraphDef");
- }
-
- // TODO(ashankar): Can we somehow mmap the contents instead of copying them?
- byte[] graphDef = new byte[is.available()];
- final int numBytesRead = is.read(graphDef);
- if (numBytesRead != graphDef.length) {
- throw new IOException(
- "read error: read only "
- + numBytesRead
- + " of the graph, expected to read "
- + graphDef.length);
- }
-
- if (VERSION.SDK_INT >= 18) {
- Trace.endSection(); // readGraphDef.
Trace.beginSection("importGraphDef");
}
@@ -490,7 +520,6 @@ public class TensorFlowInferenceInterface {
if (VERSION.SDK_INT >= 18) {
Trace.endSection(); // importGraphDef.
- Trace.endSection(); // loadGraph.
}
final long endMs = System.currentTimeMillis();
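
The stream-based constructor above stops sizing its buffer from InputStream.available(), which can under-report for compressed or network streams, and instead drains the stream through a 16 KiB buffer into a growable byte array. For comparison, the same drain-in-chunks idiom sketched in C++ (ReadAll is illustrative, not code from this diff):

    #include <iostream>
    #include <istream>
    #include <sstream>
    #include <vector>

    // Drains an arbitrary stream into memory without asking its size up front.
    std::vector<char> ReadAll(std::istream& in) {
      std::vector<char> out;
      char buf[16384];
      while (in.read(buf, sizeof(buf)) || in.gcount() > 0) {
        out.insert(out.end(), buf, buf + in.gcount());
      }
      return out;
    }

    int main() {
      std::istringstream in(std::string("serialized GraphDef bytes..."));
      std::cout << ReadAll(in).size() << " bytes read\n";
    }
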
diff --git a/tensorflow/contrib/estimator/python/estimator/extenders.py b/tensorflow/contrib/estimator/python/estimator/extenders.py
index 6e2c1aa033..2e944cbdd9 100644
--- a/tensorflow/contrib/estimator/python/estimator/extenders.py
+++ b/tensorflow/contrib/estimator/python/estimator/extenders.py
@@ -54,7 +54,7 @@ def add_metrics(estimator, metric_fn):
```
Args:
- estimator: A ${tf.estimator.Esitmator} object.
+ estimator: A ${tf.estimator.Estimator} object.
metric_fn: A function which should obey the following signature:
- Args: can only have following four arguments in any order:
* predictions: Predictions `Tensor` or dict of `Tensor` created by given
diff --git a/tensorflow/contrib/layers/__init__.py b/tensorflow/contrib/layers/__init__.py
index ea8d9e0c63..9309678d90 100644
--- a/tensorflow/contrib/layers/__init__.py
+++ b/tensorflow/contrib/layers/__init__.py
@@ -51,6 +51,7 @@ See the @{$python/contrib.layers} guide.
@@unit_norm
@@bow_encoder
@@embed_sequence
+@@maxout
@@apply_regularization
@@l1_l2_regularizer
diff --git a/tensorflow/contrib/layers/python/layers/feature_column.py b/tensorflow/contrib/layers/python/layers/feature_column.py
index da16bf6ce6..226d933d85 100644
--- a/tensorflow/contrib/layers/python/layers/feature_column.py
+++ b/tensorflow/contrib/layers/python/layers/feature_column.py
@@ -939,6 +939,11 @@ class _OneHotColumn(
weighted_column = sparse_ops.sparse_merge(sp_ids=sparse_id_column,
sp_values=weight_tensor,
vocab_size=self.length)
+ # Remove (?, -1) index
+ weighted_column = sparse_ops.sparse_slice(
+ weighted_column,
+ [0, 0],
+ weighted_column.dense_shape)
return sparse_ops.sparse_tensor_to_dense(weighted_column)
dense_id_tensor = sparse_ops.sparse_tensor_to_dense(sparse_id_column,
diff --git a/tensorflow/contrib/layers/python/layers/feature_column_test.py b/tensorflow/contrib/layers/python/layers/feature_column_test.py
index ab65e47af8..5ae885b720 100644
--- a/tensorflow/contrib/layers/python/layers/feature_column_test.py
+++ b/tensorflow/contrib/layers/python/layers/feature_column_test.py
@@ -31,6 +31,7 @@ from tensorflow.python.feature_column import feature_column as fc_core
from tensorflow.python.framework import constant_op
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import sparse_tensor as sparse_tensor_lib
+from tensorflow.python.ops import lookup_ops
from tensorflow.python.ops import parsing_ops
from tensorflow.python.ops import state_ops
from tensorflow.python.ops import variable_scope
@@ -319,6 +320,35 @@ class FeatureColumnTest(test.TestCase):
self.assertEqual(one_hot.sparse_id_column.name, "ids_weighted_by_weights")
self.assertEqual(one_hot.length, 3)
+ def testMissingValueInOneHotColumnForWeightedSparseColumn(self):
+ # Github issue 12583
+ ids = fc.sparse_column_with_keys("ids", ["marlo", "omar", "stringer"])
+ weighted_ids = fc.weighted_sparse_column(ids, "weights")
+ one_hot = fc.one_hot_column(weighted_ids)
+ features = {
+ 'ids': constant_op.constant([['marlo', 'unknown', 'omar']]),
+ 'weights': constant_op.constant([[2., 4., 6.]])
+ }
+ one_hot_tensor = feature_column_ops.input_from_feature_columns(
+ features, [one_hot])
+ with self.test_session() as sess:
+ sess.run(variables.global_variables_initializer())
+ sess.run(lookup_ops.tables_initializer())
+ self.assertAllEqual([[2., 6., 0.]], one_hot_tensor.eval())
+
+ def testMissingValueInOneHotColumnForSparseColumnWithKeys(self):
+ ids = fc.sparse_column_with_keys("ids", ["marlo", "omar", "stringer"])
+ one_hot = fc.one_hot_column(ids)
+ features = {
+ 'ids': constant_op.constant([['marlo', 'unknown', 'omar']])
+ }
+ one_hot_tensor = feature_column_ops.input_from_feature_columns(
+ features, [one_hot])
+ with self.test_session() as sess:
+ sess.run(variables.global_variables_initializer())
+ sess.run(lookup_ops.tables_initializer())
+ self.assertAllEqual([[1., 1., 0.]], one_hot_tensor.eval())
+
def testOneHotColumnDeepCopy(self):
a = fc.sparse_column_with_keys("a", ["a", "b", "c", "d"])
column = fc.one_hot_column(a)
diff --git a/tensorflow/contrib/layers/python/layers/layers.py b/tensorflow/contrib/layers/python/layers/layers.py
index 39e0f1fa23..cc494e9200 100644
--- a/tensorflow/contrib/layers/python/layers/layers.py
+++ b/tensorflow/contrib/layers/python/layers/layers.py
@@ -50,6 +50,7 @@ from tensorflow.python.ops import standard_ops
from tensorflow.python.ops import variable_scope
from tensorflow.python.ops import variables as tf_variables
from tensorflow.python.training import moving_averages
+from tensorflow.python.layers.maxout import maxout
# TODO(b/28426988): Replace legacy_* fns migrated from slim.
# TODO(b/28426988): Remove legacy_* when all uses have migrated to new API.
@@ -92,7 +93,8 @@ __all__ = ['avg_pool2d',
'unit_norm',
'legacy_fully_connected',
'legacy_linear',
- 'legacy_relu']
+ 'legacy_relu',
+ 'maxout']
DATA_FORMAT_NCHW = 'NCHW'
DATA_FORMAT_NHWC = 'NHWC'
@@ -811,7 +813,8 @@ def batch_norm(inputs,
if data_format == DATA_FORMAT_NCHW:
mean = array_ops.reshape(mean, params_shape_broadcast)
variance = array_ops.reshape(variance, params_shape_broadcast)
- beta = array_ops.reshape(beta, params_shape_broadcast)
+ if beta is not None:
+ beta = array_ops.reshape(beta, params_shape_broadcast)
if gamma is not None:
gamma = array_ops.reshape(gamma, params_shape_broadcast)
diff --git a/tensorflow/contrib/layers/python/layers/layers_test.py b/tensorflow/contrib/layers/python/layers/layers_test.py
index 61c5fbafed..d1d18016f7 100644
--- a/tensorflow/contrib/layers/python/layers/layers_test.py
+++ b/tensorflow/contrib/layers/python/layers/layers_test.py
@@ -2636,6 +2636,13 @@ class BatchNormTest(test.TestCase):
data_format='NCHW', shape=shape, is_training=True)
self.assertAllClose(nhwc, nchw, atol=1e-4, rtol=1e-4)
+ def testBatchNormBeta(self):
+ # Test case for 11673
+ with self.test_session() as sess:
+ a = array_ops.placeholder(dtypes.float32, shape=(10, 10, 10, 10))
+ b = _layers.batch_norm(a, center=False, data_format='NCHW',
+ zero_debias_moving_mean=True)
+ sess.run(variables_lib.global_variables_initializer())
class LayerNormTest(test.TestCase):
diff --git a/tensorflow/contrib/learn/python/learn/datasets/mnist.py b/tensorflow/contrib/learn/python/learn/datasets/mnist.py
index a90b9264f8..1f3295747e 100644
--- a/tensorflow/contrib/learn/python/learn/datasets/mnist.py
+++ b/tensorflow/contrib/learn/python/learn/datasets/mnist.py
@@ -30,7 +30,7 @@ from tensorflow.python.framework import random_seed
from tensorflow.python.platform import gfile
# CVDF mirror of http://yann.lecun.com/exdb/mnist/
-SOURCE_URL = 'https://storage.googleapis.com/cvdf-datasets/mnist/'
+DEFAULT_SOURCE_URL = 'https://storage.googleapis.com/cvdf-datasets/mnist/'
def _read32(bytestream):
@@ -215,7 +215,8 @@ def read_data_sets(train_dir,
dtype=dtypes.float32,
reshape=True,
validation_size=5000,
- seed=None):
+ seed=None,
+ source_url=DEFAULT_SOURCE_URL):
if fake_data:
def fake():
@@ -227,28 +228,31 @@ def read_data_sets(train_dir,
test = fake()
return base.Datasets(train=train, validation=validation, test=test)
+ if not source_url: # empty string check
+ source_url = DEFAULT_SOURCE_URL
+
TRAIN_IMAGES = 'train-images-idx3-ubyte.gz'
TRAIN_LABELS = 'train-labels-idx1-ubyte.gz'
TEST_IMAGES = 't10k-images-idx3-ubyte.gz'
TEST_LABELS = 't10k-labels-idx1-ubyte.gz'
local_file = base.maybe_download(TRAIN_IMAGES, train_dir,
- SOURCE_URL + TRAIN_IMAGES)
+ source_url + TRAIN_IMAGES)
with gfile.Open(local_file, 'rb') as f:
train_images = extract_images(f)
local_file = base.maybe_download(TRAIN_LABELS, train_dir,
- SOURCE_URL + TRAIN_LABELS)
+ source_url + TRAIN_LABELS)
with gfile.Open(local_file, 'rb') as f:
train_labels = extract_labels(f, one_hot=one_hot)
local_file = base.maybe_download(TEST_IMAGES, train_dir,
- SOURCE_URL + TEST_IMAGES)
+ source_url + TEST_IMAGES)
with gfile.Open(local_file, 'rb') as f:
test_images = extract_images(f)
local_file = base.maybe_download(TEST_LABELS, train_dir,
- SOURCE_URL + TEST_LABELS)
+ source_url + TEST_LABELS)
with gfile.Open(local_file, 'rb') as f:
test_labels = extract_labels(f, one_hot=one_hot)
@@ -262,13 +266,13 @@ def read_data_sets(train_dir,
train_images = train_images[validation_size:]
train_labels = train_labels[validation_size:]
-
+
options = dict(dtype=dtype, reshape=reshape, seed=seed)
-
+
train = DataSet(train_images, train_labels, **options)
validation = DataSet(validation_images, validation_labels, **options)
test = DataSet(test_images, test_labels, **options)
-
+
return base.Datasets(train=train, validation=validation, test=test)
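
read_data_sets now accepts a source_url parameter defaulting to the old constant, with an explicit empty-string fallback so callers that forward an unset configuration value still reach the default mirror. The same default-plus-empty-guard pattern sketched in C++ (ResolveUrl is a hypothetical helper):

    #include <iostream>
    #include <string>

    const std::string kDefaultSourceUrl =
        "https://storage.googleapis.com/cvdf-datasets/mnist/";

    // Defaulted parameter plus an explicit empty-string guard, so callers
    // forwarding an unset value ("") still get a usable download URL.
    std::string ResolveUrl(std::string source_url = kDefaultSourceUrl) {
      if (source_url.empty()) source_url = kDefaultSourceUrl;
      return source_url + "train-images-idx3-ubyte.gz";
    }

    int main() {
      std::cout << ResolveUrl() << "\n";    // default mirror
      std::cout << ResolveUrl("") << "\n";  // empty string falls back too
    }
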
diff --git a/tensorflow/contrib/makefile/compile_nsync.sh b/tensorflow/contrib/makefile/compile_nsync.sh
index 4db9cce5ed..e85a79c279 100755
--- a/tensorflow/contrib/makefile/compile_nsync.sh
+++ b/tensorflow/contrib/makefile/compile_nsync.sh
@@ -214,12 +214,12 @@ for arch in $archs; do
armeabi-v7a) toolchain="arm-linux-androideabi-4.9"
sysroot_arch="arm"
bin_prefix="arm-linux-androideabi"
- march_option="-march=armv7-a"
+ march_option="-march=armv7-a -mfloat-abi=softfp -mfpu=neon"
;;
armeabi-v7a-hard) toolchain="arm-linux-androideabi-4.9"
sysroot_arch="arm"
bin_prefix="arm-linux-androideabi"
- march_option="-march=armv7-a"
+ march_option="-march=armv7-a -mfpu=neon"
;;
mips) toolchain="mipsel-linux-android-4.9"
sysroot_arch="mips"
@@ -265,8 +265,7 @@ for arch in $archs; do
-I$(NDK_ROOT)/sources/cxx-stl/gnu-libstdc++/4.9/libs/'"$arch"'/include \
-I../../platform/c++11 -I../../platform/gcc \
-I../../platform/posix -pthread
- PLATFORM_CFLAGS=-std=c++11 -Wno-narrowing '"$march_option"' \
- -mfloat-abi=softfp -mfpu=neon -fPIE
+ PLATFORM_CFLAGS=-std=c++11 -Wno-narrowing '"$march_option"' -fPIE
PLATFORM_LDFLAGS=-pthread
MKDEP=${CC} -M -std=c++11
PLATFORM_C=../../platform/c++11/src/nsync_semaphore_mutex.cc \
diff --git a/tensorflow/contrib/makefile/sub_makefiles/android/Makefile.in b/tensorflow/contrib/makefile/sub_makefiles/android/Makefile.in
index 631d52235a..26c1ad4947 100644
--- a/tensorflow/contrib/makefile/sub_makefiles/android/Makefile.in
+++ b/tensorflow/contrib/makefile/sub_makefiles/android/Makefile.in
@@ -52,7 +52,9 @@ $(INFERENCE_SO_PATH): $(LIB_OBJS) $(INFERENCE_OBJS)
@mkdir -p $(dir $@)
$(CXX) $(CXXFLAGS) $(INCLUDES) \
-o $@ $(INFERENCE_OBJS) $(LIB_OBJS) \
- $(LIBFLAGS) $(LDFLAGS) -shared $(LIBS)
+ $(LIBFLAGS) $(LDFLAGS) \
+ -shared -Wl,-soname,$(INFERENCE_SO_NAME) \
+ $(LIBS)
$(INFERENCE_SO_NAME): $(INFERENCE_SO_PATH)
diff --git a/tensorflow/contrib/metrics/python/ops/metric_ops.py b/tensorflow/contrib/metrics/python/ops/metric_ops.py
index 463bd60300..76986d0156 100644
--- a/tensorflow/contrib/metrics/python/ops/metric_ops.py
+++ b/tensorflow/contrib/metrics/python/ops/metric_ops.py
@@ -34,6 +34,7 @@ from tensorflow.python.ops import metrics_impl
from tensorflow.python.ops import nn
from tensorflow.python.ops import state_ops
from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import weights_broadcast_ops
from tensorflow.python.util.deprecation import deprecated
@@ -651,7 +652,7 @@ def _streaming_confusion_matrix_at_thresholds(
label_is_neg = math_ops.logical_not(label_is_pos)
if weights is not None:
- broadcast_weights = _broadcast_weights(
+ broadcast_weights = weights_broadcast_ops.broadcast_weights(
math_ops.to_float(weights), predictions)
weights_tiled = array_ops.tile(array_ops.reshape(
broadcast_weights, [1, -1]), [num_thresholds, 1])
@@ -955,7 +956,7 @@ def streaming_specificity_at_sensitivity(
def streaming_sensitivity_at_specificity(
predictions, labels, specificity, weights=None, num_thresholds=200,
metrics_collections=None, updates_collections=None, name=None):
- """Computes the specificity at a given sensitivity.
+ """Computes the sensitivity at a given specificity.
The `streaming_sensitivity_at_specificity` function creates four local
variables, `true_positives`, `true_negatives`, `false_positives` and
@@ -1924,7 +1925,7 @@ def streaming_covariance(predictions,
weighted_predictions = predictions
weighted_labels = labels
else:
- weights = _broadcast_weights(weights, labels)
+ weights = weights_broadcast_ops.broadcast_weights(weights, labels)
batch_count = math_ops.reduce_sum(weights) # n_B in eqn
weighted_predictions = math_ops.multiply(predictions, weights)
weighted_labels = math_ops.multiply(labels, weights)
@@ -2051,7 +2052,7 @@ def streaming_pearson_correlation(predictions,
# Broadcast weights here to avoid duplicate broadcasting in each call to
# `streaming_covariance`.
if weights is not None:
- weights = _broadcast_weights(weights, labels)
+ weights = weights_broadcast_ops.broadcast_weights(weights, labels)
cov, update_cov = streaming_covariance(
predictions, labels, weights=weights, name='covariance')
var_predictions, update_var_predictions = streaming_covariance(
diff --git a/tensorflow/contrib/slim/python/slim/data/tfexample_decoder_test.py b/tensorflow/contrib/slim/python/slim/data/tfexample_decoder_test.py
index 506f4bd877..96606b9c0e 100644
--- a/tensorflow/contrib/slim/python/slim/data/tfexample_decoder_test.py
+++ b/tensorflow/contrib/slim/python/slim/data/tfexample_decoder_test.py
@@ -228,7 +228,10 @@ class TFExampleDecoderTest(test.TestCase):
image_shape = (2, 3, 3)
unused_image, serialized_example = self.GenerateImage(
image_format='jpeg', image_shape=image_shape)
- with self.assertRaises(TypeError):
+ # decode_raw supports uint16 now, so ValueError will be thrown instead.
+ with self.assertRaisesRegexp(
+ ValueError,
+ 'true_fn and false_fn must have the same type: uint16, uint8'):
unused_decoded_image = self.RunDecodeExample(
serialized_example,
tfexample_decoder.Image(dtype=dtypes.uint16),
diff --git a/tensorflow/contrib/timeseries/README.md b/tensorflow/contrib/timeseries/README.md
index 2b36ade986..0e15d162dd 100644
--- a/tensorflow/contrib/timeseries/README.md
+++ b/tensorflow/contrib/timeseries/README.md
@@ -2,7 +2,7 @@
TensorFlow Time Series (TFTS) is a collection of ready-to-use classic models
(state space, autoregressive), and flexible infrastructure for building
-high-performance time series models whatever the architecture. It includes tools
+high-performance time series models with custom architectures. It includes tools
for chunking and batching a series, and for saving model state across chunks,
making use of parallel computation even when training sequential models on long
series (using truncated backpropagation).
diff --git a/tensorflow/contrib/verbs/rdma.cc b/tensorflow/contrib/verbs/rdma.cc
index ec5adfdaa0..26e18b28aa 100644
--- a/tensorflow/contrib/verbs/rdma.cc
+++ b/tensorflow/contrib/verbs/rdma.cc
@@ -165,9 +165,10 @@ void RdmaAdapter::Process_CQ() {
RdmaBuffer* ab = rc->tx_ack_buffer_;
ab->SendNextItem();
// find buffer
- RdmaBuffer* tb = rc->FindBuffer(rm.name_);
+ RdmaTensorBuffer* tb =
+ reinterpret_cast<RdmaTensorBuffer*>(rc->FindBuffer(rm.name_));
tb->SetBufferStatus(remote, idle);
- worker_env_->compute_pool->Schedule([tb]() { tb->SendNextItem(); });
+ worker_env_->compute_pool->Schedule([tb]() { tb->ReSendNextItem(); });
} else if (rm.type_ == RDMA_MESSAGE_BUFFER_REQUEST) {
// remote host requests to create a tensor buffer;
// send ack to release remote tx message buffer
@@ -198,7 +199,8 @@ void RdmaAdapter::Process_CQ() {
RdmaBuffer* ab = rc->tx_ack_buffer_;
ab->SendNextItem();
// find buffer
- RdmaBuffer* tb = rc->FindBuffer(rm.name_);
+ RdmaTensorBuffer* tb =
+ reinterpret_cast<RdmaTensorBuffer*>(rc->FindBuffer(rm.name_));
CHECK(rm.buffer_size_ == tb->size_)
<< "rm.buffer_size = " << rm.buffer_size_
<< "tb->size_ = " << tb->size_ << "rm.name_ = " << rm.name_;
@@ -208,7 +210,7 @@ void RdmaAdapter::Process_CQ() {
tb->SetRemoteMR(rmr, true);
tb->SetBufferStatus(local, idle);
tb->SetBufferStatus(remote, idle);
- worker_env_->compute_pool->Schedule([tb]() { tb->SendNextItem(); });
+ worker_env_->compute_pool->Schedule([tb]() { tb->ReSendNextItem(); });
} else if (rm.type_ == RDMA_MESSAGE_TENSOR_WRITE) {
// tensor RDMA write completed
worker_env_->compute_pool->Schedule([rm, rc]() {
@@ -624,6 +626,12 @@ RdmaMessageBuffer::RdmaMessageBuffer(RdmaChannel* channel, string name)
RdmaTensorBuffer::RdmaTensorBuffer(RdmaChannel* channel, string name)
: RdmaBuffer(channel, name) {}
+RdmaTensorBuffer::~RdmaTensorBuffer() {
+ for (Itable it = retable.begin(); it != retable.end(); ++it) {
+ delete (it->second);
+ }
+}
+
// Send the next ack from the buffer's job queue.
void RdmaAckBuffer::SendNextItem() {
uint32_t imm_data = LookupBufferIndex("rx_ack_buffer");
@@ -655,6 +663,99 @@ void RdmaMessageBuffer::SendNextItem() {
}
}
+Rendezvous::DoneCallback RdmaTensorBuffer::getRecvTensorCallback(
+ const string& key_with_step_id, const string& key, int64 step_id,
+ const Rendezvous::ParsedKey& parsed) {
+ Rendezvous::DoneCallback cb = [this, key_with_step_id, key, step_id, parsed](
+ const Status& status, const Rendezvous::Args& send_args,
+ const Rendezvous::Args& recv_args, const Tensor& in, bool is_dead) {
+ CHECK(status.ok()) << "RecvLocalAsync was not ok, key " << key_with_step_id
+ << " error message: " << status.error_message();
+ size_t buffer_size = RdmaMessage::kMessageTotalBytes;
+ size_t tensor_bytes = 0;
+ // Figures out which device the tensor is hosted on.
+ Device* src_dev = nullptr;
+ Status s = channel_->adapter_->worker_env_->device_mgr->LookupDevice(
+ parsed.src_device, &src_dev);
+ CHECK(s.ok()) << "src device not found";
+ // Does the device have the right incarnation number we expect?
+ CHECK(src_dev->attributes().incarnation() == parsed.src_incarnation)
+ << "RecvTensor expects a different device incarnation: "
+ << parsed.src_incarnation << " vs. "
+ << src_dev->attributes().incarnation()
+ << ". Your worker job was probably restarted. Check your "
+ << "worker job for the reason why it was restarted.";
+ Device* dst_dev = nullptr;
+ // destination is on CPU.
+ s = channel_->adapter_->worker_env_->device_mgr->LookupDevice("CPU:0",
+ &dst_dev);
+ CHECK(s.ok()) << "dst device not found";
+ AllocatorAttributes dst_alloc_attr;
+ dst_alloc_attr.set_on_host(true);
+
+ bool can_memcpy = DataTypeCanUseMemcpy(in.dtype());
+ // string tensor needs to be serialized
+ Tensor copy;
+ TensorProto proto;
+ if (src_dev->tensorflow_gpu_device_info() &&
+ (!send_args.alloc_attrs.on_host())) {
+ CHECK(send_args.device_context)
+ << "send dev name: " << src_dev->name()
+ << " gpu_info: " << src_dev->tensorflow_gpu_device_info();
+
+ if (can_memcpy) {
+ AllocatorAttributes host_alloc_attrs;
+ host_alloc_attrs.set_gpu_compatible(true);
+ host_alloc_attrs.set_on_host(true);
+ Allocator* alloc = ProcessState::singleton()->GetCUDAHostAllocator(0);
+ copy = Tensor(alloc, in.dtype(), in.shape());
+ tensor_bytes = in.TotalBytes();
+ buffer_size += tensor_bytes;
+ GPUUtil::CopyGPUTensorToCPU(
+ src_dev, send_args.device_context, &in, &copy,
+ [this, copy, tensor_bytes, buffer_size, key, in, step_id,
+ key_with_step_id, is_dead, send_args, recv_args](const Status& s) {
+ CHECK(s.ok()) << "copy tensor from gpu sync";
+ StringPiece copy_buf;
+ copy_buf = copy.tensor_data();
+ PostCopyOperations(true, buffer_size, tensor_bytes, key, in,
+ step_id, is_dead, key_with_step_id, &copy,
+ NULL, &copy_buf, send_args, recv_args);
+ });
+ } else {
+ // "val" is on a GPU. No longer uses GPUUtil to fill the proto, use
+ // async instead
+ GPUUtil::SetProtoFromGPU(
+ in, src_dev, send_args.device_context, &proto, is_dead,
+ [this, proto, buffer_size, key, in, step_id, key_with_step_id,
+ is_dead, send_args, recv_args](const Status& s) mutable {
+ CHECK(s.ok()) << "copy proto from gpu sync";
+ auto tensor_bytes = proto.ByteSize();
+ buffer_size += tensor_bytes;
+ PostCopyOperations(false, buffer_size, tensor_bytes, key, in,
+ step_id, is_dead, key_with_step_id, NULL,
+ &proto, NULL, send_args, recv_args);
+ });
+ }
+ } else {
+ // tensor is in CPU memory.
+ StringPiece copy_buf;
+ if (can_memcpy) {
+ copy_buf = in.tensor_data();
+ tensor_bytes = in.TotalBytes();
+ } else {
+ in.AsProtoTensorContent(&proto);
+ tensor_bytes = proto.ByteSize();
+ }
+ buffer_size += tensor_bytes;
+ PostCopyOperations(can_memcpy, buffer_size, tensor_bytes, key, in,
+ step_id, is_dead, key_with_step_id, &copy, &proto,
+ &copy_buf, send_args, recv_args);
+ }
+ };
+ return cb;
+}
+
// Send the next tensor from the buffer's job queue.
void RdmaTensorBuffer::SendNextItem() {
// get the key
@@ -666,6 +767,7 @@ void RdmaTensorBuffer::SendNextItem() {
queue_.pop();
}
}
+
// send the tensor if a key is acquired.
if (key_with_step_id != "") {
VLOG(2) << "try to send tensor: " << key_with_step_id;
@@ -675,107 +777,54 @@ void RdmaTensorBuffer::SendNextItem() {
CHECK(key.compare(name_) == 0);
Rendezvous::ParsedKey parsed;
Rendezvous::ParseKey(key, &parsed);
- Rendezvous::DoneCallback cb = [this, key_with_step_id, key, step_id,
- parsed](const Status& status,
- const Rendezvous::Args& send_args,
- const Rendezvous::Args& recv_args,
- const Tensor& in, bool is_dead) {
- CHECK(status.ok()) << "RecvLocalAsync was not ok, key" << key_with_step_id
- << " error message: " << status.error_message();
- size_t buffer_size = RdmaMessage::kMessageTotalBytes;
- size_t tensor_bytes = 0;
- // Figures out which device the tensor is hosted on.
- Device* src_dev = nullptr;
- Status s = channel_->adapter_->worker_env_->device_mgr->LookupDevice(
- parsed.src_device, &src_dev);
- CHECK(s.ok()) << "src device not found";
- // Does the device have the right incarnation number we expect?
- CHECK(src_dev->attributes().incarnation() == parsed.src_incarnation)
- << "RecvTensor expects a different device incarnation: "
- << parsed.src_incarnation << " vs. "
- << src_dev->attributes().incarnation()
- << ". Your worker job was probably restarted. Check your "
- << "worker job for the reason why it was restarted.";
- Device* dst_dev = nullptr;
- // destination is on CPU.
- s = channel_->adapter_->worker_env_->device_mgr->LookupDevice("CPU:0",
- &dst_dev);
- CHECK(s.ok()) << "dst device not found";
- AllocatorAttributes dst_alloc_attr;
- dst_alloc_attr.set_on_host(true);
-
- bool can_memcpy = DataTypeCanUseMemcpy(in.dtype());
- // string tensor needs to be serialized
- Tensor copy;
- TensorProto proto;
- if (src_dev->tensorflow_gpu_device_info() &&
- (!send_args.alloc_attrs.on_host())) {
- CHECK(send_args.device_context)
- << "send dev name: " << src_dev->name()
- << " gpu_info: " << src_dev->tensorflow_gpu_device_info();
-
- if (can_memcpy) {
- AllocatorAttributes host_alloc_attrs;
- host_alloc_attrs.set_gpu_compatible(true);
- host_alloc_attrs.set_on_host(true);
- Allocator* alloc = ProcessState::singleton()->GetCUDAHostAllocator(0);
- copy = Tensor(alloc, in.dtype(), in.shape());
- tensor_bytes = in.TotalBytes();
- buffer_size += tensor_bytes;
- GPUUtil::CopyGPUTensorToCPU(
- src_dev, send_args.device_context, &in, &copy,
- [this, copy, tensor_bytes, buffer_size, key, in, step_id,
- key_with_step_id, is_dead](const Status& s) {
- CHECK(s.ok()) << "copy tensor from gpu sync";
- StringPiece copy_buf;
- copy_buf = copy.tensor_data();
- PostCopyOperations(true, buffer_size, tensor_bytes, key, in,
- step_id, is_dead, key_with_step_id, &copy,
- NULL, &copy_buf);
- });
- } else {
- // "val" is on a GPU. No longer uses GPUUtil to fill the proto, use
- // aync instead
- GPUUtil::SetProtoFromGPU(
- in, src_dev, send_args.device_context, &proto, is_dead,
- [this, proto, buffer_size, key, in, step_id, key_with_step_id,
- is_dead](const Status& s) mutable {
- CHECK(s.ok()) << "copy proto from gpu sync";
- auto tensor_bytes = proto.ByteSize();
- buffer_size += tensor_bytes;
- PostCopyOperations(false, buffer_size, tensor_bytes, key, in,
- step_id, is_dead, key_with_step_id, NULL,
- &proto, NULL);
- });
- }
- } else {
- // tensor is in CPU memory.
- StringPiece copy_buf;
- if (can_memcpy) {
- copy_buf = in.tensor_data();
- tensor_bytes = in.TotalBytes();
- } else {
- in.AsProtoTensorContent(&proto);
- tensor_bytes = proto.ByteSize();
- }
- buffer_size += tensor_bytes;
- PostCopyOperations(can_memcpy, buffer_size, tensor_bytes, key, in,
- step_id, is_dead, key_with_step_id, &copy, &proto,
- &copy_buf);
- }
- // maybe some margin for string tensor?
- };
-
+ Rendezvous::DoneCallback cb =
+ getRecvTensorCallback(key_with_step_id, key, step_id, parsed);
channel_->adapter_->worker_env_->rendezvous_mgr->RecvLocalAsync(step_id,
parsed, cb);
}
}
+void RdmaTensorBuffer::ReSendNextItem() {
+ // get the key
+ string key_with_step_id = "";
+ {
+ mutex_lock lock{mu_};
+ if (!requeue.empty()) {
+ key_with_step_id = requeue.front();
+ requeue.pop();
+ }
+ }
+
+ // send the tensor if a key is acquired.
+ if (key_with_step_id != "") {
+ VLOG(2) << "try to send tensor: " << key_with_step_id;
+ string key;
+ int64 step_id;
+ VerbsUtil::GetKeyAndStepId(key_with_step_id, key, step_id);
+ CHECK(key.compare(name_) == 0);
+ Rendezvous::ParsedKey parsed;
+ Rendezvous::ParseKey(key, &parsed);
+ Rendezvous::DoneCallback cb =
+ getRecvTensorCallback(key_with_step_id, key, step_id, parsed);
+ ReItem* item;
+ {
+ mutex_lock lock{mu_};
+ Itable it = retable.find(key_with_step_id);
+ CHECK(it != retable.end()) << "Could not find dup-recv context";
+ item = it->second;
+ retable.erase(it);
+ }
+ cb(Status::OK(), item->send_args, item->recv_args, item->in, item->is_dead);
+ delete (item);
+ }
+}
+
void RdmaTensorBuffer::PostCopyOperations(
bool can_memcpy, size_t buffer_size, size_t tensor_bytes, const string& key,
const Tensor& in, int64 step_id, bool is_dead,
const string& key_with_step_id, const Tensor* copy,
- const TensorProto* proto, const StringPiece* copy_buf) {
+ const TensorProto* proto, const StringPiece* copy_buf,
+ const Rendezvous::Args& send_args, const Rendezvous::Args& recv_args) {
// prepare message
RdmaMessage rm;
rm.name_size_ = key.size();
@@ -793,9 +842,12 @@ void RdmaTensorBuffer::PostCopyOperations(
VLOG(2) << "Extend RDMA buffer from " << size_ << " to " << buffer_size;
}
CreateCPUBuffer(buffer_size, false);
+ // Need to be received again, put into the re-recv queue and the table
+ requeue.push(key_with_step_id);
+ ReItem* item = new ReItem(send_args, recv_args, in, is_dead);
+ retable.insert(std::pair<string, ReItem*>(key_with_step_id, item));
mu_.unlock();
- // put back the key since it is not sent;
- EnqueueItem(key_with_step_id);
+ // no longer used: put back the key since it is not sent;
// ask the remote to create the same buffer
rm.type_ = RDMA_MESSAGE_BUFFER_REQUEST;
rm.remote_addr_ = reinterpret_cast<uint64_t>(buffer_);
@@ -841,9 +893,11 @@ void RdmaTensorBuffer::PostCopyOperations(
}
Write(imm_data, buffer_size);
} else {
+ // Need to be received again, put into the re-recv queue and the table
+ requeue.push(key_with_step_id);
+ ReItem* item = new ReItem(send_args, recv_args, in, is_dead);
+ retable.insert(std::pair<string, ReItem*>(key_with_step_id, item));
mu_.unlock();
- // put back the key since it is not sent;
- EnqueueItem(key_with_step_id);
}
}
diff --git a/tensorflow/contrib/verbs/rdma.h b/tensorflow/contrib/verbs/rdma.h
index 16ef58bc62..e1e07db776 100644
--- a/tensorflow/contrib/verbs/rdma.h
+++ b/tensorflow/contrib/verbs/rdma.h
@@ -28,6 +28,7 @@ limitations under the License.
#include <vector>
#include "tensorflow/core/distributed_runtime/worker_env.h"
+#include "tensorflow/core/framework/rendezvous.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/types.h"
@@ -224,14 +225,57 @@ class RdmaMessageBuffer : public RdmaBuffer {
class RdmaTensorBuffer : public RdmaBuffer {
public:
explicit RdmaTensorBuffer(RdmaChannel* channel, string name);
- virtual ~RdmaTensorBuffer() override {}
+ virtual ~RdmaTensorBuffer() override;
void SendNextItem() override;
void PostCopyOperations(bool can_memcpy, size_t buffer_size,
size_t tensor_bytes, const string& key,
const Tensor& in, int64 step_id, bool is_dead,
const string& key_with_step_id, const Tensor* copy,
- const TensorProto* proto,
- const StringPiece* copy_buf);
+ const TensorProto* proto, const StringPiece* copy_buf,
+ const Rendezvous::Args& send_args,
+ const Rendezvous::Args& recv_args);
+
+ void ReSendNextItem();
+
+ private:
+ Rendezvous::DoneCallback getRecvTensorCallback(
+ const string& key_with_step_id, const string& key, int64 step_id,
+ const Rendezvous::ParsedKey& parsed);
+
+ struct ReItem {
+ Rendezvous::Args send_args;
+ Rendezvous::Args recv_args;
+ Tensor in;
+ bool is_dead;
+
+ ReItem(const Rendezvous::Args& send_args_,
+ const Rendezvous::Args& recv_args_, const Tensor& in_, bool is_dead_)
+ : send_args(send_args_),
+ recv_args(recv_args_),
+ in(in_),
+ is_dead(is_dead_) {
+ if (send_args.device_context) {
+ send_args.device_context->Ref();
+ }
+ if (recv_args.device_context) {
+ recv_args.device_context->Ref();
+ }
+ }
+
+ ~ReItem() {
+ if (send_args.device_context) {
+ send_args.device_context->Unref();
+ }
+ if (recv_args.device_context) {
+ recv_args.device_context->Unref();
+ }
+ }
+ };
+ typedef std::map<string, ReItem*> Table;
+ typedef Table::iterator Itable;
+
+ std::queue<string> requeue GUARDED_BY(mu_);
+ Table retable GUARDED_BY(mu_);
};
struct RdmaMessage {
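
The new ReItem above pins the send and recv DeviceContexts by calling Ref() in its constructor and Unref() in its destructor, so the saved rendezvous arguments stay valid until ReSendNextItem replays them. A minimal standalone sketch of that hold-a-reference pattern (RefCounted and Holder are simplified stand-ins, and the counter is not thread-safe):

    #include <iostream>

    // Simplified stand-in for a Ref()/Unref() counted object such as
    // DeviceContext. A sketch only.
    class RefCounted {
     public:
      void Ref() { ++refs_; }
      void Unref() {
        if (--refs_ == 0) {
          std::cout << "object destroyed\n";
          delete this;
        }
      }
     private:
      int refs_ = 1;  // creator owns the initial reference
    };

    // Holds one reference for its own lifetime, the role ReItem plays for
    // the saved send/recv DeviceContexts.
    class Holder {
     public:
      explicit Holder(RefCounted* obj) : obj_(obj) {
        if (obj_) obj_->Ref();
      }
      ~Holder() {
        if (obj_) obj_->Unref();
      }
      Holder(const Holder&) = delete;
      Holder& operator=(const Holder&) = delete;
     private:
      RefCounted* obj_;
    };

    int main() {
      RefCounted* ctx = new RefCounted;  // refcount 1
      {
        Holder pending(ctx);  // refcount 2: queued work keeps it alive
        ctx->Unref();         // creator drops its ref; object survives
      }                       // holder releases the last ref here
    }
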
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 6c1896d7ab..188036b7aa 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -790,13 +790,16 @@ cc_library(
]) + if_mkl([
"//tensorflow/core/kernels:mkl_concat_op",
"//tensorflow/core/kernels:mkl_conv_op",
+ "//tensorflow/core/kernels:mkl_cwise_ops_common",
"//tensorflow/core/kernels:mkl_fused_batch_norm_op",
"//tensorflow/core/kernels:mkl_identity_op",
+ "//tensorflow/core/kernels:mkl_input_conversion_op",
"//tensorflow/core/kernels:mkl_lrn_op",
"//tensorflow/core/kernels:mkl_pooling_ops",
"//tensorflow/core/kernels:mkl_relu_op",
"//tensorflow/core/kernels:mkl_reshape_op",
"//tensorflow/core/kernels:mkl_tfconv_op",
+ "//tensorflow/core/kernels:mkl_aggregate_ops",
]),
)
@@ -2481,10 +2484,13 @@ tf_cc_test_mkl(
"//tensorflow/cc:cc_ops",
"//tensorflow/cc:scope",
"//tensorflow/cc:sendrecv_ops",
+ "//tensorflow/core/kernels:mkl_aggregate_ops",
"//tensorflow/core/kernels:mkl_concat_op",
"//tensorflow/core/kernels:mkl_conv_op",
+ "//tensorflow/core/kernels:mkl_cwise_ops_common",
"//tensorflow/core/kernels:mkl_fused_batch_norm_op",
"//tensorflow/core/kernels:mkl_identity_op",
+ "//tensorflow/core/kernels:mkl_input_conversion_op",
"//tensorflow/core/kernels:mkl_lrn_op",
"//tensorflow/core/kernels:mkl_pooling_ops",
"//tensorflow/core/kernels:mkl_relu_op",
diff --git a/tensorflow/core/common_runtime/mkl_cpu_allocator.h b/tensorflow/core/common_runtime/mkl_cpu_allocator.h
index 005aabf9b8..f16da10d7a 100644
--- a/tensorflow/core/common_runtime/mkl_cpu_allocator.h
+++ b/tensorflow/core/common_runtime/mkl_cpu_allocator.h
@@ -75,12 +75,12 @@ class MklCPUAllocator : public Allocator {
// Hooks provided by this allocator for memory allocation routines from MKL
static inline void* MallocHook(size_t size) {
- VLOG(2) << "MklCPUAllocator: In MallocHook";
+ VLOG(3) << "MklCPUAllocator: In MallocHook";
return cpu_allocator()->AllocateRaw(kAlignment, size);
}
static inline void FreeHook(void* ptr) {
- VLOG(2) << "MklCPUAllocator: In FreeHook";
+ VLOG(3) << "MklCPUAllocator: In FreeHook";
cpu_allocator()->DeallocateRaw(ptr);
}
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_worker_cache.cc b/tensorflow/core/distributed_runtime/rpc/grpc_worker_cache.cc
index 29acad34e9..06695db779 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_worker_cache.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_worker_cache.cc
@@ -69,9 +69,8 @@ class GrpcWorkerCache : public WorkerCachePartial {
} else {
SharedGrpcChannelPtr channel = channel_cache_->FindWorkerChannel(target);
if (!channel) return nullptr;
- WorkerInterface* ret = NewGrpcRemoteWorker(&live_rpc_counter_, channel,
- &completion_queue_, &logger_);
- return ret;
+ return NewGrpcRemoteWorker(&live_rpc_counter_, channel,
+ &completion_queue_, &logger_);
}
}
diff --git a/tensorflow/core/distributed_runtime/worker_cache_partial.cc b/tensorflow/core/distributed_runtime/worker_cache_partial.cc
index 90d5e78884..61e5416234 100644
--- a/tensorflow/core/distributed_runtime/worker_cache_partial.cc
+++ b/tensorflow/core/distributed_runtime/worker_cache_partial.cc
@@ -29,7 +29,7 @@ namespace tensorflow {
bool WorkerCachePartial::GetDeviceLocalityNonBlocking(
const string& device_name, DeviceLocality* locality) {
mutex_lock lock(mu_); // could use reader lock
- const auto& iter = device_status_cache_.find(device_name);
+ auto iter = device_status_cache_.find(device_name);
if (iter != device_status_cache_.end()) {
*locality = iter->second.locality();
return true;
@@ -44,16 +44,8 @@ void WorkerCachePartial::GetDeviceLocalityAsync(const string& device_name,
// If cache entry was empty, make one try to fill it by RPC.
SchedClosure([this, &device_name, locality, done]() {
Status s = RefreshDeviceStatus(device_name);
- if (s.ok()) {
- if (!GetDeviceLocalityNonBlocking(device_name, locality)) {
- mutex_lock lock(mu_);
- const auto& iter = device_status_cache_.find(device_name);
- if (iter == device_status_cache_.end()) {
- s = errors::Unavailable("No known remote device: ", device_name);
- } else {
- s = errors::Internal("Failed to find locality for ", device_name);
- }
- }
+ if (s.ok() && !GetDeviceLocalityNonBlocking(device_name, locality)) {
+ s = errors::Unavailable("No known remote device: ", device_name);
}
done(s);
});
@@ -70,7 +62,9 @@ Status WorkerCachePartial::RefreshDeviceStatus(const string& device_name) {
s = errors::InvalidArgument("Bad device name to RefreshDeviceStatus: ",
device_name);
}
- auto deleter = [this, task](WorkerInterface* wi) { ReleaseWorker(task, wi); };
+ auto deleter = [this, &task](WorkerInterface* wi) {
+ ReleaseWorker(task, wi);
+ };
std::unique_ptr<WorkerInterface, decltype(deleter)> rwi(CreateWorker(task),
deleter);
if (s.ok() && !rwi.get()) {
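
RefreshDeviceStatus keeps the worker alive through a std::unique_ptr with a lambda deleter, so ReleaseWorker runs on every return path. The idiom in isolation (Pool and Worker are illustrative types, not the real worker cache):

    #include <iostream>
    #include <memory>

    struct Worker { int id; };

    struct Pool {
      Worker* Acquire() { return new Worker{42}; }
      void Release(Worker* w) {
        std::cout << "released worker " << w->id << "\n";
        delete w;
      }
    };

    int main() {
      Pool pool;
      // Routing destruction through the pool guarantees Release() runs on
      // every exit path: normal return, early return, or exception.
      auto deleter = [&pool](Worker* w) { pool.Release(w); };
      std::unique_ptr<Worker, decltype(deleter)> w(pool.Acquire(), deleter);
      std::cout << "using worker " << w->id << "\n";
    }  // deleter fires here
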
diff --git a/tensorflow/core/graph/mkl_layout_pass.cc b/tensorflow/core/graph/mkl_layout_pass.cc
index cf5d6e8baa..90377e54c7 100644
--- a/tensorflow/core/graph/mkl_layout_pass.cc
+++ b/tensorflow/core/graph/mkl_layout_pass.cc
@@ -256,6 +256,7 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
public:
MklLayoutRewritePass() {
// NOTE: names are alphabetically sorted.
+ csinfo_.addn = "AddN";
csinfo_.avg_pool = "AvgPool";
csinfo_.avg_pool_grad = "AvgPoolGrad";
csinfo_.bias_add = "BiasAdd";
@@ -279,17 +280,31 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
csinfo_.mkl_conv2d_with_bias = "_MklConv2DWithBias";
csinfo_.mkl_conv2d_with_bias_backprop_bias =
"_MklConv2DWithBiasBackpropBias";
- csinfo_.relu = "Relu";
- csinfo_.relu_grad = "ReluGrad";
- csinfo_.reshape = "Reshape";
- csinfo_.split = "Split";
+ csinfo_.relu = "Relu";
+ csinfo_.relu_grad = "ReluGrad";
+ csinfo_.reshape = "Reshape";
+ csinfo_.split = "Split";
+ // Element-wise ops. When adding a new op here, also add it to the
+ // IsMklElementWiseOp method in mkl_util.h so that the MklInputConversion
+ // op is inserted before it.
+ csinfo_.add = "Add";
+ csinfo_.maximum = "Maximum";
+ csinfo_.mul = "Mul";
+ csinfo_.squared_difference = "SquaredDifference";
+ csinfo_.sub = "Sub";
+ // End - element-wise ops. See note above.
// NOTE: names are alphabetically sorted.
+ rinfo_.push_back({csinfo_.addn,
+ mkl_op_registry::GetMklOpName(csinfo_.addn),
+ CopyAttrsAddN, AddNRewrite, nullptr});
+ rinfo_.push_back({csinfo_.add,
+ mkl_op_registry::GetMklOpName(csinfo_.add),
+ CopyAttrsDataType, AlwaysRewrite, nullptr});
rinfo_.push_back({csinfo_.avg_pool,
- GetMklOpName(csinfo_.avg_pool),
+ mkl_op_registry::GetMklOpName(csinfo_.avg_pool),
CopyAttrsPooling, AlwaysRewrite, nullptr});
rinfo_.push_back({csinfo_.avg_pool_grad,
- GetMklOpName(csinfo_.avg_pool_grad),
+ mkl_op_registry::GetMklOpName(csinfo_.avg_pool_grad),
CopyAttrsPooling, AlwaysRewrite, nullptr});
// BiasAddGrad gets rewritten into Conv2DWithBiasBackpropBias depending
// on whether the context contains Conv2D.
@@ -303,50 +318,62 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
CopyAttrsBiasAddGrad, ContextMatchRewrite,
&biasaddgrad_matmul_context_});
rinfo_.push_back({csinfo_.concat,
- GetMklOpName(csinfo_.concat),
+ mkl_op_registry::GetMklOpName(csinfo_.concat),
CopyAttrsConcat, AlwaysRewrite, nullptr});
rinfo_.push_back({csinfo_.concatv2,
- GetMklOpName(csinfo_.concatv2),
+ mkl_op_registry::GetMklOpName(csinfo_.concatv2),
CopyAttrsConcatV2, AlwaysRewrite, nullptr});
rinfo_.push_back({csinfo_.conv2d,
- GetMklOpName(csinfo_.conv2d),
+ mkl_op_registry::GetMklOpName(csinfo_.conv2d),
CopyAttrsConv2D, AlwaysRewrite, nullptr});
rinfo_.push_back({csinfo_.conv2d_grad_filter,
- GetMklOpName(csinfo_.conv2d_grad_filter),
+ mkl_op_registry::GetMklOpName(csinfo_.conv2d_grad_filter),
CopyAttrsConv2D, AlwaysRewrite, nullptr});
rinfo_.push_back({csinfo_.conv2d_grad_input,
- GetMklOpName(csinfo_.conv2d_grad_input),
+ mkl_op_registry::GetMklOpName(csinfo_.conv2d_grad_input),
CopyAttrsConv2D, AlwaysRewrite, nullptr});
rinfo_.push_back({csinfo_.fused_batch_norm,
- GetMklOpName(csinfo_.fused_batch_norm),
+ mkl_op_registry::GetMklOpName(csinfo_.fused_batch_norm),
CopyAttrsFusedBatchNorm, AlwaysRewrite, nullptr});
rinfo_.push_back({csinfo_.fused_batch_norm_grad,
- GetMklOpName(csinfo_.fused_batch_norm_grad),
+ mkl_op_registry::GetMklOpName(csinfo_.fused_batch_norm_grad),
CopyAttrsFusedBatchNorm, AlwaysRewrite, nullptr});
rinfo_.push_back({csinfo_.identity,
- GetMklOpName(csinfo_.identity),
+ mkl_op_registry::GetMklOpName(csinfo_.identity),
CopyAttrsIdentity, AlwaysRewrite, nullptr});
rinfo_.push_back({csinfo_.lrn,
- GetMklOpName(csinfo_.lrn),
+ mkl_op_registry::GetMklOpName(csinfo_.lrn),
CopyAttrsLRN, AlwaysRewrite, nullptr});
rinfo_.push_back({csinfo_.lrn_grad,
- GetMklOpName(csinfo_.lrn_grad),
+ mkl_op_registry::GetMklOpName(csinfo_.lrn_grad),
CopyAttrsLRN, AlwaysRewrite, nullptr});
rinfo_.push_back({csinfo_.max_pool,
- GetMklOpName(csinfo_.max_pool),
+ mkl_op_registry::GetMklOpName(csinfo_.max_pool),
CopyAttrsPooling, NonDepthBatchWisePoolRewrite, nullptr});
rinfo_.push_back({csinfo_.max_pool_grad,
- GetMklOpName(csinfo_.max_pool_grad),
+ mkl_op_registry::GetMklOpName(csinfo_.max_pool_grad),
CopyAttrsPooling, AlwaysRewrite, nullptr});
+ rinfo_.push_back({csinfo_.maximum,
+ mkl_op_registry::GetMklOpName(csinfo_.maximum),
+ CopyAttrsDataType, AlwaysRewrite, nullptr});
+ rinfo_.push_back({csinfo_.mul,
+ mkl_op_registry::GetMklOpName(csinfo_.mul),
+ CopyAttrsDataType, AlwaysRewrite, nullptr});
rinfo_.push_back({csinfo_.relu,
- GetMklOpName(csinfo_.relu),
- CopyAttrsRelu, AlwaysRewrite, nullptr});
+ mkl_op_registry::GetMklOpName(csinfo_.relu),
+ CopyAttrsDataType, AlwaysRewrite, nullptr});
rinfo_.push_back({csinfo_.relu_grad,
- GetMklOpName(csinfo_.relu_grad),
- CopyAttrsRelu, AlwaysRewrite, nullptr});
+ mkl_op_registry::GetMklOpName(csinfo_.relu_grad),
+ CopyAttrsDataType, AlwaysRewrite, nullptr});
rinfo_.push_back({csinfo_.reshape,
- GetMklOpName(csinfo_.reshape),
+ mkl_op_registry::GetMklOpName(csinfo_.reshape),
CopyAttrsReshape, AlwaysRewrite, nullptr});
+ rinfo_.push_back({csinfo_.squared_difference,
+ mkl_op_registry::GetMklOpName(csinfo_.squared_difference),
+ CopyAttrsDataType, AlwaysRewrite, nullptr});
+ rinfo_.push_back({csinfo_.sub,
+ mkl_op_registry::GetMklOpName(csinfo_.sub),
+ CopyAttrsDataType, AlwaysRewrite, nullptr});
// Add info about which ops to add workspace edge to and the slots.
wsinfo_.push_back({csinfo_.lrn, csinfo_.lrn_grad, 0, 2, 1, 3});
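
Every rinfo_ registration above follows one tuple shape. The RewriteInfo declaration itself is not part of this diff, so the struct below is only an inferred sketch of that shape, with illustrative names and opaque forward declarations:

#include <string>

struct Node;         // opaque here
class NodeBuilder;   // opaque here
struct ContextInfo;  // opaque here

struct RewriteInfoSketch {
  std::string name;      // original TF op, e.g. "AddN"
  std::string new_name;  // Mkl op, e.g. GetMklOpName("AddN") -> "_MklAddN"
  void (*copy_attrs)(const Node*, NodeBuilder*);          // e.g. CopyAttrsAddN
  bool (*rewrite_rule)(const Node*, const ContextInfo*);  // e.g. AddNRewrite
  ContextInfo* context;  // nullptr except for context-based rewrites
};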
@@ -429,6 +456,8 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
/// Structure to store all constant strings
/// NOTE: names are alphabetically sorted.
typedef struct {
+ string add;
+ string addn;
string avg_pool;
string avg_pool_grad;
string bias_add;
@@ -446,15 +475,19 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
string matmul;
string max_pool;
string max_pool_grad;
+ string maximum;
string mkl_conv2d;
string mkl_conv2d_grad_input;
string mkl_conv2d_grad_filter;
string mkl_conv2d_with_bias;
string mkl_conv2d_with_bias_backprop_bias;
+ string mul;
string relu;
string relu_grad;
string reshape;
string split;
+ string squared_difference;
+ string sub;
} ConstStringsInfo;
private:
@@ -502,15 +535,6 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
return N;
}
- // Get the name of Mkl op from original TensorFlow op
- // We prefix 'Mkl' to the original op to get Mkl op.
- // TODO(nhasabni) We should move this to mkl_util.h.
- inline string GetMklOpName(const string& name) const {
- // Prefix that we add to Tensorflow op name to construct Mkl op name.
- const char* const kMklOpPrefix = "_Mkl";
- return string(kMklOpPrefix) + name;
- }
-
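
The helper deleted above now lives in mkl_op_registry, per the TODO it carried. A sketch of the relocated function, assuming the "_Mkl" prefixing is unchanged from the removed body:

#include <string>

namespace mkl_op_registry {
// Prefix added to a TensorFlow op name to construct the Mkl op name.
inline std::string GetMklOpName(const std::string& name) {
  static const char* const kMklOpPrefix = "_Mkl";
  return std::string(kMklOpPrefix) + name;
}
}  // namespace mkl_op_registry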
// Can op represented by node 'n' run on DEVICE_CPU?
// Op can run on CPU with MKL if the runtime assigned device or the
// user requested device contains device CPU, or both are empty.
@@ -604,6 +628,19 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
return false;
}
+ static bool AddNRewrite(const Node* n, const ContextInfo* c) {
+ CHECK_NOTNULL(n);
+
+ int num;
+ TF_CHECK_OK(GetNodeAttr(n->def(), "N", &num));
+
+ // Rewrite AddN only when it has exactly two inputs.
+ return num == 2;
+ }
// Checks whether the BiasAddGrad node 'n' is associated with the
// Conv2DWithBias node specified in contextinfo 'ci'. Updates fwd_node to
// point to the Conv2DWithBias node if so.
@@ -907,15 +944,16 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
// We need operator-specific functions to copy attributes because the
// framework does not provide a generic one.
// NOTE: names are alphabetically sorted.
+ static void CopyAttrsAddN(const Node* orig_node, NodeBuilder* nb);
static void CopyAttrsBiasAddGrad(const Node* orig_node, NodeBuilder* nb);
static void CopyAttrsConcat(const Node* orig_node, NodeBuilder* nb);
static void CopyAttrsConcatV2(const Node* orig_node, NodeBuilder* nb);
static void CopyAttrsConv2D(const Node* orig_node, NodeBuilder* nb);
+ static void CopyAttrsDataType(const Node* orig_node, NodeBuilder* nb);
static void CopyAttrsFusedBatchNorm(const Node* orig_node, NodeBuilder* nb);
static void CopyAttrsIdentity(const Node* orig_node, NodeBuilder* nb);
static void CopyAttrsLRN(const Node* orig_node, NodeBuilder* nb);
static void CopyAttrsPooling(const Node* orig_node, NodeBuilder* nb);
- static void CopyAttrsRelu(const Node* orig_node, NodeBuilder* nb);
static void CopyAttrsReshape(const Node* orig_node, NodeBuilder* nb);
static void CopyAttrsSplit(const Node* orig_node, NodeBuilder* nb);
@@ -1334,7 +1372,7 @@ void MklLayoutRewritePass::AddWorkSpaceEdgeIfNeeded(
TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
for (auto ws : wsinfo_) {
if (orig_node->type_string() == ws.fwd_op &&
- mkl_op_registry::IsMklOp(GetMklOpName(orig_node->type_string()), T)) {
+ mkl_op_registry::IsMklOp(
+ mkl_op_registry::GetMklOpName(orig_node->type_string()), T)) {
// If this op is a fwd op, then we need to check if there is an
// edge from this node's fwd_slot to bwdop's bwd_slot. If there is
// an edge, then we just add an attribute on this node for setting
@@ -1360,7 +1398,7 @@ void MklLayoutRewritePass::AddWorkSpaceEdgeIfNeeded(
nb->Attr("workspace_enabled", false);
}
} else if (orig_node->type_string() == ws.bwd_op &&
- mkl_op_registry::IsMklOp(GetMklOpName(orig_node->type_string()),
+ mkl_op_registry::IsMklOp(mkl_op_registry::GetMklOpName(orig_node->type_string()),
T)) {
// If this op is a bwd op, then we need to add workspace edge and
// it's Mkl tensor edge between its corresponding fwd op and this
@@ -1376,7 +1414,7 @@ void MklLayoutRewritePass::AddWorkSpaceEdgeIfNeeded(
if (e->src_output() == ws.fwd_slot &&
// We would have rewritten the forward op, so we need to use
// GetMklOpName to get its Mkl name.
- e->src()->type_string() == GetMklOpName(ws.fwd_op) &&
+ e->src()->type_string() == mkl_op_registry::GetMklOpName(ws.fwd_op) &&
e->dst_input() == ws.bwd_slot) {
nb->Attr("workspace_enabled", true);
CHECK_NOTNULL(ws_tensors);
@@ -1455,6 +1493,20 @@ void MklLayoutRewritePass::CopyAttrsConv2D(const Node* orig_node,
nb->Attr("use_cudnn_on_gpu", use_cudnn_on_gpu);
}
+void MklLayoutRewritePass::CopyAttrsAddN(const Node* orig_node,
+ NodeBuilder* nb) {
+ DataType T;
+ int N;
+
+ // Get all attributes from old node.
+ TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
+ TF_CHECK_OK(GetNodeAttr(orig_node->def(), "N", &N));
+
+ // Add attributes to new node.
+ nb->Attr("T", T);
+ nb->Attr("N", N);
+}
+
void MklLayoutRewritePass::CopyAttrsBiasAddGrad(const Node* orig_node,
NodeBuilder* nb) {
DataType T;
@@ -1527,8 +1579,8 @@ void MklLayoutRewritePass::CopyAttrsPooling(const Node* orig_node,
nb->Attr("data_format", data_format);
}
-void MklLayoutRewritePass::CopyAttrsRelu(const Node* orig_node,
- NodeBuilder* nb) {
+void MklLayoutRewritePass::CopyAttrsDataType(const Node* orig_node,
+ NodeBuilder* nb) {
DataType T;
// Get all attributes from old node.
@@ -1894,7 +1946,15 @@ Status MklLayoutRewritePass::RewriteNode(std::unique_ptr<Graph>* g,
}
// Get all inputs.
- const int num_inputs = orig_node->in_edges().size();
+ int num_inputs = orig_node->in_edges().size();
+
+ // Do not count control edges as data inputs.
+ for (const Edge* e : orig_node->in_edges()) {
+ if (e->IsControlEdge()) {
+ num_inputs--;
+ }
+ }
+
gtl::InlinedVector<Node*, 4> control_edges;
gtl::InlinedVector<std::pair<Node*, int>, 4> inputs(num_inputs);
FillInputs(orig_node, &control_edges, &inputs);
@@ -2008,7 +2068,34 @@ MklLayoutRewritePass::CheckForNodeRewrite(const Node* n) const {
// BiasAddGrad is not an Mkl layer, so we make an exception for it.
if (n->type_string() != csinfo_.bias_add_grad) {
- if (!mkl_op_registry::IsMklOp(GetMklOpName(n->type_string()), T)) {
+ if (!mkl_op_registry::IsMklOp(
+ mkl_op_registry::GetMklOpName(n->type_string()), T)) {
+ return nullptr;
+ }
+ }
+
+ // For an element-wise node, we reuse the Eigen implementation and pass
+ // the MKL metadata tensor through so we can avoid conversions. However,
+ // if all incoming edges are in TF format, we don't need this overhead, so
+ // we replace the element-wise node only if at least one of its parents is
+ // an MKL node.
+ //
+ // TODO(vrane): Add an implementation for element-wise ops that doesn't
+ // reuse Eigen code, to reduce the cross-library dependency.
+ if (mkl_op_registry::IsMklElementWiseOp(
+ mkl_op_registry::GetMklOpName(n->type_string()), T)) {
+ bool incoming_mkl_edge = false;
+ for (auto parent : n->in_edges()) {
+ if (mkl_op_registry::IsMklOp(
+ mkl_op_registry::GetMklOpName(parent->src()->type_string()), T)) {
+ incoming_mkl_edge = true;
+ break;
+ } else {
+ VLOG(1) << "Non-MKL parent is: " << parent->src()->type_string();
+ }
+ }
+ if (!incoming_mkl_edge) {
+ VLOG(1) << "Skipping replacement of element-wise node with no MKL "
+ "parents.";
return nullptr;
}
}
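
The check above calls mkl_op_registry::IsMklElementWiseOp, whose body is not shown in this diff. A hedged sketch of what it plausibly does, matching the five element-wise ops this pass registers; the real function also takes the data type T, which the sketch ignores:

#include <string>

namespace sketch {
inline bool IsMklElementWiseOp(const std::string& mkl_op_name) {
  return mkl_op_name == "_MklAdd" || mkl_op_name == "_MklMaximum" ||
         mkl_op_name == "_MklMul" ||
         mkl_op_name == "_MklSquaredDifference" || mkl_op_name == "_MklSub";
}
}  // namespace sketch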
diff --git a/tensorflow/core/graph/mkl_layout_pass_test.cc b/tensorflow/core/graph/mkl_layout_pass_test.cc
index bd1d74368e..6a41e3965a 100644
--- a/tensorflow/core/graph/mkl_layout_pass_test.cc
+++ b/tensorflow/core/graph/mkl_layout_pass_test.cc
@@ -133,19 +133,19 @@ TEST_F(MklLayoutPassTest, Basic) {
InitGraph(
"node { name: 'A' op: 'Input'}"
"node { name: 'B' op: 'Input'}"
- "node { name: 'C' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+ "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
" input: ['A', 'B'] }"
- "node { name: 'D' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+ "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
" input: ['A', 'B'] }");
EXPECT_EQ(DoMklLayoutOptimizationPass(),
- "A(Input);B(Input);C(Mul);D(Mul)|"
+ "A(Input);B(Input);C(Zeta);D(Zeta)|"
"A->C;A->D;B->C:1;B->D:1");
}
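
Note on the test updates throughout this file: Add, Mul, and Sub (among other element-wise ops) now have Mkl rewrites registered above, so graphs built on them would no longer pass through the optimization unchanged. The tests switch to Zeta and Polygamma, presumably because they are two-input float ops with no _Mkl variant, so the pass leaves them alone and each test keeps exercising only the op it targets.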
// Test set 1: Conv2D + BiasAdd
-// C=_MklConv2D(A,M,B,N); E=BiasAdd(C,D); Z=Sub(E,Y) (for interleaved ordering)
-// C=_MklConv2D(A,B,M,N); E=BiasAdd(C,D); Z=Sub(E,Y) (for contiguous ordering)
+// C=_MklConv2D(A,M,B,N); E=BiasAdd(C,D); Z=Zeta(E,Y) (for interleaved ordering)
+// C=_MklConv2D(A,B,M,N); E=BiasAdd(C,D); Z=Zeta(E,Y) (for contiguous ordering)
TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Positive) {
CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
InitGraph(
@@ -166,18 +166,18 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Positive) {
" attr { key: 'data_format' value { s: 'NCHW' } }"
" input: ['C', 'D'] }"
"node { name: 'Y' op: 'Input'}"
- "node { name: 'Z' op: 'Sub'"
+ "node { name: 'Z' op: 'Zeta'"
" attr {key: 'T' value { type: DT_FLOAT } }"
" input: ['E', 'Y']}");
EXPECT_EQ(DoMklLayoutOptimizationPass(),
"A(Input);B(Input);D(Input);DMT/_0(Const);E(_MklConv2DWithBias);"
- "M(_MklInput);N(_MklInput);Y(Input);Z(Sub)|A->E;"
+ "M(_MklInput);N(_MklInput);Y(Input);Z(Zeta)|A->E;"
"A:control->DMT/_0:control;B->E:1;D->E:2;DMT/_0->E:5;E->Z;M->E:3;"
"N->E:4;Y->Z:1");
}
-// C=_MklConv2D(A,M:1,B,N:1); E=BiasAdd(C,D); Z=Sub(E,Y) (for interleaved)
-// C=_MklConv2D(A,B,M:1,N:1); E=BiasAdd(C,D); Z=Sub(E,Y) (for contiguous)
+// C=_MklConv2D(A,M:1,B,N:1); E=BiasAdd(C,D); Z=Zeta(E,Y) (for interleaved)
+// C=_MklConv2D(A,B,M:1,N:1); E=BiasAdd(C,D); Z=Zeta(E,Y) (for contiguous)
// Test for correct output slots selected
TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Positive1) {
CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
@@ -199,17 +199,17 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Positive1) {
" attr { key: 'data_format' value { s: 'NCHW' } }"
" input: ['C', 'D'] }"
"node { name: 'Y' op: 'Input'}"
- "node { name: 'Z' op: 'Sub'"
+ "node { name: 'Z' op: 'Zeta'"
" attr {key: 'T' value { type: DT_FLOAT } }"
" input: ['E', 'Y']}");
EXPECT_EQ(DoMklLayoutOptimizationPass(),
"A(Input);B(Input);D(Input);DMT/_0(Const);E(_MklConv2DWithBias);"
- "M(_MklInput2);N(_MklInput2);Y(Input);Z(Sub)|A->E;"
+ "M(_MklInput2);N(_MklInput2);Y(Input);Z(Zeta)|A->E;"
"A:control->DMT/_0:control;B->E:1;D->E:2;DMT/_0->E:5;E->Z;"
"M:1->E:3;N:1->E:4;Y->Z:1");
}
-// C=Conv2D(A,B); E=BiasAdd(C,D); Z=Sub(E,Y);
+// C=Conv2D(A,B); E=BiasAdd(C,D); Z=Zeta(E,Y);
// This is a case of node rewrite followed by node merge.
// We will first rewrite Conv2D to _MklConv2D, and then merge _MklConv2D
// with BiasAdd to produce _MklConv2DWithBias.
@@ -231,12 +231,12 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Positive2) {
" attr { key: 'data_format' value { s: 'NCHW' } }"
" input: ['C', 'D'] }"
"node { name: 'Y' op: 'Input'}"
- "node { name: 'Z' op: 'Sub'"
+ "node { name: 'Z' op: 'Zeta'"
" attr {key: 'T' value { type: DT_FLOAT } }"
" input: ['E', 'Y']}");
EXPECT_EQ(DoMklLayoutOptimizationPass(),
"A(Input);B(Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
- "DMT/_2(Const);E(_MklConv2DWithBias);Y(Input);Z(Sub)|"
+ "DMT/_2(Const);E(_MklConv2DWithBias);Y(Input);Z(Zeta)|"
"A->E;A:control->DMT/_0:control;A:control->DMT/_1:control;"
"A:control->DMT/_2:control;B->E:1;D->E:2;DMT/_0->E:3;DMT/_1->E:4;"
"DMT/_2->E:5;E->Z;Y->Z:1");
@@ -286,7 +286,7 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Negative_Dataflow1) {
"M(_MklInput);N(_MklInput)|A->C;B->C:1;D->F;E->F:1;M->C:2;N->C:3");
}
-// _MklConv2D has two outgoing edges: BiasAdd and some other dummy node (Add).
+// _MklConv2D has two outgoing edges: BiasAdd and some other dummy node (Zeta).
// Merge should not be done in such a case.
TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Negative_Dataflow2) {
InitGraph(
@@ -308,12 +308,12 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Negative_Dataflow2) {
" attr { key: 'data_format' value { s: 'NCHW' } }"
" input: ['D', 'E'] }" // Conv2D has two outputs.
// No merge should happen.
- "node { name: 'G' op: 'Add'"
+ "node { name: 'G' op: 'Zeta'"
" attr { key: 'T' value { type: DT_FLOAT } }"
" input: ['C', 'E'] }");
EXPECT_EQ(DoMklLayoutOptimizationPass(),
"A(Input);B(Input);C(_MklConv2D);D(Input);E(Input);F(BiasAdd);"
- "G(Add);M(_MklInput);N(_MklInput)|A->C;B->C:1;C->G;D->F;"
+ "G(Zeta);M(_MklInput);N(_MklInput)|A->C;B->C:1;C->G;D->F;"
"E->F:1;E->G:1;M->C:2;N->C:3");
}
@@ -362,7 +362,7 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_Positive) {
" attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
" attr { key: 'padding' value { s: 'SAME' } }"
" input: ['A', 'B', 'C', 'M', 'N', 'O']}"
- "node { name: 'E' op: 'Sub'"
+ "node { name: 'E' op: 'Zeta'"
" attr {key: 'T' value { type: DT_FLOAT } }"
" input: ['D', 'A']}"
"node { name: 'F' op: 'Int32Input'}"
@@ -387,7 +387,7 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_Positive) {
" input: ['E'] }");
EXPECT_EQ(DoMklLayoutOptimizationPass(),
"A(Input);B(Input);C(Input);D(_MklConv2DWithBias);DMT/_0(Const);"
- "E(Sub);F(Int32Input);G(_MklConv2DBackpropFilter);H(Int32Input);"
+ "E(Zeta);F(Int32Input);G(_MklConv2DBackpropFilter);H(Int32Input);"
"I(_MklConv2DBackpropInput);J(_MklConv2DWithBiasBackpropBias);"
"M(_MklInput);N(_MklInput);O(_MklInput)|A->D;A->E:1;A->G;B->D:1;"
"B->I:1;C->D:2;D->E;DMT/_0->J:1;E->G:2;E->I:2;E->J;"
@@ -413,7 +413,7 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_Negative1) {
" attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
" attr { key: 'padding' value { s: 'SAME' } }"
" input: ['A', 'B', 'C', 'M', 'N', 'O']}"
- "node { name: 'E' op: 'Sub'"
+ "node { name: 'E' op: 'Zeta'"
" attr {key: 'T' value { type: DT_FLOAT } }"
" input: ['D', 'A']}"
"node { name: 'F' op: 'Int32Input'}"
@@ -438,7 +438,7 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_Negative1) {
" input: ['E'] }");
EXPECT_EQ(DoMklLayoutOptimizationPass(),
"A(Input);B(Input);C(Input);D(_MklConv2DWithBias);"
- "E(Sub);F(Int32Input);G(_MklConv2DBackpropFilter);H(Int32Input);"
+ "E(Zeta);F(Int32Input);G(_MklConv2DBackpropFilter);H(Int32Input);"
"I(_MklConv2DBackpropInput);J(BiasAddGrad);"
"M(_MklInput);N(_MklInput);O(_MklInput)|A->D;A->E:1;A->G:2;B->D:1;"
"B->I:1;C->D:2;D->E;E->G;E->I:2;E->J;F->G:1;H->I;M->D:3;M->G:3;"
@@ -463,7 +463,7 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_Negative2) {
" attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
" attr { key: 'padding' value { s: 'SAME' } }"
" input: ['B', 'A', 'C', 'M', 'N', 'O']}"
- "node { name: 'E' op: 'Sub'"
+ "node { name: 'E' op: 'Zeta'"
" attr {key: 'T' value { type: DT_FLOAT } }"
" input: ['D', 'A']}"
"node { name: 'F' op: 'Int32Input'}"
@@ -488,7 +488,7 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_Negative2) {
" input: ['E'] }");
EXPECT_EQ(DoMklLayoutOptimizationPass(),
"A(Input);B(Input);C(Input);D(_MklConv2DWithBias);"
- "E(Sub);F(Int32Input);G(_MklConv2DBackpropFilter);H(Int32Input);"
+ "E(Zeta);F(Int32Input);G(_MklConv2DBackpropFilter);H(Int32Input);"
"I(_MklConv2DBackpropInput);J(BiasAddGrad);"
"M(_MklInput);N(_MklInput);O(_MklInput)|A->D:1;A->E:1;A->G;B->D;"
"B->I:1;C->D:2;D->E;E->G:2;E->I:2;E->J;F->G:1;H->I;M->D:3;M->G:3;"
@@ -512,7 +512,7 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_BpropFilter_Positive) {
" attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
" attr { key: 'padding' value { s: 'SAME' } }"
" input: ['A', 'B', 'C', 'M', 'N', 'O']}"
- "node { name: 'E' op: 'Sub'"
+ "node { name: 'E' op: 'Zeta'"
" attr {key: 'T' value { type: DT_FLOAT } }"
" input: ['D', 'A']}"
"node { name: 'F' op: 'Int32Input'}"
@@ -529,7 +529,7 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_BpropFilter_Positive) {
" input: ['E'] }");
EXPECT_EQ(DoMklLayoutOptimizationPass(),
"A(Input);B(Input);C(Input);D(_MklConv2DWithBias);DMT/_0(Const);"
- "E(Sub);F(Int32Input);G(_MklConv2DBackpropFilter);"
+ "E(Zeta);F(Int32Input);G(_MklConv2DBackpropFilter);"
"H(_MklConv2DWithBiasBackpropBias);M(_MklInput);N(_MklInput);"
"O(_MklInput)|A->D;A->E:1;A->G;B->D:1;C->D:2;D->E;DMT/_0->H:1;"
"E->G:2;E->H;E:control->DMT/_0:control;F->G:1;M->D:3;M->G:3;"
@@ -553,7 +553,7 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_BpropFilter_Negative1) {
" attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
" attr { key: 'padding' value { s: 'SAME' } }"
" input: ['A', 'B', 'C', 'M', 'N', 'O']}"
- "node { name: 'E' op: 'Sub'"
+ "node { name: 'E' op: 'Zeta'"
" attr {key: 'T' value { type: DT_FLOAT } }"
" input: ['D', 'A']}"
"node { name: 'F' op: 'Int32Input'}"
@@ -570,7 +570,7 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_BpropFilter_Negative1) {
" input: ['E'] }");
EXPECT_EQ(DoMklLayoutOptimizationPass(),
"A(Input);B(Input);C(Input);D(_MklConv2DWithBias);"
- "E(Sub);F(Int32Input);G(_MklConv2DBackpropFilter);H(BiasAddGrad);"
+ "E(Zeta);F(Int32Input);G(_MklConv2DBackpropFilter);H(BiasAddGrad);"
"M(_MklInput);N(_MklInput);O(_MklInput)|A->D;A->E:1;A->G:2;B->D:1;"
"C->D:2;D->E;E->G;E->H;F->G:1;M->D:3;M->G:3;N->D:4;N->G:4;O->D:5;"
"O->G:5");
@@ -593,7 +593,7 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_BpropFilter_Negative2) {
" attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
" attr { key: 'padding' value { s: 'SAME' } }"
" input: ['B', 'A', 'C', 'M', 'N', 'O']}"
- "node { name: 'E' op: 'Sub'"
+ "node { name: 'E' op: 'Zeta'"
" attr {key: 'T' value { type: DT_FLOAT } }"
" input: ['D', 'A']}"
"node { name: 'F' op: 'Int32Input'}"
@@ -610,7 +610,7 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_BpropFilter_Negative2) {
" input: ['E'] }");
EXPECT_EQ(DoMklLayoutOptimizationPass(),
"A(Input);B(Input);C(Input);D(_MklConv2DWithBias);"
- "E(Sub);F(Int32Input);G(_MklConv2DBackpropFilter);H(BiasAddGrad);"
+ "E(Zeta);F(Int32Input);G(_MklConv2DBackpropFilter);H(BiasAddGrad);"
"M(_MklInput);N(_MklInput);O(_MklInput)|A->D:1;A->E:1;A->G;B->D;"
"C->D:2;D->E;E->G:2;E->H;F->G:1;M->D:3;M->G:3;N->D:4;N->G:4;O->D:5;"
"O->G:5");
@@ -618,8 +618,8 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_BpropFilter_Negative2) {
// No _MklConv2DWithBias in context, but _MklConv2D in context.
// No rewrite for BiasAddGrad should happen.
-// C=_MklConv2D(A,M,B,N); D=Sub(C,A); E=BiasAddGrad(D) (for interleaved)
-// C=_MklConv2D(A,B,M,N); D=Sub(C,A); E=BiasAddGrad(D) (for contiguous)
+// C=_MklConv2D(A,M,B,N); D=Zeta(C,A); E=BiasAddGrad(D) (for interleaved)
+// C=_MklConv2D(A,B,M,N); D=Zeta(C,A); E=BiasAddGrad(D) (for contiguous)
TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_Neg_NoMklConv2DWithBias) {
InitGraph(
"node { name: 'A' op: 'Input'}"
@@ -633,7 +633,7 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_Neg_NoMklConv2DWithBias) {
" attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
" attr { key: 'padding' value { s: 'SAME' } }"
" input: ['A', 'B', 'M', 'N']}"
- "node { name: 'D' op: 'Sub'"
+ "node { name: 'D' op: 'Zeta'"
" attr {key: 'T' value { type: DT_FLOAT } }"
" input: ['C', 'A']}"
"node { name: 'E' op: 'BiasAddGrad'"
@@ -641,21 +641,21 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_Neg_NoMklConv2DWithBias) {
" attr { key: 'data_format' value { s: 'NCHW' } }"
" input: ['D'] }");
EXPECT_EQ(DoMklLayoutOptimizationPass(),
- "A(Input);B(Input);C(_MklConv2D);D(Sub);E(BiasAddGrad);"
+ "A(Input);B(Input);C(_MklConv2D);D(Zeta);E(BiasAddGrad);"
"M(_MklInput);N(_MklInput)|A->C;A->D:1;B->C:1;C->D;D->E;"
"M->C:2;N->C:3");
}
// No Conv2D in the context for BiasAddGrad. No rewrite should happen.
-// C=Add(A,B); D=Sub(C,A); E=BiasAddGrad(D)
+// C=Polygamma(A,B); D=Zeta(C,A); E=BiasAddGrad(D)
TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_Negative_NoConv2D) {
InitGraph(
"node { name: 'A' op: 'Input'}"
"node { name: 'B' op: 'Input'}"
- "node { name: 'C' op: 'Add'"
+ "node { name: 'C' op: 'Polygamma'"
" attr { key: 'T' value { type: DT_FLOAT } }"
" input: ['A', 'B']}"
- "node { name: 'D' op: 'Sub'"
+ "node { name: 'D' op: 'Zeta'"
" attr {key: 'T' value { type: DT_FLOAT } }"
" input: ['C', 'A']}"
"node { name: 'E' op: 'BiasAddGrad'"
@@ -663,13 +663,13 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_Negative_NoConv2D) {
" attr { key: 'data_format' value { s: 'NCHW' } }"
" input: ['D'] }");
EXPECT_EQ(DoMklLayoutOptimizationPass(),
- "A(Input);B(Input);C(Add);D(Sub);E(BiasAddGrad)|"
+ "A(Input);B(Input);C(Polygamma);D(Zeta);E(BiasAddGrad)|"
"A->C;A->D:1;B->C:1;C->D;D->E");
}
// No Conv2D in the context for BiasAddGrad, but MatMul in context.
// Rewrite should happen, but name of BiasAddGrad does not change.
-// C=MatMul(A,B); D=Sub(C,A); E=BiasAddGrad(D)
+// C=MatMul(A,B); D=Zeta(C,A); E=BiasAddGrad(D)
TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_Negative_NoConv2D_MatMul) {
InitGraph(
"node { name: 'A' op: 'Input'}"
@@ -679,7 +679,7 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_Negative_NoConv2D_MatMul) {
" attr { key: 'transpose_a' value { b: false } }"
" attr { key: 'transpose_b' value { b: false } }"
" input: ['A', 'B']}"
- "node { name: 'D' op: 'Sub'"
+ "node { name: 'D' op: 'Zeta'"
" attr {key: 'T' value { type: DT_FLOAT } }"
" input: ['C', 'A']}"
"node { name: 'E' op: 'BiasAddGrad'"
@@ -687,12 +687,12 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_Negative_NoConv2D_MatMul) {
" attr { key: 'data_format' value { s: 'NCHW' } }"
" input: ['D'] }");
EXPECT_EQ(DoMklLayoutOptimizationPass(),
- "A(Input);B(Input);C(MatMul);D(Sub);E(BiasAddGrad)|"
+ "A(Input);B(Input);C(MatMul);D(Zeta);E(BiasAddGrad)|"
"A->C;A->D:1;B->C:1;C->D;D->E");
}
// Test set 3: MatMul..BiasAddGrad -> BiasAddGrad rewrite tests
-// C=MatMul(A,B); D=Sub(C,A); E=BiasAddGrad(D)
+// C=MatMul(A,B); D=Zeta(C,A); E=BiasAddGrad(D)
TEST_F(MklLayoutPassTest, NodeMerge_MatMulBiasAddGrad_Positive) {
InitGraph(
"node { name: 'A' op: 'Input'}"
@@ -702,7 +702,7 @@ TEST_F(MklLayoutPassTest, NodeMerge_MatMulBiasAddGrad_Positive) {
" attr { key: 'transpose_a' value { b: false } }"
" attr { key: 'transpose_b' value { b: false } }"
" input: ['A', 'B']}"
- "node { name: 'D' op: 'Sub'"
+ "node { name: 'D' op: 'Zeta'"
" attr {key: 'T' value { type: DT_FLOAT } }"
" input: ['C', 'A']}"
"node { name: 'E' op: 'BiasAddGrad'"
@@ -710,20 +710,20 @@ TEST_F(MklLayoutPassTest, NodeMerge_MatMulBiasAddGrad_Positive) {
" attr { key: 'data_format' value { s: 'NCHW' } }"
" input: ['D'] }");
EXPECT_EQ(DoMklLayoutOptimizationPass(),
- "A(Input);B(Input);C(MatMul);D(Sub);E(BiasAddGrad)|"
+ "A(Input);B(Input);C(MatMul);D(Zeta);E(BiasAddGrad)|"
"A->C;A->D:1;B->C:1;C->D;D->E");
}
// No MatMul in the context for BiasAddGrad. No rewrite should happen.
-// C=Add(A,B); D=Sub(C,A); E=BiasAddGrad(D)
+// C=Polygamma(A,B); D=Zeta(C,A); E=BiasAddGrad(D)
TEST_F(MklLayoutPassTest, NodeMerge_MatMulBiasAddGrad_Negative_NoMatMul) {
InitGraph(
"node { name: 'A' op: 'Input'}"
"node { name: 'B' op: 'Input'}"
- "node { name: 'C' op: 'Add'"
+ "node { name: 'C' op: 'Polygamma'"
" attr { key: 'T' value { type: DT_FLOAT } }"
" input: ['A', 'B']}"
- "node { name: 'D' op: 'Sub'"
+ "node { name: 'D' op: 'Zeta'"
" attr {key: 'T' value { type: DT_FLOAT } }"
" input: ['C', 'A']}"
"node { name: 'E' op: 'BiasAddGrad'"
@@ -731,7 +731,7 @@ TEST_F(MklLayoutPassTest, NodeMerge_MatMulBiasAddGrad_Negative_NoMatMul) {
" attr { key: 'data_format' value { s: 'NCHW' } }"
" input: ['D'] }");
EXPECT_EQ(DoMklLayoutOptimizationPass(),
- "A(Input);B(Input);C(Add);D(Sub);E(BiasAddGrad)|"
+ "A(Input);B(Input);C(Polygamma);D(Zeta);E(BiasAddGrad)|"
"A->C;A->D:1;B->C:1;C->D;D->E");
}
@@ -752,10 +752,10 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Conv2D_Basic) {
" attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
" attr { key: 'padding' value { s: 'SAME' } }"
" input: ['A', 'B']}"
- "node { name: 'D' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+ "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
" input: ['B', 'C'] }");
EXPECT_EQ(DoMklLayoutOptimizationPass(),
- "A(Input);B(Input);C(_MklConv2D);D(Mul);DMT/_0(Const);"
+ "A(Input);B(Input);C(_MklConv2D);D(Zeta);DMT/_0(Const);"
"DMT/_1(Const)|A->C;A:control->DMT/_0:control;"
"A:control->DMT/_1:control;B->C:1;B->D;C->D:1;DMT/_0->C:2;"
"DMT/_1->C:3");
@@ -781,11 +781,11 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Conv2D_Positive1) {
" attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
" attr { key: 'padding' value { s: 'SAME' } }"
" input: ['A', 'C']}"
- "node { name: 'E' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+ "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
" input: ['C', 'D'] }");
EXPECT_EQ(DoMklLayoutOptimizationPass(),
"A(Input);B(Input);C(_MklConv2D);D(_MklConv2D);DMT/_0(Const);"
- "DMT/_1(Const);DMT/_2(Const);E(Mul)|A->C;A->D;"
+ "DMT/_1(Const);DMT/_2(Const);E(Zeta)|A->C;A->D;"
"A:control->DMT/_0:control;A:control->DMT/_1:control;"
"A:control->DMT/_2:control;B->C:1;C->D:1;C->E;"
"C:2->D:3;D->E:1;DMT/_0->C:2;DMT/_1->C:3;DMT/_2->D:2");
@@ -803,10 +803,10 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Conv2D_Negative_UnsupportedType) {
" attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
" attr { key: 'padding' value { s: 'SAME' } }"
" input: ['A', 'B']}"
- "node { name: 'D' op: 'Mul' attr { key: 'T' value { type: DT_HALF } }"
+ "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_HALF } }"
" input: ['B', 'C'] }");
EXPECT_EQ(DoMklLayoutOptimizationPass(),
- "A(HalfInput);B(HalfInput);C(Conv2D);D(Mul)|"
+ "A(HalfInput);B(HalfInput);C(Conv2D);D(Zeta)|"
"A->C;B->C:1;B->D;C->D:1");
}
@@ -822,11 +822,11 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Conv2DGradFilter_Positive) {
" attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
" attr { key: 'padding' value { s: 'SAME' } }"
" input: ['A', 'B', 'C']}"
- "node { name: 'E' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+ "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
" input: ['A', 'D'] }");
EXPECT_EQ(DoMklLayoutOptimizationPass(),
"A(Input);B(Int32Input);C(Input);D(_MklConv2DBackpropFilter);"
- "DMT/_0(Const);DMT/_1(Const);DMT/_2(Const);E(Mul)|"
+ "DMT/_0(Const);DMT/_1(Const);DMT/_2(Const);E(Zeta)|"
"A->D;A->E;A:control->DMT/_0:control;A:control->DMT/_1:control;"
"A:control->DMT/_2:control;B->D:1;C->D:2;D->E:1;DMT/_0->D:3;"
"DMT/_1->D:4;DMT/_2->D:5");
@@ -844,11 +844,11 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Conv2DGradInput_Positive) {
" attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
" attr { key: 'padding' value { s: 'SAME' } }"
" input: ['B', 'A', 'C']}"
- "node { name: 'E' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+ "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
" input: ['A', 'D'] }");
EXPECT_EQ(DoMklLayoutOptimizationPass(),
"A(Input);B(Int32Input);C(Input);D(_MklConv2DBackpropInput);"
- "DMT/_0(Const);DMT/_1(Const);DMT/_2(Const);E(Mul)|"
+ "DMT/_0(Const);DMT/_1(Const);DMT/_2(Const);E(Zeta)|"
"A->D:1;A->E;B->D;B:control->DMT/_0:control;"
"B:control->DMT/_1:control;B:control->DMT/_2:control;C->D:2;"
"D->E:1;DMT/_0->D:3;DMT/_1->D:4;DMT/_2->D:5");
@@ -869,11 +869,11 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Concat_Basic) {
" attr { key: 'T' value { type: DT_FLOAT } }"
" attr { key: 'N' value { i: 2 } }"
" input: ['A', 'B:0', 'B:1']}"
- "node { name: 'E' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+ "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
" input: ['C', 'D'] }");
EXPECT_EQ(DoMklLayoutOptimizationPass(),
"A(Const);B(InputList);C(Input);D(_MklConcat);DMT/_0(Const);"
- "DMT/_1(Const);DMT/_2(Const);E(Mul)|A->D;A:control->DMT/_0:control;"
+ "DMT/_1(Const);DMT/_2(Const);E(Zeta)|A->D;A:control->DMT/_0:control;"
"A:control->DMT/_1:control;A:control->DMT/_2:control;B->D:1;"
"B:1->D:2;C->E;D->E:1;DMT/_0->D:3;DMT/_1->D:4;DMT/_2->D:5");
}
@@ -908,12 +908,12 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Concat_Input_Mkl) {
" attr { key: 'T' value { type: DT_FLOAT } }"
" attr { key: 'N' value { i: 2 } }"
" input: ['G', 'E', 'F']}"
- "node { name: 'I' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+ "node { name: 'I' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
" input: ['A', 'H'] }");
EXPECT_EQ(DoMklLayoutOptimizationPass(),
"A(Input);B(Input);C(Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
"DMT/_2(Const);DMT/_3(Const);DMT/_4(Const);E(_MklConv2D);"
- "F(_MklConv2D);G(Const);H(_MklConcat);I(Mul)|A->E;A->I;"
+ "F(_MklConv2D);G(Const);H(_MklConcat);I(Zeta)|A->E;A->I;"
"A:control->DMT/_2:control;A:control->DMT/_3:control;"
"B->E:1;C->F;C:control->DMT/_0:control;C:control->DMT/_1:control;"
"D->F:1;DMT/_0->F:2;DMT/_1->F:3;DMT/_2->E:2;DMT/_3->E:3;"
@@ -935,7 +935,7 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Concat_Input_MixedMkl) {
" attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
" attr { key: 'padding' value { s: 'SAME' } }"
" input: ['A', 'B']}"
- "node { name: 'F' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+ "node { name: 'F' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
" input: ['C', 'D']}"
"node { name: 'G' op: 'Const' "
" attr { key: 'dtype' value { type: DT_INT32 } }"
@@ -946,12 +946,12 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Concat_Input_MixedMkl) {
" attr { key: 'T' value { type: DT_FLOAT } }"
" attr { key: 'N' value { i: 2 } }"
" input: ['G', 'E', 'F']}"
- "node { name: 'I' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+ "node { name: 'I' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
" input: ['A', 'H'] }");
EXPECT_EQ(DoMklLayoutOptimizationPass(),
"A(Input);B(Input);C(Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
- "DMT/_2(Const);DMT/_3(Const);E(_MklConv2D);F(Mul);G(Const);"
- "H(_MklConcat);I(Mul)|A->E;A->I;A:control->DMT/_0:control;"
+ "DMT/_2(Const);DMT/_3(Const);E(_MklConv2D);F(Zeta);G(Const);"
+ "H(_MklConcat);I(Zeta)|A->E;A->I;A:control->DMT/_0:control;"
"A:control->DMT/_1:control;B->E:1;C->F;D->F:1;DMT/_0->E:2;"
"DMT/_1->E:3;DMT/_2->H:3;DMT/_3->H:5;E->H:1;E:2->H:4;F->H:2;"
"G->H;G:control->DMT/_2:control;G:control->DMT/_3:control;H->I:1");
@@ -973,11 +973,11 @@ TEST_F(MklLayoutPassTest, NodeRewrite_ConcatV2_Basic) {
" attr { key: 'Tidx' value { type: DT_INT32 } }"
" attr { key: 'N' value { i: 2 } }"
" input: ['B:0', 'B:1', 'A']}"
- "node { name: 'E' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+ "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
" input: ['C', 'D'] }");
EXPECT_EQ(DoMklLayoutOptimizationPass(),
"A(Const);B(InputList);C(Input);D(_MklConcatV2);DMT/_0(Const);"
- "DMT/_1(Const);DMT/_2(Const);E(Mul)|A->D:2;B->D;B:1->D:1;"
+ "DMT/_1(Const);DMT/_2(Const);E(Zeta)|A->D:2;B->D;B:1->D:1;"
"B:control->DMT/_0:control;B:control->DMT/_1:control;"
"B:control->DMT/_2:control;C->E;D->E:1;DMT/_0->D:3;"
"DMT/_1->D:4;DMT/_2->D:5");
@@ -1014,12 +1014,12 @@ TEST_F(MklLayoutPassTest, NodeRewrite_ConcatV2_Input_Mkl) {
" attr { key: 'Tidx' value { type: DT_INT32 } }"
" attr { key: 'N' value { i: 2 } }"
" input: ['E', 'F', 'G']}"
- "node { name: 'I' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+ "node { name: 'I' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
" input: ['A', 'H'] }");
EXPECT_EQ(DoMklLayoutOptimizationPass(),
"A(Input);B(Input);C(Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
"DMT/_2(Const);DMT/_3(Const);DMT/_4(Const);E(_MklConv2D);"
- "F(_MklConv2D);G(Const);H(_MklConcatV2);I(Mul)|A->E;A->I;"
+ "F(_MklConv2D);G(Const);H(_MklConcatV2);I(Zeta)|A->E;A->I;"
"A:control->DMT/_2:control;A:control->DMT/_3:control;B->E:1;C->F;"
"C:control->DMT/_0:control;C:control->DMT/_1:control;"
"D->F:1;DMT/_0->F:2;DMT/_1->F:3;DMT/_2->E:2;DMT/_3->E:3;"
@@ -1041,7 +1041,7 @@ TEST_F(MklLayoutPassTest, NodeRewrite_ConcatV2_Input_MixedMkl) {
" attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
" attr { key: 'padding' value { s: 'SAME' } }"
" input: ['A', 'B']}"
- "node { name: 'F' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+ "node { name: 'F' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
" input: ['C', 'D']}"
"node { name: 'G' op: 'Const' "
" attr { key: 'dtype' value { type: DT_INT32 } }"
@@ -1053,12 +1053,12 @@ TEST_F(MklLayoutPassTest, NodeRewrite_ConcatV2_Input_MixedMkl) {
" attr { key: 'Tidx' value { type: DT_INT32 } }"
" attr { key: 'N' value { i: 2 } }"
" input: ['E', 'F', 'G']}"
- "node { name: 'I' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+ "node { name: 'I' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
" input: ['A', 'H'] }");
EXPECT_EQ(DoMklLayoutOptimizationPass(),
"A(Input);B(Input);C(Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
- "DMT/_2(Const);DMT/_3(Const);E(_MklConv2D);F(Mul);G(Const);"
- "H(_MklConcatV2);I(Mul)|A->E;A->I;A:control->DMT/_0:control;"
+ "DMT/_2(Const);DMT/_3(Const);E(_MklConv2D);F(Zeta);G(Const);"
+ "H(_MklConcatV2);I(Zeta)|A->E;A->I;A:control->DMT/_0:control;"
"A:control->DMT/_1:control;B->E:1;C->F;D->F:1;DMT/_0->E:2;"
"DMT/_1->E:3;DMT/_2->H:4;DMT/_3->H:5;E->H;E:2->H:3;"
"E:control->DMT/_2:control;E:control->DMT/_3:control;F->H:1;"
@@ -1071,10 +1071,10 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Relu_Positive) {
"node { name: 'B' op: 'Relu'"
" attr { key: 'T' value { type: DT_FLOAT } }"
" input: ['A'] }"
- "node { name: 'C' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+ "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
" input: ['A', 'B'] }");
EXPECT_EQ(DoMklLayoutOptimizationPass(),
- "A(Input);B(_MklRelu);C(Mul);DMT/_0(Const)|A->B;A->C;"
+ "A(Input);B(_MklRelu);C(Zeta);DMT/_0(Const)|A->B;A->C;"
"A:control->DMT/_0:control;B->C:1;DMT/_0->B:1");
}
@@ -1085,10 +1085,10 @@ TEST_F(MklLayoutPassTest, NodeRewrite_ReluGrad_Positive) {
"node { name: 'C' op: 'ReluGrad'"
" attr { key: 'T' value { type: DT_FLOAT } }"
" input: ['A', 'B'] }"
- "node { name: 'D' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+ "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
" input: ['A', 'C'] }");
EXPECT_EQ(DoMklLayoutOptimizationPass(),
- "A(Input);B(Input);C(_MklReluGrad);D(Mul);DMT/_0(Const);"
+ "A(Input);B(Input);C(_MklReluGrad);D(Zeta);DMT/_0(Const);"
"DMT/_1(Const)|A->C;A->D;A:control->DMT/_0:control;"
"A:control->DMT/_1:control;B->C:1;C->D:1;DMT/_0->C:2;DMT/_1->C:3");
}
@@ -1102,10 +1102,10 @@ TEST_F(MklLayoutPassTest, NodeRewrite_ReluReluGrad_Positive) {
"node { name: 'C' op: 'ReluGrad'"
" attr { key: 'T' value { type: DT_FLOAT } }"
" input: ['A', 'B'] }"
- "node { name: 'D' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+ "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
" input: ['A', 'C'] }");
EXPECT_EQ(DoMklLayoutOptimizationPass(),
- "A(Input);B(_MklRelu);C(_MklReluGrad);D(Mul);DMT/_0(Const);"
+ "A(Input);B(_MklRelu);C(_MklReluGrad);D(Zeta);DMT/_0(Const);"
"DMT/_1(Const)|A->B;A->C;A->D;A:control->DMT/_0:control;"
"A:control->DMT/_1:control;B->C:1;B:1->C:3;C->D:1;DMT/_0->B:1;"
"DMT/_1->C:2");
@@ -1121,10 +1121,10 @@ TEST_F(MklLayoutPassTest, NodeRewrite_AvgPool_Positive) {
" attr { key: 'padding' value { s: 'VALID' } }"
" attr { key: 'strides' value { list: {i: 1, i:1, i:2, i:2} } }"
" input: ['A'] }"
- "node { name: 'C' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+ "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
" input: ['A', 'B'] }");
EXPECT_EQ(DoMklLayoutOptimizationPass(),
- "A(Input);B(_MklAvgPool);C(Mul);DMT/_0(Const)|A->B;A->C;"
+ "A(Input);B(_MklAvgPool);C(Zeta);DMT/_0(Const)|A->B;A->C;"
"A:control->DMT/_0:control;B->C:1;DMT/_0->B:1");
}
@@ -1139,10 +1139,10 @@ TEST_F(MklLayoutPassTest, NodeRewrite_AvgPoolGrad_Positive) {
" attr { key: 'padding' value { s: 'VALID' } }"
" attr { key: 'strides' value { list: {i: 1, i:1, i:2, i:2} } }"
" input: ['A', 'B'] }"
- "node { name: 'D' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+ "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
" input: ['B', 'C'] }");
EXPECT_EQ(DoMklLayoutOptimizationPass(),
- "A(Int32Input);B(Input);C(_MklAvgPoolGrad);D(Mul);DMT/_0(Const);"
+ "A(Int32Input);B(Input);C(_MklAvgPoolGrad);D(Zeta);DMT/_0(Const);"
"DMT/_1(Const)|A->C;A:control->DMT/_0:control;"
"A:control->DMT/_1:control;B->C:1;B->D;C->D:1;DMT/_0->C:2;"
"DMT/_1->C:3");
@@ -1166,10 +1166,10 @@ TEST_F(MklLayoutPassTest, NodeRewrite_AvgPoolAvgPoolGrad_Positive) {
" attr { key: 'padding' value { s: 'VALID' } }"
" attr { key: 'strides' value { list: {i: 1, i:1, i:2, i:2} } }"
" input: ['I', 'B'] }"
- "node { name: 'D' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+ "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
" input: ['A', 'C'] }");
EXPECT_EQ(DoMklLayoutOptimizationPass(),
- "A(Input);B(_MklAvgPool);C(_MklAvgPoolGrad);D(Mul);DMT/_0(Const);"
+ "A(Input);B(_MklAvgPool);C(_MklAvgPoolGrad);D(Zeta);DMT/_0(Const);"
"DMT/_1(Const);I(Int32Input)|A->B;A->D;A:control->DMT/_0:control;"
"B->C:1;B:1->C:3;C->D:1;DMT/_0->B:1;DMT/_1->C:2;I->C;"
"I:control->DMT/_1:control");
@@ -1188,12 +1188,12 @@ TEST_F(MklLayoutPassTest, NodeRewrite_FusedBatchNormGrad_Positive) {
" attr { key: 'epsilon' value { f: 0.0001 } }"
" attr { key: 'is_training' value { b: true } }"
" input: ['A', 'B', 'C', 'D', 'E'] }"
- "node { name: 'G' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+ "node { name: 'G' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
" input: ['A', 'F'] }");
EXPECT_EQ(DoMklLayoutOptimizationPass(),
"A(Input);B(Input);C(Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
"DMT/_2(Const);DMT/_3(Const);DMT/_4(Const);E(Input);"
- "F(_MklFusedBatchNormGrad);G(Mul)|A->F;A->G;"
+ "F(_MklFusedBatchNormGrad);G(Zeta)|A->F;A->G;"
"A:control->DMT/_0:control;A:control->DMT/_1:control;"
"A:control->DMT/_2:control;A:control->DMT/_3:control;"
"A:control->DMT/_4:control;B->F:1;C->F:2;D->F:3;"
@@ -1214,12 +1214,12 @@ TEST_F(MklLayoutPassTest, NodeRewrite_FusedBatchNorm_Positive) {
" attr { key: 'epsilon' value { f: 0.0001 } }"
" attr { key: 'is_training' value { b: true } }"
" input: ['A', 'B', 'C', 'D', 'E'] }"
- "node { name: 'G' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+ "node { name: 'G' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
" input: ['A', 'F'] }");
EXPECT_EQ(DoMklLayoutOptimizationPass(),
"A(Input);B(Input);C(Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
"DMT/_2(Const);DMT/_3(Const);DMT/_4(Const);E(Input);"
- "F(_MklFusedBatchNorm);G(Mul)|A->F;A->G;"
+ "F(_MklFusedBatchNorm);G(Zeta)|A->F;A->G;"
"A:control->DMT/_0:control;A:control->DMT/_1:control;"
"A:control->DMT/_2:control;A:control->DMT/_3:control;"
"A:control->DMT/_4:control;B->F:1;C->F:2;D->F:3;"
@@ -1268,12 +1268,12 @@ TEST_F(MklLayoutPassTest, MaxPoolLRN_Positive) {
" attr { key: 'depth_radius' value { i: 2 } }"
" input: ['E', 'F', 'B'] }"
"node { name: 'H' op: 'Input'}"
- "node { name: 'I' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+ "node { name: 'I' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
" input: ['H', 'G'] }");
EXPECT_EQ(DoMklLayoutOptimizationPass(),
"A(Input);B(_MklLRN);C(_MklMaxPool);D(Input);DMT/_0(Const);DMT/_1(Const);"
"DMT/_2(Const);E(_MklMaxPoolGrad);F(Input);G(_MklLRNGrad);H(Input);"
- "I(Mul)|A->B;A:control->DMT/_0:control;B->C;B->E;B->G:2;B:1->G:3;"
+ "I(Zeta)|A->B;A:control->DMT/_0:control;B->C;B->E;B->G:2;B:1->G:3;"
"B:2->C:1;B:2->E:4;B:2->G:6;B:3->G:7;B:control->DMT/_1:control;C->E:1;"
"C:1->E:3;C:2->E:5;C:3->E:7;D->E:2;DMT/_0->B:1;DMT/_1->E:6;DMT/_2->G:5;"
"E->G;E:1->G:4;E:control->DMT/_2:control;F->G:1;G->I:1;H->I");
@@ -1301,11 +1301,11 @@ TEST_F(MklLayoutPassTest, LRN_Positive) {
" attr { key: 'data_format' value { s: 'NCHW' } }"
" attr { key: 'depth_radius' value { i: 2 } }"
" input: ['C', 'D', 'B'] }"
- "node { name: 'F' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+ "node { name: 'F' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
" input: ['C', 'E'] }");
EXPECT_EQ(DoMklLayoutOptimizationPass(),
"A(Input);B(_MklLRN);C(Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
- "DMT/_2(Const);E(_MklLRNGrad);F(Mul)|"
+ "DMT/_2(Const);E(_MklLRNGrad);F(Zeta)|"
"A->B;A:control->DMT/_0:control;B->E:2;B:1->E:3;B:2->E:6;B:3->E:7;"
"C->E;C->F;C:control->DMT/_1:control;C:control->DMT/_2:control;"
"D->E:1;DMT/_0->B:1;DMT/_1->E:4;DMT/_2->E:5;E->F:1");
@@ -1323,10 +1323,10 @@ TEST_F(MklLayoutPassTest, LRN_Negative1) {
" attr { key: 'data_format' value { s: 'NCHW' } }"
" attr { key: 'depth_radius' value { i: 2 } }"
" input: ['A'] }"
- "node { name: 'C' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+ "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
" input: ['A', 'B'] }");
EXPECT_EQ(DoMklLayoutOptimizationPass(),
- "A(Input);B(_MklLRN);C(Mul);DMT/_0(Const)|"
+ "A(Input);B(_MklLRN);C(Zeta);DMT/_0(Const)|"
"A->B;A->C;A:control->DMT/_0:control;B->C:1;DMT/_0->B:1");
}
@@ -1344,11 +1344,11 @@ TEST_F(MklLayoutPassTest, LRN_Negative2) {
" attr { key: 'data_format' value { s: 'NCHW' } }"
" attr { key: 'depth_radius' value { i: 2 } }"
" input: ['A', 'B', 'C'] }"
- "node { name: 'E' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+ "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
" input: ['A', 'D'] }");
EXPECT_EQ(DoMklLayoutOptimizationPass(),
"A(Input);B(Input);C(Input);D(_MklLRNGrad);DMT/_0(Const);"
- "DMT/_1(Const);DMT/_2(Const);DMT/_3(Const);DMT/_4(Const);E(Mul)|"
+ "DMT/_1(Const);DMT/_2(Const);DMT/_3(Const);DMT/_4(Const);E(Zeta)|"
"A->D;A->E;A:control->DMT/_0:control;A:control->DMT/_1:control;"
"A:control->DMT/_2:control;A:control->DMT/_3:control;"
"A:control->DMT/_4:control;B->D:1;C->D:2;D->E:1;DMT/_0->D:3;"
@@ -1386,12 +1386,12 @@ TEST_F(MklLayoutPassTest, LRN_Negative3) {
" attr { key: 'data_format' value { s: 'NCHW' } }"
" attr { key: 'depth_radius' value { i: 2 } }"
" input: ['C', 'B', 'D'] }"
- "node { name: 'G' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+ "node { name: 'G' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
" input: ['E', 'F'] }");
EXPECT_EQ(DoMklLayoutOptimizationPass(),
"A(Input);B(_MklLRN);C(Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
"DMT/_2(Const);DMT/_3(Const);DMT/_4(Const);DMT/_5(Const);"
- "DMT/_6(Const);E(_MklLRNGrad);F(_MklLRNGrad);G(Mul)|A->B;"
+ "DMT/_6(Const);E(_MklLRNGrad);F(_MklLRNGrad);G(Zeta)|A->B;"
"A:control->DMT/_0:control;B->E:2;"
"B->F:1;B:1->E:3;B:2->E:6;B:2->F:5;B:3->E:7;C->E;C->F;"
"C:control->DMT/_1:control;C:control->DMT/_2:control;"
@@ -1421,11 +1421,11 @@ TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Positive) {
" attr { key: 'padding' value { s: 'VALID' } }"
" attr { key: 'strides' value { list: {i: 1, i:1, i:2, i:2} } }"
" input: ['C', 'B', 'D'] }"
- "node { name: 'F' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+ "node { name: 'F' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
" input: ['C', 'E'] }");
EXPECT_EQ(DoMklLayoutOptimizationPass(),
"A(Input);B(_MklMaxPool);C(Input);D(Input);DMT/_0(Const);"
- "DMT/_1(Const);DMT/_2(Const);E(_MklMaxPoolGrad);F(Mul)|"
+ "DMT/_1(Const);DMT/_2(Const);E(_MklMaxPoolGrad);F(Zeta)|"
"A->B;A:control->DMT/_0:control;B->E:1;B:1->E:3;B:2->E:5;B:3->E:7;"
"C->E;C->F;C:control->DMT/_1:control;C:control->DMT/_2:control;"
"D->E:2;DMT/_0->B:1;DMT/_1->E:4;DMT/_2->E:6;E->F:1");
@@ -1444,10 +1444,10 @@ TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative1) {
" attr { key: 'padding' value { s: 'VALID' } }"
" attr { key: 'strides' value { list: {i: 1, i:1, i:2, i:2} } }"
" input: ['A'] }"
- "node { name: 'C' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+ "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
" input: ['A', 'B'] }");
EXPECT_EQ(DoMklLayoutOptimizationPass(),
- "A(Input);B(_MklMaxPool);C(Mul);DMT/_0(Const)|"
+ "A(Input);B(_MklMaxPool);C(Zeta);DMT/_0(Const)|"
"A->B;A->C;A:control->DMT/_0:control;B->C:1;DMT/_0->B:1");
}
@@ -1466,11 +1466,11 @@ TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative2) {
" attr { key: 'padding' value { s: 'VALID' } }"
" attr { key: 'strides' value { list: {i: 1, i:1, i:2, i:2} } }"
" input: ['A', 'B', 'C'] }"
- "node { name: 'E' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+ "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
" input: ['A', 'D'] }");
EXPECT_EQ(DoMklLayoutOptimizationPass(),
"A(Input);B(Input);C(Input);D(_MklMaxPoolGrad);DMT/_0(Const);"
- "DMT/_1(Const);DMT/_2(Const);DMT/_3(Const);DMT/_4(Const);E(Mul)|"
+ "DMT/_1(Const);DMT/_2(Const);DMT/_3(Const);DMT/_4(Const);E(Zeta)|"
"A->D;A->E;A:control->DMT/_0:control;A:control->DMT/_1:control;"
"A:control->DMT/_2:control;A:control->DMT/_3:control;"
"A:control->DMT/_4:control;B->D:1;C->D:2;D->E:1;DMT/_0->D:3;"
@@ -1489,10 +1489,10 @@ TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative3) {
" attr { key: 'padding' value { s: 'VALID' } }"
" attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
" input: ['A'] }"
- "node { name: 'C' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+ "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
" input: ['A', 'B'] }");
EXPECT_EQ(DoMklLayoutOptimizationPass(),
- "A(Input);B(MaxPool);C(Mul)|A->B;A->C;B->C:1");
+ "A(Input);B(MaxPool);C(Zeta)|A->B;A->C;B->C:1");
}
// Test MaxPool handling for batch-wise pooling (NCHW)
@@ -1507,10 +1507,10 @@ TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative4) {
" attr { key: 'padding' value { s: 'VALID' } }"
" attr { key: 'strides' value { list: {i: 2, i:1, i:1, i:1} } }"
" input: ['A'] }"
- "node { name: 'C' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+ "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
" input: ['A', 'B'] }");
EXPECT_EQ(DoMklLayoutOptimizationPass(),
- "A(Input);B(MaxPool);C(Mul)|A->B;A->C;B->C:1");
+ "A(Input);B(MaxPool);C(Zeta)|A->B;A->C;B->C:1");
}
// Test MaxPool handling for depth-wise pooling (NHWC)
@@ -1525,10 +1525,10 @@ TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative5) {
" attr { key: 'padding' value { s: 'VALID' } }"
" attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
" input: ['A'] }"
- "node { name: 'C' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+ "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
" input: ['A', 'B'] }");
EXPECT_EQ(DoMklLayoutOptimizationPass(),
- "A(Input);B(MaxPool);C(Mul)|A->B;A->C;B->C:1");
+ "A(Input);B(MaxPool);C(Zeta)|A->B;A->C;B->C:1");
}
// Test MaxPool handling for depth-wise pooling (NCHW)
@@ -1543,10 +1543,10 @@ TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative6) {
" attr { key: 'padding' value { s: 'VALID' } }"
" attr { key: 'strides' value { list: {i: 1, i:2, i:1, i:1} } }"
" input: ['A'] }"
- "node { name: 'C' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+ "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
" input: ['A', 'B'] }");
EXPECT_EQ(DoMklLayoutOptimizationPass(),
- "A(Input);B(MaxPool);C(Mul)|A->B;A->C;B->C:1");
+ "A(Input);B(MaxPool);C(Zeta)|A->B;A->C;B->C:1");
}
// Test MaxPool handling for batch-wise pooling (NHWC)
@@ -1561,10 +1561,10 @@ TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative7) {
" attr { key: 'padding' value { s: 'VALID' } }"
" attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
" input: ['A'] }"
- "node { name: 'C' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+ "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
" input: ['A', 'B'] }");
EXPECT_EQ(DoMklLayoutOptimizationPass(),
- "A(Input);B(MaxPool);C(Mul)|A->B;A->C;B->C:1");
+ "A(Input);B(MaxPool);C(Zeta)|A->B;A->C;B->C:1");
}
// Test MaxPool handling for batch-wise pooling (NHWC)
@@ -1579,10 +1579,10 @@ TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative8) {
" attr { key: 'padding' value { s: 'VALID' } }"
" attr { key: 'strides' value { list: {i: 2, i:1, i:1, i:1} } }"
" input: ['A'] }"
- "node { name: 'C' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+ "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
" input: ['A', 'B'] }");
EXPECT_EQ(DoMklLayoutOptimizationPass(),
- "A(Input);B(MaxPool);C(Mul)|A->B;A->C;B->C:1");
+ "A(Input);B(MaxPool);C(Zeta)|A->B;A->C;B->C:1");
}
// Test MaxPool handling for depth-wise pooling (NHWC)
@@ -1597,10 +1597,10 @@ TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative9) {
" attr { key: 'padding' value { s: 'VALID' } }"
" attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
" input: ['A'] }"
- "node { name: 'C' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+ "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
" input: ['A', 'B'] }");
EXPECT_EQ(DoMklLayoutOptimizationPass(),
- "A(Input);B(MaxPool);C(Mul)|A->B;A->C;B->C:1");
+ "A(Input);B(MaxPool);C(Zeta)|A->B;A->C;B->C:1");
}
// Test MaxPool handling for depth-wise pooling (NHWC)
@@ -1615,10 +1615,10 @@ TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative10) {
" attr { key: 'padding' value { s: 'VALID' } }"
" attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:2} } }"
" input: ['A'] }"
- "node { name: 'C' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+ "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
" input: ['A', 'B'] }");
EXPECT_EQ(DoMklLayoutOptimizationPass(),
- "A(Input);B(MaxPool);C(Mul)|A->B;A->C;B->C:1");
+ "A(Input);B(MaxPool);C(Zeta)|A->B;A->C;B->C:1");
}
/////////////////////////////////////////////////////////////////////
@@ -1636,10 +1636,10 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Conv2D_DeviceTest) {
" attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
" attr { key: 'padding' value { s: 'SAME' } }"
" input: ['A', 'B']}"
- "node { name: 'D' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+ "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
" input: ['B', 'C'] }", kGPUDevice);
EXPECT_EQ(DoMklLayoutOptimizationPass(),
- "A(Input);B(Input);C(Conv2D);D(Mul)|A->C;B->C:1;B->D;C->D:1");
+ "A(Input);B(Input);C(Conv2D);D(Zeta)|A->C;B->C:1;B->D;C->D:1");
}
TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_DeviceTest) {
@@ -1657,7 +1657,7 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_DeviceTest) {
" attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
" attr { key: 'padding' value { s: 'SAME' } }"
" input: ['A', 'B', 'C', 'M', 'N', 'O']}"
- "node { name: 'E' op: 'Sub'"
+ "node { name: 'E' op: 'Zeta'"
" attr {key: 'T' value { type: DT_FLOAT } }"
" input: ['D', 'A']}"
"node { name: 'F' op: 'BiasAddGrad'"
@@ -1666,7 +1666,7 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_DeviceTest) {
" input: ['E'] }", kGPUDevice);
EXPECT_EQ(DoMklLayoutOptimizationPass(),
"A(Input);B(Input);C(Input);D(_MklConv2DWithBias);"
- "E(Sub);F(BiasAddGrad);M(_MklInput);N(_MklInput);"
+ "E(Zeta);F(BiasAddGrad);M(_MklInput);N(_MklInput);"
"O(_MklInput)|A->D;A->E:1;B->D:1;C->D:2;D->E;E->F;"
"M->D:3;N->D:4;O->D:5");
}
@@ -1683,10 +1683,10 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Conv2DGradFilter_DeviceTest) {
" attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
" attr { key: 'padding' value { s: 'SAME' } }"
" input: ['A', 'B', 'C']}"
- "node { name: 'E' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+ "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
" input: ['A', 'D'] }", kGPUDevice);
EXPECT_EQ(DoMklLayoutOptimizationPass(),
- "A(Input);B(Int32Input);C(Input);D(Conv2DBackpropFilter);E(Mul)|"
+ "A(Input);B(Int32Input);C(Input);D(Conv2DBackpropFilter);E(Zeta)|"
"A->D;A->E;B->D:1;C->D:2;D->E:1");
}
@@ -1696,10 +1696,10 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Relu_DeviceTest) {
"node { name: 'B' op: 'Relu'"
" attr { key: 'T' value { type: DT_FLOAT } }"
" input: ['A'] }"
- "node { name: 'C' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+ "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
" input: ['A', 'B'] }", kGPUDevice);
EXPECT_EQ(DoMklLayoutOptimizationPass(),
- "A(Input);B(Relu);C(Mul)|A->B;A->C;B->C:1");
+ "A(Input);B(Relu);C(Zeta)|A->B;A->C;B->C:1");
}
TEST_F(MklLayoutPassTest, NodeRewrite_ReluGrad_DeviceTest) {
@@ -1709,10 +1709,10 @@ TEST_F(MklLayoutPassTest, NodeRewrite_ReluGrad_DeviceTest) {
"node { name: 'C' op: 'ReluGrad'"
" attr { key: 'T' value { type: DT_FLOAT } }"
" input: ['A', 'B'] }"
- "node { name: 'D' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+ "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
" input: ['A', 'C'] }", kGPUDevice);
EXPECT_EQ(DoMklLayoutOptimizationPass(),
- "A(Input);B(Input);C(ReluGrad);D(Mul)|A->C;A->D;B->C:1;C->D:1");
+ "A(Input);B(Input);C(ReluGrad);D(Zeta)|A->C;A->D;B->C:1;C->D:1");
}
TEST_F(MklLayoutPassTest, NodeRewrite_MaxPool_DeviceTest) {
@@ -1725,10 +1725,10 @@ TEST_F(MklLayoutPassTest, NodeRewrite_MaxPool_DeviceTest) {
" attr { key: 'padding' value { s: 'VALID' } }"
" attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
" input: ['A'] }"
- "node { name: 'C' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+ "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
" input: ['A', 'B'] }", kGPUDevice);
EXPECT_EQ(DoMklLayoutOptimizationPass(),
- "A(Input);B(MaxPool);C(Mul)|A->B;A->C;B->C:1");
+ "A(Input);B(MaxPool);C(Zeta)|A->B;A->C;B->C:1");
}
TEST_F(MklLayoutPassTest, NodeRewrite_AvgPool_DeviceTest) {
@@ -1741,10 +1741,10 @@ TEST_F(MklLayoutPassTest, NodeRewrite_AvgPool_DeviceTest) {
" attr { key: 'padding' value { s: 'VALID' } }"
" attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
" input: ['A'] }"
- "node { name: 'C' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+ "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
" input: ['A', 'B'] }", kGPUDevice);
EXPECT_EQ(DoMklLayoutOptimizationPass(),
- "A(Input);B(AvgPool);C(Mul)|A->B;A->C;B->C:1");
+ "A(Input);B(AvgPool);C(Zeta)|A->B;A->C;B->C:1");
}
// Concat Op test: Concat with no Mkl layer feeding it
@@ -1762,10 +1762,10 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Concat_DeviceTest) {
" attr { key: 'T' value { type: DT_FLOAT } }"
" attr { key: 'N' value { i: 2 } }"
" input: ['A', 'B:0', 'B:1']}"
- "node { name: 'E' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+ "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
" input: ['C', 'D'] }", kGPUDevice);
EXPECT_EQ(DoMklLayoutOptimizationPass(),
- "A(Const);B(InputList);C(Input);D(Concat);E(Mul)|A->D;"
+ "A(Const);B(InputList);C(Input);D(Concat);E(Zeta)|A->D;"
"B->D:1;B:1->D:2;C->E;D->E:1");
}
@@ -1784,10 +1784,10 @@ TEST_F(MklLayoutPassTest, NodeRewrite_ConcatV2_DeviceTest) {
" attr { key: 'Tidx' value { type: DT_INT32 } }"
" attr { key: 'N' value { i: 2 } }"
" input: ['B:0', 'B:1', 'A']}"
- "node { name: 'E' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+ "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
" input: ['C', 'D'] }", kGPUDevice);
EXPECT_EQ(DoMklLayoutOptimizationPass(),
- "A(Const);B(InputList);C(Input);D(ConcatV2);E(Mul)|"
+ "A(Const);B(InputList);C(Input);D(ConcatV2);E(Zeta)|"
"A->D:2;B->D;B:1->D:1;C->E;D->E:1");
}
@@ -1804,11 +1804,11 @@ TEST_F(MklLayoutPassTest, NodeRewrite_FusedBatchNorm_DeviceTest) {
" attr { key: 'epsilon' value { f: 0.0001 } }"
" attr { key: 'is_training' value { b: true } }"
" input: ['A', 'B', 'C', 'D', 'E'] }"
- "node { name: 'G' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+ "node { name: 'G' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
" input: ['A', 'F'] }", kGPUDevice);
EXPECT_EQ(DoMklLayoutOptimizationPass(),
"A(Input);B(Input);C(Input);D(Input);E(Input);"
- "F(FusedBatchNorm);G(Mul)|A->F;A->G;B->F:1;C->F:2;D->F:3;"
+ "F(FusedBatchNorm);G(Zeta)|A->F;A->G;B->F:1;C->F:2;D->F:3;"
"E->F:4;F->G:1");
}
@@ -1832,12 +1832,12 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_DeviceTest) {
" attr { key: 'data_format' value { s: 'NCHW' } }"
" input: ['C', 'D'] }"
"node { name: 'Y' op: 'Input'}"
- "node { name: 'Z' op: 'Sub'"
+ "node { name: 'Z' op: 'Zeta'"
" attr {key: 'T' value { type: DT_FLOAT } }"
" input: ['E', 'Y']}", kGPUDevice);
EXPECT_EQ(DoMklLayoutOptimizationPass(),
"A(Input);B(Input);C(_MklConv2D);D(Input);E(BiasAdd);"
- "M(_MklInput);N(_MklInput);Y(Input);Z(Sub)|A->C;"
+ "M(_MklInput);N(_MklInput);Y(Input);Z(Zeta)|A->C;"
"B->C:1;C->E;D->E:1;E->Z;M->C:2;N->C:3;Y->Z:1");
}
@@ -1853,7 +1853,7 @@ static void BM_MklLayoutRewritePass(int iters, int op_nodes) {
random::SimplePhilox rnd(&philox);
for (int op = 0; op < op_nodes; op++) {
s += strings::Printf(
- "node { name: 'op%04d' op: 'Mul' attr { key: 'T' value { "
+ "node { name: 'op%04d' op: 'Zeta' attr { key: 'T' value { "
"type: DT_FLOAT } } input: ['in%04d', 'in%04d' ] }",
op, rnd.Uniform(10), rnd.Uniform(10));
}
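Note on the Mul/Sub -> Zeta substitutions in the tests above: this change registers MKL kernels for Add, Sub, Mul, Maximum and SquaredDifference (see mkl_cwise_ops_common.cc below), so a Mul or Sub neighbor would now itself be rewritten by the layout pass and perturb the expected graph strings. Zeta is a binary element-wise op with no MKL kernel, so it stays untouched. A minimal sketch of the distinction; the helper names follow those used later in this diff, and the exact call shapes are assumptions:
// Sketch: Mul now has an MKL variant, Zeta does not.
bool mul_has_mkl = mkl_op_registry::IsMklElementWiseOp(
    mkl_op_registry::GetMklOpName("Mul"), DT_FLOAT);   // true after this change
bool zeta_has_mkl = mkl_op_registry::IsMklElementWiseOp(
    mkl_op_registry::GetMklOpName("Zeta"), DT_FLOAT);  // false: no _MklZeta kernel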
diff --git a/tensorflow/core/graph/mkl_tfconversion_pass.cc b/tensorflow/core/graph/mkl_tfconversion_pass.cc
index 590b3d030f..3f8b0e86d0 100644
--- a/tensorflow/core/graph/mkl_tfconversion_pass.cc
+++ b/tensorflow/core/graph/mkl_tfconversion_pass.cc
@@ -64,6 +64,15 @@ namespace tensorflow {
// in the Mkl format. Non-compliant ops accept inputs and outputs in the
// TensorFlow format.
//
+// ADDENDUM: For element-wise ops, a conversion may or may not be needed
+// before the op runs. To handle this, the pass inserts a new op,
+// _MklInputConversion, before each element-wise MKL op to sanitize that
+// op's inputs.
+//
+// The _MklInputConversion op checks the inputs to the element-wise op and
+// ensures that either both are in MKL format or both are in TF format,
+// depending on their initial state and on whether broadcast is needed.
+
class MklToTfConversionPass : public GraphOptimizationPass {
public:
MklToTfConversionPass() {}
@@ -87,6 +96,16 @@ class MklToTfConversionPass : public GraphOptimizationPass {
return mkl_op_registry::IsMklOp(op_name, T);
}
+ // Is the input Op supported by Mkl-specific layout AND
+ // is it element-wise?
+ //
+ // @input op_name string of the op
+ // @input T Datatype to use for checking input op
+  // @return true if op is an Mkl-supported element-wise op; false otherwise.
+ inline bool IsMklElementWiseOp(const string& op_name, DataType T) const {
+ return mkl_op_registry::IsMklElementWiseOp(op_name, T);
+ }
+
// Insert layout conversion node on the edge pointed by 'e' from graph 'g'.
//
// Edge will be deleted once a call to this function is successful.
@@ -96,6 +115,17 @@ class MklToTfConversionPass : public GraphOptimizationPass {
// @return Success:OK() if insertion is successful, otherwise returns
// appropriate error status code.
Status InsertConversionNodeOnEdge(std::unique_ptr<Graph>* g, Edge*);
+
+ // For element-wise ops, we need to sanitize the inputs. For this, we add a
+ // new node at the input of the replacement element-wise node that checks
+  // the inputs and converts one or both of them as required. See the op code
+ // comments for details.
+ //
+ // Insert input conversion node as parent of 'n' from graph 'g'.
+ //
+ // @return Success:OK() if insertion is successful, otherwise returns
+ // appropriate error status code.
+ Status InsertInputConversionNode(std::unique_ptr<Graph>* g, Node*);
};
// We register MklToTf insertion for phase 2 in post-partition grouping
@@ -171,6 +201,92 @@ Status MklToTfConversionPass::InsertConversionNodeOnEdge(
return Status::OK();
}
+Status MklToTfConversionPass::InsertInputConversionNode(
+ std::unique_ptr<Graph>* g, Node* n) {
+ CHECK_NOTNULL(n);
+
+ // Get the input nodes and edges
+ std::vector<const Edge*> edges;
+ TF_CHECK_OK(n->input_edges(&edges));
+ if (edges.size() != 4) {
+ return Status(error::Code::INVALID_ARGUMENT,
+ "MKL Binary Element-wise op should have exactly 2 data"
+ " inputs and 2 metadata inputs");
+ }
+
+  // Sanity check: ensure that both inputs have the same type, and that it
+  // matches the op's declared input type.
+ CHECK_EQ(BaseType(edges[0]->src()->output_type(edges[0]->src_output())),
+ BaseType(edges[1]->src()->output_type(edges[1]->src_output())));
+ CHECK_EQ(BaseType(edges[0]->src()->output_type(edges[0]->src_output())),
+ BaseType(n->input_type(0)));
+
+ // Check ordering of edges
+ for (uint i = 0; i < 4; i++) {
+ CHECK_EQ((edges[i]->dst_input() == i), true);
+ }
+
+ // Build the conversion node and specify src as input.
+ Node* conversion_node = nullptr;
+
+ TF_CHECK_OK(
+ NodeBuilder((*g)->NewName("MklInputConversion"), "_MklInputConversion")
+ .Input(edges[0]->src(), edges[0]->src_output())
+ .Input(edges[1]->src(), edges[1]->src_output())
+ .Input(edges[2]->src(), edges[2]->src_output())
+ .Input(edges[3]->src(), edges[3]->src_output())
+ .Device(n->def().device())
+ .Attr("T", n->input_type(0))
+ .Finalize(&**g, &conversion_node));
+
+ CHECK_NOTNULL(conversion_node);
+
+ // Change the destination of any control edges to the InputConversion node
+ if (edges.size() != n->in_edges().size()) {
+ std::vector<const Edge*> edges_to_remove;
+ for (const Edge* e : n->in_edges()) {
+ if (e->IsControlEdge()) {
+ CHECK_NOTNULL((*g)->AddControlEdge(e->src(), conversion_node));
+ edges_to_remove.push_back(e);
+ }
+ }
+ for (const Edge* e : edges_to_remove) {
+ (*g)->RemoveEdge(e);
+ }
+ }
+
+ string data_format;
+ if (GetNodeAttr(edges[0]->src()->def(), "data_format", &data_format) ==
+ Status::OK()) {
+ conversion_node->AddAttr("data_format", data_format);
+ }
+
+ // Get assigned device from destination node and apply it to conversion node.
+ // We want conversion node to be on the same device as the destination node.
+ conversion_node->set_assigned_device_name(n->assigned_device_name());
+
+ // Set the Mkl op label for this op.
+ conversion_node->AddAttr("_kernel", mkl_op_registry::kMklOpLabel);
+
+  // Now that we have added edges from each src to conversion_node, add edges
+  // from the outputs of conversion_node to the element-wise node.
+ CHECK_NOTNULL((*g)->AddEdge(conversion_node, 0, n, edges[0]->dst_input()));
+ CHECK_NOTNULL((*g)->AddEdge(conversion_node, 1, n, edges[1]->dst_input()));
+ CHECK_NOTNULL((*g)->AddEdge(conversion_node, 2, n, edges[2]->dst_input()));
+ CHECK_NOTNULL((*g)->AddEdge(conversion_node, 3, n, edges[3]->dst_input()));
+
+ VLOG(1) << "MklToTfConversionPass - InputConversion: Inserting input "
+ << "conversion node on: " << n->type_string() << " successful.";
+
+ // Remove src->dst edge now.
+ (*g)->RemoveEdge(edges[0]);
+ (*g)->RemoveEdge(edges[1]);
+ (*g)->RemoveEdge(edges[2]);
+ (*g)->RemoveEdge(edges[3]);
+
+ return Status::OK();
+}
+
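In the graph notation used by the layout-pass tests above, the rewiring that InsertInputConversionNode performs looks roughly like this (node names illustrative):
  Before: A(Input);B(Input);M(_MklInput);N(_MklInput);E(_MklAdd)|
          A->E;B->E:1;M->E:2;N->E:3
  After:  A(Input);B(Input);M(_MklInput);N(_MklInput);
          I(_MklInputConversion);E(_MklAdd)|
          A->I;B->I:1;M->I:2;N->I:3;I->E;I:1->E:1;I:2->E:2;I:3->E:3
All four producer edges are redirected into the conversion node, and its four outputs feed the element-wise op's original input slots.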
bool MklToTfConversionPass::RunPass(std::unique_ptr<Graph>* g) {
bool result = false;
@@ -239,6 +355,49 @@ bool MklToTfConversionPass::RunPass(std::unique_ptr<Graph>* g) {
DumpGraph("After MklToTfConversionPass", &**g);
+ //---------------------------------------------------------------------------
+  // Check all nodes and add an input conversion node if the node is an MKL
+  // element-wise node.
+ VLOG(1) << "Before running MklToTfConversionPass - InputConversion";
+
+ std::vector<Node*> candidate_nodes;
+ std::vector<Node*> order;
+  GetReversePostOrder(**g, &order);  // This gives us a topological sort.
+
+ for (Node* n : order) {
+ // If node is not an op or it does not have a datatype, then skip.
+ DataType datatype;
+ if (!n->IsOp() || (GetNodeAttr(n->def(), "T", &datatype) != Status::OK())) {
+ continue;
+ }
+ if (IsMklElementWiseOp(n->type_string(), datatype)) {
+ // If the input node is an input-conversion op, skip
+ Node* input_node = nullptr;
+ TF_CHECK_OK(n->input_node(0, &input_node));
+ DataType input_datatype;
+ if ((GetNodeAttr(n->def(), "T", &input_datatype) == Status::OK()) &&
+ (input_node->type_string().compare("_MklInputConversion") == 0)) {
+ continue;
+ }
+
+ VLOG(1) << "MklToTfConversionPass: InputConversion: Scheduled node "
+ << n->name() << " for inserting input conversion node";
+      candidate_nodes.push_back(n);
+ }
+ }
+
+ // Process all candidate edges and insert conversion nodes on them.
+ for (Node* n : candidate_nodes) {
+    // Even if we insert a conversion node for only a single node,
+    // we need to return true.
+ if (InsertInputConversionNode(g, n) == Status::OK()) {
+ VLOG(1) << "MklToTfConversionPass: Inserted conversion "
+ << "on node " << n->name();
+ result = true;
+ }
+ }
+ DumpGraph("After MklToTfConversionPass - InputConversion", &**g);
+
// We need to return true even if we insert one conversion node
// anywhere in the graph.
return result;
diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index b6d7e3b4b2..cff6e30c04 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -2340,7 +2340,10 @@ tf_kernel_library(
tf_kernel_library(
name = "svd_op",
prefix = "svd_op",
- deps = LINALG_DEPS,
+ deps = LINALG_DEPS + if_cuda([
+ ":cuda_solvers",
+ ":transpose_functor",
+ ]),
)
cc_library(
@@ -2938,7 +2941,7 @@ tf_kernel_library(
"//tensorflow/core:framework",
"//tensorflow/core:lib",
"//tensorflow/core:nn_ops_op_lib",
- ],
+ ] + if_cuda(["@cub_archive//:cub"]),
)
tf_kernel_library(
@@ -5502,6 +5505,22 @@ tf_mkl_kernel_library(
)
tf_mkl_kernel_library(
+ name = "mkl_input_conversion_op",
+ hdrs = ["mkl_tfconv_op.h"],
+ prefix = "mkl_input_conversion",
+ deps = [
+ ":bounds_check",
+ ":ops_util",
+ "//tensorflow/core:core_cpu",
+ "//tensorflow/core:framework",
+ "//tensorflow/core:lib",
+ "//tensorflow/core:lib_internal",
+ "//tensorflow/core:nn_ops_op_lib",
+ "//third_party/mkl:intel_binary_blob",
+ ],
+)
+
+tf_mkl_kernel_library(
name = "mkl_pooling_ops",
srcs = [
"mkl_avgpooling_op.cc",
@@ -5544,6 +5563,14 @@ tf_mkl_kernel_library(
)
tf_mkl_kernel_library(
+ name = "mkl_aggregate_ops",
+ prefix = "mkl_aggregate_ops",
+ deps = MATH_DEPS + [
+ "//third_party/mkl:intel_binary_blob",
+ ],
+)
+
+tf_mkl_kernel_library(
name = "mkl_concat_op",
prefix = "mkl_concat_op",
deps = ARRAY_DEPS + [
@@ -5575,6 +5602,20 @@ tf_mkl_kernel_library(
],
)
+tf_mkl_kernel_library(
+ name = "mkl_cwise_ops_common",
+ hdrs = [
+ "cwise_ops.h",
+ "cwise_ops_common.h",
+ "cwise_ops_gradients.h",
+ ],
+ prefix = "mkl_cwise_ops_common",
+ deps = NN_DEPS + [
+ "cwise_op",
+ "//third_party/mkl:intel_binary_blob",
+ ],
+)
+
cc_library(
name = "dataset",
srcs = ["dataset.cc"],
diff --git a/tensorflow/core/kernels/bias_op_gpu.cu.cc b/tensorflow/core/kernels/bias_op_gpu.cu.cc
index ddc2d457b0..42f3db1d79 100644
--- a/tensorflow/core/kernels/bias_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/bias_op_gpu.cu.cc
@@ -173,15 +173,20 @@ __global__ void BiasGradNCHW_SharedAtomics(const T* output_backprop,
// Accumulate the results in the shared memory into the first element.
// No syncthreads is needed since this is only in the same warp.
int32 thread_index = threadIdx.x;
- if (thread_index < 16) s_data[thread_index] += s_data[thread_index + 16];
- if (thread_index < 8) s_data[thread_index] += s_data[thread_index + 8];
- if (thread_index < 4) s_data[thread_index] += s_data[thread_index + 4];
- if (thread_index < 2) s_data[thread_index] += s_data[thread_index + 2];
- if (thread_index < 1) s_data[thread_index] += s_data[thread_index + 1];
-
- // The first thread writes out the accumulated result to the global location.
- if (thread_index == 0) {
- CudaAtomicAdd(bias_backprop + bias_index, T(s_data[0]));
+ if (thread_index < 16) {
+ s_data[thread_index] += s_data[thread_index + 16];
+ __syncwarp(0xFFFF);
+ if (thread_index < 8) s_data[thread_index] += s_data[thread_index + 8];
+ __syncwarp(0xFF);
+ if (thread_index < 4) s_data[thread_index] += s_data[thread_index + 4];
+ __syncwarp(0xF);
+ if (thread_index < 2) s_data[thread_index] += s_data[thread_index + 2];
+ __syncwarp(0x3);
+ if (thread_index == 0) {
+ T val = T(s_data[0] + s_data[1]);
+ // The first thread writes out the accumulated result to global location.
+ CudaAtomicAdd(bias_backprop + bias_index, val);
+ }
}
}
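The rewritten reduction above adds explicit __syncwarp() barriers because, starting with CUDA 9 and Volta's independent thread scheduling, threads of a warp are no longer guaranteed to execute in lockstep, so the old implicit warp-synchronous code could read stale shared memory. A standalone sketch of the same tree-reduction pattern; the kernel name and launch shape are illustrative:
// Reduce 32 floats with one warp; launch as WarpTreeReduce<<<1, 32>>>(in, out).
__global__ void WarpTreeReduce(const float* in, float* out) {
  __shared__ float s_data[32];
  int t = threadIdx.x;
  s_data[t] = in[t];
  __syncwarp();  // make all 32 writes visible to the warp
  for (int offset = 16; offset > 0; offset /= 2) {
    if (t < offset) s_data[t] += s_data[t + offset];
    __syncwarp();  // re-converge the full warp before the next step
  }
  if (t == 0) *out = s_data[0];
}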
diff --git a/tensorflow/core/kernels/cuda_solvers.cc b/tensorflow/core/kernels/cuda_solvers.cc
index 5c6b5eec82..43197d8cf4 100644
--- a/tensorflow/core/kernels/cuda_solvers.cc
+++ b/tensorflow/core/kernels/cuda_solvers.cc
@@ -174,7 +174,7 @@ Status CudaSolver::CopyLapackInfoToHostAsync(
}
info_checker_callback(status, host_lapack_infos);
};
-
+
auto cb =
std::bind(wrapped_info_checker_callback, context_,
std::move(info_checker_callback), std::move(host_lapack_infos));
@@ -188,6 +188,7 @@ Status CudaSolver::CopyLapackInfoToHostAsync(
// numeric types.
#define TF_CALL_LAPACK_TYPES(m) \
m(float, S) m(double, D) m(std::complex<float>, C) m(std::complex<double>, Z)
+#define TF_CALL_LAPACK_TYPES_NO_COMPLEX(m) m(float, S) m(double, D)
// Macros to construct cusolverDn method names.
#define DN_SOLVER_FN(method, lapack_prefix) cusolverDn##lapack_prefix##method
@@ -327,6 +328,41 @@ static inline Status GetrsImpl(SolverFnT solver, OpKernelContext* context,
TF_CALL_LAPACK_TYPES(GETRS_INSTANCE);
+template <typename Scalar, typename BufSizeFnT, typename SolverFnT>
+static inline Status GesvdImpl(BufSizeFnT bufsize, SolverFnT solver,
+ OpKernelContext* context,
+ cusolverDnHandle_t cusolver_dn_handle,
+ signed char jobu, signed char jobvt, int m,
+ int n, Scalar* A, int lda, Scalar* S, Scalar* U,
+ int ldu, Scalar* VT, int ldvt,
+ int* dev_lapack_info) {
+ /* Get amount of workspace memory required. */
+ int lwork;
+ TF_RETURN_IF_CUSOLVER_ERROR(bufsize(cusolver_dn_handle, m, n, &lwork));
+ /* Allocate device memory for workspace. */
+ ScratchSpace<Scalar> dev_workspace(context, lwork, /* on_host */ false);
+ /* Launch the solver kernel. */
+ TF_RETURN_IF_CUSOLVER_ERROR(solver(
+ cusolver_dn_handle, jobu, jobvt, m, n, CUDAComplex(A), lda, S,
+ CUDAComplex(U), ldu, CUDAComplex(VT), ldvt,
+ CUDAComplex(dev_workspace.mutable_data()), lwork, NULL, dev_lapack_info));
+ return Status::OK();
+}
+
+#define GESVD_INSTANCE(Scalar, lapack_prefix) \
+ template <> \
+ Status CudaSolver::Gesvd<Scalar>( \
+ signed char jobu, signed char jobvt, int m, int n, Scalar* dev_A, \
+ int lda, Scalar* dev_S, Scalar* dev_U, int ldu, Scalar* dev_VT, \
+ int ldvt, int* dev_lapack_info) const { \
+ return GesvdImpl(DN_BUFSIZE_FN(gesvd, lapack_prefix), \
+ DN_SOLVER_FN(gesvd, lapack_prefix), context_, \
+ cusolver_dn_handle_, jobu, jobvt, m, n, dev_A, lda, \
+ dev_S, dev_U, ldu, dev_VT, ldvt, dev_lapack_info); \
+ }
+
+TF_CALL_LAPACK_TYPES_NO_COMPLEX(GESVD_INSTANCE);
+
//=============================================================================
// Wrappers of cuBlas computational methods begin here.
//
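A hedged usage sketch of the new Gesvd wrapper, following the declaration added to cuda_solvers.h below. Device buffers are assumed to be allocated elsewhere, and cuSolver's gesvd works on column-major data and (at this point) requires m >= n:
// Sketch only; checking of dev_lapack_info happens asynchronously via
// CopyLapackInfoToHostAsync in real callers.
Status RunGesvd(OpKernelContext* context, int m, int n, float* dev_A,
                float* dev_S, float* dev_U, float* dev_VT,
                int* dev_lapack_info) {
  CudaSolver solver(context);
  // 'A' requests all m columns of U and all n rows of VT.
  return solver.Gesvd('A', 'A', m, n, dev_A, /*lda=*/m, dev_S, dev_U,
                      /*ldu=*/m, dev_VT, /*ldvt=*/n, dev_lapack_info);
}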
diff --git a/tensorflow/core/kernels/cuda_solvers.h b/tensorflow/core/kernels/cuda_solvers.h
index 0fd6450f98..7cbdc895dd 100644
--- a/tensorflow/core/kernels/cuda_solvers.h
+++ b/tensorflow/core/kernels/cuda_solvers.h
@@ -258,13 +258,23 @@ class CudaSolver {
Status Syevd(cusolverEigMode_t jobz, cublasFillMode_t uplo, int n, Scalar*
dev_A, int lda, Scalar* dev_W, int* dev_lapack_info) const;
+*/
// Singular value decomposition.
// See: http://docs.nvidia.com/cuda/cusolver/#cuds-lt-t-gt-gesvd
template <typename Scalar>
Status Gesvd(signed char jobu, signed char jobvt, int m, int n, Scalar* dev_A,
- int lda, Scalar* dev_S, Scalar* dev_U, int ldu, Scalar* dev_VT,
- int ldvt, int* dev_lapack_info);
- */
+ int lda, Scalar* dev_S, Scalar* dev_U, int ldu, Scalar* dev_VT,
+ int ldvt, int* dev_lapack_info) const;
+ /*
+ // Batched linear solver using LU factorization from getrfBatched.
+ // See:
+ http://docs.nvidia.com/cuda/cublas/index.html#cublas-lt-t-gt-getrsbatched
+ template <typename Scalar>
+ Status GetrsBatched(cublasOperation_t trans, int n, int nrhs,
+ const Scalar* dev_Aarray[], int lda, const int* devIpiv,
+ Scalar* dev_Barray[], int ldb, int* info, int batch_size)
+ const;
+ */
private:
OpKernelContext* context_; // not owned.
diff --git a/tensorflow/core/kernels/cwise_ops.h b/tensorflow/core/kernels/cwise_ops.h
index d935331904..ada39eae38 100644
--- a/tensorflow/core/kernels/cwise_ops.h
+++ b/tensorflow/core/kernels/cwise_ops.h
@@ -139,7 +139,7 @@ struct scalar_left : private Binary {
typedef Tout result_type;
const Tin* left;
- EIGEN_DEVICE_FUNC inline scalar_left(const scalar_left& other) = default;
+ inline scalar_left(const scalar_left& other) = default;
template <typename... Args>
EIGEN_DEVICE_FUNC inline explicit scalar_left(const Tin* c, Args... args)
@@ -169,7 +169,7 @@ struct scalar_right : private Binary {
typedef Tout result_type;
const Tin* right;
- EIGEN_DEVICE_FUNC inline scalar_right(const scalar_right& other) = default;
+ inline scalar_right(const scalar_right& other) = default;
template <typename... Args>
EIGEN_DEVICE_FUNC inline explicit scalar_right(const Tin* c, Args... args)
diff --git a/tensorflow/core/kernels/cwise_ops_common.cc b/tensorflow/core/kernels/cwise_ops_common.cc
index 192a4f732e..693c6467ac 100644
--- a/tensorflow/core/kernels/cwise_ops_common.cc
+++ b/tensorflow/core/kernels/cwise_ops_common.cc
@@ -20,7 +20,9 @@ namespace tensorflow {
BinaryOpShared::BinaryOpShared(OpKernelConstruction* ctx, DataType out,
DataType in)
: OpKernel(ctx) {
+#ifndef INTEL_MKL
OP_REQUIRES_OK(ctx, ctx->MatchSignature({in, in}, {out}));
+#endif
}
void BinaryOpShared::SetUnimplementedError(OpKernelContext* ctx) {
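The INTEL_MKL guard above is presumably needed because the MKL element-wise kernels registered in mkl_cwise_ops_common.cc reuse this constructor while their ops carry two extra uint8 metadata inputs and an extra metadata output, so the strict {in, in} -> {out} signature check would fail for them. A sketch of the shape of such an op registration (the attr constraint is an assumption):
REGISTER_OP("_MklAdd")
    .Input("x: T")
    .Input("y: T")
    .Input("mkl_x: uint8")   // MKL layout metadata for x
    .Input("mkl_y: uint8")   // MKL layout metadata for y
    .Output("z: T")
    .Output("mkl_z: uint8")  // MKL layout metadata for z
    .Attr("T: numbertype");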
diff --git a/tensorflow/core/kernels/decode_raw_op.cc b/tensorflow/core/kernels/decode_raw_op.cc
index 9492a4e26d..1c0085cfea 100644
--- a/tensorflow/core/kernels/decode_raw_op.cc
+++ b/tensorflow/core/kernels/decode_raw_op.cc
@@ -105,6 +105,7 @@ REGISTER(Eigen::half);
REGISTER(float);
REGISTER(double);
REGISTER(int32);
+REGISTER(uint16);
REGISTER(uint8);
REGISTER(int16);
REGISTER(int8);
diff --git a/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc b/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc
index fcfcd188d2..ecfe51d599 100644
--- a/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc
@@ -22,6 +22,7 @@ limitations under the License.
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/util/cuda_kernel_helper.h"
#include "tensorflow/core/util/tensor_format.h"
+#include "external/cub_archive/cub/util_ptx.cuh"
#if !defined(_MSC_VER)
#define UNROLL _Pragma("unroll")
@@ -1015,6 +1016,21 @@ __global__ void __launch_bounds__(640, 2)
}
}
+// Device function to compute sub-warp sum reduction for a power-of-two group of
+// neighboring threads.
+template<int kWidth, typename T>
+__device__ __forceinline__ T WarpSumReduce(T val) {
+  // Supports only power-of-two widths.
+ assert(__popc(kWidth) == 1);
+ int sub_warp = cub::LaneId() / kWidth;
+ int zeros = sub_warp * kWidth;
+ unsigned mask = ((1UL << kWidth) - 1) << zeros;
+ for (int delta = kWidth / 2; delta > 0; delta /= 2) {
+ val += CudaShuffleXor(mask, val, delta);
+ }
+ return val;
+}
+
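WarpSumReduce is a butterfly reduction: with kWidth = 4, lanes {0,1,2,3} first exchange partners at XOR distance 2 and then at distance 1, after which every lane of the group holds the group's sum. An equivalent standalone sketch using the raw CUDA 9 intrinsic that the CudaShuffleXor wrapper corresponds to:
template <int kWidth>
__device__ float SubWarpSum(float val) {
  // Butterfly reduction over groups of kWidth neighboring lanes. The width
  // argument of __shfl_xor_sync confines each exchange to its own group;
  // the full mask assumes all 32 lanes of the warp are active.
  for (int delta = kWidth / 2; delta > 0; delta /= 2) {
    val += __shfl_xor_sync(0xffffffffu, val, delta, kWidth);
  }
  return val;
}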
// CUDA kernel to compute the depthwise convolution backward w.r.t. filter in
// NHWC format, tailored for small images up to 32x32. Stride and depth
// multiplier must be 1. Padding must be 'SAME'. Only use this kernel if
@@ -1127,6 +1143,7 @@ __launch_bounds__(1024, 2) void DepthwiseConv2dBackpropFilterGPUKernelNHWCSmall(
// Note: the condition to reach this is uniform across the entire block.
__syncthreads();
+ unsigned active_threads = CudaBallot(CUDA_WARP_ALL, depth_in_range);
if (depth_in_range) {
const T* const out_ptr = inout_offset + output;
@@ -1140,7 +1157,7 @@ __launch_bounds__(1024, 2) void DepthwiseConv2dBackpropFilterGPUKernelNHWCSmall(
T val = out1 * tile_ptr[0] + out2 * tile_ptr[tile_offset];
// Warp-accumulate pixels of the same depth and write to accumulator.
for (int delta = 16; delta >= kBlockSlices; delta /= 2) {
- val += CudaShuffleDown(val, delta);
+ val += CudaShuffleDown(active_threads, val, delta);
}
if (!(thread_idx & 32 - kBlockSlices) /* lane_idx < kBlockSlices */) {
*accum_ptr = val;
@@ -1164,9 +1181,7 @@ __launch_bounds__(1024, 2) void DepthwiseConv2dBackpropFilterGPUKernelNHWCSmall(
if (filter_depth < in_depth) {
T val = accum_data[i];
// Warp-accumulate the pixels of the same depth from the accumulator.
- for (int delta = kAccumPixels / 2; delta > 0; delta /= 2) {
- val += CudaShuffleDown(val, delta);
- }
+ val = WarpSumReduce<kAccumPixels>(val);
if (!(thread_idx & kAccumPixels - 1)) {
CudaAtomicAdd(filter_offset + filter, val);
}
@@ -1382,6 +1397,7 @@ __launch_bounds__(1024, 2) void DepthwiseConv2dBackpropFilterGPUKernelNCHWSmall(
// Note: the condition to reach this is uniform across the entire block.
__syncthreads();
+ unsigned active_threads = CudaBallot(CUDA_WARP_ALL, slice_in_range);
if (slice_in_range) {
const T* const out_ptr = inout_offset + output;
@@ -1395,7 +1411,7 @@ __launch_bounds__(1024, 2) void DepthwiseConv2dBackpropFilterGPUKernelNCHWSmall(
T val = out1 * tile_ptr[0] + out2 * tile_ptr[tile_offset];
// Warp-accumulate pixels of the same depth and write to accumulator.
for (int delta = 16 / kBlockSlices; delta > 0; delta /= 2) {
- val += CudaShuffleDown(val, delta);
+ val += CudaShuffleDown(active_threads, val, delta);
}
if (!(thread_idx & 32 / kBlockSlices - 1)) {
*accum_ptr = val;
@@ -1419,9 +1435,7 @@ __launch_bounds__(1024, 2) void DepthwiseConv2dBackpropFilterGPUKernelNCHWSmall(
if (filter_depth < in_depth) {
T val = accum_data[i];
// Warp-accumulate pixels of the same depth from the accumulator.
- for (int delta = kAccumPixels / 2; delta > 0; delta /= 2) {
- val += CudaShuffleDown(val, delta);
- }
+ val = WarpSumReduce<kAccumPixels>(val);
if (!(thread_idx & kAccumPixels - 1)) {
CudaAtomicAdd(filter_offset + filter, val);
}
diff --git a/tensorflow/core/kernels/fill_functor.cc b/tensorflow/core/kernels/fill_functor.cc
index 8a0a558eef..ea0cc139f3 100644
--- a/tensorflow/core/kernels/fill_functor.cc
+++ b/tensorflow/core/kernels/fill_functor.cc
@@ -20,6 +20,7 @@ limitations under the License.
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/variant_encode_decode.h"
namespace tensorflow {
namespace functor {
@@ -50,6 +51,7 @@ DEFINE_SETZERO_CPU(int32);
DEFINE_SETZERO_CPU(int64);
DEFINE_SETZERO_CPU(complex64);
DEFINE_SETZERO_CPU(complex128);
+DEFINE_SETZERO_CPU(Variant);
#undef DEFINE_SETZERO_CPU
#ifdef TENSORFLOW_USE_SYCL
diff --git a/tensorflow/core/kernels/mkl_aggregate_ops.cc b/tensorflow/core/kernels/mkl_aggregate_ops.cc
new file mode 100644
index 0000000000..51ba127def
--- /dev/null
+++ b/tensorflow/core/kernels/mkl_aggregate_ops.cc
@@ -0,0 +1,273 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// See docs in ../ops/math_ops.cc.
+
+#ifdef INTEL_MKL
+#define EIGEN_USE_THREADS
+
+#include <numeric>
+
+#include "tensorflow/core/framework/numeric_op.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/lib/gtl/inlined_vector.h"
+#include "tensorflow/core/platform/logging.h"
+
+#include "mkl_dnn.h"
+#include "mkl_dnn_types.h"
+#include "tensorflow/core/util/mkl_util.h"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+
+template <typename Device, typename T>
+class MklAddNOp : public OpKernel {
+ public:
+ explicit MklAddNOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+ void Compute(OpKernelContext* ctx) override {
+ const int num = ctx->num_inputs();
+ OP_REQUIRES(ctx, num / 2 == 2,
+                errors::InvalidArgument("Only additions of two arguments are "
+                                        "supported by MKL. Num inputs: ",
+ num));
+
+ MklAddNOpContext mkl_context;
+ const Tensor& input0 = MklGetInput(ctx, 0);
+ GetMklShape(ctx, 0, &(mkl_context.input1_shape));
+ bool input1_in_mkl_format = mkl_context.input1_shape.IsMklTensor();
+
+ const Tensor& input1 = MklGetInput(ctx, 1);
+ GetMklShape(ctx, 1, &(mkl_context.input2_shape));
+ bool input2_in_mkl_format = mkl_context.input2_shape.IsMklTensor();
+
+ mkl_context.in_dims = input1_in_mkl_format
+ ? mkl_context.input1_shape.GetDimension()
+ : input0.dims();
+ mkl_context.in_dims = input2_in_mkl_format
+ ? mkl_context.input2_shape.GetDimension()
+ : input1.dims();
+ // Generate size, stride for input if input is in MKL format.
+ ExtractMklOpParams(&mkl_context.in1_sizes,
+ &mkl_context.in1_strides, input0, &mkl_context.input1_shape);
+ ExtractMklOpParams(&mkl_context.in2_sizes,
+ &mkl_context.in2_strides, input1, &mkl_context.input2_shape);
+
+ std::vector<float> coeff(2, 1.0);
+ mkl_context.MklCreateInputLayouts(ctx);
+ CHECK_EQ(dnnSumCreate_F32(&mkl_context.Eltwise, mkl_context.attributes, 2,
+ mkl_context.lt_input1, &coeff[0]),
+ E_SUCCESS);
+
+ Tensor mkl_tmp_input1_buf_tensor, mkl_tmp_input2_buf_tensor;
+ mkl_context.MklPrepareAddNInputs(ctx, &mkl_tmp_input1_buf_tensor,
+ &mkl_tmp_input2_buf_tensor);
+ Tensor* output = nullptr;
+ if (input1_in_mkl_format || input2_in_mkl_format) {
+ TensorShape tf_shape;
+ mkl_context.output_shape.SetMklTensor(true);
+ mkl_context.output_shape.SetMklLayout(mkl_context.Eltwise, dnnResourceDst);
+
+ mkl_context.output_shape.SetTfLayout(
+ mkl_context.in_dims, mkl_context.in1_sizes, mkl_context.in1_strides);
+      if (input1_in_mkl_format) {
+ mkl_context.output_shape.SetTfDimOrder(mkl_context.in_dims,
+ mkl_context.input1_shape.GetTfToMklDimMap());
+ } else {
+ mkl_context.output_shape.SetTfDimOrder(mkl_context.in_dims,
+ mkl_context.input2_shape.GetTfToMklDimMap());
+ }
+ tf_shape.AddDim(dnnLayoutGetMemorySize_F32(static_cast<dnnLayout_t>(
+ mkl_context.output_shape.GetMklLayout())) /
+ sizeof(T));
+
+ AllocateOutputSetMklShape(ctx, 0, &output, tf_shape,
+ mkl_context.output_shape);
+ } else {
+ const TensorShape& o_shape = input1.shape();
+ mkl_context.output_shape.SetMklTensor(false);
+ AllocateOutputSetMklShape(ctx, 0, &output, o_shape,
+ mkl_context.output_shape);
+ }
+
+ mkl_context.Eltwise_res[dnnResourceDst] =
+ static_cast<void*>(output->flat<T>().data());
+
+    // Execute the sum primitive.
+ CHECK_EQ(dnnExecute_F32(mkl_context.Eltwise, mkl_context.Eltwise_res),
+ E_SUCCESS);
+
+ mkl_context.MklCleanup();
+ }
+
+ void ExtractMklOpParams(size_t** out_sizes, size_t** out_strides,
+ const Tensor& input, const MklShape* input_shape) {
+ bool input_in_mkl_format = input_shape->IsMklTensor();
+ int in_dims = input_in_mkl_format
+ ? input_shape->GetDimension()
+ : input.dims();
+ size_t* in_sizes = new size_t[in_dims];
+ size_t* in_strides = new size_t[in_dims];
+
+ if (input_in_mkl_format) {
+ for (int i = 0; i < in_dims; i++) {
+ in_sizes[i] = input_shape->GetSizes()[i];
+ in_strides[i] = input_shape->GetStrides()[i];
+ }
+ } else {
+ for (int i = 0; i < in_dims; i++) {
+ in_sizes[i] =
+ input.dim_size((in_dims - 1) - i);
+ }
+ in_strides[0] = 1;
+ for (int i = 1; i < in_dims; i++) {
+ in_strides[i] =
+ in_strides[i - 1] * in_sizes[i - 1];
+ }
+ }
+ *out_sizes = in_sizes;
+ *out_strides = in_strides;
+ }
+
+
+ private:
+ typedef struct {
+ int in_dims;
+ size_t* in1_sizes;
+ size_t* in1_strides;
+
+ size_t* in2_sizes;
+ size_t* in2_strides;
+ dnnPrimitive_t Eltwise = nullptr;
+ dnnPrimitiveAttributes_t attributes = nullptr;
+ void* Eltwise_res[dnnResourceNumber];
+ dnnLayout_t lt_input1 = nullptr, lt_input2 = nullptr;
+ MklShape input1_shape, input2_shape, output_shape;
+
+ void MklCreateInputLayouts(OpKernelContext* context) {
+ bool input1_in_mkl_format = input1_shape.IsMklTensor();
+ if (!input1_in_mkl_format) {
+ CHECK_EQ(
+ dnnLayoutCreate_F32(&lt_input1, in_dims, in1_sizes, in1_strides),
+ E_SUCCESS);
+ } else {
+ lt_input1 = static_cast<dnnLayout_t>(input1_shape.GetCurLayout());
+ }
+
+ bool input2_in_mkl_format = input2_shape.IsMklTensor();
+ if (!input2_in_mkl_format) {
+ CHECK_EQ(
+ dnnLayoutCreate_F32(&lt_input2, in_dims, in2_sizes, in2_strides),
+ E_SUCCESS);
+ } else {
+ lt_input2 = static_cast<dnnLayout_t>(input2_shape.GetCurLayout());
+ }
+ }
+
+ void MklPrepareAddNInputs(OpKernelContext* context,
+ Tensor* mkl_tmp_input1_buf_tensor,
+ Tensor* mkl_tmp_input2_buf_tensor) {
+ bool mkl_convert_input1, mkl_convert_input2;
+ dnnPrimitive_t mkl_prim_convert_input1 = nullptr,
+ mkl_prim_convert_input2 = nullptr;
+ dnnLayout_t mkl_lt_internal_input1 = nullptr,
+ mkl_lt_internal_input2 = nullptr;
+ void *mkl_buf_convert_input1 = nullptr, *mkl_buf_convert_input2 = nullptr;
+ dnnResourceType_t dnnResourceMultipleSrc2 =
+ (dnnResourceType_t)(dnnResourceMultipleSrc + 1);
+ // Compare with internal layouts and convert if needed
+ const Tensor& input1 = MklGetInput(context, 0);
+
+ void* mkl_buf_input1 =
+ const_cast<void*>(static_cast<const void*>(input1.flat<T>().data()));
+
+ CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(
+ &mkl_lt_internal_input1, Eltwise, dnnResourceMultipleSrc),
+ E_SUCCESS);
+ mkl_convert_input1 =
+ !dnnLayoutCompare_F32(mkl_lt_internal_input1, lt_input1);
+ if (mkl_convert_input1) {
+ CHECK_EQ(dnnConversionCreate_F32(&mkl_prim_convert_input1, lt_input1,
+ mkl_lt_internal_input1),
+ E_SUCCESS);
+ AllocTmpBuffer(context, mkl_tmp_input1_buf_tensor,
+ mkl_lt_internal_input1, &mkl_buf_convert_input1);
+ CHECK_EQ(
+ dnnConversionExecute_F32(mkl_prim_convert_input1, mkl_buf_input1,
+ mkl_buf_convert_input1),
+ E_SUCCESS);
+ dnnDelete_F32(mkl_prim_convert_input1);
+ }
+ dnnLayoutDelete_F32(mkl_lt_internal_input1);
+
+ Eltwise_res[dnnResourceMultipleSrc] =
+ (mkl_convert_input1) ? mkl_buf_convert_input1 : mkl_buf_input1;
+
+ const Tensor& input2 = MklGetInput(context, 1);
+ void* mkl_buf_input2 =
+ const_cast<void*>(static_cast<const void*>(input2.flat<T>().data()));
+ CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(
+ &mkl_lt_internal_input2, Eltwise, dnnResourceMultipleSrc2),
+ E_SUCCESS);
+ mkl_convert_input2 =
+ !dnnLayoutCompare_F32(mkl_lt_internal_input2, lt_input2);
+ if (mkl_convert_input2) {
+ CHECK_EQ(dnnConversionCreate_F32(&mkl_prim_convert_input2, lt_input2,
+ mkl_lt_internal_input2),
+ E_SUCCESS);
+ AllocTmpBuffer(context, mkl_tmp_input2_buf_tensor,
+ mkl_lt_internal_input2, &mkl_buf_convert_input2);
+ CHECK_EQ(
+ dnnConversionExecute_F32(mkl_prim_convert_input2, mkl_buf_input2,
+ mkl_buf_convert_input2),
+ E_SUCCESS);
+ dnnDelete_F32(mkl_prim_convert_input2);
+ }
+ dnnLayoutDelete_F32(mkl_lt_internal_input2);
+
+ Eltwise_res[dnnResourceMultipleSrc2] =
+ (mkl_convert_input2) ? mkl_buf_convert_input2 : mkl_buf_input2;
+ }
+
+ void MklCleanup() {
+ bool input1_in_mkl_format = input1_shape.IsMklTensor();
+ bool input2_in_mkl_format = input2_shape.IsMklTensor();
+ dnnDelete_F32(Eltwise);
+ if (!input1_in_mkl_format) {
+ dnnLayoutDelete_F32(lt_input1);
+ delete [] in1_sizes;
+ delete [] in1_strides;
+ }
+ if (!input2_in_mkl_format) {
+ dnnLayoutDelete_F32(lt_input2);
+ delete [] in2_sizes;
+ delete [] in2_strides;
+ }
+ }
+ } MklAddNOpContext;
+};
+
+#define REGISTER_MKL_CPU(T) \
+ REGISTER_KERNEL_BUILDER(Name("_MklAddN") \
+ .Device(DEVICE_CPU) \
+ .TypeConstraint<T>("T") \
+ .Label(mkl_op_registry::kMklOpLabel), \
+ MklAddNOp<CPUDevice, T>);
+
+TF_CALL_float(REGISTER_MKL_CPU);
+#undef REGISTER_MKL_CPU
+} // namespace tensorflow
+#endif // INTEL_MKL
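The .Label(mkl_op_registry::kMklOpLabel) clause ties this kernel to nodes whose NodeDef carries a matching _kernel attribute, which the layout pass attaches when it rewrites a node to its _Mkl variant. In the proto-text notation used by the layout-pass tests, such a node would look roughly like this (the label string and input list are illustrative):
node { name: 'C' op: '_MklAddN'
       attr { key: 'T' value { type: DT_FLOAT } }
       attr { key: '_kernel' value { s: 'MklOp' } }
       input: ['A', 'B', 'A:1', 'B:1'] }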
diff --git a/tensorflow/core/kernels/mkl_conv_ops.cc b/tensorflow/core/kernels/mkl_conv_ops.cc
index 5dfce5d5c6..7f1555d325 100644
--- a/tensorflow/core/kernels/mkl_conv_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_ops.cc
@@ -406,8 +406,10 @@ class MklConv2DOp : public OpKernel {
CHECK_EQ(dnnConversionCreate_F32(&mkl_prim_convert_filter, lt_filter,
mkl_lt_internal_filter),
E_SUCCESS);
+
mkl_buf_convert_filter = const_cast<void*>(
static_cast<const void*>(output_filter->flat<T>().data()));
+
CHECK_EQ(
dnnConversionExecute_F32(mkl_prim_convert_filter, mkl_buf_filter,
mkl_buf_convert_filter),
diff --git a/tensorflow/core/kernels/mkl_cwise_ops_common.cc b/tensorflow/core/kernels/mkl_cwise_ops_common.cc
new file mode 100644
index 0000000000..7fc633c254
--- /dev/null
+++ b/tensorflow/core/kernels/mkl_cwise_ops_common.cc
@@ -0,0 +1,88 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifdef INTEL_MKL
+
+// See docs in ../ops/math_ops.cc.
+
+#define EIGEN_USE_THREADS
+#include <iostream>
+#include <vector>
+
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+
+#include "tensorflow/core/util/mkl_util.h"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+
+template <typename Device, typename Functor>
+class MklBinaryOp : public BinaryOp<Device, Functor> {
+ public:
+ explicit MklBinaryOp(OpKernelConstruction* context)
+ : BinaryOp<Device, Functor>(context) {}
+
+ void Compute(OpKernelContext* context) override {
+ auto in0 = context->input(0);
+ auto in1 = context->input(1);
+ VLOG(1) << "Shapes (start mklbinaryop compute): "
+ << in0.shape().DebugString() << " _and_ "
+ << in1.shape().DebugString();
+
+ // Call the TensorFlow BinaryOp Compute method
+ BinaryOp<Device, Functor>::Compute(context);
+
+ auto out = context->mutable_output(0);
+ VLOG(1) << "Shapes (output): " << out->shape().DebugString();
+
+    // Pass the input shape through to the output shape.
+ ForwardMklMetaDataInToOut(context, 0, 0);
+
+ out = context->mutable_output(0);
+ VLOG(1) << "Shapes (output): " << out->shape().DebugString();
+ }
+};
+
+//---------- Registration macros for various element-wise ops -----------
+// We will need to redefine "REGISTER" to include the mkl_op_registry flag
+#pragma push_macro("REGISTER")
+#undef REGISTER
+#define REGISTER(OP, D, N, F, T) \
+ REGISTER_KERNEL_BUILDER(Name(N) \
+ .Device(DEVICE_##D) \
+ .TypeConstraint<T>("T") \
+ .Label(mkl_op_registry::kMklOpLabel), \
+ OP<D##Device, F<T>>);
+
+REGISTER5(MklBinaryOp, CPU, "_MklAdd", functor::add, float, Eigen::half, double,
+ int32, int64);
+REGISTER7(MklBinaryOp, CPU, "_MklSub", functor::sub, float, Eigen::half, double,
+ int32, int64, complex64, complex128);
+REGISTER5(MklBinaryOp, CPU, "_MklMul", functor::mul, float, Eigen::half, double,
+ uint8, int32);
+REGISTER5(MklBinaryOp, CPU, "_MklMaximum", functor::maximum, float, Eigen::half,
+ double, int32, int64);
+REGISTER5(MklBinaryOp, CPU, "_MklSquaredDifference",
+ functor::squared_difference, float, Eigen::half, double, int32,
+ int64);
+
+#undef REGISTER
+#pragma pop_macro("REGISTER")
+//-----------------------------------------------------------------------
+
+} // end namespace tensorflow
+
+#endif // INTEL_MKL
diff --git a/tensorflow/core/kernels/mkl_identity_op.cc b/tensorflow/core/kernels/mkl_identity_op.cc
index ca20294a26..f31e7afd46 100644
--- a/tensorflow/core/kernels/mkl_identity_op.cc
+++ b/tensorflow/core/kernels/mkl_identity_op.cc
@@ -41,9 +41,9 @@ class MklIdentityOp : public OpKernel {
bool input_in_mkl_format = mkl_shape_input.IsMklTensor();
if (input_in_mkl_format) {
- ForwarMklTensorInToOut(context, 0, 0);
+ ForwardMklTensorInToOut(context, 0, 0);
} else {
- FowardTfTensorInToOut(context, 0, 0);
+ ForwardTfTensorInToOut(context, 0, 0);
}
}
diff --git a/tensorflow/core/kernels/mkl_input_conversion_op.cc b/tensorflow/core/kernels/mkl_input_conversion_op.cc
new file mode 100644
index 0000000000..b58e44e398
--- /dev/null
+++ b/tensorflow/core/kernels/mkl_input_conversion_op.cc
@@ -0,0 +1,259 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifdef INTEL_MKL
+
+#include <algorithm>
+#include <vector>
+#include "tensorflow/core/framework/numeric_op.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/platform/cpu_info.h"
+#include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/util/tensor_format.h"
+
+#include "tensorflow/core/kernels/mkl_tfconv_op.h"
+#include "tensorflow/core/util/mkl_util.h"
+
+namespace tensorflow {
+typedef Eigen::ThreadPoolDevice CPUDevice;
+
+///////////////////////////////////////////////////////////
+// Op kernel
+// Checks and ensures that the two inputs are compatible for MKL binary ops.
+// Here's the basic logic:
+//
+// if both inputs are in TF format:
+// pass the inputs through to the output
+// else if both inputs are in mkl format:
+// if both have the same shape:
+// pass the inputs through to the output
+// else:
+// convert both to TF
+// else if one is TF and one is MKL:
+// if broadcast is needed:
+// convert the MKL format input to TF format
+// else:
+// convert the TF format input to MKL format
+///////////////////////////////////////////////////////////
+
+template <typename Device, typename T>
+class MklInputConversionOp : public OpKernel {
+ public:
+ explicit MklInputConversionOp(OpKernelConstruction* context)
+ : OpKernel(context) {
+ OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format_str));
+ OP_REQUIRES_OK(context, context->GetAttr("T", &op_data_type));
+ has_avx512f_ = port::TestCPUFeature(port::CPUFeature::AVX512F);
+ }
+
+ private:
+ void Compute(OpKernelContext* context) override {
+ // Check if input tensors are in MKL format.
+ const Tensor& input_tensor_0 = MklGetInput(context, 0);
+ MklShape input_shape_0;
+ GetMklShape(context, 0, &input_shape_0);
+
+ const Tensor& input_tensor_1 = MklGetInput(context, 1);
+ MklShape input_shape_1;
+ GetMklShape(context, 1, &input_shape_1);
+
+ bool tf_shapes_are_same = MklCompareShapes(&context->input(0).shape(),
+ &context->input(1).shape());
+
+ VLOG(1) << "MklInputConversionOp: Input shapes are "
+ << (tf_shapes_are_same ? "*same*" : "*different*") << ": "
+ << context->input(0).shape().DebugString() << " and "
+ << context->input(1).shape().DebugString();
+
+ // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+ // if both inputs are in TF format, just copy input tensors to output.
+ if (!input_shape_0.IsMklTensor() && !input_shape_1.IsMklTensor()) {
+ VLOG(1) << "MklInputConversionOp: No conversion needed, "
+ << "copying TF inputs to output";
+
+ ForwardTfTensorInToOut(context, 0, 0);
+ ForwardTfTensorInToOut(context, 1, 1);
+ return;
+ }
+
+ // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+ // If both inputs are in MKL format
+ if (input_shape_0.IsMklTensor() && input_shape_1.IsMklTensor()) {
+ // If both have the same shape, pass them through
+ if (tf_shapes_are_same) {
+ VLOG(1) << "MklInputConversionOp: No conversion needed, "
+ << "copying MKL inputs with identical shapes to output";
+
+ ForwardMklTensorInToOut(context, 0, 0);
+ ForwardMklTensorInToOut(context, 1, 1);
+ return;
+ }
+
+ // Sanity check
+ bool mkl_shapes_are_same =
+ MklCompareShapes(&input_shape_0, &input_shape_1);
+ if (mkl_shapes_are_same) {
+ CHECK(false) << "MklInputConversionOp: Unexpected: TF shapes are "
+ "different but MKL shapes are same";
+ }
+
+ // Both have different shapes, so broadcast will be necessary.
+ // Convert to TF and pass both tensors through (we can't do broadcast
+ // with MKL tensors)
+ VLOG(1) << "MklInputConversionOp: Broadcast needed, "
+ << "converted MKL inputs to TF format";
+
+ MklToTfOp<Device, T>::ConvertMklToTf(this, context, data_format_str,
+ op_data_type, has_avx512f_, 0);
+ MklToTfOp<Device, T>::ConvertMklToTf(this, context, data_format_str,
+ op_data_type, has_avx512f_, 1);
+ SetDummyMklShapeOutput(context, 0);
+ SetDummyMklShapeOutput(context, 1);
+ return;
+ }
+
+ // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+ // One input is MKL and one is TF. If no broadcast is needed, convert
+ // the TF tensor to MKL, otherwise convert the MKL tensor to TF format
+ VLOG(1) << "MklInputConversionOp: Inputs in different formats (MKL/TF)";
+
+ const Tensor* mkl_tensor;
+ const MklShape* mkl_shape;
+ const Tensor* tf_tensor;
+ MklShape* tf_mkl_shape;
+ uint mkl_tensor_index;
+ uint tf_tensor_index;
+ if (input_shape_0.IsMklTensor() && !input_shape_1.IsMklTensor()) {
+ mkl_tensor = &input_tensor_0;
+ mkl_shape = &input_shape_0;
+ mkl_tensor_index = 0;
+ tf_tensor = &input_tensor_1;
+ tf_mkl_shape = &input_shape_1;
+ tf_tensor_index = 1;
+ } else if (!input_shape_0.IsMklTensor() && input_shape_1.IsMklTensor()) {
+ mkl_tensor = &input_tensor_1;
+ mkl_shape = &input_shape_1;
+ mkl_tensor_index = 1;
+ tf_tensor = &input_tensor_0;
+ tf_mkl_shape = &input_shape_0;
+ tf_tensor_index = 0;
+ } else {
+ CHECK(false) << "MklInputConversionOp: Unexpected combination of input "
+ "shapes for MKL "
+ << "element-wise op";
+ }
+
+ // Broadcast is needed if the shapes are not the same
+ bool broadcast_needed;
+
+ size_t in0_size = 1;
+ for (size_t i = 0; i < mkl_shape->GetDimension(); ++i)
+ in0_size *= mkl_shape->tf_dim_size(i);
+
+ size_t in1_size = 1;
+ for (size_t i = 0; i < tf_tensor->shape().dims(); ++i)
+ in1_size *= tf_tensor->shape().dim_size(i);
+
+ broadcast_needed = (in0_size != in1_size);
+
+ if (!broadcast_needed) {
+      // Both shapes are the same; convert the TF input to MKL.
+ VLOG(1) << "MklInputConversionOp: No broadcast needed.";
+ VLOG(1) << "MklInputConversionOp: Converting input " << tf_tensor_index
+ << " to MKL format";
+
+ // Create MklShape
+ Tensor* tensor_out;
+ MklShape mkl_output_mkl_shape;
+ mkl_output_mkl_shape.SetMklTensor(true);
+ mkl_output_mkl_shape.SetTfLayout(mkl_shape->GetDimension(),
+ mkl_shape->GetSizes(),
+ mkl_shape->GetStrides());
+ mkl_output_mkl_shape.SetTfDimOrder(mkl_shape->GetDimension());
+
+ // ** Temporarily borrow the layout from the MKL input **
+ mkl_output_mkl_shape.SetMklLayout(mkl_shape->GetCurLayout());
+
+ // Create output tensor
+ AllocateOutputSetMklShape(context, tf_tensor_index, &tensor_out,
+ mkl_tensor->shape(), mkl_output_mkl_shape);
+
+ // Since the shapes are the same, use information from the other tensor
+ tf_mkl_shape->SetTfLayout(mkl_shape->GetDimension(),
+ mkl_shape->GetSizes(), mkl_shape->GetStrides());
+ // Convert the data format
+ tf_mkl_shape->GetConvertedFlatData(
+ mkl_shape->GetCurLayout(),
+ const_cast<T*>(tf_tensor->flat<T>().data()),
+ const_cast<T*>(tensor_out->flat<T>().data()));
+
+ // ** Release the borrowed layout to avoid double deletion
+ // in the destructor call **
+ mkl_output_mkl_shape.SetMklLayout(nullptr);
+
+ // -- The tensor in MKL format passes through --
+ ForwardMklTensorInToOut(context, mkl_tensor_index, mkl_tensor_index);
+ } else {
+ // Broadcast is needed, so convert the MKL input to TF
+ VLOG(1) << "MklInputConversionOp: Broadcast needed.";
+ VLOG(1) << "MklInputConversionOp: Converting input " << mkl_tensor_index
+ << " to TF format";
+ MklToTfOp<Device, T>::ConvertMklToTf(this, context, data_format_str,
+ op_data_type, has_avx512f_,
+ mkl_tensor_index);
+ SetDummyMklShapeOutput(context, mkl_tensor_index);
+
+ // The tensor in TF format passes through
+ ForwardTfTensorInToOut(context, tf_tensor_index, tf_tensor_index);
+ }
+
+ VLOG(1) << "MklInputConversionOp: Shapes (output): "
+ << context->mutable_output(0)->shape().DebugString() << " and "
+ << context->mutable_output(1)->shape().DebugString();
+
+ VLOG(1) << "MklInputConversion completed successfully.";
+ }
+
+ private:
+ /// Data format of the operation
+ string data_format_str;
+
+ /// Data type of the operation
+ DataType op_data_type;
+
+ /// CPUIDInfo
+ bool has_avx512f_ = false;
+};
+
+///////////////////////////////////////////////////////////
+// Register kernel
+///////////////////////////////////////////////////////////
+
+#define REGISTER_CPU(T) \
+ REGISTER_KERNEL_BUILDER(Name("_MklInputConversion") \
+ .Device(DEVICE_CPU) \
+ .TypeConstraint<T>("T") \
+ .Label(mkl_op_registry::kMklOpLabel), \
+ MklInputConversionOp<CPUDevice, T>);
+
+TF_CALL_NUMBER_TYPES(REGISTER_CPU);
+#undef REGISTER_CPU
+} // namespace tensorflow
+#endif // INTEL_MKL
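The broadcast test in this kernel compares total element counts rather than dimension-by-dimension shapes; a minimal sketch of that heuristic (the helper name is hypothetical):
// Mirrors the element-count check in MklInputConversionOp::Compute.
// e.g. shapes [2, 3] vs [3]: 6 != 3, so broadcast is needed and the MKL
// input is converted to TF format rather than the other way around.
bool NeedsBroadcast(const TensorShape& a, const TensorShape& b) {
  return a.num_elements() != b.num_elements();
}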
diff --git a/tensorflow/core/kernels/mkl_tfconv_op.h b/tensorflow/core/kernels/mkl_tfconv_op.h
new file mode 100644
index 0000000000..a240ee44fb
--- /dev/null
+++ b/tensorflow/core/kernels/mkl_tfconv_op.h
@@ -0,0 +1,136 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifdef INTEL_MKL
+
+#ifndef TENSORFLOW_CORE_KERNELS_MKL_TFCONV_OP_H_
+#define TENSORFLOW_CORE_KERNELS_MKL_TFCONV_OP_H_
+
+#include <algorithm>
+#include <vector>
+#include "tensorflow/core/framework/numeric_op.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/platform/cpu_info.h"
+#include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/util/tensor_format.h"
+
+#include "mkl_dnn.h"
+#include "mkl_dnn_types.h"
+#include "tensorflow/core/util/mkl_util.h"
+
+namespace tensorflow {
+typedef Eigen::ThreadPoolDevice CPUDevice;
+
+///////////////////////////////////////////////////////////
+// Op kernel
+///////////////////////////////////////////////////////////
+
+template <typename Device, typename T>
+class MklToTfOp : public OpKernel {
+ public:
+ explicit MklToTfOp(OpKernelConstruction* context) : OpKernel(context) {
+ OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format_str));
+ OP_REQUIRES_OK(context, context->GetAttr("T", &op_data_type));
+ has_avx512f_ = port::TestCPUFeature(port::CPUFeature::AVX512F);
+ }
+
+ void Compute(OpKernelContext* context) override {
+ ConvertMklToTf(this, context, data_format_str, op_data_type, has_avx512f_,
+ 0);
+ VLOG(1) << "MKLToTFConversion complete successfully.";
+ }
+
+ static void ConvertMklToTf(OpKernel* op_kernel, OpKernelContext* context,
+ string data_format_str, DataType op_data_type,
+ bool has_avx512f, uint input_number) {
+ // Check that input tensor is in MKL format.
+ const Tensor& input_tensor = MklGetInput(context, input_number);
+ MklShape input_shape;
+ GetMklShape(context, input_number, &input_shape);
+
+    // If the input is already in TF format, just copy it to the output.
+ if (!input_shape.IsMklTensor()) {
+ context->set_output(input_number, input_tensor);
+ VLOG(1) << "MKLToTFConversion: No conversion needed, "
+ << "copying input to output";
+ return;
+ }
+
+    // Check that the input data type matches the operator data type, and
+    // that it matches the output data type.
+ DataType input_data_type = op_kernel->input_type(input_number);
+ DataType output_data_type = op_kernel->output_type(input_number);
+ CHECK_EQ(op_data_type, input_data_type);
+ CHECK_EQ(op_data_type, output_data_type);
+
+ TensorShape output_shape;
+ size_t ndims = input_shape.GetDimension();
+ size_t* in_sizes = new size_t[ndims];
+ for (size_t i = 0; i < ndims; i++) {
+ // Outermost to innermost dimension
+ output_shape.AddDim(input_shape.GetSizes()[input_shape.tf_dim_idx(i)]);
+ in_sizes[i] = input_shape.GetSizes()[i];
+ }
+
+ // Allocate output tensor.
+ Tensor* output_tensor = NULL;
+ OP_REQUIRES_OK(context,
+ context->allocate_output(input_number, output_shape, &output_tensor));
+
+ dnnLayout_t output_layout =
+ static_cast<dnnLayout_t>(input_shape.GetTfLayout());
+ // Execute DNNConversion.
+ void* input_buffer =
+ static_cast<void*>(const_cast<T*>(input_tensor.flat<T>().data()));
+ delete[] in_sizes;
+ void* output_buffer =
+ static_cast<void*>(const_cast<T*>(output_tensor->flat<T>().data()));
+ input_shape.GetConvertedFlatData(output_layout, input_buffer,
+ output_buffer);
+ VLOG(1) << "MKLToTFConversion complete successfully.";
+ }
+
+ private:
+ /// Data format of the operation
+ string data_format_str;
+
+ /// Data type of the operation
+ DataType op_data_type;
+
+ /// CPUIDInfo
+ bool has_avx512f_ = false;
+};
+
+///////////////////////////////////////////////////////////
+// Register kernel
+///////////////////////////////////////////////////////////
+
+#define REGISTER_CPU(T) \
+ REGISTER_KERNEL_BUILDER(Name("_MklToTf") \
+ .Device(DEVICE_CPU) \
+ .TypeConstraint<T>("T") \
+ .Label(mkl_op_registry::kMklOpLabel), \
+ MklToTfOp<CPUDevice, T>);
+
+TF_CALL_NUMBER_TYPES(REGISTER_CPU);
+#undef REGISTER_CPU
+} // namespace tensorflow
+#endif // TENSORFLOW_CORE_KERNELS_MKL_TFCONV_OP_H_
+#endif // INTEL_MKL
diff --git a/tensorflow/core/kernels/svd_op_gpu.cu.cc b/tensorflow/core/kernels/svd_op_gpu.cu.cc
new file mode 100644
index 0000000000..c8b307a2e4
--- /dev/null
+++ b/tensorflow/core/kernels/svd_op_gpu.cu.cc
@@ -0,0 +1,413 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// See docs in ../ops/linalg_ops.cc.
+// TODO(shamanDevel): Enable complex inputs. This will require a specialization
+// of Gesvd for complex inputs as well as a new kernel
+// definition to output the singular values as reals
+// instead of complex values. The current CPU implementation
+// outputs the singular values as complex values and then
+// casts them to reals in the python wrapper.
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+
+#include <algorithm>
+#include <vector>
+
+#include "tensorflow/core/framework/kernel_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/kernels/cuda_solvers.h"
+#include "tensorflow/core/kernels/linalg_ops_common.h"
+#include "tensorflow/core/kernels/transpose_functor.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/stream_executor.h"
+#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/util/cuda_kernel_helper.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+
+static const char kErrMsg[] =
+ "Singular Value Decomposition was not successful. The input might not be "
+ "valid.";
+
+typedef Eigen::GpuDevice GPUDevice;
+
+namespace {
+// This kernel computes the reduction
+// V' = sum_i (M_i * U_{i,1} * S_1).
+// The result is stored in V[batch] and has the same sign as the
+// true value of V (which is what we need to recover).
+template <class Scalar>
+__global__ void ComputeValueOfVKernel(Cuda2DLaunchConfig config, int64 m,
+ int64 ldu, const Scalar* M,
+ const Scalar* U, const Scalar* S,
+ Scalar* V) {
+ CUDA_AXIS_KERNEL_LOOP(batch, config.virtual_thread_count, x) {
+ CUDA_AXIS_KERNEL_LOOP(i, config.virtual_thread_count, y) {
+ Scalar v = M[i + m * batch] * U[ldu * (i + m * batch)] * S[batch];
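+      // Threads working on the same batch all accumulate into V[batch],
+      // hence the atomic add.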
+ CudaAtomicAdd(V + batch, v);
+ }
+ }
+}
+
+// Extracts the sign of V:
+// V[i] = V[i] >= 0 ? 1 : -1
+template <class Scalar>
+__global__ void ExtractSignOfVKernel(CudaLaunchConfig config, Scalar* V) {
+ CUDA_1D_KERNEL_LOOP(i, config.virtual_thread_count) {
+ V[i] = V[i] >= 0 ? Scalar(1) : Scalar(-1);
+ }
+}
+}  // namespace
+
+// Scalar: The input scalar type (complex types are not supported yet; see
+// the TODO at the top of this file)
+template <class Scalar>
+class SvdOpGpu : public AsyncOpKernel {
+ public:
+ using RealScalar = typename Eigen::NumTraits<Scalar>::Real;
+
+ explicit SvdOpGpu(OpKernelConstruction* context) : AsyncOpKernel(context) {
+ OP_REQUIRES_OK(context, context->GetAttr("compute_uv", &compute_uv_));
+ OP_REQUIRES_OK(context, context->GetAttr("full_matrices", &full_matrices_));
+ }
+
+ void RunSVD(OpKernelContext* context, DoneCallback done, int64 m, int64 n,
+ int64 p, int64 batch_size, Scalar* input_ptr,
+ RealScalar* outputS_ptr, Scalar* outputU_ptr,
+ Scalar* outputVT_ptr, int* dev_info_ptr, CudaSolver& solver) {
+ // Save the input matrix
+ // Needed for the n=1 fix, see below, since SVD destroys the input
+ Tensor input_copy;
+ if (compute_uv_ && n == 1) {
+ OP_REQUIRES_OK_ASYNC(
+ context,
+ context->allocate_temp(DataTypeToEnum<Scalar>::v(),
+ TensorShape({batch_size, m}), &input_copy),
+ done);
+ const GPUDevice& d = context->eigen_device<GPUDevice>();
+ d.memcpy(input_copy.flat<Scalar>().data(), input_ptr,
+ batch_size * m * sizeof(Scalar));
+ }
+
+ for (int64 batch = 0; batch < batch_size; ++batch) {
+ Scalar* input = input_ptr + batch * m * n;
+ RealScalar* outputS = outputS_ptr + batch * p;
+ Scalar* outputU = NULL;
+ Scalar* outputVT = NULL;
+ char jobu = 'N';
+ char jobvt = 'N';
+
+ if (compute_uv_) {
+ if (full_matrices_) {
+ outputU = outputU_ptr + batch * m * m;
+ outputVT = outputVT_ptr + batch * n * n;
+ jobu = 'A';
+ jobvt = 'A';
+ } else {
+ outputU = outputU_ptr + batch * m * p;
+ outputVT = outputVT_ptr + batch * n * p;
+ jobu = 'S';
+ jobvt = 'S';
+ }
+ }
+
+ OP_REQUIRES_OK_ASYNC(
+ context, solver.Gesvd(jobu, jobvt, m, n, input, m, outputS, outputU,
+ m, outputVT, n, dev_info_ptr + batch),
+ done);
+ }
+
+    // This works around a bug in cuSolver:
+    // If n is one, then outputVT only contains zeros instead of +/-1.
+    // Hence, we need to fill outputVT manually.
+    // The question is: +1 or -1?
+    // -> Compute U*S and compare its sign against M.
+    // But because S is zero except for the first entry, the multiplication
+    // simplifies a lot.
+    // However, what happens if M contains zeros? At those indices, it is
+    // impossible to determine the value of V.
+    // -> Compute V' for all rows of M to cope with zeros:
+    // 1. V' = sum_i (M_i * U_i,1 * S_i)
+    // 2. V = +1 if V' >= 0, -1 if V' < 0
+    // TODO: what about complex values?
+ if (compute_uv_ && n == 1) {
+ // 1. compute the (batched) sum
+ const GPUDevice& d = context->eigen_device<GPUDevice>();
+ d.memset(outputVT_ptr, 0, batch_size * sizeof(Scalar));
+ Cuda2DLaunchConfig cfg2D = GetCuda2DLaunchConfig(batch_size, m, d);
+ ComputeValueOfVKernel<<<cfg2D.block_count, cfg2D.thread_per_block, 0,
+ d.stream()>>>(
+ cfg2D, m, full_matrices_ ? m : p, input_copy.flat<Scalar>().data(),
+ outputU_ptr, outputS_ptr, outputVT_ptr);
+ // 2. clamp V to -1 or +1
+ CudaLaunchConfig cfg1D = GetCudaLaunchConfig(batch_size, d);
+ ExtractSignOfVKernel<<<cfg1D.block_count, cfg1D.thread_per_block, 0,
+ d.stream()>>>(cfg1D, outputVT_ptr);
+ }
+ }
+
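+  // NOTE: catch1 and catch2 are captured by the callback below only to keep
+  // the underlying buffers alive until the asynchronous check has completed.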
+ void CheckResult(OpKernelContext* context, DoneCallback done,
+ const std::vector<DeviceLapackInfo>& dev_info,
+ CudaSolver& solver, Tensor& catch1, Tensor& catch2) {
+ auto info_checker = [context, dev_info, done, catch1, catch2](
+ const Status& status, const std::vector<HostLapackInfo>& /* unused */) {
+ Status full_status = status;
+ if (!full_status.ok()) {
+ full_status.Update(errors::InvalidArgument(kErrMsg));
+ }
+ OP_REQUIRES_OK_ASYNC(context, full_status, done);
+ done();
+ };
+
+ OP_REQUIRES_OK_ASYNC(context, solver.CopyLapackInfoToHostAsync(
+ dev_info, std::move(info_checker)),
+ done);
+ }
+
+  // Performs the SVD when m >= n.
+  // TODO: can the two cases (MgeqN and MlessN) be simplified, their common
+  // boilerplate reduced, or the two even combined into one method?
+ void PerformSVD_MgeqN(OpKernelContext* context, DoneCallback done, int64 m,
+ int64 n, int64 p, const gtl::ArraySlice<int32>& perm,
+ const Tensor& M, Tensor* S, Tensor* U, Tensor* V) {
+ TensorShape shapeRaw = M.shape();
+ shapeRaw.RemoveDim(shapeRaw.dims() - 1);
+ shapeRaw.RemoveDim(shapeRaw.dims() - 1);
+
+ // Transpose M, because cuSolver expects it to be column-major
+ TensorShape input_shape = shapeRaw;
+ input_shape.AddDim(n);
+ input_shape.AddDim(m);
+ Tensor input_copy;
+ OP_REQUIRES_OK_ASYNC(
+ context, context->allocate_temp(M.dtype(), input_shape, &input_copy),
+ done);
+ auto device = context->eigen_device<GPUDevice>();
+ OP_REQUIRES_OK_ASYNC(context, DoTranspose(device, M, perm, &input_copy),
+ done);
+
+    // We need to transpose U at the end, but not V, because cuSolver works
+    // column-major.
+ Tensor u_copy;
+ if (compute_uv_) {
+ TensorShape u_shape;
+ if (full_matrices_) {
+ u_shape = U->shape();
+ } else {
+ u_shape = shapeRaw;
+ u_shape.AddDim(p);
+ u_shape.AddDim(m);
+ }
+ OP_REQUIRES_OK_ASYNC(
+ context, context->allocate_temp(U->dtype(), u_shape, &u_copy), done);
+ }
+
+ // get the pointers to the data
+ Scalar* input_ptr;
+ RealScalar* outputS_ptr;
+ Scalar* outputU_ptr = NULL;
+ Scalar* outputV_ptr = NULL;
+ auto input_reshaped = input_copy.template flat_inner_dims<Scalar, 3>();
+ input_ptr = input_reshaped.data();
+ outputS_ptr = S->template flat_inner_dims<RealScalar, 2>().data();
+ if (compute_uv_) {
+ outputU_ptr = u_copy.template flat_inner_dims<Scalar, 3>().data();
+ outputV_ptr = V->template flat_inner_dims<Scalar, 3>().data();
+ }
+
+ // call the SVD
+ const int64 batch_size = input_reshaped.dimension(0);
+ std::vector<DeviceLapackInfo> dev_info;
+ dev_info.emplace_back(context, batch_size, "gesvd");
+ CudaSolver solver(context);
+ RunSVD(context, done, m, n, p, batch_size, input_ptr, outputS_ptr,
+ outputU_ptr, outputV_ptr, dev_info.back().mutable_data(), solver);
+
+ // Transpose U
+ if (compute_uv_) {
+ OP_REQUIRES_OK_ASYNC(context, DoTranspose(device, u_copy, perm, U), done);
+ }
+
+ // now check if the SVD operation succeeded or not
+ CheckResult(context, done, dev_info, solver, input_copy, u_copy);
+ }
+
+  // Performs the SVD when m < n.
+ void PerformSVD_MlessN(OpKernelContext* context, DoneCallback done, int64 m,
+ int64 n, int64 p, const gtl::ArraySlice<int32>& perm,
+ const Tensor& M, Tensor* S, Tensor* U, Tensor* V) {
+ // Perform the SVD on M'
+
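+    // Since M = U * S * V^T implies M^T = V * S * U^T, running the SVD on
+    // the transposed input returns the factors with the roles of U and V
+    // swapped; this is why the output pointers are flipped further below.
+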
+    // Reuse the input buffer or make a copy for the SVD depending on whether
+    // this op owns the input buffer exclusively. This is needed because the
+    // SVD modifies the input.
+ Tensor input_copy;
+ OP_REQUIRES_OK_ASYNC(context, context->forward_input_or_allocate_temp(
+ {0}, DataTypeToEnum<Scalar>::value,
+ M.shape(), &input_copy),
+ done);
+
+ if (!M.SharesBufferWith(input_copy)) {
+ const GPUDevice& d = context->eigen_device<GPUDevice>();
+ d.memcpy(input_copy.flat<Scalar>().data(), M.flat<Scalar>().data(),
+ M.NumElements() * sizeof(Scalar));
+ }
+
+    // We need to transpose V at the end.
+ Tensor v_copy;
+ if (compute_uv_) {
+ TensorShape v_shape;
+ if (full_matrices_) {
+ v_shape = V->shape();
+ } else {
+ TensorShape shapeRaw = M.shape();
+ shapeRaw.RemoveDim(shapeRaw.dims() - 1);
+ shapeRaw.RemoveDim(shapeRaw.dims() - 1);
+ v_shape = shapeRaw;
+ v_shape.AddDim(p);
+ v_shape.AddDim(n);
+ }
+ OP_REQUIRES_OK_ASYNC(
+ context, context->allocate_temp(V->dtype(), v_shape, &v_copy), done);
+ }
+
+ // get the pointers to the data
+ Scalar* input_ptr;
+ RealScalar* outputS_ptr;
+ Scalar* outputU_ptr = NULL;
+ Scalar* outputV_ptr = NULL;
+ auto input_reshaped = input_copy.template flat_inner_dims<Scalar, 3>();
+ input_ptr = input_reshaped.data();
+ outputS_ptr = S->template flat_inner_dims<RealScalar, 2>().data();
+ if (compute_uv_) {
+ // Note that U and V are flipped
+ outputU_ptr = v_copy.template flat_inner_dims<Scalar, 3>().data();
+ outputV_ptr = U->template flat_inner_dims<Scalar, 3>().data();
+ }
+
+ // call the SVD
+ const int64 batch_size = input_reshaped.dimension(0);
+ std::vector<DeviceLapackInfo> dev_info;
+ dev_info.emplace_back(context, batch_size, "gesvd");
+ CudaSolver solver(context);
+ // Note that m and n are flipped
+ RunSVD(context, done, n, m, p, batch_size, input_ptr, outputS_ptr,
+ outputU_ptr, outputV_ptr, dev_info.back().mutable_data(), solver);
+
+ // Transpose V
+ if (compute_uv_) {
+ auto device = context->eigen_device<GPUDevice>();
+ OP_REQUIRES_OK_ASYNC(context, DoTranspose(device, v_copy, perm, V), done);
+ }
+
+ // now check if the SVD operation succeeded or not
+ CheckResult(context, done, dev_info, solver, input_copy, v_copy);
+ }
+
+ void ComputeAsync(OpKernelContext* context, DoneCallback done) final {
+ const Tensor& input = context->input(0);
+ const int ndims = input.dims();
+ const int64 m = input.dim_size(ndims - 2);
+ const int64 n = input.dim_size(ndims - 1);
+ const int64 p = std::min(m, n);
+
+ // Validate inputs.
+ OP_REQUIRES_ASYNC(
+ context, ndims >= 2,
+ errors::InvalidArgument("Input must have rank >= 2, got ", ndims),
+ done);
+
+ // output tensors.
+ Tensor* outputU = NULL;
+ Tensor* outputS = NULL;
+ Tensor* outputV = NULL;
+
+ // compute shapes
+ TensorShape shapeRaw = input.shape();
+ shapeRaw.RemoveDim(shapeRaw.dims() - 1);
+ shapeRaw.RemoveDim(shapeRaw.dims() - 1);
+ TensorShape shapeS = shapeRaw;
+ TensorShape shapeU = shapeRaw;
+ TensorShape shapeV = shapeRaw;
+ shapeS.AddDim(p);
+ if (compute_uv_) {
+ if (full_matrices_) {
+ shapeU.AddDim(m);
+ shapeU.AddDim(m);
+ shapeV.AddDim(n);
+ shapeV.AddDim(n);
+ } else {
+ shapeU.AddDim(m);
+ shapeU.AddDim(p);
+ shapeV.AddDim(n);
+ shapeV.AddDim(p);
+ }
+ } else {
+ shapeU = TensorShape({0});
+ shapeV = TensorShape({0});
+ }
+
+ // allocate output
+ OP_REQUIRES_OK_ASYNC(context, context->allocate_output(0, shapeS, &outputS),
+ done);
+ OP_REQUIRES_OK_ASYNC(context, context->allocate_output(1, shapeU, &outputU),
+ done);
+ OP_REQUIRES_OK_ASYNC(context, context->allocate_output(2, shapeV, &outputV),
+ done);
+
+ if (n == 0 || m == 0) {
+      // If X is an empty matrix (0 rows or 0 columns), there is nothing to
+      // compute, so we simply return the (empty) outputs.
+ done();
+ return;
+ }
+
+ // Prepare permutation
+ std::vector<int32> perm;
+ for (size_t i = 0; i < ndims - 2; ++i) perm.push_back(i);
+ perm.push_back(ndims - 1); // transpose last two dimensions
+ perm.push_back(ndims - 2);
+ gtl::ArraySlice<int32> permAS(perm);
+
+ // call implementations
+ if (m >= n) {
+ PerformSVD_MgeqN(context, done, m, n, p, permAS, input, outputS, outputU,
+ outputV);
+ } else {
+ PerformSVD_MlessN(context, done, m, n, p, permAS, input, outputS, outputU,
+ outputV);
+ }
+ }
+
+ private:
+ bool compute_uv_;
+ bool full_matrices_;
+};
+
+// TODO: add support for complex types
+REGISTER_LINALG_OP_GPU("Svd", (SvdOpGpu<float>), float);
+REGISTER_LINALG_OP_GPU("Svd", (SvdOpGpu<double>), double);
+REGISTER_LINALG_OP_GPU("BatchSvd", (SvdOpGpu<float>), float);
+REGISTER_LINALG_OP_GPU("BatchSvd", (SvdOpGpu<double>), double);
+
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/tensor_array_ops.cc b/tensorflow/core/kernels/tensor_array_ops.cc
index 075bacb432..2191e4e8c5 100644
--- a/tensorflow/core/kernels/tensor_array_ops.cc
+++ b/tensorflow/core/kernels/tensor_array_ops.cc
@@ -1069,7 +1069,7 @@ class TensorArrayUnpackOrScatterOp : public OpKernel {
} else {
OP_REQUIRES(
ctx, max_index < array_size,
- errors::InvalidArgument("Max scatter index must be <= array size (",
+ errors::InvalidArgument("Max scatter index must be < array size (",
max_index, " vs. ", array_size, ")"));
}
element_shape.RemoveDim(0);
diff --git a/tensorflow/core/ops/math_ops.cc b/tensorflow/core/ops/math_ops.cc
index 2a59282fa5..ef4737cafe 100644
--- a/tensorflow/core/ops/math_ops.cc
+++ b/tensorflow/core/ops/math_ops.cc
@@ -498,6 +498,24 @@ Returns x + y element-wise.
[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
)doc");
+REGISTER_OP("_MklAdd")
+ .Input("x: T")
+ .Input("y: T")
+ .Input("mkl_x: uint8")
+ .Input("mkl_y: uint8")
+ .Output("z: T")
+ .Output("mkl_z: uint8")
+ .Attr(
+ "T: {half, float, double, uint8, int8, int16, int32, int64, complex64, "
+ "complex128, string}")
+ .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn)
+ .Doc(R"doc(
+Returns x + y element-wise.
+
+*NOTE*: `Add` supports broadcasting. `AddN` does not. More about broadcasting
+[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+)doc");
+
REGISTER_OP("Sub")
.BINARY_MORE()
.SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn)
@@ -508,6 +526,19 @@ Returns x - y element-wise.
[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
)doc");
+REGISTER_OP("_MklSub")
+ .BINARY_FEWER()
+ .Input("mkl_x: uint8")
+ .Input("mkl_y: uint8")
+ .Output("mkl_z: uint8")
+ .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn)
+ .Doc(R"doc(
+Returns x - y element-wise.
+
+*NOTE*: `Sub` supports broadcasting. More about broadcasting
+[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+)doc");
+
REGISTER_OP("Mul")
.BINARY_MORE()
.SetIsCommutative()
@@ -519,6 +550,20 @@ Returns x * y element-wise.
[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
)doc");
+REGISTER_OP("_MklMul")
+ .BINARY_MORE()
+ .Input("mkl_x: uint8")
+ .Input("mkl_y: uint8")
+ .Output("mkl_z: uint8")
+ .SetIsCommutative()
+ .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn)
+ .Doc(R"doc(
+Returns x * y element-wise.
+
+*NOTE*: `Mul` supports broadcasting. More about broadcasting
+[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+)doc");
+
REGISTER_OP("Div")
.BINARY_MORE()
.SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn)
@@ -577,6 +622,20 @@ Returns (x - y)(x - y) element-wise.
[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
)doc");
+REGISTER_OP("_MklSquaredDifference")
+ .BINARY_FEWER()
+ .Input("mkl_x: uint8")
+ .Input("mkl_y: uint8")
+ .Output("mkl_z: uint8")
+ .SetIsCommutative()
+ .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn)
+ .Doc(R"doc(
+Returns (x - y)(x - y) element-wise.
+
+*NOTE*: `SquaredDifference` supports broadcasting. More about broadcasting
+[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+)doc");
+
#undef BINARY_FEWER
#undef BINARY_MORE
@@ -594,6 +653,23 @@ Returns the max of x and y (i.e. x > y ? x : y) element-wise.
[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
)doc");
+REGISTER_OP("_MklMaximum")
+ .Input("x: T")
+ .Input("y: T")
+ .Input("mkl_x: uint8")
+ .Input("mkl_y: uint8")
+ .Output("z: T")
+ .Output("mkl_z: uint8")
+ .Attr("T: {half, float, double, int32, int64}")
+ .SetIsCommutative()
+ .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn)
+ .Doc(R"doc(
+Returns the max of x and y (i.e. x > y ? x : y) element-wise.
+
+*NOTE*: `Maximum` supports broadcasting. More about broadcasting
+[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+)doc");
+
REGISTER_OP("Minimum")
.Input("x: T")
.Input("y: T")
@@ -2604,4 +2680,31 @@ Equivalent to np.digitize.
@end_compatibility
)doc");
+#ifdef INTEL_MKL
+REGISTER_OP("_MklAddN")
+ .Input("inputs: N * T")
+ .Input("mkl_input: N * uint8")
+ .Output("sum: T")
+ .Output("mkl_sum: uint8")
+ .Attr("N: int >= 1")
+ .Attr("T: numbertype")
+ .SetIsCommutative()
+ .SetIsAggregate()
+ .SetShapeFn([](InferenceContext* c) {
+ ShapeHandle cur = c->input(c->num_inputs() - 1);
+ for (int i = c->num_inputs() - 2; i >= 0; --i) {
+ TF_RETURN_WITH_CONTEXT_IF_ERROR(c->Merge(c->input(i), cur, &cur),
+ "From merging shape ", i,
+ " with other shapes.");
+ }
+ c->set_output(0, cur);
+ return Status::OK();
+ })
+ .Doc(R"doc(
+Add all input tensors element-wise using the MKL kernel sum.
+inputs: Must all be the same size and shape.
+)doc");
+
+#endif // INTEL_MKL
+
} // namespace tensorflow
diff --git a/tensorflow/core/ops/nn_ops.cc b/tensorflow/core/ops/nn_ops.cc
index fd0b785b8f..22afa4db9a 100644
--- a/tensorflow/core/ops/nn_ops.cc
+++ b/tensorflow/core/ops/nn_ops.cc
@@ -3241,6 +3241,29 @@ MKL operator to convert a tensor from MKL layout to TensorFlow layout.
NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
expected to invoke these operators.
)doc");
+
+REGISTER_OP("_MklInputConversion")
+ .Input("input_0: T")
+ .Input("input_1: T")
+ .Input("mkl_input_0: uint8")
+ .Input("mkl_input_1: uint8")
+ .Output("output_0: T")
+ .Output("output_1: T")
+ .Output("mkl_output_0: uint8")
+ .Output("mkl_output_1: uint8")
+ // All datatypes supported by element-wise ops
+ .Attr(
+ "T: {half, float, double, uint8, int8, uint16, int16, int32, int64, "
+ "complex64, complex128}")
+ .Attr(GetConvnetDataFormatAttrString())
+ .Doc(R"doc(
+MKL operator to process the inputs to an element-wise MKL op. Both inputs
+need to be either in TF or in MKL format. This op is added before every
+element-wise MKL op.
+
+NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
+expected to invoke these operators.
+)doc");
#endif // INTEL_MKL
} // namespace tensorflow
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index c4bc57fd77..1b07f4ecf8 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -15866,6 +15866,25 @@ op {
summary: "Transforms a serialized tensorflow.TensorProto proto into a Tensor."
}
op {
+ name: "SerializeTensor"
+ input_arg {
+ name: "tensor"
+ description: "A Tensor of type `T`."
+ type: "T"
+ }
+ output_arg {
+ name: "serialized"
+ description: "A serialized TensorProto proto of the input tensor."
+    type: DT_STRING
+ }
+ attr {
+ name: "T"
+ type: "type"
+ description: "The type of the input tensor."
+ }
+ summary: "Transforms a Tensor into a serialized TensorProto proto."
+}
+op {
name: "Placeholder"
output_arg {
name: "output"
diff --git a/tensorflow/core/ops/parsing_ops.cc b/tensorflow/core/ops/parsing_ops.cc
index 1f7ebe91cf..f23ff083af 100644
--- a/tensorflow/core/ops/parsing_ops.cc
+++ b/tensorflow/core/ops/parsing_ops.cc
@@ -26,7 +26,7 @@ using shape_inference::ShapeHandle;
REGISTER_OP("DecodeRaw")
.Input("bytes: string")
.Output("output: out_type")
- .Attr("out_type: {half,float,double,int32,uint8,int16,int8,int64}")
+ .Attr("out_type: {half,float,double,int32,uint16,uint8,int16,int8,int64}")
.Attr("little_endian: bool = true")
.SetShapeFn([](InferenceContext* c) {
// Note: last dimension is data dependent.
diff --git a/tensorflow/core/ops/string_ops.cc b/tensorflow/core/ops/string_ops.cc
index 5e99187d50..aebd14c7e5 100644
--- a/tensorflow/core/ops/string_ops.cc
+++ b/tensorflow/core/ops/string_ops.cc
@@ -381,7 +381,7 @@ input = b'thirteen'
position = [1, 5, 7]
length = [3, 2, 1]
-output = [b'hir', b'ee', b'n"]
+output = [b'hir', b'ee', b'n']
```
input: Tensor of strings
diff --git a/tensorflow/core/platform/cuda_libdevice_path_test.cc b/tensorflow/core/platform/cuda_libdevice_path_test.cc
index 86295592a8..639f6804ea 100644
--- a/tensorflow/core/platform/cuda_libdevice_path_test.cc
+++ b/tensorflow/core/platform/cuda_libdevice_path_test.cc
@@ -27,7 +27,7 @@ TEST(CudaLibdevicePathTest, LibdevicePath) {
VLOG(2) << "Libdevice root = " << LibdeviceRoot();
std::vector<string> libdevice_files;
TF_EXPECT_OK(Env::Default()->GetMatchingPaths(
- io::JoinPath(LibdeviceRoot(), "libdevice.compute_*.bc"),
+ io::JoinPath(LibdeviceRoot(), "libdevice.*.bc"),
&libdevice_files));
EXPECT_LT(0, libdevice_files.size());
}
diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h
index ccb861c93a..9ba3a509c3 100644
--- a/tensorflow/core/public/version.h
+++ b/tensorflow/core/public/version.h
@@ -19,12 +19,12 @@ limitations under the License.
// TensorFlow uses semantic versioning, see http://semver.org/.
#define TF_MAJOR_VERSION 1
-#define TF_MINOR_VERSION 3
+#define TF_MINOR_VERSION 4
#define TF_PATCH_VERSION 0
// TF_VERSION_SUFFIX is non-empty for pre-releases (e.g. "-alpha", "-alpha.1",
// "-beta", "-rc", "-rc.1")
-#define TF_VERSION_SUFFIX ""
+#define TF_VERSION_SUFFIX "-dev"
#define TF_STR_HELPER(x) #x
#define TF_STR(x) TF_STR_HELPER(x)
diff --git a/tensorflow/core/util/cuda_kernel_helper.h b/tensorflow/core/util/cuda_kernel_helper.h
index af727c3d2b..f8eddbb2a9 100644
--- a/tensorflow/core/util/cuda_kernel_helper.h
+++ b/tensorflow/core/util/cuda_kernel_helper.h
@@ -25,6 +25,29 @@ limitations under the License.
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/stream_executor.h"
#include "tensorflow/core/platform/types.h"
+#include "cuda/include/cuda.h"
+
+// Mask for all 32 threads in a warp.
+#define CUDA_WARP_ALL 0xFFFFFFFF
+
+#if defined(CUDA_VERSION) && CUDA_VERSION < 9000
+// CUDA 9.0 introduces a new, light-weight barrier synchronization primitive
+// that operates at warp scope. This is required to ensure visibility of
+// reads/writes among threads that can make independent progress on Volta.
+// For previous CUDA versions these synchronizations are not necessary, and
+// we define an empty function as a convenience for backward compatibility.
+__device__ inline void __syncwarp(unsigned mask=CUDA_WARP_ALL) {}
+
+// CUDA 9.0 deprecates the warp-intrinsic functions (shfl, ballot, etc.) in
+// favor of synchronizing versions. These ensure that all warp lanes specified
+// in mask execute the intrinsic in convergence. Here we provide legacy mappings
+// to the less-verbose routines provided in previous versions of CUDA.
+#define __ballot_sync(mask, predicate) __ballot(predicate)
+#define __shfl_sync(mask, val, srcLane, width) __shfl(val, srcLane, width)
+#define __shfl_down_sync(mask, val, delta, width) __shfl_down(val, delta, width)
+#define __shfl_up_sync(mask, val, delta, width) __shfl_up(val, delta, width)
+#define __shfl_xor_sync(mask, val, laneMask, width) __shfl_xor(val, laneMask, width)
+#endif
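+
+// Illustrative sketch (not part of the TF API): a full-warp sum reduction
+// built on the *_sync intrinsics. On CUDA < 9.0 the macros above map it
+// back to the legacy __shfl_down, so the same code compiles on both:
+//
+//   __device__ float WarpSum(float value) {
+//     for (int delta = warpSize / 2; delta > 0; delta /= 2) {
+//       value += __shfl_down_sync(CUDA_WARP_ALL, value, delta, warpSize);
+//     }
+//     return value;  // lane 0 now holds the sum over the whole warp
+//   }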
// Usage of GetCudaLaunchConfig, GetCuda2DLaunchConfig, and
// GetCuda3DLaunchConfig:
@@ -613,82 +636,95 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T tf_max(const T& x, const T& y) {
return x < y ? y : x;
}
+__device__ EIGEN_ALWAYS_INLINE unsigned CudaBallot(unsigned mask,
+ int predicate) {
+ return __ballot_sync(mask, predicate);
+}
+
template <typename T>
-__device__ EIGEN_ALWAYS_INLINE T CudaShuffle(T value, int srcLane,
+__device__ EIGEN_ALWAYS_INLINE T CudaShuffle(unsigned mask, T value,
+ int srcLane,
int width = warpSize) {
- return __shfl(value, srcLane, width);
+ return __shfl_sync(mask, value, srcLane, width);
}
// Variant of the (undocumented) version from the CUDA SDK, but using unsigned
// instead of float for lo and hi (which is incorrect with ftz, for example).
// A bug has been filed with NVIDIA and will be fixed in the next CUDA release.
// TODO(csigg): remove when the bug is fixed in the next CUDA release.
-__device__ EIGEN_ALWAYS_INLINE double CudaShuffle(double value, int srcLane,
+__device__ EIGEN_ALWAYS_INLINE double CudaShuffle(unsigned mask,
+ double value, int srcLane,
int width = warpSize) {
unsigned lo, hi;
asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(value));
- hi = __shfl(hi, srcLane, width);
- lo = __shfl(lo, srcLane, width);
+ hi = __shfl_sync(mask, hi, srcLane, width);
+ lo = __shfl_sync(mask, lo, srcLane, width);
asm volatile("mov.b64 %0, {%1,%2};" : "=d"(value) : "r"(lo), "r"(hi));
return value;
}
template <typename T>
-__device__ EIGEN_ALWAYS_INLINE T CudaShuffleUp(T value, int delta,
+__device__ EIGEN_ALWAYS_INLINE T CudaShuffleUp(unsigned mask,
+ T value, int delta,
int width = warpSize) {
- return __shfl_up(value, delta, width);
+ return __shfl_up_sync(mask, value, delta, width);
}
// Variant of the (undocumented) version from the CUDA SDK, but using unsigned
// instead of float for lo and hi (which is incorrect with ftz, for example).
// A bug has been filed with NVIDIA and will be fixed in the next CUDA release.
// TODO(csigg): remove when the bug is fixed in the next CUDA release.
-__device__ EIGEN_ALWAYS_INLINE double CudaShuffleUp(double value, int delta,
+__device__ EIGEN_ALWAYS_INLINE double CudaShuffleUp(unsigned mask,
+ double value, int delta,
int width = warpSize) {
unsigned lo, hi;
asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(value));
- hi = __shfl_up(hi, delta, width);
- lo = __shfl_up(lo, delta, width);
+ hi = __shfl_up_sync(mask, hi, delta, width);
+ lo = __shfl_up_sync(mask, lo, delta, width);
asm volatile("mov.b64 %0, {%1,%2};" : "=d"(value) : "r"(lo), "r"(hi));
return value;
}
template <typename T>
-__device__ EIGEN_ALWAYS_INLINE T CudaShuffleDown(T value, int delta,
+__device__ EIGEN_ALWAYS_INLINE T CudaShuffleDown(unsigned mask,
+ T value, int delta,
int width = warpSize) {
- return __shfl_down(value, delta, width);
+ return __shfl_down_sync(mask, value, delta, width);
}
// Variant of the (undocumented) version from the CUDA SDK, but using unsigned
// instead of float for lo and hi (which is incorrect with ftz, for example).
// A bug has been filed with NVIDIA and will be fixed in the next CUDA release.
// TODO(csigg): remove when the bug is fixed in the next CUDA release.
-__device__ EIGEN_ALWAYS_INLINE double CudaShuffleDown(double value, int delta,
+__device__ EIGEN_ALWAYS_INLINE double CudaShuffleDown(unsigned mask,
+ double value, int delta,
int width = warpSize) {
unsigned lo, hi;
asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(value));
- hi = __shfl_down(hi, delta, width);
- lo = __shfl_down(lo, delta, width);
+ hi = __shfl_down_sync(mask, hi, delta, width);
+ lo = __shfl_down_sync(mask, lo, delta, width);
asm volatile("mov.b64 %0, {%1,%2};" : "=d"(value) : "r"(lo), "r"(hi));
return value;
}
template <typename T>
-__device__ EIGEN_ALWAYS_INLINE T CudaShuffleXor(T value, int laneMask,
+__device__ EIGEN_ALWAYS_INLINE T CudaShuffleXor(unsigned mask,
+ T value, int laneMask,
int width = warpSize) {
- return __shfl_xor(value, laneMask, width);
+ return __shfl_xor_sync(mask, value, laneMask, width);
}
// Variant of the (undocumented) version from the CUDA SDK, but using unsigned
// instead of float for lo and hi (which is incorrect with ftz, for example).
// A bug has been filed with NVIDIA and will be fixed in the next CUDA release.
// TODO(csigg): remove when the bug is fixed in the next CUDA release.
-__device__ EIGEN_ALWAYS_INLINE double CudaShuffleXor(double value, int laneMask,
+__device__ EIGEN_ALWAYS_INLINE double CudaShuffleXor(unsigned mask,
+ double value, int laneMask,
int width = warpSize) {
unsigned lo, hi;
asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(value));
- hi = __shfl_xor(hi, laneMask, width);
- lo = __shfl_xor(lo, laneMask, width);
+ hi = __shfl_xor_sync(mask, hi, laneMask, width);
+ lo = __shfl_xor_sync(mask, lo, laneMask, width);
asm volatile("mov.b64 %0, {%1,%2};" : "=d"(value) : "r"(lo), "r"(hi));
return value;
}
diff --git a/tensorflow/core/util/mkl_util.h b/tensorflow/core/util/mkl_util.h
index cb22a50e8f..f4bec9524a 100644
--- a/tensorflow/core/util/mkl_util.h
+++ b/tensorflow/core/util/mkl_util.h
@@ -65,6 +65,8 @@ class MklShape {
void SetDimensions(const size_t dimension) { dimension_ = dimension; }
+ void SetMklLayout(dnnLayout_t mklLayout) { mklLayout_ = mklLayout; }
+
void SetMklLayout(const void* primitive, size_t resourceType) {
CHECK_EQ(
dnnLayoutCreateFromPrimitive_F32(&mklLayout_, (dnnPrimitive_t)primitive,
@@ -135,6 +137,7 @@ class MklShape {
size_t GetDimension() const { return dimension_; }
const size_t* GetSizes() const { return sizes_; }
int64 dim_size(int index) const { return sizes_[index]; }
+  int64 tf_dim_size(int index) const {
+    return sizes_[tf_to_mkl_dim_map_[index]];
+  }
const size_t* GetStrides() const { return strides_; }
const size_t* GetTfToMklDimMap() const { return tf_to_mkl_dim_map_; }
size_t tf_dim_idx(int index) const { return tf_to_mkl_dim_map_[index]; }
@@ -581,7 +584,7 @@ inline void CopyTfTensorInToOutWithShape(OpKernelContext* context,
context->set_output(idx_data_out, output);
}
-inline void FowardTfTensorInToOut(OpKernelContext* context,
+inline void ForwardTfTensorInToOut(OpKernelContext* context,
int idx_in, int idx_out) {
int num_inputs = context->num_inputs();
int num_outputs = context->num_outputs();
@@ -598,7 +601,7 @@ inline void FowardTfTensorInToOut(OpKernelContext* context,
}
}
-inline void ForwarMklTensorInToOut(OpKernelContext* context,
+inline void ForwardMklTensorInToOut(OpKernelContext* context,
int idx_in, int idx_out) {
int num_inputs = context->num_inputs();
int num_outputs = context->num_outputs();
@@ -616,6 +619,98 @@ inline void ForwarMklTensorInToOut(OpKernelContext* context,
}
}
+// Forward the MKL shape ONLY (used in element-wise and other ops where
+// we call the Eigen implementation and the MKL shape is not used).
+inline void ForwardMklMetaDataInToOut(OpKernelContext* context,
+ uint idx_data_in, uint idx_data_out) {
+ uint idx_meta_in = GetTensorMetaDataIndex(idx_data_in, context->num_inputs());
+ uint idx_meta_out =
+ GetTensorMetaDataIndex(idx_data_out, context->num_outputs());
+
+ if (IsRefType(context->input_dtype(idx_data_in))) {
+ context->forward_ref_input_to_ref_output(idx_meta_in, idx_meta_out);
+ } else {
+ context->set_output(idx_meta_out, context->input(idx_meta_in));
+ }
+}
+
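+// Illustrative use of the helper above: an element-wise MKL op that falls
+// back to the Eigen kernel can forward its data tensor unchanged and call
+// ForwardMklMetaDataInToOut(context, 0, 0) to pass the meta tensor through.
+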
+// Set a dummy MKL shape (called when the output is in TF format)
+inline void SetDummyMklShapeOutput(OpKernelContext* context,
+ uint idx_data_out) {
+ MklShape mkl_shape_output;
+ mkl_shape_output.SetMklTensor(false);
+ AllocateOutputSetMklShape(context, idx_data_out, mkl_shape_output);
+}
+
+// Checks if the TF shape for both MKL tensors is the same or not
+// Returns: true if both TF shapes are the same, false otherwise
+inline bool MklCompareShapes(const MklShape* input_shape_0,
+ const MklShape* input_shape_1) {
+ // Check for number of dimensions
+ if (input_shape_0->GetDimension() != input_shape_1->GetDimension()) {
+ return false;
+ }
+
+ // Check size of each dimension
+ size_t ndims = input_shape_0->GetDimension();
+ for (size_t i = 0; i < ndims; i++) {
+ if (input_shape_0->dim_size(i) != input_shape_1->dim_size(i)) {
+ return false;
+ }
+ }
+
+ return true;
+}
+
+// Checks if the TF shape for both tensors is the same or not
+// Returns: true if TF shapes for both are the same, false otherwise
+inline bool MklCompareShapes(const MklShape* input_shape_0,
+ const TensorShape* input_shape_1) {
+ // Check for number of dimensions
+ if (input_shape_0->GetDimension() != input_shape_1->dims()) {
+ return false;
+ }
+
+ // Check size of each dimension
+ size_t ndims = input_shape_0->GetDimension();
+ for (size_t i = 0; i < ndims; i++) {
+ if (input_shape_0->tf_dim_size(i) != input_shape_1->dim_size(i)) {
+ return false;
+ }
+ }
+
+ return true;
+}
+
+// Checks if the TF shape for both tensors is the same or not
+// Returns: true if TF shapes for both are the same, false otherwise
+inline bool MklCompareShapes(const TensorShape* input_shape_0,
+ const MklShape* input_shape_1) {
+ return MklCompareShapes(input_shape_1, input_shape_0);
+}
+
+// Checks if the TF shape for both tensors is the same or not
+// Returns: true if TF shapes for both are the same, false otherwise
+inline bool MklCompareShapes(const TensorShape* input_shape_0,
+ const TensorShape* input_shape_1) {
+ // Check for number of dimensions
+ if (input_shape_0->dims() != input_shape_1->dims()) {
+ return false;
+ }
+
+ // Check size of each dimension
+ size_t ndims = input_shape_0->dims();
+ for (size_t i = 0; i < ndims; i++) {
+ if (input_shape_0->dim_size(i) != input_shape_1->dim_size(i)) {
+ return false;
+ }
+ }
+
+ return true;
+}
+
+// TODO(intel_tf): Remove this routine when faster MKL layout conversion is
+// out.
inline void MklNHWCToNCHW(const Tensor& input, Tensor** output) {
const float* buf_in = input.flat<float>().data();
float* buf_out = (*output)->flat<float>().data();
@@ -652,11 +747,19 @@ namespace mkl_op_registry {
static const char* kMklOpLabel = "MklOp";
static const char* kMklOpLabelPattern = "label='MklOp'";
+// Get the name of the Mkl op from the original TensorFlow op name.
+// We prefix '_Mkl' to the original op name to get the Mkl op name.
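+// For example, GetMklOpName("Conv2D") returns "_MklConv2D".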
+inline string GetMklOpName(const string& name) {
+  // Prefix added to the TensorFlow op name to construct the Mkl op name.
+ const char* const kMklOpPrefix = "_Mkl";
+ return string(kMklOpPrefix) + name;
+}
+
// Check whether opname with type T is registered as MKL-compliant.
//
// @input: name of the op
// @input: T datatype to be used for checking op
-// @return: true if opname is registered as Mkl op
+// @return: true if opname is registered as Mkl op; false otherwise
static inline bool IsMklOp(const std::string& op_name, DataType T) {
string kernel = KernelsRegisteredForOp(op_name);
bool result =
@@ -667,6 +770,28 @@ static inline bool IsMklOp(const std::string& op_name, DataType T) {
return result;
}
+// Check whether opname with type T is registered as MKL-compliant and
+// is element-wise.
+//
+// @input: name of the op
+// @input: T datatype to be used for checking op
+// @return: true if opname is registered as element-wise Mkl op; false otherwise
+static inline bool IsMklElementWiseOp(const std::string& op_name, DataType T) {
+ if (!IsMklOp(op_name, T)) {
+ return false;
+ }
+
+ bool result = (0 == op_name.compare(GetMklOpName("Add")) ||
+ 0 == op_name.compare(GetMklOpName("Sub")) ||
+ 0 == op_name.compare(GetMklOpName("Mul")) ||
+ 0 == op_name.compare(GetMklOpName("Maximum")) ||
+ 0 == op_name.compare(GetMklOpName("SquaredDifference")));
+
+ VLOG(1) << "mkl_op_registry::" << op_name
+ << " is elementwise MKL op: " << result;
+ return result;
+}
+
} // namespace mkl_op_registry
} // namespace tensorflow
diff --git a/tensorflow/docs_src/about/bib.md b/tensorflow/docs_src/about/bib.md
index 0c0e88c1fe..c9f0c532c6 100644
--- a/tensorflow/docs_src/about/bib.md
+++ b/tensorflow/docs_src/about/bib.md
@@ -37,7 +37,7 @@ system, we suggest you cite this whitepaper.
<pre>
@misc{tensorflow2015-whitepaper,
title={ {TensorFlow}: Large-Scale Machine Learning on Heterogeneous Systems},
-url={http://tensorflow.org/},
+url={https://www.tensorflow.org/},
note={Software available from tensorflow.org},
author={
Mart\'{\i}n~Abadi and
diff --git a/tensorflow/docs_src/extend/estimators.md b/tensorflow/docs_src/extend/estimators.md
index 5265e5889b..5defade7ae 100644
--- a/tensorflow/docs_src/extend/estimators.md
+++ b/tensorflow/docs_src/extend/estimators.md
@@ -15,7 +15,7 @@ as regressors and classifiers:
Construct a neural network regression model.
* @{tf.estimator.DNNLinearCombinedClassifier}:
Construct a neural network and linear combined classification model.
-* @{tf.estimator.DNNRegressor}:
+* @{tf.estimator.DNNLinearCombinedRegressor}:
Construct a neural network and linear combined regression model.
But what if none of `tf.estimator`'s predefined model types meets your needs?
diff --git a/tensorflow/docs_src/get_started/get_started.md b/tensorflow/docs_src/get_started/get_started.md
index 8eed9b5c5b..67fddfe809 100644
--- a/tensorflow/docs_src/get_started/get_started.md
+++ b/tensorflow/docs_src/get_started/get_started.md
@@ -447,7 +447,7 @@ estimator = tf.estimator.Estimator(model_fn=model_fn)
x_train = np.array([1., 2., 3., 4.])
y_train = np.array([0., -1., -2., -3.])
x_eval = np.array([2., 5., 8., 1.])
-y_eval = np.array([-1.01, -4.1, -7, 0.])
+y_eval = np.array([-1.01, -4.1, -7., 0.])
input_fn = tf.estimator.inputs.numpy_input_fn(
{"x": x_train}, y_train, batch_size=4, num_epochs=None, shuffle=True)
train_input_fn = tf.estimator.inputs.numpy_input_fn(
diff --git a/tensorflow/docs_src/install/install_c.md b/tensorflow/docs_src/install/install_c.md
index 7ebf5c4a2c..04cd462848 100644
--- a/tensorflow/docs_src/install/install_c.md
+++ b/tensorflow/docs_src/install/install_c.md
@@ -35,7 +35,7 @@ enable TensorFlow for C:
OS="linux" # Change to "darwin" for Mac OS
TARGET_DIRECTORY="/usr/local"
curl -L \
- "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-${OS}-x86_64-1.3.0.tar.gz" |
+ "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-${OS}-x86_64-1.4.0-dev.tar.gz" |
sudo tar -C $TARGET_DIRECTORY -xz
The `tar` command extracts the TensorFlow C library into the `lib`
diff --git a/tensorflow/docs_src/install/install_go.md b/tensorflow/docs_src/install/install_go.md
index b991fd0f93..b7fa1fe39a 100644
--- a/tensorflow/docs_src/install/install_go.md
+++ b/tensorflow/docs_src/install/install_go.md
@@ -35,7 +35,7 @@ steps to install this library and enable TensorFlow for Go:
TF_TYPE="cpu" # Change to "gpu" for GPU support
TARGET_DIRECTORY='/usr/local'
curl -L \
- "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.3.0.tar.gz" |
+ "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.4.0-dev.tar.gz" |
sudo tar -C $TARGET_DIRECTORY -xz
The `tar` command extracts the TensorFlow C library into the `lib`
diff --git a/tensorflow/docs_src/install/install_java.md b/tensorflow/docs_src/install/install_java.md
index 2adcd4da73..e1200dde12 100644
--- a/tensorflow/docs_src/install/install_java.md
+++ b/tensorflow/docs_src/install/install_java.md
@@ -34,7 +34,7 @@ following to the project's `pom.xml` to use the TensorFlow Java APIs:
<dependency>
<groupId>org.tensorflow</groupId>
<artifactId>tensorflow</artifactId>
- <version>1.3.0</version>
+ <version>1.4.0-dev</version>
</dependency>
```
@@ -63,7 +63,7 @@ As an example, these steps will create a Maven project that uses TensorFlow:
<dependency>
<groupId>org.tensorflow</groupId>
<artifactId>tensorflow</artifactId>
- <version>1.3.0</version>
+ <version>1.4.0-dev</version>
</dependency>
</dependencies>
</project>
@@ -122,7 +122,7 @@ refer to the simpler instructions above instead.
Take the following steps to install TensorFlow for Java on Linux or Mac OS:
1. Download
- [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.3.0.jar),
+ [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.4.0-dev.jar),
which is the TensorFlow Java Archive (JAR).
2. Decide whether you will run TensorFlow for Java on CPU(s) only or with
@@ -141,7 +141,7 @@ Take the following steps to install TensorFlow for Java on Linux or Mac OS:
OS=$(uname -s | tr '[:upper:]' '[:lower:]')
mkdir -p ./jni
curl -L \
- "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-${TF_TYPE}-${OS}-x86_64-1.3.0.tar.gz" |
+ "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-${TF_TYPE}-${OS}-x86_64-1.4.0-dev.tar.gz" |
tar -xz -C ./jni
### Install on Windows
@@ -149,10 +149,10 @@ Take the following steps to install TensorFlow for Java on Linux or Mac OS:
Take the following steps to install TensorFlow for Java on Windows:
1. Download
- [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.3.0.jar),
+ [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.4.0-dev.jar),
which is the TensorFlow Java Archive (JAR).
2. Download the following Java Native Interface (JNI) file appropriate for
- [TensorFlow for Java on Windows](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.3.0.zip).
+ [TensorFlow for Java on Windows](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.4.0-dev.zip).
3. Extract this .zip file.
@@ -200,7 +200,7 @@ must be part of your `classpath`. For example, you can include the
downloaded `.jar` in your `classpath` by using the `-cp` compilation flag
as follows:
-<pre><b>javac -cp libtensorflow-1.3.0.jar HelloTF.java</b></pre>
+<pre><b>javac -cp libtensorflow-1.4.0-dev.jar HelloTF.java</b></pre>
### Running
@@ -214,11 +214,11 @@ two files are available to the JVM:
For example, the following command line executes the `HelloTF` program on Linux
and Mac OS X:
-<pre><b>java -cp libtensorflow-1.3.0.jar:. -Djava.library.path=./jni HelloTF</b></pre>
+<pre><b>java -cp libtensorflow-1.4.0-dev.jar:. -Djava.library.path=./jni HelloTF</b></pre>
And the following command line executes the `HelloTF` program on Windows:
-<pre><b>java -cp libtensorflow-1.3.0.jar;. -Djava.library.path=jni HelloTF</b></pre>
+<pre><b>java -cp libtensorflow-1.4.0-dev.jar;. -Djava.library.path=jni HelloTF</b></pre>
If the program prints <tt>Hello from <i>version</i></tt>, you've successfully
installed TensorFlow for Java and are ready to use the API. If the program
diff --git a/tensorflow/docs_src/install/install_linux.md b/tensorflow/docs_src/install/install_linux.md
index d5e481520c..b759797082 100644
--- a/tensorflow/docs_src/install/install_linux.md
+++ b/tensorflow/docs_src/install/install_linux.md
@@ -172,7 +172,7 @@ Take the following steps to install TensorFlow with Virtualenv:
virtualenv environment:
<pre>(tensorflow)$ <b>pip3 install --upgrade \
- https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.3.0-cp34-cp34m-linux_x86_64.whl</b></pre>
+ https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.4.0dev-cp34-cp34m-linux_x86_64.whl</b></pre>
If you encounter installation problems, see
[Common Installation Problems](#common_installation_problems).
@@ -277,7 +277,7 @@ take the following steps:
<pre>
$ <b>sudo pip3 install --upgrade \
- https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.3.0-cp34-cp34m-linux_x86_64.whl</b>
+ https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.4.0dev-cp34-cp34m-linux_x86_64.whl</b>
</pre>
If this step fails, see
@@ -464,7 +464,7 @@ Take the following steps to install TensorFlow in an Anaconda environment:
<pre>
(tensorflow)$ <b>pip install --ignore-installed --upgrade \
- https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.3.0-cp34-cp34m-linux_x86_64.whl</b></pre>
+ https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.4.0dev-cp34-cp34m-linux_x86_64.whl</b></pre>
<a name="ValidateYourInstallation"></a>
@@ -632,14 +632,14 @@ This section documents the relevant values for Linux installations.
CPU only:
<pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.3.0-cp27-none-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.4.0dev-cp27-none-linux_x86_64.whl
</pre>
GPU support:
<pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.3.0-cp27-none-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.4.0dev-cp27-none-linux_x86_64.whl
</pre>
Note that GPU support requires the NVIDIA hardware and software described in
@@ -651,14 +651,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
CPU only:
<pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.3.0-cp34-cp34m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.4.0dev-cp34-cp34m-linux_x86_64.whl
</pre>
GPU support:
<pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.3.0-cp34-cp34m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.4.0dev-cp34-cp34m-linux_x86_64.whl
</pre>
Note that GPU support requires the NVIDIA hardware and software described in
@@ -670,14 +670,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
CPU only:
<pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.3.0-cp35-cp35m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.4.0dev-cp35-cp35m-linux_x86_64.whl
</pre>
GPU support:
<pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.3.0-cp35-cp35m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.4.0dev-cp35-cp35m-linux_x86_64.whl
</pre>
@@ -689,14 +689,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
CPU only:
<pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.3.0-cp36-cp36m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.4.0dev-cp36-cp36m-linux_x86_64.whl
</pre>
GPU support:
<pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.3.0-cp36-cp36m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.4.0dev-cp36-cp36m-linux_x86_64.whl
</pre>
diff --git a/tensorflow/docs_src/install/install_mac.md b/tensorflow/docs_src/install/install_mac.md
index 6552bff459..448e300b17 100644
--- a/tensorflow/docs_src/install/install_mac.md
+++ b/tensorflow/docs_src/install/install_mac.md
@@ -109,7 +109,7 @@ Take the following steps to install TensorFlow with Virtualenv:
TensorFlow in the active Virtualenv is as follows:
<pre> $ <b>pip3 install --upgrade \
- https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.3.0-py2-none-any.whl</b></pre>
+ https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.4.0dev-py2-none-any.whl</b></pre>
If you encounter installation problems, see
[Common Installation Problems](#common-installation-problems).
@@ -230,7 +230,7 @@ take the following steps:
issue the following command:
<pre> $ <b>sudo pip3 install --upgrade \
- https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.3.0-py2-none-any.whl</b> </pre>
+ https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.4.0dev-py2-none-any.whl</b> </pre>
If the preceding command fails, see
[installation problems](#common-installation-problems).
@@ -339,7 +339,7 @@ Take the following steps to install TensorFlow in an Anaconda environment:
TensorFlow for Python 2.7:
<pre> (tensorflow)$ <b>pip install --ignore-installed --upgrade \
- https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.3.0-py2-none-any.whl</b></pre>
+ https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.4.0dev-py2-none-any.whl</b></pre>
<a name="ValidateYourInstallation"></a>
@@ -512,7 +512,7 @@ This section documents the relevant values for Mac OS installations.
<pre>
-https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.3.0-py2-none-any.whl
+https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.4.0dev-py2-none-any.whl
</pre>
@@ -520,7 +520,7 @@ https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.3.0-py2-none-any.
<pre>
-https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.3.0-py3-none-any.whl
+https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.4.0dev-py3-none-any.whl
</pre>
diff --git a/tensorflow/docs_src/install/install_sources.md b/tensorflow/docs_src/install/install_sources.md
index d58db00a4c..d8925d3909 100644
--- a/tensorflow/docs_src/install/install_sources.md
+++ b/tensorflow/docs_src/install/install_sources.md
@@ -342,10 +342,10 @@ Invoke `pip install` to install that pip package.
The filename of the `.whl` file depends on your platform.
For example, the following command will install the pip package
-for TensorFlow 1.3.0 on Linux:
+for TensorFlow 1.4.0dev on Linux:
<pre>
-$ <b>sudo pip install /tmp/tensorflow_pkg/tensorflow-1.3.0-py2-none-any.whl</b>
+$ <b>sudo pip install /tmp/tensorflow_pkg/tensorflow-1.4.0dev-py2-none-any.whl</b>
</pre>
## Validate your installation
diff --git a/tensorflow/docs_src/install/install_windows.md b/tensorflow/docs_src/install/install_windows.md
index 3025c9971a..ae8749c231 100644
--- a/tensorflow/docs_src/install/install_windows.md
+++ b/tensorflow/docs_src/install/install_windows.md
@@ -153,6 +153,9 @@ TensorFlow}.
If the system outputs an error message instead of a greeting, see [Common
installation problems](#common_installation_problems).
+There is also a helpful [script](https://gist.github.com/mrry/ee5dbcfdd045fa48a27d56664411d41c)
+for diagnosing TensorFlow installation issues on Windows.
+
## Common installation problems
We are relying on Stack Overflow to document TensorFlow installation problems
diff --git a/tensorflow/docs_src/programmers_guide/graphs.md b/tensorflow/docs_src/programmers_guide/graphs.md
index 989018bc86..6ba8bb7a34 100644
--- a/tensorflow/docs_src/programmers_guide/graphs.md
+++ b/tensorflow/docs_src/programmers_guide/graphs.md
@@ -319,7 +319,7 @@ described below.
* **`target`.** If this argument is left empty (the default), the session will
only use devices in the local machine. However, you may also specify a
`grpc://` URL to specify the address of a TensorFlow server, which gives the
- session access to all devices on machines that that server controls. See
+ session access to all devices on machines that this server controls. See
@{tf.train.Server} for details of how to create a TensorFlow
server. For example, in the common **between-graph replication**
configuration, the @{tf.Session} connects to a @{tf.train.Server} in the same
diff --git a/tensorflow/docs_src/programmers_guide/tensors.md b/tensorflow/docs_src/programmers_guide/tensors.md
index ff747f326f..cc4181e75e 100644
--- a/tensorflow/docs_src/programmers_guide/tensors.md
+++ b/tensorflow/docs_src/programmers_guide/tensors.md
@@ -147,7 +147,7 @@ Passing a single number, however, returns a subvector of a matrix, as follows:
```python
-my_row_vetor = my_matrix[2]
+my_row_vector = my_matrix[2]
my_column_vector = my_matrix[:, 3]
```
diff --git a/tensorflow/docs_src/tutorials/layers.md b/tensorflow/docs_src/tutorials/layers.md
index 0815cc2a17..8037c92c73 100644
--- a/tensorflow/docs_src/tutorials/layers.md
+++ b/tensorflow/docs_src/tutorials/layers.md
@@ -270,7 +270,7 @@ The `padding` argument specifies one of two enumerated values
(case-insensitive): `valid` (default value) or `same`. To specify that the
output tensor should have the same width and height values as the input tensor,
we set `padding=same` here, which instructs TensorFlow to add 0 values to the
-edges of the output tensor to preserve width and height of 28. (Without padding,
+edges of the input tensor to preserve width and height of 28. (Without padding,
a 5x5 convolution over a 28x28 tensor will produce a 24x24 tensor, as there are
24x24 locations to extract a 5x5 tile from a 28x28 grid.)
diff --git a/tensorflow/examples/android/README.md b/tensorflow/examples/android/README.md
index dda6e94f4b..bed8e21498 100644
--- a/tensorflow/examples/android/README.md
+++ b/tensorflow/examples/android/README.md
@@ -37,7 +37,7 @@ on API >= 14 devices.
4. [TF
Speech](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/examples/android/src/org/tensorflow/demo/SpeechActivity.java):
Runs a simple speech recognition model built by the [audio training
- tutorial](https://www.tensorflow.org/tutorials/image_retraining). Listens
+ tutorial](https://www.tensorflow.org/versions/master/tutorials/audio_recognition). Listens
for a small set of words, and highlights them in the UI when they are
recognized.
diff --git a/tensorflow/examples/android/src/org/tensorflow/demo/CameraActivity.java b/tensorflow/examples/android/src/org/tensorflow/demo/CameraActivity.java
index 03294436b8..83cf9f0a2a 100644
--- a/tensorflow/examples/android/src/org/tensorflow/demo/CameraActivity.java
+++ b/tensorflow/examples/android/src/org/tensorflow/demo/CameraActivity.java
@@ -110,7 +110,7 @@ public abstract class CameraActivity extends Activity implements OnImageAvailabl
rgbBytes = new int[previewWidth * previewHeight];
onPreviewSizeChosen(new Size(previewSize.width, previewSize.height), 90);
}
- ImageUtils.convertYUV420SPToARGB8888(bytes, rgbBytes, previewWidth, previewHeight, false);
+ ImageUtils.convertYUV420SPToARGB8888(bytes, previewWidth, previewHeight, rgbBytes);
} catch (final Exception e) {
LOGGER.e(e, "Exception!");
return;
diff --git a/tensorflow/examples/android/src/org/tensorflow/demo/env/ImageUtils.java b/tensorflow/examples/android/src/org/tensorflow/demo/env/ImageUtils.java
index 5f2ff9164c..5629f179c4 100644
--- a/tensorflow/examples/android/src/org/tensorflow/demo/env/ImageUtils.java
+++ b/tensorflow/examples/android/src/org/tensorflow/demo/env/ImageUtils.java
@@ -27,7 +27,7 @@ import java.io.FileOutputStream;
public class ImageUtils {
@SuppressWarnings("unused")
private static final Logger LOGGER = new Logger();
-
+
static {
try {
System.loadLibrary("tensorflow_demo");
@@ -98,73 +98,105 @@ public class ImageUtils {
// Always prefer the native implementation if available.
private static boolean useNativeConversion = true;
- public static void convertYUV420ToARGB8888(
- byte[] yData,
- byte[] uData,
- byte[] vData,
+ public static void convertYUV420SPToARGB8888(
+ byte[] input,
int width,
int height,
- int yRowStride,
- int uvRowStride,
- int uvPixelStride,
- int[] out) {
+ int[] output) {
if (useNativeConversion) {
try {
- convertYUV420ToARGB8888(
- yData, uData, vData, out, width, height, yRowStride, uvRowStride, uvPixelStride, false);
+ ImageUtils.convertYUV420SPToARGB8888(input, output, width, height, false);
return;
} catch (UnsatisfiedLinkError e) {
- LOGGER.w("Native YUV -> RGB implementation not found, falling back to Java implementation");
+ LOGGER.w(
+ "Native YUV420SP -> RGB implementation not found, falling back to Java implementation");
useNativeConversion = false;
}
}
- int i = 0;
- for (int y = 0; y < height; y++) {
- int pY = yRowStride * y;
- int uv_row_start = uvRowStride * (y >> 1);
- int pUV = uv_row_start;
- int pV = uv_row_start;
-
- for (int x = 0; x < width; x++) {
- int uv_offset = pUV + (x >> 1) * uvPixelStride;
- out[i++] =
- YUV2RGB(
- convertByteToInt(yData, pY + x),
- convertByteToInt(uData, uv_offset),
- convertByteToInt(vData, uv_offset));
+    // Java implementation of the YUV420SP to ARGB8888 conversion.
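+    // In YUV420SP (NV21) the Y plane (width * height bytes) is followed by
+    // interleaved V/U pairs subsampled 2x in each dimension, hence the
+    // (j >> 1) row indexing and the V-before-U reads below.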
+ final int frameSize = width * height;
+ for (int j = 0, yp = 0; j < height; j++) {
+ int uvp = frameSize + (j >> 1) * width;
+ int u = 0;
+ int v = 0;
+
+ for (int i = 0; i < width; i++, yp++) {
+ int y = 0xff & input[yp];
+ if ((i & 1) == 0) {
+ v = 0xff & input[uvp++];
+ u = 0xff & input[uvp++];
+ }
+
+ output[yp] = YUV2RGB(y, u, v);
}
}
}
- private static int convertByteToInt(byte[] arr, int pos) {
- return arr[pos] & 0xFF;
- }
-
- private static int YUV2RGB(int nY, int nU, int nV) {
- nY -= 16;
- nU -= 128;
- nV -= 128;
- if (nY < 0) nY = 0;
+ private static int YUV2RGB(int y, int u, int v) {
+ // Adjust and check YUV values
+ y = (y - 16) < 0 ? 0 : (y - 16);
+ u -= 128;
+ v -= 128;
// This is the floating point equivalent. We do the conversion in integer
// because some Android devices do not have floating point in hardware.
// nR = (int)(1.164 * nY + 2.018 * nU);
// nG = (int)(1.164 * nY - 0.813 * nV - 0.391 * nU);
// nB = (int)(1.164 * nY + 1.596 * nV);
+ int y1192 = 1192 * y;
+ int r = (y1192 + 1634 * v);
+ int g = (y1192 - 833 * v - 400 * u);
+ int b = (y1192 + 2066 * u);
+
+    // Clip the RGB values to lie inside the range [0, kMaxChannelValue].
+ r = r > kMaxChannelValue ? kMaxChannelValue : (r < 0 ? 0 : r);
+ g = g > kMaxChannelValue ? kMaxChannelValue : (g < 0 ? 0 : g);
+ b = b > kMaxChannelValue ? kMaxChannelValue : (b < 0 ? 0 : b);
+
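+    // r, g and b are scaled by 1024 (10 fractional bits); the shifts below
+    // place each channel into its byte while dropping the fraction, e.g.
+    // ((r << 6) & 0xff0000) == (((r >> 10) & 0xff) << 16).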
+ return 0xff000000 | ((r << 6) & 0xff0000) | ((g >> 2) & 0xff00) | ((b >> 10) & 0xff);
+ }
+
+ public static void convertYUV420ToARGB8888(
+ byte[] yData,
+ byte[] uData,
+ byte[] vData,
+ int width,
+ int height,
+ int yRowStride,
+ int uvRowStride,
+ int uvPixelStride,
+ int[] out) {
+ if (useNativeConversion) {
+ try {
+ convertYUV420ToARGB8888(
+ yData, uData, vData, out, width, height, yRowStride, uvRowStride, uvPixelStride, false);
+ return;
+ } catch (UnsatisfiedLinkError e) {
+ LOGGER.w(
+ "Native YUV420 -> RGB implementation not found, falling back to Java implementation");
+ useNativeConversion = false;
+ }
+ }
- final int foo = 1192 * nY;
- int nR = foo + 1634 * nV;
- int nG = foo - 833 * nV - 400 * nU;
- int nB = foo + 2066 * nU;
+ int yp = 0;
+ for (int j = 0; j < height; j++) {
+ int pY = yRowStride * j;
+ int pUV = uvRowStride * (j >> 1);
- nR = Math.min(kMaxChannelValue, Math.max(0, nR));
- nG = Math.min(kMaxChannelValue, Math.max(0, nG));
- nB = Math.min(kMaxChannelValue, Math.max(0, nB));
+ for (int i = 0; i < width; i++) {
+ int uv_offset = pUV + (i >> 1) * uvPixelStride;
- return 0xff000000 | ((nR << 6) & 0x00ff0000) | ((nG >> 2) & 0x0000FF00) | ((nB >> 10) & 0xff);
+ out[yp++] = YUV2RGB(
+ 0xff & yData[pY + i],
+ 0xff & uData[uv_offset],
+ 0xff & vData[uv_offset]);
+ }
+ }
}
+
/**
* Converts YUV420 semi-planar data to ARGB 8888 data using the supplied width and height. The
* input and output must already be allocated and non-null. For efficiency, no error checking is
@@ -176,7 +208,7 @@ public class ImageUtils {
* @param height The height of the input image.
* @param halfSize If true, downsample to 50% in each dimension, otherwise not.
*/
- public static native void convertYUV420SPToARGB8888(
+ private static native void convertYUV420SPToARGB8888(
byte[] input, int[] output, int width, int height, boolean halfSize);
/**
@@ -193,7 +225,7 @@ public class ImageUtils {
* @param halfSize If true, downsample to 50% in each dimension, otherwise not.
* @param output A pre-allocated array for the ARGB 8:8:8:8 output data.
*/
- public static native void convertYUV420ToARGB8888(
+ private static native void convertYUV420ToARGB8888(
byte[] y,
byte[] u,
byte[] v,
@@ -215,7 +247,7 @@ public class ImageUtils {
* @param width The width of the input image.
* @param height The height of the input image.
*/
- public static native void convertYUV420SPToRGB565(
+ private static native void convertYUV420SPToRGB565(
byte[] input, byte[] output, int width, int height);
/**
@@ -228,7 +260,7 @@ public class ImageUtils {
* @param width The width of the input image.
* @param height The height of the input image.
*/
- public static native void convertARGB8888ToYUV420SP(
+ private static native void convertARGB8888ToYUV420SP(
int[] input, byte[] output, int width, int height);
/**
@@ -241,7 +273,7 @@ public class ImageUtils {
* @param width The width of the input image.
* @param height The height of the input image.
*/
- public static native void convertRGB565ToYUV420SP(
+ private static native void convertRGB565ToYUV420SP(
byte[] input, byte[] output, int width, int height);
/**
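The integer path above is a fixed-point version of the commented BT.601 formulas: each coefficient is scaled by 1024 (1.164 ≈ 1192/1024, 1.596 ≈ 1634/1024, 2.018 ≈ 2066/1024), so every channel carries a 1024x scale until the final shifts strip it. Below is a minimal Python sketch of the same arithmetic for checking a pixel by hand; the function name is ours, and the kMaxChannelValue of 262143 (2^18 - 1) is inferred from the 18-bit channel layout implied by the `(r << 6) & 0xff0000` packing, not shown in this hunk.

```python
K_MAX_CHANNEL_VALUE = 262143  # 2**18 - 1: channels carry a 1024x fixed-point scale


def clip(c):
    return min(max(c, 0), K_MAX_CHANNEL_VALUE)


def yuv2rgb(y, u, v):
    """Fixed-point BT.601 YUV -> ARGB8888, mirroring the Java integer path."""
    y = max(y - 16, 0)
    u -= 128
    v -= 128
    y1192 = 1192 * y                      # 1.164 * 1024
    r = clip(y1192 + 1634 * v)            # 1.596 * 1024
    g = clip(y1192 - 833 * v - 400 * u)   # 0.813 * 1024 and 0.391 * 1024
    b = clip(y1192 + 2066 * u)            # 2.018 * 1024
    # Each channel holds 18 significant bits; the shifts drop the 1024x scale
    # while placing the top 8 bits of each channel into its ARGB slot.
    return 0xFF000000 | ((r << 6) & 0xFF0000) | ((g >> 2) & 0xFF00) | ((b >> 10) & 0xFF)


# Video-range black (Y=16, U=V=128) packs to opaque black.
assert yuv2rgb(16, 128, 128) == 0xFF000000
```

The same YUV2RGB core serves both the semi-planar fallback (interleaved VU plane) and the three-plane variant, which indexes its chroma planes through the row and pixel strides.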
diff --git a/tensorflow/examples/ios/README.md b/tensorflow/examples/ios/README.md
index 7974b8c879..7d2eb870be 100644
--- a/tensorflow/examples/ios/README.md
+++ b/tensorflow/examples/ios/README.md
@@ -30,7 +30,7 @@ cp ~/graphs/inception5h/* tensorflow/examples/ios/simple/data/
long time since it is big (~450MB). For example, if you want to run the
simple example, then:
```bash
-cd tensorflow/ios/simple
+cd tensorflow/examples/ios/simple
pod install
open tf_simple_example.xcworkspace # note: not the .xcodeproj directory
```
diff --git a/tensorflow/examples/speech_commands/train.py b/tensorflow/examples/speech_commands/train.py
index 8298a90b44..c92c38b23c 100644
--- a/tensorflow/examples/speech_commands/train.py
+++ b/tensorflow/examples/speech_commands/train.py
@@ -15,9 +15,10 @@
r"""Simple speech recognition to spot a limited number of keywords.
This is a self-contained example script that will train a very basic audio
-recognition model in TensorFlow. It can download the necessary training data,
-and runs with reasonable defaults to train within a few hours even only using a
-CPU. For more information see http://tensorflow.org/tutorials/audio_recognition.
+recognition model in TensorFlow. It downloads the necessary training data and
+runs with reasonable defaults to train within a few hours even using only a CPU.
+For more information, please see
+https://www.tensorflow.org/tutorials/audio_recognition.
It is intended as an introduction to using neural networks for audio
recognition, and is not a full speech recognition system. For more advanced
diff --git a/tensorflow/java/src/main/java/org/tensorflow/Tensor.java b/tensorflow/java/src/main/java/org/tensorflow/Tensor.java
index ffaa242a31..4424100390 100644
--- a/tensorflow/java/src/main/java/org/tensorflow/Tensor.java
+++ b/tensorflow/java/src/main/java/org/tensorflow/Tensor.java
@@ -515,8 +515,6 @@ public final class Tensor implements AutoCloseable {
private static int elemByteSize(DataType dataType) {
switch (dataType) {
- case UINT8:
- return 1;
case FLOAT:
case INT32:
return 4;
@@ -524,6 +522,7 @@ public final class Tensor implements AutoCloseable {
case INT64:
return 8;
case BOOL:
+ case UINT8:
return 1;
case STRING:
throw new IllegalArgumentException("STRING tensors do not have a fixed element size");
diff --git a/tensorflow/java/src/main/native/tensor_jni.cc b/tensorflow/java/src/main/native/tensor_jni.cc
index 7bfe6c896d..745abec244 100644
--- a/tensorflow/java/src/main/native/tensor_jni.cc
+++ b/tensorflow/java/src/main/native/tensor_jni.cc
@@ -41,8 +41,11 @@ size_t elemByteSize(TF_DataType dtype) {
// have the same byte sizes. Validate that:
switch (dtype) {
case TF_BOOL:
+ case TF_UINT8:
static_assert(sizeof(jboolean) == 1,
"Java boolean not compatible with TF_BOOL");
+ static_assert(sizeof(jbyte) == 1,
+ "Java byte not compatible with TF_UINT8");
return 1;
case TF_FLOAT:
case TF_INT32:
@@ -90,6 +93,7 @@ void writeScalar(JNIEnv* env, jobject src, TF_DataType dtype, void* dst,
CASE(TF_DOUBLE, jdouble, "doubleValue", "()D", Double);
CASE(TF_INT32, jint, "intValue", "()I", Int);
CASE(TF_INT64, jlong, "longValue", "()J", Long);
+ CASE(TF_UINT8, jbyte, "byteValue", "()B", Byte);
#undef CASE
case TF_BOOL: {
jclass clazz = env->FindClass("java/lang/Boolean");
@@ -134,6 +138,7 @@ size_t write1DArray(JNIEnv* env, jarray array, TF_DataType dtype, void* dst,
CASE(TF_INT32, jint, Int);
CASE(TF_INT64, jlong, Long);
CASE(TF_BOOL, jboolean, Boolean);
+ CASE(TF_UINT8, jbyte, Byte);
#undef CASE
default:
throwException(env, kIllegalStateException, "invalid DataType(%d)",
@@ -168,6 +173,7 @@ size_t read1DArray(JNIEnv* env, TF_DataType dtype, const void* src,
CASE(TF_INT32, jint, Int);
CASE(TF_INT64, jlong, Long);
CASE(TF_BOOL, jboolean, Boolean);
+ CASE(TF_UINT8, jbyte, Byte);
#undef CASE
default:
throwException(env, kIllegalStateException, "invalid DataType(%d)",
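Both the Java `elemByteSize` switch and its JNI counterpart now put UINT8 in the one-byte bucket next to BOOL, which is what allows byte-backed buffers to carry uint8 tensor data. A hedged Python sketch of the same dispatch table; the dict and helper are illustrative only, not TensorFlow API:

```python
# Illustrative mirror of elemByteSize(); dtype names match the switch cases.
ELEM_BYTE_SIZE = {
    "FLOAT": 4, "INT32": 4,
    "DOUBLE": 8, "INT64": 8,
    "BOOL": 1, "UINT8": 1,  # UINT8 now shares the one-byte case with BOOL
}


def elem_byte_size(dtype):
    if dtype == "STRING":
        raise ValueError("STRING tensors do not have a fixed element size")
    return ELEM_BYTE_SIZE[dtype]


assert elem_byte_size("UINT8") == 1
```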
diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index 9c0db82cbf..524f128154 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -3785,6 +3785,7 @@ py_library(
"layers/convolutional.py",
"layers/core.py",
"layers/layers.py",
+ "layers/maxout.py",
"layers/normalization.py",
"layers/pooling.py",
],
@@ -3866,6 +3867,22 @@ py_test(
)
py_test(
+ name = "layers_maxout_test",
+ size = "small",
+ srcs = ["layers/maxout_test.py"],
+ main = "layers/maxout_test.py",
+ srcs_version = "PY2AND3",
+ deps = [
+ ":client_testlib",
+ ":framework_for_generated_wrappers",
+ ":layers",
+ ":math_ops",
+ ":nn_ops",
+ ":random_ops",
+ ],
+)
+
+py_test(
name = "layers_utils_test",
size = "small",
srcs = ["layers/utils_test.py"],
diff --git a/tensorflow/python/kernel_tests/decode_raw_op_test.py b/tensorflow/python/kernel_tests/decode_raw_op_test.py
index e986b7ff2b..009f3ea4b3 100644
--- a/tensorflow/python/kernel_tests/decode_raw_op_test.py
+++ b/tensorflow/python/kernel_tests/decode_raw_op_test.py
@@ -93,6 +93,22 @@ class DecodeRawOpTest(test.TestCase):
result = decode.eval(feed_dict={in_bytes: [""]})
self.assertEqual(len(result), 1)
+ def testToUInt16(self):
+ with self.test_session():
+ in_bytes = array_ops.placeholder(dtypes.string, shape=[None])
+ decode = parsing_ops.decode_raw(in_bytes, out_type=dtypes.uint16)
+ self.assertEqual([None, None], decode.get_shape().as_list())
+
+ # Use FF/EE/DD/CC so that decoded value is higher than 32768 for uint16
+ result = decode.eval(feed_dict={in_bytes: [b"\xFF\xEE\xDD\xCC"]})
+ self.assertAllEqual(
+ [[0xFF + 0xEE * 256, 0xDD + 0xCC * 256]], result)
+
+ with self.assertRaisesOpError(
+ "Input to DecodeRaw has length 3 that is not a multiple of 2, the "
+ "size of uint16"):
+ decode.eval(feed_dict={in_bytes: ["123", "456"]})
+
if __name__ == "__main__":
test.main()
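The expected values in `testToUInt16` follow from little-endian byte order: the pair `\xFF\xEE` decodes to 0xFF + 0xEE * 256 = 61183 and `\xDD\xCC` to 0xDD + 0xCC * 256 = 52445, both above 32768 so a signed-16 bug would surface. The same decoding with only the standard library, independent of TensorFlow:

```python
import struct

raw = b"\xFF\xEE\xDD\xCC"
# "<2H" = two little-endian unsigned 16-bit integers, matching decode_raw's
# little-endian default for uint16 output.
values = struct.unpack("<2H", raw)
assert values == (0xFF + 0xEE * 256, 0xDD + 0xCC * 256)  # (61183, 52445)
```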
diff --git a/tensorflow/python/kernel_tests/metrics_test.py b/tensorflow/python/kernel_tests/metrics_test.py
index 543039bdd3..cce705110c 100644
--- a/tensorflow/python/kernel_tests/metrics_test.py
+++ b/tensorflow/python/kernel_tests/metrics_test.py
@@ -3538,7 +3538,7 @@ class MeanPerClassAccuracyTest(test.TestCase):
weights_queue = data_flow_ops.FIFOQueue(
6, dtypes=dtypes_lib.float32, shapes=(1, 1))
_enqueue_vector(sess, weights_queue, [1.0])
- _enqueue_vector(sess, weights_queue, [1.0])
+ _enqueue_vector(sess, weights_queue, [0.5])
_enqueue_vector(sess, weights_queue, [1.0])
_enqueue_vector(sess, weights_queue, [0.0])
_enqueue_vector(sess, weights_queue, [1.0])
@@ -3551,7 +3551,7 @@ class MeanPerClassAccuracyTest(test.TestCase):
variables.local_variables_initializer().run()
for _ in range(6):
sess.run(update_op)
- desired_output = np.mean([2.0 / 2.0, 1.0 / 2.0])
+ desired_output = np.mean([2.0 / 2.0, 0.5 / 1.5])
self.assertAlmostEqual(desired_output, mean_accuracy.eval())
def testMultipleUpdatesWithMissingClass(self):
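The updated expectation is plain weighted per-class accuracy: lowering the second sample's weight from 1.0 to 0.5 leaves one class with 2.0 weighted-correct out of 2.0 and the other with 0.5 out of 1.5. The per-class tallies are inferred from the expected value, since the label and prediction queues sit outside this hunk. A quick arithmetic check:

```python
import numpy as np

# (weighted correct, weighted total) per class, inferred from the test's
# expected value of np.mean([2.0 / 2.0, 0.5 / 1.5]).
per_class = [(2.0, 2.0), (0.5, 1.5)]
mean_accuracy = np.mean([correct / total for correct, total in per_class])
print(mean_accuracy)  # 0.666... = mean(1.0, 1/3)
```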
diff --git a/tensorflow/python/kernel_tests/segment_reduction_ops_test.py b/tensorflow/python/kernel_tests/segment_reduction_ops_test.py
index bf20f5d1a9..516a9d000e 100644
--- a/tensorflow/python/kernel_tests/segment_reduction_ops_test.py
+++ b/tensorflow/python/kernel_tests/segment_reduction_ops_test.py
@@ -645,7 +645,6 @@ class SparseSegmentReductionOpTest(SparseSegmentReductionHelper):
with self.assertRaisesOpError(r"Segment id 0 out of range \[0, 0\)"):
s.eval()
-
class SegmentReductionOpBenchmark(test.Benchmark):
outer_dim_options = [2**x for x in range(9, 14, 2)]
ratio_options = [2**x for x in range(1, 6, 2)]
diff --git a/tensorflow/python/kernel_tests/svd_op_test.py b/tensorflow/python/kernel_tests/svd_op_test.py
index fd49e1a6cc..32a623e74a 100644
--- a/tensorflow/python/kernel_tests/svd_op_test.py
+++ b/tensorflow/python/kernel_tests/svd_op_test.py
@@ -41,23 +41,19 @@ class SvdOpTest(test.TestCase):
linalg_ops.svd(vector)
-def _GetSvdOpTest(dtype_, shape_, use_static_shape_):
+def _GetSvdOpTest(dtype_, shape_, use_static_shape_, use_gpu_):
is_complex = dtype_ in (np.complex64, np.complex128)
is_single = dtype_ in (np.float32, np.complex64)
+
+ # The GPU version returns much less precise results.
+ precision_factor = 100 if use_gpu_ else 1
+ tol = precision_factor * (3e-4 if is_single else 1e-12)
def CompareSingularValues(self, x, y):
- if is_single:
- tol = 5e-5
- else:
- tol = 1e-14
self.assertAllClose(x, y, atol=(x[0] + y[0]) * tol)
def CompareSingularVectors(self, x, y, rank):
- if is_single:
- atol = 5e-4
- else:
- atol = 5e-14
# We only compare the first 'rank' singular vectors since the
# remainder form an arbitrary orthonormal basis for the
# (row- or column-) null space, whose exact value depends on
@@ -72,13 +68,9 @@ def _GetSvdOpTest(dtype_, shape_, use_static_shape_):
sum_of_ratios = np.sum(np.divide(y, x), -2, keepdims=True)
phases = np.divide(sum_of_ratios, np.abs(sum_of_ratios))
x *= phases
- self.assertAllClose(x, y, atol=atol)
+ self.assertAllClose(x, y, atol=2 * tol)
def CheckApproximation(self, a, u, s, v, full_matrices):
- if is_single:
- tol = 1e-5
- else:
- tol = 1e-14
# Tests that a ~= u*diag(s)*transpose(v).
batch_shape = a.shape[:-2]
m = a.shape[-2]
@@ -99,10 +91,6 @@ def _GetSvdOpTest(dtype_, shape_, use_static_shape_):
# Tests that x[...,:,:]^H * x[...,:,:] is close to the identity.
xx = math_ops.matmul(x, x, adjoint_a=True)
identity = array_ops.matrix_band_part(array_ops.ones_like(xx), 0, 0)
- if is_single:
- tol = 1e-5
- else:
- tol = 1e-14
self.assertAllClose(identity.eval(), xx.eval(), atol=tol)
def Test(self):
@@ -116,7 +104,7 @@ def _GetSvdOpTest(dtype_, shape_, use_static_shape_):
for compute_uv in False, True:
for full_matrices in False, True:
- with self.test_session() as sess:
+ with self.test_session(use_gpu=use_gpu_) as sess:
if use_static_shape_:
x_tf = constant_op.constant(x_np)
else:
@@ -167,14 +155,15 @@ def _GetSvdOpTest(dtype_, shape_, use_static_shape_):
if __name__ == "__main__":
- for dtype in np.float32, np.float64, np.complex64, np.complex128:
- for rows in 1, 2, 5, 10, 32, 100:
- for cols in 1, 2, 5, 10, 32, 100:
- for batch_dims in [(), (3,)] + [(3, 2)] * (max(rows, cols) < 10):
- shape = batch_dims + (rows, cols)
- for use_static_shape in True, False:
- name = "%s_%s_%s" % (dtype.__name__, "_".join(map(str, shape)),
- use_static_shape)
- setattr(SvdOpTest, "testSvd_" + name,
- _GetSvdOpTest(dtype, shape, use_static_shape))
+ for use_gpu in False, True:
+ for dtype in np.float32, np.float64, np.complex64, np.complex128:
+ for rows in 1, 2, 5, 10, 32, 100:
+ for cols in 1, 2, 5, 10, 32, 100:
+ for batch_dims in [(), (3,)] + [(3, 2)] * (max(rows, cols) < 10):
+ shape = batch_dims + (rows, cols)
+ for use_static_shape in True, False:
+ name = "%s_%s_%s_%s" % (dtype.__name__, "_".join(map(str, shape)),
+ use_static_shape, use_gpu)
+ setattr(SvdOpTest, "testSvd_" + name,
+ _GetSvdOpTest(dtype, shape, use_static_shape, use_gpu))
test.main()
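The refactor above folds the scattered per-check constants into one tolerance: a base of 3e-4 for single precision or 1e-12 for double, multiplied by 100 when the test runs on GPU. Restated as a tiny standalone function (ours), with the resulting values:

```python
def svd_tolerance(is_single, use_gpu):
    # Mirrors _GetSvdOpTest: the GPU kernels are far less precise.
    precision_factor = 100 if use_gpu else 1
    return precision_factor * (3e-4 if is_single else 1e-12)


print(svd_tolerance(is_single=True, use_gpu=False))   # 0.0003
print(svd_tolerance(is_single=False, use_gpu=True))   # ~1e-10
```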
diff --git a/tensorflow/python/layers/base.py b/tensorflow/python/layers/base.py
index 43e4bb0ee2..3db5e4754a 100644
--- a/tensorflow/python/layers/base.py
+++ b/tensorflow/python/layers/base.py
@@ -25,24 +25,20 @@ from __future__ import print_function
import collections
import copy
-import functools
import re
import weakref
-from six.moves import xrange # pylint: disable=redefined-builtin
import numpy as np
-import six
-
from tensorflow.python.eager import context
from tensorflow.python.estimator import util as estimator_util
-from tensorflow.python.framework import ops
from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
from tensorflow.python.framework import tensor_shape
from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import variables as tf_variables
from tensorflow.python.ops import variable_scope as vs
-from tensorflow.python.util import nest
+from tensorflow.python.ops import variables as tf_variables
from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util import nest
class Layer(object):
diff --git a/tensorflow/python/layers/convolutional.py b/tensorflow/python/layers/convolutional.py
index 41c67743b6..9dec3b5a47 100644
--- a/tensorflow/python/layers/convolutional.py
+++ b/tensorflow/python/layers/convolutional.py
@@ -20,23 +20,13 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
-import six
-from six.moves import xrange # pylint: disable=redefined-builtin
-import numpy as np
-
from tensorflow.python.eager import context
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import nn
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import init_ops
-from tensorflow.python.ops import standard_ops
-from tensorflow.python.ops import variable_scope as vs
from tensorflow.python.framework import tensor_shape
from tensorflow.python.layers import base
from tensorflow.python.layers import utils
-from tensorflow.python import framework
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import nn
class _Conv(base.Layer):
diff --git a/tensorflow/python/layers/core.py b/tensorflow/python/layers/core.py
index 3570c003ef..4eecf9c9a1 100644
--- a/tensorflow/python/layers/core.py
+++ b/tensorflow/python/layers/core.py
@@ -22,6 +22,7 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
+
import six
from six.moves import xrange # pylint: disable=redefined-builtin
import numpy as np
@@ -29,15 +30,13 @@ import numpy as np
from tensorflow.python.eager import context
from tensorflow.python.framework import ops
from tensorflow.python.framework import tensor_shape
+from tensorflow.python.layers import base
+from tensorflow.python.layers import utils
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import init_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import nn
from tensorflow.python.ops import standard_ops
-from tensorflow.python.ops import variable_scope as vs
-
-from tensorflow.python.layers import base
-from tensorflow.python.layers import utils
class Dense(base.Layer):
diff --git a/tensorflow/python/layers/maxout.py b/tensorflow/python/layers/maxout.py
new file mode 100644
index 0000000000..1ea36dbf6a
--- /dev/null
+++ b/tensorflow/python/layers/maxout.py
@@ -0,0 +1,108 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+# pylint: disable=unused-import,g-bad-import-order
+"""Contains the maxout layer
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import gen_array_ops
+
+from tensorflow.python.layers import base
+
+
+def maxout(inputs, num_units, axis=-1, name=None):
+ """Adds a maxout op from https://arxiv.org/abs/1302.4389
+
+ "Maxout Networks" Ian J. Goodfellow, David Warde-Farley, Mehdi Mirza, Aaron Courville,
+ Yoshua Bengio
+
+ Usually the operation is performed in the filter/channel dimension. This can also be
+ used after fully-connected layers to reduce the number of features.
+
+ Arguments:
+ inputs: Tensor input
+ num_units: Specifies how many features will remain after maxout in the `axis` dimension
+ (usually channel). The number of features along `axis` must be a multiple of `num_units`.
+ axis: The dimension where max pooling will be performed. Default is the
+ last dimension.
+ name: Optional scope for name_scope.
+
+ Returns:
+ A `Tensor` representing the results of the pooling operation.
+
+ Raises:
+ ValueError: if the number of features is not a multiple of `num_units`.
+ """
+ return MaxOut(num_units=num_units, axis=axis, name=name)(inputs)
+
+
+class MaxOut(base.Layer):
+ """Adds a maxout op from https://arxiv.org/abs/1302.4389
+
+ "Maxout Networks" Ian J. Goodfellow, David Warde-Farley, Mehdi Mirza, Aaron Courville, Yoshua
+ Bengio
+
+ Usually the operation is performed in the filter/channel dimension. This can also be
+ used after fully-connected layers to reduce the number of features.
+
+ Arguments:
+ inputs: Tensor input
+ num_units: Specifies how many features will remain after maxout in the `axis` dimension
+ (usually channel). The number of features along `axis` must be a multiple of `num_units`.
+ axis: The dimension where max pooling will be performed. Default is the
+ last dimension.
+ name: Optional scope for name_scope.
+
+ Returns:
+ A `Tensor` representing the results of the pooling operation.
+
+ Raises:
+ ValueError: if the number of features is not a multiple of `num_units`.
+ """
+
+ def __init__(self,
+ num_units,
+ axis=-1,
+ name=None,
+ **kwargs):
+ super(MaxOut, self).__init__(
+ name=name, trainable=False, **kwargs)
+ self.axis = axis
+ self.num_units = num_units
+
+ def call(self, inputs):
+ inputs = ops.convert_to_tensor(inputs)
+ shape = inputs.get_shape().as_list()
+ num_channels = shape[self.axis]
+ if num_channels % self.num_units:
+ raise ValueError('number of features ({}) is not '
+ 'a multiple of num_units ({})'
+ .format(num_channels, self.num_units))
+ shape[self.axis] = -1
+ shape += [num_channels // self.num_units]
+
+ # Resolve dimensions that are statically unknown (e.g. the batch size).
+ for i in range(len(shape)):
+ if shape[i] is None:
+ shape[i] = gen_array_ops.shape(inputs)[i]
+ outputs = math_ops.reduce_max(gen_array_ops.reshape(inputs, shape), -1, keep_dims=False)
+
+ return outputs
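The reshape-plus-reduce_max in `call` is the entire maxout computation: the trailing dimension of size `num_channels` is viewed as `num_units` groups of `num_channels // num_units` features and the max is taken within each group (the layer as written assumes `axis` is the trailing dimension, since the group size is appended at the end of the shape). A NumPy sketch of the same semantics, ours and for intuition only:

```python
import numpy as np


def maxout_np(x, num_units):
    """Maxout over the last axis: group channels into num_units buckets, take max."""
    num_channels = x.shape[-1]
    if num_channels % num_units:
        raise ValueError("number of features (%d) is not a multiple of "
                         "num_units (%d)" % (num_channels, num_units))
    grouped = x.reshape(x.shape[:-1] + (num_units, num_channels // num_units))
    return grouped.max(axis=-1)


x = np.random.rand(64, 10, 36)
assert maxout_np(x, num_units=3).shape == (64, 10, 3)  # matches test_simple below
```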
diff --git a/tensorflow/python/layers/maxout_test.py b/tensorflow/python/layers/maxout_test.py
new file mode 100644
index 0000000000..26acac57c4
--- /dev/null
+++ b/tensorflow/python/layers/maxout_test.py
@@ -0,0 +1,61 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+# pylint: disable=unused-import,g-bad-import-order
+
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.layers import maxout
+from tensorflow.python.layers import convolutional as conv_layers
+from tensorflow.python.layers import core as core_layers
+
+from tensorflow.python.ops import random_ops
+from tensorflow.python.platform import test
+import numpy as np
+
+"""
+Contains the maxout layer tests
+"""
+
+
+class MaxOutTest(test.TestCase):
+ def test_simple(self):
+ inputs = random_ops.random_uniform((64, 10, 36), seed=1)
+ graph = maxout.maxout(inputs, num_units=3)
+ self.assertEqual(graph.get_shape().as_list(), [64, 10, 3])
+
+ def test_fully_connected(self):
+ inputs = random_ops.random_uniform((64, 50), seed=1)
+ graph = core_layers.dense(inputs, 50)
+ graph = maxout.maxout(graph, num_units=10)
+ self.assertEqual(graph.get_shape().as_list(), [64, 10])
+
+ def test_nchw(self):
+ inputs = random_ops.random_uniform((10, 100, 100, 3), seed=1)
+ graph = conv_layers.conv2d(inputs, 10, 3, padding="SAME")
+ graph = maxout.maxout(graph, num_units=1)
+ self.assertEqual(graph.get_shape().as_list(), [10, 100, 100, 1])
+
+ def test_invalid_shape(self):
+ inputs = random_ops.random_uniform((10, 100, 100, 3), seed=1)
+ graph = conv_layers.conv2d(inputs, 3, 10, strides=(1, 1))
+ with self.assertRaisesRegexp(ValueError, 'number of features'):
+ graph = maxout.maxout(graph, num_units=2)
+
+if __name__ == '__main__':
+ test.main()
diff --git a/tensorflow/python/layers/normalization.py b/tensorflow/python/layers/normalization.py
index 222817cd3a..3bd9a0f491 100644
--- a/tensorflow/python/layers/normalization.py
+++ b/tensorflow/python/layers/normalization.py
@@ -26,24 +26,18 @@ import numpy as np
from tensorflow.python.eager import context
from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import tensor_shape
from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.layers import base
+from tensorflow.python.layers import utils
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import nn
from tensorflow.python.ops import gen_resource_variable_ops
from tensorflow.python.ops import resource_variable_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import init_ops
-from tensorflow.python.ops import standard_ops
from tensorflow.python.ops import state_ops
-from tensorflow.python.ops import variable_scope as vs
from tensorflow.python.training import moving_averages
-from tensorflow.python.framework import tensor_util
-from tensorflow.python.ops import variables
-
-from tensorflow.python.layers import base
-from tensorflow.python.layers import utils
class BatchNormalization(base.Layer):
diff --git a/tensorflow/python/layers/pooling.py b/tensorflow/python/layers/pooling.py
index e903afa0a8..6245ec5054 100644
--- a/tensorflow/python/layers/pooling.py
+++ b/tensorflow/python/layers/pooling.py
@@ -20,21 +20,11 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
-import six
-from six.moves import xrange # pylint: disable=redefined-builtin
-import numpy as np
-
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import nn
-from tensorflow.python.ops import init_ops
-from tensorflow.python.ops import standard_ops
-from tensorflow.python.ops import variable_scope as vs
from tensorflow.python.framework import tensor_shape
from tensorflow.python.layers import base
from tensorflow.python.layers import utils
-from tensorflow.python import framework
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import nn
class _Pooling1D(base.Layer):
diff --git a/tensorflow/python/layers/utils.py b/tensorflow/python/layers/utils.py
index 98c287e63e..7c71d3c952 100644
--- a/tensorflow/python/layers/utils.py
+++ b/tensorflow/python/layers/utils.py
@@ -20,13 +20,8 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
-import six
-from six.moves import xrange # pylint: disable=redefined-builtin
-import numpy as np
-
from tensorflow.python.ops import variables
from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import math_ops
from tensorflow.python.framework import ops
from tensorflow.python.framework import tensor_util
diff --git a/tensorflow/python/ops/metrics_impl.py b/tensorflow/python/ops/metrics_impl.py
index eb0b08c5fd..bfacf151e7 100644
--- a/tensorflow/python/ops/metrics_impl.py
+++ b/tensorflow/python/ops/metrics_impl.py
@@ -259,11 +259,10 @@ def _streaming_confusion_matrix(labels, predictions, num_classes, weights=None):
update_op: An operation that increments the confusion matrix.
"""
# Local variable to accumulate the predictions in the confusion matrix.
- cm_dtype = dtypes.int64 if weights is not None else dtypes.float64
total_cm = _create_local(
'total_confusion_matrix',
shape=[num_classes, num_classes],
- dtype=cm_dtype)
+ dtype=dtypes.float64)
# Cast the type to int64 required by confusion_matrix_ops.
predictions = math_ops.to_int64(predictions)
@@ -282,7 +281,7 @@ def _streaming_confusion_matrix(labels, predictions, num_classes, weights=None):
# Accumulate the prediction to current confusion matrix.
current_cm = confusion_matrix.confusion_matrix(
- labels, predictions, num_classes, weights=weights, dtype=cm_dtype)
+ labels, predictions, num_classes, weights=weights, dtype=dtypes.float64)
update_op = state_ops.assign_add(total_cm, current_cm)
return total_cm, update_op
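The fix above removes a silent truncation: when `weights` was supplied, the confusion matrix accumulated in int64, so fractional weights were floored away; accumulating in float64 unconditionally keeps them. A small NumPy sketch of the weighted update (the helper name is ours):

```python
import numpy as np


def accumulate_cm(total_cm, labels, predictions, weights):
    """Weighted confusion-matrix update; float64 preserves fractional weights."""
    for label, pred, w in zip(labels, predictions, weights):
        total_cm[label, pred] += w
    return total_cm


cm = np.zeros((2, 2), dtype=np.float64)
accumulate_cm(cm, labels=[0, 1, 1], predictions=[0, 1, 0], weights=[1.0, 0.5, 1.0])
print(cm)  # [[1.  0. ] [1.  0.5]] -- the 0.5 would have been lost in int64
```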
diff --git a/tensorflow/python/training/moving_averages.py b/tensorflow/python/training/moving_averages.py
index 0060b58bd7..eb07343850 100644
--- a/tensorflow/python/training/moving_averages.py
+++ b/tensorflow/python/training/moving_averages.py
@@ -278,14 +278,12 @@ class ExponentialMovingAverage(object):
# Create an ExponentialMovingAverage object
ema = tf.train.ExponentialMovingAverage(decay=0.9999)
- # Create the shadow variables, and add ops to maintain moving averages
- # of var0 and var1.
- maintain_averages_op = ema.apply([var0, var1])
-
- # Create an op that will update the moving averages after each training
- # step. This is what we will use in place of the usual training op.
with tf.control_dependencies([opt_op]):
- training_op = tf.group(maintain_averages_op)
+ # Create the shadow variables, and add ops to maintain moving averages
+ # of var0 and var1. This also creates an op that will update the moving
+ # averages after each training step. This is what we will use in place
+ # of the usual training op.
+ training_op = ema.apply([var0, var1])
...train the model by running training_op...
```
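The reshuffled docstring changes where `apply` is called, not what it computes: each shadow variable still follows the standard update shadow = decay * shadow + (1 - decay) * variable after every training step. A tiny standalone illustration of that rule:

```python
def ema_step(shadow, value, decay=0.9999):
    # Equivalent form of decay * shadow + (1 - decay) * value.
    return shadow - (1.0 - decay) * (shadow - value)


shadow = 0.0
for value in (1.0, 1.0, 1.0):
    shadow = ema_step(shadow, value, decay=0.9)
print(round(shadow, 3))  # 0.271 -- converging toward the raw value 1.0
```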
diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl
index 5c156e7ee2..54649dab01 100644
--- a/tensorflow/tensorflow.bzl
+++ b/tensorflow/tensorflow.bzl
@@ -980,6 +980,7 @@ check_deps = rule(
def tf_custom_op_library(name, srcs=[], gpu_srcs=[], deps=[]):
cuda_deps = [
clean_dep("//tensorflow/core:stream_executor_headers_lib"),
+ "@local_config_cuda//cuda:cuda_headers",
"@local_config_cuda//cuda:cudart_static",
]
deps = deps + tf_custom_op_library_additional_deps()
diff --git a/tensorflow/tf_exported_symbols.lds b/tensorflow/tf_exported_symbols.lds
index 850f0edd94..bddb87f00c 100644
--- a/tensorflow/tf_exported_symbols.lds
+++ b/tensorflow/tf_exported_symbols.lds
@@ -1,6 +1,6 @@
*tensorflow*
*perftools*gputools*
*tf_*
-TF_*
-TFE_*
+*TF_*
+*TFE_*
*nsync_*
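Widening `TF_*` and `TFE_*` to `*TF_*` and `*TFE_*` lets the patterns match symbols that carry a platform-added prefix, for example the leading underscore Darwin prepends to C symbols; that rationale is our inference, the patch itself does not state it. A quick fnmatch illustration:

```python
from fnmatch import fnmatch

symbol = "_TF_NewStatus"  # Darwin-style C symbol with a leading underscore
assert not fnmatch(symbol, "TF_*")   # the old pattern misses it
assert fnmatch(symbol, "*TF_*")      # the widened pattern matches
```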
diff --git a/tensorflow/tf_version_script.lds b/tensorflow/tf_version_script.lds
index 73d4c0cae4..11f66c5c8b 100644
--- a/tensorflow/tf_version_script.lds
+++ b/tensorflow/tf_version_script.lds
@@ -2,8 +2,8 @@ tensorflow {
global:
*tensorflow*;
*perftools*gputools*;
- TF_*;
- TFE_*;
+ *TF_*;
+ *TFE_*;
*nsync_*;
local:
*;
diff --git a/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh b/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh
index 61f5ed084c..f6e3d2e6c7 100644
--- a/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh
+++ b/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh
@@ -60,8 +60,11 @@ reinstall_tensorflow_pip ${PIP_NAME}
# Define no_tensorflow_py_deps=true so that every py_test has no deps anymore,
# which results in testing the system-installed tensorflow
+# TODO(pcloudy): Remove TF_SAVER_LENIENT_NAMES after
+# https://github.com/tensorflow/tensorflow/issues/12844 is fixed.
bazel test -c opt $BUILD_OPTS -k --test_output=errors \
--define=no_tensorflow_py_deps=true --test_lang_filters=py \
--test_tag_filters=-no_pip,-no_windows \
--build_tag_filters=-no_pip,-no_windows --build_tests_only \
+ --test_env=TF_SAVER_LENIENT_NAMES=True \
//${PY_TEST_DIR}/tensorflow/python/...
diff --git a/tensorflow/tools/ci_build/windows/gpu/pip/build_tf_windows.sh b/tensorflow/tools/ci_build/windows/gpu/pip/build_tf_windows.sh
index e1972a3100..25d327c818 100644
--- a/tensorflow/tools/ci_build/windows/gpu/pip/build_tf_windows.sh
+++ b/tensorflow/tools/ci_build/windows/gpu/pip/build_tf_windows.sh
@@ -61,8 +61,11 @@ reinstall_tensorflow_pip ${PIP_NAME}
# Define no_tensorflow_py_deps=true so that every py_test has no deps anymore,
# which results in testing the system-installed tensorflow
# GPU tests are very flaky when running concurrently, so set local_test_jobs=1
+# TODO(pcloudy): Remove TF_SAVER_LENIENT_NAMES after
+# https://github.com/tensorflow/tensorflow/issues/12844 is fixed.
bazel test -c opt $BUILD_OPTS -k --test_output=errors \
--define=no_tensorflow_py_deps=true --test_lang_filters=py \
--test_tag_filters=-no_pip,-no_windows,-no_windows_gpu,-no_gpu,-no_pip_gpu \
--build_tag_filters=-no_pip,-no_windows,-no_windows_gpu,-no_gpu,-no_pip_gpu \
+ --test_env=TF_SAVER_LENIENT_NAMES=True \
--local_test_jobs=1 --build_tests_only //${PY_TEST_DIR}/tensorflow/python/...
diff --git a/tensorflow/tools/docker/Dockerfile.devel b/tensorflow/tools/docker/Dockerfile.devel
index 1b97c0d108..4cfaf68ef3 100644
--- a/tensorflow/tools/docker/Dockerfile.devel
+++ b/tensorflow/tools/docker/Dockerfile.devel
@@ -72,7 +72,7 @@ RUN mkdir /bazel && \
RUN git clone https://github.com/tensorflow/tensorflow.git && \
cd tensorflow && \
- git checkout r1.3
+ git checkout r1.4
WORKDIR /tensorflow
# TODO(craigcitro): Don't install the pip package, since it makes it
diff --git a/tensorflow/tools/docker/Dockerfile.devel-gpu b/tensorflow/tools/docker/Dockerfile.devel-gpu
index 80b45ae704..8d7e759bb2 100644
--- a/tensorflow/tools/docker/Dockerfile.devel-gpu
+++ b/tensorflow/tools/docker/Dockerfile.devel-gpu
@@ -73,7 +73,7 @@ RUN mkdir /bazel && \
RUN git clone https://github.com/tensorflow/tensorflow.git && \
cd tensorflow && \
- git checkout r1.3
+ git checkout r1.4
WORKDIR /tensorflow
# Configure the build for our CUDA configuration.
diff --git a/tensorflow/tools/pip_package/build_pip_package.sh b/tensorflow/tools/pip_package/build_pip_package.sh
index f48fdcc9ec..cbf06a97d0 100755
--- a/tensorflow/tools/pip_package/build_pip_package.sh
+++ b/tensorflow/tools/pip_package/build_pip_package.sh
@@ -98,6 +98,7 @@ function main() {
"${TMPDIR}/external"
RUNFILES=bazel-bin/tensorflow/tools/pip_package/simple_console_for_window_unzip/runfiles/org_tensorflow
else
+ RUNFILES=bazel-bin/tensorflow/tools/pip_package/build_pip_package.runfiles/org_tensorflow
if [ -d bazel-bin/tensorflow/tools/pip_package/build_pip_package.runfiles/org_tensorflow/external ]; then
# Old-style runfiles structure (--legacy_external_runfiles).
cp -R \
@@ -108,12 +109,12 @@ function main() {
bazel-bin/tensorflow/tools/pip_package/build_pip_package.runfiles/org_tensorflow/external \
"${TMPDIR}/external"
# Copy MKL libs over so they can be loaded at runtime
- so_lib_dir="bazel-bin/tensorflow/tools/pip_package/build_pip_package.runfiles/org_tensorflow/_solib_k8"
- if [ -d ${so_lib_dir} ]; then
- mkl_so_dir=$(ls ${so_lib_dir} | grep mkl)
- if [ $? -eq 0 ]; then
- mkdir "${TMPDIR}/_solib_k8"
- cp -R ${so_lib_dir}/${mkl_so_dir} "${TMPDIR}/_solib_k8"
+ so_lib_dir=$(ls $RUNFILES | grep solib) || true
+ if [ -n "${so_lib_dir}" ]; then
+ mkl_so_dir=$(ls ${RUNFILES}/${so_lib_dir} | grep mkl) || true
+ if [ -n "${mkl_so_dir}" ]; then
+ mkdir "${TMPDIR}/${so_lib_dir}"
+ cp -R ${RUNFILES}/${so_lib_dir}/${mkl_so_dir} "${TMPDIR}/${so_lib_dir}"
fi
fi
else
@@ -127,16 +128,15 @@ function main() {
bazel-bin/tensorflow/tools/pip_package/build_pip_package.runfiles \
"${TMPDIR}/external"
# Copy MKL libs over so they can be loaded at runtime
- so_lib_dir="bazel-bin/tensorflow/tools/pip_package/build_pip_package.runfiles/org_tensorflow/_solib_k8"
- if [ -d ${so_lib_dir} ]; then
- mkl_so_dir=$(ls ${so_lib_dir} | grep mkl)
- if [ $? -eq 0 ]; then
- mkdir "${TMPDIR}/_solib_k8"
- cp -R ${so_lib_dir}/${mkl_so_dir} "${TMPDIR}/_solib_k8"
+ so_lib_dir=$(ls $RUNFILES | grep solib) || true
+ if [ -n "${so_lib_dir}" ]; then
+ mkl_so_dir=$(ls ${RUNFILES}/${so_lib_dir} | grep mkl) || true
+ if [ -n "${mkl_so_dir}" ]; then
+ mkdir "${TMPDIR}/${so_lib_dir}"
+ cp -R ${RUNFILES}/${so_lib_dir}/${mkl_so_dir} "${TMPDIR}/${so_lib_dir}"
fi
fi
fi
- RUNFILES=bazel-bin/tensorflow/tools/pip_package/build_pip_package.runfiles/org_tensorflow
fi
# protobuf pip package doesn't ship with header files. Copy the headers
diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py
index dd5a1d7449..00dffc4d27 100644
--- a/tensorflow/tools/pip_package/setup.py
+++ b/tensorflow/tools/pip_package/setup.py
@@ -29,7 +29,7 @@ from setuptools.dist import Distribution
# This version string is semver compatible, but incompatible with pip.
# For pip, we will remove all '-' characters from this string, and use the
# result for pip.
-_VERSION = '1.3.0'
+_VERSION = '1.4.0-dev'
REQUIRED_PACKAGES = [
'enum34 >= 1.1.6',
@@ -192,7 +192,7 @@ setup(
version=_VERSION.replace('-', ''),
description='TensorFlow helps the tensors flow',
long_description='',
- url='http://tensorflow.org/',
+ url='https://www.tensorflow.org/',
author='Google Inc.',
author_email='opensource@google.com',
# Contained modules and scripts.
@@ -233,8 +233,8 @@ setup(
'Topic :: Scientific/Engineering :: Mathematics',
'Topic :: Scientific/Engineering :: Artificial Intelligence',
'Topic :: Software Development',
- 'Topic :: Software Development :: Libraries',
- 'Topic :: Software Development :: Libraries :: Python Modules',
+ 'Topic :: Software Development :: Libraries',
+ 'Topic :: Software Development :: Libraries :: Python Modules',
],
license='Apache 2.0',
keywords='tensorflow tensor machine learning',)
diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index 431676c52d..2f24e2f019 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -99,7 +99,8 @@ def _execute_and_check_ret_code(repo_ctx, cmd_and_args):
# Apply a patch_file to the repository root directory
# Runs 'patch -p1'
def _apply_patch(repo_ctx, patch_file):
- if not repo_ctx.which("patch"):
+ # Don't check patch on Windows, because patch is only available under bash.
+ if not _is_windows(repo_ctx) and not repo_ctx.which("patch"):
fail("patch command is not found, please install it")
cmd = [
@@ -628,11 +629,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
temp_workaround_http_archive(
name = "nccl_archive",
urls = [
- "http://mirror.bazel.build/github.com/nvidia/nccl/archive/ccfc4567dc3e2a37fb42cfbc64d10eb526e7da7b.tar.gz",
- "https://github.com/nvidia/nccl/archive/ccfc4567dc3e2a37fb42cfbc64d10eb526e7da7b.tar.gz",
+ "http://mirror.bazel.build/github.com/nvidia/nccl/archive/29a1a916dc14bb2c00feed3d4820d51fa85be1e6.tar.gz",
+ "https://github.com/nvidia/nccl/archive/29a1a916dc14bb2c00feed3d4820d51fa85be1e6.tar.gz",
],
- sha256 = "6c34a0862d9f8ed4ad5984c6a8206b351957bb14cf6ad7822720f285f4aada04",
- strip_prefix = "nccl-ccfc4567dc3e2a37fb42cfbc64d10eb526e7da7b",
+ sha256 = "6387030e37d14762f87eefbc86ee527293ec04745c66ccd820cf7fc0fdc23f92",
+ strip_prefix = "nccl-29a1a916dc14bb2c00feed3d4820d51fa85be1e6",
build_file = str(Label("//third_party:nccl.BUILD")),
repository = tf_repo_name,
)