diff options
Diffstat (limited to 'tensorflow')
103 files changed, 3001 insertions, 628 deletions
diff --git a/tensorflow/BUILD b/tensorflow/BUILD index 315f0bee5d..eef327d690 100644 --- a/tensorflow/BUILD +++ b/tensorflow/BUILD @@ -472,6 +472,7 @@ cc_binary( "//tensorflow:darwin": [ "-Wl,-exported_symbols_list", # This line must be directly followed by the exported_symbols.lds file "//tensorflow/c:exported_symbols.lds", + "-Wl,-install_name,@rpath/libtensorflow.so", ], "//tensorflow:windows": [], "//tensorflow:windows_msvc": [], diff --git a/tensorflow/c/version_script.lds b/tensorflow/c/version_script.lds index 5855782003..c352a1440d 100644 --- a/tensorflow/c/version_script.lds +++ b/tensorflow/c/version_script.lds @@ -1,8 +1,8 @@ VERS_1.0 { # Export symbols in c_api.h. global: - TF_*; - TFE_*; + *TF_*; + *TFE_*; # Hide everything else. local: diff --git a/tensorflow/cc/BUILD b/tensorflow/cc/BUILD index 7606e193a9..1eebc8f6a6 100644 --- a/tensorflow/cc/BUILD +++ b/tensorflow/cc/BUILD @@ -336,6 +336,7 @@ cc_library( ":cc_ops", ":cc_ops_internal", ":grad_op_registry", + ":gradients", ], alwayslink = 1, ) diff --git a/tensorflow/cc/gradients/math_grad.cc b/tensorflow/cc/gradients/math_grad.cc index aba17cfe0c..ac288b1d83 100644 --- a/tensorflow/cc/gradients/math_grad.cc +++ b/tensorflow/cc/gradients/math_grad.cc @@ -696,6 +696,18 @@ Status MeanGrad(const Scope& scope, const Operation& op, } REGISTER_GRADIENT_OP("Mean", MeanGrad); +Status LgammaGrad(const Scope& scope, const Operation& op, + const std::vector<Output>& grad_inputs, + std::vector<Output>* grad_outputs) { + auto grad = grad_inputs[0]; + Scope grad_scope = scope.WithControlDependencies(grad); + auto x = ConjugateHelper(grad_scope, op.input(0)); + auto dx = Mul(scope, grad, Digamma(scope, x)); + grad_outputs->push_back(dx); + return scope.status(); +} +REGISTER_GRADIENT_OP("Lgamma", LgammaGrad); + Status MinOrMaxGrad(const Scope& scope, const Operation& op, const std::vector<Output>& grad_inputs, std::vector<Output>* grad_outputs) { diff --git a/tensorflow/cc/gradients/math_grad_test.cc 
b/tensorflow/cc/gradients/math_grad_test.cc index 3534f16e8f..a174f223ad 100644 --- a/tensorflow/cc/gradients/math_grad_test.cc +++ b/tensorflow/cc/gradients/math_grad_test.cc @@ -821,5 +821,17 @@ TEST_F(NaryGradTest, Minimum) { RunTest(x, x_init_value, y, shape); } +TEST_F(NaryGradTest, Lgamma) { + TensorShape shape({3, 2}); + auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(shape)); + auto y = Lgamma(scope_, x); + // Select values to avoid instability when computing finite differences. + // Ref: https://en.wikipedia.org/wiki/File:Gamma_plot.svg + Tensor x_init_value = + test::AsTensor<float>({-3.5f, -2.5f, -1.5f, 1.0f, 2.0f, 3.5f}, {3, 2}); + RunTest(x, x_init_value, y, shape); + // TODO(suharshs): add test case for complex values +} + } // namespace } // namespace tensorflow diff --git a/tensorflow/cc/gradients/nn_grad.cc b/tensorflow/cc/gradients/nn_grad.cc index ccb58e7f91..fcc3fc9dae 100644 --- a/tensorflow/cc/gradients/nn_grad.cc +++ b/tensorflow/cc/gradients/nn_grad.cc @@ -18,6 +18,7 @@ limitations under the License. 
#include "tensorflow/cc/ops/standard_ops.h" #include "tensorflow/cc/framework/grad_op_registry.h" +#include "tensorflow/cc/framework/gradients.h" namespace tensorflow { namespace ops { @@ -118,6 +119,87 @@ Status BiasAddGradHelper(const Scope& scope, const Operation& op, } REGISTER_GRADIENT_OP("BiasAdd", BiasAddGradHelper); +Status Conv2DGrad(const Scope& scope, const Operation& op, + const std::vector<Output>& grad_inputs, + std::vector<Output>* grad_outputs) { + string data_format; + string padding; + std::vector<int32> strides; + bool use_cudnn_on_gpu; + auto attrs = op.output(0).node()->attrs(); + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "data_format", &data_format)); + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "padding", &padding)); + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "strides", &strides)); + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "use_cudnn_on_gpu", + &use_cudnn_on_gpu)); + Conv2DBackpropInput::Attrs input_attrs; + input_attrs.DataFormat(data_format); + input_attrs.UseCudnnOnGpu(use_cudnn_on_gpu); + auto dx_1 = Conv2DBackpropInput(scope, Shape(scope, op.input(0)), + op.input(1), grad_inputs[0], + strides, padding, input_attrs); + grad_outputs->push_back(dx_1); + Conv2DBackpropFilter::Attrs filter_attrs; + filter_attrs.DataFormat(data_format); + filter_attrs.UseCudnnOnGpu(use_cudnn_on_gpu); + auto dx_2 = Conv2DBackpropFilter(scope, op.input(0), + Shape(scope, op.input(1)), grad_inputs[0], + strides, padding, filter_attrs); + grad_outputs->push_back(dx_2); + return scope.status(); +} +REGISTER_GRADIENT_OP("Conv2D", Conv2DGrad); + +Status MaxPoolGradHelper(const Scope& scope, const Operation& op, + const std::vector<Output>& grad_inputs, + std::vector<Output>* grad_outputs) { + string data_format; + string padding; + std::vector<int32> strides; + std::vector<int32> ksize; + auto attrs = op.output(0).node()->attrs(); + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "data_format", &data_format)); + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "ksize", &ksize)); + 
TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "padding", &padding)); + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "strides", &strides)); + internal::MaxPoolGrad::Attrs grad_attrs; + grad_attrs.DataFormat(data_format); + auto dx = internal::MaxPoolGrad(scope, op.input(0), + op.output(0), + grad_inputs[0], + ksize, strides, + padding, grad_attrs); + grad_outputs->push_back(dx); + return scope.status(); +} +REGISTER_GRADIENT_OP("MaxPool", MaxPoolGradHelper); + +Status MaxPoolGradV2Helper(const Scope& scope, const Operation& op, + const std::vector<Output>& grad_inputs, + std::vector<Output>* grad_outputs) { + string data_format; + string padding; + auto attrs = op.output(0).node()->attrs(); + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "data_format", &data_format)); + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "padding", &padding)); + MaxPoolGradV2::Attrs grad_attrs; + grad_attrs.DataFormat(data_format); + auto dx = MaxPoolGradV2(scope, op.input(0), + op.output(0), + grad_inputs[0], + op.input(1), + op.input(2), + padding, + grad_attrs); + grad_outputs->push_back(dx); + grad_outputs->push_back(NoGradient()); + grad_outputs->push_back(NoGradient()); + return scope.status(); +} +REGISTER_GRADIENT_OP("MaxPoolV2", MaxPoolGradV2Helper); + + + } // anonymous namespace } // namespace ops } // namespace tensorflow diff --git a/tensorflow/cc/gradients/nn_grad_test.cc b/tensorflow/cc/gradients/nn_grad_test.cc index 64f1f76066..23545f75ac 100644 --- a/tensorflow/cc/gradients/nn_grad_test.cc +++ b/tensorflow/cc/gradients/nn_grad_test.cc @@ -138,5 +138,32 @@ TEST_F(NNGradTest, BiasAddGradHelper) { RunTest({x, bias}, {shape, bias_shape}, {y}, {shape}); } +TEST_F(NNGradTest, Conv2DGrad) { + TensorShape shape({1, 2, 2, 1}); + auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(shape)); + Tensor filter = test::AsTensor<float>({0.5f}, {1, 1, 1, 1}); + const std::vector<int> strides{1, 1, 1, 1}; + auto y = Conv2D(scope_, x, filter, strides, "SAME"); + RunTest(x, shape, y, shape); +} + +TEST_F(NNGradTest, 
MaxPoolGradHelper) { + TensorShape shape({1, 2, 2, 1}); + auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(shape)); + const std::vector<int> ksize{1, 2, 2, 1}; + const std::vector<int> strides{1, 1, 1, 1}; + auto y = MaxPool(scope_, x, ksize, strides, "SAME"); + RunTest(x, shape, y, shape); +} + +TEST_F(NNGradTest, MaxPoolGradV2Helper) { + TensorShape shape({1, 2, 2, 1}); + auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(shape)); + Tensor ksize = test::AsTensor<int>({1, 2, 2, 1}, {4}); + Tensor strides = test::AsTensor<int>({1, 1, 1, 1}, {4}); + auto y = MaxPoolV2(scope_, x, ksize, strides, "SAME"); + RunTest(x, shape, y, shape); +} + } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index f7ddee7b61..b8bdd78da8 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -999,7 +999,7 @@ Status IrEmitterUnnested::EmitRowReduction( // for (shuffle_distance = 16; shuffle_distance > 0; shuffle_distance /= 2) // partial_result = Reducer( // partial_result, - // __shfl_down(partial_result, shuffle_distance)); + // __shfl_down_sync(CUDA_WARP_ALL, partial_result, shuffle_distance)); // if (lane_id == 0) // AtomicReducer(&output[y], partial_result); // } diff --git a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc index 2e7765c4c6..b24fe417ff 100644 --- a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc +++ b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc @@ -71,7 +71,18 @@ const int kDefaultInlineThreshold = 1100; // Gets the libdevice filename for a particular compute capability. When // presented with a GPU we don't recognize, we just return the libdevice from // compute_20. 
-static string GetLibdeviceFilename(std::pair<int, int> compute_capability) { +static string GetLibdeviceFilename(const string& libdevice_dir_path, + std::pair<int, int> compute_capability) { + // Since CUDA 9.0, all GPU versions are included in a single file + const char* unified_libdevice_filename = "libdevice.10.bc"; + std::vector<string> unified_libdevice_files; + const tensorflow::Status status = + tensorflow::Env::Default()->GetMatchingPaths( + tensorflow::io::JoinPath(libdevice_dir_path, unified_libdevice_filename), + &unified_libdevice_files); + if (status.ok() && unified_libdevice_files.size() == 1) { + return unified_libdevice_filename; + } // There are only four libdevice files: compute_{20,30,35,50}. Each GPU // version gets mapped to one of these. Note in particular that sm_60 and // sm_61 map to libdevice.compute_30. @@ -101,7 +112,7 @@ static string GetLibdeviceFilename(std::pair<int, int> compute_capability) { } // Gets the GPU name as it's known to LLVM for a given compute capability. If -// we see an unrecognized compute capability, we return "sm_20". +// we see an unrecognized compute capability, we return "sm_30". 
static string GetSmName(std::pair<int, int> compute_capability) { static auto* m = new std::map<std::pair<int, int>, int>({{{2, 0}, 20}, {{2, 1}, 21}, @@ -114,8 +125,10 @@ static string GetSmName(std::pair<int, int> compute_capability) { {{5, 3}, 53}, {{6, 0}, 60}, {{6, 1}, 61}, - {{6, 2}, 62}}); - int sm_version = 20; + {{6, 2}, 62}, + // TODO: Change this to 70 once LLVM NVPTX supports it + {{7, 0}, 60}}); + int sm_version = 30; auto it = m->find(compute_capability); if (it != m->end()) { sm_version = it->second; @@ -306,7 +319,8 @@ tensorflow::Status LinkLibdeviceIfNecessary( llvm::Linker linker(*module); string libdevice_path = tensorflow::io::JoinPath( - libdevice_dir_path, GetLibdeviceFilename(compute_capability)); + libdevice_dir_path, GetLibdeviceFilename(libdevice_dir_path, + compute_capability)); TF_RETURN_IF_ERROR(tensorflow::Env::Default()->FileExists(libdevice_path)); VLOG(1) << "Linking with libdevice from: " << libdevice_path; std::unique_ptr<llvm::Module> libdevice_module = diff --git a/tensorflow/compiler/xla/service/hlo_verifier.cc b/tensorflow/compiler/xla/service/hlo_verifier.cc index c85214e9a4..54cd26502a 100644 --- a/tensorflow/compiler/xla/service/hlo_verifier.cc +++ b/tensorflow/compiler/xla/service/hlo_verifier.cc @@ -531,6 +531,15 @@ StatusOr<bool> HloVerifier::Run(HloModule* module) { << " computation: " << computation.get(); } } + if (instruction->opcode() == HloOpcode::kBroadcast) { + // If you see this failure then someone has confused the difference + // between the HLO broadcast op, and the UserComputation broadcast + // op. 
See https://groups.google.com/forum/#!topic/xla-dev/9LqijHmTt_I + // or ComputationLowerer::Visit() + TF_RET_CHECK(instruction->dimensions().size() == + ShapeUtil::Rank(instruction->operand(0)->shape())) + << "Broadcast HLO has invalid number of dimensions."; + } auto previous = instructions.find(instruction->name()); TF_RET_CHECK(previous == instructions.end()) diff --git a/tensorflow/contrib/android/java/org/tensorflow/contrib/android/TensorFlowInferenceInterface.java b/tensorflow/contrib/android/java/org/tensorflow/contrib/android/TensorFlowInferenceInterface.java index 6389ef1f5d..f60bd8282c 100644 --- a/tensorflow/contrib/android/java/org/tensorflow/contrib/android/TensorFlowInferenceInterface.java +++ b/tensorflow/contrib/android/java/org/tensorflow/contrib/android/TensorFlowInferenceInterface.java @@ -20,6 +20,7 @@ import android.os.Build.VERSION; import android.os.Trace; import android.text.TextUtils; import android.util.Log; +import java.io.ByteArrayOutputStream; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; @@ -78,10 +79,35 @@ public class TensorFlowInferenceInterface { throw new RuntimeException("Failed to load model from '" + model + "'", e); } } + try { - loadGraph(is, g); + if (VERSION.SDK_INT >= 18) { + Trace.beginSection("initializeTensorFlow"); + Trace.beginSection("readGraphDef"); + } + + // TODO(ashankar): Can we somehow mmap the contents instead of copying them? + byte[] graphDef = new byte[is.available()]; + final int numBytesRead = is.read(graphDef); + if (numBytesRead != graphDef.length) { + throw new IOException( + "read error: read only " + + numBytesRead + + " of the graph, expected to read " + + graphDef.length); + } + + if (VERSION.SDK_INT >= 18) { + Trace.endSection(); // readGraphDef. + } + + loadGraph(graphDef, g); is.close(); Log.i(TAG, "Successfully loaded model from '" + model + "'"); + + if (VERSION.SDK_INT >= 18) { + Trace.endSection(); // initializeTensorFlow. 
+ } } catch (IOException e) { throw new RuntimeException("Failed to load model from '" + model + "'", e); } @@ -105,8 +131,30 @@ public class TensorFlowInferenceInterface { this.runner = sess.runner(); try { - loadGraph(is, g); + if (VERSION.SDK_INT >= 18) { + Trace.beginSection("initializeTensorFlow"); + Trace.beginSection("readGraphDef"); + } + + int baosInitSize = is.available() > 16384 ? is.available() : 16384; + ByteArrayOutputStream baos = new ByteArrayOutputStream(baosInitSize); + int numBytesRead; + byte[] buf = new byte[16384]; + while ((numBytesRead = is.read(buf, 0, buf.length)) != -1) { + baos.write(buf, 0, numBytesRead); + } + byte[] graphDef = baos.toByteArray(); + + if (VERSION.SDK_INT >= 18) { + Trace.endSection(); // readGraphDef. + } + + loadGraph(graphDef, g); Log.i(TAG, "Successfully loaded model from the input stream"); + + if (VERSION.SDK_INT >= 18) { + Trace.endSection(); // initializeTensorFlow. + } } catch (IOException e) { throw new RuntimeException("Failed to load model from the input stream", e); } @@ -269,8 +317,8 @@ public class TensorFlowInferenceInterface { /** * Copy a byte sequence into the input Tensor with name {@link inputName} as a string-valued - * scalar tensor. In the TensorFlow type system, a "string" is an arbitrary sequence of - * bytes, not a Java {@code String} (which is a sequence of characters). + * scalar tensor. In the TensorFlow type system, a "string" is an arbitrary sequence of bytes, not + * a Java {@code String} (which is a sequence of characters). */ public void feedString(String inputName, byte[] src) { addFeed(inputName, Tensor.create(src)); @@ -278,9 +326,8 @@ public class TensorFlowInferenceInterface { /** * Copy an array of byte sequences into the input Tensor with name {@link inputName} as a - * string-valued one-dimensional tensor (vector). In the TensorFlow type system, a "string" - * is an arbitrary sequence of bytes, not a Java {@code String} (which is a sequence of - * characters). 
+ * string-valued one-dimensional tensor (vector). In the TensorFlow type system, a "string" is an + * arbitrary sequence of bytes, not a Java {@code String} (which is a sequence of characters). */ public void feedString(String inputName, byte[][] src) { addFeed(inputName, Tensor.create(src)); @@ -458,27 +505,10 @@ public class TensorFlowInferenceInterface { } } - private void loadGraph(InputStream is, Graph g) throws IOException { + private void loadGraph(byte[] graphDef, Graph g) throws IOException { final long startMs = System.currentTimeMillis(); if (VERSION.SDK_INT >= 18) { - Trace.beginSection("loadGraph"); - Trace.beginSection("readGraphDef"); - } - - // TODO(ashankar): Can we somehow mmap the contents instead of copying them? - byte[] graphDef = new byte[is.available()]; - final int numBytesRead = is.read(graphDef); - if (numBytesRead != graphDef.length) { - throw new IOException( - "read error: read only " - + numBytesRead - + " of the graph, expected to read " - + graphDef.length); - } - - if (VERSION.SDK_INT >= 18) { - Trace.endSection(); // readGraphDef. Trace.beginSection("importGraphDef"); } @@ -490,7 +520,6 @@ public class TensorFlowInferenceInterface { if (VERSION.SDK_INT >= 18) { Trace.endSection(); // importGraphDef. - Trace.endSection(); // loadGraph. } final long endMs = System.currentTimeMillis(); diff --git a/tensorflow/contrib/estimator/python/estimator/extenders.py b/tensorflow/contrib/estimator/python/estimator/extenders.py index 6e2c1aa033..2e944cbdd9 100644 --- a/tensorflow/contrib/estimator/python/estimator/extenders.py +++ b/tensorflow/contrib/estimator/python/estimator/extenders.py @@ -54,7 +54,7 @@ def add_metrics(estimator, metric_fn): ``` Args: - estimator: A ${tf.estimator.Esitmator} object. + estimator: A ${tf.estimator.Estimator} object. 
metric_fn: A function which should obey the following signature: - Args: can only have following four arguments in any order: * predictions: Predictions `Tensor` or dict of `Tensor` created by given diff --git a/tensorflow/contrib/layers/__init__.py b/tensorflow/contrib/layers/__init__.py index ea8d9e0c63..9309678d90 100644 --- a/tensorflow/contrib/layers/__init__.py +++ b/tensorflow/contrib/layers/__init__.py @@ -51,6 +51,7 @@ See the @{$python/contrib.layers} guide. @@unit_norm @@bow_encoder @@embed_sequence +@@maxout @@apply_regularization @@l1_l2_regularizer diff --git a/tensorflow/contrib/layers/python/layers/feature_column.py b/tensorflow/contrib/layers/python/layers/feature_column.py index da16bf6ce6..226d933d85 100644 --- a/tensorflow/contrib/layers/python/layers/feature_column.py +++ b/tensorflow/contrib/layers/python/layers/feature_column.py @@ -939,6 +939,11 @@ class _OneHotColumn( weighted_column = sparse_ops.sparse_merge(sp_ids=sparse_id_column, sp_values=weight_tensor, vocab_size=self.length) + # Remove (?, -1) index + weighted_column = sparse_ops.sparse_slice( + weighted_column, + [0, 0], + weighted_column.dense_shape) return sparse_ops.sparse_tensor_to_dense(weighted_column) dense_id_tensor = sparse_ops.sparse_tensor_to_dense(sparse_id_column, diff --git a/tensorflow/contrib/layers/python/layers/feature_column_test.py b/tensorflow/contrib/layers/python/layers/feature_column_test.py index ab65e47af8..5ae885b720 100644 --- a/tensorflow/contrib/layers/python/layers/feature_column_test.py +++ b/tensorflow/contrib/layers/python/layers/feature_column_test.py @@ -31,6 +31,7 @@ from tensorflow.python.feature_column import feature_column as fc_core from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import sparse_tensor as sparse_tensor_lib +from tensorflow.python.ops import lookup_ops from tensorflow.python.ops import parsing_ops from tensorflow.python.ops import state_ops from 
tensorflow.python.ops import variable_scope @@ -319,6 +320,35 @@ class FeatureColumnTest(test.TestCase): self.assertEqual(one_hot.sparse_id_column.name, "ids_weighted_by_weights") self.assertEqual(one_hot.length, 3) + def testMissingValueInOneHotColumnForWeightedSparseColumn(self): + # Github issue 12583 + ids = fc.sparse_column_with_keys("ids", ["marlo", "omar", "stringer"]) + weighted_ids = fc.weighted_sparse_column(ids, "weights") + one_hot = fc.one_hot_column(weighted_ids) + features = { + 'ids': constant_op.constant([['marlo', 'unknown', 'omar']]), + 'weights': constant_op.constant([[2., 4., 6.]]) + } + one_hot_tensor = feature_column_ops.input_from_feature_columns( + features, [one_hot]) + with self.test_session() as sess: + sess.run(variables.global_variables_initializer()) + sess.run(lookup_ops.tables_initializer()) + self.assertAllEqual([[2., 6., 0.]], one_hot_tensor.eval()) + + def testMissingValueInOneHotColumnForSparseColumnWithKeys(self): + ids = fc.sparse_column_with_keys("ids", ["marlo", "omar", "stringer"]) + one_hot = fc.one_hot_column(ids) + features = { + 'ids': constant_op.constant([['marlo', 'unknown', 'omar']]) + } + one_hot_tensor = feature_column_ops.input_from_feature_columns( + features, [one_hot]) + with self.test_session() as sess: + sess.run(variables.global_variables_initializer()) + sess.run(lookup_ops.tables_initializer()) + self.assertAllEqual([[1., 1., 0.]], one_hot_tensor.eval()) + def testOneHotColumnDeepCopy(self): a = fc.sparse_column_with_keys("a", ["a", "b", "c", "d"]) column = fc.one_hot_column(a) diff --git a/tensorflow/contrib/layers/python/layers/layers.py b/tensorflow/contrib/layers/python/layers/layers.py index 39e0f1fa23..cc494e9200 100644 --- a/tensorflow/contrib/layers/python/layers/layers.py +++ b/tensorflow/contrib/layers/python/layers/layers.py @@ -50,6 +50,7 @@ from tensorflow.python.ops import standard_ops from tensorflow.python.ops import variable_scope from tensorflow.python.ops import variables as 
tf_variables from tensorflow.python.training import moving_averages +from tensorflow.python.layers.maxout import maxout # TODO(b/28426988): Replace legacy_* fns migrated from slim. # TODO(b/28426988): Remove legacy_* when all uses have migrated to new API. @@ -92,7 +93,8 @@ __all__ = ['avg_pool2d', 'unit_norm', 'legacy_fully_connected', 'legacy_linear', - 'legacy_relu'] + 'legacy_relu', + 'maxout'] DATA_FORMAT_NCHW = 'NCHW' DATA_FORMAT_NHWC = 'NHWC' @@ -811,7 +813,8 @@ def batch_norm(inputs, if data_format == DATA_FORMAT_NCHW: mean = array_ops.reshape(mean, params_shape_broadcast) variance = array_ops.reshape(variance, params_shape_broadcast) - beta = array_ops.reshape(beta, params_shape_broadcast) + if beta is not None: + beta = array_ops.reshape(beta, params_shape_broadcast) if gamma is not None: gamma = array_ops.reshape(gamma, params_shape_broadcast) diff --git a/tensorflow/contrib/layers/python/layers/layers_test.py b/tensorflow/contrib/layers/python/layers/layers_test.py index 61c5fbafed..d1d18016f7 100644 --- a/tensorflow/contrib/layers/python/layers/layers_test.py +++ b/tensorflow/contrib/layers/python/layers/layers_test.py @@ -2636,6 +2636,13 @@ class BatchNormTest(test.TestCase): data_format='NCHW', shape=shape, is_training=True) self.assertAllClose(nhwc, nchw, atol=1e-4, rtol=1e-4) + def testBatchNormBeta(self): + # Test case for 11673 + with self.test_session() as sess: + a = array_ops.placeholder(dtypes.float32, shape=(10, 10, 10, 10)) + b = _layers.batch_norm(a, center=False, data_format='NCHW', + zero_debias_moving_mean=True) + sess.run(variables_lib.global_variables_initializer()) class LayerNormTest(test.TestCase): diff --git a/tensorflow/contrib/learn/python/learn/datasets/mnist.py b/tensorflow/contrib/learn/python/learn/datasets/mnist.py index a90b9264f8..1f3295747e 100644 --- a/tensorflow/contrib/learn/python/learn/datasets/mnist.py +++ b/tensorflow/contrib/learn/python/learn/datasets/mnist.py @@ -30,7 +30,7 @@ from tensorflow.python.framework 
import random_seed from tensorflow.python.platform import gfile # CVDF mirror of http://yann.lecun.com/exdb/mnist/ -SOURCE_URL = 'https://storage.googleapis.com/cvdf-datasets/mnist/' +DEFAULT_SOURCE_URL = 'https://storage.googleapis.com/cvdf-datasets/mnist/' def _read32(bytestream): @@ -215,7 +215,8 @@ def read_data_sets(train_dir, dtype=dtypes.float32, reshape=True, validation_size=5000, - seed=None): + seed=None, + source_url=DEFAULT_SOURCE_URL): if fake_data: def fake(): @@ -227,28 +228,31 @@ def read_data_sets(train_dir, test = fake() return base.Datasets(train=train, validation=validation, test=test) + if not source_url: # empty string check + source_url = DEFAULT_SOURCE_URL + TRAIN_IMAGES = 'train-images-idx3-ubyte.gz' TRAIN_LABELS = 'train-labels-idx1-ubyte.gz' TEST_IMAGES = 't10k-images-idx3-ubyte.gz' TEST_LABELS = 't10k-labels-idx1-ubyte.gz' local_file = base.maybe_download(TRAIN_IMAGES, train_dir, - SOURCE_URL + TRAIN_IMAGES) + source_url + TRAIN_IMAGES) with gfile.Open(local_file, 'rb') as f: train_images = extract_images(f) local_file = base.maybe_download(TRAIN_LABELS, train_dir, - SOURCE_URL + TRAIN_LABELS) + source_url + TRAIN_LABELS) with gfile.Open(local_file, 'rb') as f: train_labels = extract_labels(f, one_hot=one_hot) local_file = base.maybe_download(TEST_IMAGES, train_dir, - SOURCE_URL + TEST_IMAGES) + source_url + TEST_IMAGES) with gfile.Open(local_file, 'rb') as f: test_images = extract_images(f) local_file = base.maybe_download(TEST_LABELS, train_dir, - SOURCE_URL + TEST_LABELS) + source_url + TEST_LABELS) with gfile.Open(local_file, 'rb') as f: test_labels = extract_labels(f, one_hot=one_hot) @@ -262,13 +266,13 @@ def read_data_sets(train_dir, train_images = train_images[validation_size:] train_labels = train_labels[validation_size:] - + options = dict(dtype=dtype, reshape=reshape, seed=seed) - + train = DataSet(train_images, train_labels, **options) validation = DataSet(validation_images, validation_labels, **options) test = 
DataSet(test_images, test_labels, **options) - + return base.Datasets(train=train, validation=validation, test=test) diff --git a/tensorflow/contrib/makefile/compile_nsync.sh b/tensorflow/contrib/makefile/compile_nsync.sh index 4db9cce5ed..e85a79c279 100755 --- a/tensorflow/contrib/makefile/compile_nsync.sh +++ b/tensorflow/contrib/makefile/compile_nsync.sh @@ -214,12 +214,12 @@ for arch in $archs; do armeabi-v7a) toolchain="arm-linux-androideabi-4.9" sysroot_arch="arm" bin_prefix="arm-linux-androideabi" - march_option="-march=armv7-a" + march_option="-march=armv7-a -mfloat-abi=softfp -mfpu=neon" ;; armeabi-v7a-hard) toolchain="arm-linux-androideabi-4.9" sysroot_arch="arm" bin_prefix="arm-linux-androideabi" - march_option="-march=armv7-a" + march_option="-march=armv7-a -mfpu=neon" ;; mips) toolchain="mipsel-linux-android-4.9" sysroot_arch="mips" @@ -265,8 +265,7 @@ for arch in $archs; do -I$(NDK_ROOT)/sources/cxx-stl/gnu-libstdc++/4.9/libs/'"$arch"'/include \ -I../../platform/c++11 -I../../platform/gcc \ -I../../platform/posix -pthread - PLATFORM_CFLAGS=-std=c++11 -Wno-narrowing '"$march_option"' \ - -mfloat-abi=softfp -mfpu=neon -fPIE + PLATFORM_CFLAGS=-std=c++11 -Wno-narrowing '"$march_option"' -fPIE PLATFORM_LDFLAGS=-pthread MKDEP=${CC} -M -std=c++11 PLATFORM_C=../../platform/c++11/src/nsync_semaphore_mutex.cc \ diff --git a/tensorflow/contrib/makefile/sub_makefiles/android/Makefile.in b/tensorflow/contrib/makefile/sub_makefiles/android/Makefile.in index 631d52235a..26c1ad4947 100644 --- a/tensorflow/contrib/makefile/sub_makefiles/android/Makefile.in +++ b/tensorflow/contrib/makefile/sub_makefiles/android/Makefile.in @@ -52,7 +52,9 @@ $(INFERENCE_SO_PATH): $(LIB_OBJS) $(INFERENCE_OBJS) @mkdir -p $(dir $@) $(CXX) $(CXXFLAGS) $(INCLUDES) \ -o $@ $(INFERENCE_OBJS) $(LIB_OBJS) \ - $(LIBFLAGS) $(LDFLAGS) -shared $(LIBS) + $(LIBFLAGS) $(LDFLAGS) \ + -shared -Wl,-soname,$(INFERENCE_SO_NAME) \ + $(LIBS) $(INFERENCE_SO_NAME): $(INFERENCE_SO_PATH) diff --git 
a/tensorflow/contrib/metrics/python/ops/metric_ops.py b/tensorflow/contrib/metrics/python/ops/metric_ops.py index 463bd60300..76986d0156 100644 --- a/tensorflow/contrib/metrics/python/ops/metric_ops.py +++ b/tensorflow/contrib/metrics/python/ops/metric_ops.py @@ -34,6 +34,7 @@ from tensorflow.python.ops import metrics_impl from tensorflow.python.ops import nn from tensorflow.python.ops import state_ops from tensorflow.python.ops import variable_scope +from tensorflow.python.ops import weights_broadcast_ops from tensorflow.python.util.deprecation import deprecated @@ -651,7 +652,7 @@ def _streaming_confusion_matrix_at_thresholds( label_is_neg = math_ops.logical_not(label_is_pos) if weights is not None: - broadcast_weights = _broadcast_weights( + broadcast_weights = weights_broadcast_ops.broadcast_weights( math_ops.to_float(weights), predictions) weights_tiled = array_ops.tile(array_ops.reshape( broadcast_weights, [1, -1]), [num_thresholds, 1]) @@ -955,7 +956,7 @@ def streaming_specificity_at_sensitivity( def streaming_sensitivity_at_specificity( predictions, labels, specificity, weights=None, num_thresholds=200, metrics_collections=None, updates_collections=None, name=None): - """Computes the specificity at a given sensitivity. + """Computes the sensitivity at a given specificity. 
The `streaming_sensitivity_at_specificity` function creates four local variables, `true_positives`, `true_negatives`, `false_positives` and @@ -1924,7 +1925,7 @@ def streaming_covariance(predictions, weighted_predictions = predictions weighted_labels = labels else: - weights = _broadcast_weights(weights, labels) + weights = weights_broadcast_ops.broadcast_weights(weights, labels) batch_count = math_ops.reduce_sum(weights) # n_B in eqn weighted_predictions = math_ops.multiply(predictions, weights) weighted_labels = math_ops.multiply(labels, weights) @@ -2051,7 +2052,7 @@ def streaming_pearson_correlation(predictions, # Broadcast weights here to avoid duplicate broadcasting in each call to # `streaming_covariance`. if weights is not None: - weights = _broadcast_weights(weights, labels) + weights = weights_broadcast_ops.broadcast_weights(weights, labels) cov, update_cov = streaming_covariance( predictions, labels, weights=weights, name='covariance') var_predictions, update_var_predictions = streaming_covariance( diff --git a/tensorflow/contrib/slim/python/slim/data/tfexample_decoder_test.py b/tensorflow/contrib/slim/python/slim/data/tfexample_decoder_test.py index 506f4bd877..96606b9c0e 100644 --- a/tensorflow/contrib/slim/python/slim/data/tfexample_decoder_test.py +++ b/tensorflow/contrib/slim/python/slim/data/tfexample_decoder_test.py @@ -228,7 +228,10 @@ class TFExampleDecoderTest(test.TestCase): image_shape = (2, 3, 3) unused_image, serialized_example = self.GenerateImage( image_format='jpeg', image_shape=image_shape) - with self.assertRaises(TypeError): + # decode_raw support uint16 now so ValueError will be thrown instead. 
+ with self.assertRaisesRegexp( + ValueError, + 'true_fn and false_fn must have the same type: uint16, uint8'): unused_decoded_image = self.RunDecodeExample( serialized_example, tfexample_decoder.Image(dtype=dtypes.uint16), diff --git a/tensorflow/contrib/timeseries/README.md b/tensorflow/contrib/timeseries/README.md index 2b36ade986..0e15d162dd 100644 --- a/tensorflow/contrib/timeseries/README.md +++ b/tensorflow/contrib/timeseries/README.md @@ -2,7 +2,7 @@ TensorFlow Time Series (TFTS) is a collection of ready-to-use classic models (state space, autoregressive), and flexible infrastructure for building -high-performance time series models whatever the architecture. It includes tools +high-performance time series models with custom architectures. It includes tools for chunking and batching a series, and for saving model state across chunks, making use of parallel computation even when training sequential models on long series (using truncated backpropagation). diff --git a/tensorflow/contrib/verbs/rdma.cc b/tensorflow/contrib/verbs/rdma.cc index ec5adfdaa0..26e18b28aa 100644 --- a/tensorflow/contrib/verbs/rdma.cc +++ b/tensorflow/contrib/verbs/rdma.cc @@ -165,9 +165,10 @@ void RdmaAdapter::Process_CQ() { RdmaBuffer* ab = rc->tx_ack_buffer_; ab->SendNextItem(); // find buffer - RdmaBuffer* tb = rc->FindBuffer(rm.name_); + RdmaTensorBuffer* tb = + reinterpret_cast<RdmaTensorBuffer*>(rc->FindBuffer(rm.name_)); tb->SetBufferStatus(remote, idle); - worker_env_->compute_pool->Schedule([tb]() { tb->SendNextItem(); }); + worker_env_->compute_pool->Schedule([tb]() { tb->ReSendNextItem(); }); } else if (rm.type_ == RDMA_MESSAGE_BUFFER_REQUEST) { // remote host requests to create a tensor buffer; // send ack to release remote tx message buffer @@ -198,7 +199,8 @@ void RdmaAdapter::Process_CQ() { RdmaBuffer* ab = rc->tx_ack_buffer_; ab->SendNextItem(); // find buffer - RdmaBuffer* tb = rc->FindBuffer(rm.name_); + RdmaTensorBuffer* tb = + 
reinterpret_cast<RdmaTensorBuffer*>(rc->FindBuffer(rm.name_)); CHECK(rm.buffer_size_ == tb->size_) << "rm.buffer_size = " << rm.buffer_size_ << "tb->size_ = " << tb->size_ << "rm.name_ = " << rm.name_; @@ -208,7 +210,7 @@ void RdmaAdapter::Process_CQ() { tb->SetRemoteMR(rmr, true); tb->SetBufferStatus(local, idle); tb->SetBufferStatus(remote, idle); - worker_env_->compute_pool->Schedule([tb]() { tb->SendNextItem(); }); + worker_env_->compute_pool->Schedule([tb]() { tb->ReSendNextItem(); }); } else if (rm.type_ == RDMA_MESSAGE_TENSOR_WRITE) { // tensor RDMA write completed worker_env_->compute_pool->Schedule([rm, rc]() { @@ -624,6 +626,12 @@ RdmaMessageBuffer::RdmaMessageBuffer(RdmaChannel* channel, string name) RdmaTensorBuffer::RdmaTensorBuffer(RdmaChannel* channel, string name) : RdmaBuffer(channel, name) {} +RdmaTensorBuffer::~RdmaTensorBuffer() { + for (Itable it = retable.begin(); it != retable.end(); ++it) { + delete (it->second); + } +} + // Send the next ack from the buffer's job queue. void RdmaAckBuffer::SendNextItem() { uint32_t imm_data = LookupBufferIndex("rx_ack_buffer"); @@ -655,6 +663,99 @@ void RdmaMessageBuffer::SendNextItem() { } } +Rendezvous::DoneCallback RdmaTensorBuffer::getRecvTensorCallback( + const string& key_with_step_id, const string& key, int64 step_id, + const Rendezvous::ParsedKey& parsed) { + Rendezvous::DoneCallback cb = [this, key_with_step_id, key, step_id, parsed]( + const Status& status, const Rendezvous::Args& send_args, + const Rendezvous::Args& recv_args, const Tensor& in, bool is_dead) { + CHECK(status.ok()) << "RecvLocalAsync was not ok, key" << key_with_step_id + << " error message: " << status.error_message(); + size_t buffer_size = RdmaMessage::kMessageTotalBytes; + size_t tensor_bytes = 0; + // Figures out which device the tensor is hosted on. 
+ Device* src_dev = nullptr; + Status s = channel_->adapter_->worker_env_->device_mgr->LookupDevice( + parsed.src_device, &src_dev); + CHECK(s.ok()) << "src device not found"; + // Does the device have the right incarnation number we expect? + CHECK(src_dev->attributes().incarnation() == parsed.src_incarnation) + << "RecvTensor expects a different device incarnation: " + << parsed.src_incarnation << " vs. " + << src_dev->attributes().incarnation() + << ". Your worker job was probably restarted. Check your " + << "worker job for the reason why it was restarted."; + Device* dst_dev = nullptr; + // destination is on CPU. + s = channel_->adapter_->worker_env_->device_mgr->LookupDevice("CPU:0", + &dst_dev); + CHECK(s.ok()) << "dst device not found"; + AllocatorAttributes dst_alloc_attr; + dst_alloc_attr.set_on_host(true); + + bool can_memcpy = DataTypeCanUseMemcpy(in.dtype()); + // string tensor needs to be serialized + Tensor copy; + TensorProto proto; + if (src_dev->tensorflow_gpu_device_info() && + (!send_args.alloc_attrs.on_host())) { + CHECK(send_args.device_context) + << "send dev name: " << src_dev->name() + << " gpu_info: " << src_dev->tensorflow_gpu_device_info(); + + if (can_memcpy) { + AllocatorAttributes host_alloc_attrs; + host_alloc_attrs.set_gpu_compatible(true); + host_alloc_attrs.set_on_host(true); + Allocator* alloc = ProcessState::singleton()->GetCUDAHostAllocator(0); + copy = Tensor(alloc, in.dtype(), in.shape()); + tensor_bytes = in.TotalBytes(); + buffer_size += tensor_bytes; + GPUUtil::CopyGPUTensorToCPU( + src_dev, send_args.device_context, &in, ©, + [this, copy, tensor_bytes, buffer_size, key, in, step_id, + key_with_step_id, is_dead, send_args, recv_args](const Status& s) { + CHECK(s.ok()) << "copy tensor from gpu sync"; + StringPiece copy_buf; + copy_buf = copy.tensor_data(); + PostCopyOperations(true, buffer_size, tensor_bytes, key, in, + step_id, is_dead, key_with_step_id, ©, + NULL, ©_buf, send_args, recv_args); + }); + } else { + // "val" 
is on a GPU. No longer uses GPUUtil to fill the proto, use + // aync instead + GPUUtil::SetProtoFromGPU( + in, src_dev, send_args.device_context, &proto, is_dead, + [this, proto, buffer_size, key, in, step_id, key_with_step_id, + is_dead, send_args, recv_args](const Status& s) mutable { + CHECK(s.ok()) << "copy proto from gpu sync"; + auto tensor_bytes = proto.ByteSize(); + buffer_size += tensor_bytes; + PostCopyOperations(false, buffer_size, tensor_bytes, key, in, + step_id, is_dead, key_with_step_id, NULL, + &proto, NULL, send_args, recv_args); + }); + } + } else { + // tensor is in CPU memory. + StringPiece copy_buf; + if (can_memcpy) { + copy_buf = in.tensor_data(); + tensor_bytes = in.TotalBytes(); + } else { + in.AsProtoTensorContent(&proto); + tensor_bytes = proto.ByteSize(); + } + buffer_size += tensor_bytes; + PostCopyOperations(can_memcpy, buffer_size, tensor_bytes, key, in, + step_id, is_dead, key_with_step_id, ©, &proto, + ©_buf, send_args, recv_args); + } + }; + return cb; +} + // Send the next tensor from the buffer's job queue. void RdmaTensorBuffer::SendNextItem() { // get the key @@ -666,6 +767,7 @@ void RdmaTensorBuffer::SendNextItem() { queue_.pop(); } } + // send the tensor if a key is acquired. if (key_with_step_id != "") { VLOG(2) << "try to send tensor: " << key_with_step_id; @@ -675,107 +777,54 @@ void RdmaTensorBuffer::SendNextItem() { CHECK(key.compare(name_) == 0); Rendezvous::ParsedKey parsed; Rendezvous::ParseKey(key, &parsed); - Rendezvous::DoneCallback cb = [this, key_with_step_id, key, step_id, - parsed](const Status& status, - const Rendezvous::Args& send_args, - const Rendezvous::Args& recv_args, - const Tensor& in, bool is_dead) { - CHECK(status.ok()) << "RecvLocalAsync was not ok, key" << key_with_step_id - << " error message: " << status.error_message(); - size_t buffer_size = RdmaMessage::kMessageTotalBytes; - size_t tensor_bytes = 0; - // Figures out which device the tensor is hosted on. 
- Device* src_dev = nullptr; - Status s = channel_->adapter_->worker_env_->device_mgr->LookupDevice( - parsed.src_device, &src_dev); - CHECK(s.ok()) << "src device not found"; - // Does the device have the right incarnation number we expect? - CHECK(src_dev->attributes().incarnation() == parsed.src_incarnation) - << "RecvTensor expects a different device incarnation: " - << parsed.src_incarnation << " vs. " - << src_dev->attributes().incarnation() - << ". Your worker job was probably restarted. Check your " - << "worker job for the reason why it was restarted."; - Device* dst_dev = nullptr; - // destination is on CPU. - s = channel_->adapter_->worker_env_->device_mgr->LookupDevice("CPU:0", - &dst_dev); - CHECK(s.ok()) << "dst device not found"; - AllocatorAttributes dst_alloc_attr; - dst_alloc_attr.set_on_host(true); - - bool can_memcpy = DataTypeCanUseMemcpy(in.dtype()); - // string tensor needs to be serialized - Tensor copy; - TensorProto proto; - if (src_dev->tensorflow_gpu_device_info() && - (!send_args.alloc_attrs.on_host())) { - CHECK(send_args.device_context) - << "send dev name: " << src_dev->name() - << " gpu_info: " << src_dev->tensorflow_gpu_device_info(); - - if (can_memcpy) { - AllocatorAttributes host_alloc_attrs; - host_alloc_attrs.set_gpu_compatible(true); - host_alloc_attrs.set_on_host(true); - Allocator* alloc = ProcessState::singleton()->GetCUDAHostAllocator(0); - copy = Tensor(alloc, in.dtype(), in.shape()); - tensor_bytes = in.TotalBytes(); - buffer_size += tensor_bytes; - GPUUtil::CopyGPUTensorToCPU( - src_dev, send_args.device_context, &in, ©, - [this, copy, tensor_bytes, buffer_size, key, in, step_id, - key_with_step_id, is_dead](const Status& s) { - CHECK(s.ok()) << "copy tensor from gpu sync"; - StringPiece copy_buf; - copy_buf = copy.tensor_data(); - PostCopyOperations(true, buffer_size, tensor_bytes, key, in, - step_id, is_dead, key_with_step_id, ©, - NULL, ©_buf); - }); - } else { - // "val" is on a GPU. 
No longer uses GPUUtil to fill the proto, use - // aync instead - GPUUtil::SetProtoFromGPU( - in, src_dev, send_args.device_context, &proto, is_dead, - [this, proto, buffer_size, key, in, step_id, key_with_step_id, - is_dead](const Status& s) mutable { - CHECK(s.ok()) << "copy proto from gpu sync"; - auto tensor_bytes = proto.ByteSize(); - buffer_size += tensor_bytes; - PostCopyOperations(false, buffer_size, tensor_bytes, key, in, - step_id, is_dead, key_with_step_id, NULL, - &proto, NULL); - }); - } - } else { - // tensor is in CPU memory. - StringPiece copy_buf; - if (can_memcpy) { - copy_buf = in.tensor_data(); - tensor_bytes = in.TotalBytes(); - } else { - in.AsProtoTensorContent(&proto); - tensor_bytes = proto.ByteSize(); - } - buffer_size += tensor_bytes; - PostCopyOperations(can_memcpy, buffer_size, tensor_bytes, key, in, - step_id, is_dead, key_with_step_id, ©, &proto, - ©_buf); - } - // maybe some margin for string tensor? - }; - + Rendezvous::DoneCallback cb = + getRecvTensorCallback(key_with_step_id, key, step_id, parsed); channel_->adapter_->worker_env_->rendezvous_mgr->RecvLocalAsync(step_id, parsed, cb); } } +void RdmaTensorBuffer::ReSendNextItem() { + // get the key + string key_with_step_id = ""; + { + mutex_lock lock{mu_}; + if (!requeue.empty()) { + key_with_step_id = requeue.front(); + requeue.pop(); + } + } + + // send the tensor if a key is acquired. 
+ if (key_with_step_id != "") { + VLOG(2) << "try to send tensor: " << key_with_step_id; + string key; + int64 step_id; + VerbsUtil::GetKeyAndStepId(key_with_step_id, key, step_id); + CHECK(key.compare(name_) == 0); + Rendezvous::ParsedKey parsed; + Rendezvous::ParseKey(key, &parsed); + Rendezvous::DoneCallback cb = + getRecvTensorCallback(key_with_step_id, key, step_id, parsed); + ReItem* item; + { + mutex_lock lock{mu_}; + Itable it = retable.find(key_with_step_id); + CHECK(it != retable.end()) << "Could not find dup-recv context"; + item = it->second; + retable.erase(it); + } + cb(Status::OK(), item->send_args, item->recv_args, item->in, item->is_dead); + delete (item); + } +} + void RdmaTensorBuffer::PostCopyOperations( bool can_memcpy, size_t buffer_size, size_t tensor_bytes, const string& key, const Tensor& in, int64 step_id, bool is_dead, const string& key_with_step_id, const Tensor* copy, - const TensorProto* proto, const StringPiece* copy_buf) { + const TensorProto* proto, const StringPiece* copy_buf, + const Rendezvous::Args& send_args, const Rendezvous::Args& recv_args) { // prepare message RdmaMessage rm; rm.name_size_ = key.size(); @@ -793,9 +842,12 @@ void RdmaTensorBuffer::PostCopyOperations( VLOG(2) << "Extend RDMA buffer from " << size_ << " to " << buffer_size; } CreateCPUBuffer(buffer_size, false); + // Need to be received again, put into the re-recv queue and the table + requeue.push(key_with_step_id); + ReItem* item = new ReItem(send_args, recv_args, in, is_dead); + retable.insert(std::pair<string, ReItem*>(key_with_step_id, item)); mu_.unlock(); - // put back the key since it is not sent; - EnqueueItem(key_with_step_id); + // no longer used: put back the key since it is not sent; // ask the remote to create the same buffer rm.type_ = RDMA_MESSAGE_BUFFER_REQUEST; rm.remote_addr_ = reinterpret_cast<uint64_t>(buffer_); @@ -841,9 +893,11 @@ void RdmaTensorBuffer::PostCopyOperations( } Write(imm_data, buffer_size); } else { + // Need to be received 
again, put into the re-recv queue and the table + requeue.push(key_with_step_id); + ReItem* item = new ReItem(send_args, recv_args, in, is_dead); + retable.insert(std::pair<string, ReItem*>(key_with_step_id, item)); mu_.unlock(); - // put back the key since it is not sent; - EnqueueItem(key_with_step_id); } } diff --git a/tensorflow/contrib/verbs/rdma.h b/tensorflow/contrib/verbs/rdma.h index 16ef58bc62..e1e07db776 100644 --- a/tensorflow/contrib/verbs/rdma.h +++ b/tensorflow/contrib/verbs/rdma.h @@ -28,6 +28,7 @@ limitations under the License. #include <vector> #include "tensorflow/core/distributed_runtime/worker_env.h" +#include "tensorflow/core/framework/rendezvous.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/framework/types.h" @@ -224,14 +225,57 @@ class RdmaMessageBuffer : public RdmaBuffer { class RdmaTensorBuffer : public RdmaBuffer { public: explicit RdmaTensorBuffer(RdmaChannel* channel, string name); - virtual ~RdmaTensorBuffer() override {} + virtual ~RdmaTensorBuffer() override; void SendNextItem() override; void PostCopyOperations(bool can_memcpy, size_t buffer_size, size_t tensor_bytes, const string& key, const Tensor& in, int64 step_id, bool is_dead, const string& key_with_step_id, const Tensor* copy, - const TensorProto* proto, - const StringPiece* copy_buf); + const TensorProto* proto, const StringPiece* copy_buf, + const Rendezvous::Args& send_args, + const Rendezvous::Args& recv_args); + + void ReSendNextItem(); + + private: + Rendezvous::DoneCallback getRecvTensorCallback( + const string& key_with_step_id, const string& key, int64 step_id, + const Rendezvous::ParsedKey& parsed); + + struct ReItem { + Rendezvous::Args send_args; + Rendezvous::Args recv_args; + Tensor in; + bool is_dead; + + ReItem(const Rendezvous::Args& send_args_, + const Rendezvous::Args& recv_args_, const Tensor& in_, bool is_dead_) + : send_args(send_args_), + recv_args(recv_args_), + in(in_), 
+ is_dead(is_dead_) { + if (send_args.device_context) { + send_args.device_context->Ref(); + } + if (recv_args.device_context) { + recv_args.device_context->Ref(); + } + } + + ~ReItem() { + if (send_args.device_context) { + send_args.device_context->Unref(); + } + if (recv_args.device_context) { + recv_args.device_context->Unref(); + } + } + }; + typedef std::map<string, ReItem*> Table; + typedef Table::iterator Itable; + + std::queue<string> requeue GUARDED_BY(mu_); + Table retable GUARDED_BY(mu_); }; struct RdmaMessage { diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index 6c1896d7ab..188036b7aa 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -790,13 +790,16 @@ cc_library( ]) + if_mkl([ "//tensorflow/core/kernels:mkl_concat_op", "//tensorflow/core/kernels:mkl_conv_op", + "//tensorflow/core/kernels:mkl_cwise_ops_common", "//tensorflow/core/kernels:mkl_fused_batch_norm_op", "//tensorflow/core/kernels:mkl_identity_op", + "//tensorflow/core/kernels:mkl_input_conversion_op", "//tensorflow/core/kernels:mkl_lrn_op", "//tensorflow/core/kernels:mkl_pooling_ops", "//tensorflow/core/kernels:mkl_relu_op", "//tensorflow/core/kernels:mkl_reshape_op", "//tensorflow/core/kernels:mkl_tfconv_op", + "//tensorflow/core/kernels:mkl_aggregate_ops", ]), ) @@ -2481,10 +2484,13 @@ tf_cc_test_mkl( "//tensorflow/cc:cc_ops", "//tensorflow/cc:scope", "//tensorflow/cc:sendrecv_ops", + "//tensorflow/core/kernels:mkl_aggregate_ops", "//tensorflow/core/kernels:mkl_concat_op", "//tensorflow/core/kernels:mkl_conv_op", + "//tensorflow/core/kernels:mkl_cwise_ops_common", "//tensorflow/core/kernels:mkl_fused_batch_norm_op", "//tensorflow/core/kernels:mkl_identity_op", + "//tensorflow/core/kernels:mkl_input_conversion_op", "//tensorflow/core/kernels:mkl_lrn_op", "//tensorflow/core/kernels:mkl_pooling_ops", "//tensorflow/core/kernels:mkl_relu_op", diff --git a/tensorflow/core/common_runtime/mkl_cpu_allocator.h b/tensorflow/core/common_runtime/mkl_cpu_allocator.h index 
005aabf9b8..f16da10d7a 100644 --- a/tensorflow/core/common_runtime/mkl_cpu_allocator.h +++ b/tensorflow/core/common_runtime/mkl_cpu_allocator.h @@ -75,12 +75,12 @@ class MklCPUAllocator : public Allocator { // Hooks provided by this allocator for memory allocation routines from MKL static inline void* MallocHook(size_t size) { - VLOG(2) << "MklCPUAllocator: In MallocHook"; + VLOG(3) << "MklCPUAllocator: In MallocHook"; return cpu_allocator()->AllocateRaw(kAlignment, size); } static inline void FreeHook(void* ptr) { - VLOG(2) << "MklCPUAllocator: In FreeHook"; + VLOG(3) << "MklCPUAllocator: In FreeHook"; cpu_allocator()->DeallocateRaw(ptr); } diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_worker_cache.cc b/tensorflow/core/distributed_runtime/rpc/grpc_worker_cache.cc index 29acad34e9..06695db779 100644 --- a/tensorflow/core/distributed_runtime/rpc/grpc_worker_cache.cc +++ b/tensorflow/core/distributed_runtime/rpc/grpc_worker_cache.cc @@ -69,9 +69,8 @@ class GrpcWorkerCache : public WorkerCachePartial { } else { SharedGrpcChannelPtr channel = channel_cache_->FindWorkerChannel(target); if (!channel) return nullptr; - WorkerInterface* ret = NewGrpcRemoteWorker(&live_rpc_counter_, channel, - &completion_queue_, &logger_); - return ret; + return NewGrpcRemoteWorker(&live_rpc_counter_, channel, + &completion_queue_, &logger_); } } diff --git a/tensorflow/core/distributed_runtime/worker_cache_partial.cc b/tensorflow/core/distributed_runtime/worker_cache_partial.cc index 90d5e78884..61e5416234 100644 --- a/tensorflow/core/distributed_runtime/worker_cache_partial.cc +++ b/tensorflow/core/distributed_runtime/worker_cache_partial.cc @@ -29,7 +29,7 @@ namespace tensorflow { bool WorkerCachePartial::GetDeviceLocalityNonBlocking( const string& device_name, DeviceLocality* locality) { mutex_lock lock(mu_); // could use reader lock - const auto& iter = device_status_cache_.find(device_name); + auto iter = device_status_cache_.find(device_name); if (iter != 
device_status_cache_.end()) { *locality = iter->second.locality(); return true; @@ -44,16 +44,8 @@ void WorkerCachePartial::GetDeviceLocalityAsync(const string& device_name, // If cache entry was empty, make one try to fill it by RPC. SchedClosure([this, &device_name, locality, done]() { Status s = RefreshDeviceStatus(device_name); - if (s.ok()) { - if (!GetDeviceLocalityNonBlocking(device_name, locality)) { - mutex_lock lock(mu_); - const auto& iter = device_status_cache_.find(device_name); - if (iter == device_status_cache_.end()) { - s = errors::Unavailable("No known remote device: ", device_name); - } else { - s = errors::Internal("Failed to find locality for ", device_name); - } - } + if (s.ok() && !GetDeviceLocalityNonBlocking(device_name, locality)) { + s = errors::Unavailable("No known remote device: ", device_name); } done(s); }); @@ -70,7 +62,9 @@ Status WorkerCachePartial::RefreshDeviceStatus(const string& device_name) { s = errors::InvalidArgument("Bad device name to RefreshDeviceStatus: ", device_name); } - auto deleter = [this, task](WorkerInterface* wi) { ReleaseWorker(task, wi); }; + auto deleter = [this, &task](WorkerInterface* wi) { + ReleaseWorker(task, wi); + }; std::unique_ptr<WorkerInterface, decltype(deleter)> rwi(CreateWorker(task), deleter); if (s.ok() && !rwi.get()) { diff --git a/tensorflow/core/graph/mkl_layout_pass.cc b/tensorflow/core/graph/mkl_layout_pass.cc index cf5d6e8baa..90377e54c7 100644 --- a/tensorflow/core/graph/mkl_layout_pass.cc +++ b/tensorflow/core/graph/mkl_layout_pass.cc @@ -256,6 +256,7 @@ class MklLayoutRewritePass : public GraphOptimizationPass { public: MklLayoutRewritePass() { // NOTE: names are alphabetically sorted. 
+ csinfo_.addn = "AddN"; csinfo_.avg_pool = "AvgPool"; csinfo_.avg_pool_grad = "AvgPoolGrad"; csinfo_.bias_add = "BiasAdd"; @@ -279,17 +280,31 @@ class MklLayoutRewritePass : public GraphOptimizationPass { csinfo_.mkl_conv2d_with_bias = "_MklConv2DWithBias"; csinfo_.mkl_conv2d_with_bias_backprop_bias = "_MklConv2DWithBiasBackpropBias"; - csinfo_.relu = "Relu"; - csinfo_.relu_grad = "ReluGrad"; - csinfo_.reshape = "Reshape"; - csinfo_.split = "Split"; + csinfo_.relu = "Relu"; + csinfo_.relu_grad = "ReluGrad"; + csinfo_.reshape = "Reshape"; + csinfo_.split = "Split"; + // Element-wise ops. Ensure you also add any new ops to IsOpElementWise + // in the MklUtil.h (IsMklElementWiseOp method) to ensure that the + // MklInputConversion op is added before it. + csinfo_.add = "Add"; + csinfo_.maximum = "Maximum"; + csinfo_.mul = "Mul"; + csinfo_.squared_difference = "SquaredDifference"; + csinfo_.sub = "Sub"; + // End - element-wise ops. See note above. // NOTE: names are alphabetically sorted. + rinfo_.push_back({csinfo_.addn, mkl_op_registry::GetMklOpName(csinfo_.addn), CopyAttrsAddN, + AddNRewrite, nullptr}); + rinfo_.push_back({csinfo_.add, + mkl_op_registry::GetMklOpName(csinfo_.add), + CopyAttrsDataType, AlwaysRewrite, nullptr}); rinfo_.push_back({csinfo_.avg_pool, - GetMklOpName(csinfo_.avg_pool), + mkl_op_registry::GetMklOpName(csinfo_.avg_pool), CopyAttrsPooling, AlwaysRewrite, nullptr}); rinfo_.push_back({csinfo_.avg_pool_grad, - GetMklOpName(csinfo_.avg_pool_grad), + mkl_op_registry::GetMklOpName(csinfo_.avg_pool_grad), CopyAttrsPooling, AlwaysRewrite, nullptr}); // BiasAddGrad gets written into Conv2DWithBiasBackpropBias depending // on if context contains Conv2D. 
@@ -303,50 +318,62 @@ class MklLayoutRewritePass : public GraphOptimizationPass { CopyAttrsBiasAddGrad, ContextMatchRewrite, &biasaddgrad_matmul_context_}); rinfo_.push_back({csinfo_.concat, - GetMklOpName(csinfo_.concat), + mkl_op_registry::GetMklOpName(csinfo_.concat), CopyAttrsConcat, AlwaysRewrite, nullptr}); rinfo_.push_back({csinfo_.concatv2, - GetMklOpName(csinfo_.concatv2), + mkl_op_registry::GetMklOpName(csinfo_.concatv2), CopyAttrsConcatV2, AlwaysRewrite, nullptr}); rinfo_.push_back({csinfo_.conv2d, - GetMklOpName(csinfo_.conv2d), + mkl_op_registry::GetMklOpName(csinfo_.conv2d), CopyAttrsConv2D, AlwaysRewrite, nullptr}); rinfo_.push_back({csinfo_.conv2d_grad_filter, - GetMklOpName(csinfo_.conv2d_grad_filter), + mkl_op_registry::GetMklOpName(csinfo_.conv2d_grad_filter), CopyAttrsConv2D, AlwaysRewrite, nullptr}); rinfo_.push_back({csinfo_.conv2d_grad_input, - GetMklOpName(csinfo_.conv2d_grad_input), + mkl_op_registry::GetMklOpName(csinfo_.conv2d_grad_input), CopyAttrsConv2D, AlwaysRewrite, nullptr}); rinfo_.push_back({csinfo_.fused_batch_norm, - GetMklOpName(csinfo_.fused_batch_norm), + mkl_op_registry::GetMklOpName(csinfo_.fused_batch_norm), CopyAttrsFusedBatchNorm, AlwaysRewrite, nullptr}); rinfo_.push_back({csinfo_.fused_batch_norm_grad, - GetMklOpName(csinfo_.fused_batch_norm_grad), + mkl_op_registry::GetMklOpName(csinfo_.fused_batch_norm_grad), CopyAttrsFusedBatchNorm, AlwaysRewrite, nullptr}); rinfo_.push_back({csinfo_.identity, - GetMklOpName(csinfo_.identity), + mkl_op_registry::GetMklOpName(csinfo_.identity), CopyAttrsIdentity, AlwaysRewrite, nullptr}); rinfo_.push_back({csinfo_.lrn, - GetMklOpName(csinfo_.lrn), + mkl_op_registry::GetMklOpName(csinfo_.lrn), CopyAttrsLRN, AlwaysRewrite, nullptr}); rinfo_.push_back({csinfo_.lrn_grad, - GetMklOpName(csinfo_.lrn_grad), + mkl_op_registry::GetMklOpName(csinfo_.lrn_grad), CopyAttrsLRN, AlwaysRewrite, nullptr}); rinfo_.push_back({csinfo_.max_pool, - GetMklOpName(csinfo_.max_pool), + 
mkl_op_registry::GetMklOpName(csinfo_.max_pool), CopyAttrsPooling, NonDepthBatchWisePoolRewrite, nullptr}); rinfo_.push_back({csinfo_.max_pool_grad, - GetMklOpName(csinfo_.max_pool_grad), + mkl_op_registry::GetMklOpName(csinfo_.max_pool_grad), CopyAttrsPooling, AlwaysRewrite, nullptr}); + rinfo_.push_back({csinfo_.maximum, + mkl_op_registry::GetMklOpName(csinfo_.maximum), + CopyAttrsDataType, AlwaysRewrite, nullptr}); + rinfo_.push_back({csinfo_.mul, + mkl_op_registry::GetMklOpName(csinfo_.mul), + CopyAttrsDataType, AlwaysRewrite, nullptr}); rinfo_.push_back({csinfo_.relu, - GetMklOpName(csinfo_.relu), - CopyAttrsRelu, AlwaysRewrite, nullptr}); + mkl_op_registry::GetMklOpName(csinfo_.relu), + CopyAttrsDataType, AlwaysRewrite, nullptr}); rinfo_.push_back({csinfo_.relu_grad, - GetMklOpName(csinfo_.relu_grad), - CopyAttrsRelu, AlwaysRewrite, nullptr}); + mkl_op_registry::GetMklOpName(csinfo_.relu_grad), + CopyAttrsDataType, AlwaysRewrite, nullptr}); rinfo_.push_back({csinfo_.reshape, - GetMklOpName(csinfo_.reshape), + mkl_op_registry::GetMklOpName(csinfo_.reshape), CopyAttrsReshape, AlwaysRewrite, nullptr}); + rinfo_.push_back({csinfo_.squared_difference, + mkl_op_registry::GetMklOpName(csinfo_.squared_difference), + CopyAttrsDataType, AlwaysRewrite, nullptr}); + rinfo_.push_back({csinfo_.sub, + mkl_op_registry::GetMklOpName(csinfo_.sub), + CopyAttrsDataType, AlwaysRewrite, nullptr}); // Add info about which ops to add workspace edge to and the slots. wsinfo_.push_back({csinfo_.lrn, csinfo_.lrn_grad, 0, 2, 1, 3}); @@ -429,6 +456,8 @@ class MklLayoutRewritePass : public GraphOptimizationPass { /// Structure to store all constant strings /// NOTE: names are alphabetically sorted. 
typedef struct { + string addn; + string add; string avg_pool; string avg_pool_grad; string bias_add; @@ -446,15 +475,19 @@ class MklLayoutRewritePass : public GraphOptimizationPass { string matmul; string max_pool; string max_pool_grad; + string maximum; string mkl_conv2d; string mkl_conv2d_grad_input; string mkl_conv2d_grad_filter; string mkl_conv2d_with_bias; string mkl_conv2d_with_bias_backprop_bias; + string mul; string relu; string relu_grad; string reshape; string split; + string squared_difference; + string sub; } ConstStringsInfo; private: @@ -502,15 +535,6 @@ class MklLayoutRewritePass : public GraphOptimizationPass { return N; } - // Get the name of Mkl op from original TensorFlow op - // We prefix 'Mkl' to the original op to get Mkl op. - // TODO(nhasabni) We should move this to mkl_util.h. - inline string GetMklOpName(const string& name) const { - // Prefix that we add to Tensorflow op name to construct Mkl op name. - const char* const kMklOpPrefix = "_Mkl"; - return string(kMklOpPrefix) + name; - } - // Can op represented by node 'n' run on DEVICE_CPU? // Op can run on CPU with MKL if the runtime assigned device or the // user requested device contains device CPU, or both are empty. @@ -604,6 +628,19 @@ class MklLayoutRewritePass : public GraphOptimizationPass { return false; } + static bool AddNRewrite(const Node* n, const ContextInfo* c) { + CHECK_NOTNULL(n); + + int num; + CHECK_EQ(GetNodeAttr(n->def(), "N", &num).ok(), true); + + // Condition that specifies non-batch-wise and non-depth-wise pooling. + if (num == 2) { + return true; + } + + return false; + } // Is BiasAddGrad node in 'n' is associated with Conv2DWithBias node // specified in contextinfo 'ci'. Function updates fwd_node to point // to Conv2DWithBias node if 'n' is associated with Conv2DWithBias. 
@@ -907,15 +944,16 @@ class MklLayoutRewritePass : public GraphOptimizationPass { // We need operator-specific function to copy attributes because the framework // does not provide any generic function for it. // NOTE: names are alphabetically sorted. + static void CopyAttrsAddN(const Node* orig_node, NodeBuilder* nb); static void CopyAttrsBiasAddGrad(const Node* orig_node, NodeBuilder* nb); static void CopyAttrsConcat(const Node* orig_node, NodeBuilder* nb); static void CopyAttrsConcatV2(const Node* orig_node, NodeBuilder* nb); static void CopyAttrsConv2D(const Node* orig_node, NodeBuilder* nb); + static void CopyAttrsDataType(const Node* orig_node, NodeBuilder* nb); static void CopyAttrsFusedBatchNorm(const Node* orig_node, NodeBuilder* nb); static void CopyAttrsIdentity(const Node* orig_node, NodeBuilder* nb); static void CopyAttrsLRN(const Node* orig_node, NodeBuilder* nb); static void CopyAttrsPooling(const Node* orig_node, NodeBuilder* nb); - static void CopyAttrsRelu(const Node* orig_node, NodeBuilder* nb); static void CopyAttrsReshape(const Node* orig_node, NodeBuilder* nb); static void CopyAttrsSplit(const Node* orig_node, NodeBuilder* nb); @@ -1334,7 +1372,7 @@ void MklLayoutRewritePass::AddWorkSpaceEdgeIfNeeded( TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T)); for (auto ws : wsinfo_) { if (orig_node->type_string() == ws.fwd_op && - mkl_op_registry::IsMklOp(GetMklOpName(orig_node->type_string()), T)) { + mkl_op_registry::IsMklOp(mkl_op_registry::GetMklOpName(orig_node->type_string()), T)) { // If this op is a fwd op, then we need to check if there is an // edge from this node's fwd_slot to bwdop's bwd_slot. 
If there is // an edge, then we just add an attribute on this node for setting @@ -1360,7 +1398,7 @@ void MklLayoutRewritePass::AddWorkSpaceEdgeIfNeeded( nb->Attr("workspace_enabled", false); } } else if (orig_node->type_string() == ws.bwd_op && - mkl_op_registry::IsMklOp(GetMklOpName(orig_node->type_string()), + mkl_op_registry::IsMklOp(mkl_op_registry::GetMklOpName(orig_node->type_string()), T)) { // If this op is a bwd op, then we need to add workspace edge and // it's Mkl tensor edge between its corresponding fwd op and this @@ -1376,7 +1414,7 @@ void MklLayoutRewritePass::AddWorkSpaceEdgeIfNeeded( if (e->src_output() == ws.fwd_slot && // We would have rewritten the forward op, so we need to use // GetMklOpName call to get its Mkl name. - e->src()->type_string() == GetMklOpName(ws.fwd_op) && + e->src()->type_string() == mkl_op_registry::GetMklOpName(ws.fwd_op) && e->dst_input() == ws.bwd_slot) { nb->Attr("workspace_enabled", true); CHECK_NOTNULL(ws_tensors); @@ -1455,6 +1493,20 @@ void MklLayoutRewritePass::CopyAttrsConv2D(const Node* orig_node, nb->Attr("use_cudnn_on_gpu", use_cudnn_on_gpu); } +void MklLayoutRewritePass::CopyAttrsAddN(const Node* orig_node, + NodeBuilder* nb) { + DataType T; + int N; + + // Get all attributes from old node. + TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T)); + TF_CHECK_OK(GetNodeAttr(orig_node->def(), "N", &N)); + + // Add attributes to new node. + nb->Attr("T", T); + nb->Attr("N", N); +} + void MklLayoutRewritePass::CopyAttrsBiasAddGrad(const Node* orig_node, NodeBuilder* nb) { DataType T; @@ -1527,8 +1579,8 @@ void MklLayoutRewritePass::CopyAttrsPooling(const Node* orig_node, nb->Attr("data_format", data_format); } -void MklLayoutRewritePass::CopyAttrsRelu(const Node* orig_node, - NodeBuilder* nb) { +void MklLayoutRewritePass::CopyAttrsDataType(const Node* orig_node, + NodeBuilder* nb) { DataType T; // Get all attributes from old node. 
@@ -1894,7 +1946,15 @@ Status MklLayoutRewritePass::RewriteNode(std::unique_ptr<Graph>* g, } // Get all inputs. - const int num_inputs = orig_node->in_edges().size(); + int num_inputs = orig_node->in_edges().size(); + + // Drop count for control edges from inputs + for (const Edge* e : orig_node->in_edges()) { + if (e->IsControlEdge()) { + num_inputs--; + } + } + gtl::InlinedVector<Node*, 4> control_edges; gtl::InlinedVector<std::pair<Node*, int>, 4> inputs(num_inputs); FillInputs(orig_node, &control_edges, &inputs); @@ -2008,7 +2068,34 @@ MklLayoutRewritePass::CheckForNodeRewrite(const Node* n) const { // BiasAddGrad is not an Mkl layer, so we make an exception for it. if (n->type_string() != csinfo_.bias_add_grad) { - if (!mkl_op_registry::IsMklOp(GetMklOpName(n->type_string()), T)) { + if (!mkl_op_registry::IsMklOp(mkl_op_registry::GetMklOpName(n->type_string()), T)) { + return nullptr; + } + } + + // For elementwise node, we reuse the Eigen implementation and pass the MKL + // metadata tensor through so we can avoid conversions. However, if all + // incoming edges are in TF format, we don't need all this overhead, so + // replace the elementwise node only if at least one of its parents is a MKL + // node. + // + // TODO(vrane): Add implementation for element-wise ops that doesn't reuse + // eigen code to reduce cross-library dependency. 
+ if (mkl_op_registry::IsMklElementWiseOp( + mkl_op_registry::GetMklOpName(n->type_string()), T)) { + bool incoming_mkl_edge = false; + for (auto parent : n->in_edges()) { + if (mkl_op_registry::IsMklOp( + mkl_op_registry::GetMklOpName(parent->src()->type_string()), T)) { + incoming_mkl_edge = true; + break; + } else { + VLOG(1) << "Non-MKL parent is: " << parent->src()->type_string(); + } + } + if (incoming_mkl_edge == false) { + VLOG(1) << "Skipping replacement of elementwise node which has no MKL " + "parents."; return nullptr; } } diff --git a/tensorflow/core/graph/mkl_layout_pass_test.cc b/tensorflow/core/graph/mkl_layout_pass_test.cc index bd1d74368e..6a41e3965a 100644 --- a/tensorflow/core/graph/mkl_layout_pass_test.cc +++ b/tensorflow/core/graph/mkl_layout_pass_test.cc @@ -133,19 +133,19 @@ TEST_F(MklLayoutPassTest, Basic) { InitGraph( "node { name: 'A' op: 'Input'}" "node { name: 'B' op: 'Input'}" - "node { name: 'C' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }" + "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }" " input: ['A', 'B'] }" - "node { name: 'D' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }" + "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }" " input: ['A', 'B'] }"); EXPECT_EQ(DoMklLayoutOptimizationPass(), - "A(Input);B(Input);C(Mul);D(Mul)|" + "A(Input);B(Input);C(Zeta);D(Zeta)|" "A->C;A->D;B->C:1;B->D:1"); } // Test set 1: Conv2D + AddBias -// C=_MklConv2D(A,M,B,N); E=BiasAdd(C,D); Z=Sub(E,Y) (for interleaved ordering) -// C=_MklConv2D(A,B,M,N); E=BiasAdd(C,D); Z=Sub(E,Y) (for contiguous ordering) +// C=_MklConv2D(A,M,B,N); E=BiasAdd(C,D); Z=Zeta(E,Y) (for interleaved ordering) +// C=_MklConv2D(A,B,M,N); E=BiasAdd(C,D); Z=Zeta(E,Y) (for contiguous ordering) TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Positive) { CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS); InitGraph( @@ -166,18 +166,18 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Positive) { " 
attr { key: 'data_format' value { s: 'NCHW' } }" " input: ['C', 'D'] }" "node { name: 'Y' op: 'Input'}" - "node { name: 'Z' op: 'Sub'" + "node { name: 'Z' op: 'Zeta'" " attr {key: 'T' value { type: DT_FLOAT } }" " input: ['E', 'Y']}"); EXPECT_EQ(DoMklLayoutOptimizationPass(), "A(Input);B(Input);D(Input);DMT/_0(Const);E(_MklConv2DWithBias);" - "M(_MklInput);N(_MklInput);Y(Input);Z(Sub)|A->E;" + "M(_MklInput);N(_MklInput);Y(Input);Z(Zeta)|A->E;" "A:control->DMT/_0:control;B->E:1;D->E:2;DMT/_0->E:5;E->Z;M->E:3;" "N->E:4;Y->Z:1"); } -// C=_MklConv2D(A,M:1,B,N:1); E=BiasAdd(C,D); Z=Sub(E,Y) (for interleaved) -// C=_MklConv2D(A,B,M:1,N:1); E=BiasAdd(C,D); Z=Sub(E,Y) (for contiguous) +// C=_MklConv2D(A,M:1,B,N:1); E=BiasAdd(C,D); Z=Zeta(E,Y) (for interleaved) +// C=_MklConv2D(A,B,M:1,N:1); E=BiasAdd(C,D); Z=Zeta(E,Y) (for contiguous) // Test for correct output slots selected TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Positive1) { CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS); @@ -199,17 +199,17 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Positive1) { " attr { key: 'data_format' value { s: 'NCHW' } }" " input: ['C', 'D'] }" "node { name: 'Y' op: 'Input'}" - "node { name: 'Z' op: 'Sub'" + "node { name: 'Z' op: 'Zeta'" " attr {key: 'T' value { type: DT_FLOAT } }" " input: ['E', 'Y']}"); EXPECT_EQ(DoMklLayoutOptimizationPass(), "A(Input);B(Input);D(Input);DMT/_0(Const);E(_MklConv2DWithBias);" - "M(_MklInput2);N(_MklInput2);Y(Input);Z(Sub)|A->E;" + "M(_MklInput2);N(_MklInput2);Y(Input);Z(Zeta)|A->E;" "A:control->DMT/_0:control;B->E:1;D->E:2;DMT/_0->E:5;E->Z;" "M:1->E:3;N:1->E:4;Y->Z:1"); } -// C=Conv2D(A,B); E=BiasAdd(C,D); Z=Sub(E,Y); +// C=Conv2D(A,B); E=BiasAdd(C,D); Z=Zeta(E,Y); // This is a case of node rewrite followed by node merge. // We will first rewrite Conv2D to _MklConv2D, and then merge _MklConv2D // with BiasAdd to produce _MklConv2DWithBias. 
@@ -231,12 +231,12 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Positive2) { " attr { key: 'data_format' value { s: 'NCHW' } }" " input: ['C', 'D'] }" "node { name: 'Y' op: 'Input'}" - "node { name: 'Z' op: 'Sub'" + "node { name: 'Z' op: 'Zeta'" " attr {key: 'T' value { type: DT_FLOAT } }" " input: ['E', 'Y']}"); EXPECT_EQ(DoMklLayoutOptimizationPass(), "A(Input);B(Input);D(Input);DMT/_0(Const);DMT/_1(Const);" - "DMT/_2(Const);E(_MklConv2DWithBias);Y(Input);Z(Sub)|" + "DMT/_2(Const);E(_MklConv2DWithBias);Y(Input);Z(Zeta)|" "A->E;A:control->DMT/_0:control;A:control->DMT/_1:control;" "A:control->DMT/_2:control;B->E:1;D->E:2;DMT/_0->E:3;DMT/_1->E:4;" "DMT/_2->E:5;E->Z;Y->Z:1"); @@ -286,7 +286,7 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Negative_Dataflow1) { "M(_MklInput);N(_MklInput)|A->C;B->C:1;D->F;E->F:1;M->C:2;N->C:3"); } -// _MklConv2D has two outgoing edges: BiasAdd and some other dummy node (Add). +// _MklConv2D has two outgoing edges: BiasAdd and some other dummy node (Zeta). // Merge should not be done in such case. TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Negative_Dataflow2) { InitGraph( @@ -308,12 +308,12 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Negative_Dataflow2) { " attr { key: 'data_format' value { s: 'NCHW' } }" " input: ['D', 'E'] }" // Conv2D has two outputs. // No merge should happen. 
- "node { name: 'G' op: 'Add'" + "node { name: 'G' op: 'Zeta'" " attr { key: 'T' value { type: DT_FLOAT } }" " input: ['C', 'E'] }"); EXPECT_EQ(DoMklLayoutOptimizationPass(), "A(Input);B(Input);C(_MklConv2D);D(Input);E(Input);F(BiasAdd);" - "G(Add);M(_MklInput);N(_MklInput)|A->C;B->C:1;C->G;D->F;" + "G(Zeta);M(_MklInput);N(_MklInput)|A->C;B->C:1;C->G;D->F;" "E->F:1;E->G:1;M->C:2;N->C:3"); } @@ -362,7 +362,7 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_Positive) { " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }" " attr { key: 'padding' value { s: 'SAME' } }" " input: ['A', 'B', 'C', 'M', 'N', 'O']}" - "node { name: 'E' op: 'Sub'" + "node { name: 'E' op: 'Zeta'" " attr {key: 'T' value { type: DT_FLOAT } }" " input: ['D', 'A']}" "node { name: 'F' op: 'Int32Input'}" @@ -387,7 +387,7 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_Positive) { " input: ['E'] }"); EXPECT_EQ(DoMklLayoutOptimizationPass(), "A(Input);B(Input);C(Input);D(_MklConv2DWithBias);DMT/_0(Const);" - "E(Sub);F(Int32Input);G(_MklConv2DBackpropFilter);H(Int32Input);" + "E(Zeta);F(Int32Input);G(_MklConv2DBackpropFilter);H(Int32Input);" "I(_MklConv2DBackpropInput);J(_MklConv2DWithBiasBackpropBias);" "M(_MklInput);N(_MklInput);O(_MklInput)|A->D;A->E:1;A->G;B->D:1;" "B->I:1;C->D:2;D->E;DMT/_0->J:1;E->G:2;E->I:2;E->J;" @@ -413,7 +413,7 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_Negative1) { " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }" " attr { key: 'padding' value { s: 'SAME' } }" " input: ['A', 'B', 'C', 'M', 'N', 'O']}" - "node { name: 'E' op: 'Sub'" + "node { name: 'E' op: 'Zeta'" " attr {key: 'T' value { type: DT_FLOAT } }" " input: ['D', 'A']}" "node { name: 'F' op: 'Int32Input'}" @@ -438,7 +438,7 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_Negative1) { " input: ['E'] }"); EXPECT_EQ(DoMklLayoutOptimizationPass(), "A(Input);B(Input);C(Input);D(_MklConv2DWithBias);" - "E(Sub);F(Int32Input);G(_MklConv2DBackpropFilter);H(Int32Input);" + 
"E(Zeta);F(Int32Input);G(_MklConv2DBackpropFilter);H(Int32Input);" "I(_MklConv2DBackpropInput);J(BiasAddGrad);" "M(_MklInput);N(_MklInput);O(_MklInput)|A->D;A->E:1;A->G:2;B->D:1;" "B->I:1;C->D:2;D->E;E->G;E->I:2;E->J;F->G:1;H->I;M->D:3;M->G:3;" @@ -463,7 +463,7 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_Negative2) { " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }" " attr { key: 'padding' value { s: 'SAME' } }" " input: ['B', 'A', 'C', 'M', 'N', 'O']}" - "node { name: 'E' op: 'Sub'" + "node { name: 'E' op: 'Zeta'" " attr {key: 'T' value { type: DT_FLOAT } }" " input: ['D', 'A']}" "node { name: 'F' op: 'Int32Input'}" @@ -488,7 +488,7 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_Negative2) { " input: ['E'] }"); EXPECT_EQ(DoMklLayoutOptimizationPass(), "A(Input);B(Input);C(Input);D(_MklConv2DWithBias);" - "E(Sub);F(Int32Input);G(_MklConv2DBackpropFilter);H(Int32Input);" + "E(Zeta);F(Int32Input);G(_MklConv2DBackpropFilter);H(Int32Input);" "I(_MklConv2DBackpropInput);J(BiasAddGrad);" "M(_MklInput);N(_MklInput);O(_MklInput)|A->D:1;A->E:1;A->G;B->D;" "B->I:1;C->D:2;D->E;E->G:2;E->I:2;E->J;F->G:1;H->I;M->D:3;M->G:3;" @@ -512,7 +512,7 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_BpropFilter_Positive) { " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }" " attr { key: 'padding' value { s: 'SAME' } }" " input: ['A', 'B', 'C', 'M', 'N', 'O']}" - "node { name: 'E' op: 'Sub'" + "node { name: 'E' op: 'Zeta'" " attr {key: 'T' value { type: DT_FLOAT } }" " input: ['D', 'A']}" "node { name: 'F' op: 'Int32Input'}" @@ -529,7 +529,7 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_BpropFilter_Positive) { " input: ['E'] }"); EXPECT_EQ(DoMklLayoutOptimizationPass(), "A(Input);B(Input);C(Input);D(_MklConv2DWithBias);DMT/_0(Const);" - "E(Sub);F(Int32Input);G(_MklConv2DBackpropFilter);" + "E(Zeta);F(Int32Input);G(_MklConv2DBackpropFilter);" "H(_MklConv2DWithBiasBackpropBias);M(_MklInput);N(_MklInput);" 
"O(_MklInput)|A->D;A->E:1;A->G;B->D:1;C->D:2;D->E;DMT/_0->H:1;" "E->G:2;E->H;E:control->DMT/_0:control;F->G:1;M->D:3;M->G:3;" @@ -553,7 +553,7 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_BpropFilter_Negative1) { " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }" " attr { key: 'padding' value { s: 'SAME' } }" " input: ['A', 'B', 'C', 'M', 'N', 'O']}" - "node { name: 'E' op: 'Sub'" + "node { name: 'E' op: 'Zeta'" " attr {key: 'T' value { type: DT_FLOAT } }" " input: ['D', 'A']}" "node { name: 'F' op: 'Int32Input'}" @@ -570,7 +570,7 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_BpropFilter_Negative1) { " input: ['E'] }"); EXPECT_EQ(DoMklLayoutOptimizationPass(), "A(Input);B(Input);C(Input);D(_MklConv2DWithBias);" - "E(Sub);F(Int32Input);G(_MklConv2DBackpropFilter);H(BiasAddGrad);" + "E(Zeta);F(Int32Input);G(_MklConv2DBackpropFilter);H(BiasAddGrad);" "M(_MklInput);N(_MklInput);O(_MklInput)|A->D;A->E:1;A->G:2;B->D:1;" "C->D:2;D->E;E->G;E->H;F->G:1;M->D:3;M->G:3;N->D:4;N->G:4;O->D:5;" "O->G:5"); @@ -593,7 +593,7 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_BpropFilter_Negative2) { " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }" " attr { key: 'padding' value { s: 'SAME' } }" " input: ['B', 'A', 'C', 'M', 'N', 'O']}" - "node { name: 'E' op: 'Sub'" + "node { name: 'E' op: 'Zeta'" " attr {key: 'T' value { type: DT_FLOAT } }" " input: ['D', 'A']}" "node { name: 'F' op: 'Int32Input'}" @@ -610,7 +610,7 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_BpropFilter_Negative2) { " input: ['E'] }"); EXPECT_EQ(DoMklLayoutOptimizationPass(), "A(Input);B(Input);C(Input);D(_MklConv2DWithBias);" - "E(Sub);F(Int32Input);G(_MklConv2DBackpropFilter);H(BiasAddGrad);" + "E(Zeta);F(Int32Input);G(_MklConv2DBackpropFilter);H(BiasAddGrad);" "M(_MklInput);N(_MklInput);O(_MklInput)|A->D:1;A->E:1;A->G;B->D;" "C->D:2;D->E;E->G:2;E->H;F->G:1;M->D:3;M->G:3;N->D:4;N->G:4;O->D:5;" "O->G:5"); @@ -618,8 +618,8 @@ TEST_F(MklLayoutPassTest, 
NodeMerge_Conv2DBackprop_BpropFilter_Negative2) { // No _MklConv2DWithBias in context, but _MklConv2D in context. // No rewrite for BiasAddGrad should happen. -// C=_MklConv2D(A,M,B,N); D=Sub(C,A); E=BiasAddGrad(D) (for interleaved) -// C=_MklConv2D(A,B,M,N); D=Sub(C,A); E=BiasAddGrad(D) (for contiguous) +// C=_MklConv2D(A,M,B,N); D=Zeta(C,A); E=BiasAddGrad(D) (for interleaved) +// C=_MklConv2D(A,B,M,N); D=Zeta(C,A); E=BiasAddGrad(D) (for contiguous) TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_Neg_NoMklConv2DWithBias) { InitGraph( "node { name: 'A' op: 'Input'}" @@ -633,7 +633,7 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_Neg_NoMklConv2DWithBias) { " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }" " attr { key: 'padding' value { s: 'SAME' } }" " input: ['A', 'B', 'M', 'N']}" - "node { name: 'D' op: 'Sub'" + "node { name: 'D' op: 'Zeta'" " attr {key: 'T' value { type: DT_FLOAT } }" " input: ['C', 'A']}" "node { name: 'E' op: 'BiasAddGrad'" @@ -641,21 +641,21 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_Neg_NoMklConv2DWithBias) { " attr { key: 'data_format' value { s: 'NCHW' } }" " input: ['D'] }"); EXPECT_EQ(DoMklLayoutOptimizationPass(), - "A(Input);B(Input);C(_MklConv2D);D(Sub);E(BiasAddGrad);" + "A(Input);B(Input);C(_MklConv2D);D(Zeta);E(BiasAddGrad);" "M(_MklInput);N(_MklInput)|A->C;A->D:1;B->C:1;C->D;D->E;" "M->C:2;N->C:3"); } // No Conv2D in the context for BiasAddGrad. No rewrite should happen. 
-// C=Add(A,B); D=Sub(C,A); E=BiasAddGrad(D) +// C=Polygamma(A,B); D=Zeta(C,A); E=BiasAddGrad(D) TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_Negative_NoConv2D) { InitGraph( "node { name: 'A' op: 'Input'}" "node { name: 'B' op: 'Input'}" - "node { name: 'C' op: 'Add'" + "node { name: 'C' op: 'Polygamma'" " attr { key: 'T' value { type: DT_FLOAT } }" " input: ['A', 'B']}" - "node { name: 'D' op: 'Sub'" + "node { name: 'D' op: 'Zeta'" " attr {key: 'T' value { type: DT_FLOAT } }" " input: ['C', 'A']}" "node { name: 'E' op: 'BiasAddGrad'" @@ -663,13 +663,13 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_Negative_NoConv2D) { " attr { key: 'data_format' value { s: 'NCHW' } }" " input: ['D'] }"); EXPECT_EQ(DoMklLayoutOptimizationPass(), - "A(Input);B(Input);C(Add);D(Sub);E(BiasAddGrad)|" + "A(Input);B(Input);C(Polygamma);D(Zeta);E(BiasAddGrad)|" "A->C;A->D:1;B->C:1;C->D;D->E"); } // No Conv2D in the context for BiasAddGrad, but MatMul in context. // Rewrite should happen, but name of BiasAddGrad does not change. 
-// C=MatMul(A,B); D=Sub(C,A); E=BiasAddGrad(D) +// C=MatMul(A,B); D=Zeta(C,A); E=BiasAddGrad(D) TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_Negative_NoConv2D_MatMul) { InitGraph( "node { name: 'A' op: 'Input'}" @@ -679,7 +679,7 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_Negative_NoConv2D_MatMul) { " attr { key: 'transpose_a' value { b: false } }" " attr { key: 'transpose_b' value { b: false } }" " input: ['A', 'B']}" - "node { name: 'D' op: 'Sub'" + "node { name: 'D' op: 'Zeta'" " attr {key: 'T' value { type: DT_FLOAT } }" " input: ['C', 'A']}" "node { name: 'E' op: 'BiasAddGrad'" @@ -687,12 +687,12 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_Negative_NoConv2D_MatMul) { " attr { key: 'data_format' value { s: 'NCHW' } }" " input: ['D'] }"); EXPECT_EQ(DoMklLayoutOptimizationPass(), - "A(Input);B(Input);C(MatMul);D(Sub);E(BiasAddGrad)|" + "A(Input);B(Input);C(MatMul);D(Zeta);E(BiasAddGrad)|" "A->C;A->D:1;B->C:1;C->D;D->E"); } // Test set 3: MatMul..BiasAddGrad -> BiasAddGrad rewrite tests -// C=MatMul(A,B); D=Sub(C,A); E=BiasAddGrad(D) +// C=MatMul(A,B); D=Zeta(C,A); E=BiasAddGrad(D) TEST_F(MklLayoutPassTest, NodeMerge_MatMulBiasAddGrad_Positive) { InitGraph( "node { name: 'A' op: 'Input'}" @@ -702,7 +702,7 @@ TEST_F(MklLayoutPassTest, NodeMerge_MatMulBiasAddGrad_Positive) { " attr { key: 'transpose_a' value { b: false } }" " attr { key: 'transpose_b' value { b: false } }" " input: ['A', 'B']}" - "node { name: 'D' op: 'Sub'" + "node { name: 'D' op: 'Zeta'" " attr {key: 'T' value { type: DT_FLOAT } }" " input: ['C', 'A']}" "node { name: 'E' op: 'BiasAddGrad'" @@ -710,20 +710,20 @@ TEST_F(MklLayoutPassTest, NodeMerge_MatMulBiasAddGrad_Positive) { " attr { key: 'data_format' value { s: 'NCHW' } }" " input: ['D'] }"); EXPECT_EQ(DoMklLayoutOptimizationPass(), - "A(Input);B(Input);C(MatMul);D(Sub);E(BiasAddGrad)|" + "A(Input);B(Input);C(MatMul);D(Zeta);E(BiasAddGrad)|" "A->C;A->D:1;B->C:1;C->D;D->E"); } // No MatMul in the context for 
BiasAddGrad. No rewrite should happen. -// C=Add(A,B); D=Sub(C,A); E=BiasAddGrad(D) +// C=Polygamma(A,B); D=Zeta(C,A); E=BiasAddGrad(D) TEST_F(MklLayoutPassTest, NodeMerge_MatMulBiasAddGrad_Negative_NoMatMul) { InitGraph( "node { name: 'A' op: 'Input'}" "node { name: 'B' op: 'Input'}" - "node { name: 'C' op: 'Add'" + "node { name: 'C' op: 'Polygamma'" " attr { key: 'T' value { type: DT_FLOAT } }" " input: ['A', 'B']}" - "node { name: 'D' op: 'Sub'" + "node { name: 'D' op: 'Zeta'" " attr {key: 'T' value { type: DT_FLOAT } }" " input: ['C', 'A']}" "node { name: 'E' op: 'BiasAddGrad'" @@ -731,7 +731,7 @@ TEST_F(MklLayoutPassTest, NodeMerge_MatMulBiasAddGrad_Negative_NoMatMul) { " attr { key: 'data_format' value { s: 'NCHW' } }" " input: ['D'] }"); EXPECT_EQ(DoMklLayoutOptimizationPass(), - "A(Input);B(Input);C(Add);D(Sub);E(BiasAddGrad)|" + "A(Input);B(Input);C(Polygamma);D(Zeta);E(BiasAddGrad)|" "A->C;A->D:1;B->C:1;C->D;D->E"); } @@ -752,10 +752,10 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Conv2D_Basic) { " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }" " attr { key: 'padding' value { s: 'SAME' } }" " input: ['A', 'B']}" - "node { name: 'D' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }" + "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }" " input: ['B', 'C'] }"); EXPECT_EQ(DoMklLayoutOptimizationPass(), - "A(Input);B(Input);C(_MklConv2D);D(Mul);DMT/_0(Const);" + "A(Input);B(Input);C(_MklConv2D);D(Zeta);DMT/_0(Const);" "DMT/_1(Const)|A->C;A:control->DMT/_0:control;" "A:control->DMT/_1:control;B->C:1;B->D;C->D:1;DMT/_0->C:2;" "DMT/_1->C:3"); @@ -781,11 +781,11 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Conv2D_Positive1) { " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }" " attr { key: 'padding' value { s: 'SAME' } }" " input: ['A', 'C']}" - "node { name: 'E' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }" + "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }" " input: ['C', 'D'] }"); 
EXPECT_EQ(DoMklLayoutOptimizationPass(), "A(Input);B(Input);C(_MklConv2D);D(_MklConv2D);DMT/_0(Const);" - "DMT/_1(Const);DMT/_2(Const);E(Mul)|A->C;A->D;" + "DMT/_1(Const);DMT/_2(Const);E(Zeta)|A->C;A->D;" "A:control->DMT/_0:control;A:control->DMT/_1:control;" "A:control->DMT/_2:control;B->C:1;C->D:1;C->E;" "C:2->D:3;D->E:1;DMT/_0->C:2;DMT/_1->C:3;DMT/_2->D:2"); @@ -803,10 +803,10 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Conv2D_Negative_UnsupportedType) { " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }" " attr { key: 'padding' value { s: 'SAME' } }" " input: ['A', 'B']}" - "node { name: 'D' op: 'Mul' attr { key: 'T' value { type: DT_HALF } }" + "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_HALF } }" " input: ['B', 'C'] }"); EXPECT_EQ(DoMklLayoutOptimizationPass(), - "A(HalfInput);B(HalfInput);C(Conv2D);D(Mul)|" + "A(HalfInput);B(HalfInput);C(Conv2D);D(Zeta)|" "A->C;B->C:1;B->D;C->D:1"); } @@ -822,11 +822,11 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Conv2DGradFilter_Positive) { " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }" " attr { key: 'padding' value { s: 'SAME' } }" " input: ['A', 'B', 'C']}" - "node { name: 'E' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }" + "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }" " input: ['A', 'D'] }"); EXPECT_EQ(DoMklLayoutOptimizationPass(), "A(Input);B(Int32Input);C(Input);D(_MklConv2DBackpropFilter);" - "DMT/_0(Const);DMT/_1(Const);DMT/_2(Const);E(Mul)|" + "DMT/_0(Const);DMT/_1(Const);DMT/_2(Const);E(Zeta)|" "A->D;A->E;A:control->DMT/_0:control;A:control->DMT/_1:control;" "A:control->DMT/_2:control;B->D:1;C->D:2;D->E:1;DMT/_0->D:3;" "DMT/_1->D:4;DMT/_2->D:5"); @@ -844,11 +844,11 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Conv2DGradInput_Positive) { " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }" " attr { key: 'padding' value { s: 'SAME' } }" " input: ['B', 'A', 'C']}" - "node { name: 'E' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT 
} }" + "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }" " input: ['A', 'D'] }"); EXPECT_EQ(DoMklLayoutOptimizationPass(), "A(Input);B(Int32Input);C(Input);D(_MklConv2DBackpropInput);" - "DMT/_0(Const);DMT/_1(Const);DMT/_2(Const);E(Mul)|" + "DMT/_0(Const);DMT/_1(Const);DMT/_2(Const);E(Zeta)|" "A->D:1;A->E;B->D;B:control->DMT/_0:control;" "B:control->DMT/_1:control;B:control->DMT/_2:control;C->D:2;" "D->E:1;DMT/_0->D:3;DMT/_1->D:4;DMT/_2->D:5"); @@ -869,11 +869,11 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Concat_Basic) { " attr { key: 'T' value { type: DT_FLOAT } }" " attr { key: 'N' value { i: 2 } }" " input: ['A', 'B:0', 'B:1']}" - "node { name: 'E' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }" + "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }" " input: ['C', 'D'] }"); EXPECT_EQ(DoMklLayoutOptimizationPass(), "A(Const);B(InputList);C(Input);D(_MklConcat);DMT/_0(Const);" - "DMT/_1(Const);DMT/_2(Const);E(Mul)|A->D;A:control->DMT/_0:control;" + "DMT/_1(Const);DMT/_2(Const);E(Zeta)|A->D;A:control->DMT/_0:control;" "A:control->DMT/_1:control;A:control->DMT/_2:control;B->D:1;" "B:1->D:2;C->E;D->E:1;DMT/_0->D:3;DMT/_1->D:4;DMT/_2->D:5"); } @@ -908,12 +908,12 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Concat_Input_Mkl) { " attr { key: 'T' value { type: DT_FLOAT } }" " attr { key: 'N' value { i: 2 } }" " input: ['G', 'E', 'F']}" - "node { name: 'I' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }" + "node { name: 'I' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }" " input: ['A', 'H'] }"); EXPECT_EQ(DoMklLayoutOptimizationPass(), "A(Input);B(Input);C(Input);D(Input);DMT/_0(Const);DMT/_1(Const);" "DMT/_2(Const);DMT/_3(Const);DMT/_4(Const);E(_MklConv2D);" - "F(_MklConv2D);G(Const);H(_MklConcat);I(Mul)|A->E;A->I;" + "F(_MklConv2D);G(Const);H(_MklConcat);I(Zeta)|A->E;A->I;" "A:control->DMT/_2:control;A:control->DMT/_3:control;" "B->E:1;C->F;C:control->DMT/_0:control;C:control->DMT/_1:control;" 
"D->F:1;DMT/_0->F:2;DMT/_1->F:3;DMT/_2->E:2;DMT/_3->E:3;" @@ -935,7 +935,7 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Concat_Input_MixedMkl) { " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }" " attr { key: 'padding' value { s: 'SAME' } }" " input: ['A', 'B']}" - "node { name: 'F' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }" + "node { name: 'F' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }" " input: ['C', 'D']}" "node { name: 'G' op: 'Const' " " attr { key: 'dtype' value { type: DT_INT32 } }" @@ -946,12 +946,12 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Concat_Input_MixedMkl) { " attr { key: 'T' value { type: DT_FLOAT } }" " attr { key: 'N' value { i: 2 } }" " input: ['G', 'E', 'F']}" - "node { name: 'I' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }" + "node { name: 'I' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }" " input: ['A', 'H'] }"); EXPECT_EQ(DoMklLayoutOptimizationPass(), "A(Input);B(Input);C(Input);D(Input);DMT/_0(Const);DMT/_1(Const);" - "DMT/_2(Const);DMT/_3(Const);E(_MklConv2D);F(Mul);G(Const);" - "H(_MklConcat);I(Mul)|A->E;A->I;A:control->DMT/_0:control;" + "DMT/_2(Const);DMT/_3(Const);E(_MklConv2D);F(Zeta);G(Const);" + "H(_MklConcat);I(Zeta)|A->E;A->I;A:control->DMT/_0:control;" "A:control->DMT/_1:control;B->E:1;C->F;D->F:1;DMT/_0->E:2;" "DMT/_1->E:3;DMT/_2->H:3;DMT/_3->H:5;E->H:1;E:2->H:4;F->H:2;" "G->H;G:control->DMT/_2:control;G:control->DMT/_3:control;H->I:1"); @@ -973,11 +973,11 @@ TEST_F(MklLayoutPassTest, NodeRewrite_ConcatV2_Basic) { " attr { key: 'Tidx' value { type: DT_INT32 } }" " attr { key: 'N' value { i: 2 } }" " input: ['B:0', 'B:1', 'A']}" - "node { name: 'E' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }" + "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }" " input: ['C', 'D'] }"); EXPECT_EQ(DoMklLayoutOptimizationPass(), "A(Const);B(InputList);C(Input);D(_MklConcatV2);DMT/_0(Const);" - "DMT/_1(Const);DMT/_2(Const);E(Mul)|A->D:2;B->D;B:1->D:1;" + 
"DMT/_1(Const);DMT/_2(Const);E(Zeta)|A->D:2;B->D;B:1->D:1;" "B:control->DMT/_0:control;B:control->DMT/_1:control;" "B:control->DMT/_2:control;C->E;D->E:1;DMT/_0->D:3;" "DMT/_1->D:4;DMT/_2->D:5"); @@ -1014,12 +1014,12 @@ TEST_F(MklLayoutPassTest, NodeRewrite_ConcatV2_Input_Mkl) { " attr { key: 'Tidx' value { type: DT_INT32 } }" " attr { key: 'N' value { i: 2 } }" " input: ['E', 'F', 'G']}" - "node { name: 'I' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }" + "node { name: 'I' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }" " input: ['A', 'H'] }"); EXPECT_EQ(DoMklLayoutOptimizationPass(), "A(Input);B(Input);C(Input);D(Input);DMT/_0(Const);DMT/_1(Const);" "DMT/_2(Const);DMT/_3(Const);DMT/_4(Const);E(_MklConv2D);" - "F(_MklConv2D);G(Const);H(_MklConcatV2);I(Mul)|A->E;A->I;" + "F(_MklConv2D);G(Const);H(_MklConcatV2);I(Zeta)|A->E;A->I;" "A:control->DMT/_2:control;A:control->DMT/_3:control;B->E:1;C->F;" "C:control->DMT/_0:control;C:control->DMT/_1:control;" "D->F:1;DMT/_0->F:2;DMT/_1->F:3;DMT/_2->E:2;DMT/_3->E:3;" @@ -1041,7 +1041,7 @@ TEST_F(MklLayoutPassTest, NodeRewrite_ConcatV2_Input_MixedMkl) { " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }" " attr { key: 'padding' value { s: 'SAME' } }" " input: ['A', 'B']}" - "node { name: 'F' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }" + "node { name: 'F' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }" " input: ['C', 'D']}" "node { name: 'G' op: 'Const' " " attr { key: 'dtype' value { type: DT_INT32 } }" @@ -1053,12 +1053,12 @@ TEST_F(MklLayoutPassTest, NodeRewrite_ConcatV2_Input_MixedMkl) { " attr { key: 'Tidx' value { type: DT_INT32 } }" " attr { key: 'N' value { i: 2 } }" " input: ['E', 'F', 'G']}" - "node { name: 'I' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }" + "node { name: 'I' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }" " input: ['A', 'H'] }"); EXPECT_EQ(DoMklLayoutOptimizationPass(), "A(Input);B(Input);C(Input);D(Input);DMT/_0(Const);DMT/_1(Const);" - 
"DMT/_2(Const);DMT/_3(Const);E(_MklConv2D);F(Mul);G(Const);" - "H(_MklConcatV2);I(Mul)|A->E;A->I;A:control->DMT/_0:control;" + "DMT/_2(Const);DMT/_3(Const);E(_MklConv2D);F(Zeta);G(Const);" + "H(_MklConcatV2);I(Zeta)|A->E;A->I;A:control->DMT/_0:control;" "A:control->DMT/_1:control;B->E:1;C->F;D->F:1;DMT/_0->E:2;" "DMT/_1->E:3;DMT/_2->H:4;DMT/_3->H:5;E->H;E:2->H:3;" "E:control->DMT/_2:control;E:control->DMT/_3:control;F->H:1;" @@ -1071,10 +1071,10 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Relu_Positive) { "node { name: 'B' op: 'Relu'" " attr { key: 'T' value { type: DT_FLOAT } }" " input: ['A'] }" - "node { name: 'C' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }" + "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }" " input: ['A', 'B'] }"); EXPECT_EQ(DoMklLayoutOptimizationPass(), - "A(Input);B(_MklRelu);C(Mul);DMT/_0(Const)|A->B;A->C;" + "A(Input);B(_MklRelu);C(Zeta);DMT/_0(Const)|A->B;A->C;" "A:control->DMT/_0:control;B->C:1;DMT/_0->B:1"); } @@ -1085,10 +1085,10 @@ TEST_F(MklLayoutPassTest, NodeRewrite_ReluGrad_Positive) { "node { name: 'C' op: 'ReluGrad'" " attr { key: 'T' value { type: DT_FLOAT } }" " input: ['A', 'B'] }" - "node { name: 'D' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }" + "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }" " input: ['A', 'C'] }"); EXPECT_EQ(DoMklLayoutOptimizationPass(), - "A(Input);B(Input);C(_MklReluGrad);D(Mul);DMT/_0(Const);" + "A(Input);B(Input);C(_MklReluGrad);D(Zeta);DMT/_0(Const);" "DMT/_1(Const)|A->C;A->D;A:control->DMT/_0:control;" "A:control->DMT/_1:control;B->C:1;C->D:1;DMT/_0->C:2;DMT/_1->C:3"); } @@ -1102,10 +1102,10 @@ TEST_F(MklLayoutPassTest, NodeRewrite_ReluReluGrad_Positive) { "node { name: 'C' op: 'ReluGrad'" " attr { key: 'T' value { type: DT_FLOAT } }" " input: ['A', 'B'] }" - "node { name: 'D' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }" + "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }" " input: ['A', 'C'] }"); 
EXPECT_EQ(DoMklLayoutOptimizationPass(), - "A(Input);B(_MklRelu);C(_MklReluGrad);D(Mul);DMT/_0(Const);" + "A(Input);B(_MklRelu);C(_MklReluGrad);D(Zeta);DMT/_0(Const);" "DMT/_1(Const)|A->B;A->C;A->D;A:control->DMT/_0:control;" "A:control->DMT/_1:control;B->C:1;B:1->C:3;C->D:1;DMT/_0->B:1;" "DMT/_1->C:2"); @@ -1121,10 +1121,10 @@ TEST_F(MklLayoutPassTest, NodeRewrite_AvgPool_Positive) { " attr { key: 'padding' value { s: 'VALID' } }" " attr { key: 'strides' value { list: {i: 1, i:1, i:2, i:2} } }" " input: ['A'] }" - "node { name: 'C' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }" + "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }" " input: ['A', 'B'] }"); EXPECT_EQ(DoMklLayoutOptimizationPass(), - "A(Input);B(_MklAvgPool);C(Mul);DMT/_0(Const)|A->B;A->C;" + "A(Input);B(_MklAvgPool);C(Zeta);DMT/_0(Const)|A->B;A->C;" "A:control->DMT/_0:control;B->C:1;DMT/_0->B:1"); } @@ -1139,10 +1139,10 @@ TEST_F(MklLayoutPassTest, NodeRewrite_AvgPoolGrad_Positive) { " attr { key: 'padding' value { s: 'VALID' } }" " attr { key: 'strides' value { list: {i: 1, i:1, i:2, i:2} } }" " input: ['A', 'B'] }" - "node { name: 'D' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }" + "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }" " input: ['B', 'C'] }"); EXPECT_EQ(DoMklLayoutOptimizationPass(), - "A(Int32Input);B(Input);C(_MklAvgPoolGrad);D(Mul);DMT/_0(Const);" + "A(Int32Input);B(Input);C(_MklAvgPoolGrad);D(Zeta);DMT/_0(Const);" "DMT/_1(Const)|A->C;A:control->DMT/_0:control;" "A:control->DMT/_1:control;B->C:1;B->D;C->D:1;DMT/_0->C:2;" "DMT/_1->C:3"); @@ -1166,10 +1166,10 @@ TEST_F(MklLayoutPassTest, NodeRewrite_AvgPoolAvgPoolGrad_Positive) { " attr { key: 'padding' value { s: 'VALID' } }" " attr { key: 'strides' value { list: {i: 1, i:1, i:2, i:2} } }" " input: ['I', 'B'] }" - "node { name: 'D' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }" + "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }" " input: ['A', 
'C'] }"); EXPECT_EQ(DoMklLayoutOptimizationPass(), - "A(Input);B(_MklAvgPool);C(_MklAvgPoolGrad);D(Mul);DMT/_0(Const);" + "A(Input);B(_MklAvgPool);C(_MklAvgPoolGrad);D(Zeta);DMT/_0(Const);" "DMT/_1(Const);I(Int32Input)|A->B;A->D;A:control->DMT/_0:control;" "B->C:1;B:1->C:3;C->D:1;DMT/_0->B:1;DMT/_1->C:2;I->C;" "I:control->DMT/_1:control"); @@ -1188,12 +1188,12 @@ TEST_F(MklLayoutPassTest, NodeRewrite_FusedBatchNormGrad_Positive) { " attr { key: 'epsilon' value { f: 0.0001 } }" " attr { key: 'is_training' value { b: true } }" " input: ['A', 'B', 'C', 'D', 'E'] }" - "node { name: 'G' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }" + "node { name: 'G' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }" " input: ['A', 'F'] }"); EXPECT_EQ(DoMklLayoutOptimizationPass(), "A(Input);B(Input);C(Input);D(Input);DMT/_0(Const);DMT/_1(Const);" "DMT/_2(Const);DMT/_3(Const);DMT/_4(Const);E(Input);" - "F(_MklFusedBatchNormGrad);G(Mul)|A->F;A->G;" + "F(_MklFusedBatchNormGrad);G(Zeta)|A->F;A->G;" "A:control->DMT/_0:control;A:control->DMT/_1:control;" "A:control->DMT/_2:control;A:control->DMT/_3:control;" "A:control->DMT/_4:control;B->F:1;C->F:2;D->F:3;" @@ -1214,12 +1214,12 @@ TEST_F(MklLayoutPassTest, NodeRewrite_FusedBatchNorm_Positive) { " attr { key: 'epsilon' value { f: 0.0001 } }" " attr { key: 'is_training' value { b: true } }" " input: ['A', 'B', 'C', 'D', 'E'] }" - "node { name: 'G' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }" + "node { name: 'G' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }" " input: ['A', 'F'] }"); EXPECT_EQ(DoMklLayoutOptimizationPass(), "A(Input);B(Input);C(Input);D(Input);DMT/_0(Const);DMT/_1(Const);" "DMT/_2(Const);DMT/_3(Const);DMT/_4(Const);E(Input);" - "F(_MklFusedBatchNorm);G(Mul)|A->F;A->G;" + "F(_MklFusedBatchNorm);G(Zeta)|A->F;A->G;" "A:control->DMT/_0:control;A:control->DMT/_1:control;" "A:control->DMT/_2:control;A:control->DMT/_3:control;" "A:control->DMT/_4:control;B->F:1;C->F:2;D->F:3;" @@ -1268,12 +1268,12 @@ 
TEST_F(MklLayoutPassTest, MaxPoolLRN_Positive) { " attr { key: 'depth_radius' value { i: 2 } }" " input: ['E', 'F', 'B'] }" "node { name: 'H' op: 'Input'}" - "node { name: 'I' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }" + "node { name: 'I' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }" " input: ['H', 'G'] }"); EXPECT_EQ(DoMklLayoutOptimizationPass(), "A(Input);B(_MklLRN);C(_MklMaxPool);D(Input);DMT/_0(Const);DMT/_1(Const);" "DMT/_2(Const);E(_MklMaxPoolGrad);F(Input);G(_MklLRNGrad);H(Input);" - "I(Mul)|A->B;A:control->DMT/_0:control;B->C;B->E;B->G:2;B:1->G:3;" + "I(Zeta)|A->B;A:control->DMT/_0:control;B->C;B->E;B->G:2;B:1->G:3;" "B:2->C:1;B:2->E:4;B:2->G:6;B:3->G:7;B:control->DMT/_1:control;C->E:1;" "C:1->E:3;C:2->E:5;C:3->E:7;D->E:2;DMT/_0->B:1;DMT/_1->E:6;DMT/_2->G:5;" "E->G;E:1->G:4;E:control->DMT/_2:control;F->G:1;G->I:1;H->I"); @@ -1301,11 +1301,11 @@ TEST_F(MklLayoutPassTest, LRN_Positive) { " attr { key: 'data_format' value { s: 'NCHW' } }" " attr { key: 'depth_radius' value { i: 2 } }" " input: ['C', 'D', 'B'] }" - "node { name: 'F' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }" + "node { name: 'F' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }" " input: ['C', 'E'] }"); EXPECT_EQ(DoMklLayoutOptimizationPass(), "A(Input);B(_MklLRN);C(Input);D(Input);DMT/_0(Const);DMT/_1(Const);" - "DMT/_2(Const);E(_MklLRNGrad);F(Mul)|" + "DMT/_2(Const);E(_MklLRNGrad);F(Zeta)|" "A->B;A:control->DMT/_0:control;B->E:2;B:1->E:3;B:2->E:6;B:3->E:7;" "C->E;C->F;C:control->DMT/_1:control;C:control->DMT/_2:control;" "D->E:1;DMT/_0->B:1;DMT/_1->E:4;DMT/_2->E:5;E->F:1"); @@ -1323,10 +1323,10 @@ TEST_F(MklLayoutPassTest, LRN_Negative1) { " attr { key: 'data_format' value { s: 'NCHW' } }" " attr { key: 'depth_radius' value { i: 2 } }" " input: ['A'] }" - "node { name: 'C' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }" + "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }" " input: ['A', 'B'] }"); 
EXPECT_EQ(DoMklLayoutOptimizationPass(), - "A(Input);B(_MklLRN);C(Mul);DMT/_0(Const)|" + "A(Input);B(_MklLRN);C(Zeta);DMT/_0(Const)|" "A->B;A->C;A:control->DMT/_0:control;B->C:1;DMT/_0->B:1"); } @@ -1344,11 +1344,11 @@ TEST_F(MklLayoutPassTest, LRN_Negative2) { " attr { key: 'data_format' value { s: 'NCHW' } }" " attr { key: 'depth_radius' value { i: 2 } }" " input: ['A', 'B', 'C'] }" - "node { name: 'E' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }" + "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }" " input: ['A', 'D'] }"); EXPECT_EQ(DoMklLayoutOptimizationPass(), "A(Input);B(Input);C(Input);D(_MklLRNGrad);DMT/_0(Const);" - "DMT/_1(Const);DMT/_2(Const);DMT/_3(Const);DMT/_4(Const);E(Mul)|" + "DMT/_1(Const);DMT/_2(Const);DMT/_3(Const);DMT/_4(Const);E(Zeta)|" "A->D;A->E;A:control->DMT/_0:control;A:control->DMT/_1:control;" "A:control->DMT/_2:control;A:control->DMT/_3:control;" "A:control->DMT/_4:control;B->D:1;C->D:2;D->E:1;DMT/_0->D:3;" @@ -1386,12 +1386,12 @@ TEST_F(MklLayoutPassTest, LRN_Negative3) { " attr { key: 'data_format' value { s: 'NCHW' } }" " attr { key: 'depth_radius' value { i: 2 } }" " input: ['C', 'B', 'D'] }" - "node { name: 'G' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }" + "node { name: 'G' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }" " input: ['E', 'F'] }"); EXPECT_EQ(DoMklLayoutOptimizationPass(), "A(Input);B(_MklLRN);C(Input);D(Input);DMT/_0(Const);DMT/_1(Const);" "DMT/_2(Const);DMT/_3(Const);DMT/_4(Const);DMT/_5(Const);" - "DMT/_6(Const);E(_MklLRNGrad);F(_MklLRNGrad);G(Mul)|A->B;" + "DMT/_6(Const);E(_MklLRNGrad);F(_MklLRNGrad);G(Zeta)|A->B;" "A:control->DMT/_0:control;B->E:2;" "B->F:1;B:1->E:3;B:2->E:6;B:2->F:5;B:3->E:7;C->E;C->F;" "C:control->DMT/_1:control;C:control->DMT/_2:control;" @@ -1421,11 +1421,11 @@ TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Positive) { " attr { key: 'padding' value { s: 'VALID' } }" " attr { key: 'strides' value { list: {i: 1, i:1, i:2, i:2} } }" " input: ['C', 
'B', 'D'] }" - "node { name: 'F' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }" + "node { name: 'F' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }" " input: ['C', 'E'] }"); EXPECT_EQ(DoMklLayoutOptimizationPass(), "A(Input);B(_MklMaxPool);C(Input);D(Input);DMT/_0(Const);" - "DMT/_1(Const);DMT/_2(Const);E(_MklMaxPoolGrad);F(Mul)|" + "DMT/_1(Const);DMT/_2(Const);E(_MklMaxPoolGrad);F(Zeta)|" "A->B;A:control->DMT/_0:control;B->E:1;B:1->E:3;B:2->E:5;B:3->E:7;" "C->E;C->F;C:control->DMT/_1:control;C:control->DMT/_2:control;" "D->E:2;DMT/_0->B:1;DMT/_1->E:4;DMT/_2->E:6;E->F:1"); @@ -1444,10 +1444,10 @@ TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative1) { " attr { key: 'padding' value { s: 'VALID' } }" " attr { key: 'strides' value { list: {i: 1, i:1, i:2, i:2} } }" " input: ['A'] }" - "node { name: 'C' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }" + "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }" " input: ['A', 'B'] }"); EXPECT_EQ(DoMklLayoutOptimizationPass(), - "A(Input);B(_MklMaxPool);C(Mul);DMT/_0(Const)|" + "A(Input);B(_MklMaxPool);C(Zeta);DMT/_0(Const)|" "A->B;A->C;A:control->DMT/_0:control;B->C:1;DMT/_0->B:1"); } @@ -1466,11 +1466,11 @@ TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative2) { " attr { key: 'padding' value { s: 'VALID' } }" " attr { key: 'strides' value { list: {i: 1, i:1, i:2, i:2} } }" " input: ['A', 'B', 'C'] }" - "node { name: 'E' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }" + "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }" " input: ['A', 'D'] }"); EXPECT_EQ(DoMklLayoutOptimizationPass(), "A(Input);B(Input);C(Input);D(_MklMaxPoolGrad);DMT/_0(Const);" - "DMT/_1(Const);DMT/_2(Const);DMT/_3(Const);DMT/_4(Const);E(Mul)|" + "DMT/_1(Const);DMT/_2(Const);DMT/_3(Const);DMT/_4(Const);E(Zeta)|" "A->D;A->E;A:control->DMT/_0:control;A:control->DMT/_1:control;" "A:control->DMT/_2:control;A:control->DMT/_3:control;" 
"A:control->DMT/_4:control;B->D:1;C->D:2;D->E:1;DMT/_0->D:3;" @@ -1489,10 +1489,10 @@ TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative3) { " attr { key: 'padding' value { s: 'VALID' } }" " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }" " input: ['A'] }" - "node { name: 'C' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }" + "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }" " input: ['A', 'B'] }"); EXPECT_EQ(DoMklLayoutOptimizationPass(), - "A(Input);B(MaxPool);C(Mul)|A->B;A->C;B->C:1"); + "A(Input);B(MaxPool);C(Zeta)|A->B;A->C;B->C:1"); } // Test MaxPool handling for batch-wise pooling (NCHW) @@ -1507,10 +1507,10 @@ TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative4) { " attr { key: 'padding' value { s: 'VALID' } }" " attr { key: 'strides' value { list: {i: 2, i:1, i:1, i:1} } }" " input: ['A'] }" - "node { name: 'C' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }" + "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }" " input: ['A', 'B'] }"); EXPECT_EQ(DoMklLayoutOptimizationPass(), - "A(Input);B(MaxPool);C(Mul)|A->B;A->C;B->C:1"); + "A(Input);B(MaxPool);C(Zeta)|A->B;A->C;B->C:1"); } // Test MaxPool handling for depth-wise pooling (NHWC) @@ -1525,10 +1525,10 @@ TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative5) { " attr { key: 'padding' value { s: 'VALID' } }" " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }" " input: ['A'] }" - "node { name: 'C' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }" + "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }" " input: ['A', 'B'] }"); EXPECT_EQ(DoMklLayoutOptimizationPass(), - "A(Input);B(MaxPool);C(Mul)|A->B;A->C;B->C:1"); + "A(Input);B(MaxPool);C(Zeta)|A->B;A->C;B->C:1"); } // Test MaxPool handling for depth-wise pooling (NCHW) @@ -1543,10 +1543,10 @@ TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative6) { " attr { key: 'padding' value { s: 'VALID' } }" " attr { key: 'strides' value { list: 
{i: 1, i:2, i:1, i:1} } }" " input: ['A'] }" - "node { name: 'C' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }" + "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }" " input: ['A', 'B'] }"); EXPECT_EQ(DoMklLayoutOptimizationPass(), - "A(Input);B(MaxPool);C(Mul)|A->B;A->C;B->C:1"); + "A(Input);B(MaxPool);C(Zeta)|A->B;A->C;B->C:1"); } // Test MaxPool handling for batch-wise pooling (NHWC) @@ -1561,10 +1561,10 @@ TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative7) { " attr { key: 'padding' value { s: 'VALID' } }" " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }" " input: ['A'] }" - "node { name: 'C' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }" + "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }" " input: ['A', 'B'] }"); EXPECT_EQ(DoMklLayoutOptimizationPass(), - "A(Input);B(MaxPool);C(Mul)|A->B;A->C;B->C:1"); + "A(Input);B(MaxPool);C(Zeta)|A->B;A->C;B->C:1"); } // Test MaxPool handling for batch-wise pooling (NHWC) @@ -1579,10 +1579,10 @@ TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative8) { " attr { key: 'padding' value { s: 'VALID' } }" " attr { key: 'strides' value { list: {i: 2, i:1, i:1, i:1} } }" " input: ['A'] }" - "node { name: 'C' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }" + "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }" " input: ['A', 'B'] }"); EXPECT_EQ(DoMklLayoutOptimizationPass(), - "A(Input);B(MaxPool);C(Mul)|A->B;A->C;B->C:1"); + "A(Input);B(MaxPool);C(Zeta)|A->B;A->C;B->C:1"); } // Test MaxPool handling for depth-wise pooling (NHWC) @@ -1597,10 +1597,10 @@ TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative9) { " attr { key: 'padding' value { s: 'VALID' } }" " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }" " input: ['A'] }" - "node { name: 'C' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }" + "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }" " input: ['A', 'B'] }"); 
EXPECT_EQ(DoMklLayoutOptimizationPass(), - "A(Input);B(MaxPool);C(Mul)|A->B;A->C;B->C:1"); + "A(Input);B(MaxPool);C(Zeta)|A->B;A->C;B->C:1"); } // Test MaxPool handling for depth-wise pooling (NHWC) @@ -1615,10 +1615,10 @@ TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative10) { " attr { key: 'padding' value { s: 'VALID' } }" " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:2} } }" " input: ['A'] }" - "node { name: 'C' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }" + "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }" " input: ['A', 'B'] }"); EXPECT_EQ(DoMklLayoutOptimizationPass(), - "A(Input);B(MaxPool);C(Mul)|A->B;A->C;B->C:1"); + "A(Input);B(MaxPool);C(Zeta)|A->B;A->C;B->C:1"); } ///////////////////////////////////////////////////////////////////// @@ -1636,10 +1636,10 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Conv2D_DeviceTest) { " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }" " attr { key: 'padding' value { s: 'SAME' } }" " input: ['A', 'B']}" - "node { name: 'D' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }" + "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }" " input: ['B', 'C'] }", kGPUDevice); EXPECT_EQ(DoMklLayoutOptimizationPass(), - "A(Input);B(Input);C(Conv2D);D(Mul)|A->C;B->C:1;B->D;C->D:1"); + "A(Input);B(Input);C(Conv2D);D(Zeta)|A->C;B->C:1;B->D;C->D:1"); } TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_DeviceTest) { @@ -1657,7 +1657,7 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_DeviceTest) { " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }" " attr { key: 'padding' value { s: 'SAME' } }" " input: ['A', 'B', 'C', 'M', 'N', 'O']}" - "node { name: 'E' op: 'Sub'" + "node { name: 'E' op: 'Zeta'" " attr {key: 'T' value { type: DT_FLOAT } }" " input: ['D', 'A']}" "node { name: 'F' op: 'BiasAddGrad'" @@ -1666,7 +1666,7 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_DeviceTest) { " input: ['E'] }", kGPUDevice); 
EXPECT_EQ(DoMklLayoutOptimizationPass(), "A(Input);B(Input);C(Input);D(_MklConv2DWithBias);" - "E(Sub);F(BiasAddGrad);M(_MklInput);N(_MklInput);" + "E(Zeta);F(BiasAddGrad);M(_MklInput);N(_MklInput);" "O(_MklInput)|A->D;A->E:1;B->D:1;C->D:2;D->E;E->F;" "M->D:3;N->D:4;O->D:5"); } @@ -1683,10 +1683,10 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Conv2DGradFilter_DeviceTest) { " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }" " attr { key: 'padding' value { s: 'SAME' } }" " input: ['A', 'B', 'C']}" - "node { name: 'E' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }" + "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }" " input: ['A', 'D'] }", kGPUDevice); EXPECT_EQ(DoMklLayoutOptimizationPass(), - "A(Input);B(Int32Input);C(Input);D(Conv2DBackpropFilter);E(Mul)|" + "A(Input);B(Int32Input);C(Input);D(Conv2DBackpropFilter);E(Zeta)|" "A->D;A->E;B->D:1;C->D:2;D->E:1"); } @@ -1696,10 +1696,10 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Relu_DeviceTest) { "node { name: 'B' op: 'Relu'" " attr { key: 'T' value { type: DT_FLOAT } }" " input: ['A'] }" - "node { name: 'C' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }" + "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }" " input: ['A', 'B'] }", kGPUDevice); EXPECT_EQ(DoMklLayoutOptimizationPass(), - "A(Input);B(Relu);C(Mul)|A->B;A->C;B->C:1"); + "A(Input);B(Relu);C(Zeta)|A->B;A->C;B->C:1"); } TEST_F(MklLayoutPassTest, NodeRewrite_ReluGrad_DeviceTest) { @@ -1709,10 +1709,10 @@ TEST_F(MklLayoutPassTest, NodeRewrite_ReluGrad_DeviceTest) { "node { name: 'C' op: 'ReluGrad'" " attr { key: 'T' value { type: DT_FLOAT } }" " input: ['A', 'B'] }" - "node { name: 'D' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }" + "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }" " input: ['A', 'C'] }", kGPUDevice); EXPECT_EQ(DoMklLayoutOptimizationPass(), - "A(Input);B(Input);C(ReluGrad);D(Mul)|A->C;A->D;B->C:1;C->D:1"); + 
"A(Input);B(Input);C(ReluGrad);D(Zeta)|A->C;A->D;B->C:1;C->D:1"); } TEST_F(MklLayoutPassTest, NodeRewrite_MaxPool_DeviceTest) { @@ -1725,10 +1725,10 @@ TEST_F(MklLayoutPassTest, NodeRewrite_MaxPool_DeviceTest) { " attr { key: 'padding' value { s: 'VALID' } }" " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }" " input: ['A'] }" - "node { name: 'C' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }" + "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }" " input: ['A', 'B'] }", kGPUDevice); EXPECT_EQ(DoMklLayoutOptimizationPass(), - "A(Input);B(MaxPool);C(Mul)|A->B;A->C;B->C:1"); + "A(Input);B(MaxPool);C(Zeta)|A->B;A->C;B->C:1"); } TEST_F(MklLayoutPassTest, NodeRewrite_AvgPool_DeviceTest) { @@ -1741,10 +1741,10 @@ TEST_F(MklLayoutPassTest, NodeRewrite_AvgPool_DeviceTest) { " attr { key: 'padding' value { s: 'VALID' } }" " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }" " input: ['A'] }" - "node { name: 'C' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }" + "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }" " input: ['A', 'B'] }", kGPUDevice); EXPECT_EQ(DoMklLayoutOptimizationPass(), - "A(Input);B(AvgPool);C(Mul)|A->B;A->C;B->C:1"); + "A(Input);B(AvgPool);C(Zeta)|A->B;A->C;B->C:1"); } // Concat Op test: Concat with no Mkl layer feeding it @@ -1762,10 +1762,10 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Concat_DeviceTest) { " attr { key: 'T' value { type: DT_FLOAT } }" " attr { key: 'N' value { i: 2 } }" " input: ['A', 'B:0', 'B:1']}" - "node { name: 'E' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }" + "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }" " input: ['C', 'D'] }", kGPUDevice); EXPECT_EQ(DoMklLayoutOptimizationPass(), - "A(Const);B(InputList);C(Input);D(Concat);E(Mul)|A->D;" + "A(Const);B(InputList);C(Input);D(Concat);E(Zeta)|A->D;" "B->D:1;B:1->D:2;C->E;D->E:1"); } @@ -1784,10 +1784,10 @@ TEST_F(MklLayoutPassTest, NodeRewrite_ConcatV2_DeviceTest) { " attr 
{ key: 'Tidx' value { type: DT_INT32 } }" " attr { key: 'N' value { i: 2 } }" " input: ['B:0', 'B:1', 'A']}" - "node { name: 'E' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }" + "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }" " input: ['C', 'D'] }", kGPUDevice); EXPECT_EQ(DoMklLayoutOptimizationPass(), - "A(Const);B(InputList);C(Input);D(ConcatV2);E(Mul)|" + "A(Const);B(InputList);C(Input);D(ConcatV2);E(Zeta)|" "A->D:2;B->D;B:1->D:1;C->E;D->E:1"); } @@ -1804,11 +1804,11 @@ TEST_F(MklLayoutPassTest, NodeRewrite_FusedBatchNorm_DeviceTest) { " attr { key: 'epsilon' value { f: 0.0001 } }" " attr { key: 'is_training' value { b: true } }" " input: ['A', 'B', 'C', 'D', 'E'] }" - "node { name: 'G' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }" + "node { name: 'G' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }" " input: ['A', 'F'] }", kGPUDevice); EXPECT_EQ(DoMklLayoutOptimizationPass(), "A(Input);B(Input);C(Input);D(Input);E(Input);" - "F(FusedBatchNorm);G(Mul)|A->F;A->G;B->F:1;C->F:2;D->F:3;" + "F(FusedBatchNorm);G(Zeta)|A->F;A->G;B->F:1;C->F:2;D->F:3;" "E->F:4;F->G:1"); } @@ -1832,12 +1832,12 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_DeviceTest) { " attr { key: 'data_format' value { s: 'NCHW' } }" " input: ['C', 'D'] }" "node { name: 'Y' op: 'Input'}" - "node { name: 'Z' op: 'Sub'" + "node { name: 'Z' op: 'Zeta'" " attr {key: 'T' value { type: DT_FLOAT } }" " input: ['E', 'Y']}", kGPUDevice); EXPECT_EQ(DoMklLayoutOptimizationPass(), "A(Input);B(Input);C(_MklConv2D);D(Input);E(BiasAdd);" - "M(_MklInput);N(_MklInput);Y(Input);Z(Sub)|A->C;" + "M(_MklInput);N(_MklInput);Y(Input);Z(Zeta)|A->C;" "B->C:1;C->E;D->E:1;E->Z;M->C:2;N->C:3;Y->Z:1"); } @@ -1853,7 +1853,7 @@ static void BM_MklLayoutRewritePass(int iters, int op_nodes) { random::SimplePhilox rnd(&philox); for (int op = 0; op < op_nodes; op++) { s += strings::Printf( - "node { name: 'op%04d' op: 'Mul' attr { key: 'T' value { " + "node { name: 'op%04d' op: 'Zeta' 
attr { key: 'T' value { " "type: DT_FLOAT } } input: ['in%04d', 'in%04d' ] }", op, rnd.Uniform(10), rnd.Uniform(10)); } diff --git a/tensorflow/core/graph/mkl_tfconversion_pass.cc b/tensorflow/core/graph/mkl_tfconversion_pass.cc index 590b3d030f..3f8b0e86d0 100644 --- a/tensorflow/core/graph/mkl_tfconversion_pass.cc +++ b/tensorflow/core/graph/mkl_tfconversion_pass.cc @@ -64,6 +64,15 @@ namespace tensorflow { // in the Mkl format. Non-compliant ops accept inputs and outputs in the // TensorFlow format. // +// ADDENDUM: For element-wise ops, we may or may not need a conversion to +// take place before we hit the op. For this, we add a new op before each +// element-wise MKL op to deal with the inputs, called _MklInputConversion. +// This pass has been enhanced to add this capability. +// +// The _MklInputConversion op will check the inputs to the elementwise op and +// make sure that either both are in MKL format or both are in TF format, +// depending on their initial state and whether broadcast is needed or not. + class MklToTfConversionPass : public GraphOptimizationPass { public: MklToTfConversionPass() {} @@ -87,6 +96,16 @@ class MklToTfConversionPass : public GraphOptimizationPass { return mkl_op_registry::IsMklOp(op_name, T); } + // Is the input Op supported by Mkl-specific layout AND + // is it element-wise? + // + // @input op_name string of the op + // @input T Datatype to use for checking input op + // @return true if op is Mkl supported; false, otherwise. + inline bool IsMklElementWiseOp(const string& op_name, DataType T) const { + return mkl_op_registry::IsMklElementWiseOp(op_name, T); + } + // Insert layout conversion node on the edge pointed by 'e' from graph 'g'. // // Edge will be deleted once a call to this function is successful. @@ -96,6 +115,17 @@ class MklToTfConversionPass : public GraphOptimizationPass { // @return Success:OK() if insertion is successful, otherwise returns // appropriate error status code. 
Status InsertConversionNodeOnEdge(std::unique_ptr<Graph>* g, Edge*); + + // For element-wise ops, we need to sanitize the inputs. For this, we add a + // new node at the input of the replacement element-wise node that checks + // the inputs and converts one/both of them as required. See the op code + // comments for details. + // + // Insert input conversion node as parent of 'n' from graph 'g'. + // + // @return Success:OK() if insertion is successful, otherwise returns + // appropriate error status code. + Status InsertInputConversionNode(std::unique_ptr<Graph>* g, Node*); }; // We register MklToTf insertion for phase 2 in post-partition grouping @@ -171,6 +201,92 @@ Status MklToTfConversionPass::InsertConversionNodeOnEdge( return Status::OK(); } +Status MklToTfConversionPass::InsertInputConversionNode( + std::unique_ptr<Graph>* g, Node* n) { + CHECK_NOTNULL(n); + + // Get the input nodes and edges + std::vector<const Edge*> edges; + TF_CHECK_OK(n->input_edges(&edges)); + if (edges.size() != 4) { + return Status(error::Code::INVALID_ARGUMENT, + "MKL Binary Element-wise op should have exactly 2 data" + " inputs and 2 metadata inputs"); + } + + // Sanity check: ensure that both inputs are of the expected type, and the + // same type as input type + CHECK_EQ(BaseType(edges[0]->src()->output_type(edges[0]->src_output())), + BaseType(edges[1]->src()->output_type(edges[1]->src_output()))); + CHECK_EQ(BaseType(edges[0]->src()->output_type(edges[0]->src_output())), + BaseType(n->input_type(0))); + + // Check ordering of edges + for (uint i = 0; i < 4; i++) { + CHECK_EQ((edges[i]->dst_input() == i), true); + } + + // Build the conversion node and specify src as input. 
+ Node* conversion_node = nullptr; + + TF_CHECK_OK( + NodeBuilder((*g)->NewName("MklInputConversion"), "_MklInputConversion") + .Input(edges[0]->src(), edges[0]->src_output()) + .Input(edges[1]->src(), edges[1]->src_output()) + .Input(edges[2]->src(), edges[2]->src_output()) + .Input(edges[3]->src(), edges[3]->src_output()) + .Device(n->def().device()) + .Attr("T", n->input_type(0)) + .Finalize(&**g, &conversion_node)); + + CHECK_NOTNULL(conversion_node); + + // Change the destination of any control edges to the InputConversion node + if (edges.size() != n->in_edges().size()) { + std::vector<const Edge*> edges_to_remove; + for (const Edge* e : n->in_edges()) { + if (e->IsControlEdge()) { + CHECK_NOTNULL((*g)->AddControlEdge(e->src(), conversion_node)); + edges_to_remove.push_back(e); + } + } + for (const Edge* e : edges_to_remove) { + (*g)->RemoveEdge(e); + } + } + + string data_format; + if (GetNodeAttr(edges[0]->src()->def(), "data_format", &data_format) == + Status::OK()) { + conversion_node->AddAttr("data_format", data_format); + } + + // Get assigned device from destination node and apply it to conversion node. + // We want conversion node to be on the same device as the destination node. + conversion_node->set_assigned_device_name(n->assigned_device_name()); + + // Set the Mkl op label for this op. + conversion_node->AddAttr("_kernel", mkl_op_registry::kMklOpLabel); + + // Now that we have added edges from src->conversion_node, let's add edge from + // output of conversion_node to the element-wise node. 
+ CHECK_NOTNULL((*g)->AddEdge(conversion_node, 0, n, edges[0]->dst_input())); + CHECK_NOTNULL((*g)->AddEdge(conversion_node, 1, n, edges[1]->dst_input())); + CHECK_NOTNULL((*g)->AddEdge(conversion_node, 2, n, edges[2]->dst_input())); + CHECK_NOTNULL((*g)->AddEdge(conversion_node, 3, n, edges[3]->dst_input())); + + VLOG(1) << "MklToTfConversionPass - InputConversion: Inserting input " + << "conversion node on: " << n->type_string() << " successful."; + + // Remove src->dst edge now. + (*g)->RemoveEdge(edges[0]); + (*g)->RemoveEdge(edges[1]); + (*g)->RemoveEdge(edges[2]); + (*g)->RemoveEdge(edges[3]); + + return Status::OK(); +} + bool MklToTfConversionPass::RunPass(std::unique_ptr<Graph>* g) { bool result = false; @@ -239,6 +355,49 @@ bool MklToTfConversionPass::RunPass(std::unique_ptr<Graph>* g) { DumpGraph("After MklToTfConversionPass", &**g); + //--------------------------------------------------------------------------- + // Check all nodes and add an input-conversion-node if the node is an mkl + // element-wise node. + VLOG(1) << "Before running MklToTfConversionPass - InputConversion"; + + std::vector<Node*> candidate_nodes; + std::vector<Node*> order; + GetReversePostOrder(**g, &order); // This will give us topological sort. + + for (Node* n : order) { + // If node is not an op or it does not have a datatype, then skip. 
+ DataType datatype; + if (!n->IsOp() || (GetNodeAttr(n->def(), "T", &datatype) != Status::OK())) { + continue; + } + if (IsMklElementWiseOp(n->type_string(), datatype)) { + // If the input node is an input-conversion op, skip + Node* input_node = nullptr; + TF_CHECK_OK(n->input_node(0, &input_node)); + DataType input_datatype; + if ((GetNodeAttr(n->def(), "T", &input_datatype) == Status::OK()) && + (input_node->type_string().compare("_MklInputConversion") == 0)) { + continue; + } + + VLOG(1) << "MklToTfConversionPass: InputConversion: Scheduled node " + << n->name() << " for inserting input conversion node"; + candidate_nodes.push_back(const_cast<Node*>(n)); + } + } + + // Process all candidate edges and insert conversion nodes on them. + for (Node* n : candidate_nodes) { + // Even if we insert conversion node on a single node, we + // need to return true. + if (InsertInputConversionNode(g, n) == Status::OK()) { + VLOG(1) << "MklToTfConversionPass: Inserted conversion " + << "on node " << n->name(); + result = true; + } + } + DumpGraph("After MklToTfConversionPass - InputConversion", &**g); + // We need to return true even if we insert one conversion node // anywhere in the graph. 
return result; diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index b6d7e3b4b2..cff6e30c04 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -2340,7 +2340,10 @@ tf_kernel_library( tf_kernel_library( name = "svd_op", prefix = "svd_op", - deps = LINALG_DEPS, + deps = LINALG_DEPS + if_cuda([ + ":cuda_solvers", + ":transpose_functor", + ]), ) cc_library( @@ -2938,7 +2941,7 @@ tf_kernel_library( "//tensorflow/core:framework", "//tensorflow/core:lib", "//tensorflow/core:nn_ops_op_lib", - ], + ] + if_cuda(["@cub_archive//:cub"]), ) tf_kernel_library( @@ -5502,6 +5505,22 @@ tf_mkl_kernel_library( ) tf_mkl_kernel_library( + name = "mkl_input_conversion_op", + hdrs = ["mkl_tfconv_op.h"], + prefix = "mkl_input_conversion", + deps = [ + ":bounds_check", + ":ops_util", + "//tensorflow/core:core_cpu", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:lib_internal", + "//tensorflow/core:nn_ops_op_lib", + "//third_party/mkl:intel_binary_blob", + ], +) + +tf_mkl_kernel_library( name = "mkl_pooling_ops", srcs = [ "mkl_avgpooling_op.cc", @@ -5544,6 +5563,14 @@ tf_mkl_kernel_library( ) tf_mkl_kernel_library( + name = "mkl_aggregate_ops", + prefix = "mkl_aggregate_ops", + deps = MATH_DEPS + [ + "//third_party/mkl:intel_binary_blob", + ], +) + +tf_mkl_kernel_library( name = "mkl_concat_op", prefix = "mkl_concat_op", deps = ARRAY_DEPS + [ @@ -5575,6 +5602,20 @@ tf_mkl_kernel_library( ], ) +tf_mkl_kernel_library( + name = "mkl_cwise_ops_common", + hdrs = [ + "cwise_ops.h", + "cwise_ops_common.h", + "cwise_ops_gradients.h", + ], + prefix = "mkl_cwise_ops_common", + deps = NN_DEPS + [ + "cwise_op", + "//third_party/mkl:intel_binary_blob", + ], +) + cc_library( name = "dataset", srcs = ["dataset.cc"], diff --git a/tensorflow/core/kernels/bias_op_gpu.cu.cc b/tensorflow/core/kernels/bias_op_gpu.cu.cc index ddc2d457b0..42f3db1d79 100644 --- a/tensorflow/core/kernels/bias_op_gpu.cu.cc +++ 
b/tensorflow/core/kernels/bias_op_gpu.cu.cc @@ -173,15 +173,20 @@ __global__ void BiasGradNCHW_SharedAtomics(const T* output_backprop, // Accumulate the results in the shared memory into the first element. // No syncthreads is needed since this is only in the same warp. int32 thread_index = threadIdx.x; - if (thread_index < 16) s_data[thread_index] += s_data[thread_index + 16]; - if (thread_index < 8) s_data[thread_index] += s_data[thread_index + 8]; - if (thread_index < 4) s_data[thread_index] += s_data[thread_index + 4]; - if (thread_index < 2) s_data[thread_index] += s_data[thread_index + 2]; - if (thread_index < 1) s_data[thread_index] += s_data[thread_index + 1]; - - // The first thread writes out the accumulated result to the global location. - if (thread_index == 0) { - CudaAtomicAdd(bias_backprop + bias_index, T(s_data[0])); + if (thread_index < 16) { + s_data[thread_index] += s_data[thread_index + 16]; + __syncwarp(0xFFFF); + if (thread_index < 8) s_data[thread_index] += s_data[thread_index + 8]; + __syncwarp(0xFF); + if (thread_index < 4) s_data[thread_index] += s_data[thread_index + 4]; + __syncwarp(0xF); + if (thread_index < 2) s_data[thread_index] += s_data[thread_index + 2]; + __syncwarp(0x3); + if (thread_index == 0) { + T val = T(s_data[0] + s_data[1]); + // The first thread writes out the accumulated result to global location. + CudaAtomicAdd(bias_backprop + bias_index, val); + } } } diff --git a/tensorflow/core/kernels/cuda_solvers.cc b/tensorflow/core/kernels/cuda_solvers.cc index 5c6b5eec82..43197d8cf4 100644 --- a/tensorflow/core/kernels/cuda_solvers.cc +++ b/tensorflow/core/kernels/cuda_solvers.cc @@ -174,7 +174,7 @@ Status CudaSolver::CopyLapackInfoToHostAsync( } info_checker_callback(status, host_lapack_infos); }; - + auto cb = std::bind(wrapped_info_checker_callback, context_, std::move(info_checker_callback), std::move(host_lapack_infos)); @@ -188,6 +188,7 @@ Status CudaSolver::CopyLapackInfoToHostAsync( // numeric types. 
#define TF_CALL_LAPACK_TYPES(m) \ m(float, S) m(double, D) m(std::complex<float>, C) m(std::complex<double>, Z) +#define TF_CALL_LAPACK_TYPES_NO_COMPLEX(m) m(float, S) m(double, D) // Macros to construct cusolverDn method names. #define DN_SOLVER_FN(method, lapack_prefix) cusolverDn##lapack_prefix##method @@ -327,6 +328,41 @@ static inline Status GetrsImpl(SolverFnT solver, OpKernelContext* context, TF_CALL_LAPACK_TYPES(GETRS_INSTANCE); +template <typename Scalar, typename BufSizeFnT, typename SolverFnT> +static inline Status GesvdImpl(BufSizeFnT bufsize, SolverFnT solver, + OpKernelContext* context, + cusolverDnHandle_t cusolver_dn_handle, + signed char jobu, signed char jobvt, int m, + int n, Scalar* A, int lda, Scalar* S, Scalar* U, + int ldu, Scalar* VT, int ldvt, + int* dev_lapack_info) { + /* Get amount of workspace memory required. */ + int lwork; + TF_RETURN_IF_CUSOLVER_ERROR(bufsize(cusolver_dn_handle, m, n, &lwork)); + /* Allocate device memory for workspace. */ + ScratchSpace<Scalar> dev_workspace(context, lwork, /* on_host */ false); + /* Launch the solver kernel. 
*/ + TF_RETURN_IF_CUSOLVER_ERROR(solver( + cusolver_dn_handle, jobu, jobvt, m, n, CUDAComplex(A), lda, S, + CUDAComplex(U), ldu, CUDAComplex(VT), ldvt, + CUDAComplex(dev_workspace.mutable_data()), lwork, NULL, dev_lapack_info)); + return Status::OK(); +} + +#define GESVD_INSTANCE(Scalar, lapack_prefix) \ + template <> \ + Status CudaSolver::Gesvd<Scalar>( \ + signed char jobu, signed char jobvt, int m, int n, Scalar* dev_A, \ + int lda, Scalar* dev_S, Scalar* dev_U, int ldu, Scalar* dev_VT, \ + int ldvt, int* dev_lapack_info) const { \ + return GesvdImpl(DN_BUFSIZE_FN(gesvd, lapack_prefix), \ + DN_SOLVER_FN(gesvd, lapack_prefix), context_, \ + cusolver_dn_handle_, jobu, jobvt, m, n, dev_A, lda, \ + dev_S, dev_U, ldu, dev_VT, ldvt, dev_lapack_info); \ + } + +TF_CALL_LAPACK_TYPES_NO_COMPLEX(GESVD_INSTANCE); + //============================================================================= // Wrappers of cuBlas computational methods begin here. // diff --git a/tensorflow/core/kernels/cuda_solvers.h b/tensorflow/core/kernels/cuda_solvers.h index 0fd6450f98..7cbdc895dd 100644 --- a/tensorflow/core/kernels/cuda_solvers.h +++ b/tensorflow/core/kernels/cuda_solvers.h @@ -258,13 +258,23 @@ class CudaSolver { Status Syevd(cusolverEigMode_t jobz, cublasFillMode_t uplo, int n, Scalar* dev_A, int lda, Scalar* dev_W, int* dev_lapack_info) const; +*/ // Singular value decomposition. // See: http://docs.nvidia.com/cuda/cusolver/#cuds-lt-t-gt-gesvd template <typename Scalar> Status Gesvd(signed char jobu, signed char jobvt, int m, int n, Scalar* dev_A, - int lda, Scalar* dev_S, Scalar* dev_U, int ldu, Scalar* dev_VT, - int ldvt, int* dev_lapack_info); - */ + int lda, Scalar* dev_S, Scalar* dev_U, int ldu, Scalar* dev_VT, + int ldvt, int* dev_lapack_info) const; + /* + // Batched linear solver using LU factorization from getrfBatched. 
+ // See: + http://docs.nvidia.com/cuda/cublas/index.html#cublas-lt-t-gt-getrsbatched + template <typename Scalar> + Status GetrsBatched(cublasOperation_t trans, int n, int nrhs, + const Scalar* dev_Aarray[], int lda, const int* devIpiv, + Scalar* dev_Barray[], int ldb, int* info, int batch_size) + const; + */ private: OpKernelContext* context_; // not owned. diff --git a/tensorflow/core/kernels/cwise_ops.h b/tensorflow/core/kernels/cwise_ops.h index d935331904..ada39eae38 100644 --- a/tensorflow/core/kernels/cwise_ops.h +++ b/tensorflow/core/kernels/cwise_ops.h @@ -139,7 +139,7 @@ struct scalar_left : private Binary { typedef Tout result_type; const Tin* left; - EIGEN_DEVICE_FUNC inline scalar_left(const scalar_left& other) = default; + inline scalar_left(const scalar_left& other) = default; template <typename... Args> EIGEN_DEVICE_FUNC inline explicit scalar_left(const Tin* c, Args... args) @@ -169,7 +169,7 @@ struct scalar_right : private Binary { typedef Tout result_type; const Tin* right; - EIGEN_DEVICE_FUNC inline scalar_right(const scalar_right& other) = default; + inline scalar_right(const scalar_right& other) = default; template <typename... Args> EIGEN_DEVICE_FUNC inline explicit scalar_right(const Tin* c, Args... 
args) diff --git a/tensorflow/core/kernels/cwise_ops_common.cc b/tensorflow/core/kernels/cwise_ops_common.cc index 192a4f732e..693c6467ac 100644 --- a/tensorflow/core/kernels/cwise_ops_common.cc +++ b/tensorflow/core/kernels/cwise_ops_common.cc @@ -20,7 +20,9 @@ namespace tensorflow { BinaryOpShared::BinaryOpShared(OpKernelConstruction* ctx, DataType out, DataType in) : OpKernel(ctx) { +#ifndef INTEL_MKL OP_REQUIRES_OK(ctx, ctx->MatchSignature({in, in}, {out})); +#endif } void BinaryOpShared::SetUnimplementedError(OpKernelContext* ctx) { diff --git a/tensorflow/core/kernels/decode_raw_op.cc b/tensorflow/core/kernels/decode_raw_op.cc index 9492a4e26d..1c0085cfea 100644 --- a/tensorflow/core/kernels/decode_raw_op.cc +++ b/tensorflow/core/kernels/decode_raw_op.cc @@ -105,6 +105,7 @@ REGISTER(Eigen::half); REGISTER(float); REGISTER(double); REGISTER(int32); +REGISTER(uint16); REGISTER(uint8); REGISTER(int16); REGISTER(int8); diff --git a/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc b/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc index fcfcd188d2..ecfe51d599 100644 --- a/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc +++ b/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc @@ -22,6 +22,7 @@ limitations under the License. #include "tensorflow/core/platform/types.h" #include "tensorflow/core/util/cuda_kernel_helper.h" #include "tensorflow/core/util/tensor_format.h" +#include "external/cub_archive/cub/util_ptx.cuh" #if !defined(_MSC_VER) #define UNROLL _Pragma("unroll") @@ -1015,6 +1016,21 @@ __global__ void __launch_bounds__(640, 2) } } +// Device function to compute sub-warp sum reduction for a power-of-two group of +// neighboring threads. +template<int kWidth, typename T> +__device__ __forceinline__ T WarpSumReduce(T val) { + // support only power-of-two widths. 
+ assert(__popc(kWidth) == 1); + int sub_warp = cub::LaneId() / kWidth; + int zeros = sub_warp * kWidth; + unsigned mask = ((1UL << kWidth) - 1) << zeros; + for (int delta = kWidth / 2; delta > 0; delta /= 2) { + val += CudaShuffleXor(mask, val, delta); + } + return val; +} + // CUDA kernel to compute the depthwise convolution backward w.r.t. filter in // NHWC format, tailored for small images up to 32x32. Stride and depth // multiplier must be 1. Padding must be 'SAME'. Only use this kernel if @@ -1127,6 +1143,7 @@ __launch_bounds__(1024, 2) void DepthwiseConv2dBackpropFilterGPUKernelNHWCSmall( // Note: the condition to reach this is uniform across the entire block. __syncthreads(); + unsigned active_threads = CudaBallot(CUDA_WARP_ALL, depth_in_range); if (depth_in_range) { const T* const out_ptr = inout_offset + output; @@ -1140,7 +1157,7 @@ __launch_bounds__(1024, 2) void DepthwiseConv2dBackpropFilterGPUKernelNHWCSmall( T val = out1 * tile_ptr[0] + out2 * tile_ptr[tile_offset]; // Warp-accumulate pixels of the same depth and write to accumulator. for (int delta = 16; delta >= kBlockSlices; delta /= 2) { - val += CudaShuffleDown(val, delta); + val += CudaShuffleDown(active_threads, val, delta); } if (!(thread_idx & 32 - kBlockSlices) /* lane_idx < kBlockSlices */) { *accum_ptr = val; @@ -1164,9 +1181,7 @@ __launch_bounds__(1024, 2) void DepthwiseConv2dBackpropFilterGPUKernelNHWCSmall( if (filter_depth < in_depth) { T val = accum_data[i]; // Warp-accumulate the pixels of the same depth from the accumulator. - for (int delta = kAccumPixels / 2; delta > 0; delta /= 2) { - val += CudaShuffleDown(val, delta); - } + val = WarpSumReduce<kAccumPixels>(val); if (!(thread_idx & kAccumPixels - 1)) { CudaAtomicAdd(filter_offset + filter, val); } @@ -1382,6 +1397,7 @@ __launch_bounds__(1024, 2) void DepthwiseConv2dBackpropFilterGPUKernelNCHWSmall( // Note: the condition to reach this is uniform across the entire block. 
__syncthreads(); + unsigned active_threads = CudaBallot(CUDA_WARP_ALL, slice_in_range); if (slice_in_range) { const T* const out_ptr = inout_offset + output; @@ -1395,7 +1411,7 @@ __launch_bounds__(1024, 2) void DepthwiseConv2dBackpropFilterGPUKernelNCHWSmall( T val = out1 * tile_ptr[0] + out2 * tile_ptr[tile_offset]; // Warp-accumulate pixels of the same depth and write to accumulator. for (int delta = 16 / kBlockSlices; delta > 0; delta /= 2) { - val += CudaShuffleDown(val, delta); + val += CudaShuffleDown(active_threads, val, delta); } if (!(thread_idx & 32 / kBlockSlices - 1)) { *accum_ptr = val; @@ -1419,9 +1435,7 @@ __launch_bounds__(1024, 2) void DepthwiseConv2dBackpropFilterGPUKernelNCHWSmall( if (filter_depth < in_depth) { T val = accum_data[i]; // Warp-accumulate pixels of the same depth from the accumulator. - for (int delta = kAccumPixels / 2; delta > 0; delta /= 2) { - val += CudaShuffleDown(val, delta); - } + val = WarpSumReduce<kAccumPixels>(val); if (!(thread_idx & kAccumPixels - 1)) { CudaAtomicAdd(filter_offset + filter, val); } diff --git a/tensorflow/core/kernels/fill_functor.cc b/tensorflow/core/kernels/fill_functor.cc index 8a0a558eef..ea0cc139f3 100644 --- a/tensorflow/core/kernels/fill_functor.cc +++ b/tensorflow/core/kernels/fill_functor.cc @@ -20,6 +20,7 @@ limitations under the License. 
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/core/framework/tensor_types.h" #include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/variant_encode_decode.h" namespace tensorflow { namespace functor { @@ -50,6 +51,7 @@ DEFINE_SETZERO_CPU(int32); DEFINE_SETZERO_CPU(int64); DEFINE_SETZERO_CPU(complex64); DEFINE_SETZERO_CPU(complex128); +DEFINE_SETZERO_CPU(Variant); #undef DEFINE_SETZERO_CPU #ifdef TENSORFLOW_USE_SYCL diff --git a/tensorflow/core/kernels/mkl_aggregate_ops.cc b/tensorflow/core/kernels/mkl_aggregate_ops.cc new file mode 100644 index 0000000000..51ba127def --- /dev/null +++ b/tensorflow/core/kernels/mkl_aggregate_ops.cc @@ -0,0 +1,273 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// See docs in ../ops/math_ops.cc. 
+ +#ifdef INTEL_MKL +#define EIGEN_USE_THREADS + +#include <numeric> + +#include "tensorflow/core/framework/numeric_op.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/lib/gtl/inlined_vector.h" +#include "tensorflow/core/platform/logging.h" + +#include "mkl_dnn.h" +#include "mkl_dnn_types.h" +#include "tensorflow/core/util/mkl_util.h" + +namespace tensorflow { + +typedef Eigen::ThreadPoolDevice CPUDevice; + +template <typename Device, typename T> +class MklAddNOp : public OpKernel { + public: + explicit MklAddNOp(OpKernelConstruction* context) : OpKernel(context) {} + + void Compute(OpKernelContext* ctx) override { + const int num = ctx->num_inputs(); + OP_REQUIRES(ctx, num / 2 == 2, + errors::InvalidArgument("Only additions of two arguments " + "supported by MKL. Num inputs: ", + num)); + + MklAddNOpContext mkl_context; + const Tensor& input0 = MklGetInput(ctx, 0); + GetMklShape(ctx, 0, &(mkl_context.input1_shape)); + bool input1_in_mkl_format = mkl_context.input1_shape.IsMklTensor(); + + const Tensor& input1 = MklGetInput(ctx, 1); + GetMklShape(ctx, 1, &(mkl_context.input2_shape)); + bool input2_in_mkl_format = mkl_context.input2_shape.IsMklTensor(); + + mkl_context.in_dims = input1_in_mkl_format + ? mkl_context.input1_shape.GetDimension() + : input0.dims(); + mkl_context.in_dims = input2_in_mkl_format + ? mkl_context.input2_shape.GetDimension() + : input1.dims(); + // Generate size, stride for input if input is in MKL format. 
+ ExtractMklOpParams(&mkl_context.in1_sizes, + &mkl_context.in1_strides, input0, &mkl_context.input1_shape); + ExtractMklOpParams(&mkl_context.in2_sizes, + &mkl_context.in2_strides, input1, &mkl_context.input2_shape); + + std::vector<float> coeff(2, 1.0); + mkl_context.MklCreateInputLayouts(ctx); + CHECK_EQ(dnnSumCreate_F32(&mkl_context.Eltwise, mkl_context.attributes, 2, + mkl_context.lt_input1, &coeff[0]), + E_SUCCESS); + + Tensor mkl_tmp_input1_buf_tensor, mkl_tmp_input2_buf_tensor; + mkl_context.MklPrepareAddNInputs(ctx, &mkl_tmp_input1_buf_tensor, + &mkl_tmp_input2_buf_tensor); + Tensor* output = nullptr; + if (input1_in_mkl_format || input2_in_mkl_format) { + TensorShape tf_shape; + mkl_context.output_shape.SetMklTensor(true); + mkl_context.output_shape.SetMklLayout(mkl_context.Eltwise, dnnResourceDst); + + mkl_context.output_shape.SetTfLayout( + mkl_context.in_dims, mkl_context.in1_sizes, mkl_context.in1_strides); + if (input1_in_mkl_format == true) { + mkl_context.output_shape.SetTfDimOrder(mkl_context.in_dims, + mkl_context.input1_shape.GetTfToMklDimMap()); + } else { + mkl_context.output_shape.SetTfDimOrder(mkl_context.in_dims, + mkl_context.input2_shape.GetTfToMklDimMap()); + } + tf_shape.AddDim(dnnLayoutGetMemorySize_F32(static_cast<dnnLayout_t>( + mkl_context.output_shape.GetMklLayout())) / + sizeof(T)); + + AllocateOutputSetMklShape(ctx, 0, &output, tf_shape, + mkl_context.output_shape); + } else { + const TensorShape& o_shape = input1.shape(); + mkl_context.output_shape.SetMklTensor(false); + AllocateOutputSetMklShape(ctx, 0, &output, o_shape, + mkl_context.output_shape); + } + + mkl_context.Eltwise_res[dnnResourceDst] = + static_cast<void*>(output->flat<T>().data()); + + // Execute the MKL sum primitive + CHECK_EQ(dnnExecute_F32(mkl_context.Eltwise, mkl_context.Eltwise_res), + E_SUCCESS); + + mkl_context.MklCleanup(); + } + + void ExtractMklOpParams(size_t** out_sizes, size_t** out_strides, + const Tensor& input, const MklShape* input_shape) { + bool 
 input_in_mkl_format = input_shape->IsMklTensor(); + int in_dims = input_in_mkl_format + ? input_shape->GetDimension() + : input.dims(); + size_t* in_sizes = new size_t[in_dims]; + size_t* in_strides = new size_t[in_dims]; + + if (input_in_mkl_format) { + for (int i = 0; i < in_dims; i++) { + in_sizes[i] = input_shape->GetSizes()[i]; + in_strides[i] = input_shape->GetStrides()[i]; + } + } else { + for (int i = 0; i < in_dims; i++) { + in_sizes[i] = + input.dim_size((in_dims - 1) - i); + } + in_strides[0] = 1; + for (int i = 1; i < in_dims; i++) { + in_strides[i] = + in_strides[i - 1] * in_sizes[i - 1]; + } + } + *out_sizes = in_sizes; + *out_strides = in_strides; + } + + + private: + typedef struct { + int in_dims; + size_t* in1_sizes; + size_t* in1_strides; + + size_t* in2_sizes; + size_t* in2_strides; + dnnPrimitive_t Eltwise = nullptr; + dnnPrimitiveAttributes_t attributes = nullptr; + void* Eltwise_res[dnnResourceNumber]; + dnnLayout_t lt_input1 = nullptr, lt_input2 = nullptr; + MklShape input1_shape, input2_shape, output_shape; + + void MklCreateInputLayouts(OpKernelContext* context) { + bool input1_in_mkl_format = input1_shape.IsMklTensor(); + if (!input1_in_mkl_format) { + CHECK_EQ( + dnnLayoutCreate_F32(&lt_input1, in_dims, in1_sizes, in1_strides), + E_SUCCESS); + } else { + lt_input1 = static_cast<dnnLayout_t>(input1_shape.GetCurLayout()); + } + + bool input2_in_mkl_format = input2_shape.IsMklTensor(); + if (!input2_in_mkl_format) { + CHECK_EQ( + dnnLayoutCreate_F32(&lt_input2, in_dims, in2_sizes, in2_strides), + E_SUCCESS); + } else { + lt_input2 = static_cast<dnnLayout_t>(input2_shape.GetCurLayout()); + } + } + + void MklPrepareAddNInputs(OpKernelContext* context, + Tensor* mkl_tmp_input1_buf_tensor, + Tensor* mkl_tmp_input2_buf_tensor) { + bool mkl_convert_input1, mkl_convert_input2; + dnnPrimitive_t mkl_prim_convert_input1 = nullptr, + mkl_prim_convert_input2 = nullptr; + dnnLayout_t mkl_lt_internal_input1 = nullptr, + mkl_lt_internal_input2 = nullptr; + 
void *mkl_buf_convert_input1 = nullptr, *mkl_buf_convert_input2 = nullptr; + dnnResourceType_t dnnResourceMultipleSrc2 = + (dnnResourceType_t)(dnnResourceMultipleSrc + 1); + // Compare with internal layouts and convert if needed + const Tensor& input1 = MklGetInput(context, 0); + + void* mkl_buf_input1 = + const_cast<void*>(static_cast<const void*>(input1.flat<T>().data())); + + CHECK_EQ(dnnLayoutCreateFromPrimitive_F32( + &mkl_lt_internal_input1, Eltwise, dnnResourceMultipleSrc), + E_SUCCESS); + mkl_convert_input1 = + !dnnLayoutCompare_F32(mkl_lt_internal_input1, lt_input1); + if (mkl_convert_input1) { + CHECK_EQ(dnnConversionCreate_F32(&mkl_prim_convert_input1, lt_input1, + mkl_lt_internal_input1), + E_SUCCESS); + AllocTmpBuffer(context, mkl_tmp_input1_buf_tensor, + mkl_lt_internal_input1, &mkl_buf_convert_input1); + CHECK_EQ( + dnnConversionExecute_F32(mkl_prim_convert_input1, mkl_buf_input1, + mkl_buf_convert_input1), + E_SUCCESS); + dnnDelete_F32(mkl_prim_convert_input1); + } + dnnLayoutDelete_F32(mkl_lt_internal_input1); + + Eltwise_res[dnnResourceMultipleSrc] = + (mkl_convert_input1) ? 
mkl_buf_convert_input1 : mkl_buf_input1; + + const Tensor& input2 = MklGetInput(context, 1); + void* mkl_buf_input2 = + const_cast<void*>(static_cast<const void*>(input2.flat<T>().data())); + CHECK_EQ(dnnLayoutCreateFromPrimitive_F32( + &mkl_lt_internal_input2, Eltwise, dnnResourceMultipleSrc2), + E_SUCCESS); + mkl_convert_input2 = + !dnnLayoutCompare_F32(mkl_lt_internal_input2, lt_input2); + if (mkl_convert_input2) { + CHECK_EQ(dnnConversionCreate_F32(&mkl_prim_convert_input2, lt_input2, + mkl_lt_internal_input2), + E_SUCCESS); + AllocTmpBuffer(context, mkl_tmp_input2_buf_tensor, + mkl_lt_internal_input2, &mkl_buf_convert_input2); + CHECK_EQ( + dnnConversionExecute_F32(mkl_prim_convert_input2, mkl_buf_input2, + mkl_buf_convert_input2), + E_SUCCESS); + dnnDelete_F32(mkl_prim_convert_input2); + } + dnnLayoutDelete_F32(mkl_lt_internal_input2); + + Eltwise_res[dnnResourceMultipleSrc2] = + (mkl_convert_input2) ? mkl_buf_convert_input2 : mkl_buf_input2; + } + + void MklCleanup() { + bool input1_in_mkl_format = input1_shape.IsMklTensor(); + bool input2_in_mkl_format = input2_shape.IsMklTensor(); + dnnDelete_F32(Eltwise); + if (!input1_in_mkl_format) { + dnnLayoutDelete_F32(lt_input1); + delete [] in1_sizes; + delete [] in1_strides; + } + if (!input2_in_mkl_format) { + dnnLayoutDelete_F32(lt_input2); + delete [] in2_sizes; + delete [] in2_strides; + } + } + } MklAddNOpContext; +}; + +#define REGISTER_MKL_CPU(T) \ + REGISTER_KERNEL_BUILDER(Name("_MklAddN") \ + .Device(DEVICE_CPU) \ + .TypeConstraint<T>("T") \ + .Label(mkl_op_registry::kMklOpLabel), \ + MklAddNOp<CPUDevice, T>); + +TF_CALL_float(REGISTER_MKL_CPU); +#undef REGISTER_MKL_CPU +} // namespace tensorflow +#endif // INTEL_MKL diff --git a/tensorflow/core/kernels/mkl_conv_ops.cc b/tensorflow/core/kernels/mkl_conv_ops.cc index 5dfce5d5c6..7f1555d325 100644 --- a/tensorflow/core/kernels/mkl_conv_ops.cc +++ b/tensorflow/core/kernels/mkl_conv_ops.cc @@ -406,8 +406,10 @@ class MklConv2DOp : public OpKernel { 
CHECK_EQ(dnnConversionCreate_F32(&mkl_prim_convert_filter, lt_filter, mkl_lt_internal_filter), E_SUCCESS); + mkl_buf_convert_filter = const_cast<void*>( static_cast<const void*>(output_filter->flat<T>().data())); + CHECK_EQ( dnnConversionExecute_F32(mkl_prim_convert_filter, mkl_buf_filter, mkl_buf_convert_filter), diff --git a/tensorflow/core/kernels/mkl_cwise_ops_common.cc b/tensorflow/core/kernels/mkl_cwise_ops_common.cc new file mode 100644 index 0000000000..7fc633c254 --- /dev/null +++ b/tensorflow/core/kernels/mkl_cwise_ops_common.cc @@ -0,0 +1,88 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0(the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifdef INTEL_MKL + +// See docs in ../ops/math_ops.cc. 
+ +#define EIGEN_USE_THREADS +#include <iostream> +#include <vector> + +#include "tensorflow/core/kernels/cwise_ops_common.h" + +#include "tensorflow/core/util/mkl_util.h" + +namespace tensorflow { + +typedef Eigen::ThreadPoolDevice CPUDevice; + +template <typename Device, typename Functor> +class MklBinaryOp : public BinaryOp<Device, Functor> { + public: + explicit MklBinaryOp(OpKernelConstruction* context) + : BinaryOp<Device, Functor>(context) {} + + void Compute(OpKernelContext* context) override { + auto in0 = context->input(0); + auto in1 = context->input(1); + VLOG(1) << "Shapes (start mklbinaryop compute): " + << in0.shape().DebugString() << " _and_ " + << in1.shape().DebugString(); + + // Call the TensorFlow BinaryOp Compute method + BinaryOp<Device, Functor>::Compute(context); + + auto out = context->mutable_output(0); + VLOG(1) << "Shapes (output): " << out->shape().DebugString(); + + // Pass input shape through to output shape + ForwardMklMetaDataInToOut(context, 0, 0); + + out = context->mutable_output(0); + VLOG(1) << "Shapes (output): " << out->shape().DebugString(); + } +}; + +//---------- Registration macros for various element-wise ops ----------- +// We will need to redefine "REGISTER" to include the mkl_op_registry flag +#pragma push_macro("REGISTER") +#undef REGISTER +#define REGISTER(OP, D, N, F, T) \ + REGISTER_KERNEL_BUILDER(Name(N) \ + .Device(DEVICE_##D) \ + .TypeConstraint<T>("T") \ + .Label(mkl_op_registry::kMklOpLabel), \ + OP<D##Device, F<T>>); + +REGISTER5(MklBinaryOp, CPU, "_MklAdd", functor::add, float, Eigen::half, double, + int32, int64); +REGISTER7(MklBinaryOp, CPU, "_MklSub", functor::sub, float, Eigen::half, double, + int32, int64, complex64, complex128); +REGISTER5(MklBinaryOp, CPU, "_MklMul", functor::mul, float, Eigen::half, double, + uint8, int32); +REGISTER5(MklBinaryOp, CPU, "_MklMaximum", functor::maximum, float, Eigen::half, + double, int32, int64); +REGISTER5(MklBinaryOp, CPU, "_MklSquaredDifference", + 
functor::squared_difference, float, Eigen::half, double, int32, + int64); + +#undef REGISTER +#pragma pop_macro("REGISTER") +//----------------------------------------------------------------------- + +} // end namespace tensorflow + +#endif // INTEL_MKL diff --git a/tensorflow/core/kernels/mkl_identity_op.cc b/tensorflow/core/kernels/mkl_identity_op.cc index ca20294a26..f31e7afd46 100644 --- a/tensorflow/core/kernels/mkl_identity_op.cc +++ b/tensorflow/core/kernels/mkl_identity_op.cc @@ -41,9 +41,9 @@ class MklIdentityOp : public OpKernel { bool input_in_mkl_format = mkl_shape_input.IsMklTensor(); if (input_in_mkl_format) { - ForwarMklTensorInToOut(context, 0, 0); + ForwardMklTensorInToOut(context, 0, 0); } else { - FowardTfTensorInToOut(context, 0, 0); + ForwardTfTensorInToOut(context, 0, 0); } } diff --git a/tensorflow/core/kernels/mkl_input_conversion_op.cc b/tensorflow/core/kernels/mkl_input_conversion_op.cc new file mode 100644 index 0000000000..b58e44e398 --- /dev/null +++ b/tensorflow/core/kernels/mkl_input_conversion_op.cc @@ -0,0 +1,259 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifdef INTEL_MKL + +#include <algorithm> +#include <vector> +#include "tensorflow/core/framework/numeric_op.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/platform/cpu_info.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/util/tensor_format.h" + +#include "tensorflow/core/kernels/mkl_tfconv_op.h" +#include "tensorflow/core/util/mkl_util.h" + +namespace tensorflow { +typedef Eigen::ThreadPoolDevice CPUDevice; + +/////////////////////////////////////////////////////////// +// Op kernel +// Checks and ensures that the 2 inputs are compatible for mkl binary ops. +// Here's the basic logic: +// +// if both inputs are in TF format: +// pass the inputs through to the output +// else if both inputs are in mkl format: +// if both have the same shape: +// pass the inputs through to the output +// else: +// convert both to TF +// else if one is TF and one is MKL: +// if broadcast is needed: +// convert the MKL format input to TF format +// else: +// convert the TF format input to MKL format +/////////////////////////////////////////////////////////// + +template <typename Device, typename T> +class MklInputConversionOp : public OpKernel { + public: + explicit MklInputConversionOp(OpKernelConstruction* context) + : OpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format_str)); + OP_REQUIRES_OK(context, context->GetAttr("T", &op_data_type)); + has_avx512f_ = port::TestCPUFeature(port::CPUFeature::AVX512F); + } + + private: + void Compute(OpKernelContext* context) override { + // Check if input tensors are in MKL format. 
+ const Tensor& input_tensor_0 = MklGetInput(context, 0); + MklShape input_shape_0; + GetMklShape(context, 0, &input_shape_0); + + const Tensor& input_tensor_1 = MklGetInput(context, 1); + MklShape input_shape_1; + GetMklShape(context, 1, &input_shape_1); + + bool tf_shapes_are_same = MklCompareShapes(&context->input(0).shape(), + &context->input(1).shape()); + + VLOG(1) << "MklInputConversionOp: Input shapes are " + << (tf_shapes_are_same ? "*same*" : "*different*") << ": " + << context->input(0).shape().DebugString() << " and " + << context->input(1).shape().DebugString(); + + // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + // if both inputs are in TF format, just copy input tensors to output. + if (!input_shape_0.IsMklTensor() && !input_shape_1.IsMklTensor()) { + VLOG(1) << "MklInputConversionOp: No conversion needed, " + << "copying TF inputs to output"; + + ForwardTfTensorInToOut(context, 0, 0); + ForwardTfTensorInToOut(context, 1, 1); + return; + } + + // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + // If both inputs are in MKL format + if (input_shape_0.IsMklTensor() && input_shape_1.IsMklTensor()) { + // If both have the same shape, pass them through + if (tf_shapes_are_same) { + VLOG(1) << "MklInputConversionOp: No conversion needed, " + << "copying MKL inputs with identical shapes to output"; + + ForwardMklTensorInToOut(context, 0, 0); + ForwardMklTensorInToOut(context, 1, 1); + return; + } + + // Sanity check + bool mkl_shapes_are_same = + MklCompareShapes(&input_shape_0, &input_shape_1); + if (mkl_shapes_are_same) { + CHECK(false) << "MklInputConversionOp: Unexpected: TF shapes are " + "different but MKL shapes are same"; + } + + // Both have different shapes, so broadcast will be necessary. 
+ // Convert to TF and pass both tensors through (we can't do broadcast + // with MKL tensors) + VLOG(1) << "MklInputConversionOp: Broadcast needed, " + << "converted MKL inputs to TF format"; + + MklToTfOp<Device, T>::ConvertMklToTf(this, context, data_format_str, + op_data_type, has_avx512f_, 0); + MklToTfOp<Device, T>::ConvertMklToTf(this, context, data_format_str, + op_data_type, has_avx512f_, 1); + SetDummyMklShapeOutput(context, 0); + SetDummyMklShapeOutput(context, 1); + return; + } + + // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + // One input is MKL and one is TF. If no broadcast is needed, convert + // the TF tensor to MKL, otherwise convert the MKL tensor to TF format + VLOG(1) << "MklInputConversionOp: Inputs in different formats (MKL/TF)"; + + const Tensor* mkl_tensor; + const MklShape* mkl_shape; + const Tensor* tf_tensor; + MklShape* tf_mkl_shape; + uint mkl_tensor_index; + uint tf_tensor_index; + if (input_shape_0.IsMklTensor() && !input_shape_1.IsMklTensor()) { + mkl_tensor = &input_tensor_0; + mkl_shape = &input_shape_0; + mkl_tensor_index = 0; + tf_tensor = &input_tensor_1; + tf_mkl_shape = &input_shape_1; + tf_tensor_index = 1; + } else if (!input_shape_0.IsMklTensor() && input_shape_1.IsMklTensor()) { + mkl_tensor = &input_tensor_1; + mkl_shape = &input_shape_1; + mkl_tensor_index = 1; + tf_tensor = &input_tensor_0; + tf_mkl_shape = &input_shape_0; + tf_tensor_index = 0; + } else { + CHECK(false) << "MklInputConversionOp: Unexpected combination of input " + "shapes for MKL " + << "element-wise op"; + } + + // Broadcast is needed if the shapes are not the same + bool broadcast_needed; + + size_t in0_size = 1; + for (size_t i = 0; i < mkl_shape->GetDimension(); ++i) + in0_size *= mkl_shape->tf_dim_size(i); + + size_t in1_size = 1; + for (size_t i = 0; i < tf_tensor->shape().dims(); ++i) + in1_size *= tf_tensor->shape().dim_size(i); + + broadcast_needed = (in0_size != in1_size); + + if (!broadcast_needed) { + // Both shapes 
are same, convert the TF input to MKL + VLOG(1) << "MklInputConversionOp: No broadcast needed."; + VLOG(1) << "MklInputConversionOp: Converting input " << tf_tensor_index + << " to MKL format"; + + // Create MklShape + Tensor* tensor_out; + MklShape mkl_output_mkl_shape; + mkl_output_mkl_shape.SetMklTensor(true); + mkl_output_mkl_shape.SetTfLayout(mkl_shape->GetDimension(), + mkl_shape->GetSizes(), + mkl_shape->GetStrides()); + mkl_output_mkl_shape.SetTfDimOrder(mkl_shape->GetDimension()); + + // ** Temporarily borrow the layout from the MKL input ** + mkl_output_mkl_shape.SetMklLayout(mkl_shape->GetCurLayout()); + + // Create output tensor + AllocateOutputSetMklShape(context, tf_tensor_index, &tensor_out, + mkl_tensor->shape(), mkl_output_mkl_shape); + + // Since the shapes are the same, use information from the other tensor + tf_mkl_shape->SetTfLayout(mkl_shape->GetDimension(), + mkl_shape->GetSizes(), mkl_shape->GetStrides()); + // Convert the data format + tf_mkl_shape->GetConvertedFlatData( + mkl_shape->GetCurLayout(), + const_cast<T*>(tf_tensor->flat<T>().data()), + const_cast<T*>(tensor_out->flat<T>().data())); + + // ** Release the borrowed layout to avoid double deletion + // in the destructor call ** + mkl_output_mkl_shape.SetMklLayout(nullptr); + + // -- The tensor in MKL format passes through -- + ForwardMklTensorInToOut(context, mkl_tensor_index, mkl_tensor_index); + } else { + // Broadcast is needed, so convert the MKL input to TF + VLOG(1) << "MklInputConversionOp: Broadcast needed."; + VLOG(1) << "MklInputConversionOp: Converting input " << mkl_tensor_index + << " to TF format"; + MklToTfOp<Device, T>::ConvertMklToTf(this, context, data_format_str, + op_data_type, has_avx512f_, + mkl_tensor_index); + SetDummyMklShapeOutput(context, mkl_tensor_index); + + // The tensor in TF format passes through + ForwardTfTensorInToOut(context, tf_tensor_index, tf_tensor_index); + } + + VLOG(1) << "MklInputConversionOp: Shapes (output): " + << 
context->mutable_output(0)->shape().DebugString() << " and " + << context->mutable_output(1)->shape().DebugString(); + + VLOG(1) << "MklInputConversion completed successfully."; + } + + private: + /// Data format of the operation + string data_format_str; + + /// Data type of the operation + DataType op_data_type; + + /// CPUIDInfo + bool has_avx512f_ = false; +}; + +/////////////////////////////////////////////////////////// +// Register kernel +/////////////////////////////////////////////////////////// + +#define REGISTER_CPU(T) \ + REGISTER_KERNEL_BUILDER(Name("_MklInputConversion") \ + .Device(DEVICE_CPU) \ + .TypeConstraint<T>("T") \ + .Label(mkl_op_registry::kMklOpLabel), \ + MklInputConversionOp<CPUDevice, T>); + +TF_CALL_NUMBER_TYPES(REGISTER_CPU); +#undef REGISTER_CPU +} // namespace tensorflow +#endif // INTEL_MKL diff --git a/tensorflow/core/kernels/mkl_tfconv_op.h b/tensorflow/core/kernels/mkl_tfconv_op.h new file mode 100644 index 0000000000..a240ee44fb --- /dev/null +++ b/tensorflow/core/kernels/mkl_tfconv_op.h @@ -0,0 +1,136 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifdef INTEL_MKL + +#ifndef TENSORFLOW_CORE_KERNELS_MKL_TFCONV_OP_H_ +#define TENSORFLOW_CORE_KERNELS_MKL_TFCONV_OP_H_ + +#include <algorithm> +#include <vector> +#include "tensorflow/core/framework/numeric_op.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/platform/cpu_info.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/util/tensor_format.h" + +#include "mkl_dnn.h" +#include "mkl_dnn_types.h" +#include "tensorflow/core/util/mkl_util.h" + +namespace tensorflow { +typedef Eigen::ThreadPoolDevice CPUDevice; + +/////////////////////////////////////////////////////////// +// Op kernel +/////////////////////////////////////////////////////////// + +template <typename Device, typename T> +class MklToTfOp : public OpKernel { + public: + explicit MklToTfOp(OpKernelConstruction* context) : OpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format_str)); + OP_REQUIRES_OK(context, context->GetAttr("T", &op_data_type)); + has_avx512f_ = port::TestCPUFeature(port::CPUFeature::AVX512F); + } + + void Compute(OpKernelContext* context) override { + ConvertMklToTf(this, context, data_format_str, op_data_type, has_avx512f_, + 0); + VLOG(1) << "MKLToTFConversion complete successfully."; + } + + static void ConvertMklToTf(OpKernel* op_kernel, OpKernelContext* context, + string data_format_str, DataType op_data_type, + bool has_avx512f, uint input_number) { + // Check that input tensor is in MKL format. 
+ const Tensor& input_tensor = MklGetInput(context, input_number); + MklShape input_shape; + GetMklShape(context, input_number, &input_shape); + + // if input is already in Tf format, then just copy input tensor to output. + if (!input_shape.IsMklTensor()) { + context->set_output(input_number, input_tensor); + VLOG(1) << "MKLToTFConversion: No conversion needed, " + << "copying input to output"; + return; + } + + // Check that input data type is same as operator data type and that it is + // same as output data type. + DataType input_data_type = op_kernel->input_type(input_number); + DataType output_data_type = op_kernel->output_type(input_number); + CHECK_EQ(op_data_type, input_data_type); + CHECK_EQ(op_data_type, output_data_type); + + TensorShape output_shape; + size_t ndims = input_shape.GetDimension(); + size_t* in_sizes = new size_t[ndims]; + for (size_t i = 0; i < ndims; i++) { + // Outermost to innermost dimension + output_shape.AddDim(input_shape.GetSizes()[input_shape.tf_dim_idx(i)]); + in_sizes[i] = input_shape.GetSizes()[i]; + } + + // Allocate output tensor. + Tensor* output_tensor = NULL; + OP_REQUIRES_OK(context, + context->allocate_output(input_number, output_shape, &output_tensor)); + + dnnLayout_t output_layout = + static_cast<dnnLayout_t>(input_shape.GetTfLayout()); + // Execute DNNConversion. 
+ void* input_buffer = + static_cast<void*>(const_cast<T*>(input_tensor.flat<T>().data())); + delete[] in_sizes; + void* output_buffer = + static_cast<void*>(const_cast<T*>(output_tensor->flat<T>().data())); + input_shape.GetConvertedFlatData(output_layout, input_buffer, + output_buffer); + VLOG(1) << "MKLToTFConversion complete successfully."; + } + + private: + /// Data format of the operation + string data_format_str; + + /// Data type of the operation + DataType op_data_type; + + /// CPUIDInfo + bool has_avx512f_ = false; +}; + +/////////////////////////////////////////////////////////// +// Register kernel +/////////////////////////////////////////////////////////// + +#define REGISTER_CPU(T) \ + REGISTER_KERNEL_BUILDER(Name("_MklToTf") \ + .Device(DEVICE_CPU) \ + .TypeConstraint<T>("T") \ + .Label(mkl_op_registry::kMklOpLabel), \ + MklToTfOp<CPUDevice, T>); + +TF_CALL_NUMBER_TYPES(REGISTER_CPU); +#undef REGISTER_CPU +} // namespace tensorflow +#endif // TENSORFLOW_CORE_KERNELS_MKL_TFCONV_OP_H_ +#endif // INTEL_MKL diff --git a/tensorflow/core/kernels/svd_op_gpu.cu.cc b/tensorflow/core/kernels/svd_op_gpu.cu.cc new file mode 100644 index 0000000000..c8b307a2e4 --- /dev/null +++ b/tensorflow/core/kernels/svd_op_gpu.cu.cc @@ -0,0 +1,413 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// See docs in ../ops/linalg_ops.cc. 
+// TODO(shamanDevel): Enable complex inputs. This will require a specialization +// of Gesvd for complex inputs as well as a new kernel +// definition to output the singular values as reals +// instead of complex values. The current CPU implementation +// outputs the singular values as complex values and then +// casts them to reals in the python wrapper. +#if GOOGLE_CUDA +#define EIGEN_USE_GPU + +#include <algorithm> +#include <vector> + +#include "tensorflow/core/framework/kernel_def_builder.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/kernels/cuda_solvers.h" +#include "tensorflow/core/kernels/linalg_ops_common.h" +#include "tensorflow/core/kernels/transpose_functor.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/stream_executor.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/cuda_kernel_helper.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" + +namespace tensorflow { + +static const char kErrMsg[] = + "Singular Value Decomposition was not successful. The input might not be " + "valid."; + +typedef Eigen::GpuDevice GPUDevice; + +namespace { +// This kernel computes the reduction +// V' = sum_i (M_i * U_i,1 * S_i). 
+// The result is stored in V[batch] and has the same sign as the +// real value of V (which should be computed) +template <class Scalar> +__global__ void ComputeValueOfVKernel(Cuda2DLaunchConfig config, int64 m, + int64 ldu, const Scalar* M, + const Scalar* U, const Scalar* S, + Scalar* V) { + CUDA_AXIS_KERNEL_LOOP(batch, config.virtual_thread_count, x) { + CUDA_AXIS_KERNEL_LOOP(i, config.virtual_thread_count, y) { + Scalar v = M[i + m * batch] * U[ldu * (i + m * batch)] * S[batch]; + CudaAtomicAdd(V + batch, v); + } + } +} + +// Extracts the sign of V +// V[i] = V[i]>=0 ? 1 : 0 +template <class Scalar> +__global__ void ExtractSignOfVKernel(CudaLaunchConfig config, Scalar* V) { + CUDA_1D_KERNEL_LOOP(i, config.virtual_thread_count) { + V[i] = V[i] >= 0 ? Scalar(1) : Scalar(-1); + } +} +} + +// Scalar: The input scalar type (can be complex) +template <class Scalar> +class SvdOpGpu : public AsyncOpKernel { + public: + using RealScalar = typename Eigen::NumTraits<Scalar>::Real; + + explicit SvdOpGpu(OpKernelConstruction* context) : AsyncOpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("compute_uv", &compute_uv_)); + OP_REQUIRES_OK(context, context->GetAttr("full_matrices", &full_matrices_)); + } + + void RunSVD(OpKernelContext* context, DoneCallback done, int64 m, int64 n, + int64 p, int64 batch_size, Scalar* input_ptr, + RealScalar* outputS_ptr, Scalar* outputU_ptr, + Scalar* outputVT_ptr, int* dev_info_ptr, CudaSolver& solver) { + // Save the input matrix + // Needed for the n=1 fix, see below, since SVD destroys the input + Tensor input_copy; + if (compute_uv_ && n == 1) { + OP_REQUIRES_OK_ASYNC( + context, + context->allocate_temp(DataTypeToEnum<Scalar>::v(), + TensorShape({batch_size, m}), &input_copy), + done); + const GPUDevice& d = context->eigen_device<GPUDevice>(); + d.memcpy(input_copy.flat<Scalar>().data(), input_ptr, + batch_size * m * sizeof(Scalar)); + } + + for (int64 batch = 0; batch < batch_size; ++batch) { + Scalar* input = input_ptr + 
batch * m * n; + RealScalar* outputS = outputS_ptr + batch * p; + Scalar* outputU = NULL; + Scalar* outputVT = NULL; + char jobu = 'N'; + char jobvt = 'N'; + + if (compute_uv_) { + if (full_matrices_) { + outputU = outputU_ptr + batch * m * m; + outputVT = outputVT_ptr + batch * n * n; + jobu = 'A'; + jobvt = 'A'; + } else { + outputU = outputU_ptr + batch * m * p; + outputVT = outputVT_ptr + batch * n * p; + jobu = 'S'; + jobvt = 'S'; + } + } + + OP_REQUIRES_OK_ASYNC( + context, solver.Gesvd(jobu, jobvt, m, n, input, m, outputS, outputU, + m, outputVT, n, dev_info_ptr + batch), + done); + } + + // This is a bug in cuSolver: + // If n is one, then outputVT only contains zeros instead of ones. + // Hence, I need to fill outputVT manually + // The question is: +1 or -1? + // -> Compute U*S and compare sign against M + // But because S is zero except for the first entry, the multiplication + // simplifies a lot. + // However, what happens if M contains zeros? At these indices, it is + // impossible to determine the value of V. + // -> Compute V for all rows in M to cope for zeros. + // 1. V' = sum_i (M_i * U_i,1 * S_i) + // 2. V = {1, V'>=0, -1, V'<0} + // TODO: what is with complex values? + if (compute_uv_ && n == 1) { + // 1. compute the (batched) sum + const GPUDevice& d = context->eigen_device<GPUDevice>(); + d.memset(outputVT_ptr, 0, batch_size * sizeof(Scalar)); + Cuda2DLaunchConfig cfg2D = GetCuda2DLaunchConfig(batch_size, m, d); + ComputeValueOfVKernel<<<cfg2D.block_count, cfg2D.thread_per_block, 0, + d.stream()>>>( + cfg2D, m, full_matrices_ ? m : p, input_copy.flat<Scalar>().data(), + outputU_ptr, outputS_ptr, outputVT_ptr); + // 2. 
clamp V to -1 or +1 + CudaLaunchConfig cfg1D = GetCudaLaunchConfig(batch_size, d); + ExtractSignOfVKernel<<<cfg1D.block_count, cfg1D.thread_per_block, 0, + d.stream()>>>(cfg1D, outputVT_ptr); + } + } + + void CheckResult(OpKernelContext* context, DoneCallback done, + const std::vector<DeviceLapackInfo>& dev_info, + CudaSolver& solver, Tensor& catch1, Tensor& catch2) { + auto info_checker = [context, dev_info, done, catch1, catch2]( + const Status& status, const std::vector<HostLapackInfo>& /* unused */) { + Status full_status = status; + if (!full_status.ok()) { + full_status.Update(errors::InvalidArgument(kErrMsg)); + } + OP_REQUIRES_OK_ASYNC(context, full_status, done); + done(); + }; + + OP_REQUIRES_OK_ASYNC(context, solver.CopyLapackInfoToHostAsync( + dev_info, std::move(info_checker)), + done); + } + + // The SVD if m >= n + // TODO: can the two cases (MgeqN and MlessN) be simplified, + // common boilerplate be reduced, or even combined in one method? + void PerformSVD_MgeqN(OpKernelContext* context, DoneCallback done, int64 m, + int64 n, int64 p, const gtl::ArraySlice<int32>& perm, + const Tensor& M, Tensor* S, Tensor* U, Tensor* V) { + TensorShape shapeRaw = M.shape(); + shapeRaw.RemoveDim(shapeRaw.dims() - 1); + shapeRaw.RemoveDim(shapeRaw.dims() - 1); + + // Transpose M, because cuSolver expects it to be column-major + TensorShape input_shape = shapeRaw; + input_shape.AddDim(n); + input_shape.AddDim(m); + Tensor input_copy; + OP_REQUIRES_OK_ASYNC( + context, context->allocate_temp(M.dtype(), input_shape, &input_copy), + done); + auto device = context->eigen_device<GPUDevice>(); + OP_REQUIRES_OK_ASYNC(context, DoTranspose(device, M, perm, &input_copy), + done); + + // I need to transpose U at the end + // Not V, because cuSolver work column-major + Tensor u_copy; + if (compute_uv_) { + TensorShape u_shape; + if (full_matrices_) { + u_shape = U->shape(); + } else { + u_shape = shapeRaw; + u_shape.AddDim(p); + u_shape.AddDim(m); + } + OP_REQUIRES_OK_ASYNC( + 
context, context->allocate_temp(U->dtype(), u_shape, &u_copy), done); + } + + // get the pointers to the data + Scalar* input_ptr; + RealScalar* outputS_ptr; + Scalar* outputU_ptr = NULL; + Scalar* outputV_ptr = NULL; + auto input_reshaped = input_copy.template flat_inner_dims<Scalar, 3>(); + input_ptr = input_reshaped.data(); + outputS_ptr = S->template flat_inner_dims<RealScalar, 2>().data(); + if (compute_uv_) { + outputU_ptr = u_copy.template flat_inner_dims<Scalar, 3>().data(); + outputV_ptr = V->template flat_inner_dims<Scalar, 3>().data(); + } + + // call the SVD + const int64 batch_size = input_reshaped.dimension(0); + std::vector<DeviceLapackInfo> dev_info; + dev_info.emplace_back(context, batch_size, "gesvd"); + CudaSolver solver(context); + RunSVD(context, done, m, n, p, batch_size, input_ptr, outputS_ptr, + outputU_ptr, outputV_ptr, dev_info.back().mutable_data(), solver); + + // Transpose U + if (compute_uv_) { + OP_REQUIRES_OK_ASYNC(context, DoTranspose(device, u_copy, perm, U), done); + } + + // now check if the SVD operation succeeded or not + CheckResult(context, done, dev_info, solver, input_copy, u_copy); + } + + // The SVD if m < n + void PerformSVD_MlessN(OpKernelContext* context, DoneCallback done, int64 m, + int64 n, int64 p, const gtl::ArraySlice<int32>& perm, + const Tensor& M, Tensor* S, Tensor* U, Tensor* V) { + // Perform the SVD on M' + + // Reuse the input buffer or make a copy for the SVD depending on whether + // this op owns the + // input buffer exclusively. 
This is needed because the SVD modifies the + // input + Tensor input_copy; + OP_REQUIRES_OK_ASYNC(context, context->forward_input_or_allocate_temp( + {0}, DataTypeToEnum<Scalar>::value, + M.shape(), &input_copy), + done); + + if (!M.SharesBufferWith(input_copy)) { + const GPUDevice& d = context->eigen_device<GPUDevice>(); + d.memcpy(input_copy.flat<Scalar>().data(), M.flat<Scalar>().data(), + M.NumElements() * sizeof(Scalar)); + } + + // I need to transpose V at the end + Tensor v_copy; + if (compute_uv_) { + TensorShape v_shape; + if (full_matrices_) { + v_shape = V->shape(); + } else { + TensorShape shapeRaw = M.shape(); + shapeRaw.RemoveDim(shapeRaw.dims() - 1); + shapeRaw.RemoveDim(shapeRaw.dims() - 1); + v_shape = shapeRaw; + v_shape.AddDim(p); + v_shape.AddDim(n); + } + OP_REQUIRES_OK_ASYNC( + context, context->allocate_temp(V->dtype(), v_shape, &v_copy), done); + } + + // get the pointers to the data + Scalar* input_ptr; + RealScalar* outputS_ptr; + Scalar* outputU_ptr = NULL; + Scalar* outputV_ptr = NULL; + auto input_reshaped = input_copy.template flat_inner_dims<Scalar, 3>(); + input_ptr = input_reshaped.data(); + outputS_ptr = S->template flat_inner_dims<RealScalar, 2>().data(); + if (compute_uv_) { + // Note that U and V are flipped + outputU_ptr = v_copy.template flat_inner_dims<Scalar, 3>().data(); + outputV_ptr = U->template flat_inner_dims<Scalar, 3>().data(); + } + + // call the SVD + const int64 batch_size = input_reshaped.dimension(0); + std::vector<DeviceLapackInfo> dev_info; + dev_info.emplace_back(context, batch_size, "gesvd"); + CudaSolver solver(context); + // Note that m and n are flipped + RunSVD(context, done, n, m, p, batch_size, input_ptr, outputS_ptr, + outputU_ptr, outputV_ptr, dev_info.back().mutable_data(), solver); + + // Transpose V + if (compute_uv_) { + auto device = context->eigen_device<GPUDevice>(); + OP_REQUIRES_OK_ASYNC(context, DoTranspose(device, v_copy, perm, V), done); + } + + // now check if the SVD operation 
succeeded or not + CheckResult(context, done, dev_info, solver, input_copy, v_copy); + } + + void ComputeAsync(OpKernelContext* context, DoneCallback done) final { + const Tensor& input = context->input(0); + const int ndims = input.dims(); + const int64 m = input.dim_size(ndims - 2); + const int64 n = input.dim_size(ndims - 1); + const int64 p = std::min(m, n); + + // Validate inputs. + OP_REQUIRES_ASYNC( + context, ndims >= 2, + errors::InvalidArgument("Input must have rank >= 2, got ", ndims), + done); + + // output tensors. + Tensor* outputU = NULL; + Tensor* outputS = NULL; + Tensor* outputV = NULL; + + // compute shapes + TensorShape shapeRaw = input.shape(); + shapeRaw.RemoveDim(shapeRaw.dims() - 1); + shapeRaw.RemoveDim(shapeRaw.dims() - 1); + TensorShape shapeS = shapeRaw; + TensorShape shapeU = shapeRaw; + TensorShape shapeV = shapeRaw; + shapeS.AddDim(p); + if (compute_uv_) { + if (full_matrices_) { + shapeU.AddDim(m); + shapeU.AddDim(m); + shapeV.AddDim(n); + shapeV.AddDim(n); + } else { + shapeU.AddDim(m); + shapeU.AddDim(p); + shapeV.AddDim(n); + shapeV.AddDim(p); + } + } else { + shapeU = TensorShape({0}); + shapeV = TensorShape({0}); + } + + // allocate output + OP_REQUIRES_OK_ASYNC(context, context->allocate_output(0, shapeS, &outputS), + done); + OP_REQUIRES_OK_ASYNC(context, context->allocate_output(1, shapeU, &outputU), + done); + OP_REQUIRES_OK_ASYNC(context, context->allocate_output(2, shapeV, &outputV), + done); + + if (n == 0 || m == 0) { + // If X is an empty matrix (0 rows, 0 col), X * X' == X. + // Therefore, we return X. 
+ done(); + return; + } + + // Prepare permutation + std::vector<int32> perm; + for (size_t i = 0; i < ndims - 2; ++i) perm.push_back(i); + perm.push_back(ndims - 1); // transpose last two dimensions + perm.push_back(ndims - 2); + gtl::ArraySlice<int32> permAS(perm); + + // call implementations + if (m >= n) { + PerformSVD_MgeqN(context, done, m, n, p, permAS, input, outputS, outputU, + outputV); + } else { + PerformSVD_MlessN(context, done, m, n, p, permAS, input, outputS, outputU, + outputV); + } + } + + private: + bool compute_uv_; + bool full_matrices_; +}; + +// TODO: add support for complex types +REGISTER_LINALG_OP_GPU("Svd", (SvdOpGpu<float>), float); +REGISTER_LINALG_OP_GPU("Svd", (SvdOpGpu<double>), double); +REGISTER_LINALG_OP_GPU("BatchSvd", (SvdOpGpu<float>), float); +REGISTER_LINALG_OP_GPU("BatchSvd", (SvdOpGpu<double>), double); + +} // namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/tensor_array_ops.cc b/tensorflow/core/kernels/tensor_array_ops.cc index 075bacb432..2191e4e8c5 100644 --- a/tensorflow/core/kernels/tensor_array_ops.cc +++ b/tensorflow/core/kernels/tensor_array_ops.cc @@ -1069,7 +1069,7 @@ class TensorArrayUnpackOrScatterOp : public OpKernel { } else { OP_REQUIRES( ctx, max_index < array_size, - errors::InvalidArgument("Max scatter index must be <= array size (", + errors::InvalidArgument("Max scatter index must be < array size (", max_index, " vs. ", array_size, ")")); } element_shape.RemoveDim(0); diff --git a/tensorflow/core/ops/math_ops.cc b/tensorflow/core/ops/math_ops.cc index 2a59282fa5..ef4737cafe 100644 --- a/tensorflow/core/ops/math_ops.cc +++ b/tensorflow/core/ops/math_ops.cc @@ -498,6 +498,24 @@ Returns x + y element-wise. 
[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) )doc"); +REGISTER_OP("_MklAdd") + .Input("x: T") + .Input("y: T") + .Input("mkl_x: uint8") + .Input("mkl_y: uint8") + .Output("z: T") + .Output("mkl_z: uint8") + .Attr( + "T: {half, float, double, uint8, int8, int16, int32, int64, complex64, " + "complex128, string}") + .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn) + .Doc(R"doc( +Returns x + y element-wise. + +*NOTE*: `Add` supports broadcasting. `AddN` does not. More about broadcasting +[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) +)doc"); + REGISTER_OP("Sub") .BINARY_MORE() .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn) @@ -508,6 +526,19 @@ Returns x - y element-wise. [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) )doc"); +REGISTER_OP("_MklSub") + .BINARY_FEWER() + .Input("mkl_x: uint8") + .Input("mkl_y: uint8") + .Output("mkl_z: uint8") + .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn) + .Doc(R"doc( +Returns x - y element-wise. + +*NOTE*: `Sub` supports broadcasting. More about broadcasting +[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) +)doc"); + REGISTER_OP("Mul") .BINARY_MORE() .SetIsCommutative() @@ -519,6 +550,20 @@ Returns x * y element-wise. [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) )doc"); +REGISTER_OP("_MklMul") + .BINARY_MORE() + .Input("mkl_x: uint8") + .Input("mkl_y: uint8") + .Output("mkl_z: uint8") + .SetIsCommutative() + .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn) + .Doc(R"doc( +Returns x * y element-wise. + +*NOTE*: `Mul` supports broadcasting. More about broadcasting +[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) +)doc"); + REGISTER_OP("Div") .BINARY_MORE() .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn) @@ -577,6 +622,20 @@ Returns (x - y)(x - y) element-wise. 
[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) )doc"); +REGISTER_OP("_MklSquaredDifference") + .BINARY_FEWER() + .Input("mkl_x: uint8") + .Input("mkl_y: uint8") + .Output("mkl_z: uint8") + .SetIsCommutative() + .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn) + .Doc(R"doc( +Returns (x - y)(x - y) element-wise. + +*NOTE*: `SquaredDifference` supports broadcasting. More about broadcasting +[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) +)doc"); + #undef BINARY_FEWER #undef BINARY_MORE @@ -594,6 +653,23 @@ Returns the max of x and y (i.e. x > y ? x : y) element-wise. [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) )doc"); +REGISTER_OP("_MklMaximum") + .Input("x: T") + .Input("y: T") + .Input("mkl_x: uint8") + .Input("mkl_y: uint8") + .Output("z: T") + .Output("mkl_z: uint8") + .Attr("T: {half, float, double, int32, int64}") + .SetIsCommutative() + .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn) + .Doc(R"doc( +Returns the max of x and y (i.e. x > y ? x : y) element-wise. + +*NOTE*: `Maximum` supports broadcasting. More about broadcasting +[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) +)doc"); + REGISTER_OP("Minimum") .Input("x: T") .Input("y: T") @@ -2604,4 +2680,31 @@ Equivalent to np.digitize. @end_compatibility )doc"); +#ifdef INTEL_MKL +REGISTER_OP("_MklAddN") + .Input("inputs: N * T") + .Input("mkl_input: N * uint8") + .Output("sum: T") + .Output("mkl_sum: uint8") + .Attr("N: int >= 1") + .Attr("T: numbertype") + .SetIsCommutative() + .SetIsAggregate() + .SetShapeFn([](InferenceContext* c) { + ShapeHandle cur = c->input(c->num_inputs() - 1); + for (int i = c->num_inputs() - 2; i >= 0; --i) { + TF_RETURN_WITH_CONTEXT_IF_ERROR(c->Merge(c->input(i), cur, &cur), + "From merging shape ", i, + " with other shapes."); + } + c->set_output(0, cur); + return Status::OK(); + }) + .Doc(R"doc( +Add two input tensors element wise using mkl kernel sum. 
+inputs: Must all be the same size and shape. +)doc"); + +#endif // INTEL_MKL + } // namespace tensorflow diff --git a/tensorflow/core/ops/nn_ops.cc b/tensorflow/core/ops/nn_ops.cc index fd0b785b8f..22afa4db9a 100644 --- a/tensorflow/core/ops/nn_ops.cc +++ b/tensorflow/core/ops/nn_ops.cc @@ -3241,6 +3241,29 @@ MKL operator to convert a tensor from MKL layout to TensorFlow layout. NOTE Do not invoke this operator directly in Python. Graph rewrite pass is expected to invoke these operators. )doc"); + +REGISTER_OP("_MklInputConversion") + .Input("input_0: T") + .Input("input_1: T") + .Input("mkl_input_0: uint8") + .Input("mkl_input_1: uint8") + .Output("output_0: T") + .Output("output_1: T") + .Output("mkl_output_0: uint8") + .Output("mkl_output_1: uint8") + // All datatypes supported by element-wise ops + .Attr( + "T: {half, float, double, uint8, int8, uint16, int16, int32, int64, " + "complex64, complex128}") + .Attr(GetConvnetDataFormatAttrString()) + .Doc(R"doc( +MKL operator to process the inputs to an elementwise MKL op. Both inputs +need to be either in TF or in MKL format. This op is added before every +element-wise MKL op. + +NOTE Do not invoke this operator directly in Python. Graph rewrite pass is +expected to invoke these operators. +)doc"); #endif // INTEL_MKL } // namespace tensorflow diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt index c4bc57fd77..1b07f4ecf8 100644 --- a/tensorflow/core/ops/ops.pbtxt +++ b/tensorflow/core/ops/ops.pbtxt @@ -15866,6 +15866,25 @@ op { summary: "Transforms a serialized tensorflow.TensorProto proto into a Tensor." } op { + name: "SerializeTensor" + input_arg { + name: "tensor" + description: "A Tensor of type `T`." + type_attr: "T" + } + output_arg { + name: "serialized" + description: "A serialized TensorProto proto of the input tensor." + type: DT_STRING + } + attr { + name: "T" + type: "type" + description: "The type of the input tensor." 
+ } + summary: "Transforms a Tensor into a serialized TensorProto proto." +} +op { name: "Placeholder" output_arg { name: "output" diff --git a/tensorflow/core/ops/parsing_ops.cc b/tensorflow/core/ops/parsing_ops.cc index 1f7ebe91cf..f23ff083af 100644 --- a/tensorflow/core/ops/parsing_ops.cc +++ b/tensorflow/core/ops/parsing_ops.cc @@ -26,7 +26,7 @@ using shape_inference::ShapeHandle; REGISTER_OP("DecodeRaw") .Input("bytes: string") .Output("output: out_type") - .Attr("out_type: {half,float,double,int32,uint8,int16,int8,int64}") + .Attr("out_type: {half,float,double,int32,uint16,uint8,int16,int8,int64}") .Attr("little_endian: bool = true") .SetShapeFn([](InferenceContext* c) { // Note: last dimension is data dependent. diff --git a/tensorflow/core/ops/string_ops.cc b/tensorflow/core/ops/string_ops.cc index 5e99187d50..aebd14c7e5 100644 --- a/tensorflow/core/ops/string_ops.cc +++ b/tensorflow/core/ops/string_ops.cc @@ -381,7 +381,7 @@ input = b'thirteen' position = [1, 5, 7] length = [3, 2, 1] -output = [b'hir', b'ee', b'n"] +output = [b'hir', b'ee', b'n'] ``` input: Tensor of strings diff --git a/tensorflow/core/platform/cuda_libdevice_path_test.cc b/tensorflow/core/platform/cuda_libdevice_path_test.cc index 86295592a8..639f6804ea 100644 --- a/tensorflow/core/platform/cuda_libdevice_path_test.cc +++ b/tensorflow/core/platform/cuda_libdevice_path_test.cc @@ -27,7 +27,7 @@ TEST(CudaLibdevicePathTest, LibdevicePath) { VLOG(2) << "Libdevice root = " << LibdeviceRoot(); std::vector<string> libdevice_files; TF_EXPECT_OK(Env::Default()->GetMatchingPaths( - io::JoinPath(LibdeviceRoot(), "libdevice.compute_*.bc"), + io::JoinPath(LibdeviceRoot(), "libdevice.*.bc"), &libdevice_files)); EXPECT_LT(0, libdevice_files.size()); } diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h index ccb861c93a..9ba3a509c3 100644 --- a/tensorflow/core/public/version.h +++ b/tensorflow/core/public/version.h @@ -19,12 +19,12 @@ limitations under the License. 
// TensorFlow uses semantic versioning, see http://semver.org/. #define TF_MAJOR_VERSION 1 -#define TF_MINOR_VERSION 3 +#define TF_MINOR_VERSION 4 #define TF_PATCH_VERSION 0 // TF_VERSION_SUFFIX is non-empty for pre-releases (e.g. "-alpha", "-alpha.1", // "-beta", "-rc", "-rc.1") -#define TF_VERSION_SUFFIX "" +#define TF_VERSION_SUFFIX "-dev" #define TF_STR_HELPER(x) #x #define TF_STR(x) TF_STR_HELPER(x) diff --git a/tensorflow/core/util/cuda_kernel_helper.h b/tensorflow/core/util/cuda_kernel_helper.h index af727c3d2b..f8eddbb2a9 100644 --- a/tensorflow/core/util/cuda_kernel_helper.h +++ b/tensorflow/core/util/cuda_kernel_helper.h @@ -25,6 +25,29 @@ limitations under the License. #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/stream_executor.h" #include "tensorflow/core/platform/types.h" +#include "cuda/include/cuda.h" + +// Mask for all 32 threads in a warp. +#define CUDA_WARP_ALL 0xFFFFFFFF + +#if defined(CUDA_VERSION) && CUDA_VERSION < 9000 +// CUDA 9.0 introduces a new, light-weight barrier synchronization primitive +// that operates at the warp-scope. This is required to ensure visibility of +// reads/writes among threads that can make independent progress on Volta. +// For previous CUDA versions these synchronizations are not necessary, and we +// define an empty function as a convenience for backward compatibility. +__device__ inline void __syncwarp(unsigned mask=CUDA_WARP_ALL) {} + +// CUDA 9.0 deprecates the warp-intrinsic functions (shfl, ballot, etc.) in +// favor of synchronizing versions. These ensure that all warp lanes specified +// in mask execute the intrinsic in convergence. Here we provide legacy mappings +// to the less-verbose routines provided in previous versions of CUDA. 
+#define __ballot_sync(mask, predicate) __ballot(predicate) +#define __shfl_sync(mask, val, srcLane, width) __shfl(val, srcLane, width) +#define __shfl_down_sync(mask, val, delta, width) __shfl_down(val, delta, width) +#define __shfl_up_sync(mask, val, delta, width) __shfl_up(val, delta, width) +#define __shfl_xor_sync(mask, val, laneMask, width) __shfl_xor(val, laneMask, width) +#endif // Usage of GetCudaLaunchConfig, GetCuda2DLaunchConfig, and // GetCuda3DLaunchConfig: @@ -613,82 +636,95 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T tf_max(const T& x, const T& y) { return x < y ? y : x; } +__device__ EIGEN_ALWAYS_INLINE unsigned CudaBallot(unsigned mask, + int predicate) { + return __ballot_sync(mask, predicate); +} + template <typename T> -__device__ EIGEN_ALWAYS_INLINE T CudaShuffle(T value, int srcLane, +__device__ EIGEN_ALWAYS_INLINE T CudaShuffle(unsigned mask, T value, + int srcLane, int width = warpSize) { - return __shfl(value, srcLane, width); + return __shfl_sync(mask, value, srcLane, width); } // Variant of the (undocumented) version from the CUDA SDK, but using unsigned // instead of float for lo and hi (which is incorrect with ftz, for example). // A bug has been filed with NVIDIA and will be fixed in the next CUDA release. // TODO(csigg): remove when the bug is fixed in the next CUDA release. 
-__device__ EIGEN_ALWAYS_INLINE double CudaShuffle(double value, int srcLane, +__device__ EIGEN_ALWAYS_INLINE double CudaShuffle(unsigned mask, + double value, int srcLane, int width = warpSize) { unsigned lo, hi; asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(value)); - hi = __shfl(hi, srcLane, width); - lo = __shfl(lo, srcLane, width); + hi = __shfl_sync(mask, hi, srcLane, width); + lo = __shfl_sync(mask, lo, srcLane, width); asm volatile("mov.b64 %0, {%1,%2};" : "=d"(value) : "r"(lo), "r"(hi)); return value; } template <typename T> -__device__ EIGEN_ALWAYS_INLINE T CudaShuffleUp(T value, int delta, +__device__ EIGEN_ALWAYS_INLINE T CudaShuffleUp(unsigned mask, + T value, int delta, int width = warpSize) { - return __shfl_up(value, delta, width); + return __shfl_up_sync(mask, value, delta, width); } // Variant of the (undocumented) version from the CUDA SDK, but using unsigned // instead of float for lo and hi (which is incorrect with ftz, for example). // A bug has been filed with NVIDIA and will be fixed in the next CUDA release. // TODO(csigg): remove when the bug is fixed in the next CUDA release. 
-__device__ EIGEN_ALWAYS_INLINE double CudaShuffleUp(double value, int delta, +__device__ EIGEN_ALWAYS_INLINE double CudaShuffleUp(unsigned mask, + double value, int delta, int width = warpSize) { unsigned lo, hi; asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(value)); - hi = __shfl_up(hi, delta, width); - lo = __shfl_up(lo, delta, width); + hi = __shfl_up_sync(mask, hi, delta, width); + lo = __shfl_up_sync(mask, lo, delta, width); asm volatile("mov.b64 %0, {%1,%2};" : "=d"(value) : "r"(lo), "r"(hi)); return value; } template <typename T> -__device__ EIGEN_ALWAYS_INLINE T CudaShuffleDown(T value, int delta, +__device__ EIGEN_ALWAYS_INLINE T CudaShuffleDown(unsigned mask, + T value, int delta, int width = warpSize) { - return __shfl_down(value, delta, width); + return __shfl_down_sync(mask, value, delta, width); } // Variant of the (undocumented) version from the CUDA SDK, but using unsigned // instead of float for lo and hi (which is incorrect with ftz, for example). // A bug has been filed with NVIDIA and will be fixed in the next CUDA release. // TODO(csigg): remove when the bug is fixed in the next CUDA release. 
-__device__ EIGEN_ALWAYS_INLINE double CudaShuffleDown(double value, int delta, +__device__ EIGEN_ALWAYS_INLINE double CudaShuffleDown(unsigned mask, + double value, int delta, int width = warpSize) { unsigned lo, hi; asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(value)); - hi = __shfl_down(hi, delta, width); - lo = __shfl_down(lo, delta, width); + hi = __shfl_down_sync(mask, hi, delta, width); + lo = __shfl_down_sync(mask, lo, delta, width); asm volatile("mov.b64 %0, {%1,%2};" : "=d"(value) : "r"(lo), "r"(hi)); return value; } template <typename T> -__device__ EIGEN_ALWAYS_INLINE T CudaShuffleXor(T value, int laneMask, +__device__ EIGEN_ALWAYS_INLINE T CudaShuffleXor(unsigned mask, + T value, int laneMask, int width = warpSize) { - return __shfl_xor(value, laneMask, width); + return __shfl_xor_sync(mask, value, laneMask, width); } // Variant of the (undocumented) version from the CUDA SDK, but using unsigned // instead of float for lo and hi (which is incorrect with ftz, for example). // A bug has been filed with NVIDIA and will be fixed in the next CUDA release. // TODO(csigg): remove when the bug is fixed in the next CUDA release. 
-__device__ EIGEN_ALWAYS_INLINE double CudaShuffleXor(double value, int laneMask, +__device__ EIGEN_ALWAYS_INLINE double CudaShuffleXor(unsigned mask, + double value, int laneMask, int width = warpSize) { unsigned lo, hi; asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(value)); - hi = __shfl_xor(hi, laneMask, width); - lo = __shfl_xor(lo, laneMask, width); + hi = __shfl_xor_sync(mask, hi, laneMask, width); + lo = __shfl_xor_sync(mask, lo, laneMask, width); asm volatile("mov.b64 %0, {%1,%2};" : "=d"(value) : "r"(lo), "r"(hi)); return value; } diff --git a/tensorflow/core/util/mkl_util.h b/tensorflow/core/util/mkl_util.h index cb22a50e8f..f4bec9524a 100644 --- a/tensorflow/core/util/mkl_util.h +++ b/tensorflow/core/util/mkl_util.h @@ -65,6 +65,8 @@ class MklShape { void SetDimensions(const size_t dimension) { dimension_ = dimension; } + void SetMklLayout(dnnLayout_t mklLayout) { mklLayout_ = mklLayout; } + void SetMklLayout(const void* primitive, size_t resourceType) { CHECK_EQ( dnnLayoutCreateFromPrimitive_F32(&mklLayout_, (dnnPrimitive_t)primitive, @@ -135,6 +137,7 @@ class MklShape { size_t GetDimension() const { return dimension_; } const size_t* GetSizes() const { return sizes_; } int64 dim_size(int index) const { return sizes_[index]; } + int64 tf_dim_size(int index) const { return sizes_[tf_to_mkl_dim_map_[index]]; } const size_t* GetStrides() const { return strides_; } const size_t* GetTfToMklDimMap() const { return tf_to_mkl_dim_map_; } size_t tf_dim_idx(int index) const { return tf_to_mkl_dim_map_[index]; } @@ -581,7 +584,7 @@ inline void CopyTfTensorInToOutWithShape(OpKernelContext* context, context->set_output(idx_data_out, output); } -inline void FowardTfTensorInToOut(OpKernelContext* context, +inline void ForwardTfTensorInToOut(OpKernelContext* context, int idx_in, int idx_out) { int num_inputs = context->num_inputs(); int num_outputs = context->num_outputs(); @@ -598,7 +601,7 @@ inline void FowardTfTensorInToOut(OpKernelContext* context, 
} } -inline void ForwarMklTensorInToOut(OpKernelContext* context, +inline void ForwardMklTensorInToOut(OpKernelContext* context, int idx_in, int idx_out) { int num_inputs = context->num_inputs(); int num_outputs = context->num_outputs(); @@ -616,6 +619,98 @@ inline void ForwarMklTensorInToOut(OpKernelContext* context, } } +// Forward the MKL shape ONLY (used in elementwise and other ops where +// we call the eigen implementation and MKL shape is not used) +inline void ForwardMklMetaDataInToOut(OpKernelContext* context, + uint idx_data_in, uint idx_data_out) { + uint idx_meta_in = GetTensorMetaDataIndex(idx_data_in, context->num_inputs()); + uint idx_meta_out = + GetTensorMetaDataIndex(idx_data_out, context->num_outputs()); + + if (IsRefType(context->input_dtype(idx_data_in))) { + context->forward_ref_input_to_ref_output(idx_meta_in, idx_meta_out); + } else { + context->set_output(idx_meta_out, context->input(idx_meta_in)); + } +} + +// Set a dummy MKL shape (called when the output is in TF format) +inline void SetDummyMklShapeOutput(OpKernelContext* context, + uint idx_data_out) { + MklShape mkl_shape_output; + mkl_shape_output.SetMklTensor(false); + AllocateOutputSetMklShape(context, idx_data_out, mkl_shape_output); +} + +// Checks if the TF shape for both MKL tensors is the same or not +// Returns: true if both TF shapes are the same, false otherwise +inline bool MklCompareShapes(const MklShape* input_shape_0, + const MklShape* input_shape_1) { + // Check for number of dimensions + if (input_shape_0->GetDimension() != input_shape_1->GetDimension()) { + return false; + } + + // Check size of each dimension + size_t ndims = input_shape_0->GetDimension(); + for (size_t i = 0; i < ndims; i++) { + if (input_shape_0->dim_size(i) != input_shape_1->dim_size(i)) { + return false; + } + } + + return true; +} + +// Checks if the TF shape for both tensors is the same or not +// Returns: true if TF shapes for both are the same, false otherwise +inline bool 
MklCompareShapes(const MklShape* input_shape_0, + const TensorShape* input_shape_1) { + // Check for number of dimensions + if (input_shape_0->GetDimension() != input_shape_1->dims()) { + return false; + } + + // Check size of each dimension + size_t ndims = input_shape_0->GetDimension(); + for (size_t i = 0; i < ndims; i++) { + if (input_shape_0->tf_dim_size(i) != input_shape_1->dim_size(i)) { + return false; + } + } + + return true; +} + +// Checks if the TF shape for both tensors is the same or not +// Returns: true if TF shapes for both are the same, false otherwise +inline bool MklCompareShapes(const TensorShape* input_shape_0, + const MklShape* input_shape_1) { + return MklCompareShapes(input_shape_1, input_shape_0); +} + +// Checks if the TF shape for both tensors is the same or not +// Returns: true if TF shapes for both are the same, false otherwise +inline bool MklCompareShapes(const TensorShape* input_shape_0, + const TensorShape* input_shape_1) { + // Check for number of dimensions + if (input_shape_0->dims() != input_shape_1->dims()) { + return false; + } + + // Check size of each dimension + size_t ndims = input_shape_0->dims(); + for (size_t i = 0; i < ndims; i++) { + if (input_shape_0->dim_size(i) != input_shape_1->dim_size(i)) { + return false; + } + } + + return true; +} + +// TODO(intel_tf): Remove this routine when faster MKL layout conversion is +// out. inline void MklNHWCToNCHW(const Tensor& input, Tensor** output) { const float* buf_in = input.flat<float>().data(); float* buf_out = (*output)->flat<float>().data(); @@ -652,11 +747,19 @@ namespace mkl_op_registry { static const char* kMklOpLabel = "MklOp"; static const char* kMklOpLabelPattern = "label='MklOp'"; +// Get the name of Mkl op from original TensorFlow op +// We prefix 'Mkl' to the original op to get Mkl op. +inline string GetMklOpName(const string& name) { + // Prefix that we add to Tensorflow op name to construct Mkl op name. 
+ const char* const kMklOpPrefix = "_Mkl"; + return string(kMklOpPrefix) + name; +} + // Check whether opname with type T is registered as MKL-compliant. // // @input: name of the op // @input: T datatype to be used for checking op -// @return: true if opname is registered as Mkl op +// @return: true if opname is registered as Mkl op; false otherwise static inline bool IsMklOp(const std::string& op_name, DataType T) { string kernel = KernelsRegisteredForOp(op_name); bool result = @@ -667,6 +770,28 @@ static inline bool IsMklOp(const std::string& op_name, DataType T) { return result; } +// Check whether opname with type T is registered as MKL-compliant and +// is element-wise. +// +// @input: name of the op +// @input: T datatype to be used for checking op +// @return: true if opname is registered as element-wise Mkl op; false otherwise +static inline bool IsMklElementWiseOp(const std::string& op_name, DataType T) { + if (!IsMklOp(op_name, T)) { + return false; + } + + bool result = (0 == op_name.compare(GetMklOpName("Add")) || + 0 == op_name.compare(GetMklOpName("Sub")) || + 0 == op_name.compare(GetMklOpName("Mul")) || + 0 == op_name.compare(GetMklOpName("Maximum")) || + 0 == op_name.compare(GetMklOpName("SquaredDifference"))); + + VLOG(1) << "mkl_op_registry::" << op_name + << " is elementwise MKL op: " << result; + return result; +} + } // namespace mkl_op_registry } // namespace tensorflow diff --git a/tensorflow/docs_src/about/bib.md b/tensorflow/docs_src/about/bib.md index 0c0e88c1fe..c9f0c532c6 100644 --- a/tensorflow/docs_src/about/bib.md +++ b/tensorflow/docs_src/about/bib.md @@ -37,7 +37,7 @@ system, we suggest you cite this whitepaper. 
<pre> @misc{tensorflow2015-whitepaper, title={ {TensorFlow}: Large-Scale Machine Learning on Heterogeneous Systems}, -url={http://tensorflow.org/}, +url={https://www.tensorflow.org/}, note={Software available from tensorflow.org}, author={ Mart\'{\i}n~Abadi and diff --git a/tensorflow/docs_src/extend/estimators.md b/tensorflow/docs_src/extend/estimators.md index 5265e5889b..5defade7ae 100644 --- a/tensorflow/docs_src/extend/estimators.md +++ b/tensorflow/docs_src/extend/estimators.md @@ -15,7 +15,7 @@ as regressors and classifiers: Construct a neural network regression model. * @{tf.estimator.DNNLinearCombinedClassifier}: Construct a neural network and linear combined classification model. -* @{tf.estimator.DNNRegressor}: +* @{tf.estimator.DNNLinearCombinedRegressor}: Construct a neural network and linear combined regression model. But what if none of `tf.estimator`'s predefined model types meets your needs? diff --git a/tensorflow/docs_src/get_started/get_started.md b/tensorflow/docs_src/get_started/get_started.md index 8eed9b5c5b..67fddfe809 100644 --- a/tensorflow/docs_src/get_started/get_started.md +++ b/tensorflow/docs_src/get_started/get_started.md @@ -447,7 +447,7 @@ estimator = tf.estimator.Estimator(model_fn=model_fn) x_train = np.array([1., 2., 3., 4.]) y_train = np.array([0., -1., -2., -3.]) x_eval = np.array([2., 5., 8., 1.]) -y_eval = np.array([-1.01, -4.1, -7, 0.]) +y_eval = np.array([-1.01, -4.1, -7., 0.]) input_fn = tf.estimator.inputs.numpy_input_fn( {"x": x_train}, y_train, batch_size=4, num_epochs=None, shuffle=True) train_input_fn = tf.estimator.inputs.numpy_input_fn( diff --git a/tensorflow/docs_src/install/install_c.md b/tensorflow/docs_src/install/install_c.md index 7ebf5c4a2c..04cd462848 100644 --- a/tensorflow/docs_src/install/install_c.md +++ b/tensorflow/docs_src/install/install_c.md @@ -35,7 +35,7 @@ enable TensorFlow for C: OS="linux" # Change to "darwin" for Mac OS TARGET_DIRECTORY="/usr/local" curl -L \ - 
"https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-${OS}-x86_64-1.3.0.tar.gz" | + "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-${OS}-x86_64-1.4.0-dev.tar.gz" | sudo tar -C $TARGET_DIRECTORY -xz The `tar` command extracts the TensorFlow C library into the `lib` diff --git a/tensorflow/docs_src/install/install_go.md b/tensorflow/docs_src/install/install_go.md index b991fd0f93..b7fa1fe39a 100644 --- a/tensorflow/docs_src/install/install_go.md +++ b/tensorflow/docs_src/install/install_go.md @@ -35,7 +35,7 @@ steps to install this library and enable TensorFlow for Go: TF_TYPE="cpu" # Change to "gpu" for GPU support TARGET_DIRECTORY='/usr/local' curl -L \ - "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.3.0.tar.gz" | + "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.4.0-dev.tar.gz" | sudo tar -C $TARGET_DIRECTORY -xz The `tar` command extracts the TensorFlow C library into the `lib` diff --git a/tensorflow/docs_src/install/install_java.md b/tensorflow/docs_src/install/install_java.md index 2adcd4da73..e1200dde12 100644 --- a/tensorflow/docs_src/install/install_java.md +++ b/tensorflow/docs_src/install/install_java.md @@ -34,7 +34,7 @@ following to the project's `pom.xml` to use the TensorFlow Java APIs: <dependency> <groupId>org.tensorflow</groupId> <artifactId>tensorflow</artifactId> - <version>1.3.0</version> + <version>1.4.0-dev</version> </dependency> ``` @@ -63,7 +63,7 @@ As an example, these steps will create a Maven project that uses TensorFlow: <dependency> <groupId>org.tensorflow</groupId> <artifactId>tensorflow</artifactId> - <version>1.3.0</version> + <version>1.4.0-dev</version> </dependency> </dependencies> </project> @@ -122,7 +122,7 @@ refer to the simpler instructions above instead. Take the following steps to install TensorFlow for Java on Linux or Mac OS: 1. 
Download - [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.3.0.jar), + [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.4.0-dev.jar), which is the TensorFlow Java Archive (JAR). 2. Decide whether you will run TensorFlow for Java on CPU(s) only or with @@ -141,7 +141,7 @@ Take the following steps to install TensorFlow for Java on Linux or Mac OS: OS=$(uname -s | tr '[:upper:]' '[:lower:]') mkdir -p ./jni curl -L \ - "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-${TF_TYPE}-${OS}-x86_64-1.3.0.tar.gz" | + "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-${TF_TYPE}-${OS}-x86_64-1.4.0-dev.tar.gz" | tar -xz -C ./jni ### Install on Windows @@ -149,10 +149,10 @@ Take the following steps to install TensorFlow for Java on Linux or Mac OS: Take the following steps to install TensorFlow for Java on Windows: 1. Download - [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.3.0.jar), + [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.4.0-dev.jar), which is the TensorFlow Java Archive (JAR). 2. Download the following Java Native Interface (JNI) file appropriate for - [TensorFlow for Java on Windows](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.3.0.zip). + [TensorFlow for Java on Windows](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.4.0-dev.zip). 3. Extract this .zip file. @@ -200,7 +200,7 @@ must be part of your `classpath`. 
For example, you can include the downloaded `.jar` in your `classpath` by using the `-cp` compilation flag as follows: -<pre><b>javac -cp libtensorflow-1.3.0.jar HelloTF.java</b></pre> +<pre><b>javac -cp libtensorflow-1.4.0-dev.jar HelloTF.java</b></pre> ### Running @@ -214,11 +214,11 @@ two files are available to the JVM: For example, the following command line executes the `HelloTF` program on Linux and Mac OS X: -<pre><b>java -cp libtensorflow-1.3.0.jar:. -Djava.library.path=./jni HelloTF</b></pre> +<pre><b>java -cp libtensorflow-1.4.0-dev.jar:. -Djava.library.path=./jni HelloTF</b></pre> And the following command line executes the `HelloTF` program on Windows: -<pre><b>java -cp libtensorflow-1.3.0.jar;. -Djava.library.path=jni HelloTF</b></pre> +<pre><b>java -cp libtensorflow-1.4.0-dev.jar;. -Djava.library.path=jni HelloTF</b></pre> If the program prints <tt>Hello from <i>version</i></tt>, you've successfully installed TensorFlow for Java and are ready to use the API. If the program diff --git a/tensorflow/docs_src/install/install_linux.md b/tensorflow/docs_src/install/install_linux.md index d5e481520c..b759797082 100644 --- a/tensorflow/docs_src/install/install_linux.md +++ b/tensorflow/docs_src/install/install_linux.md @@ -172,7 +172,7 @@ Take the following steps to install TensorFlow with Virtualenv: virtualenv environment: <pre>(tensorflow)$ <b>pip3 install --upgrade \ - https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.3.0-cp34-cp34m-linux_x86_64.whl</b></pre> + https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.4.0dev-cp34-cp34m-linux_x86_64.whl</b></pre> If you encounter installation problems, see [Common Installation Problems](#common_installation_problems). 
@@ -277,7 +277,7 @@ take the following steps: <pre> $ <b>sudo pip3 install --upgrade \ - https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.3.0-cp34-cp34m-linux_x86_64.whl</b> + https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.4.0dev-cp34-cp34m-linux_x86_64.whl</b> </pre> If this step fails, see @@ -464,7 +464,7 @@ Take the following steps to install TensorFlow in an Anaconda environment: <pre> (tensorflow)$ <b>pip install --ignore-installed --upgrade \ - https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.3.0-cp34-cp34m-linux_x86_64.whl</b></pre> + https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.4.0dev-cp34-cp34m-linux_x86_64.whl</b></pre> <a name="ValidateYourInstallation"></a> @@ -632,14 +632,14 @@ This section documents the relevant values for Linux installations. CPU only: <pre> -https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.3.0-cp27-none-linux_x86_64.whl +https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.4.0dev-cp27-none-linux_x86_64.whl </pre> GPU support: <pre> -https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.3.0-cp27-none-linux_x86_64.whl +https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.4.0dev-cp27-none-linux_x86_64.whl </pre> Note that GPU support requires the NVIDIA hardware and software described in @@ -651,14 +651,14 @@ Note that GPU support requires the NVIDIA hardware and software described in CPU only: <pre> -https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.3.0-cp34-cp34m-linux_x86_64.whl +https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.4.0dev-cp34-cp34m-linux_x86_64.whl </pre> GPU support: <pre> -https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.3.0-cp34-cp34m-linux_x86_64.whl +https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.4.0dev-cp34-cp34m-linux_x86_64.whl </pre> Note that GPU support requires the NVIDIA hardware and software described in 
@@ -670,14 +670,14 @@ Note that GPU support requires the NVIDIA hardware and software described in CPU only: <pre> -https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.3.0-cp35-cp35m-linux_x86_64.whl +https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.4.0dev-cp35-cp35m-linux_x86_64.whl </pre> GPU support: <pre> -https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.3.0-cp35-cp35m-linux_x86_64.whl +https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.4.0dev-cp35-cp35m-linux_x86_64.whl </pre> @@ -689,14 +689,14 @@ Note that GPU support requires the NVIDIA hardware and software described in CPU only: <pre> -https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.3.0-cp36-cp36m-linux_x86_64.whl +https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.4.0dev-cp36-cp36m-linux_x86_64.whl </pre> GPU support: <pre> -https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.3.0-cp36-cp36m-linux_x86_64.whl +https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.4.0dev-cp36-cp36m-linux_x86_64.whl </pre> diff --git a/tensorflow/docs_src/install/install_mac.md b/tensorflow/docs_src/install/install_mac.md index 6552bff459..448e300b17 100644 --- a/tensorflow/docs_src/install/install_mac.md +++ b/tensorflow/docs_src/install/install_mac.md @@ -109,7 +109,7 @@ Take the following steps to install TensorFlow with Virtualenv: TensorFlow in the active Virtualenv is as follows: <pre> $ <b>pip3 install --upgrade \ - https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.3.0-py2-none-any.whl</b></pre> + https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.4.0dev-py2-none-any.whl</b></pre> If you encounter installation problems, see [Common Installation Problems](#common-installation-problems). 
@@ -230,7 +230,7 @@ take the following steps: issue the following command: <pre> $ <b>sudo pip3 install --upgrade \ - https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.3.0-py2-none-any.whl</b> </pre> + https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.4.0dev-py2-none-any.whl</b> </pre> If the preceding command fails, see [installation problems](#common-installation-problems). @@ -339,7 +339,7 @@ Take the following steps to install TensorFlow in an Anaconda environment: TensorFlow for Python 2.7: <pre> (tensorflow)$ <b>pip install --ignore-installed --upgrade \ - https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.3.0-py2-none-any.whl</b></pre> + https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.4.0dev-py2-none-any.whl</b></pre> <a name="ValidateYourInstallation"></a> @@ -512,7 +512,7 @@ This section documents the relevant values for Mac OS installations. <pre> -https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.3.0-py2-none-any.whl +https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.4.0dev-py2-none-any.whl </pre> @@ -520,7 +520,7 @@ https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.3.0-py2-none-any. <pre> -https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.3.0-py3-none-any.whl +https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.4.0dev-py3-none-any.whl </pre> diff --git a/tensorflow/docs_src/install/install_sources.md b/tensorflow/docs_src/install/install_sources.md index d58db00a4c..d8925d3909 100644 --- a/tensorflow/docs_src/install/install_sources.md +++ b/tensorflow/docs_src/install/install_sources.md @@ -342,10 +342,10 @@ Invoke `pip install` to install that pip package. The filename of the `.whl` file depends on your platform. 
For example, the following command will install the pip package -for TensorFlow 1.3.0 on Linux: +for TensorFlow 1.4.0dev on Linux: <pre> -$ <b>sudo pip install /tmp/tensorflow_pkg/tensorflow-1.3.0-py2-none-any.whl</b> +$ <b>sudo pip install /tmp/tensorflow_pkg/tensorflow-1.4.0dev-py2-none-any.whl</b> </pre> ## Validate your installation diff --git a/tensorflow/docs_src/install/install_windows.md b/tensorflow/docs_src/install/install_windows.md index 3025c9971a..ae8749c231 100644 --- a/tensorflow/docs_src/install/install_windows.md +++ b/tensorflow/docs_src/install/install_windows.md @@ -153,6 +153,9 @@ TensorFlow}. If the system outputs an error message instead of a greeting, see [Common installation problems](#common_installation_problems). +There is also a helpful [script](https://gist.github.com/mrry/ee5dbcfdd045fa48a27d56664411d41c) +for Windows TensorFlow installation issues. + ## Common installation problems We are relying on Stack Overflow to document TensorFlow installation problems diff --git a/tensorflow/docs_src/programmers_guide/graphs.md b/tensorflow/docs_src/programmers_guide/graphs.md index 989018bc86..6ba8bb7a34 100644 --- a/tensorflow/docs_src/programmers_guide/graphs.md +++ b/tensorflow/docs_src/programmers_guide/graphs.md @@ -319,7 +319,7 @@ described below. * **`target`.** If this argument is left empty (the default), the session will only use devices in the local machine. However, you may also specify a `grpc://` URL to specify the address of a TensorFlow server, which gives the - session access to all devices on machines that that server controls. See + session access to all devices on machines that this server controls. See @{tf.train.Server} for details of how to create a TensorFlow server. 
For example, in the common **between-graph replication** configuration, the @{tf.Session} connects to a @{tf.train.Server} in the same diff --git a/tensorflow/docs_src/programmers_guide/tensors.md b/tensorflow/docs_src/programmers_guide/tensors.md index ff747f326f..cc4181e75e 100644 --- a/tensorflow/docs_src/programmers_guide/tensors.md +++ b/tensorflow/docs_src/programmers_guide/tensors.md @@ -147,7 +147,7 @@ Passing a single number, however, returns a subvector of a matrix, as follows: ```python -my_row_vetor = my_matrix[2] +my_row_vector = my_matrix[2] my_column_vector = my_matrix[:, 3] ``` diff --git a/tensorflow/docs_src/tutorials/layers.md b/tensorflow/docs_src/tutorials/layers.md index 0815cc2a17..8037c92c73 100644 --- a/tensorflow/docs_src/tutorials/layers.md +++ b/tensorflow/docs_src/tutorials/layers.md @@ -270,7 +270,7 @@ The `padding` argument specifies one of two enumerated values (case-insensitive): `valid` (default value) or `same`. To specify that the output tensor should have the same width and height values as the input tensor, we set `padding=same` here, which instructs TensorFlow to add 0 values to the -edges of the output tensor to preserve width and height of 28. (Without padding, +edges of the input tensor to preserve width and height of 28. (Without padding, a 5x5 convolution over a 28x28 tensor will produce a 24x24 tensor, as there are 24x24 locations to extract a 5x5 tile from a 28x28 grid.) diff --git a/tensorflow/examples/android/README.md b/tensorflow/examples/android/README.md index dda6e94f4b..bed8e21498 100644 --- a/tensorflow/examples/android/README.md +++ b/tensorflow/examples/android/README.md @@ -37,7 +37,7 @@ on API >= 14 devices. 4. [TF Speech](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/examples/android/src/org/tensorflow/demo/SpeechActivity.java): Runs a simple speech recognition model built by the [audio training - tutorial](https://www.tensorflow.org/tutorials/image_retraining). 
Listens + tutorial](https://www.tensorflow.org/versions/master/tutorials/audio_recognition). Listens for a small set of words, and highlights them in the UI when they are recognized. diff --git a/tensorflow/examples/android/src/org/tensorflow/demo/CameraActivity.java b/tensorflow/examples/android/src/org/tensorflow/demo/CameraActivity.java index 03294436b8..83cf9f0a2a 100644 --- a/tensorflow/examples/android/src/org/tensorflow/demo/CameraActivity.java +++ b/tensorflow/examples/android/src/org/tensorflow/demo/CameraActivity.java @@ -110,7 +110,7 @@ public abstract class CameraActivity extends Activity implements OnImageAvailabl rgbBytes = new int[previewWidth * previewHeight]; onPreviewSizeChosen(new Size(previewSize.width, previewSize.height), 90); } - ImageUtils.convertYUV420SPToARGB8888(bytes, rgbBytes, previewWidth, previewHeight, false); + ImageUtils.convertYUV420SPToARGB8888(bytes, previewWidth, previewHeight, rgbBytes); } catch (final Exception e) { LOGGER.e(e, "Exception!"); return; diff --git a/tensorflow/examples/android/src/org/tensorflow/demo/env/ImageUtils.java b/tensorflow/examples/android/src/org/tensorflow/demo/env/ImageUtils.java index 5f2ff9164c..5629f179c4 100644 --- a/tensorflow/examples/android/src/org/tensorflow/demo/env/ImageUtils.java +++ b/tensorflow/examples/android/src/org/tensorflow/demo/env/ImageUtils.java @@ -27,7 +27,7 @@ import java.io.FileOutputStream; public class ImageUtils { @SuppressWarnings("unused") private static final Logger LOGGER = new Logger(); - + static { try { System.loadLibrary("tensorflow_demo"); @@ -98,73 +98,105 @@ public class ImageUtils { // Always prefer the native implementation if available. 
private static boolean useNativeConversion = true; - public static void convertYUV420ToARGB8888( - byte[] yData, - byte[] uData, - byte[] vData, + public static void convertYUV420SPToARGB8888( + byte[] input, int width, int height, - int yRowStride, - int uvRowStride, - int uvPixelStride, - int[] out) { + int[] output) { if (useNativeConversion) { try { - convertYUV420ToARGB8888( - yData, uData, vData, out, width, height, yRowStride, uvRowStride, uvPixelStride, false); + ImageUtils.convertYUV420SPToARGB8888(input, output, width, height, false); return; } catch (UnsatisfiedLinkError e) { - LOGGER.w("Native YUV -> RGB implementation not found, falling back to Java implementation"); + LOGGER.w( + "Native YUV420SP -> RGB implementation not found, falling back to Java implementation"); useNativeConversion = false; } } - int i = 0; - for (int y = 0; y < height; y++) { - int pY = yRowStride * y; - int uv_row_start = uvRowStride * (y >> 1); - int pUV = uv_row_start; - int pV = uv_row_start; - - for (int x = 0; x < width; x++) { - int uv_offset = pUV + (x >> 1) * uvPixelStride; - out[i++] = - YUV2RGB( - convertByteToInt(yData, pY + x), - convertByteToInt(uData, uv_offset), - convertByteToInt(vData, uv_offset)); + // Java implementation of YUV420SP to ARGB8888 converting + final int frameSize = width * height; + for (int j = 0, yp = 0; j < height; j++) { + int uvp = frameSize + (j >> 1) * width; + int u = 0; + int v = 0; + + for (int i = 0; i < width; i++, yp++) { + int y = 0xff & input[yp]; + if ((i & 1) == 0) { + v = 0xff & input[uvp++]; + u = 0xff & input[uvp++]; + } + + output[yp] = YUV2RGB(y, u, v); } } } - private static int convertByteToInt(byte[] arr, int pos) { - return arr[pos] & 0xFF; - } - - private static int YUV2RGB(int nY, int nU, int nV) { - nY -= 16; - nU -= 128; - nV -= 128; - if (nY < 0) nY = 0; + private static int YUV2RGB(int y, int u, int v) { + // Adjust and check YUV values + y = (y - 16) < 0 ? 
0 : (y - 16); + u -= 128; + v -= 128; // This is the floating point equivalent. We do the conversion in integer // because some Android devices do not have floating point in hardware. // nR = (int)(1.164 * nY + 2.018 * nU); // nG = (int)(1.164 * nY - 0.813 * nV - 0.391 * nU); // nB = (int)(1.164 * nY + 1.596 * nV); + int y1192 = 1192 * y; + int r = (y1192 + 1634 * v); + int g = (y1192 - 833 * v - 400 * u); + int b = (y1192 + 2066 * u); + + // Clipping RGB values to be inside boundaries [ 0 , kMaxChannelValue ] + r = r > kMaxChannelValue ? kMaxChannelValue : (r < 0 ? 0 : r); + g = g > kMaxChannelValue ? kMaxChannelValue : (g < 0 ? 0 : g); + b = b > kMaxChannelValue ? kMaxChannelValue : (b < 0 ? 0 : b); + + return 0xff000000 | ((r << 6) & 0xff0000) | ((g >> 2) & 0xff00) | ((b >> 10) & 0xff); + } + + + public static void convertYUV420ToARGB8888( + byte[] yData, + byte[] uData, + byte[] vData, + int width, + int height, + int yRowStride, + int uvRowStride, + int uvPixelStride, + int[] out) { + if (useNativeConversion) { + try { + convertYUV420ToARGB8888( + yData, uData, vData, out, width, height, yRowStride, uvRowStride, uvPixelStride, false); + return; + } catch (UnsatisfiedLinkError e) { + LOGGER.w( + "Native YUV420 -> RGB implementation not found, falling back to Java implementation"); + useNativeConversion = false; + } + } - final int foo = 1192 * nY; - int nR = foo + 1634 * nV; - int nG = foo - 833 * nV - 400 * nU; - int nB = foo + 2066 * nU; + int yp = 0; + for (int j = 0; j < height; j++) { + int pY = yRowStride * j; + int pUV = uvRowStride * (j >> 1); - nR = Math.min(kMaxChannelValue, Math.max(0, nR)); - nG = Math.min(kMaxChannelValue, Math.max(0, nG)); - nB = Math.min(kMaxChannelValue, Math.max(0, nB)); + for (int i = 0; i < width; i++) { + int uv_offset = pUV + (i >> 1) * uvPixelStride; - return 0xff000000 | ((nR << 6) & 0x00ff0000) | ((nG >> 2) & 0x0000FF00) | ((nB >> 10) & 0xff); + out[yp++] = YUV2RGB( + 0xff & yData[pY + i], + 0xff & uData[uv_offset], + 
0xff & vData[uv_offset]); + } + } } + /** * Converts YUV420 semi-planar data to ARGB 8888 data using the supplied width and height. The * input and output must already be allocated and non-null. For efficiency, no error checking is @@ -176,7 +208,7 @@ public class ImageUtils { * @param height The height of the input image. * @param halfSize If true, downsample to 50% in each dimension, otherwise not. */ - public static native void convertYUV420SPToARGB8888( + private static native void convertYUV420SPToARGB8888( byte[] input, int[] output, int width, int height, boolean halfSize); /** @@ -193,7 +225,7 @@ public class ImageUtils { * @param halfSize If true, downsample to 50% in each dimension, otherwise not. * @param output A pre-allocated array for the ARGB 8:8:8:8 output data. */ - public static native void convertYUV420ToARGB8888( + private static native void convertYUV420ToARGB8888( byte[] y, byte[] u, byte[] v, @@ -215,7 +247,7 @@ public class ImageUtils { * @param width The width of the input image. * @param height The height of the input image. */ - public static native void convertYUV420SPToRGB565( + private static native void convertYUV420SPToRGB565( byte[] input, byte[] output, int width, int height); /** @@ -228,7 +260,7 @@ public class ImageUtils { * @param width The width of the input image. * @param height The height of the input image. */ - public static native void convertARGB8888ToYUV420SP( + private static native void convertARGB8888ToYUV420SP( int[] input, byte[] output, int width, int height); /** @@ -241,7 +273,7 @@ public class ImageUtils { * @param width The width of the input image. * @param height The height of the input image. 
*/ - public static native void convertRGB565ToYUV420SP( + private static native void convertRGB565ToYUV420SP( byte[] input, byte[] output, int width, int height); /** diff --git a/tensorflow/examples/ios/README.md b/tensorflow/examples/ios/README.md index 7974b8c879..7d2eb870be 100644 --- a/tensorflow/examples/ios/README.md +++ b/tensorflow/examples/ios/README.md @@ -30,7 +30,7 @@ cp ~/graphs/inception5h/* tensorflow/examples/ios/simple/data/ long time since it is big (~450MB). For example, if you want to run the simple example, then: ```bash -cd tensorflow/ios/simple +cd tensorflow/examples/ios/simple pod install open tf_simple_example.xcworkspace # obs, not the .xcodeproj directory ``` diff --git a/tensorflow/examples/speech_commands/train.py b/tensorflow/examples/speech_commands/train.py index 8298a90b44..c92c38b23c 100644 --- a/tensorflow/examples/speech_commands/train.py +++ b/tensorflow/examples/speech_commands/train.py @@ -15,9 +15,10 @@ r"""Simple speech recognition to spot a limited number of keywords. This is a self-contained example script that will train a very basic audio -recognition model in TensorFlow. It can download the necessary training data, -and runs with reasonable defaults to train within a few hours even only using a -CPU. For more information see http://tensorflow.org/tutorials/audio_recognition. +recognition model in TensorFlow. It downloads the necessary training data and +runs with reasonable defaults to train within a few hours even only using a CPU. +For more information, please see +https://www.tensorflow.org/tutorials/audio_recognition. It is intended as an introduction to using neural networks for audio recognition, and is not a full speech recognition system. 
For more advanced diff --git a/tensorflow/java/src/main/java/org/tensorflow/Tensor.java b/tensorflow/java/src/main/java/org/tensorflow/Tensor.java index ffaa242a31..4424100390 100644 --- a/tensorflow/java/src/main/java/org/tensorflow/Tensor.java +++ b/tensorflow/java/src/main/java/org/tensorflow/Tensor.java @@ -515,8 +515,6 @@ public final class Tensor implements AutoCloseable { private static int elemByteSize(DataType dataType) { switch (dataType) { - case UINT8: - return 1; case FLOAT: case INT32: return 4; @@ -524,6 +522,7 @@ public final class Tensor implements AutoCloseable { case INT64: return 8; case BOOL: + case UINT8: return 1; case STRING: throw new IllegalArgumentException("STRING tensors do not have a fixed element size"); diff --git a/tensorflow/java/src/main/native/tensor_jni.cc b/tensorflow/java/src/main/native/tensor_jni.cc index 7bfe6c896d..745abec244 100644 --- a/tensorflow/java/src/main/native/tensor_jni.cc +++ b/tensorflow/java/src/main/native/tensor_jni.cc @@ -41,8 +41,11 @@ size_t elemByteSize(TF_DataType dtype) { // have the same byte sizes. 
Validate that: switch (dtype) { case TF_BOOL: + case TF_UINT8: static_assert(sizeof(jboolean) == 1, "Java boolean not compatible with TF_BOOL"); + static_assert(sizeof(jbyte) == 1, + "Java byte not compatible with TF_UINT8"); return 1; case TF_FLOAT: case TF_INT32: @@ -90,6 +93,7 @@ void writeScalar(JNIEnv* env, jobject src, TF_DataType dtype, void* dst, CASE(TF_DOUBLE, jdouble, "doubleValue", "()D", Double); CASE(TF_INT32, jint, "intValue", "()I", Int); CASE(TF_INT64, jlong, "longValue", "()J", Long); + CASE(TF_UINT8, jbyte, "byteValue", "()B", Byte); #undef CASE case TF_BOOL: { jclass clazz = env->FindClass("java/lang/Boolean"); @@ -134,6 +138,7 @@ size_t write1DArray(JNIEnv* env, jarray array, TF_DataType dtype, void* dst, CASE(TF_INT32, jint, Int); CASE(TF_INT64, jlong, Long); CASE(TF_BOOL, jboolean, Boolean); + CASE(TF_UINT8, jbyte, Byte); #undef CASE default: throwException(env, kIllegalStateException, "invalid DataType(%d)", @@ -168,6 +173,7 @@ size_t read1DArray(JNIEnv* env, TF_DataType dtype, const void* src, CASE(TF_INT32, jint, Int); CASE(TF_INT64, jlong, Long); CASE(TF_BOOL, jboolean, Boolean); + CASE(TF_UINT8, jbyte, Byte); #undef CASE default: throwException(env, kIllegalStateException, "invalid DataType(%d)", diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD index 9c0db82cbf..524f128154 100644 --- a/tensorflow/python/BUILD +++ b/tensorflow/python/BUILD @@ -3785,6 +3785,7 @@ py_library( "layers/convolutional.py", "layers/core.py", "layers/layers.py", + "layers/maxout.py", "layers/normalization.py", "layers/pooling.py", ], @@ -3866,6 +3867,22 @@ py_test( ) py_test( + name = "layers_maxout_test", + size = "small", + srcs = ["layers/maxout_test.py"], + main = "layers/maxout_test.py", + srcs_version = "PY2AND3", + deps = [ + ":client_testlib", + ":framework_for_generated_wrappers", + ":layers", + ":math_ops", + ":nn_ops", + ":random_ops", + ], +) + +py_test( name = "layers_utils_test", size = "small", srcs = ["layers/utils_test.py"], diff 
--git a/tensorflow/python/kernel_tests/decode_raw_op_test.py b/tensorflow/python/kernel_tests/decode_raw_op_test.py index e986b7ff2b..009f3ea4b3 100644 --- a/tensorflow/python/kernel_tests/decode_raw_op_test.py +++ b/tensorflow/python/kernel_tests/decode_raw_op_test.py @@ -93,6 +93,22 @@ class DecodeRawOpTest(test.TestCase): result = decode.eval(feed_dict={in_bytes: [""]}) self.assertEqual(len(result), 1) + def testToUInt16(self): + with self.test_session(): + in_bytes = array_ops.placeholder(dtypes.string, shape=[None]) + decode = parsing_ops.decode_raw(in_bytes, out_type=dtypes.uint16) + self.assertEqual([None, None], decode.get_shape().as_list()) + + # Use FF/EE/DD/CC so that decoded value is higher than 32768 for uint16 + result = decode.eval(feed_dict={in_bytes: [b"\xFF\xEE\xDD\xCC"]}) + self.assertAllEqual( + [[0xFF + 0xEE * 256, 0xDD + 0xCC * 256]], result) + + with self.assertRaisesOpError( + "Input to DecodeRaw has length 3 that is not a multiple of 2, the " + "size of uint16"): + decode.eval(feed_dict={in_bytes: ["123", "456"]}) + if __name__ == "__main__": test.main() diff --git a/tensorflow/python/kernel_tests/metrics_test.py b/tensorflow/python/kernel_tests/metrics_test.py index 543039bdd3..cce705110c 100644 --- a/tensorflow/python/kernel_tests/metrics_test.py +++ b/tensorflow/python/kernel_tests/metrics_test.py @@ -3538,7 +3538,7 @@ class MeanPerClassAccuracyTest(test.TestCase): weights_queue = data_flow_ops.FIFOQueue( 6, dtypes=dtypes_lib.float32, shapes=(1, 1)) _enqueue_vector(sess, weights_queue, [1.0]) - _enqueue_vector(sess, weights_queue, [1.0]) + _enqueue_vector(sess, weights_queue, [0.5]) _enqueue_vector(sess, weights_queue, [1.0]) _enqueue_vector(sess, weights_queue, [0.0]) _enqueue_vector(sess, weights_queue, [1.0]) @@ -3551,7 +3551,7 @@ class MeanPerClassAccuracyTest(test.TestCase): variables.local_variables_initializer().run() for _ in range(6): sess.run(update_op) - desired_output = np.mean([2.0 / 2.0, 1.0 / 2.0]) + desired_output = 
np.mean([2.0 / 2.0, 0.5 / 1.5]) self.assertAlmostEqual(desired_output, mean_accuracy.eval()) def testMultipleUpdatesWithMissingClass(self): diff --git a/tensorflow/python/kernel_tests/segment_reduction_ops_test.py b/tensorflow/python/kernel_tests/segment_reduction_ops_test.py index bf20f5d1a9..516a9d000e 100644 --- a/tensorflow/python/kernel_tests/segment_reduction_ops_test.py +++ b/tensorflow/python/kernel_tests/segment_reduction_ops_test.py @@ -645,7 +645,6 @@ class SparseSegmentReductionOpTest(SparseSegmentReductionHelper): with self.assertRaisesOpError(r"Segment id 0 out of range \[0, 0\)"): s.eval() - class SegmentReductionOpBenchmark(test.Benchmark): outer_dim_options = [2**x for x in range(9, 14, 2)] ratio_options = [2**x for x in range(1, 6, 2)] diff --git a/tensorflow/python/kernel_tests/svd_op_test.py b/tensorflow/python/kernel_tests/svd_op_test.py index fd49e1a6cc..32a623e74a 100644 --- a/tensorflow/python/kernel_tests/svd_op_test.py +++ b/tensorflow/python/kernel_tests/svd_op_test.py @@ -41,23 +41,19 @@ class SvdOpTest(test.TestCase): linalg_ops.svd(vector) -def _GetSvdOpTest(dtype_, shape_, use_static_shape_): +def _GetSvdOpTest(dtype_, shape_, use_static_shape_, use_gpu_): is_complex = dtype_ in (np.complex64, np.complex128) is_single = dtype_ in (np.float32, np.complex64) + + # The gpu version returns results that are much less precise + precision_factor = 100 if use_gpu_ else 1 + tol = precision_factor * (3e-4 if is_single else 1e-12) def CompareSingularValues(self, x, y): - if is_single: - tol = 5e-5 - else: - tol = 1e-14 self.assertAllClose(x, y, atol=(x[0] + y[0]) * tol) def CompareSingularVectors(self, x, y, rank): - if is_single: - atol = 5e-4 - else: - atol = 5e-14 # We only compare the first 'rank' singular vectors since the # remainder form an arbitrary orthonormal basis for the # (row- or column-) null space, whose exact value depends on @@ -72,13 +68,9 @@ def _GetSvdOpTest(dtype_, shape_, use_static_shape_): sum_of_ratios = 
np.sum(np.divide(y, x), -2, keepdims=True) phases = np.divide(sum_of_ratios, np.abs(sum_of_ratios)) x *= phases - self.assertAllClose(x, y, atol=atol) + self.assertAllClose(x, y, atol=2 * tol) def CheckApproximation(self, a, u, s, v, full_matrices): - if is_single: - tol = 1e-5 - else: - tol = 1e-14 # Tests that a ~= u*diag(s)*transpose(v). batch_shape = a.shape[:-2] m = a.shape[-2] @@ -99,10 +91,6 @@ def _GetSvdOpTest(dtype_, shape_, use_static_shape_): # Tests that x[...,:,:]^H * x[...,:,:] is close to the identity. xx = math_ops.matmul(x, x, adjoint_a=True) identity = array_ops.matrix_band_part(array_ops.ones_like(xx), 0, 0) - if is_single: - tol = 1e-5 - else: - tol = 1e-14 self.assertAllClose(identity.eval(), xx.eval(), atol=tol) def Test(self): @@ -116,7 +104,7 @@ def _GetSvdOpTest(dtype_, shape_, use_static_shape_): for compute_uv in False, True: for full_matrices in False, True: - with self.test_session() as sess: + with self.test_session(use_gpu = use_gpu_) as sess: if use_static_shape_: x_tf = constant_op.constant(x_np) else: @@ -167,14 +155,15 @@ def _GetSvdOpTest(dtype_, shape_, use_static_shape_): if __name__ == "__main__": - for dtype in np.float32, np.float64, np.complex64, np.complex128: - for rows in 1, 2, 5, 10, 32, 100: - for cols in 1, 2, 5, 10, 32, 100: - for batch_dims in [(), (3,)] + [(3, 2)] * (max(rows, cols) < 10): - shape = batch_dims + (rows, cols) - for use_static_shape in True, False: - name = "%s_%s_%s" % (dtype.__name__, "_".join(map(str, shape)), - use_static_shape) - setattr(SvdOpTest, "testSvd_" + name, - _GetSvdOpTest(dtype, shape, use_static_shape)) + for use_gpu in False, True: + for dtype in np.float32, np.float64, np.complex64, np.complex128: + for rows in 1, 2, 5, 10, 32, 100: + for cols in 1, 2, 5, 10, 32, 100: + for batch_dims in [(), (3,)] + [(3, 2)] * (max(rows, cols) < 10): + shape = batch_dims + (rows, cols) + for use_static_shape in True, False: + name = "%s_%s_%s_%s" % (dtype.__name__, "_".join(map(str, shape)), + 
use_static_shape, use_gpu) + setattr(SvdOpTest, "testSvd_" + name, + _GetSvdOpTest(dtype, shape, use_static_shape, use_gpu)) test.main() diff --git a/tensorflow/python/layers/base.py b/tensorflow/python/layers/base.py index 43e4bb0ee2..3db5e4754a 100644 --- a/tensorflow/python/layers/base.py +++ b/tensorflow/python/layers/base.py @@ -25,24 +25,20 @@ from __future__ import print_function import collections import copy -import functools import re import weakref -from six.moves import xrange # pylint: disable=redefined-builtin import numpy as np -import six - from tensorflow.python.eager import context from tensorflow.python.estimator import util as estimator_util -from tensorflow.python.framework import ops from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_shape from tensorflow.python.ops import array_ops -from tensorflow.python.ops import variables as tf_variables from tensorflow.python.ops import variable_scope as vs -from tensorflow.python.util import nest +from tensorflow.python.ops import variables as tf_variables from tensorflow.python.platform import tf_logging as logging +from tensorflow.python.util import nest class Layer(object): diff --git a/tensorflow/python/layers/convolutional.py b/tensorflow/python/layers/convolutional.py index 41c67743b6..9dec3b5a47 100644 --- a/tensorflow/python/layers/convolutional.py +++ b/tensorflow/python/layers/convolutional.py @@ -20,23 +20,13 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -import six -from six.moves import xrange # pylint: disable=redefined-builtin -import numpy as np - from tensorflow.python.eager import context -from tensorflow.python.framework import ops -from tensorflow.python.ops import array_ops -from tensorflow.python.ops import control_flow_ops -from tensorflow.python.ops import nn -from tensorflow.python.ops import math_ops -from 
tensorflow.python.ops import init_ops -from tensorflow.python.ops import standard_ops -from tensorflow.python.ops import variable_scope as vs from tensorflow.python.framework import tensor_shape from tensorflow.python.layers import base from tensorflow.python.layers import utils -from tensorflow.python import framework +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import init_ops +from tensorflow.python.ops import nn class _Conv(base.Layer): diff --git a/tensorflow/python/layers/core.py b/tensorflow/python/layers/core.py index 3570c003ef..4eecf9c9a1 100644 --- a/tensorflow/python/layers/core.py +++ b/tensorflow/python/layers/core.py @@ -22,6 +22,7 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function + import six from six.moves import xrange # pylint: disable=redefined-builtin import numpy as np @@ -29,15 +30,13 @@ import numpy as np from tensorflow.python.eager import context from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_shape +from tensorflow.python.layers import base +from tensorflow.python.layers import utils from tensorflow.python.ops import array_ops from tensorflow.python.ops import init_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import nn from tensorflow.python.ops import standard_ops -from tensorflow.python.ops import variable_scope as vs - -from tensorflow.python.layers import base -from tensorflow.python.layers import utils class Dense(base.Layer): diff --git a/tensorflow/python/layers/maxout.py b/tensorflow/python/layers/maxout.py new file mode 100644 index 0000000000..1ea36dbf6a --- /dev/null +++ b/tensorflow/python/layers/maxout.py @@ -0,0 +1,108 @@ +# Copyright 2015 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +# pylint: disable=unused-import,g-bad-import-order +"""Contains the maxout layer +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.python.framework import ops +from tensorflow.python.ops import math_ops +from tensorflow.python.ops import gen_array_ops + +from tensorflow.python.layers import base + + +def maxout(inputs, num_units, axis=-1, name=None): + """Adds a maxout op from https://arxiv.org/abs/1302.4389 + + "Maxout Networks" Ian J. Goodfellow, David Warde-Farley, Mehdi Mirza, Aaron Courville, + Yoshua Bengio + + Usually the operation is performed in the filter/channel dimension. This can also be + used after fully-connected layers to reduce number of features. + + Arguments: + inputs: Tensor input + num_units: Specifies how many features will remain after maxout in the `axis` dimension + (usually channel). This must be multiple of number of `axis`. + axis: The dimension where max pooling will be performed. Default is the + last dimension. + name: Optional scope for name_scope. + + Returns: + A `Tensor` representing the results of the pooling operation. + + Raises: + ValueError: if num_units is not multiple of number of features. + """ + return MaxOut(num_units=num_units, axis=axis, name=name)(inputs) + + +class MaxOut(base.Layer): + """Adds a maxout op from https://arxiv.org/abs/1302.4389 + + "Maxout Networks" Ian J. 
Goodfellow, David Warde-Farley, Mehdi Mirza, Aaron Courville, Yoshua + Bengio + + Usually the operation is performed in the filter/channel dimension. This can also be + used after fully-connected layers to reduce number of features. + + Arguments: + inputs: Tensor input + num_units: Specifies how many features will remain after maxout in the `axis` dimension + (usually channel). + This must be multiple of number of `axis`. + axis: The dimension where max pooling will be performed. Default is the + last dimension. + name: Optional scope for name_scope. + + Returns: + A `Tensor` representing the results of the pooling operation. + + Raises: + ValueError: if num_units is not multiple of number of features. + """ + + def __init__(self, + num_units, + axis=-1, + name=None, + **kwargs): + super(MaxOut, self).__init__( + name=name, trainable=False, **kwargs) + self.axis = axis + self.num_units = num_units + + def call(self, inputs): + inputs = ops.convert_to_tensor(inputs) + shape = inputs.get_shape().as_list() + num_channels = shape[self.axis] + if num_channels % self.num_units: + raise ValueError('number of features({}) is not ' + 'a multiple of num_units({})' + .format(num_channels, self.num_units)) + shape[self.axis] = -1 + shape += [num_channels // self.num_units] + + # Dealing with batches with arbitrary sizes + for i in range(len(shape)): + if shape[i] is None: + shape[i] = gen_array_ops.shape(inputs)[i] + outputs = math_ops.reduce_max(gen_array_ops.reshape(inputs, shape), -1, keep_dims=False) + + return outputs diff --git a/tensorflow/python/layers/maxout_test.py b/tensorflow/python/layers/maxout_test.py new file mode 100644 index 0000000000..26acac57c4 --- /dev/null +++ b/tensorflow/python/layers/maxout_test.py @@ -0,0 +1,61 @@ +# Copyright 2015 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +# pylint: disable=unused-import,g-bad-import-order + + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.python.layers import maxout +from tensorflow.python.layers import convolutional as conv_layers +from tensorflow.python.layers import core as core_layers + +from tensorflow.python.ops import random_ops +from tensorflow.python.platform import test +import numpy as np + +""" +Contains the maxout layer tests +""" + + +class MaxOutTest(test.TestCase): + def test_simple(self): + inputs = random_ops.random_uniform((64, 10, 36), seed=1) + graph = maxout.maxout(inputs, num_units=3) + self.assertEqual(graph.get_shape().as_list(), [64, 10, 3]) + + def test_fully_connected(self): + inputs = random_ops.random_uniform((64, 50), seed=1) + graph = core_layers.dense(inputs, 50) + graph = maxout.maxout(graph, num_units=10) + self.assertEqual(graph.get_shape().as_list(), [64, 10]) + + def test_nchw(self): + inputs = random_ops.random_uniform((10, 100, 100, 3), seed=1) + graph = conv_layers.conv2d(inputs, 10, 3, padding="SAME") + graph = maxout.maxout(graph, num_units=1) + self.assertEqual(graph.get_shape().as_list(), [10, 100, 100, 1]) + + def test_invalid_shape(self): + inputs = random_ops.random_uniform((10, 100, 100, 3), seed=1) + graph = conv_layers.conv2d(inputs, 3, 10, strides=(1, 1)) + with self.assertRaisesRegexp(ValueError, 'number of features'): + graph = maxout.maxout(graph, num_units=2) + +if 
__name__ == '__main__': + test.main() diff --git a/tensorflow/python/layers/normalization.py b/tensorflow/python/layers/normalization.py index 222817cd3a..3bd9a0f491 100644 --- a/tensorflow/python/layers/normalization.py +++ b/tensorflow/python/layers/normalization.py @@ -26,24 +26,18 @@ import numpy as np from tensorflow.python.eager import context from tensorflow.python.framework import constant_op -from tensorflow.python.framework import dtypes -from tensorflow.python.framework import tensor_shape from tensorflow.python.framework import ops +from tensorflow.python.framework import tensor_shape +from tensorflow.python.layers import base +from tensorflow.python.layers import utils from tensorflow.python.ops import array_ops from tensorflow.python.ops import nn from tensorflow.python.ops import gen_resource_variable_ops from tensorflow.python.ops import resource_variable_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import init_ops -from tensorflow.python.ops import standard_ops from tensorflow.python.ops import state_ops -from tensorflow.python.ops import variable_scope as vs from tensorflow.python.training import moving_averages -from tensorflow.python.framework import tensor_util -from tensorflow.python.ops import variables - -from tensorflow.python.layers import base -from tensorflow.python.layers import utils class BatchNormalization(base.Layer): diff --git a/tensorflow/python/layers/pooling.py b/tensorflow/python/layers/pooling.py index e903afa0a8..6245ec5054 100644 --- a/tensorflow/python/layers/pooling.py +++ b/tensorflow/python/layers/pooling.py @@ -20,21 +20,11 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -import six -from six.moves import xrange # pylint: disable=redefined-builtin -import numpy as np - -from tensorflow.python.framework import ops -from tensorflow.python.ops import array_ops -from tensorflow.python.ops import control_flow_ops -from 
tensorflow.python.ops import nn -from tensorflow.python.ops import init_ops -from tensorflow.python.ops import standard_ops -from tensorflow.python.ops import variable_scope as vs from tensorflow.python.framework import tensor_shape from tensorflow.python.layers import base from tensorflow.python.layers import utils -from tensorflow.python import framework +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import nn class _Pooling1D(base.Layer): diff --git a/tensorflow/python/layers/utils.py b/tensorflow/python/layers/utils.py index 98c287e63e..7c71d3c952 100644 --- a/tensorflow/python/layers/utils.py +++ b/tensorflow/python/layers/utils.py @@ -20,13 +20,8 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -import six -from six.moves import xrange # pylint: disable=redefined-builtin -import numpy as np - from tensorflow.python.ops import variables from tensorflow.python.ops import control_flow_ops -from tensorflow.python.ops import math_ops from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_util diff --git a/tensorflow/python/ops/metrics_impl.py b/tensorflow/python/ops/metrics_impl.py index eb0b08c5fd..bfacf151e7 100644 --- a/tensorflow/python/ops/metrics_impl.py +++ b/tensorflow/python/ops/metrics_impl.py @@ -259,11 +259,10 @@ def _streaming_confusion_matrix(labels, predictions, num_classes, weights=None): update_op: An operation that increments the confusion matrix. """ # Local variable to accumulate the predictions in the confusion matrix. - cm_dtype = dtypes.int64 if weights is not None else dtypes.float64 total_cm = _create_local( 'total_confusion_matrix', shape=[num_classes, num_classes], - dtype=cm_dtype) + dtype=dtypes.float64) # Cast the type to int64 required by confusion_matrix_ops. 
predictions = math_ops.to_int64(predictions) @@ -282,7 +281,7 @@ def _streaming_confusion_matrix(labels, predictions, num_classes, weights=None): # Accumulate the prediction to current confusion matrix. current_cm = confusion_matrix.confusion_matrix( - labels, predictions, num_classes, weights=weights, dtype=cm_dtype) + labels, predictions, num_classes, weights=weights, dtype=dtypes.float64) update_op = state_ops.assign_add(total_cm, current_cm) return total_cm, update_op diff --git a/tensorflow/python/training/moving_averages.py b/tensorflow/python/training/moving_averages.py index 0060b58bd7..eb07343850 100644 --- a/tensorflow/python/training/moving_averages.py +++ b/tensorflow/python/training/moving_averages.py @@ -278,14 +278,12 @@ class ExponentialMovingAverage(object): # Create an ExponentialMovingAverage object ema = tf.train.ExponentialMovingAverage(decay=0.9999) - # Create the shadow variables, and add ops to maintain moving averages - # of var0 and var1. - maintain_averages_op = ema.apply([var0, var1]) - - # Create an op that will update the moving averages after each training - # step. This is what we will use in place of the usual training op. with tf.control_dependencies([opt_op]): - training_op = tf.group(maintain_averages_op) + # Create the shadow variables, and add ops to maintain moving averages + # of var0 and var1. This also creates an op that will update the moving + # averages after each training step. This is what we will use in place + # of the usual training op. + training_op = ema.apply([var0, var1]) ...train the model by running training_op... 
``` diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl index 5c156e7ee2..54649dab01 100644 --- a/tensorflow/tensorflow.bzl +++ b/tensorflow/tensorflow.bzl @@ -980,6 +980,7 @@ check_deps = rule( def tf_custom_op_library(name, srcs=[], gpu_srcs=[], deps=[]): cuda_deps = [ clean_dep("//tensorflow/core:stream_executor_headers_lib"), + "@local_config_cuda//cuda:cuda_headers", "@local_config_cuda//cuda:cudart_static", ] deps = deps + tf_custom_op_library_additional_deps() diff --git a/tensorflow/tf_exported_symbols.lds b/tensorflow/tf_exported_symbols.lds index 850f0edd94..bddb87f00c 100644 --- a/tensorflow/tf_exported_symbols.lds +++ b/tensorflow/tf_exported_symbols.lds @@ -1,6 +1,6 @@ *tensorflow* *perftools*gputools* *tf_* -TF_* -TFE_* +*TF_* +*TFE_* *nsync_* diff --git a/tensorflow/tf_version_script.lds b/tensorflow/tf_version_script.lds index 73d4c0cae4..11f66c5c8b 100644 --- a/tensorflow/tf_version_script.lds +++ b/tensorflow/tf_version_script.lds @@ -2,8 +2,8 @@ tensorflow { global: *tensorflow*; *perftools*gputools*; - TF_*; - TFE_*; + *TF_*; + *TFE_*; *nsync_*; local: *; diff --git a/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh b/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh index 61f5ed084c..f6e3d2e6c7 100644 --- a/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh +++ b/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh @@ -60,8 +60,11 @@ reinstall_tensorflow_pip ${PIP_NAME} # Define no_tensorflow_py_deps=true so that every py_test has no deps anymore, # which will result testing system installed tensorflow +# TODO(pcloudy): Remove TF_SAVER_LENIENT_NAMES after +# https://github.com/tensorflow/tensorflow/issues/12844 is fixed. 
bazel test -c opt $BUILD_OPTS -k --test_output=errors \ --define=no_tensorflow_py_deps=true --test_lang_filters=py \ --test_tag_filters=-no_pip,-no_windows \ --build_tag_filters=-no_pip,-no_windows --build_tests_only \ + --test_env=TF_SAVER_LENIENT_NAMES=True \ //${PY_TEST_DIR}/tensorflow/python/... diff --git a/tensorflow/tools/ci_build/windows/gpu/pip/build_tf_windows.sh b/tensorflow/tools/ci_build/windows/gpu/pip/build_tf_windows.sh index e1972a3100..25d327c818 100644 --- a/tensorflow/tools/ci_build/windows/gpu/pip/build_tf_windows.sh +++ b/tensorflow/tools/ci_build/windows/gpu/pip/build_tf_windows.sh @@ -61,8 +61,11 @@ reinstall_tensorflow_pip ${PIP_NAME} # Define no_tensorflow_py_deps=true so that every py_test has no deps anymore, # which will result testing system installed tensorflow # GPU tests are very flaky when running concurrently, so set local_test_jobs=1 +# TODO(pcloudy): Remove TF_SAVER_LENIENT_NAMES after +# https://github.com/tensorflow/tensorflow/issues/12844 is fixed. bazel test -c opt $BUILD_OPTS -k --test_output=errors \ --define=no_tensorflow_py_deps=true --test_lang_filters=py \ --test_tag_filters=-no_pip,-no_windows,-no_windows_gpu,-no_gpu,-no_pip_gpu \ --build_tag_filters=-no_pip,-no_windows,-no_windows_gpu,-no_gpu,-no_pip_gpu \ + --test_env=TF_SAVER_LENIENT_NAMES=True \ --local_test_jobs=1 --build_tests_only //${PY_TEST_DIR}/tensorflow/python/... 
diff --git a/tensorflow/tools/docker/Dockerfile.devel b/tensorflow/tools/docker/Dockerfile.devel index 1b97c0d108..4cfaf68ef3 100644 --- a/tensorflow/tools/docker/Dockerfile.devel +++ b/tensorflow/tools/docker/Dockerfile.devel @@ -72,7 +72,7 @@ RUN mkdir /bazel && \ RUN git clone https://github.com/tensorflow/tensorflow.git && \ cd tensorflow && \ - git checkout r1.3 + git checkout r1.4 WORKDIR /tensorflow # TODO(craigcitro): Don't install the pip package, since it makes it diff --git a/tensorflow/tools/docker/Dockerfile.devel-gpu b/tensorflow/tools/docker/Dockerfile.devel-gpu index 80b45ae704..8d7e759bb2 100644 --- a/tensorflow/tools/docker/Dockerfile.devel-gpu +++ b/tensorflow/tools/docker/Dockerfile.devel-gpu @@ -73,7 +73,7 @@ RUN mkdir /bazel && \ RUN git clone https://github.com/tensorflow/tensorflow.git && \ cd tensorflow && \ - git checkout r1.3 + git checkout r1.4 WORKDIR /tensorflow # Configure the build for our CUDA configuration. diff --git a/tensorflow/tools/pip_package/build_pip_package.sh b/tensorflow/tools/pip_package/build_pip_package.sh index f48fdcc9ec..cbf06a97d0 100755 --- a/tensorflow/tools/pip_package/build_pip_package.sh +++ b/tensorflow/tools/pip_package/build_pip_package.sh @@ -98,6 +98,7 @@ function main() { "${TMPDIR}/external" RUNFILES=bazel-bin/tensorflow/tools/pip_package/simple_console_for_window_unzip/runfiles/org_tensorflow else + RUNFILES=bazel-bin/tensorflow/tools/pip_package/build_pip_package.runfiles/org_tensorflow if [ -d bazel-bin/tensorflow/tools/pip_package/build_pip_package.runfiles/org_tensorflow/external ]; then # Old-style runfiles structure (--legacy_external_runfiles). 
cp -R \ @@ -108,12 +109,12 @@ function main() { bazel-bin/tensorflow/tools/pip_package/build_pip_package.runfiles/org_tensorflow/external \ "${TMPDIR}/external" # Copy MKL libs over so they can be loaded at runtime - so_lib_dir="bazel-bin/tensorflow/tools/pip_package/build_pip_package.runfiles/org_tensorflow/_solib_k8" - if [ -d ${so_lib_dir} ]; then - mkl_so_dir=$(ls ${so_lib_dir} | grep mkl) - if [ $? -eq 0 ]; then - mkdir "${TMPDIR}/_solib_k8" - cp -R ${so_lib_dir}/${mkl_so_dir} "${TMPDIR}/_solib_k8" + so_lib_dir=$(ls $RUNFILES | grep solib) || true + if [ -n "${so_lib_dir}" ]; then + mkl_so_dir=$(ls ${RUNFILES}/${so_lib_dir} | grep mkl) || true + if [ -n "${mkl_so_dir}" ]; then + mkdir "${TMPDIR}/${so_lib_dir}" + cp -R ${RUNFILES}/${so_lib_dir}/${mkl_so_dir} "${TMPDIR}/${so_lib_dir}" fi fi else @@ -127,16 +128,15 @@ function main() { bazel-bin/tensorflow/tools/pip_package/build_pip_package.runfiles \ "${TMPDIR}/external" # Copy MKL libs over so they can be loaded at runtime - so_lib_dir="bazel-bin/tensorflow/tools/pip_package/build_pip_package.runfiles/org_tensorflow/_solib_k8" - if [ -d ${so_lib_dir} ]; then - mkl_so_dir=$(ls ${so_lib_dir} | grep mkl) - if [ $? -eq 0 ]; then - mkdir "${TMPDIR}/_solib_k8" - cp -R ${so_lib_dir}/${mkl_so_dir} "${TMPDIR}/_solib_k8" + so_lib_dir=$(ls $RUNFILES | grep solib) || true + if [ -n "${so_lib_dir}" ]; then + mkl_so_dir=$(ls ${RUNFILES}/${so_lib_dir} | grep mkl) || true + if [ -n "${mkl_so_dir}" ]; then + mkdir "${TMPDIR}/${so_lib_dir}" + cp -R ${RUNFILES}/${so_lib_dir}/${mkl_so_dir} "${TMPDIR}/${so_lib_dir}" fi fi fi - RUNFILES=bazel-bin/tensorflow/tools/pip_package/build_pip_package.runfiles/org_tensorflow fi # protobuf pip package doesn't ship with header files. 
Copy the headers diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py index dd5a1d7449..00dffc4d27 100644 --- a/tensorflow/tools/pip_package/setup.py +++ b/tensorflow/tools/pip_package/setup.py @@ -29,7 +29,7 @@ from setuptools.dist import Distribution # This version string is semver compatible, but incompatible with pip. # For pip, we will remove all '-' characters from this string, and use the # result for pip. -_VERSION = '1.3.0' +_VERSION = '1.4.0-dev' REQUIRED_PACKAGES = [ 'enum34 >= 1.1.6', @@ -192,7 +192,7 @@ setup( version=_VERSION.replace('-', ''), description='TensorFlow helps the tensors flow', long_description='', - url='http://tensorflow.org/', + url='https://www.tensorflow.org/', author='Google Inc.', author_email='opensource@google.com', # Contained modules and scripts. @@ -233,8 +233,8 @@ setup( 'Topic :: Scientific/Engineering :: Mathematics', 'Topic :: Scientific/Engineering :: Artificial Intelligence', 'Topic :: Software Development', - 'Topic :: Software Development :: Libraries', - 'Topic :: Software Development :: Libraries :: Python Modules', + 'Topic :: Software Development :: Libraries', + 'Topic :: Software Development :: Libraries :: Python Modules', ], license='Apache 2.0', keywords='tensorflow tensor machine learning',) diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index 431676c52d..2f24e2f019 100644 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -99,7 +99,8 @@ def _execute_and_check_ret_code(repo_ctx, cmd_and_args): # Apply a patch_file to the repository root directory # Runs 'patch -p1' def _apply_patch(repo_ctx, patch_file): - if not repo_ctx.which("patch"): + # Don't check patch on Windows, because patch is only available under bash. 
+ if not _is_windows(repo_ctx) and not repo_ctx.which("patch"): fail("patch command is not found, please install it") cmd = [ @@ -628,11 +629,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""): temp_workaround_http_archive( name = "nccl_archive", urls = [ - "http://mirror.bazel.build/github.com/nvidia/nccl/archive/ccfc4567dc3e2a37fb42cfbc64d10eb526e7da7b.tar.gz", - "https://github.com/nvidia/nccl/archive/ccfc4567dc3e2a37fb42cfbc64d10eb526e7da7b.tar.gz", + "http://mirror.bazel.build/github.com/nvidia/nccl/archive/29a1a916dc14bb2c00feed3d4820d51fa85be1e6.tar.gz", + "https://github.com/nvidia/nccl/archive/29a1a916dc14bb2c00feed3d4820d51fa85be1e6.tar.gz", ], - sha256 = "6c34a0862d9f8ed4ad5984c6a8206b351957bb14cf6ad7822720f285f4aada04", - strip_prefix = "nccl-ccfc4567dc3e2a37fb42cfbc64d10eb526e7da7b", + sha256 = "6387030e37d14762f87eefbc86ee527293ec04745c66ccd820cf7fc0fdc23f92", + strip_prefix = "nccl-29a1a916dc14bb2c00feed3d4820d51fa85be1e6", build_file = str(Label("//third_party:nccl.BUILD")), repository = tf_repo_name, ) |