From 3e975ea978bac4d861bb09328b06f3c316212611 Mon Sep 17 00:00:00 2001 From: Andrew Harp Date: Wed, 1 Mar 2017 17:59:22 -0800 Subject: Merge changes from github. Change: 148954491 --- README.md | 11 +- RELEASE.md | 2 + tensorflow/cc/training/coordinator.h | 2 - tensorflow/compiler/tf2xla/kernels/binary_ops.cc | 37 ++- tensorflow/compiler/tf2xla/kernels/cwise_ops.h | 6 +- tensorflow/compiler/tf2xla/kernels/relu_op.cc | 43 ++-- .../compiler/xla/client/computation_builder.cc | 5 +- .../compiler/xla/client/computation_builder.h | 5 +- tensorflow/contrib/cmake/external/eigen.cmake | 1 + tensorflow/contrib/cmake/tf_tests.cmake | 1 + tensorflow/contrib/hvx/README.md | 26 +++ .../contrib/learn/python/learn/estimators/dnn.py | 11 +- tensorflow/contrib/rnn/python/ops/lstm_ops.py | 9 +- .../contrib/training/python/training/evaluation.py | 3 +- tensorflow/core/BUILD | 5 + .../core/common_runtime/sycl/sycl_allocator.cc | 7 +- .../core/common_runtime/sycl/sycl_allocator.h | 11 +- tensorflow/core/common_runtime/sycl/sycl_device.cc | 2 +- tensorflow/core/common_runtime/sycl/sycl_device.h | 9 +- .../common_runtime/sycl/sycl_device_context.cc | 256 ++++++++++----------- .../core/common_runtime/sycl/sycl_device_context.h | 6 +- .../common_runtime/sycl/sycl_device_factory.cc | 12 +- .../core/distributed_runtime/master_session.cc | 1 - tensorflow/core/framework/register_types_traits.h | 19 ++ tensorflow/core/kernels/BUILD | 15 ++ tensorflow/core/kernels/cast_op.cc | 56 ++++- tensorflow/core/kernels/cast_op_impl.h | 29 +++ tensorflow/core/kernels/cast_op_impl_bool.cc | 10 + tensorflow/core/kernels/cast_op_impl_double.cc | 10 + tensorflow/core/kernels/cast_op_impl_float.cc | 10 + tensorflow/core/kernels/cast_op_impl_int32.cc | 10 + tensorflow/core/kernels/cast_op_impl_int64.cc | 12 + tensorflow/core/kernels/cast_op_test.cc | 14 ++ tensorflow/core/kernels/concat_lib.h | 8 + tensorflow/core/kernels/concat_lib_cpu.cc | 19 ++ tensorflow/core/kernels/concat_lib_cpu.h | 35 +++ tensorflow/core/kernels/concat_op.cc | 50 ++++ tensorflow/core/kernels/constant_op.cc | 18 +- tensorflow/core/kernels/control_flow_ops.cc | 24 ++ tensorflow/core/kernels/cwise_op_acos.cc | 1 + tensorflow/core/kernels/cwise_op_add_1.cc | 15 +- tensorflow/core/kernels/cwise_op_asin.cc | 1 + tensorflow/core/kernels/cwise_op_atan.cc | 1 + tensorflow/core/kernels/cwise_op_ceil.cc | 1 + tensorflow/core/kernels/cwise_op_cos.cc | 1 + tensorflow/core/kernels/cwise_op_div.cc | 12 +- tensorflow/core/kernels/cwise_op_equal_to_1.cc | 12 + tensorflow/core/kernels/cwise_op_expm1.cc | 3 + tensorflow/core/kernels/cwise_op_floor.cc | 1 + tensorflow/core/kernels/cwise_op_floor_div.cc | 21 +- tensorflow/core/kernels/cwise_op_floor_mod.cc | 10 + tensorflow/core/kernels/cwise_op_greater.cc | 14 ++ tensorflow/core/kernels/cwise_op_greater_equal.cc | 11 + tensorflow/core/kernels/cwise_op_isfinite.cc | 1 + tensorflow/core/kernels/cwise_op_isinf.cc | 1 + tensorflow/core/kernels/cwise_op_isnan.cc | 1 + tensorflow/core/kernels/cwise_op_less.cc | 10 + tensorflow/core/kernels/cwise_op_less_equal.cc | 12 + tensorflow/core/kernels/cwise_op_log.cc | 1 + tensorflow/core/kernels/cwise_op_log1p.cc | 1 + tensorflow/core/kernels/cwise_op_maximum.cc | 15 ++ tensorflow/core/kernels/cwise_op_minimum.cc | 12 + tensorflow/core/kernels/cwise_op_mul_1.cc | 8 + tensorflow/core/kernels/cwise_op_pow.cc | 1 + tensorflow/core/kernels/cwise_op_reciprocal.cc | 6 + tensorflow/core/kernels/cwise_op_round.cc | 4 +- tensorflow/core/kernels/cwise_op_rsqrt.cc | 1 + tensorflow/core/kernels/cwise_op_select.cc | 59 ++++- tensorflow/core/kernels/cwise_op_sigmoid.cc | 6 + tensorflow/core/kernels/cwise_op_sign.cc | 13 ++ tensorflow/core/kernels/cwise_op_sin.cc | 1 + tensorflow/core/kernels/cwise_op_sqrt.cc | 3 +- tensorflow/core/kernels/cwise_op_square.cc | 1 + tensorflow/core/kernels/cwise_op_tan.cc | 1 + tensorflow/core/kernels/cwise_op_tanh.cc | 1 + tensorflow/core/kernels/cwise_ops_gradients.h | 15 ++ tensorflow/core/kernels/cwise_ops_test.cc | 51 ++++ tensorflow/core/kernels/debug_ops.cc | 2 + tensorflow/core/kernels/dense_update_ops.cc | 1 + tensorflow/core/kernels/fill_functor.cc | 2 + tensorflow/core/kernels/function_ops.cc | 28 +++ tensorflow/core/kernels/matmul_op.cc | 60 ++++- tensorflow/core/kernels/mkl_matmul_op.cc | 217 +++++++++++++++++ tensorflow/core/kernels/pack_op.cc | 1 + tensorflow/core/kernels/pad_op.cc | 29 +++ tensorflow/core/kernels/reduction_ops_common.h | 25 ++ tensorflow/core/kernels/reduction_ops_max.cc | 23 ++ tensorflow/core/kernels/reduction_ops_mean.cc | 13 ++ tensorflow/core/kernels/reduction_ops_min.cc | 23 ++ tensorflow/core/kernels/reduction_ops_prod.cc | 24 ++ tensorflow/core/kernels/reduction_ops_sum.cc | 1 - tensorflow/core/kernels/relu_op.cc | 29 +++ tensorflow/core/kernels/relu_op.h | 4 + tensorflow/core/kernels/resize_bicubic_op.cc | 2 +- tensorflow/core/kernels/reverse_op.cc | 35 +++ tensorflow/core/kernels/scatter_op.cc | 4 +- tensorflow/core/kernels/sequence_ops.cc | 9 +- tensorflow/core/kernels/shape_ops.cc | 83 ++++++- tensorflow/core/kernels/softmax_op.cc | 24 +- tensorflow/core/kernels/stage_op.cc | 6 + tensorflow/core/kernels/strided_slice_op.cc | 67 ++++++ tensorflow/core/kernels/strided_slice_op_impl.h | 14 ++ tensorflow/core/kernels/tile_ops.cc | 31 +++ tensorflow/core/kernels/tile_ops_cpu_impl.h | 4 + tensorflow/core/kernels/training_ops.cc | 2 + tensorflow/core/kernels/transpose_functor_cpu.cc | 1 + tensorflow/core/kernels/transpose_op.cc | 29 +++ tensorflow/core/kernels/transpose_op.h | 11 + tensorflow/core/kernels/unpack_op.cc | 1 + tensorflow/core/kernels/variable_ops.cc | 3 +- tensorflow/core/kernels/xent_op.cc | 26 ++- tensorflow/core/ops/math_grad_test.cc | 8 +- tensorflow/core/ops/ops.pbtxt | 53 +++++ tensorflow/core/public/version.h | 2 +- .../docs_src/programmers_guide/meta_graph.md | 2 +- tensorflow/examples/image_retraining/retrain.py | 3 +- tensorflow/examples/learn/mnist.py | 4 +- tensorflow/examples/learn/text_classification.py | 9 +- .../examples/tutorials/word2vec/word2vec_basic.py | 2 +- tensorflow/examples/udacity/README.md | 7 +- tensorflow/python/client/device_lib_test.py | 2 +- tensorflow/python/framework/test_util.py | 21 +- tensorflow/python/kernel_tests/stage_op_test.py | 4 +- tensorflow/python/kernel_tests/variables_test.py | 8 + tensorflow/python/kernel_tests/xent_op_test.py | 35 ++- tensorflow/python/ops/array_ops.py | 6 +- tensorflow/python/ops/clip_ops.py | 6 +- tensorflow/python/ops/nn_grad.py | 26 ++- tensorflow/python/ops/rnn.py | 2 +- tensorflow/python/ops/special_math_ops.py | 54 +++-- tensorflow/python/ops/special_math_ops_test.py | 14 +- tensorflow/python/ops/variables.py | 7 +- tensorflow/python/platform/test.py | 10 +- .../stream_executor/cuda/cuda_diagnostics.cc | 8 +- tensorflow/tools/ci_build/README.md | 4 +- tensorflow/tools/ci_build/builds/pip.sh | 30 +-- tensorflow/tools/ci_build/ci_build.sh | 2 +- .../tools/ci_build/ci_parameterized_build.sh | 19 +- .../tools/ci_build/install/install_buildifier.sh | 5 +- .../tools/docker/parameterized_docker_build.sh | 2 +- tensorflow/tools/graph_transforms/README.md | 4 +- tensorflow/tools/pip_package/setup.py | 2 +- tensorflow/workspace.bzl | 8 +- third_party/curl.BUILD | 46 ++-- .../Eigen/CXX11/src/FixedPoint/PacketMathAVX2.h | 98 ++++++++ third_party/mkl/BUILD | 11 +- third_party/nccl.BUILD | 18 ++ third_party/sycl/crosstool/computecpp.tpl | 33 ++- tools/bazel.rc.template | 2 +- 149 files changed, 2122 insertions(+), 428 deletions(-) create mode 100644 tensorflow/core/kernels/mkl_matmul_op.cc diff --git a/README.md b/README.md index 2dc3dcb3e6..c99e9350dd 100644 --- a/README.md +++ b/README.md @@ -33,11 +33,12 @@ and discussion.** People who are a little more adventurous can also try our nightly binaries: -* Linux CPU-only: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.0.0rc2-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave)) / [Python 3.4](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.0.0rc2-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/)) / [Python 3.5](https://ci.tensorflow.org/view/Nightly/job/nightly-python35-linux-cpu/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.0.0rc2-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-python35-linux-cpu/)) -* Linux GPU: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.0.0rc2-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/)) / [Python 3.4](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.0.0rc2-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/)) / [Python 3.5](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.0.0rc2-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/)) -* Mac CPU-only: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.0.0rc2-py2-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac-slave/)) / [Python 3](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.0.0rc2-py3-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac-slave/)) -* Mac GPU: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-mac/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.0.0rc2-py2-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-mac/)) / [Python 3](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-mac/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.0.0rc2-py3-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-mac/)) -* [Android](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-android/TF_BUILD_CONTAINER_TYPE=ANDROID,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=NO_PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=android-slave/lastSuccessfulBuild/artifact/bazel-out/local_linux/bin/tensorflow/examples/android/tensorflow_demo.apk) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-android/TF_BUILD_CONTAINER_TYPE=ANDROID,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=NO_PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=android-slave/)) +* Linux CPU-only: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.0.0-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave)) / [Python 3.5](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.0.0-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave)) +* Linux GPU: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.0.0-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/)) / [Python 3.5](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.0.0-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/)) +* Mac CPU-only: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.0.0-py2-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac-slave/)) / [Python 3](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.0.0-py3-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac-slave/)) +* Mac GPU: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-mac/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.0.0-py2-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-mac/)) / [Python 3](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-mac/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.0.0-py3-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-mac/)) +* Windows CPU-only: [Python 3.5 64-bit](https://ci.tensorflow.org/view/Nightly/job/nightly-win/DEVICE=cpu,OS=windows/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tensorflow-1.0.0-cp35-cp35m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-win/DEVICE=cpu,OS=windows/)) +* Windows GPU: [Python 3.5 64-bit](https://ci.tensorflow.org/view/Nightly/job/nightly-win/DEVICE=gpu,OS=windows/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tensorflow_gpu-1.0.0-cp35-cp35m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-win/DEVICE=gpu,OS=windows/)) * Android: [demo APK](https://ci.tensorflow.org/view/Nightly/job/nightly-android/lastSuccessfulBuild/artifact/out/tensorflow_demo.apk), [native libs](http://ci.tensorflow.org/view/Nightly/job/nightly-android/lastSuccessfulBuild/artifact/out/native/) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-android/)) diff --git a/RELEASE.md b/RELEASE.md index e9f0361511..f7f5e37bd7 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -89,6 +89,7 @@ To help you upgrade your existing TensorFlow Python code to match the API change * Change arg order for `{softmax,sparse_softmax,sigmoid}_cross_entropy_with_logits` to be (labels, predictions), and force use of named args. ## Bug Fixes and Other Changes +* Numerous C++ API updates. * New op: `parallel_stack`. * Introducing common tf io compression options constants for RecordReader/RecordWriter. @@ -127,6 +128,7 @@ To help you upgrade your existing TensorFlow Python code to match the API change * `tf.divide` now honors the name field. * Make metrics weight broadcasting more strict. * Add new queue-like `StagingArea` and new ops: `stage` and `unstage`. +* Enable inplace update ops for strings on CPU. Speed up string concat. ## Thanks to our Contributors diff --git a/tensorflow/cc/training/coordinator.h b/tensorflow/cc/training/coordinator.h index dbcf072015..1b107e2d06 100644 --- a/tensorflow/cc/training/coordinator.h +++ b/tensorflow/cc/training/coordinator.h @@ -115,8 +115,6 @@ class Coordinator { std::vector> runners_ GUARDED_BY(runners_lock_); - std::atomic num_runners_to_cancel_; - TF_DISALLOW_COPY_AND_ASSIGN(Coordinator); }; diff --git a/tensorflow/compiler/tf2xla/kernels/binary_ops.cc b/tensorflow/compiler/tf2xla/kernels/binary_ops.cc index 39c5567f80..1f9ac029c7 100644 --- a/tensorflow/compiler/tf2xla/kernels/binary_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/binary_ops.cc @@ -127,32 +127,21 @@ XLA_MAKE_BINARY(GreaterEqual, b->Ge(lhs, rhs, extend_dimensions)); XLA_MAKE_BINARY(Less, b->Lt(lhs, rhs, extend_dimensions)); XLA_MAKE_BINARY(LessEqual, b->Le(lhs, rhs, extend_dimensions)); -#undef XLA_MAKE_BINARY +// Non-linear ops +XLA_MAKE_BINARY(SigmoidGrad, + b->Mul(b->Mul(rhs, lhs), + b->Sub(XlaHelpers::One(b, input_type(0)), lhs))); -#define XLA_MAKE_BINARY_MAP(Name, HLO) \ - class Name##Op : public XlaBinaryMapOp { \ - public: \ - explicit Name##Op(OpKernelConstruction* ctx) : XlaBinaryMapOp(ctx) {} \ - void BuildMapLambda(xla::ComputationBuilder* b, \ - const xla::ComputationDataHandle& lhs, \ - const xla::ComputationDataHandle& rhs) override { \ - HLO; \ - } \ - }; \ - REGISTER_XLA_OP(#Name, Name##Op) +XLA_MAKE_BINARY(SoftplusGrad, + b->Div(lhs, b->Add(b->Exp(b->Neg(rhs)), + XlaHelpers::One(b, input_type(1))))); + +XLA_MAKE_BINARY(TanhGrad, b->Mul(rhs, b->Sub(XlaHelpers::One(b, input_type(0)), + b->Mul(lhs, lhs)))); -XLA_MAKE_BINARY_MAP(Pow, b->Pow(lhs, rhs)); -XLA_MAKE_BINARY_MAP(SigmoidGrad, - b->Mul(b->Mul(rhs, lhs), - b->Sub(XlaHelpers::One(b, input_type(0)), lhs))); -XLA_MAKE_BINARY_MAP(SoftplusGrad, - b->Div(lhs, b->Add(b->Exp(b->Neg(rhs)), - XlaHelpers::One(b, input_type(1))))); -XLA_MAKE_BINARY_MAP(TanhGrad, - b->Mul(rhs, b->Sub(XlaHelpers::One(b, input_type(0)), - b->Mul(lhs, lhs)))); - -#undef XLA_MAKE_BINARY_MAP +XLA_MAKE_BINARY(Pow, b->Pow(lhs, rhs, extend_dimensions)); + +#undef XLA_MAKE_BINARY } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/cwise_ops.h b/tensorflow/compiler/tf2xla/kernels/cwise_ops.h index f0687c1d4b..ba38693325 100644 --- a/tensorflow/compiler/tf2xla/kernels/cwise_ops.h +++ b/tensorflow/compiler/tf2xla/kernels/cwise_ops.h @@ -32,9 +32,7 @@ namespace tensorflow { // description of the operation; and Computation adds the // implementation of the operation to a xla::ComputationBuilder. For most // arithmetic Ops XLA handles the broadcasting automatically given the input -// tensors. Ops like ReluGrad that need to map a scalar function over the inputs -// can use the XlaBinaryMapOp subclass below which handles manual -// broadcasting of the inputs. +// tensors. class XlaBinaryOp : public XlaOpKernel { public: explicit XlaBinaryOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) { @@ -83,6 +81,8 @@ class XlaBinaryOp : public XlaOpKernel { // virtual methods to override: description is a textual description // of the mapped function; and BuildMapLambda adds the // implementation of the lambda to a xla::ComputationBuilder. +// Operations may have better performance if implemented as graphs of +// element-wise tensor operations. class XlaBinaryMapOp : public XlaBinaryOp { public: explicit XlaBinaryMapOp(OpKernelConstruction* ctx) : XlaBinaryOp(ctx) {} diff --git a/tensorflow/compiler/tf2xla/kernels/relu_op.cc b/tensorflow/compiler/tf2xla/kernels/relu_op.cc index 2dea4032c0..d1b857c22a 100644 --- a/tensorflow/compiler/tf2xla/kernels/relu_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/relu_op.cc @@ -50,36 +50,37 @@ class Relu6Op : public XlaOpKernel { } }; -// A subclass of a XlaBinaryMapOp must build the lambda computation -// that describes the (scalar,scalar)->scalar function to apply to -// each element of the input. We have to use XlaBinaryMapOp instead of -// XlaBinaryOp here because XLA Select does not do automatic -// broadcasting. -class ReluGradOp : public XlaBinaryMapOp { +class ReluGradOp : public XlaOpKernel { public: - explicit ReluGradOp(OpKernelConstruction* ctx) : XlaBinaryMapOp(ctx) {} + explicit ReluGradOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {} // Return the lhs (incoming gradient) if the rhs (input feature) > 0, // otherwise return 0. - void BuildMapLambda(xla::ComputationBuilder* b, - const xla::ComputationDataHandle& gradient, - const xla::ComputationDataHandle& feature) override { - const auto zero = XlaHelpers::Zero(b, input_type(0)); - b->Select(b->Gt(feature, zero), gradient, zero); + void Compile(XlaOpKernelContext* ctx) { + xla::ComputationBuilder* b = ctx->builder(); + const TensorShape shape = ctx->InputShape(0); + const auto zero = + b->Broadcast(XlaHelpers::Zero(b, input_type(0)), shape.dim_sizes()); + const auto pred = b->Gt(ctx->Input(1), zero); + ctx->SetOutput(0, b->Select(pred, ctx->Input(0), zero)); } }; -class Relu6GradOp : public XlaBinaryMapOp { +class Relu6GradOp : public XlaOpKernel { public: - explicit Relu6GradOp(OpKernelConstruction* ctx) : XlaBinaryMapOp(ctx) {} + explicit Relu6GradOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {} // Return the lhs (incoming gradient) if the rhs (input feature) > 0, // otherwise return 0. - void BuildMapLambda(xla::ComputationBuilder* b, - const xla::ComputationDataHandle& gradient, - const xla::ComputationDataHandle& feature) override { - const auto zero = XlaHelpers::Zero(b, input_type(0)); - auto six = XlaHelpers::IntegerLiteral(b, input_type(0), 6); - b->Select(b->LogicalAnd(b->Lt(feature, six), b->Gt(feature, zero)), - gradient, zero); + void Compile(XlaOpKernelContext* ctx) { + xla::ComputationBuilder* b = ctx->builder(); + const TensorShape shape = ctx->InputShape(0); + const auto zero = + b->Broadcast(XlaHelpers::Zero(b, input_type(0)), shape.dim_sizes()); + const auto six = b->Broadcast( + XlaHelpers::IntegerLiteral(b, input_type(0), 6), shape.dim_sizes()); + auto out = b->Select( + b->LogicalAnd(b->Lt(ctx->Input(1), six), b->Gt(ctx->Input(1), zero)), + ctx->Input(0), zero); + ctx->SetOutput(0, out); } }; diff --git a/tensorflow/compiler/xla/client/computation_builder.cc b/tensorflow/compiler/xla/client/computation_builder.cc index ae7695ade5..2ee6b51c18 100644 --- a/tensorflow/compiler/xla/client/computation_builder.cc +++ b/tensorflow/compiler/xla/client/computation_builder.cc @@ -1010,8 +1010,9 @@ ComputationDataHandle ComputationBuilder::SqrtF32( } ComputationDataHandle ComputationBuilder::Pow( - const ComputationDataHandle& lhs, const ComputationDataHandle& rhs) { - return BinaryOp(BINOP_POW, lhs, rhs, /*broadcast_dimensions=*/{}); + const ComputationDataHandle& lhs, const ComputationDataHandle& rhs, + tensorflow::gtl::ArraySlice broadcast_dimensions) { + return BinaryOp(BINOP_POW, lhs, rhs, broadcast_dimensions); } ComputationDataHandle ComputationBuilder::ConvertElementType( diff --git a/tensorflow/compiler/xla/client/computation_builder.h b/tensorflow/compiler/xla/client/computation_builder.h index a49e5a8843..98649ba02f 100644 --- a/tensorflow/compiler/xla/client/computation_builder.h +++ b/tensorflow/compiler/xla/client/computation_builder.h @@ -504,8 +504,9 @@ class ComputationBuilder { ComputationDataHandle SquareF32(const ComputationDataHandle& operand); // Enqueues a lhs^rhs computation onto the computation. - ComputationDataHandle Pow(const ComputationDataHandle& lhs, - const ComputationDataHandle& rhs); + ComputationDataHandle Pow( + const ComputationDataHandle& lhs, const ComputationDataHandle& rhs, + tensorflow::gtl::ArraySlice broadcast_dimensions = {}); // Enqueues an operator that tests if the operand's values are finite, i.e., // not Inf or NaN. Defined only for floating-point types. Returns an array of diff --git a/tensorflow/contrib/cmake/external/eigen.cmake b/tensorflow/contrib/cmake/external/eigen.cmake index a2f5774b9f..7dd7dd4070 100644 --- a/tensorflow/contrib/cmake/external/eigen.cmake +++ b/tensorflow/contrib/cmake/external/eigen.cmake @@ -36,4 +36,5 @@ ExternalProject_Add(eigen -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF -DCMAKE_INSTALL_PREFIX:STRING=${eigen_INSTALL} -DINCLUDE_INSTALL_DIR:STRING=${CMAKE_CURRENT_BINARY_DIR}/external/eigen_archive + -DBUILD_TESTING:BOOL=OFF ) diff --git a/tensorflow/contrib/cmake/tf_tests.cmake b/tensorflow/contrib/cmake/tf_tests.cmake index 84107706ca..7762328cfe 100644 --- a/tensorflow/contrib/cmake/tf_tests.cmake +++ b/tensorflow/contrib/cmake/tf_tests.cmake @@ -163,6 +163,7 @@ if (tensorflow_BUILD_PYTHON_TESTS) "${tensorflow_source_dir}/tensorflow/python/kernel_tests/py_func_test.py" # training tests "${tensorflow_source_dir}/tensorflow/python/training/basic_session_run_hooks_test.py" # Needs tf.contrib fix. + "${tensorflow_source_dir}/tensorflow/python/training/evaluation_test.py" # Needs tf.contrib fix. "${tensorflow_source_dir}/tensorflow/python/training/localhost_cluster_performance_test.py" # Needs portpicker. "${tensorflow_source_dir}/tensorflow/python/training/monitored_session_test.py" # Needs tf.contrib fix. "${tensorflow_source_dir}/tensorflow/python/training/saver_large_variable_test.py" # Overflow error. diff --git a/tensorflow/contrib/hvx/README.md b/tensorflow/contrib/hvx/README.md index ef9c8b2e8e..6f64cbee3f 100644 --- a/tensorflow/contrib/hvx/README.md +++ b/tensorflow/contrib/hvx/README.md @@ -108,3 +108,29 @@ Finally, you can run the inference tests on your device. adb shell 'LD_LIBRARY_PATH=/data/local/tmp:$LD_LIBRARY_PATH' \ "/data/local/tmp/hexagon_graph_execution" ``` + +#### Troubleshooting +If you're using the Open-Q 820 Snapdragon development kit, you may run into an issue with running the executable due to a missing testsig library. From the Hexagon SDK documentation: *Dynamic shared objects are required to be digitally signed and then authenticated at runtime before they are allowed to be loaded and executed.* Generating a testsig library is necessary to run the unsigned sample library built from this project. + +If the lack of a testsig library is your problem, you will see errors of the type: +`vendor/qcom/proprietary/adsprpc/src/fastrpc_apps_user.c:169::error: -1: 0 == (nErr = remotectl_open(name, (int*)ph, dlerrstr, sizeof(dlerrstr), &dlerr))` +appearing in adb logcat. + +There are several ways to create the testsig library, the only prerequisite is Python and the correct version of the Hexagon-SDK. The following steps is one way to create this library: +1. Run adb as root: `adb root` +2. Run the command `adb shell cat /sys/devices/soc0/serial_number` +3. Convert the decimal number you get as output to hex +4. Run the python script: `python ${QUALCOMM_SDK}/tools/elfsigner/elfsigner.py -t $(SERIAL_NUMBER_HEX_VALUE)` +5. The output of the python script is a shared library stored in ${QUALCOMM_SDK}/tools/elfsigner/output/testsig-$(SERIAL_NUMBER_HEX_VALUE).so +6. Push the shared library to your device: +``` +adb root +adb wait-for-device +adb remount +adb wait-for-device +adb shell mkdir /system/lib/rfsa +adb shell mkdir /system/lib/rfsa/adsp +adb push ${QUALCOMM_SDK}/tools/elfsigner/output/testsig-$(SERIAL_NUMBER_HEX_VALUE).so /system/lib/rfsa/adsp/ +``` + +After rebooting your device, you should be able to run the sample application. diff --git a/tensorflow/contrib/learn/python/learn/estimators/dnn.py b/tensorflow/contrib/learn/python/learn/estimators/dnn.py index bfe1cd0aed..f9ba6711e6 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/dnn.py +++ b/tensorflow/contrib/learn/python/learn/estimators/dnn.py @@ -748,15 +748,16 @@ class _DNNEstimator(estimator.Estimator): To create a _DNNEstimator for binary classification, where estimator = _DNNEstimator( feature_columns=[sparse_feature_a_emb, sparse_feature_b_emb], - head=head=head_lib._multi_class__head(n_classes=2), + head=head_lib._multi_class__head(n_classes=2), hidden_units=[1024, 512, 256]) If your label is keyed with "y" in your labels dict, and weights are keyed with "w" in features dict, and you want to enable centered bias, - head=head_lib._multi_class__head(n_classes=2, - label_name="x" - weight_column_name="w", - enable_centered_bias=True) + head = head_lib._multi_class__head( + n_classes=2, + label_name="x", + weight_column_name="w", + enable_centered_bias=True) estimator = _DNNEstimator( feature_columns=[sparse_feature_a_emb, sparse_feature_b_emb], head=head, diff --git a/tensorflow/contrib/rnn/python/ops/lstm_ops.py b/tensorflow/contrib/rnn/python/ops/lstm_ops.py index 2e6f2ac05c..c1ec46d763 100644 --- a/tensorflow/contrib/rnn/python/ops/lstm_ops.py +++ b/tensorflow/contrib/rnn/python/ops/lstm_ops.py @@ -362,7 +362,7 @@ class LSTMBlockCell(core_rnn_cell.RNNCell): @property def state_size(self): - return (self._num_units,) * 2 + return core_rnn_cell.LSTMStateTuple(self._num_units, self._num_units) @property def output_size(self): @@ -401,7 +401,8 @@ class LSTMBlockCell(core_rnn_cell.RNNCell): forget_bias=self._forget_bias, use_peephole=self._use_peephole) - return (h, (cs, h)) + new_state = core_rnn_cell.LSTMStateTuple(cs, h) + return h, new_state class LSTMBlockWrapper(fused_rnn_cell.FusedRNNCell): @@ -544,7 +545,9 @@ class LSTMBlockWrapper(fused_rnn_cell.FusedRNNCell): # Input was a list, so return a list outputs = array_ops.unstack(outputs) - return outputs, (final_cell_state, final_output) + final_state = core_rnn_cell.LSTMStateTuple(final_cell_state, + final_output) + return outputs, final_state def _gather_states(self, data, indices, batch_size): """Produce `out`, s.t. out(i, j) = data(indices(i), i, j).""" diff --git a/tensorflow/contrib/training/python/training/evaluation.py b/tensorflow/contrib/training/python/training/evaluation.py index c14f4ee440..212f2c7a7b 100644 --- a/tensorflow/contrib/training/python/training/evaluation.py +++ b/tensorflow/contrib/training/python/training/evaluation.py @@ -410,8 +410,7 @@ def evaluate_repeatedly(checkpoint_dir, '%Y-%m-%d-%H:%M:%S', time.gmtime())) num_evaluations += 1 - reached_max = num_evaluations >= max_number_of_evaluations - if max_number_of_evaluations and reached_max: + if max_number_of_evaluations is not None and num_evaluations >= max_number_of_evaluations: return final_ops_hook.final_ops_values logging.info('Timed-out waiting for a checkpoint.') diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index 2f8bacca85..b59b749182 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -117,6 +117,8 @@ load( "//third_party/mkl:build_defs.bzl", "if_mkl", ) +load("@local_config_sycl//sycl:build_defs.bzl", "if_sycl") + # ----------------------------------------------------------------------------- # Public targets @@ -686,6 +688,8 @@ cc_library( "//tensorflow/core/kernels:array_not_windows", "//tensorflow/core/kernels:math_not_windows", "//tensorflow/core/kernels:quantized_ops", + ]) + if_mkl([ + "//tensorflow/core/kernels:mkl_ops", ]), ) @@ -745,6 +749,7 @@ cc_library( ":proto_text", ":protos_all_cc", ":shape_inference_testutil", + ":sycl_runtime", ":tensor_testutil", ":test", "//tensorflow/core/kernels:constant_op", diff --git a/tensorflow/core/common_runtime/sycl/sycl_allocator.cc b/tensorflow/core/common_runtime/sycl/sycl_allocator.cc index 0d238276f4..b7ef9361e9 100644 --- a/tensorflow/core/common_runtime/sycl/sycl_allocator.cc +++ b/tensorflow/core/common_runtime/sycl/sycl_allocator.cc @@ -25,6 +25,9 @@ string SYCLAllocator::Name() { return "device:SYCL"; } void *SYCLAllocator::AllocateRaw(size_t alignment, size_t num_bytes) { assert(device_); + if (num_bytes == 0) { + return device_->allocate(1); + } auto p = device_->allocate(num_bytes); return p; } @@ -42,6 +45,6 @@ void SYCLAllocator::EnterLameDuckMode() { } } -} // namespace tensorflow +} // namespace tensorflow -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL diff --git a/tensorflow/core/common_runtime/sycl/sycl_allocator.h b/tensorflow/core/common_runtime/sycl/sycl_allocator.h index c896f7f603..15d9ab41a4 100644 --- a/tensorflow/core/common_runtime/sycl/sycl_allocator.h +++ b/tensorflow/core/common_runtime/sycl/sycl_allocator.h @@ -27,8 +27,8 @@ limitations under the License. namespace tensorflow { class SYCLAllocator : public Allocator { -public: - SYCLAllocator(Eigen::QueueInterface* device) : device_(device) {} + public: + SYCLAllocator(Eigen::QueueInterface *device) : device_(device) {} virtual ~SYCLAllocator() override; string Name() override; void *AllocateRaw(size_t alignment, size_t num_bytes) override; @@ -36,11 +36,12 @@ public: void EnterLameDuckMode(); virtual bool ShouldAllocateEmptyTensors() override final { return true; } -private: + + private: Eigen::QueueInterface *device_; // not owned TF_DISALLOW_COPY_AND_ASSIGN(SYCLAllocator); }; -} // namespace tensorflow +} // namespace tensorflow -#endif // TENSORFLOW_COMMON_RUNTIME_SYCL_SYCL_ALLOCATOR_H_ +#endif // TENSORFLOW_COMMON_RUNTIME_SYCL_SYCL_ALLOCATOR_H_ diff --git a/tensorflow/core/common_runtime/sycl/sycl_device.cc b/tensorflow/core/common_runtime/sycl/sycl_device.cc index 0abe25c373..2c2185b2c0 100644 --- a/tensorflow/core/common_runtime/sycl/sycl_device.cc +++ b/tensorflow/core/common_runtime/sycl/sycl_device.cc @@ -23,7 +23,7 @@ limitations under the License. namespace tensorflow { -static std::unordered_set live_devices; +static std::unordered_set live_devices; static bool first_time = true; void ShutdownSycl() { diff --git a/tensorflow/core/common_runtime/sycl/sycl_device.h b/tensorflow/core/common_runtime/sycl/sycl_device.h index b5a72d9476..a5c7c5f0ec 100644 --- a/tensorflow/core/common_runtime/sycl/sycl_device.h +++ b/tensorflow/core/common_runtime/sycl/sycl_device.h @@ -34,10 +34,11 @@ class SYCLDevice : public LocalDevice { Bytes memory_limit, const DeviceLocality &locality, const string &physical_device_desc, SYCLSelector sycl_selector, Allocator *cpu_allocator) - : LocalDevice(options, Device::BuildDeviceAttributes( - name, DEVICE_SYCL, memory_limit, locality, - physical_device_desc), - nullptr), + : LocalDevice( + options, + Device::BuildDeviceAttributes(name, DEVICE_SYCL, memory_limit, + locality, physical_device_desc), + nullptr), cpu_allocator_(cpu_allocator), sycl_queue_(new Eigen::QueueInterface(sycl_selector)), sycl_device_(new Eigen::SyclDevice(sycl_queue_)), diff --git a/tensorflow/core/common_runtime/sycl/sycl_device_context.cc b/tensorflow/core/common_runtime/sycl/sycl_device_context.cc index a6be9195d4..1c868f5606 100644 --- a/tensorflow/core/common_runtime/sycl/sycl_device_context.cc +++ b/tensorflow/core/common_runtime/sycl/sycl_device_context.cc @@ -17,8 +17,8 @@ limitations under the License. #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" -#include "tensorflow/core/common_runtime/sycl/sycl_device_context.h" #include "tensorflow/core/common_runtime/dma_helper.h" +#include "tensorflow/core/common_runtime/sycl/sycl_device_context.h" namespace tensorflow { @@ -31,68 +31,68 @@ void SYCLDeviceContext::CopyCPUTensorToDevice(const Tensor *cpu_tensor, const void *src_ptr = DMAHelper::base(cpu_tensor); void *dst_ptr = DMAHelper::base(device_tensor); switch (cpu_tensor->dtype()) { - case DT_FLOAT: - device->eigen_sycl_device()->memcpyHostToDevice( - static_cast(dst_ptr), static_cast(src_ptr), - total_bytes); - break; - case DT_DOUBLE: - device->eigen_sycl_device()->memcpyHostToDevice( - static_cast(dst_ptr), static_cast(src_ptr), - total_bytes); - break; - case DT_INT32: - device->eigen_sycl_device()->memcpyHostToDevice( - static_cast(dst_ptr), static_cast(src_ptr), - total_bytes); - break; - case DT_INT64: - device->eigen_sycl_device()->memcpyHostToDevice( - static_cast(dst_ptr), static_cast(src_ptr), - total_bytes); - break; - case DT_HALF: - device->eigen_sycl_device()->memcpyHostToDevice( - static_cast(dst_ptr), - static_cast(src_ptr), total_bytes); - break; - case DT_COMPLEX64: - device->eigen_sycl_device()->memcpyHostToDevice( - static_cast *>(dst_ptr), - static_cast *>(src_ptr), total_bytes); - break; - case DT_COMPLEX128: - device->eigen_sycl_device()->memcpyHostToDevice( - static_cast *>(dst_ptr), - static_cast *>(src_ptr), total_bytes); - break; - case DT_INT8: - device->eigen_sycl_device()->memcpyHostToDevice( - static_cast(dst_ptr), static_cast(src_ptr), - total_bytes); - break; - case DT_INT16: - device->eigen_sycl_device()->memcpyHostToDevice( - static_cast(dst_ptr), static_cast(src_ptr), - total_bytes); - break; - case DT_UINT8: - device->eigen_sycl_device()->memcpyHostToDevice( - static_cast(dst_ptr), static_cast(src_ptr), - total_bytes); - break; - case DT_UINT16: - device->eigen_sycl_device()->memcpyHostToDevice( - static_cast(dst_ptr), static_cast(src_ptr), - total_bytes); - break; - case DT_BOOL: - device->eigen_sycl_device()->memcpyHostToDevice( - static_cast(dst_ptr), static_cast(src_ptr), - total_bytes); - break; - default: - assert(false && "unsupported type"); + case DT_FLOAT: + device->eigen_sycl_device()->memcpyHostToDevice( + static_cast(dst_ptr), static_cast(src_ptr), + total_bytes); + break; + case DT_DOUBLE: + device->eigen_sycl_device()->memcpyHostToDevice( + static_cast(dst_ptr), + static_cast(src_ptr), total_bytes); + break; + case DT_INT32: + device->eigen_sycl_device()->memcpyHostToDevice( + static_cast(dst_ptr), static_cast(src_ptr), + total_bytes); + break; + case DT_INT64: + device->eigen_sycl_device()->memcpyHostToDevice( + static_cast(dst_ptr), static_cast(src_ptr), + total_bytes); + break; + case DT_HALF: + device->eigen_sycl_device()->memcpyHostToDevice( + static_cast(dst_ptr), + static_cast(src_ptr), total_bytes); + break; + case DT_COMPLEX64: + device->eigen_sycl_device()->memcpyHostToDevice( + static_cast *>(dst_ptr), + static_cast *>(src_ptr), total_bytes); + break; + case DT_COMPLEX128: + device->eigen_sycl_device()->memcpyHostToDevice( + static_cast *>(dst_ptr), + static_cast *>(src_ptr), total_bytes); + break; + case DT_INT8: + device->eigen_sycl_device()->memcpyHostToDevice( + static_cast(dst_ptr), static_cast(src_ptr), + total_bytes); + break; + case DT_INT16: + device->eigen_sycl_device()->memcpyHostToDevice( + static_cast(dst_ptr), static_cast(src_ptr), + total_bytes); + break; + case DT_UINT8: + device->eigen_sycl_device()->memcpyHostToDevice( + static_cast(dst_ptr), static_cast(src_ptr), + total_bytes); + break; + case DT_UINT16: + device->eigen_sycl_device()->memcpyHostToDevice( + static_cast(dst_ptr), + static_cast(src_ptr), total_bytes); + break; + case DT_BOOL: + device->eigen_sycl_device()->memcpyHostToDevice( + static_cast(dst_ptr), static_cast(src_ptr), + total_bytes); + break; + default: + assert(false && "unsupported type"); } } device->eigen_sycl_device()->synchronize(); @@ -106,71 +106,71 @@ void SYCLDeviceContext::CopyDeviceTensorToCPU(const Tensor *device_tensor, StatusCallback done) { const int64 total_bytes = device_tensor->TotalBytes(); if (total_bytes > 0) { - const void* src_ptr = DMAHelper::base(device_tensor); - void* dst_ptr = DMAHelper::base(cpu_tensor); + const void *src_ptr = DMAHelper::base(device_tensor); + void *dst_ptr = DMAHelper::base(cpu_tensor); switch (device_tensor->dtype()) { - case DT_FLOAT: - device->eigen_sycl_device()->memcpyDeviceToHost( - static_cast(dst_ptr), static_cast(src_ptr), - total_bytes); - break; - case DT_DOUBLE: - device->eigen_sycl_device()->memcpyDeviceToHost( - static_cast(dst_ptr), static_cast(src_ptr), - total_bytes); - break; - case DT_INT32: - device->eigen_sycl_device()->memcpyDeviceToHost( - static_cast(dst_ptr), static_cast(src_ptr), - total_bytes); - break; - case DT_INT64: - device->eigen_sycl_device()->memcpyDeviceToHost( - static_cast(dst_ptr), static_cast(src_ptr), - total_bytes); - break; - case DT_HALF: - device->eigen_sycl_device()->memcpyDeviceToHost( - static_cast(dst_ptr), - static_cast(src_ptr), total_bytes); - break; - case DT_COMPLEX64: - device->eigen_sycl_device()->memcpyDeviceToHost( - static_cast *>(dst_ptr), - static_cast *>(src_ptr), total_bytes); - break; - case DT_COMPLEX128: - device->eigen_sycl_device()->memcpyDeviceToHost( - static_cast *>(dst_ptr), - static_cast *>(src_ptr), total_bytes); - break; - case DT_INT8: - device->eigen_sycl_device()->memcpyDeviceToHost( - static_cast(dst_ptr), static_cast(src_ptr), - total_bytes); - break; - case DT_INT16: - device->eigen_sycl_device()->memcpyDeviceToHost( - static_cast(dst_ptr), static_cast(src_ptr), - total_bytes); - break; - case DT_UINT8: - device->eigen_sycl_device()->memcpyDeviceToHost( - static_cast(dst_ptr), static_cast(src_ptr), - total_bytes); - break; - case DT_UINT16: - device->eigen_sycl_device()->memcpyDeviceToHost( - static_cast(dst_ptr), static_cast(src_ptr), - total_bytes); - break; - case DT_BOOL: - device->eigen_sycl_device()->memcpyDeviceToHost( - static_cast(dst_ptr), static_cast(src_ptr), - total_bytes); - break; - default: - assert(false && "unsupported type"); + case DT_FLOAT: + device->eigen_sycl_device()->memcpyDeviceToHost( + static_cast(dst_ptr), static_cast(src_ptr), + total_bytes); + break; + case DT_DOUBLE: + device->eigen_sycl_device()->memcpyDeviceToHost( + static_cast(dst_ptr), + static_cast(src_ptr), total_bytes); + break; + case DT_INT32: + device->eigen_sycl_device()->memcpyDeviceToHost( + static_cast(dst_ptr), static_cast(src_ptr), + total_bytes); + break; + case DT_INT64: + device->eigen_sycl_device()->memcpyDeviceToHost( + static_cast(dst_ptr), static_cast(src_ptr), + total_bytes); + break; + case DT_HALF: + device->eigen_sycl_device()->memcpyDeviceToHost( + static_cast(dst_ptr), + static_cast(src_ptr), total_bytes); + break; + case DT_COMPLEX64: + device->eigen_sycl_device()->memcpyDeviceToHost( + static_cast *>(dst_ptr), + static_cast *>(src_ptr), total_bytes); + break; + case DT_COMPLEX128: + device->eigen_sycl_device()->memcpyDeviceToHost( + static_cast *>(dst_ptr), + static_cast *>(src_ptr), total_bytes); + break; + case DT_INT8: + device->eigen_sycl_device()->memcpyDeviceToHost( + static_cast(dst_ptr), static_cast(src_ptr), + total_bytes); + break; + case DT_INT16: + device->eigen_sycl_device()->memcpyDeviceToHost( + static_cast(dst_ptr), static_cast(src_ptr), + total_bytes); + break; + case DT_UINT8: + device->eigen_sycl_device()->memcpyDeviceToHost( + static_cast(dst_ptr), static_cast(src_ptr), + total_bytes); + break; + case DT_UINT16: + device->eigen_sycl_device()->memcpyDeviceToHost( + static_cast(dst_ptr), + static_cast(src_ptr), total_bytes); + break; + case DT_BOOL: + device->eigen_sycl_device()->memcpyDeviceToHost( + static_cast(dst_ptr), static_cast(src_ptr), + total_bytes); + break; + default: + assert(false && "unsupported type"); } } device->eigen_sycl_device()->synchronize(); @@ -178,4 +178,4 @@ void SYCLDeviceContext::CopyDeviceTensorToCPU(const Tensor *device_tensor, } } // namespace tensorflow -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL diff --git a/tensorflow/core/common_runtime/sycl/sycl_device_context.h b/tensorflow/core/common_runtime/sycl/sycl_device_context.h index 1f7ad543d9..0f8f17b805 100644 --- a/tensorflow/core/common_runtime/sycl/sycl_device_context.h +++ b/tensorflow/core/common_runtime/sycl/sycl_device_context.h @@ -26,7 +26,7 @@ limitations under the License. namespace tensorflow { class SYCLDeviceContext : public DeviceContext { -public: + public: SYCLDeviceContext() {} ~SYCLDeviceContext() override {} @@ -40,6 +40,6 @@ public: StatusCallback done) override; }; -} // namespace tensorflow +} // namespace tensorflow -#endif // TENSORFLOW_COMMON_RUNTIME_SYCL_SYCL_DEVICE_CONTEXT_H_ +#endif // TENSORFLOW_COMMON_RUNTIME_SYCL_SYCL_DEVICE_CONTEXT_H_ diff --git a/tensorflow/core/common_runtime/sycl/sycl_device_factory.cc b/tensorflow/core/common_runtime/sycl/sycl_device_factory.cc index 51eb4973d8..a643fc7258 100644 --- a/tensorflow/core/common_runtime/sycl/sycl_device_factory.cc +++ b/tensorflow/core/common_runtime/sycl/sycl_device_factory.cc @@ -21,7 +21,7 @@ limitations under the License. namespace tensorflow { class SYCLDeviceFactory : public DeviceFactory { -public: + public: Status CreateDevices(const SessionOptions &options, const string &name_prefix, std::vector *devices) override { int n = 1; @@ -31,10 +31,10 @@ public: } for (int i = 0; i < n; i++) { string name = strings::StrCat(name_prefix, "/device:SYCL:", i); - devices->push_back(new SYCLDevice(options, name, Bytes(256 << 20), - DeviceLocality(), - SYCLDevice::GetShortDeviceDescription(), - cl::sycl::gpu_selector(), cpu_allocator())); + devices->push_back( + new SYCLDevice(options, name, Bytes(256 << 20), DeviceLocality(), + SYCLDevice::GetShortDeviceDescription(), + cl::sycl::gpu_selector(), cpu_allocator())); } return Status::OK(); } @@ -43,4 +43,4 @@ public: REGISTER_LOCAL_DEVICE_FACTORY("SYCL", SYCLDeviceFactory, 200); } -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL diff --git a/tensorflow/core/distributed_runtime/master_session.cc b/tensorflow/core/distributed_runtime/master_session.cc index 9bb92ad7ef..41ea0267b2 100644 --- a/tensorflow/core/distributed_runtime/master_session.cc +++ b/tensorflow/core/distributed_runtime/master_session.cc @@ -383,7 +383,6 @@ Status MasterSession::ReffedClientGraph::DoBuildPartitions( // Partition the graph. Status s; - std::unordered_map graph_partitions; return Partition(popts, &client_graph_->graph, out_partitions); } diff --git a/tensorflow/core/framework/register_types_traits.h b/tensorflow/core/framework/register_types_traits.h index 8f8d9fd08e..c1fe5517c6 100644 --- a/tensorflow/core/framework/register_types_traits.h +++ b/tensorflow/core/framework/register_types_traits.h @@ -21,6 +21,10 @@ limitations under the License. typedef Eigen::ThreadPoolDevice CPUDevice; typedef Eigen::GpuDevice GPUDevice; +#ifdef TENSORFLOW_USE_SYCL +typedef Eigen::SyclDevice SYCLDevice; +#endif // TENSORFLOW_USE_SYCL + #include "tensorflow/core/framework/numeric_types.h" #include "tensorflow/core/platform/types.h" @@ -66,6 +70,17 @@ struct proxy_type_pod { typedef Eigen::half type; }; +#ifdef TENSORFLOW_USE_SYCL +template <> +struct proxy_type_pod { + typedef double type; +}; +template <> +struct proxy_type_pod { + typedef float type; +}; +#endif // TENSORFLOW_USE_SYCL + /// If POD we use proxy_type_pod, otherwise this maps to identiy. template struct proxy_type { @@ -81,6 +96,10 @@ struct proxy_type { TF_CALL_int8(m) TF_CALL_complex128(m) #define TF_CALL_GPU_PROXY_TYPES(m) \ TF_CALL_double(m) TF_CALL_float(m) TF_CALL_half(m) TF_CALL_int32(m) +#ifdef TENSORFLOW_USE_SYCL +#define TF_CALL_SYCL_PROXY_TYPES(m) \ + TF_CALL_double(m) TF_CALL_float(m) TF_CALL_int32(m) +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow #endif // TENSORFLOW_FRAMEWORK_REGISTER_TYPES_TRAITS_H_ diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index 2503b441f7..fb7d63767d 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -39,6 +39,10 @@ load( "tf_proto_library", "tf_kernel_tests_linkstatic", ) +load( + "//third_party/mkl:build_defs.bzl", + "if_mkl", +) config_setting( # Add "--define tensorflow_xsmm=1" to your build command to use libxsmm for @@ -4294,6 +4298,17 @@ tf_cc_test( ], ) +if_mkl( + tf_kernel_library( + name = "mkl_ops", + prefix = "mkl_matmul", + deps = [ + ":math", + "//third_party/mkl:intel_binary_blob", + ], + ), +) + # ----------------------------------------------------------------------------- # Google-internal targets. These must be at the end for syncrepo. diff --git a/tensorflow/core/kernels/cast_op.cc b/tensorflow/core/kernels/cast_op.cc index ab82c247d6..562934ed63 100644 --- a/tensorflow/core/kernels/cast_op.cc +++ b/tensorflow/core/kernels/cast_op.cc @@ -34,6 +34,9 @@ namespace tensorflow { typedef Eigen::ThreadPoolDevice CPUDevice; typedef Eigen::GpuDevice GPUDevice; +#ifdef TENSORFLOW_USE_SYCL +typedef Eigen::SyclDevice SYCLDevice; +#endif // TENSORFLOW_USE_SYCL #define CURRY_TYPES2(FN, arg0) \ FN(arg0, bool); \ @@ -206,6 +209,52 @@ REGISTER_CAST_GPU(bfloat16, float); #undef REGISTER_CAST_GPU #endif // GOOGLE_CUDA +#ifdef TENSORFLOW_USE_SYCL +class SyclCastOp : public CastOpBase { + public: + explicit SyclCastOp(OpKernelConstruction* ctx) : CastOpBase(ctx) { + OP_REQUIRES_OK(ctx, Prepare()); + } + + private: + Status Prepare() { + if (src_dtype_ == dst_dtype_) { + work_ = nullptr; // Identity + return Status::OK(); + } + if (src_dtype_ == DT_BOOL) { + work_ = GetSyclCastFromBool(dst_dtype_); + } else if (src_dtype_ == DT_INT32) { + work_ = GetSyclCastFromInt32(dst_dtype_); + } else if (src_dtype_ == DT_INT64) { + work_ = GetSyclCastFromInt64(dst_dtype_); + } else if (src_dtype_ == DT_FLOAT) { + work_ = GetSyclCastFromFloat(dst_dtype_); + } else if (src_dtype_ == DT_DOUBLE) { + work_ = GetSyclCastFromDouble(dst_dtype_); + } + + return work_ == nullptr ? Unimplemented() : Status::OK(); + } +}; + +#define REGISTER_CAST_SYCL(srctype, dsttype) \ + REGISTER_KERNEL_BUILDER(Name("Cast") \ + .TypeConstraint("SrcT") \ + .TypeConstraint("DstT") \ + .Device(DEVICE_SYCL), \ + SyclCastOp) + +CURRY_TYPES2(REGISTER_CAST_SYCL, bool); +CURRY_TYPES2(REGISTER_CAST_SYCL, int32); +CURRY_TYPES2(REGISTER_CAST_SYCL, int64); +CURRY_TYPES2(REGISTER_CAST_SYCL, float); +CURRY_TYPES2(REGISTER_CAST_SYCL, double); + +#undef REGISTER_CAST_SYCL + +#endif // TENSORFLOW_USE_SYCL + #undef CURRY_TYPES2 // HostCast differs from Cast in that its input and output are in host memory. @@ -213,5 +262,10 @@ REGISTER_KERNEL_BUILDER(Name("_HostCast").Device(DEVICE_CPU), CpuCastOp); REGISTER_KERNEL_BUILDER( Name("_HostCast").Device(DEVICE_GPU).HostMemory("x").HostMemory("y"), CpuCastOp); - +#ifdef TENSORFLOW_USE_SYCL +REGISTER_KERNEL_BUILDER( + Name("_HostCast").Device(DEVICE_SYCL).HostMemory("x").HostMemory("y"), + CpuCastOp); +#endif // TENSORFLOW_USE_SYCL } // end namespace tensorflow + diff --git a/tensorflow/core/kernels/cast_op_impl.h b/tensorflow/core/kernels/cast_op_impl.h index cb7cc81937..1ee0796ac1 100644 --- a/tensorflow/core/kernels/cast_op_impl.h +++ b/tensorflow/core/kernels/cast_op_impl.h @@ -33,6 +33,16 @@ struct CastFunctor { } }; +#ifdef TENSORFLOW_USE_SYCL +template +struct CastFunctor { + void operator()(const Eigen::SyclDevice& d, typename TTypes::Flat o, + typename TTypes::ConstFlat i) { + o.device(d) = i.template cast(); + } +}; +#endif // TENSORFLOW_USE_SYCL + } // namespace functor #define CURRY_TYPES3(FN, arg0, arg1) \ @@ -140,6 +150,25 @@ GetGpuCastFromBfloat(DataType dst_dtype); #endif // GOOGLE_CUDA +#ifdef TENSORFLOW_USE_SYCL +std::function +GetSyclCastFromBool(DataType dst_dtype); + +std::function +GetSyclCastFromInt32(DataType dst_dtype); + +std::function +GetSyclCastFromInt64(DataType dst_dtype); + +std::function +GetSyclCastFromFloat(DataType dst_dtype); + +std::function +GetSyclCastFromDouble(DataType dst_dtype); + +#endif // TENSORFLOW_USE_SYCL + } // namespace tensorflow #endif // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_CAST_OP_IMPL_H_ + diff --git a/tensorflow/core/kernels/cast_op_impl_bool.cc b/tensorflow/core/kernels/cast_op_impl_bool.cc index 92fee89a47..a13f163009 100644 --- a/tensorflow/core/kernels/cast_op_impl_bool.cc +++ b/tensorflow/core/kernels/cast_op_impl_bool.cc @@ -34,4 +34,14 @@ GetGpuCastFromBool(DataType dst_dtype) { } #endif // GOOGLE_CUDA +#ifdef TENSORFLOW_USE_SYCL +typedef Eigen::SyclDevice SYCLDevice; +std::function +GetSyclCastFromBool(DataType dst_dtype) { + CURRY_TYPES3(CAST_CASE, SYCLDevice, bool); + return nullptr; +} +#endif // TENSORFLOW_USE_SYCL + } // namespace tensorflow + diff --git a/tensorflow/core/kernels/cast_op_impl_double.cc b/tensorflow/core/kernels/cast_op_impl_double.cc index fd20061d21..fdc8d51158 100644 --- a/tensorflow/core/kernels/cast_op_impl_double.cc +++ b/tensorflow/core/kernels/cast_op_impl_double.cc @@ -34,4 +34,14 @@ GetGpuCastFromDouble(DataType dst_dtype) { } #endif // GOOGLE_CUDA +#ifdef TENSORFLOW_USE_SYCL +typedef Eigen::SyclDevice SYCLDevice; +std::function +GetSyclCastFromDouble(DataType dst_dtype) { + CURRY_TYPES3(CAST_CASE, SYCLDevice, double); + return nullptr; +} +#endif // TENSORFLOW_USE_SYC + } // namespace tensorflow + diff --git a/tensorflow/core/kernels/cast_op_impl_float.cc b/tensorflow/core/kernels/cast_op_impl_float.cc index 71e63fbff0..1241dcd8f2 100644 --- a/tensorflow/core/kernels/cast_op_impl_float.cc +++ b/tensorflow/core/kernels/cast_op_impl_float.cc @@ -49,4 +49,14 @@ GetGpuCastFromFloat(DataType dst_dtype) { } #endif // GOOGLE_CUDA +#ifdef TENSORFLOW_USE_SYCL +typedef Eigen::SyclDevice SYCLDevice; +std::function +GetSyclCastFromFloat(DataType dst_dtype) { + CURRY_TYPES3(CAST_CASE, SYCLDevice, float); + return nullptr; +} +#endif // TENSORFLOW_USE_SYCL + } // namespace tensorflow + diff --git a/tensorflow/core/kernels/cast_op_impl_int32.cc b/tensorflow/core/kernels/cast_op_impl_int32.cc index 0fc6e16afe..fca9cd60ec 100644 --- a/tensorflow/core/kernels/cast_op_impl_int32.cc +++ b/tensorflow/core/kernels/cast_op_impl_int32.cc @@ -34,4 +34,14 @@ GetGpuCastFromInt32(DataType dst_dtype) { } #endif // GOOGLE_CUDA +#ifdef TENSORFLOW_USE_SYCL +typedef Eigen::SyclDevice SYCLDevice; +std::function +GetSyclCastFromInt32(DataType dst_dtype) { + CURRY_TYPES3(CAST_CASE, CPUDevice, int32); + return nullptr; +} +#endif // TENSORFLOW_USE_SYCL + } // namespace tensorflow + diff --git a/tensorflow/core/kernels/cast_op_impl_int64.cc b/tensorflow/core/kernels/cast_op_impl_int64.cc index b5571b19a5..c0a543708d 100644 --- a/tensorflow/core/kernels/cast_op_impl_int64.cc +++ b/tensorflow/core/kernels/cast_op_impl_int64.cc @@ -19,6 +19,9 @@ namespace tensorflow { typedef Eigen::ThreadPoolDevice CPUDevice; typedef Eigen::GpuDevice GPUDevice; +#ifdef TENSORFLOW_USE_SYCL +typedef Eigen::SyclDevice SYCLDevice; +#endif // TENSORFLOW_USE_SYCL std::function GetCpuCastFromInt64(DataType dst_dtype) { @@ -34,4 +37,13 @@ GetGpuCastFromInt64(DataType dst_dtype) { } #endif // GOOGLE_CUDA +#ifdef TENSORFLOW_USE_SYCL +typedef Eigen::SyclDevice SYCLDevice; +std::function +GetSyclCastFromInt64(DataType dst_dtype) { + CURRY_TYPES3(CAST_CASE, SYCLDevice, int64); + return nullptr; +} +#endif // TENSORFLOW_USE_SYCL + } // namespace tensorflow diff --git a/tensorflow/core/kernels/cast_op_test.cc b/tensorflow/core/kernels/cast_op_test.cc index 5b7529bb8a..a106f287c1 100644 --- a/tensorflow/core/kernels/cast_op_test.cc +++ b/tensorflow/core/kernels/cast_op_test.cc @@ -105,7 +105,12 @@ static void BM_gpu_float_int64(int iters, int num) { testing::BytesProcessed(static_cast(iters) * num * (sizeof(float) + sizeof(int64))); testing::UseRealTime(); +#if GOOGLE_CUDA test::Benchmark("gpu", Cast(num)).Run(iters); +#endif // GOOGLE_CUDA +#ifdef TENSORFLOW_USE_SYCL + test::Benchmark("sycl", Cast(num)).Run(iters); +#endif // TENSORFLOW_USE_SYCL } BENCHMARK(BM_gpu_float_int64)->Arg(64 << 10)->Arg(32 << 20); @@ -123,7 +128,12 @@ static void BM_gpu_bool_float(int iters, int num) { testing::BytesProcessed(static_cast(iters) * num * (sizeof(bool) + sizeof(float))); testing::UseRealTime(); +#if GOOGLE_CUDA test::Benchmark("gpu", Cast(num)).Run(iters); +#endif // GOOGLE_CUDA +#ifdef TENSORFLOW_USE_SYCL + test::Benchmark("sycl", Cast(num)).Run(iters); +#endif // TENSORFLOW_USE_SYCL } BENCHMARK(BM_gpu_bool_float)->Arg(64 << 10)->Arg(32 << 20); @@ -168,7 +178,9 @@ static void BM_gpu_float_half(int iters, int num) { testing::BytesProcessed(static_cast(iters) * num * (sizeof(float) + sizeof(Eigen::half))); testing::UseRealTime(); +#if GOOGLE_CUDA test::Benchmark("gpu", Cast(num)).Run(iters); +#endif // GOOGLE_CUDA } BENCHMARK(BM_gpu_float_half)->Arg(64 << 10)->Arg(32 << 20); @@ -177,7 +189,9 @@ static void BM_gpu_half_float(int iters, int num) { testing::BytesProcessed(static_cast(iters) * num * (sizeof(float) + sizeof(Eigen::half))); testing::UseRealTime(); +#if GOOGLE_CUDA test::Benchmark("gpu", Cast(num)).Run(iters); +#endif // GOOGLE_CUDA } BENCHMARK(BM_gpu_half_float)->Arg(64 << 10)->Arg(32 << 20); diff --git a/tensorflow/core/kernels/concat_lib.h b/tensorflow/core/kernels/concat_lib.h index cef873f804..14e6e1bc32 100644 --- a/tensorflow/core/kernels/concat_lib.h +++ b/tensorflow/core/kernels/concat_lib.h @@ -38,6 +38,14 @@ void ConcatGPU( Tensor* output, typename TTypes::Tensor* output_flat); #endif // GOOGLE_CUDA + +#ifdef TENSORFLOW_USE_SYCL +template +void ConcatSYCL(const Eigen::SyclDevice& d, + const std::vector< + std::unique_ptr::ConstMatrix>>& inputs, + typename TTypes::Matrix* output); +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow #endif // TENSORFLOW_KERNELS_CONCAT_LIB_H_ diff --git a/tensorflow/core/kernels/concat_lib_cpu.cc b/tensorflow/core/kernels/concat_lib_cpu.cc index f83aed6aef..f89948350c 100644 --- a/tensorflow/core/kernels/concat_lib_cpu.cc +++ b/tensorflow/core/kernels/concat_lib_cpu.cc @@ -74,4 +74,23 @@ REGISTER(qint16) REGISTER(qint32) REGISTER(bfloat16) +#ifdef TENSORFLOW_USE_SYCL +template +void ConcatSYCL(const Eigen::SyclDevice& d, + const std::vector< + std::unique_ptr::ConstMatrix>>& inputs, + typename TTypes::Matrix* output) { + ConcatSYCLImpl(d, inputs, sizeof(T) /* cost_per_unit */, MemCpyCopier(), + output); +} +#define REGISTER_SYCL(T) \ + template void ConcatSYCL( \ + const Eigen::SyclDevice&, \ + const std::vector::ConstMatrix>>&, \ + typename TTypes::Matrix* output); + +TF_CALL_GPU_NUMBER_TYPES(REGISTER_SYCL) + +#undef REGISTER_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/concat_lib_cpu.h b/tensorflow/core/kernels/concat_lib_cpu.h index 9d37cafb4e..6a933efde4 100644 --- a/tensorflow/core/kernels/concat_lib_cpu.h +++ b/tensorflow/core/kernels/concat_lib_cpu.h @@ -126,4 +126,39 @@ void ConcatCPUImpl( cost_per_unit, work); } +#ifdef TENSORFLOW_USE_SYCL +template +void ConcatSYCLImpl( + const Eigen::SyclDevice& d, + const std::vector::ConstMatrix>>& + inputs, + int64 cost_per_unit, ElementCopier copier, + typename TTypes::Matrix* output) { + size_t num_inputs = inputs.size(); + + std::vector sizes; + sizes.reserve(num_inputs); + int64 row_size = 0; + for (const auto& input : inputs) { + sizes.push_back(input->dimension(1)); + row_size += sizes.back(); + } + + T* out = &(*output)(0, 0); + std::vector inp; + inp.reserve(num_inputs); + for (const auto& input : inputs) { + inp.push_back(&(*input)(0, 0)); + } + const int64 dim0 = output->dimension(0); + for (int64 i = 0; i < dim0; ++i) { + for (int64 j = 0; j < num_inputs; ++j) { + auto size = sizes[j]; + d.memcpy(out, inp[j], size * sizeof(T)); + out += size; + inp[j] += size; + } + } +} +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/concat_op.cc b/tensorflow/core/kernels/concat_op.cc index e6dae5fa7e..9628a7efa4 100644 --- a/tensorflow/core/kernels/concat_op.cc +++ b/tensorflow/core/kernels/concat_op.cc @@ -35,6 +35,9 @@ typedef Eigen::ThreadPoolDevice CPUDevice; #if GOOGLE_CUDA typedef Eigen::GpuDevice GPUDevice; #endif // GOOGLE_CUDA +#ifdef TENSORFLOW_USE_SYCL +typedef Eigen::SyclDevice SYCLDevice; +#endif // TENSORFLOW_USE_SYCL enum AxisArgumentName { NAME_IS_AXIS, NAME_IS_CONCAT_DIM }; @@ -134,6 +137,12 @@ class ConcatBaseOp : public OpKernel { return; } #endif // GOOGLE_CUDA +#ifdef TENSORFLOW_USE_SYCL + if (std::is_same::value) { + ConcatSYCL(c->eigen_sycl_device(), inputs_flat, &output_flat); + return; + } +#endif // TENSORFLOW_USE_SYCL ConcatCPU(c->device(), inputs_flat, &output_flat); } } @@ -207,6 +216,39 @@ REGISTER_KERNEL_BUILDER(Name("ConcatV2") #endif // GOOGLE_CUDA +#ifdef TENSORFLOW_USE_SYCL +#define REGISTER_SYCL(type) \ + REGISTER_KERNEL_BUILDER(Name("Concat") \ + .Device(DEVICE_SYCL) \ + .TypeConstraint("T") \ + .HostMemory("concat_dim"), \ + ConcatOp) \ + REGISTER_KERNEL_BUILDER(Name("ConcatV2") \ + .Device(DEVICE_SYCL) \ + .TypeConstraint("T") \ + .TypeConstraint("Tidx") \ + .HostMemory("axis"), \ + ConcatV2Op) + +TF_CALL_GPU_NUMBER_TYPES(REGISTER_SYCL); +REGISTER_KERNEL_BUILDER(Name("Concat") + .Device(DEVICE_SYCL) + .TypeConstraint("T") + .HostMemory("concat_dim") + .HostMemory("values") + .HostMemory("output"), + ConcatOp); +REGISTER_KERNEL_BUILDER(Name("ConcatV2") + .Device(DEVICE_SYCL) + .TypeConstraint("T") + .TypeConstraint("Tidx") + .HostMemory("values") + .HostMemory("axis") + .HostMemory("output"), + ConcatV2Op); +#undef REGISTER_SYCL +#endif // TENSORFLOW_USE_SYCL + class ConcatOffsetOp : public OpKernel { public: explicit ConcatOffsetOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} @@ -293,4 +335,12 @@ REGISTER_KERNEL_BUILDER(Name("ConcatOffset") .HostMemory("offset"), ConcatOffsetOp); +#ifdef TENSORFLOW_USE_SYCL +REGISTER_KERNEL_BUILDER(Name("ConcatOffset") + .Device(DEVICE_SYCL) + .HostMemory("concat_dim") + .HostMemory("shape") + .HostMemory("offset"), + ConcatOffsetOp); +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/constant_op.cc b/tensorflow/core/kernels/constant_op.cc index 115a842d1c..d444ddec1d 100644 --- a/tensorflow/core/kernels/constant_op.cc +++ b/tensorflow/core/kernels/constant_op.cc @@ -57,7 +57,10 @@ REGISTER_KERNEL_BUILDER(Name("Const").Device(DEVICE_CPU), ConstantOp); REGISTER_KERNEL_BUILDER( \ Name("Const").Device(DEVICE_SYCL).TypeConstraint("dtype"), \ ConstantOp); -TF_CALL_NUMBER_TYPES(REGISTER_SYCL_KERNEL); +REGISTER_SYCL_KERNEL(float); +REGISTER_SYCL_KERNEL(double); +REGISTER_SYCL_KERNEL(bool); +REGISTER_SYCL_KERNEL(int64); #undef REGISTER_SYCL_KERNEL #endif @@ -112,6 +115,17 @@ REGISTER_KERNEL_BUILDER(Name("Const") HostConstantOp); #endif +#ifdef TENSORFLOW_USE_SYCL +// A special GPU kernel for int32. +// TODO(b/25387198): Also enable int32 in device memory. This kernel +// registration requires all int32 inputs and outputs to be in host memory. +REGISTER_KERNEL_BUILDER(Name("Const") + .Device(DEVICE_SYCL) + .HostMemory("output") + .TypeConstraint("dtype"), + HostConstantOp); +#endif // TENSORFLOW_USE_SYCL + typedef Eigen::ThreadPoolDevice CPUDevice; typedef Eigen::GpuDevice GPUDevice; #ifdef TENSORFLOW_USE_SYCL @@ -186,6 +200,7 @@ REGISTER_KERNEL(CPU, quint8); #ifdef TENSORFLOW_USE_SYCL REGISTER_KERNEL(SYCL, float) +REGISTER_KERNEL(SYCL, double) REGISTER_KERNEL_BUILDER(Name("Fill") .Device(DEVICE_SYCL) .TypeConstraint("T") @@ -246,6 +261,7 @@ TF_CALL_POD_STRING_TYPES(REGISTER_CPU); #ifdef TENSORFLOW_USE_SYCL REGISTER_KERNEL(float, SYCL); +REGISTER_KERNEL(bool, SYCL); REGISTER_KERNEL_BUILDER(Name("ZerosLike") .Device(DEVICE_SYCL) .TypeConstraint("T") diff --git a/tensorflow/core/kernels/control_flow_ops.cc b/tensorflow/core/kernels/control_flow_ops.cc index 1a73a3d0f8..6a79be5a95 100644 --- a/tensorflow/core/kernels/control_flow_ops.cc +++ b/tensorflow/core/kernels/control_flow_ops.cc @@ -321,6 +321,30 @@ TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_REF_KERNEL); #undef REGISTER_SYCL_KERNEL #undef REGISTER_SYCL_REF_KERNEL +#define REGISTER_SYCL_HOST_KERNEL(type) \ + REGISTER_KERNEL_BUILDER(Name("Enter") \ + .Device(DEVICE_SYCL) \ + .HostMemory("data") \ + .HostMemory("output") \ + .TypeConstraint("T"), \ + EnterOp) + +#define REGISTER_SYCL_HOST_REF_KERNEL(type) \ + REGISTER_KERNEL_BUILDER(Name("RefEnter") \ + .Device(DEVICE_SYCL) \ + .HostMemory("data") \ + .HostMemory("output") \ + .TypeConstraint("T"), \ + EnterOp) + +REGISTER_SYCL_HOST_KERNEL(int32); +REGISTER_SYCL_HOST_REF_KERNEL(int32); +REGISTER_SYCL_HOST_KERNEL(string); +REGISTER_SYCL_HOST_REF_KERNEL(string); +REGISTER_SYCL_HOST_KERNEL(ResourceHandle); + +#undef REGISTER_SYCL_HOST_KERNEL +#undef REGISTER_SYCL_HOST_REF_KERNEL #endif // Special GPU kernels for int32 and string. diff --git a/tensorflow/core/kernels/cwise_op_acos.cc b/tensorflow/core/kernels/cwise_op_acos.cc index 1d2d815027..65801da3c7 100644 --- a/tensorflow/core/kernels/cwise_op_acos.cc +++ b/tensorflow/core/kernels/cwise_op_acos.cc @@ -26,6 +26,7 @@ REGISTER2(UnaryOp, CPU, "Acos", functor::acos, float, double); .TypeConstraint("T"), \ UnaryOp>); REGISTER_SYCL_KERNEL(float); +REGISTER_SYCL_KERNEL(double); #undef REGISTER_SYCL_KERNEL #endif // TENSORFLOW_USE_SYCL diff --git a/tensorflow/core/kernels/cwise_op_add_1.cc b/tensorflow/core/kernels/cwise_op_add_1.cc index a6bff78694..f6e9b59cf8 100644 --- a/tensorflow/core/kernels/cwise_op_add_1.cc +++ b/tensorflow/core/kernels/cwise_op_add_1.cc @@ -18,7 +18,7 @@ limitations under the License. namespace tensorflow { REGISTER5(BinaryOp, CPU, "Add", functor::add, float, Eigen::half, double, int32, int64); - + #if TENSORFLOW_USE_SYCL #define REGISTER_SYCL_KERNEL(TYPE) \ REGISTER_KERNEL_BUILDER( \ @@ -26,10 +26,19 @@ REGISTER5(BinaryOp, CPU, "Add", functor::add, float, Eigen::half, double, int32, .Device(DEVICE_SYCL) \ .TypeConstraint("T"), \ BinaryOp>); - REGISTER_SYCL_KERNEL(float); +REGISTER_SYCL_KERNEL(float); +REGISTER_SYCL_KERNEL(double); #undef REGISTER_SYCL_KERNEL + +REGISTER_KERNEL_BUILDER(Name("Add") + .Device(DEVICE_SYCL) + .HostMemory("x") + .HostMemory("y") + .HostMemory("z") + .TypeConstraint("T"), + BinaryOp>); #endif // TENSORFLOW_USE_SYCL - + #if GOOGLE_CUDA REGISTER3(BinaryOp, GPU, "Add", functor::add, float, Eigen::half, double); diff --git a/tensorflow/core/kernels/cwise_op_asin.cc b/tensorflow/core/kernels/cwise_op_asin.cc index 92a22e90c4..c9ebfe759b 100644 --- a/tensorflow/core/kernels/cwise_op_asin.cc +++ b/tensorflow/core/kernels/cwise_op_asin.cc @@ -26,6 +26,7 @@ REGISTER2(UnaryOp, CPU, "Asin", functor::asin, float, double); .TypeConstraint("T"), \ UnaryOp>); REGISTER_SYCL_KERNEL(float); +REGISTER_SYCL_KERNEL(double); #undef REGISTER_SYCL_KERNEL #endif // TENSORFLOW_USE_SYCL diff --git a/tensorflow/core/kernels/cwise_op_atan.cc b/tensorflow/core/kernels/cwise_op_atan.cc index 825e85283f..72645b303f 100644 --- a/tensorflow/core/kernels/cwise_op_atan.cc +++ b/tensorflow/core/kernels/cwise_op_atan.cc @@ -26,6 +26,7 @@ REGISTER2(UnaryOp, CPU, "Atan", functor::atan, float, double); .TypeConstraint("T"), \ UnaryOp>); REGISTER_SYCL_KERNEL(float); +REGISTER_SYCL_KERNEL(double); #undef REGISTER_SYCL_KERNEL #endif // TENSORFLOW_USE_SYCL diff --git a/tensorflow/core/kernels/cwise_op_ceil.cc b/tensorflow/core/kernels/cwise_op_ceil.cc index c5a4aaf831..c74e10576d 100644 --- a/tensorflow/core/kernels/cwise_op_ceil.cc +++ b/tensorflow/core/kernels/cwise_op_ceil.cc @@ -26,6 +26,7 @@ REGISTER3(UnaryOp, CPU, "Ceil", functor::ceil, float, Eigen::half, double); .TypeConstraint("T"), \ UnaryOp>); REGISTER_SYCL_KERNEL(float); +REGISTER_SYCL_KERNEL(double); #undef REGISTER_SYCL_KERNEL #endif // TENSORFLOW_USE_SYCL diff --git a/tensorflow/core/kernels/cwise_op_cos.cc b/tensorflow/core/kernels/cwise_op_cos.cc index a758da5842..634c90adc6 100644 --- a/tensorflow/core/kernels/cwise_op_cos.cc +++ b/tensorflow/core/kernels/cwise_op_cos.cc @@ -27,6 +27,7 @@ REGISTER5(UnaryOp, CPU, "Cos", functor::cos, float, Eigen::half, double, .TypeConstraint("T"), \ UnaryOp>); REGISTER_SYCL_KERNEL(float); +REGISTER_SYCL_KERNEL(double); #undef REGISTER_SYCL_KERNEL #endif // TENSORFLOW_USE_SYCL diff --git a/tensorflow/core/kernels/cwise_op_div.cc b/tensorflow/core/kernels/cwise_op_div.cc index 74d8faedb5..1e2300832f 100644 --- a/tensorflow/core/kernels/cwise_op_div.cc +++ b/tensorflow/core/kernels/cwise_op_div.cc @@ -37,8 +37,18 @@ REGISTER5(BinaryOp, CPU, "RealDiv", functor::div, float, Eigen::half, double, .TypeConstraint("T"), \ BinaryOp>); REGISTER_SYCL_KERNEL(float) -REGISTER_SYCL_KERNEL(int32) +REGISTER_SYCL_KERNEL(double) #undef REGISTER_SYCL_KERNEL +// A special GPU kernel for int32. +// TODO(b/25387198): Also enable int32 in device memory. This kernel +// registration requires all int32 inputs and outputs to be in host memory. +REGISTER_KERNEL_BUILDER(Name("Div") + .Device(DEVICE_SYCL) + .HostMemory("x") + .HostMemory("y") + .HostMemory("z") + .TypeConstraint("T"), + BinaryOp>); #endif // TENSORFLOW_USE_SYCL #if GOOGLE_CUDA REGISTER9(BinaryOp, GPU, "Div", functor::div, float, Eigen::half, double, uint8, diff --git a/tensorflow/core/kernels/cwise_op_equal_to_1.cc b/tensorflow/core/kernels/cwise_op_equal_to_1.cc index 7bd44abd39..93ea768836 100644 --- a/tensorflow/core/kernels/cwise_op_equal_to_1.cc +++ b/tensorflow/core/kernels/cwise_op_equal_to_1.cc @@ -34,4 +34,16 @@ REGISTER_KERNEL_BUILDER(Name("Equal") BinaryOp>); #endif +#ifdef TENSORFLOW_USE_SYCL +REGISTER2(BinaryOp, SYCL, "Equal", functor::equal_to, float, double); + +REGISTER_KERNEL_BUILDER(Name("Equal") + .Device(DEVICE_SYCL) + .HostMemory("x") + .HostMemory("y") + .HostMemory("z") + .TypeConstraint("T"), + BinaryOp>); +#endif // TENSORFLOW_USE_SYCL + } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_expm1.cc b/tensorflow/core/kernels/cwise_op_expm1.cc index f1c53ca272..5573c2bcc2 100644 --- a/tensorflow/core/kernels/cwise_op_expm1.cc +++ b/tensorflow/core/kernels/cwise_op_expm1.cc @@ -21,4 +21,7 @@ REGISTER5(UnaryOp, CPU, "Expm1", functor::expm1, float, Eigen::half, double, #if GOOGLE_CUDA REGISTER3(UnaryOp, GPU, "Expm1", functor::expm1, float, Eigen::half, double); #endif +#ifdef TENSORFLOW_USE_SYCL +REGISTER(UnaryOp, SYCL, "Expm1", functor::expm1, float); +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_floor.cc b/tensorflow/core/kernels/cwise_op_floor.cc index 129d754b82..59e32d7f6f 100644 --- a/tensorflow/core/kernels/cwise_op_floor.cc +++ b/tensorflow/core/kernels/cwise_op_floor.cc @@ -26,6 +26,7 @@ REGISTER3(UnaryOp, CPU, "Floor", functor::floor, float, Eigen::half, double); .TypeConstraint("T"), \ UnaryOp>); REGISTER_SYCL_KERNEL(float); +REGISTER_SYCL_KERNEL(double); #undef REGISTER_SYCL_KERNEL #endif // TENSORFLOW_USE_SYCL diff --git a/tensorflow/core/kernels/cwise_op_floor_div.cc b/tensorflow/core/kernels/cwise_op_floor_div.cc index 8a600f8f95..fa81ef0872 100644 --- a/tensorflow/core/kernels/cwise_op_floor_div.cc +++ b/tensorflow/core/kernels/cwise_op_floor_div.cc @@ -21,17 +21,6 @@ REGISTER5(BinaryOp, CPU, "FloorDiv", functor::safe_floor_div, uint8, uint16, REGISTER3(BinaryOp, CPU, "FloorDiv", functor::floor_div_real, float, Eigen::half, double); -#if TENSORFLOW_USE_SYCL -#define REGISTER_SYCL_KERNEL(TYPE) \ - REGISTER_KERNEL_BUILDER( \ - Name("FloorDiv") \ - .Device(DEVICE_SYCL) \ - .TypeConstraint("T"), \ - BinaryOp>); -REGISTER_SYCL_KERNEL(float) -#undef REGISTER_SYCL_KERNEL -#endif // TENSORFLOW_USE_SYCL - #if GOOGLE_CUDA REGISTER4(BinaryOp, GPU, "FloorDiv", functor::floor_div, uint8, uint16, int16, int64); @@ -51,4 +40,14 @@ REGISTER_KERNEL_BUILDER(Name("FloorDiv") .TypeConstraint("T"), BinaryOp>); #endif + +#ifdef TENSORFLOW_USE_SYCL +REGISTER_KERNEL_BUILDER(Name("FloorDiv") + .Device(DEVICE_SYCL) + .HostMemory("x") + .HostMemory("y") + .HostMemory("z") + .TypeConstraint("T"), + BinaryOp>); +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_floor_mod.cc b/tensorflow/core/kernels/cwise_op_floor_mod.cc index 4e641a8bb3..55f8a30461 100644 --- a/tensorflow/core/kernels/cwise_op_floor_mod.cc +++ b/tensorflow/core/kernels/cwise_op_floor_mod.cc @@ -31,4 +31,14 @@ REGISTER_KERNEL_BUILDER(Name("FloorMod") .TypeConstraint("T"), BinaryOp>); #endif + +#ifdef TENSORFLOW_USE_SYCL +REGISTER_KERNEL_BUILDER(Name("FloorMod") + .Device(DEVICE_SYCL) + .HostMemory("x") + .HostMemory("y") + .HostMemory("z") + .TypeConstraint("T"), + BinaryOp>); +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_greater.cc b/tensorflow/core/kernels/cwise_op_greater.cc index 8c9691d1ea..6b5a806aa2 100644 --- a/tensorflow/core/kernels/cwise_op_greater.cc +++ b/tensorflow/core/kernels/cwise_op_greater.cc @@ -33,5 +33,19 @@ REGISTER_KERNEL_BUILDER(Name("Greater") .TypeConstraint("T"), BinaryOp>); #endif +#ifdef TENSORFLOW_USE_SYCL +REGISTER(BinaryOp, SYCL, "Greater", functor::greater, float); + +// A special GPU kernel for int32. +// TODO(b/25387198): Also enable int32 in device memory. This kernel +// registration requires all int32 inputs and outputs to be in host memory. +REGISTER_KERNEL_BUILDER(Name("Greater") + .Device(DEVICE_SYCL) + .HostMemory("x") + .HostMemory("y") + .HostMemory("z") + .TypeConstraint("T"), + BinaryOp>); +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_greater_equal.cc b/tensorflow/core/kernels/cwise_op_greater_equal.cc index a6083cb9cd..ac21528256 100644 --- a/tensorflow/core/kernels/cwise_op_greater_equal.cc +++ b/tensorflow/core/kernels/cwise_op_greater_equal.cc @@ -34,4 +34,15 @@ REGISTER_KERNEL_BUILDER(Name("GreaterEqual") BinaryOp>); #endif +#ifdef TENSORFLOW_USE_SYCL +REGISTER(BinaryOp, SYCL, "GreaterEqual", functor::greater_equal, float); + +REGISTER_KERNEL_BUILDER(Name("GreaterEqual") + .Device(DEVICE_SYCL) + .HostMemory("x") + .HostMemory("y") + .HostMemory("z") + .TypeConstraint("T"), + BinaryOp>); +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_isfinite.cc b/tensorflow/core/kernels/cwise_op_isfinite.cc index 59976141c7..0faeffa95c 100644 --- a/tensorflow/core/kernels/cwise_op_isfinite.cc +++ b/tensorflow/core/kernels/cwise_op_isfinite.cc @@ -27,6 +27,7 @@ REGISTER3(UnaryOp, CPU, "IsFinite", functor::isfinite, float, Eigen::half, .TypeConstraint("T"), \ UnaryOp>); REGISTER_SYCL_KERNEL(float); +REGISTER_SYCL_KERNEL(double); #undef REGISTER_SYCL_KERNEL #endif // TENSORFLOW_USE_SYCL diff --git a/tensorflow/core/kernels/cwise_op_isinf.cc b/tensorflow/core/kernels/cwise_op_isinf.cc index 675cb95b95..df63006b3f 100644 --- a/tensorflow/core/kernels/cwise_op_isinf.cc +++ b/tensorflow/core/kernels/cwise_op_isinf.cc @@ -26,6 +26,7 @@ REGISTER3(UnaryOp, CPU, "IsInf", functor::isinf, float, Eigen::half, double); .TypeConstraint("T"), \ UnaryOp>); REGISTER_SYCL_KERNEL(float); +REGISTER_SYCL_KERNEL(double); #undef REGISTER_SYCL_KERNEL #endif // TENSORFLOW_USE_SYCL diff --git a/tensorflow/core/kernels/cwise_op_isnan.cc b/tensorflow/core/kernels/cwise_op_isnan.cc index c394087ed8..e1cf7a8637 100644 --- a/tensorflow/core/kernels/cwise_op_isnan.cc +++ b/tensorflow/core/kernels/cwise_op_isnan.cc @@ -26,6 +26,7 @@ REGISTER3(UnaryOp, CPU, "IsNan", functor::isnan, float, Eigen::half, double); .TypeConstraint("T"), \ UnaryOp>); REGISTER_SYCL_KERNEL(float); +REGISTER_SYCL_KERNEL(double); #undef REGISTER_SYCL_KERNEL #endif // TENSORFLOW_USE_SYCL diff --git a/tensorflow/core/kernels/cwise_op_less.cc b/tensorflow/core/kernels/cwise_op_less.cc index 701007d637..a38f1024a9 100644 --- a/tensorflow/core/kernels/cwise_op_less.cc +++ b/tensorflow/core/kernels/cwise_op_less.cc @@ -33,5 +33,15 @@ REGISTER_KERNEL_BUILDER(Name("Less") .TypeConstraint("T"), BinaryOp>); #endif +#ifdef TENSORFLOW_USE_SYCL +REGISTER3(BinaryOp, SYCL, "Less", functor::less, float, double, int64); +REGISTER_KERNEL_BUILDER(Name("Less") + .Device(DEVICE_SYCL) + .HostMemory("x") + .HostMemory("y") + .HostMemory("z") + .TypeConstraint("T"), + BinaryOp>); +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_less_equal.cc b/tensorflow/core/kernels/cwise_op_less_equal.cc index 97fd1ae919..3a2cc2ae0e 100644 --- a/tensorflow/core/kernels/cwise_op_less_equal.cc +++ b/tensorflow/core/kernels/cwise_op_less_equal.cc @@ -34,4 +34,16 @@ REGISTER_KERNEL_BUILDER(Name("LessEqual") BinaryOp>); #endif +#ifdef TENSORFLOW_USE_SYCL +REGISTER(BinaryOp, SYCL, "LessEqual", functor::less_equal, float); + +REGISTER_KERNEL_BUILDER(Name("LessEqual") + .Device(DEVICE_SYCL) + .HostMemory("x") + .HostMemory("y") + .HostMemory("z") + .TypeConstraint("T"), + BinaryOp>); +#endif // TENSORFLOW_USE_SYCL + } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_log.cc b/tensorflow/core/kernels/cwise_op_log.cc index 71c4588b3d..5e74e778c7 100644 --- a/tensorflow/core/kernels/cwise_op_log.cc +++ b/tensorflow/core/kernels/cwise_op_log.cc @@ -27,6 +27,7 @@ REGISTER5(UnaryOp, CPU, "Log", functor::log, float, Eigen::half, double, .TypeConstraint("T"), \ UnaryOp>); REGISTER_SYCL_KERNEL(float); +REGISTER_SYCL_KERNEL(double); #undef REGISTER_SYCL_KERNEL #endif // TENSORFLOW_USE_SYCL diff --git a/tensorflow/core/kernels/cwise_op_log1p.cc b/tensorflow/core/kernels/cwise_op_log1p.cc index 03ea3a0a89..edb821318e 100644 --- a/tensorflow/core/kernels/cwise_op_log1p.cc +++ b/tensorflow/core/kernels/cwise_op_log1p.cc @@ -27,6 +27,7 @@ REGISTER5(UnaryOp, CPU, "Log1p", functor::log1p, float, Eigen::half, double, .TypeConstraint("T"), \ UnaryOp>); REGISTER_SYCL_KERNEL(float); +REGISTER_SYCL_KERNEL(double); #undef REGISTER_SYCL_KERNEL #endif // TENSORFLOW_USE_SYCL diff --git a/tensorflow/core/kernels/cwise_op_maximum.cc b/tensorflow/core/kernels/cwise_op_maximum.cc index f93b5a8303..7311f25ec0 100644 --- a/tensorflow/core/kernels/cwise_op_maximum.cc +++ b/tensorflow/core/kernels/cwise_op_maximum.cc @@ -34,4 +34,19 @@ REGISTER_KERNEL_BUILDER(Name("Maximum") BinaryOp>); #endif +#ifdef TENSORFLOW_USE_SYCL +REGISTER(BinaryOp, SYCL, "Maximum", functor::maximum, float); + +// A special GPU kernel for int32. +// TODO(b/25387198): Also enable int32 in device memory. This kernel +// registration requires all int32 inputs and outputs to be in host memory. +REGISTER_KERNEL_BUILDER(Name("Maximum") + .Device(DEVICE_SYCL) + .HostMemory("x") + .HostMemory("y") + .HostMemory("z") + .TypeConstraint("T"), + BinaryOp>); +#endif // TENSORFLOW_USE_SYCL + } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_minimum.cc b/tensorflow/core/kernels/cwise_op_minimum.cc index 36800975a8..99e5a76620 100644 --- a/tensorflow/core/kernels/cwise_op_minimum.cc +++ b/tensorflow/core/kernels/cwise_op_minimum.cc @@ -34,4 +34,16 @@ REGISTER_KERNEL_BUILDER(Name("Minimum") BinaryOp>); #endif +#ifdef TENSORFLOW_USE_SYCL +REGISTER(BinaryOp, SYCL, "Minimum", functor::minimum, float); + +REGISTER_KERNEL_BUILDER(Name("Minimum") + .Device(DEVICE_SYCL) + .HostMemory("x") + .HostMemory("y") + .HostMemory("z") + .TypeConstraint("T"), + BinaryOp>); +#endif // TENSORFLOW_USE_SYCL + } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_mul_1.cc b/tensorflow/core/kernels/cwise_op_mul_1.cc index e23fe6761d..5273522626 100644 --- a/tensorflow/core/kernels/cwise_op_mul_1.cc +++ b/tensorflow/core/kernels/cwise_op_mul_1.cc @@ -28,7 +28,15 @@ REGISTER5(BinaryOp, CPU, "Mul", functor::mul, float, Eigen::half, double, .TypeConstraint("T"), \ BinaryOp>); REGISTER_SYCL_KERNEL(float) +REGISTER_SYCL_KERNEL(double) #undef REGISTER_SYCL_KERNEL +REGISTER_KERNEL_BUILDER(Name("Mul") + .Device(DEVICE_SYCL) + .HostMemory("x") + .HostMemory("y") + .HostMemory("z") + .TypeConstraint("T"), + BinaryOp>); #endif // TENSORFLOW_USE_SYCL #if GOOGLE_CUDA REGISTER4(BinaryOp, GPU, "Mul", functor::mul, float, Eigen::half, double, diff --git a/tensorflow/core/kernels/cwise_op_pow.cc b/tensorflow/core/kernels/cwise_op_pow.cc index 8eeba6ab14..f1780168e4 100644 --- a/tensorflow/core/kernels/cwise_op_pow.cc +++ b/tensorflow/core/kernels/cwise_op_pow.cc @@ -27,6 +27,7 @@ REGISTER7(BinaryOp, CPU, "Pow", functor::pow, float, Eigen::half, double, int32, .TypeConstraint("T"), \ BinaryOp>); REGISTER_SYCL_KERNEL(float); +REGISTER_SYCL_KERNEL(double); #undef REGISTER_SYCL_KERNEL #endif // TENSORFLOW_USE_SYCL diff --git a/tensorflow/core/kernels/cwise_op_reciprocal.cc b/tensorflow/core/kernels/cwise_op_reciprocal.cc index d858a077f5..8c0e21f9cf 100644 --- a/tensorflow/core/kernels/cwise_op_reciprocal.cc +++ b/tensorflow/core/kernels/cwise_op_reciprocal.cc @@ -36,6 +36,9 @@ REGISTER5(UnaryOp, CPU, "Reciprocal", functor::inverse, float, Eigen::half, REGISTER4(UnaryOp, GPU, "Reciprocal", functor::inverse, float, Eigen::half, double, int64); #endif +#ifdef TENSORFLOW_USE_SYCL +REGISTER(UnaryOp, SYCL, "Reciprocal", functor::inverse, float); +#endif // TENSORFLOW_USE_SYCL REGISTER5(SimpleBinaryOp, CPU, "ReciprocalGrad", functor::inverse_grad, float, Eigen::half, double, complex64, complex128); @@ -43,4 +46,7 @@ REGISTER5(SimpleBinaryOp, CPU, "ReciprocalGrad", functor::inverse_grad, float, REGISTER3(SimpleBinaryOp, GPU, "ReciprocalGrad", functor::inverse_grad, float, Eigen::half, double); #endif +#ifdef TENSORFLOW_USE_SYCL +REGISTER(SimpleBinaryOp, SYCL, "ReciprocalGrad", functor::inverse_grad, float); +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_round.cc b/tensorflow/core/kernels/cwise_op_round.cc index 7a4482dbb2..e192f89782 100644 --- a/tensorflow/core/kernels/cwise_op_round.cc +++ b/tensorflow/core/kernels/cwise_op_round.cc @@ -20,9 +20,9 @@ REGISTER5(UnaryOp, CPU, "Round", functor::round, Eigen::half, float, double, int32, int64); #ifdef TENSORFLOW_USE_SYCL -REGISTER(UnaryOp, SYCL, "Round", functor::round, float); +REGISTER2(UnaryOp, SYCL, "Round", functor::round, float, double); namespace functor { -DEFINE_UNARY1(round, float); +DEFINE_UNARY2(round, float, double); } // namespace functor #endif diff --git a/tensorflow/core/kernels/cwise_op_rsqrt.cc b/tensorflow/core/kernels/cwise_op_rsqrt.cc index 7dc96d47a6..f23725f48e 100644 --- a/tensorflow/core/kernels/cwise_op_rsqrt.cc +++ b/tensorflow/core/kernels/cwise_op_rsqrt.cc @@ -27,6 +27,7 @@ REGISTER5(UnaryOp, CPU, "Rsqrt", functor::rsqrt, float, Eigen::half, double, .TypeConstraint("T"), \ UnaryOp>); REGISTER_SYCL_KERNEL(float); +REGISTER_SYCL_KERNEL(double); #undef REGISTER_SYCL_KERNEL #endif // TENSORFLOW_USE_SYCL diff --git a/tensorflow/core/kernels/cwise_op_select.cc b/tensorflow/core/kernels/cwise_op_select.cc index add26b5ac8..709628da13 100644 --- a/tensorflow/core/kernels/cwise_op_select.cc +++ b/tensorflow/core/kernels/cwise_op_select.cc @@ -28,6 +28,10 @@ namespace tensorflow { typedef Eigen::ThreadPoolDevice CPUDevice; typedef Eigen::GpuDevice GPUDevice; +#ifdef TENSORFLOW_USE_SYCL +typedef Eigen::SyclDevice SYCLDevice; +#endif // TENSORFLOW_USE_SYCL + template class SelectOp : public OpKernel { public: @@ -169,12 +173,24 @@ REGISTER_SELECT_GPU(complex128); #endif // GOOGLE_CUDA +#ifdef TENSORFLOW_USE_SYCL +// Registration of the SYCL implementations. +#define REGISTER_SELECT_SYCL(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("Select").Device(DEVICE_SYCL).TypeConstraint("T"), \ + SelectOp); + +REGISTER_SELECT_SYCL(float); +REGISTER_SELECT_SYCL(int32); +#undef REGISTER_SELECT_SYCL +#endif // TENSORFLOW_USE_SYCL + namespace functor { // CPU Specializations of Select functors. -template -struct SelectFunctor { - void operator()(const CPUDevice& d, typename TTypes::Flat out, +template +struct SelectFunctorBase { + void operator()(const Device& d, typename TTypes::Flat out, typename TTypes::ConstFlat cond_flat, typename TTypes::ConstFlat then_flat, typename TTypes::ConstFlat else_flat) { @@ -182,10 +198,18 @@ struct SelectFunctor { } }; -// CPU Specializations of Select functors with scalar template -struct SelectScalarFunctor { - void operator()(const CPUDevice& d, typename TTypes::Flat out, +struct SelectFunctor + : SelectFunctorBase {}; +#ifdef TENSORFLOW_USE_SYCL +template +struct SelectFunctor + : SelectFunctorBase {}; +#endif // TENSORFLOW_USE_SYCL + +template +struct SelectScalarFunctorBase { + void operator()(const Device& d, typename TTypes::Flat out, TTypes::ConstScalar cond, typename TTypes::ConstFlat then_flat, typename TTypes::ConstFlat else_flat) { @@ -193,9 +217,19 @@ struct SelectScalarFunctor { } }; +// CPU Specializations of Select functors with scalar template -struct BatchSelectFunctor { - void operator()(const CPUDevice& d, +struct SelectScalarFunctor + : SelectScalarFunctorBase {}; +#ifdef TENSORFLOW_USE_SYCL +template +struct SelectScalarFunctor + : SelectScalarFunctorBase {}; +#endif // TENSORFLOW_USE_SYCL + +template +struct BatchSelectFunctorBase { + void operator()(const Device& d, typename TTypes::Matrix output_flat_outer_dims, TTypes::ConstVec cond_vec, typename TTypes::ConstMatrix then_flat_outer_dims, @@ -220,6 +254,15 @@ struct BatchSelectFunctor { } }; +template +struct BatchSelectFunctor + : BatchSelectFunctorBase {}; +#ifdef TENSORFLOW_USE_SYCL +template +struct BatchSelectFunctor + : BatchSelectFunctorBase {}; +#endif // TENSORFLOW_USE_SYCL + } // namespace functor } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_sigmoid.cc b/tensorflow/core/kernels/cwise_op_sigmoid.cc index cc1f9b8f03..a76a088ac8 100644 --- a/tensorflow/core/kernels/cwise_op_sigmoid.cc +++ b/tensorflow/core/kernels/cwise_op_sigmoid.cc @@ -23,6 +23,9 @@ REGISTER5(UnaryOp, CPU, "Sigmoid", functor::sigmoid, float, Eigen::half, double, REGISTER3(UnaryOp, GPU, "Sigmoid", functor::sigmoid, float, Eigen::half, double); #endif +#ifdef TENSORFLOW_USE_SYCL +REGISTER(UnaryOp, SYCL, "Sigmoid", functor::sigmoid, float); +#endif // TENSORFLOW_USE_SYCL REGISTER5(SimpleBinaryOp, CPU, "SigmoidGrad", functor::sigmoid_grad, float, Eigen::half, double, complex64, complex128); @@ -30,5 +33,8 @@ REGISTER5(SimpleBinaryOp, CPU, "SigmoidGrad", functor::sigmoid_grad, float, REGISTER3(SimpleBinaryOp, GPU, "SigmoidGrad", functor::sigmoid_grad, float, Eigen::half, double); #endif +#ifdef TENSORFLOW_USE_SYCL +REGISTER(SimpleBinaryOp, SYCL, "SigmoidGrad", functor::sigmoid_grad, float); +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_sign.cc b/tensorflow/core/kernels/cwise_op_sign.cc index 568906612a..dedd414db5 100644 --- a/tensorflow/core/kernels/cwise_op_sign.cc +++ b/tensorflow/core/kernels/cwise_op_sign.cc @@ -33,4 +33,17 @@ REGISTER_KERNEL_BUILDER(Name("Sign") UnaryOp>); #endif +#ifdef TENSORFLOW_USE_SYCL +REGISTER(UnaryOp, SYCL, "Sign", functor::sign, float); +// A special GPU kernel for int32. +// TODO(b/25387198): Also enable int32 in device memory. This kernel +// registration requires all int32 inputs and outputs to be in host memory. +REGISTER_KERNEL_BUILDER(Name("Sign") + .Device(DEVICE_SYCL) + .HostMemory("x") + .HostMemory("y") + .TypeConstraint("T"), + UnaryOp>); +#endif // TENSORFLOW_USE_SYCL + } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_sin.cc b/tensorflow/core/kernels/cwise_op_sin.cc index 8d0c0959f7..ab54c61b56 100644 --- a/tensorflow/core/kernels/cwise_op_sin.cc +++ b/tensorflow/core/kernels/cwise_op_sin.cc @@ -27,6 +27,7 @@ REGISTER5(UnaryOp, CPU, "Sin", functor::sin, float, Eigen::half, double, .TypeConstraint("T"), \ UnaryOp>); REGISTER_SYCL_KERNEL(float); +REGISTER_SYCL_KERNEL(double); #undef REGISTER_SYCL_KERNEL #endif // TENSORFLOW_USE_SYC diff --git a/tensorflow/core/kernels/cwise_op_sqrt.cc b/tensorflow/core/kernels/cwise_op_sqrt.cc index 710001517b..55acf648db 100644 --- a/tensorflow/core/kernels/cwise_op_sqrt.cc +++ b/tensorflow/core/kernels/cwise_op_sqrt.cc @@ -27,8 +27,9 @@ REGISTER5(UnaryOp, CPU, "Sqrt", functor::sqrt, float, Eigen::half, double, .TypeConstraint("T"), \ UnaryOp>); REGISTER_SYCL_KERNEL(float); +REGISTER_SYCL_KERNEL(double); #undef REGISTER_SYCL_KERNEL -#endif // TENSORFLOW_USE_SYC +#endif // TENSORFLOW_USE_SYCL #if GOOGLE_CUDA REGISTER3(UnaryOp, GPU, "Sqrt", functor::sqrt, float, Eigen::half, double); diff --git a/tensorflow/core/kernels/cwise_op_square.cc b/tensorflow/core/kernels/cwise_op_square.cc index f867f127a7..afcacfec1c 100644 --- a/tensorflow/core/kernels/cwise_op_square.cc +++ b/tensorflow/core/kernels/cwise_op_square.cc @@ -27,6 +27,7 @@ REGISTER7(UnaryOp, CPU, "Square", functor::square, float, Eigen::half, double, .TypeConstraint("T"), \ UnaryOp>); REGISTER_SYCL_KERNEL(float); +REGISTER_SYCL_KERNEL(double); #undef REGISTER_SYCL_KERNEL #endif // TENSORFLOW_USE_SYC diff --git a/tensorflow/core/kernels/cwise_op_tan.cc b/tensorflow/core/kernels/cwise_op_tan.cc index ac49cad88f..9c850c9420 100644 --- a/tensorflow/core/kernels/cwise_op_tan.cc +++ b/tensorflow/core/kernels/cwise_op_tan.cc @@ -26,6 +26,7 @@ REGISTER2(UnaryOp, CPU, "Tan", functor::tan, float, double); .TypeConstraint("T"), \ UnaryOp>); REGISTER_SYCL_KERNEL(float); +REGISTER_SYCL_KERNEL(double); #undef REGISTER_SYCL_KERNEL #endif // TENSORFLOW_USE_SYC diff --git a/tensorflow/core/kernels/cwise_op_tanh.cc b/tensorflow/core/kernels/cwise_op_tanh.cc index ae2c473e20..1dbc13061b 100644 --- a/tensorflow/core/kernels/cwise_op_tanh.cc +++ b/tensorflow/core/kernels/cwise_op_tanh.cc @@ -28,6 +28,7 @@ REGISTER5(UnaryOp, CPU, "Tanh", functor::tanh, float, Eigen::half, double, .TypeConstraint("T"), \ UnaryOp>); REGISTER_SYCL_KERNEL(float); +REGISTER_SYCL_KERNEL(double); #undef REGISTER_SYCL_KERNEL #endif // TENSORFLOW_USE_SYC diff --git a/tensorflow/core/kernels/cwise_ops_gradients.h b/tensorflow/core/kernels/cwise_ops_gradients.h index 671de380d3..77b330f589 100644 --- a/tensorflow/core/kernels/cwise_ops_gradients.h +++ b/tensorflow/core/kernels/cwise_ops_gradients.h @@ -171,6 +171,21 @@ struct SimpleBinaryFunctor { } }; + +#ifdef TENSORFLOW_USE_SYCL +// Partial specialization of BinaryFunctor for SYCL devices +typedef Eigen::SyclDevice SYCLDevice; +template +struct SimpleBinaryFunctor { + void operator()(const SYCLDevice& d, typename Functor::tout_type out, + typename Functor::tin_type in0, + typename Functor::tin_type in1) { + out.device(d) = in0.binaryExpr(in1, typename Functor::func()); + } +}; + +#endif // TENSORFLOW_USE_SYCL + template struct tanh_grad : base> {}; diff --git a/tensorflow/core/kernels/cwise_ops_test.cc b/tensorflow/core/kernels/cwise_ops_test.cc index 6250928aca..92018ec871 100644 --- a/tensorflow/core/kernels/cwise_ops_test.cc +++ b/tensorflow/core/kernels/cwise_ops_test.cc @@ -51,18 +51,38 @@ static int ColsFromArg(int arg) { return (arg % kRows); } BENCHMARK(BM_##DEVICE##_##FUNC##_##TYPE)->Range(4 << 10, 1 << 20); BM_UNARY(cpu, Floor, float, DT_FLOAT); +#if GOOGLE_CUDA BM_UNARY(gpu, Floor, float, DT_FLOAT); +#endif // GOOGLE_CUDA +#ifdef TENSORFLOW_USE_SYCL +BM_UNARY(sycl, Floor, float, DT_FLOAT); +#endif // TENSORFLOW_USE_SYCL + BM_UNARY(cpu, Floor, double, DT_DOUBLE); +#if GOOGLE_CUDA BM_UNARY(gpu, Floor, double, DT_DOUBLE); +#endif // GOOGLE_CUDA +#ifdef TENSORFLOW_USE_SYCL +BM_UNARY(sycl, Floor, double, DT_DOUBLE); +#endif // TENSORFLOW_USE_SYCL + BM_UNARY(cpu, Conj, std::complex, DT_COMPLEX64); +#if GOOGLE_CUDA BM_UNARY(gpu, Conj, std::complex, DT_COMPLEX64); +#endif // GOOGLE_CUDA BM_UNARY(cpu, Conj, std::complex, DT_COMPLEX128); +#if GOOGLE_CUDA BM_UNARY(gpu, Conj, std::complex, DT_COMPLEX128); +#endif // GOOGLE_CUDA BM_UNARY(cpu, Rint, double, DT_DOUBLE); +#if GOOGLE_CUDA BM_UNARY(gpu, Rint, double, DT_DOUBLE); +#endif // GOOGLE_CUDA BM_UNARY(cpu, Rint, float, DT_FLOAT); +#if GOOGLE_CUDA BM_UNARY(gpu, Rint, float, DT_FLOAT); +#endif // GOOGLE_CUDA // data func scalar. static Graph* BinaryScalar(int num, const string& func) { @@ -90,9 +110,20 @@ static Graph* BinaryScalar(int num, const string& func) { ->Arg(1048576); BM_BINARY_SCALAR(cpu, Less); +#if GOOGLE_CUDA BM_BINARY_SCALAR(gpu, Less); +#endif // GOOGLE_CUDA +#ifdef TENSORFLOW_USE_SYCL +BM_BINARY_SCALAR(sycl, Less); +#endif // TENSORFLOW_USE_SYCL + BM_BINARY_SCALAR(cpu, Add); +#if GOOGLE_CUDA BM_BINARY_SCALAR(gpu, Add); +#endif // GOOGLE_CUDA +#ifdef TENSORFLOW_USE_SYCL +BM_BINARY_SCALAR(sycl, Add); +#endif // TENSORFLOW_USE_SYCL #undef BM_BINARY_SCALAR template @@ -130,9 +161,13 @@ static Graph* BiasAdd(int rows, int cols, DataType type) { using Eigen::half; BM_BIAS_ADD_ALL(cpu, float, DT_FLOAT); +#if GOOGLE_CUDA BM_BIAS_ADD_ALL(gpu, float, DT_FLOAT); +#endif // GOOGLE_CUDA BM_BIAS_ADD_ALL(cpu, half, DT_HALF); +#if GOOGLE_CUDA BM_BIAS_ADD_ALL(gpu, half, DT_HALF); +#endif // GOOGLE_CUDA #undef BM_BIAS_ADD_ALL #undef BM_BIAS_ADD @@ -180,12 +215,18 @@ static Graph* BiasAddGrad(int rows, int cols, int channels, DataType type, BM_BIAS_ADD_GRAD(DEVICE, FORMAT, C_TYPE, TF_TYPE, 4096, 4096, 1); using Eigen::half; +#if GOOGLE_CUDA BM_BIAS_ADD_GRAD_ALL(gpu, NCHW, float, DT_FLOAT); BM_BIAS_ADD_GRAD_ALL(gpu, NCHW, half, DT_HALF); +#endif // GOOGLE_CUDA BM_BIAS_ADD_GRAD_ALL(cpu, NHWC, float, DT_FLOAT); +#if GOOGLE_CUDA BM_BIAS_ADD_GRAD_ALL(gpu, NHWC, float, DT_FLOAT); +#endif // GOOGLE_CUDA BM_BIAS_ADD_GRAD_ALL(cpu, NHWC, half, DT_HALF); +#if GOOGLE_CUDA BM_BIAS_ADD_GRAD_ALL(gpu, NHWC, half, DT_HALF); +#endif // GOOGLE_CUDA #undef BM_BIAS_ADD_GRAD_ALL #undef BM_BIAS_ADD_GRAD @@ -223,7 +264,12 @@ static Graph* BcastAdd(int rows, int cols, int dim) { BM_BCAST_ADD_ROW(DEVICE, 2048, 512); \ BM_BCAST_ADD_ROW(DEVICE, 4096, 512); BM_BCAST_ADD_ROW_ALL(cpu); +#if GOOGLE_CUDA BM_BCAST_ADD_ROW_ALL(gpu); +#endif // GOOGLE_CUDA +#ifdef TENSORFLOW_USE_SYCL +BM_BCAST_ADD_ROW_ALL(sycl); +#endif // TENSORFLOW_USE_SYCL #undef BM_BCAST_ADD_ROW_ALL #undef BM_BCAST_ADD_ROW @@ -244,7 +290,12 @@ BM_BCAST_ADD_ROW_ALL(gpu); BM_BCAST_ADD_COL(DEVICE, 2048, 512); \ BM_BCAST_ADD_COL(DEVICE, 4096, 512); BM_BCAST_ADD_COL_ALL(cpu); +#if GOOGLE_CUDA BM_BCAST_ADD_COL_ALL(gpu); +#endif // GOOGLE_CUDA +#ifdef TENSORFLOW_USE_SYCL +BM_BCAST_ADD_COL_ALL(sycl); +#endif // TENSORFLOW_USE_SYCL #undef BM_BCAST_ADD_COL_ALL #undef BM_BCAST_ADD_COL diff --git a/tensorflow/core/kernels/debug_ops.cc b/tensorflow/core/kernels/debug_ops.cc index e9ddd535ec..55a7657ea8 100644 --- a/tensorflow/core/kernels/debug_ops.cc +++ b/tensorflow/core/kernels/debug_ops.cc @@ -97,6 +97,7 @@ REGISTER_GPU_DEBUG_NAN_COUNT(double); .TypeConstraint("T"), \ DebugNanCountOp); REGISTER_GPU_DEBUG_NAN_COUNT(float); +REGISTER_GPU_DEBUG_NAN_COUNT(double); #endif // TENSORFLOW_USE_SYCL // Register debug numeric summary ops. @@ -133,6 +134,7 @@ TF_CALL_double(REGISTER_GPU_DEBUG_NUMERIC_SUMMARY_COUNT); .TypeConstraint("T"), \ DebugNumericSummaryOp); REGISTER_GPU_DEBUG_NUMERIC_SUMMARY_COUNT(float); +REGISTER_GPU_DEBUG_NUMERIC_SUMMARY_COUNT(double); #endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/dense_update_ops.cc b/tensorflow/core/kernels/dense_update_ops.cc index 42fe6e88c9..767f143727 100644 --- a/tensorflow/core/kernels/dense_update_ops.cc +++ b/tensorflow/core/kernels/dense_update_ops.cc @@ -152,6 +152,7 @@ typedef Eigen::SyclDevice SYCLDevice; DenseUpdateOp); REGISTER_SYCL_KERNEL(float); +REGISTER_SYCL_KERNEL(double); #undef REGISTER_SYCL_KERNEL #endif diff --git a/tensorflow/core/kernels/fill_functor.cc b/tensorflow/core/kernels/fill_functor.cc index 08ec4baff3..0df8f9d3ed 100644 --- a/tensorflow/core/kernels/fill_functor.cc +++ b/tensorflow/core/kernels/fill_functor.cc @@ -62,6 +62,8 @@ void SetZeroFunctor::operator()( #define DEFINE_SETZERO_SYCL(T) \ template struct SetZeroFunctor; DEFINE_SETZERO_SYCL(float); +DEFINE_SETZERO_SYCL(bool); +DEFINE_SETZERO_SYCL(double); #undef DEFINE_SETZERO_SYCL #endif // TENSORFLOW_USE_SYCL diff --git a/tensorflow/core/kernels/function_ops.cc b/tensorflow/core/kernels/function_ops.cc index 9aa289c3c9..d08dec46d1 100644 --- a/tensorflow/core/kernels/function_ops.cc +++ b/tensorflow/core/kernels/function_ops.cc @@ -185,6 +185,34 @@ REGISTER_KERNEL_BUILDER(Name("_ArrayToList") .TypeConstraint("T"), PassOn); +#ifdef TENSORFLOW_USE_SYCL +#define REGISTER_SYCL_KERNELS(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("_ListToArray").Device(DEVICE_SYCL).TypeConstraint("T"),\ + PassOn); \ + REGISTER_KERNEL_BUILDER( \ + Name("_ArrayToList").Device(DEVICE_SYCL).TypeConstraint("T"),\ + PassOn); + +REGISTER_SYCL_KERNELS(float); +REGISTER_SYCL_KERNELS(double); + +#undef REGISTER_SYCL_KERNELS + +REGISTER_KERNEL_BUILDER(Name("_ListToArray") + .Device(DEVICE_SYCL) + .HostMemory("input") + .HostMemory("output") + .TypeConstraint("T"), + PassOn); +REGISTER_KERNEL_BUILDER(Name("_ArrayToList") + .Device(DEVICE_SYCL) + .HostMemory("input") + .HostMemory("output") + .TypeConstraint("T"), + PassOn); +#endif // TENSORFLOW_USE_SYCL + class SymbolicGradientOp : public AsyncOpKernel { public: SymbolicGradientOp(OpKernelConstruction* ctx) diff --git a/tensorflow/core/kernels/matmul_op.cc b/tensorflow/core/kernels/matmul_op.cc index a2b0127fac..94fe22ed31 100644 --- a/tensorflow/core/kernels/matmul_op.cc +++ b/tensorflow/core/kernels/matmul_op.cc @@ -46,6 +46,9 @@ perftools::gputools::DeviceMemory AsDeviceMemory(const T* cuda_memory) { typedef Eigen::ThreadPoolDevice CPUDevice; typedef Eigen::GpuDevice GPUDevice; +#ifdef TENSORFLOW_USE_SYCL +typedef Eigen::SyclDevice SYCLDevice; +#endif // TENSORFLOW_USE_SYCL template struct LaunchMatMul; @@ -118,27 +121,42 @@ bool ExplicitVectorMatrixOptimization( return false; } -// On CPUs, we ignore USE_CUBLAS -template -struct LaunchMatMulCPU { +template +struct LaunchMatMulBase { static void launch( OpKernelContext* ctx, OpKernel* kernel, const Tensor& a, const Tensor& b, const Eigen::array, 1>& dim_pair, Tensor* out) { +#ifndef TENSORFLOW_USE_SYCL // An explicit vector-matrix multiply is much better optimized than an // implicit one and this is a bottleneck during non-batched inference. bool was_vector = ExplicitVectorMatrixOptimization(a, b, dim_pair, out); if (!was_vector) { - functor::MatMulFunctor()(ctx->eigen_device(), +#endif // TENSORFLOW_USE_SYCL + functor::MatMulFunctor()(ctx->eigen_device(), out->matrix(), a.matrix(), b.matrix(), dim_pair); +#ifndef TENSORFLOW_USE_SYCL } +#endif // TENSORFLOW_USE_SYCL } }; +// On CPUs, we ignore USE_CUBLAS +template +struct LaunchMatMulCPU : LaunchMatMulBase {}; + template struct LaunchMatMul : public LaunchMatMulCPU {}; +#ifdef TENSORFLOW_USE_SYCL +template +struct LaunchMatMulSYCL : LaunchMatMulBase {}; + +template +struct LaunchMatMul : public LaunchMatMulSYCL {}; +#endif // TENSORFLOW_USE_SYCL + #if GOOGLE_CUDA template @@ -256,6 +274,20 @@ struct MatMulFunctor { } }; +#ifdef TENSORFLOW_USE_SYCL +// Partial specialization MatMulFunctor. +template +struct MatMulFunctor { + void operator()( + const SYCLDevice& d, typename MatMulTypes::out_type out, + typename MatMulTypes::in_type in0, + typename MatMulTypes::in_type in1, + const Eigen::array, 1>& dim_pair) { + MatMul(d, out, in0, in1, dim_pair); + } +}; +#endif // TENSORFLOW_USE_SYCL + } // end namespace functor #define REGISTER_CPU(T) \ @@ -276,6 +308,12 @@ struct MatMulFunctor { .Label("cublas"), \ MatMulOp) +#if defined (INTEL_MKL) +// MKL does not support half and int32 types for matrix-multiplication, so +// register the kernel to use default Eigen based implementations for these types +TF_CALL_half(REGISTER_CPU); +TF_CALL_int32(REGISTER_CPU); +#else TF_CALL_float(REGISTER_CPU); TF_CALL_double(REGISTER_CPU); TF_CALL_half(REGISTER_CPU); @@ -283,6 +321,7 @@ TF_CALL_half(REGISTER_CPU); TF_CALL_int32(REGISTER_CPU); TF_CALL_complex64(REGISTER_CPU); TF_CALL_complex128(REGISTER_CPU); +#endif #if GOOGLE_CUDA TF_CALL_float(REGISTER_GPU); @@ -294,4 +333,17 @@ TF_CALL_half(REGISTER_GPU); #endif #endif // GOOGLE_CUDA +#ifdef TENSORFLOW_USE_SYCL +#define REGISTER_SYCL(T) \ + REGISTER_KERNEL_BUILDER( \ + Name("MatMul").Device(DEVICE_SYCL).TypeConstraint("T"), \ + MatMulOp); \ + REGISTER_KERNEL_BUILDER(Name("MatMul") \ + .Device(DEVICE_SYCL) \ + .TypeConstraint("T") \ + .Label("eigen"), \ + MatMulOp) +TF_CALL_float(REGISTER_SYCL); + +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/mkl_matmul_op.cc b/tensorflow/core/kernels/mkl_matmul_op.cc new file mode 100644 index 0000000000..3ba28c13ed --- /dev/null +++ b/tensorflow/core/kernels/mkl_matmul_op.cc @@ -0,0 +1,217 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// See docs in ../ops/math_ops.cc. + +// This file uses MKL CBLAS xGEMM for acceleration of TF Matrix-Matrix +// Multiplication (MatMul) operations. +// We currently register this kernel only for MKL supported data +// types (float, double, complex64, complex128). The macro INTEL_MKL is defined +// by the build system only when MKL is chosen as an option at configure stage +// and when it is undefined at build time, this file becomes an empty +// compilation unit + +#if defined(INTEL_MKL) + +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/kernels/fill_functor.h" +#include "third_party/mkl/include/mkl_cblas.h" + +namespace tensorflow { + +typedef Eigen::ThreadPoolDevice CPUDevice; + +template +class MklMatMulOp : public OpKernel { + public: + explicit MklMatMulOp(OpKernelConstruction* ctx) : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("transpose_a", &transpose_a_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("transpose_b", &transpose_b_)); + } + + void Compute(OpKernelContext* ctx) override { + const Tensor& a = ctx->input(0); + const Tensor& b = ctx->input(1); + + // Check that the dimensions of the two matrices are valid. + OP_REQUIRES(ctx, TensorShapeUtils::IsMatrix(a.shape()), + errors::InvalidArgument("In[0] is not a matrix")); + OP_REQUIRES(ctx, TensorShapeUtils::IsMatrix(b.shape()), + errors::InvalidArgument("In[1] is not a matrix")); + Eigen::array, 1> dim_pair; + dim_pair[0].first = transpose_a_ ? 0 : 1; + dim_pair[0].second = transpose_b_ ? 1 : 0; + + OP_REQUIRES(ctx, + a.dim_size(dim_pair[0].first) == b.dim_size(dim_pair[0].second), + errors::InvalidArgument("Matrix size-incompatible: In[0]: ", + a.shape().DebugString(), ", In[1]: ", + b.shape().DebugString())); + int a_dim_remaining = 1 - dim_pair[0].first; + int b_dim_remaining = 1 - dim_pair[0].second; + TensorShape out_shape( + {a.dim_size(a_dim_remaining), b.dim_size(b_dim_remaining)}); + Tensor* out = nullptr; + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, out_shape, &out)); + + if (out->NumElements() == 0) { + // If a has shape [0, x] or b has shape [x, 0], the output shape + // is a 0-element matrix, so there is nothing to do. + return; + } + + if (a.NumElements() == 0 || b.NumElements() == 0) { + // If a has shape [x, 0] and b has shape [0, y], the + // output shape is [x, y] where x and y are non-zero, so we fill + // the output with zeros. + functor::SetZeroFunctor f; + f(ctx->eigen_device(), out->flat()); + return; + } + + const int m = a.dim_size(1 - dim_pair[0].first); + const int k = a.dim_size(dim_pair[0].first); + const int n = b.dim_size(1 - dim_pair[0].second); + bool transpose_a = dim_pair[0].first == 0; + bool transpose_b = dim_pair[0].second == 1; + + auto a_ptr = (a.template flat().data()); + auto b_ptr = (b.template flat().data()); + auto c_ptr = (out->template flat().data()); + + MklBlasGemm(transpose_a, transpose_b, m, n, k, a_ptr, transpose_a ? m : k, + b_ptr, transpose_b ? k : n, c_ptr, n); + } + + private: + bool transpose_a_; + bool transpose_b_; + + // -------------------------------------------------------------------------- + // + // @brief Matrix-Matrix Multiplication with FP32 tensors, a, b, c using CBLAS + // interface. c = op(a) * op(b) + // + // @param transa Specifies the form of op(a) used in MatMul. If transa is + // true, then op(a) = a^T, otherwise op(a) = a + // + // @param transb Specifies the form of op(b) used in MatMul. If transb is + // true, then op(b) = b^T, otherwise op(b) = b + // + // @param m Specifies the number of rows of the matrix op(a) and of the + // matrix c. The value of m must be at least zero. + // + // @param n Specifies the number of columns of the matrix op(b) and the + // number of columns of the matrix c. The value of n must be at least zero. + // + // @param k Specifies the number of columns of the matrix op(a) and the + // number of rows of the matrix op(b) + // + // @param a Address of matrix a + // + // @param lda Leading dimension of 'a' matrix. This is set at calling site + // depending on transa parameter. Since TF uses row-major + // layout, leading dimension is the stride between consecutive rows + // lda = max(1,k) when transa is false, otherwise lda = max(1,m) + // + // @param b Address of matrix b + // + // @param ldb Leading dimension of 'b' matrix. This is set at calling site + // depending on transb parameter. Since TF uses row-major + // layout, leading dimension is the stride between consecutive rows + // ldb = max(1,n) when transb is false, otherwise ldb = max(1,k) + // + // @param c Address of matrix c + // + // @param ldc Leading dimension of 'c' matrix. Since TF uses row-major + // layout, leading dimension is the stride between consecutive rows, max(1,n) + // + // -------------------------------------------------------------------------- + void MklBlasGemm(bool transa, bool transb, const int m, const int n, + const int k, const float* a, const int lda, const float* b, + const int ldb, float* c, const int ldc) { + // BLAS GEMM API defines Matrix Multiplication as c = alpha * op(a) * op(b) + // + beta * c. + // Since TF MatMul does not have parameters for alpha, beta, we set them to + // 1.0 and 0.0 respectively. + const float alpha = 1.0f; + const float beta = 0.0f; + cblas_sgemm(CblasRowMajor, transa ? CblasTrans : CblasNoTrans, + transb ? CblasTrans : CblasNoTrans, m, n, k, alpha, a, lda, b, + ldb, beta, c, ldc); + } + + // Matrix-Matrix Multiplication with FP64 tensors. For detailed info about + // parameters, look at FP32 function description. + void MklBlasGemm(bool transa, bool transb, const int m, const int n, + const int k, const double* a, const int lda, const double* b, + const int ldb, double* c, const int ldc) { + const double alpha = 1.0; + const double beta = 0.0; + cblas_dgemm(CblasRowMajor, transa ? CblasTrans : CblasNoTrans, + transb ? CblasTrans : CblasNoTrans, m, n, k, alpha, a, lda, b, + ldb, beta, c, ldc); + } + + // Matrix-Matrix Multiplication with Complex64 (std::complex) tensors. + // For detailed info about parameters, look at FP32 function description. + void MklBlasGemm(bool transa, bool transb, const int m, const int n, + const int k, const std::complex* a, const int lda, + const std::complex* b, const int ldb, + std::complex* c, int const ldc) { + const MKL_Complex8 alpha = {1.0f, 0.0f}; + const MKL_Complex8 beta = {0.0f, 0.0f}; + cblas_cgemm(CblasRowMajor, transa ? CblasTrans : CblasNoTrans, + transb ? CblasTrans : CblasNoTrans, m, n, k, + static_cast(&alpha), static_cast(a), + lda, static_cast(b), ldb, + static_cast(&beta), static_cast(c), ldc); + } + + // Matrix-Matrix Multiplication with Complex128 (std::complex) + // tensors. For detailed info about parameters, look at FP32 function + // description. + void MklBlasGemm(bool transa, bool transb, const int m, const int n, + const int k, const std::complex* a, const int lda, + const std::complex* b, const int ldb, + std::complex* c, const int ldc) { + const MKL_Complex16 alpha = {1.0, 0.0}; + const MKL_Complex16 beta = {0.0, 0.0}; + cblas_zgemm(CblasRowMajor, transa ? CblasTrans : CblasNoTrans, + transb ? CblasTrans : CblasNoTrans, m, n, k, + static_cast(&alpha), static_cast(a), + lda, static_cast(b), ldb, + static_cast(&beta), static_cast(c), ldc); + } +}; + +#define REGISTER_CPU(T) \ + REGISTER_KERNEL_BUILDER( \ + Name("MatMul").Device(DEVICE_CPU).TypeConstraint("T"), \ + MklMatMulOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("MatMul").Device(DEVICE_CPU).TypeConstraint("T").Label("MKL"), \ + MklMatMulOp) + +// TODO:Consider template specialization when adding/removing additional types +TF_CALL_float(REGISTER_CPU); +TF_CALL_double(REGISTER_CPU); +TF_CALL_complex64(REGISTER_CPU); +TF_CALL_complex128(REGISTER_CPU); + +} // namespace tensorflow +#endif // INTEL_MKL diff --git a/tensorflow/core/kernels/pack_op.cc b/tensorflow/core/kernels/pack_op.cc index 4977ad1d7c..a6650f369b 100644 --- a/tensorflow/core/kernels/pack_op.cc +++ b/tensorflow/core/kernels/pack_op.cc @@ -167,6 +167,7 @@ REGISTER_KERNEL_BUILDER(Name("Pack") PackOp) REGISTER_SYCL(float); +REGISTER_SYCL(double); #undef REGISTER_SYCL // A special GPU kernel for int32. diff --git a/tensorflow/core/kernels/pad_op.cc b/tensorflow/core/kernels/pad_op.cc index bec2d02cb5..91984319c6 100644 --- a/tensorflow/core/kernels/pad_op.cc +++ b/tensorflow/core/kernels/pad_op.cc @@ -38,6 +38,9 @@ namespace tensorflow { typedef Eigen::ThreadPoolDevice CPUDevice; typedef Eigen::GpuDevice GPUDevice; +#ifdef TENSORFLOW_USE_SYCL +typedef Eigen::SyclDevice SYCLDevice; +#endif // TENSORFLOW_USE_SYCL template class PadOp : public OpKernel { @@ -199,4 +202,30 @@ REGISTER_KERNEL_BUILDER(Name("Pad") PadOp); #endif +#ifdef TENSORFLOW_USE_SYCL +// Registration of the GPU implementations. +#define REGISTER_SYCL_KERNEL(T) \ + REGISTER_KERNEL_BUILDER(Name("Pad") \ + .Device(DEVICE_SYCL) \ + .TypeConstraint("T") \ + .TypeConstraint("Tpaddings") \ + .HostMemory("paddings"), \ + PadOp) + +REGISTER_SYCL_KERNEL(float); +REGISTER_SYCL_KERNEL(double); + +// A special GPU kernel for int32. +// TODO(b/25387198): Also enable int32 in device memory. This kernel +// registration requires all int32 inputs and outputs to be in host memory. +REGISTER_KERNEL_BUILDER(Name("Pad") + .Device(DEVICE_SYCL) + .TypeConstraint("T") + .TypeConstraint("Tpaddings") + .HostMemory("input") + .HostMemory("paddings") + .HostMemory("output"), + PadOp); +#endif // TENSORFLOW_USE_SYCL + } // end namespace tensorflow diff --git a/tensorflow/core/kernels/reduction_ops_common.h b/tensorflow/core/kernels/reduction_ops_common.h index 1bc7e14187..0d309c2185 100644 --- a/tensorflow/core/kernels/reduction_ops_common.h +++ b/tensorflow/core/kernels/reduction_ops_common.h @@ -268,6 +268,31 @@ struct ReduceFunctor template struct ReduceFunctor : ReduceFunctorBase{}; + +template +struct ReduceFunctor > { + template + static void Reduce(const SYCLDevice& d, OUT_T out, IN_T in, + const ReductionAxes& reduction_axes, + const Eigen::internal::MeanReducer& reducer) { + typedef typename IN_T::Index Index; + // Eigen sum reductions are much faster on GPU than mean reductions: + // Simply trigger them by computing the sum of the weighted inputs. + Index num_coeffs_to_reduce = 1; + for (int i = 0; i < Eigen::internal::array_size::value; + ++i) { + num_coeffs_to_reduce *= in.dimension(reduction_axes[i]); + } + T scale = T(1.0) / num_coeffs_to_reduce; + out.device(d) = (in * scale).sum(reduction_axes); + } + + template + static void FillIdentity(const SYCLDevice& d, OUT_T out, + const Eigen::internal::MeanReducer& reducer) { + FillIdentityEigenImpl(d, out, reducer); + } +}; #endif // TENSORFLOW_USE_SYCL } // namespace functor diff --git a/tensorflow/core/kernels/reduction_ops_max.cc b/tensorflow/core/kernels/reduction_ops_max.cc index db86157c8e..5ab97d1eee 100644 --- a/tensorflow/core/kernels/reduction_ops_max.cc +++ b/tensorflow/core/kernels/reduction_ops_max.cc @@ -57,4 +57,27 @@ REGISTER_KERNEL_BUILDER( #endif +#ifdef TENSORFLOW_USE_SYCL +#define REGISTER_SYCL_KERNELS(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("Max") \ + .Device(DEVICE_SYCL) \ + .TypeConstraint("T") \ + .TypeConstraint("Tidx") \ + .HostMemory("reduction_indices"), \ + ReductionOp>); +REGISTER_SYCL_KERNELS(float); +#undef REGISTER_SYCL_KERNELS + +REGISTER_KERNEL_BUILDER( + Name("Max") + .Device(DEVICE_SYCL) + .HostMemory("reduction_indices") + .HostMemory("input") + .HostMemory("output") + .TypeConstraint("T") + .TypeConstraint("Tidx"), + ReductionOp>); +#endif // TENSORFLOW_USE_SYCL + } // namespace tensorflow diff --git a/tensorflow/core/kernels/reduction_ops_mean.cc b/tensorflow/core/kernels/reduction_ops_mean.cc index fef3cd0699..e018cb55dd 100644 --- a/tensorflow/core/kernels/reduction_ops_mean.cc +++ b/tensorflow/core/kernels/reduction_ops_mean.cc @@ -44,4 +44,17 @@ REGISTER_GPU_KERNELS(double); #endif +#ifdef TENSORFLOW_USE_SYCL +#define REGISTER_SYCL_KERNELS(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("Mean") \ + .Device(DEVICE_SYCL) \ + .TypeConstraint("T") \ + .TypeConstraint("Tidx") \ + .HostMemory("reduction_indices"), \ + ReductionOp>); +REGISTER_SYCL_KERNELS(float); +#undef REGISTER_SYCL_KERNELS +#endif // TENSORFLOW_USE_SYCL + } // namespace tensorflow diff --git a/tensorflow/core/kernels/reduction_ops_min.cc b/tensorflow/core/kernels/reduction_ops_min.cc index c362bc8867..ec240421b9 100644 --- a/tensorflow/core/kernels/reduction_ops_min.cc +++ b/tensorflow/core/kernels/reduction_ops_min.cc @@ -57,4 +57,27 @@ REGISTER_KERNEL_BUILDER( #endif +#ifdef TENSORFLOW_USE_SYCL +#define REGISTER_SYCL_KERNELS(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("Min") \ + .Device(DEVICE_SYCL) \ + .TypeConstraint("T") \ + .TypeConstraint("Tidx") \ + .HostMemory("reduction_indices"), \ + ReductionOp>); +REGISTER_SYCL_KERNELS(float); +#undef REGISTER_SYCL_KERNELS + +REGISTER_KERNEL_BUILDER( + Name("Min") + .Device(DEVICE_SYCL) + .HostMemory("reduction_indices") + .HostMemory("input") + .HostMemory("output") + .TypeConstraint("T") + .TypeConstraint("Tidx"), + ReductionOp>); +#endif // TENSORFLOW_USE_SYCL + } // namespace tensorflow diff --git a/tensorflow/core/kernels/reduction_ops_prod.cc b/tensorflow/core/kernels/reduction_ops_prod.cc index c6aff8c2ed..e04c655dab 100644 --- a/tensorflow/core/kernels/reduction_ops_prod.cc +++ b/tensorflow/core/kernels/reduction_ops_prod.cc @@ -45,4 +45,28 @@ REGISTER_GPU_KERNELS(double); #endif +#ifdef TENSORFLOW_USE_SYCL +#define REGISTER_SYCL_KERNELS(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("Prod") \ + .Device(DEVICE_SYCL) \ + .TypeConstraint("T") \ + .TypeConstraint("Tidx") \ + .HostMemory("reduction_indices"), \ + ReductionOp>); +REGISTER_SYCL_KERNELS(float); +REGISTER_SYCL_KERNELS(double); +#undef REGISTER_SYCL_KERNELS + +REGISTER_KERNEL_BUILDER( + Name("Prod") + .Device(DEVICE_SYCL) + .TypeConstraint("T") + .TypeConstraint("Tidx") + .HostMemory("input") + .HostMemory("output") + .HostMemory("reduction_indices"), + ReductionOp>); +#endif // TENSORFLOW_USE_SYCL + } // namespace tensorflow diff --git a/tensorflow/core/kernels/reduction_ops_sum.cc b/tensorflow/core/kernels/reduction_ops_sum.cc index 3aa38f418e..938ca66a0c 100644 --- a/tensorflow/core/kernels/reduction_ops_sum.cc +++ b/tensorflow/core/kernels/reduction_ops_sum.cc @@ -74,7 +74,6 @@ REGISTER_KERNEL_BUILDER( .HostMemory("reduction_indices"), \ ReductionOp>); REGISTER_SYCL_KERNELS(float); -REGISTER_SYCL_KERNELS(double); #undef REGISTER_SYCL_KERNELS // A special GPU kernel for int32. diff --git a/tensorflow/core/kernels/relu_op.cc b/tensorflow/core/kernels/relu_op.cc index f24a71ec8c..d70398bea5 100644 --- a/tensorflow/core/kernels/relu_op.cc +++ b/tensorflow/core/kernels/relu_op.cc @@ -29,6 +29,9 @@ namespace tensorflow { typedef Eigen::ThreadPoolDevice CPUDevice; typedef Eigen::GpuDevice GPUDevice; +#ifdef TENSORFLOW_USE_SYCL +typedef Eigen::SyclDevice SYCLDevice; +#endif // TENSORFLOW_USE_SYCL #define REGISTER_RELU_KERNELS(type) \ REGISTER_KERNEL_BUILDER( \ @@ -131,4 +134,30 @@ TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS); #endif // GOOGLE_CUDA +#ifdef TENSORFLOW_USE_SYCL +// Registration of the GPU implementations. +#define REGISTER_SYCL_KERNELS(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("Relu").Device(DEVICE_SYCL).TypeConstraint("T"), \ + ReluOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("ReluGrad").Device(DEVICE_SYCL).TypeConstraint("T"), \ + ReluGradOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("Relu6").Device(DEVICE_SYCL).TypeConstraint("T"), \ + Relu6Op); \ + REGISTER_KERNEL_BUILDER( \ + Name("Relu6Grad").Device(DEVICE_SYCL).TypeConstraint("T"), \ + Relu6GradOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("Elu").Device(DEVICE_SYCL).TypeConstraint("T"), \ + EluOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("EluGrad").Device(DEVICE_SYCL).TypeConstraint("T"), \ + EluGradOp) + +REGISTER_SYCL_KERNELS(float); +#undef REGISTER_SYCL_KERNELS +#endif // TENSORFLOW_USE_SYCL + } // namespace tensorflow diff --git a/tensorflow/core/kernels/relu_op.h b/tensorflow/core/kernels/relu_op.h index 365c6201a5..e2e0bd48dd 100644 --- a/tensorflow/core/kernels/relu_op.h +++ b/tensorflow/core/kernels/relu_op.h @@ -175,6 +175,10 @@ void EluGradOp::OperateNoTemplate(OpKernelContext* context, } // namespace tensorflow +#ifdef TENSORFLOW_USE_SYCL +#undef EIGEN_USE_SYCL +#endif // TENSORFLOW_USE_SYCL + #undef EIGEN_USE_THREADS #endif // TENSORFLOW_KERNELS_RELU_OP_H_ diff --git a/tensorflow/core/kernels/resize_bicubic_op.cc b/tensorflow/core/kernels/resize_bicubic_op.cc index 5df36ef4cd..c5c805c44f 100644 --- a/tensorflow/core/kernels/resize_bicubic_op.cc +++ b/tensorflow/core/kernels/resize_bicubic_op.cc @@ -130,7 +130,7 @@ class CachedInterpolation { } // We use 2 hands and walk through, copying from one to another where // we already have values. - // Invarient, new_indicies_hand <= cached_values_hand + // Invariant, new_indicies_hand <= cached_values_hand const std::array new_x_indices{{x_0, x_1, x_2, x_3}}; int cached_values_hand = 0; int new_indicies_hand = 0; diff --git a/tensorflow/core/kernels/reverse_op.cc b/tensorflow/core/kernels/reverse_op.cc index ec3b27ed6f..24b3ba31b8 100644 --- a/tensorflow/core/kernels/reverse_op.cc +++ b/tensorflow/core/kernels/reverse_op.cc @@ -33,6 +33,9 @@ namespace tensorflow { typedef Eigen::ThreadPoolDevice CPUDevice; typedef Eigen::GpuDevice GPUDevice; +#ifdef TENSORFLOW_USE_SYCL +typedef Eigen::SyclDevice SYCLDevice; +#endif // TENSORFLOW_USE_SYCL namespace { @@ -344,4 +347,36 @@ REGISTER_KERNEL_BUILDER(Name("ReverseV2") ReverseV2Op); #endif // GOOGLE_CUDA +#ifdef TENSORFLOW_USE_SYCL +#define REGISTER_SYCL_KERNELS(T) \ + REGISTER_KERNEL_BUILDER(Name("Reverse") \ + .Device(DEVICE_SYCL) \ + .TypeConstraint("T") \ + .HostMemory("dims"), \ + ReverseOp) \ + REGISTER_KERNEL_BUILDER(Name("ReverseV2") \ + .Device(DEVICE_SYCL) \ + .TypeConstraint("T") \ + .TypeConstraint("Tidx") \ + .HostMemory("axis"), \ + ReverseV2Op) +TF_CALL_float(REGISTER_SYCL_KERNELS); + +REGISTER_KERNEL_BUILDER(Name("Reverse") + .Device(DEVICE_SYCL) + .TypeConstraint("T") + .HostMemory("tensor") + .HostMemory("dims") + .HostMemory("output"), + ReverseOp); +REGISTER_KERNEL_BUILDER(Name("ReverseV2") + .Device(DEVICE_SYCL) + .TypeConstraint("T") + .TypeConstraint("Tidx") + .HostMemory("tensor") + .HostMemory("axis") + .HostMemory("output"), + ReverseV2Op); +#endif // TENSORFLOW_USE_SYCL + } // namespace tensorflow diff --git a/tensorflow/core/kernels/scatter_op.cc b/tensorflow/core/kernels/scatter_op.cc index 827eb7dbca..51dad49cfe 100644 --- a/tensorflow/core/kernels/scatter_op.cc +++ b/tensorflow/core/kernels/scatter_op.cc @@ -180,8 +180,8 @@ TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_UPDATE_GPU); #define REGISTER_SCATTER_UPDATE_SYCL(type) REGISTER_SCATTER_UPDATE(type, SYCL); -REGISTER_SCATTER_ARITHEMTIC_SYCL(float); -REGISTER_SCATTER_UPDATE_SYCL(float); +TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_ARITHEMTIC_SYCL); +TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_UPDATE_SYCL); #undef REGISTER_SCATTER_ARITHEMTIC_SYCL #undef REGISTER_SCATTER_UPDATE_SYCL diff --git a/tensorflow/core/kernels/sequence_ops.cc b/tensorflow/core/kernels/sequence_ops.cc index c24ecdf8b9..c8ea923020 100644 --- a/tensorflow/core/kernels/sequence_ops.cc +++ b/tensorflow/core/kernels/sequence_ops.cc @@ -92,9 +92,11 @@ class RangeOp : public OpKernel { #ifdef TENSORFLOW_USE_SYCL #define REGISTER_SYCL_KERNEL(T) REGISTER_KERNEL(DEVICE_SYCL, T) TF_CALL_float(REGISTER_SYCL_KERNEL); +TF_CALL_double(REGISTER_SYCL_KERNEL); TF_CALL_int32(REGISTER_SYCL_KERNEL); TF_CALL_int64(REGISTER_SYCL_KERNEL); -#endif // TENSORFLOW_USE_SYCL +#undef REGISTER_SYCL_KERNEL +#endif // TENSORFLOW_USE_SYCL TF_CALL_float(REGISTER_CPU_KERNEL); TF_CALL_double(REGISTER_CPU_KERNEL); @@ -170,4 +172,9 @@ TF_CALL_double(REGISTER_CPU_KERNEL); TF_CALL_float(REGISTER_GPU_KERNEL); TF_CALL_double(REGISTER_GPU_KERNEL); +#ifdef TENSORFLOW_USE_SYCL +#define REGISTER_SYCL_KERNEL(T) REGISTER_KERNEL(DEVICE_SYCL, T) +TF_CALL_float(REGISTER_SYCL_KERNEL); +TF_CALL_double(REGISTER_SYCL_KERNEL); +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/shape_ops.cc b/tensorflow/core/kernels/shape_ops.cc index 6bc0b4560b..177a32464b 100644 --- a/tensorflow/core/kernels/shape_ops.cc +++ b/tensorflow/core/kernels/shape_ops.cc @@ -201,6 +201,7 @@ REGISTER_KERNEL_BUILDER(Name("Rank").Device(DEVICE_CPU).HostMemory("output"), .HostMemory("output"), \ RankOp); REGISTER_SYCL_KERNEL(float); +REGISTER_SYCL_KERNEL(double); #undef REGISTER_SYCL_KERNEL // A special GPU kernel for int32 and bool. @@ -297,6 +298,43 @@ REGISTER_KERNEL_BUILDER(Name("Size") SizeOp); #endif +#ifdef TENSORFLOW_USE_SYCL +#define REGISTER_SYCL_KERNEL(type) \ + REGISTER_KERNEL_BUILDER(Name("Size") \ + .Device(DEVICE_SYCL) \ + .TypeConstraint("T") \ + .TypeConstraint("out_type") \ + .HostMemory("output"), \ + SizeOp); \ + REGISTER_KERNEL_BUILDER(Name("Size") \ + .Device(DEVICE_SYCL) \ + .TypeConstraint("T") \ + .TypeConstraint("out_type") \ + .HostMemory("output"), \ + SizeOp); +REGISTER_SYCL_KERNEL(float); +REGISTER_SYCL_KERNEL(double); +#undef REGISTER_SYCL_KERNEL + +// A special GPU kernel for int32. +// TODO(b/25387198): Also enable int32 in device memory. This kernel +// registration requires all int32 inputs and outputs to be in host memory. +REGISTER_KERNEL_BUILDER(Name("Size") + .Device(DEVICE_SYCL) + .TypeConstraint("T") + .TypeConstraint("out_type") + .HostMemory("input") + .HostMemory("output"), + SizeOp); +REGISTER_KERNEL_BUILDER(Name("Size") + .Device(DEVICE_SYCL) + .TypeConstraint("T") + .TypeConstraint("out_type") + .HostMemory("input") + .HostMemory("output"), + SizeOp); +#endif // TENSORFLOW_USE_SYCL + // ExpandDims ------------------------------------ REGISTER_KERNEL_BUILDER(Name("ExpandDims") .Device(DEVICE_CPU) @@ -323,7 +361,30 @@ REGISTER_KERNEL_BUILDER(Name("ExpandDims") .HostMemory("dim") .HostMemory("output"), ExpandDimsOp); -#endif +#endif // GOOGLE_CUDA + +#ifdef TENSORFLOW_USE_SYCL +#define REGISTER_SYCL_KERNEL(type) \ + REGISTER_KERNEL_BUILDER(Name("ExpandDims") \ + .Device(DEVICE_SYCL) \ + .TypeConstraint("T") \ + .TypeConstraint("Tdim") \ + .HostMemory("dim"), \ + ExpandDimsOp); +REGISTER_SYCL_KERNEL(float) +REGISTER_SYCL_KERNEL(double) + +#undef REGISTER_SYCL_KERNEL + +REGISTER_KERNEL_BUILDER(Name("ExpandDims") + .Device(DEVICE_SYCL) + .TypeConstraint("T") + .TypeConstraint("Tdim") + .HostMemory("input") + .HostMemory("dim") + .HostMemory("output"), + ExpandDimsOp); +#endif // TENSORFLOW_USE_SYCL // Squeeze --------------------------------------- REGISTER_KERNEL_BUILDER(Name("Squeeze").Device(DEVICE_CPU), SqueezeOp); @@ -347,4 +408,24 @@ REGISTER_KERNEL_BUILDER(Name("Squeeze") SqueezeOp); #endif +#if TENSORFLOW_USE_SYCL +#define REGISTER_SYCL_KERNEL(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("Squeeze").Device(DEVICE_SYCL).TypeConstraint("T"),\ + SqueezeOp); +REGISTER_SYCL_KERNEL(float); +REGISTER_SYCL_KERNEL(double); +#undef REGISTER_SYCL_KERNEL + +// A special GPU kernel for int32. +// TODO(b/25387198): Also enable int32 in device memory. This kernel +// registration requires all int32 inputs and outputs to be in host memory. +REGISTER_KERNEL_BUILDER(Name("Squeeze") + .Device(DEVICE_SYCL) + .TypeConstraint("T") + .HostMemory("input") + .HostMemory("output"), + SqueezeOp); +#endif // TENSORFLOW_USE_SYCL + } // namespace tensorflow diff --git a/tensorflow/core/kernels/softmax_op.cc b/tensorflow/core/kernels/softmax_op.cc index c7ae93852f..de11de32f1 100644 --- a/tensorflow/core/kernels/softmax_op.cc +++ b/tensorflow/core/kernels/softmax_op.cc @@ -28,17 +28,27 @@ namespace tensorflow { typedef Eigen::ThreadPoolDevice CPUDevice; typedef Eigen::GpuDevice GPUDevice; +#ifdef TENSORFLOW_USE_SYCL +typedef Eigen::SyclDevice SYCLDevice; +#endif // TENSORFLOW_USE_SYCL // Partial specialization for a CPUDevice, that uses the Eigen implementation // from SoftmaxEigenImpl. namespace functor { -template -struct SoftmaxFunctor { - void operator()(const CPUDevice& d, typename TTypes::ConstMatrix logits, +template +struct SoftmaxFunctorBase { + void operator()(const Device& d, typename TTypes::ConstMatrix logits, typename TTypes::Matrix softmax, const bool log) { - SoftmaxEigenImpl::Compute(d, logits, softmax, log); + SoftmaxEigenImpl::Compute(d, logits, softmax, log); } }; +template +struct SoftmaxFunctor : SoftmaxFunctorBase {}; + +#ifdef TENSORFLOW_USE_SYCL +template +struct SoftmaxFunctor : SoftmaxFunctorBase {}; +#endif // TENSORFLOW_USE_SYCL } // namespace functor #define REGISTER_CPU(T) \ @@ -76,4 +86,10 @@ REGISTER_KERNEL_BUILDER( SoftmaxOp); #endif // GOOGLE_CUDA +#ifdef TENSORFLOW_USE_SYCL +REGISTER_KERNEL_BUILDER( + Name("Softmax").Device(DEVICE_SYCL).TypeConstraint("T"), + SoftmaxOp); +#endif // TENSORFLOW_USE_SYCL + } // namespace tensorflow diff --git a/tensorflow/core/kernels/stage_op.cc b/tensorflow/core/kernels/stage_op.cc index c18b992ea1..161ba89212 100644 --- a/tensorflow/core/kernels/stage_op.cc +++ b/tensorflow/core/kernels/stage_op.cc @@ -99,6 +99,9 @@ REGISTER_KERNEL_BUILDER(Name("Stage").Device(DEVICE_CPU), StageOp); #if GOOGLE_CUDA REGISTER_KERNEL_BUILDER(Name("Stage").Device(DEVICE_GPU), StageOp); #endif +#ifdef TENSORFLOW_USE_SYCL +REGISTER_KERNEL_BUILDER(Name("Stage").Device(DEVICE_SYCL), StageOp); +#endif // TENSORFLOW_USE_SYCL class UnstageOp : public OpKernel { public: @@ -126,5 +129,8 @@ REGISTER_KERNEL_BUILDER(Name("Unstage").Device(DEVICE_CPU), UnstageOp); #if GOOGLE_CUDA REGISTER_KERNEL_BUILDER(Name("Unstage").Device(DEVICE_GPU), UnstageOp); #endif +#ifdef TENSORFLOW_USE_SYCL +REGISTER_KERNEL_BUILDER(Name("Unstage").Device(DEVICE_SYCL), UnstageOp); +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/strided_slice_op.cc b/tensorflow/core/kernels/strided_slice_op.cc index 6580d271d1..10593516f7 100644 --- a/tensorflow/core/kernels/strided_slice_op.cc +++ b/tensorflow/core/kernels/strided_slice_op.cc @@ -451,4 +451,71 @@ REGISTER_KERNEL_BUILDER(Name("StridedSliceAssign") #undef REGISTER_GPU #endif // GOOGLE_CUDA + +#ifdef TENSORFLOW_USE_SYCL +#define REGISTER_SYCL(type) \ + REGISTER_KERNEL_BUILDER(Name("StridedSlice") \ + .Device(DEVICE_SYCL) \ + .TypeConstraint("T") \ + .HostMemory("begin") \ + .HostMemory("end") \ + .HostMemory("strides") \ + .TypeConstraint("Index"), \ + StridedSliceOp) \ + REGISTER_KERNEL_BUILDER(Name("StridedSliceGrad") \ + .Device(DEVICE_SYCL) \ + .TypeConstraint("T") \ + .HostMemory("shape") \ + .HostMemory("begin") \ + .HostMemory("end") \ + .HostMemory("strides") \ + .TypeConstraint("Index"), \ + StridedSliceGradOp)\ + REGISTER_KERNEL_BUILDER(Name("StridedSliceAssign") \ + .Device(DEVICE_SYCL) \ + .TypeConstraint("T") \ + .HostMemory("begin") \ + .HostMemory("end") \ + .HostMemory("strides") \ + .TypeConstraint("Index"), \ + StridedSliceAssignOp) + +REGISTER_SYCL(float); +REGISTER_SYCL(double); + +// A special GPU kernel for int32. +// TODO(b/25387198): Also enable int32 in device memory. This kernel +// registration requires all int32 inputs and outputs to be in host memory. +REGISTER_KERNEL_BUILDER(Name("StridedSlice") + .Device(DEVICE_SYCL) + .TypeConstraint("T") + .TypeConstraint("Index") + .HostMemory("input") + .HostMemory("begin") + .HostMemory("end") + .HostMemory("strides") + .HostMemory("output"), + StridedSliceOp); +REGISTER_KERNEL_BUILDER(Name("StridedSliceGrad") + .Device(DEVICE_SYCL) + .TypeConstraint("T") + .TypeConstraint("Index") + .HostMemory("shape") + .HostMemory("begin") + .HostMemory("end") + .HostMemory("strides") + .HostMemory("dy") + .HostMemory("output"), + StridedSliceGradOp); +REGISTER_KERNEL_BUILDER(Name("StridedSliceAssign") + .Device(DEVICE_SYCL) + .TypeConstraint("T") + .TypeConstraint("Index") + .HostMemory("ref") + .HostMemory("begin") + .HostMemory("end") + .HostMemory("strides"), + StridedSliceAssignOp) +#undef REGISTER_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/strided_slice_op_impl.h b/tensorflow/core/kernels/strided_slice_op_impl.h index 6341964b4b..d820db348e 100644 --- a/tensorflow/core/kernels/strided_slice_op_impl.h +++ b/tensorflow/core/kernels/strided_slice_op_impl.h @@ -287,6 +287,20 @@ DECLARE_FOR_N_GPU(int32); TF_CALL_ALL_TYPES(DECLARE_FOR_N_CPU); DECLARE_FOR_N_CPU(bfloat16); +#ifdef TENSORFLOW_USE_SYCL +#define PREVENT_FOR_N_SYCL(T) \ + PREVENT_INSTANTIATE(T, STRIDED_SLICE_INSTANTIATE_DIM) + +#define DECLARE_FOR_N_SYCL(T) \ + INSTANTIATE(SYCLDevice, T, STRIDED_SLICE_INSTANTIATE_DIM) + +TF_CALL_SYCL_PROXY_TYPES(PREVENT_FOR_N_SYCL); +TF_CALL_GPU_NUMBER_TYPES(DECLARE_FOR_N_SYCL); +DECLARE_FOR_N_SYCL(int32); + +#undef DECLARE_FOR_N_SYCL +#endif // TENSORFLOW_USE_SYCL + #undef INSTANTIATE #undef DECLARE_FOR_N_CPU #undef DECLARE_FOR_N_GPU diff --git a/tensorflow/core/kernels/tile_ops.cc b/tensorflow/core/kernels/tile_ops.cc index e55c8679e9..9822b021eb 100644 --- a/tensorflow/core/kernels/tile_ops.cc +++ b/tensorflow/core/kernels/tile_ops.cc @@ -260,6 +260,8 @@ TF_CALL_complex128(HANDLE_TYPE_NAME_GPU); #ifdef TENSORFLOW_USE_SYCL TF_CALL_float(HANDLE_TYPE_NAME_SYCL); +TF_CALL_double(HANDLE_TYPE_NAME_SYCL); +TF_CALL_int32(HANDLE_TYPE_NAME_SYCL); #endif // TENSORFLOW_USE_SYCL #undef HANDLE_TYPE_NAME_CPU @@ -506,6 +508,16 @@ TF_CALL_complex64(HANDLE_TYPE_NAME_GPU); TF_CALL_complex128(HANDLE_TYPE_NAME_GPU); #endif // GOOGLE_CUDA +#if TENSORFLOW_USE_SYCL +#define HANDLE_TYPE_NAME_SYCL(T) \ + HANDLE_CASE_DIM(SYCLDevice, T, DataTypeToEnum::value); + +TF_CALL_float(HANDLE_TYPE_NAME_SYCL); +TF_CALL_double(HANDLE_TYPE_NAME_SYCL); +TF_CALL_int32(HANDLE_TYPE_NAME_SYCL); +#undef HANDLE_TYPE_NAME_SYCL +#endif // TENSORFLOW_USE_SYCL + #undef HANDLE_TYPE_NAME_CPU #undef HANDLE_TYPE_NAME_GPU #undef HANDLE_CASE_DIM @@ -605,6 +617,25 @@ REGISTER_KERNEL_BUILDER(Name("Tile") .TypeConstraint("Tmultiples") .HostMemory("multiples"), TileOp); +REGISTER_KERNEL_BUILDER(Name("Tile") + .Device(DEVICE_SYCL) + .TypeConstraint("T") + .TypeConstraint("Tmultiples") + .HostMemory("multiples"), + TileOp); + +REGISTER_KERNEL_BUILDER(Name("TileGrad") + .Device(DEVICE_SYCL) + .TypeConstraint("T") + .TypeConstraint("Tmultiples") + .HostMemory("multiples"), + TileGradientOp); +REGISTER_KERNEL_BUILDER(Name("TileGrad") + .Device(DEVICE_SYCL) + .TypeConstraint("T") + .TypeConstraint("Tmultiples") + .HostMemory("multiples"), + TileGradientOp); #endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/tile_ops_cpu_impl.h b/tensorflow/core/kernels/tile_ops_cpu_impl.h index 650c739ed5..f06cc5514c 100644 --- a/tensorflow/core/kernels/tile_ops_cpu_impl.h +++ b/tensorflow/core/kernels/tile_ops_cpu_impl.h @@ -70,6 +70,8 @@ typedef Eigen::SyclDevice SYCLDevice; #define DEFINE_TYPE(T) DEFINE_DIM(T, CPU_PROVIDED_IXDIM) TF_CALL_float(DEFINE_TYPE); +TF_CALL_double(DEFINE_TYPE); +TF_CALL_int32(DEFINE_TYPE); #undef DEFINE_DIM #undef DEFINE_TYPE @@ -81,6 +83,8 @@ TF_CALL_float(DEFINE_TYPE); #define DEFINE_TYPE(T) DEFINE_DIM(T, CPU_PROVIDED_IXDIM) TF_CALL_float(DEFINE_TYPE); +TF_CALL_double(DEFINE_TYPE); +TF_CALL_int32(DEFINE_TYPE); #undef DEFINE_DIM #undef DEFINE_TYPE diff --git a/tensorflow/core/kernels/training_ops.cc b/tensorflow/core/kernels/training_ops.cc index 336c6b0ccc..5c2d371430 100644 --- a/tensorflow/core/kernels/training_ops.cc +++ b/tensorflow/core/kernels/training_ops.cc @@ -423,6 +423,7 @@ TF_CALL_double(REGISTER_CPU_KERNELS); #ifdef TENSORFLOW_USE_SYCL #define REGISTER_SYCL_KERNELS(T) REGISTER_KERNELS(SYCL, T); TF_CALL_float(REGISTER_SYCL_KERNELS); +TF_CALL_double(REGISTER_SYCL_KERNELS); #undef REGISTER_SYCL_KERNELS #endif @@ -2355,6 +2356,7 @@ TF_CALL_double(REGISTER_CPU_KERNELS); #define REGISTER_SYCL_KERNELS(T) REGISTER_KERNELS(SYCL, T); TF_CALL_float(REGISTER_SYCL_KERNELS); +TF_CALL_double(REGISTER_SYCL_KERNELS); #endif #if GOOGLE_CUDA diff --git a/tensorflow/core/kernels/transpose_functor_cpu.cc b/tensorflow/core/kernels/transpose_functor_cpu.cc index 30b82f1843..3681b9a129 100644 --- a/tensorflow/core/kernels/transpose_functor_cpu.cc +++ b/tensorflow/core/kernels/transpose_functor_cpu.cc @@ -127,6 +127,7 @@ Status DoTranspose(const SYCLDevice& d, const Tensor& in, switch (in.dtype()) { case DT_FLOAT: + case DT_DOUBLE: case DT_INT32: internal::Transpose(d, in, perm, out); break; diff --git a/tensorflow/core/kernels/transpose_op.cc b/tensorflow/core/kernels/transpose_op.cc index 67300c1e96..4d303f0173 100644 --- a/tensorflow/core/kernels/transpose_op.cc +++ b/tensorflow/core/kernels/transpose_op.cc @@ -82,6 +82,15 @@ REGISTER_KERNEL_BUILDER(Name("InvertPermutation") .HostMemory("y"), InvertPermutationOp); +#ifdef TENSORFLOW_USE_SYCL +REGISTER_KERNEL_BUILDER(Name("InvertPermutation") + .Device(DEVICE_SYCL) + .TypeConstraint("T") + .HostMemory("x") + .HostMemory("y"), + InvertPermutationOp); +#endif // TENSORFLOW_USE_SYCL + // output = TransposeOp(T input, T perm) takes a tensor // of type T and rank N, and a permutation of 0, 1, ..., N-1. It // shuffles the dimensions of the input tensor according to permutation. @@ -201,4 +210,24 @@ TF_CALL_POD_TYPES(REGISTER); #undef REGISTER #endif +#ifdef TENSORFLOW_USE_SYCL +Status TransposeSyclOp::DoTranspose(OpKernelContext* ctx, const Tensor& in, + gtl::ArraySlice perm, Tensor* out) { + typedef Eigen::SyclDevice SYCLDevice; + return ::tensorflow::DoTranspose(ctx->eigen_device(), in, perm, + out); +} +#define REGISTER(T) \ + REGISTER_KERNEL_BUILDER(Name("Transpose") \ + .Device(DEVICE_SYCL) \ + .TypeConstraint("T") \ + .TypeConstraint("Tperm") \ + .HostMemory("perm"), \ + TransposeSyclOp); +REGISTER(float); +REGISTER(bool); +REGISTER(int32); +#undef REGISTER +#endif + } // namespace tensorflow diff --git a/tensorflow/core/kernels/transpose_op.h b/tensorflow/core/kernels/transpose_op.h index 3b209c0ccc..5f40bcecc1 100644 --- a/tensorflow/core/kernels/transpose_op.h +++ b/tensorflow/core/kernels/transpose_op.h @@ -50,6 +50,17 @@ class TransposeGpuOp : public TransposeOp { gtl::ArraySlice perm, Tensor* out) override; }; +#ifdef TENSORFLOW_USE_SYCL +class TransposeSyclOp : public TransposeOp { + public: + explicit TransposeSyclOp(OpKernelConstruction* ctx) : TransposeOp(ctx) {} + + protected: + Status DoTranspose(OpKernelContext* ctx, const Tensor& in, + gtl::ArraySlice perm, Tensor* out) override; +}; +#endif // TENSORFLOW_USE_SYCL + } // namespace tensorflow #endif // TENSORFLOW_KERNELS_TRANSPOSE_OP_H_ diff --git a/tensorflow/core/kernels/unpack_op.cc b/tensorflow/core/kernels/unpack_op.cc index 2a14fa3265..e4c79ae17b 100644 --- a/tensorflow/core/kernels/unpack_op.cc +++ b/tensorflow/core/kernels/unpack_op.cc @@ -160,6 +160,7 @@ REGISTER_KERNEL_BUILDER(Name("Unpack") UnpackOp) REGISTER_SYCL(float); +REGISTER_SYCL(double); #undef REGISTER_SYCL // A special SYCL kernel for int32. diff --git a/tensorflow/core/kernels/variable_ops.cc b/tensorflow/core/kernels/variable_ops.cc index 34e227156d..7a4d9dc650 100644 --- a/tensorflow/core/kernels/variable_ops.cc +++ b/tensorflow/core/kernels/variable_ops.cc @@ -58,8 +58,9 @@ REGISTER_KERNEL_BUILDER(Name("IsVariableInitialized").Device(DEVICE_CPU), IsVariableInitializedOp); REGISTER_SYCL_KERNEL(float); +REGISTER_SYCL_KERNEL(double); #undef REGISTER_SYCL_KERNEL -#endif +#endif // TENSORFLOW_USE_SYCL #if GOOGLE_CUDA // Only register 'Variable' on GPU for the subset of types also supported by diff --git a/tensorflow/core/kernels/xent_op.cc b/tensorflow/core/kernels/xent_op.cc index 26f4fb2a2e..dc21cee3a8 100644 --- a/tensorflow/core/kernels/xent_op.cc +++ b/tensorflow/core/kernels/xent_op.cc @@ -28,6 +28,9 @@ namespace tensorflow { typedef Eigen::ThreadPoolDevice CPUDevice; typedef Eigen::GpuDevice GPUDevice; +#ifdef TENSORFLOW_USE_SYCL +typedef Eigen::SyclDevice SYCLDevice; +#endif // TENSORFLOW_USE_SYCL template class SoftmaxXentWithLogitsOp : public OpKernel { @@ -74,17 +77,25 @@ class SoftmaxXentWithLogitsOp : public OpKernel { // Partial specialization for a CPUDevice, that uses the Eigen implementation // from XentEigenImpl. namespace functor { -template -struct XentFunctor { - void operator()(const CPUDevice& d, typename TTypes::ConstMatrix logits, +template +struct XentFunctorBase { + void operator()(const Device& d, typename TTypes::ConstMatrix logits, typename TTypes::ConstMatrix labels, typename TTypes::Matrix scratch, typename TTypes::Vec loss, typename TTypes::Matrix backprop) { - XentEigenImpl::Compute(d, logits, labels, scratch, loss, + XentEigenImpl::Compute(d, logits, labels, scratch, loss, backprop); } }; + +template +struct XentFunctor : XentFunctorBase {}; + +#ifdef TENSORFLOW_USE_SYCL +template +struct XentFunctor : XentFunctorBase {}; +#endif // TENSORFLOW_USE_SYCL } // namespace functor #define REGISTER_CPU(T) \ @@ -111,4 +122,11 @@ REGISTER_KERNEL_BUILDER(Name("SoftmaxCrossEntropyWithLogits") SoftmaxXentWithLogitsOp); #endif // GOOGLE_CUDA +#ifdef TENSORFLOW_USE_SYCL +REGISTER_KERNEL_BUILDER(Name("SoftmaxCrossEntropyWithLogits") + .Device(DEVICE_SYCL) + .TypeConstraint("T"), + SoftmaxXentWithLogitsOp); +#endif // TENSORFLOW_USE_SYCL + } // namespace tensorflow diff --git a/tensorflow/core/ops/math_grad_test.cc b/tensorflow/core/ops/math_grad_test.cc index 2def59ff04..8670ca307c 100644 --- a/tensorflow/core/ops/math_grad_test.cc +++ b/tensorflow/core/ops/math_grad_test.cc @@ -390,7 +390,7 @@ class TestOp : public OpKernel { REGISTER_KERNEL_BUILDER(Name("TestOpWithNoGrad").Device(DEVICE_CPU), TestOp); #ifdef TENSORFLOW_USE_SYCL REGISTER_KERNEL_BUILDER(Name("TestOpWithNoGrad").Device(DEVICE_SYCL), TestOp); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL TEST_F(MathGradTest, Error_Reporting) { auto x = test::AsTensor({-3.f}); @@ -707,6 +707,8 @@ TEST_F(MathGradTest, Pow) { } } +//TODO{lukeiwanski}: Implement Complex Pow for SYCL +#ifndef TENSORFLOW_USE_SYCL TEST_F(MathGradTest, ComplexPow) { auto x = test::AsTensor({0.f, 2.f, -2.f}, TensorShape({3})); auto y = test::AsTensor({2.f, 2.f, 2.f}, TensorShape({3})); @@ -725,6 +727,7 @@ TEST_F(MathGradTest, ComplexPow) { dy, test::AsTensor({h(0.f, 2.f), h(2.f, 2.f), h(-2.f, 2.f)}, TensorShape({3}))); } +#endif // TENSORFLOW_USE_SYCL TEST_F(MathGradTest, Maximum) { auto x = test::AsTensor({-3.f, -2.f, -1.f, 1.f, 2.f, 3.f}, @@ -886,6 +889,8 @@ TEST_F(MathGradTest, MatMul_11) { test::ExpectClose(dy, MatMul(dz, true, x, true)); } +//TODO{lukeiwanski}: Implement BatchMatMul for SYCL +#ifndef TENSORFLOW_USE_SYCL TEST_F(MathGradTest, BatchMatMul_00) { auto x = test::AsTensor({1.f, 2.f, 3.f, 4.f, 5.f, 6.f}, TensorShape({1, 2, 3})); @@ -933,6 +938,7 @@ TEST_F(MathGradTest, BatchMatMul_11) { test::ExpectClose(dx, BatchMatMul(y, true, dz, true)); test::ExpectClose(dy, BatchMatMul(dz, true, x, true)); } +#endif // TENSORFLOW_USE_SYCL TEST_F(MathGradTest, Sum_dim0) { auto x = test::AsTensor({-3.f, -2.f, -1.f, 1.f, 2.f, 3.f}, diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt index c9d648f8b9..9180170e20 100644 --- a/tensorflow/core/ops/ops.pbtxt +++ b/tensorflow/core/ops/ops.pbtxt @@ -25366,6 +25366,59 @@ op { summary: "Computes the sum along segments of a tensor." description: "Read [the section on\nSegmentation](../../api_docs/python/math_ops.md#segmentation) for an explanation\nof segments.\n\nComputes a tensor such that\n`(output[i] = sum_{j...} data[j...]` where the sum is over tuples `j...` such\nthat `segment_ids[j...] == i`. Unlike `SegmentSum`, `segment_ids`\nneed not be sorted and need not cover all values in the full\nrange of valid values.\n\nIf the sum is empty for a given segment ID `i`, `output[i] = 0`.\n\n`num_segments` should equal the number of distinct segment IDs.\n\n
\n\n
" } +op { + name: "UnsortedSegmentSum" + input_arg { + name: "data" + type_attr: "T" + } + input_arg { + name: "segment_ids" + description: "A tensor whose shape is a prefix of `data.shape`." + type_attr: "Tindices" + } + input_arg { + name: "num_segments" + type: DT_INT32 + } + output_arg { + name: "output" + description: "Has same shape as data, except for the first `segment_ids.rank`\ndimensions, which are replaced with a single dimension which has size\n`num_segments`." + type_attr: "T" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT64 + type: DT_INT32 + type: DT_UINT8 + type: DT_UINT16 + type: DT_INT16 + type: DT_INT8 + type: DT_QINT8 + type: DT_QUINT8 + type: DT_QINT32 + type: DT_HALF + } + } + } + attr { + name: "Tindices" + type: "type" + allowed_values { + list { + type: DT_INT32 + type: DT_INT64 + } + } + } + summary: "Computes the max along segments of a tensor." + description: "Read [the section on\nSegmentation](../../api_docs/python/math_ops.md#segmentation) for an explanation\nof segments.\n\nComputes a tensor such that\n\\\\(output_i = \\sum_j data_j\\\\) where sum is over `j` such\nthat `segment_ids[j] == i`. Unlike `SegmentSum`, `segment_ids`\nneed not be sorted and need not cover all values in the full\n range of valid values.\n\nIf the sum is empty for a given segment ID `i`, `output[i] = 0`.\n\n`num_segments` should equal the number of distinct segment IDs.\n\n
\n\n
" +} op { name: "Unstage" output_arg { diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h index cd0a9436d9..81d49684a8 100644 --- a/tensorflow/core/public/version.h +++ b/tensorflow/core/public/version.h @@ -20,7 +20,7 @@ limitations under the License. #define TF_MAJOR_VERSION 1 #define TF_MINOR_VERSION 0 -#define TF_PATCH_VERSION 0-rc2 +#define TF_PATCH_VERSION 0 // TF_VERSION_SUFFIX is non-empty for pre-releases (e.g. "-alpha", "-alpha.1", // "-beta", "-rc", "-rc.1") diff --git a/tensorflow/docs_src/programmers_guide/meta_graph.md b/tensorflow/docs_src/programmers_guide/meta_graph.md index add4379d7d..fa4cee8700 100644 --- a/tensorflow/docs_src/programmers_guide/meta_graph.md +++ b/tensorflow/docs_src/programmers_guide/meta_graph.md @@ -277,7 +277,7 @@ Here are some of the typical usage models: * Retrieve Hyper Parameters ```Python - filename = ".".join([tf.latest_checkpoint(train_dir), "meta"]) + filename = ".".join([tf.train.latest_checkpoint(train_dir), "meta"]) tf.train.import_meta_graph(filename) hparams = tf.get_collection("hparams") ``` diff --git a/tensorflow/examples/image_retraining/retrain.py b/tensorflow/examples/image_retraining/retrain.py index e612eb7424..575dea5584 100644 --- a/tensorflow/examples/image_retraining/retrain.py +++ b/tensorflow/examples/image_retraining/retrain.py @@ -893,7 +893,8 @@ def main(_): print('=== MISCLASSIFIED TEST IMAGES ===') for i, test_filename in enumerate(test_filenames): if predictions[i] != test_ground_truth[i].argmax(): - print('%70s %s' % (test_filename, image_lists.keys()[predictions[i]])) + print('%70s %s' % (test_filename, + list(image_lists.keys())[predictions[i]])) # Write out the trained graph and labels with the weights stored as constants. output_graph_def = graph_util.convert_variables_to_constants( diff --git a/tensorflow/examples/learn/mnist.py b/tensorflow/examples/learn/mnist.py index 6e5fe7891b..15cf4b91dd 100644 --- a/tensorflow/examples/learn/mnist.py +++ b/tensorflow/examples/learn/mnist.py @@ -46,13 +46,13 @@ def conv_model(feature, target, mode): # First conv layer will compute 32 features for each 5x5 patch with tf.variable_scope('conv_layer1'): - h_conv1 = layers.convolution( + h_conv1 = layers.convolution2d( feature, 32, kernel_size=[5, 5], activation_fn=tf.nn.relu) h_pool1 = max_pool_2x2(h_conv1) # Second conv layer will compute 64 features for each 5x5 patch. with tf.variable_scope('conv_layer2'): - h_conv2 = layers.convolution( + h_conv2 = layers.convolution2d( h_pool1, 64, kernel_size=[5, 5], activation_fn=tf.nn.relu) h_pool2 = max_pool_2x2(h_conv2) # reshape tensor into a batch of vectors diff --git a/tensorflow/examples/learn/text_classification.py b/tensorflow/examples/learn/text_classification.py index a3a5f9e3e9..c3d00a11b9 100644 --- a/tensorflow/examples/learn/text_classification.py +++ b/tensorflow/examples/learn/text_classification.py @@ -104,8 +104,13 @@ def main(unused_argv): # Process vocabulary vocab_processor = learn.preprocessing.VocabularyProcessor(MAX_DOCUMENT_LENGTH) - x_train = np.array(list(vocab_processor.fit_transform(x_train))) - x_test = np.array(list(vocab_processor.transform(x_test))) + + x_transform_train = vocab_processor.fit_transform(x_train) + x_transform_test = vocab_processor.transform(x_test) + + x_train = np.array(list(x_transform_train)) + x_test = np.array(list(x_transform_test)) + n_words = len(vocab_processor.vocabulary_) print('Total words: %d' % n_words) diff --git a/tensorflow/examples/tutorials/word2vec/word2vec_basic.py b/tensorflow/examples/tutorials/word2vec/word2vec_basic.py index 8dcd3bf37a..25800c109e 100644 --- a/tensorflow/examples/tutorials/word2vec/word2vec_basic.py +++ b/tensorflow/examples/tutorials/word2vec/word2vec_basic.py @@ -181,7 +181,7 @@ with graph.as_default(): valid_embeddings, normalized_embeddings, transpose_b=True) # Add variable initializer. - init = tf.global_variables_initializer() + init = tf.initialize_all_variables() # Step 5: Begin training. num_steps = 100001 diff --git a/tensorflow/examples/udacity/README.md b/tensorflow/examples/udacity/README.md index 143a75a3e9..6faad294c2 100644 --- a/tensorflow/examples/udacity/README.md +++ b/tensorflow/examples/udacity/README.md @@ -6,7 +6,7 @@ Course information can be found at https://www.udacity.com/course/deep-learning- Running the Docker container from the Google Cloud repository ------------------------------------------------------------- - docker run -p 8888:8888 --name tensorflow-udacity -it gcr.io/tensorflow/udacity-assignments:0.6.0 + docker run -p 8888:8888 --name tensorflow-udacity -it gcr.io/tensorflow/udacity-assignments:1.0.0 Note that if you ever exit the container, you can return to it using: @@ -94,10 +94,10 @@ This will allow you to save work and have access to generated files on the host Pushing a Google Cloud release ------------------------------ - V=0.6.0 + V=1.0.0 docker tag $USER/assignments gcr.io/tensorflow/udacity-assignments:$V gcloud docker push gcr.io/tensorflow/udacity-assignments - docker tag -f $USER/assignments gcr.io/tensorflow/udacity-assignments:latest + docker tag $USER/assignments gcr.io/tensorflow/udacity-assignments:latest gcloud docker push gcr.io/tensorflow/udacity-assignments History @@ -109,3 +109,4 @@ History * 0.4.0: Move notMMNIST data for Google Cloud. * 0.5.0: Actually use 0.7.1 release. * 0.6.0: Update to TF 0.10.0, add libjpeg (for Pillow). +* 1.0.0: Update to TF 1.0.0 release. diff --git a/tensorflow/python/client/device_lib_test.py b/tensorflow/python/client/device_lib_test.py index 561ce09099..7bba10efac 100644 --- a/tensorflow/python/client/device_lib_test.py +++ b/tensorflow/python/client/device_lib_test.py @@ -34,7 +34,7 @@ class DeviceLibTest(test_util.TensorFlowTestCase): # GPU test if test.is_gpu_available(): self.assertGreater(len(devices), 1) - self.assertTrue("GPU" in [d.device_type for d in devices]) + self.assertTrue("GPU" in [d.device_type for d in devices] or "SYCL" in [d.device_type for d in devices]) if __name__ == "__main__": diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py index f2fd687adf..3ea7e547ee 100644 --- a/tensorflow/python/framework/test_util.py +++ b/tensorflow/python/framework/test_util.py @@ -44,7 +44,14 @@ from tensorflow.python.platform import googletest from tensorflow.python.platform import tf_logging as logging from tensorflow.python.util import compat from tensorflow.python.util.protobuf import compare +from tensorflow.python.client import device_lib +def gpu_device_name(): + """Returns the name of a GPU device if available or the empty string.""" + for x in device_lib.list_local_devices(): + if x.device_type == 'GPU' or x.device_type == 'SYCL': + return x.name + return '' def assert_ops_in_graph(expected_ops, graph): """Assert all expected operations are found. @@ -301,7 +308,12 @@ class TensorFlowTestCase(googletest.TestCase): sess = self._cached_session with sess.graph.as_default(), sess.as_default(): if force_gpu: - with sess.graph.device("/gpu:0"): + # Use the name of an actual device if one is detected, or '/gpu:0' + # otherwise + gpu_name = gpu_device_name() + if len(gpu_name) == 0: + gpu_name = '/gpu:0' + with sess.graph.device(gpu_name): yield sess elif use_gpu: yield sess @@ -311,7 +323,12 @@ class TensorFlowTestCase(googletest.TestCase): else: with session.Session(graph=graph, config=prepare_config(config)) as sess: if force_gpu: - with sess.graph.device("/gpu:0"): + # Use the name of an actual device if one is detected, or '/gpu:0' + # otherwise + gpu_name = gpu_device_name() + if len(gpu_name) == 0: + gpu_name = '/gpu:0' + with sess.graph.device(gpu_name): yield sess elif use_gpu: yield sess diff --git a/tensorflow/python/kernel_tests/stage_op_test.py b/tensorflow/python/kernel_tests/stage_op_test.py index ae26dcb5a9..81eee48d2e 100644 --- a/tensorflow/python/kernel_tests/stage_op_test.py +++ b/tensorflow/python/kernel_tests/stage_op_test.py @@ -46,7 +46,7 @@ class StageTest(test.TestCase): with ops.device('/cpu:0'): x = array_ops.placeholder(dtypes.float32) v = 2. * (array_ops.zeros([128, 128]) + x) - with ops.device('/gpu:0'): + with ops.device(test.gpu_device_name()): stager = data_flow_ops.StagingArea([dtypes.float32, dtypes.float32]) stage = stager.put([x, v]) z, y = stager.get() @@ -62,7 +62,7 @@ class StageTest(test.TestCase): with ops.device('/cpu:0'): x = array_ops.placeholder(dtypes.float32) v = 2. * (array_ops.zeros([128, 128]) + x) - with ops.device('/gpu:0'): + with ops.device(test.gpu_device_name()): stager = data_flow_ops.StagingArea( [dtypes.float32, dtypes.float32], shapes=[[], [128, 128]], diff --git a/tensorflow/python/kernel_tests/variables_test.py b/tensorflow/python/kernel_tests/variables_test.py index 11b350a99e..48be8cff4f 100644 --- a/tensorflow/python/kernel_tests/variables_test.py +++ b/tensorflow/python/kernel_tests/variables_test.py @@ -46,11 +46,13 @@ class VariablesTestCase(test.TestCase): self.assertEqual("Variable:0", var0.name) self.assertEqual([], var0.get_shape()) self.assertEqual([], var0.get_shape()) + self.assertEqual([], var0.shape) var1 = variables.Variable(1.1) self.assertEqual("Variable_1:0", var1.name) self.assertEqual([], var1.get_shape()) self.assertEqual([], var1.get_shape()) + self.assertEqual([], var1.shape) with self.assertRaisesOpError("Attempting to use uninitialized value"): var0.eval() @@ -69,11 +71,13 @@ class VariablesTestCase(test.TestCase): self.assertEqual("rnd:0", rnd.name) self.assertEqual([3, 6], rnd.get_shape()) self.assertEqual([3, 6], rnd.get_shape()) + self.assertEqual([3, 6], rnd.shape) dep = variables.Variable(rnd.initialized_value(), name="dep") self.assertEqual("dep:0", dep.name) self.assertEqual([3, 6], dep.get_shape()) self.assertEqual([3, 6], dep.get_shape()) + self.assertEqual([3, 6], dep.shape) # Currently have to set the shape manually for Add. added_val = rnd.initialized_value() + dep.initialized_value() + 2.0 @@ -83,6 +87,7 @@ class VariablesTestCase(test.TestCase): self.assertEqual("depdep:0", depdep.name) self.assertEqual([3, 6], depdep.get_shape()) self.assertEqual([3, 6], depdep.get_shape()) + self.assertEqual([3, 6], depdep.shape) variables.global_variables_initializer().run() @@ -375,6 +380,7 @@ class VariablesTestCase(test.TestCase): v1 = variables.Variable(initializer, dtype=dtypes.float32) self.assertEqual(shape, v1.get_shape()) + self.assertEqual(shape, v1.shape) self.assertAllClose(value, v1.initial_value.eval()) with self.assertRaises(errors_impl.FailedPreconditionError): v1.eval() @@ -382,6 +388,7 @@ class VariablesTestCase(test.TestCase): v2 = variables.Variable( math_ops.negative(v1.initialized_value()), dtype=dtypes.float32) self.assertEqual(v1.get_shape(), v2.get_shape()) + self.assertEqual(v1.shape, v2.shape) self.assertAllClose(np.negative(value), v2.initial_value.eval()) # Once v2.initial_value.eval() has been called, v1 has effectively been @@ -532,6 +539,7 @@ class PartitionedVariableTest(test.TestCase): self.assertEqual(2, num_partitions) self.assertEqual([v0, v1], iterated_partitions) self.assertEqual([2], concatenated.get_shape()) + self.assertEqual([2], concatenated.shape) def testPartitionedVariableFailures(self): with ops.Graph().as_default(): diff --git a/tensorflow/python/kernel_tests/xent_op_test.py b/tensorflow/python/kernel_tests/xent_op_test.py index d037ceac61..4b3dadc112 100644 --- a/tensorflow/python/kernel_tests/xent_op_test.py +++ b/tensorflow/python/kernel_tests/xent_op_test.py @@ -157,7 +157,7 @@ class XentTest(test.TestCase): np.array([[0., 0., 0., 1.], [0., .5, .5, 0.]]).astype(np.float64)) def testGradient(self): - with self.test_session(): + with self.test_session() as sess: l = constant_op.constant( [0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.5, 0.0, 0.5], shape=[3, 4], @@ -171,14 +171,21 @@ class XentTest(test.TestCase): x = nn_ops.softmax_cross_entropy_with_logits(labels=l, logits=f, name="xent") err = gradient_checker.compute_gradient_error(f, [3, 4], x, [3]) + + # Check that no extra computation performed. When only first derivative is requested, + # second derivative must not be computed. So when there is no second derivative, + # there is no `BatchMatMul` op in the graph. + op_names = [op.op_def.name for op in sess.graph.get_operations() if op.op_def] + self.assertNotIn('BatchMatMul', op_names) + print("cross entropy gradient err = ", err) self.assertLess(err, 5e-8) def testSecondGradient(self): - with self.test_session(): - l = constant_op.constant([0.0, 0.0, 1.0, 0.0, - 1.0, 0.0, 0.0, 0.0, - 0.0, 0.5, 0.0, 0.5], shape=[12], + with self.test_session() as sess: + l = constant_op.constant([0.0, 0.0, 1.0/3, 0.0, + 1.0/3, 0.0, 0.0, 0.0, + 0.0, 0.5/3, 0.0, 0.5/3], shape=[12], dtype=dtypes.float64, name="l") f = constant_op.constant([0.1, 0.2, 0.3, 0.4, 0.1, 0.4, 0.9, 1.6, @@ -186,13 +193,19 @@ class XentTest(test.TestCase): dtype=dtypes.float64, name="f") x = nn_ops.softmax_cross_entropy_with_logits(labels=l, logits=f, name="xent") - loss = math_ops.reduce_mean(x) + loss = math_ops.reduce_sum(x) - # Taking ths second gradient should fail, since it is not - # yet supported. - with self.assertRaisesRegexp(LookupError, - "explicitly disabled"): - _ = gradients_impl.hessians(loss, [f]) + gradients = gradients_impl.gradients(loss, [f])[0] + + err = gradient_checker.compute_gradient_error(f, [12], gradients, [12]) + + # Check that second derivative is calculated. + # (it is equivalent to being `BatchMatMul` op in the graph because of implementation of xentropy grad) + op_names = [op.op_def.name for op in sess.graph.get_operations() if op.op_def] + self.assertIn('BatchMatMul', op_names) + + print("cross entropy hessian err = ", err) + self.assertLess(err, 5e-8) def testWrapper(self): features = np.array( diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py index c64813962a..34380951af 100644 --- a/tensorflow/python/ops/array_ops.py +++ b/tensorflow/python/ops/array_ops.py @@ -586,7 +586,7 @@ def strided_slice(input_, `foo[::-1]` reverses a tensor with shape 8. - If the ith bit of `ellipsis_mask`, as many unspecified dimensions + If the ith bit of `ellipsis_mask` is non-zero, as many unspecified dimensions as needed will be inserted between other dimensions. Only one non-zero bit is allowed in `ellipsis_mask`. @@ -594,7 +594,7 @@ def strided_slice(input_, equivalent to `foo[3:5,:,:,4:5]` and `foo[3:5,...]` is equivalent to `foo[3:5,:,:,:]`. - If the ith bit of `new_axis_mask` is one, then a `begin`, + If the ith bit of `new_axis_mask` is one, then `begin`, `end`, and `stride` are ignored and a new length 1 dimension is added at this point in the output tensor. @@ -1545,7 +1545,7 @@ def sparse_placeholder(dtype, shape=None, name=None): x: (indices, values, shape)})) # Will succeed. sp = tf.SparseTensor(indices=indices, values=values, dense_shape=shape) - sp_value = sp.eval(session) + sp_value = sp.eval(session=sess) print(sess.run(y, feed_dict={x: sp_value})) # Will succeed. ``` diff --git a/tensorflow/python/ops/clip_ops.py b/tensorflow/python/ops/clip_ops.py index bda7212c8a..3dc0ac34c8 100644 --- a/tensorflow/python/ops/clip_ops.py +++ b/tensorflow/python/ops/clip_ops.py @@ -41,8 +41,10 @@ def clip_by_value(t, clip_value_min, clip_value_max, Args: t: A `Tensor`. - clip_value_min: A 0-D (scalar) `Tensor`. The minimum value to clip by. - clip_value_max: A 0-D (scalar) `Tensor`. The maximum value to clip by. + clip_value_min: A 0-D (scalar) `Tensor`, or a `Tensor` with the same shape + as `t`. The minimum value to clip by. + clip_value_max: A 0-D (scalar) `Tensor`, or a `Tensor` with the same shape + as `t`. The maximum value to clip by. name: A name for the operation (optional). Returns: diff --git a/tensorflow/python/ops/nn_grad.py b/tensorflow/python/ops/nn_grad.py index 397c522dbe..41e8877d72 100644 --- a/tensorflow/python/ops/nn_grad.py +++ b/tensorflow/python/ops/nn_grad.py @@ -321,21 +321,25 @@ def _BroadcastMul(vec, mat): @ops.RegisterGradient("SoftmaxCrossEntropyWithLogits") -def _SoftmaxCrossEntropyWithLogitsGrad(op, grad_0, _): +def _SoftmaxCrossEntropyWithLogitsGrad(op, grad_loss, grad_grad): """Gradient function for SoftmaxCrossEntropyWithLogits.""" - # grad_0 is the backprop for cost, and we multiply it with the gradients + # grad_loss is the backprop for cost, and we multiply it with the gradients # (which is output[1]) + # grad_grad is the backprop for softmax gradient. # There is no gradient for the labels # - # Currently there is no way to take the second derivative of this op - # due to the fused implementation's interaction with tf.gradients(), - # so we make sure we prevent silently incorrect results by raising - # an error if the second derivative is requested via prevent_gradient. - softmax_grad_without_gradient = array_ops.prevent_gradient( - op.outputs[1], message="Currently there is no way to take the second " - "derivative of softmax_cross_entropy_with_logits due to the fused " - " implementation's interaction with tf.gradients()") - return _BroadcastMul(grad_0, softmax_grad_without_gradient), None + # Second derivative is just softmax derivative w.r.t. logits. + softmax_grad = op.outputs[1] + grad = _BroadcastMul(grad_loss, softmax_grad) + + if grad_grad.op.type not in ('ZerosLike', 'Zeros'): + logits = op.inputs[0] + softmax = nn_ops.softmax(logits) + + grad += ((grad_grad - array_ops.squeeze(math_ops.matmul(grad_grad[:, None, :], + softmax[:, :, None]), axis=1)) * softmax) + + return grad, None @ops.RegisterGradient("SparseSoftmaxCrossEntropyWithLogits") diff --git a/tensorflow/python/ops/rnn.py b/tensorflow/python/ops/rnn.py index 06ae3589a2..658c9f79ee 100644 --- a/tensorflow/python/ops/rnn.py +++ b/tensorflow/python/ops/rnn.py @@ -515,7 +515,7 @@ def dynamic_rnn(cell, inputs, sequence_length=None, initial_state=None, state = initial_state else: if not dtype: - raise ValueError("If no initial_state is provided, dtype must be.") + raise ValueError("If there is no initial_state, you must give a dtype.") state = cell.zero_state(batch_size, dtype) def _assert_has_shape(x, shape): diff --git a/tensorflow/python/ops/special_math_ops.py b/tensorflow/python/ops/special_math_ops.py index bf4d198209..5a8eb432d1 100644 --- a/tensorflow/python/ops/special_math_ops.py +++ b/tensorflow/python/ops/special_math_ops.py @@ -318,44 +318,28 @@ def _einsum_reduction(t0, t0_axis_labels, t1, t1_axis_labels, axes_to_sum): # into a single axis, and combine multiple summed axes into a # single axis. - t0_shape = tuple(x.value for x in t0.get_shape()) + t0_shape = _get_shape(t0) num_broadcast_elements_t0 = _total_size( t0_shape[len(preserved_axes):-len(axes_to_sum)]) num_summed_elements = _total_size(t0_shape[-len(axes_to_sum):]) - new_shape = t0_shape[:len(preserved_axes)] + (num_broadcast_elements_t0, - num_summed_elements) + new_shape = (t0_shape[:len(preserved_axes)] + + [num_broadcast_elements_t0, num_summed_elements]) t0 = _reshape_if_necessary(t0, new_shape) - t1_shape = tuple(x.value for x in t1.get_shape()) + t1_shape = _get_shape(t1) num_broadcast_elements_t1 = _total_size( t1_shape[len(preserved_axes)+len(axes_to_sum):]) - new_shape = t1_shape[:len(preserved_axes)] + (num_summed_elements, - num_broadcast_elements_t1) + new_shape = (t1_shape[:len(preserved_axes)] + + [num_summed_elements, num_broadcast_elements_t1]) t1 = _reshape_if_necessary(t1, new_shape) product = math_ops.matmul(t0, t1) # Undo compaction of broadcast axes uncompacted_shape = ( - t0_shape[:len(preserved_axes)+len(broadcast_axes[0])] + - t1_shape[len(t1_shape)-len(broadcast_axes[1]):] + t0_shape[:len(preserved_axes)+len(broadcast_axes[0])] + + t1_shape[len(t1_shape)-len(broadcast_axes[1]):] ) - - # Check the number of None values and replace them with Tensors containing - # corresponding dimensions if there exist two or more None values - num_none_dims = sum(1 for d in uncompacted_shape if d is None) - if num_none_dims > 1: - uncompacted_shape = list(uncompacted_shape) - for i in xrange(len(uncompacted_shape)): - if uncompacted_shape[i] is None: - if i < len(preserved_axes) + len(broadcast_axes[0]): - uncompacted_shape[i] = array_ops.shape(inputs[0])[i] - else: - idx = (i - len(preserved_axes) - len(broadcast_axes[0]) - + len(t1_shape) - len(broadcast_axes[1])) - uncompacted_shape[i] = array_ops.shape(inputs[1])[idx] - uncompacted_shape = tuple(uncompacted_shape) - product = _reshape_if_necessary(product, uncompacted_shape) product_axes = ( @@ -386,13 +370,27 @@ def _reshape_if_necessary(tensor, new_shape): return array_ops.reshape(tensor, new_shape) +def _get_shape(tensor): + """Like get_shape().as_list(), but explicitly queries the shape of a tensor + if necessary to ensure that the returned value contains no unknown value.""" + + shape = tensor.get_shape().as_list() + none_indices = [i for i, d in enumerate(shape) if d is None] + if none_indices: + # Query the shape if shape contains None values + shape_tensor = array_ops.shape(tensor) + for i in none_indices: + shape[i] = shape_tensor[i] + return shape + def _total_size(shape_values): - """Given list of tensor shape values, returns total size or -1 if unknown.""" + """Given list of tensor shape values, returns total size. + If shape_values contains tensor values (which are results of + array_ops.shape), then it returns a scalar tensor. + If not, it returns an integer.""" + result = 1 for val in shape_values: - if val is None: - return -1 - assert isinstance(val, int) result *= val return result diff --git a/tensorflow/python/ops/special_math_ops_test.py b/tensorflow/python/ops/special_math_ops_test.py index 3d289bcc9a..c792d32277 100644 --- a/tensorflow/python/ops/special_math_ops_test.py +++ b/tensorflow/python/ops/special_math_ops_test.py @@ -318,7 +318,19 @@ class EinsumTest(test.TestCase): m1: [3, 2], } np.testing.assert_almost_equal( - [[7]], sess.run(out, feed_dict=feed_dict)) + [[7]], sess.run(out, feed_dict=feed_dict)) + + with ops.Graph().as_default(): + m0 = array_ops.placeholder(dtypes.int32, shape=(None, 2, None, 2)) + m1 = array_ops.placeholder(dtypes.int32, shape=(None, 2)) + out = special_math_ops.einsum('ijkl,ij->ikl', m0, m1) + with session.Session() as sess: + feed_dict = { + m0: [[[[1, 2]], [[2, 1]]]], + m1: [[3, 2]], + } + np.testing.assert_almost_equal( + [[[7, 8]]], sess.run(out, feed_dict=feed_dict)) if __name__ == '__main__': diff --git a/tensorflow/python/ops/variables.py b/tensorflow/python/ops/variables.py index 5a1a43b5d5..ee1e361093 100644 --- a/tensorflow/python/ops/variables.py +++ b/tensorflow/python/ops/variables.py @@ -723,7 +723,8 @@ class Variable(object): """The `Graph` of this variable.""" return self._variable.graph - def get_shape(self): + @property + def shape(self): """The `TensorShape` of this variable. Returns: @@ -731,6 +732,10 @@ class Variable(object): """ return self._variable.get_shape() + def get_shape(self): + """Alias of Variable.shape.""" + return self.shape + def to_proto(self, export_scope=None): """Converts a `Variable` to a `VariableDef` protocol buffer. diff --git a/tensorflow/python/platform/test.py b/tensorflow/python/platform/test.py index 0001eb65e1..452b8f5d3b 100644 --- a/tensorflow/python/platform/test.py +++ b/tensorflow/python/platform/test.py @@ -42,6 +42,7 @@ from tensorflow.python.util.all_util import remove_undocumented # pylint: disable=unused-import from tensorflow.python.framework.test_util import assert_equal_graph_def from tensorflow.python.framework.test_util import TensorFlowTestCase as TestCase +from tensorflow.python.framework.test_util import gpu_device_name from tensorflow.python.ops.gradient_checker import compute_gradient_error from tensorflow.python.ops.gradient_checker import compute_gradient @@ -107,15 +108,6 @@ def is_gpu_available(cuda_only=False): return any((x.device_type == 'GPU' or x.device_type == 'SYCL') for x in _device_lib.list_local_devices()) - -def gpu_device_name(): - """Returns the name of a GPU device if available or the empty string.""" - for x in _device_lib.list_local_devices(): - if x.device_type == 'GPU' or x.device_type == 'SYCL': - return x.name - return '' - - _allowed_symbols = [ # We piggy-back googletest documentation. 'Benchmark', diff --git a/tensorflow/stream_executor/cuda/cuda_diagnostics.cc b/tensorflow/stream_executor/cuda/cuda_diagnostics.cc index def28053f4..01ce67252e 100644 --- a/tensorflow/stream_executor/cuda/cuda_diagnostics.cc +++ b/tensorflow/stream_executor/cuda/cuda_diagnostics.cc @@ -57,7 +57,7 @@ namespace cuda { #ifdef __APPLE__ static const CFStringRef kDriverKextIdentifier = CFSTR("com.nvidia.CUDA"); -#else +#elif !defined(PLATFORM_WINDOWS) static const char *kDriverVersionPath = "/proc/driver/nvidia/version"; #endif @@ -341,6 +341,12 @@ port::StatusOr Diagnostician::FindKernelDriverVersion() { CFStringGetCStringPtr(kDriverKextIdentifier, kCFStringEncodingUTF8)) }; return status; +#elif defined(PLATFORM_WINDOWS) + auto status = + port::Status{port::error::UNIMPLEMENTED, + "kernel reported driver version not implemented on Windows" + }; + return status; #else FILE *driver_version_file = fopen(kDriverVersionPath, "r"); if (driver_version_file == nullptr) { diff --git a/tensorflow/tools/ci_build/README.md b/tensorflow/tools/ci_build/README.md index 5c90fceaf7..eede1f5fac 100644 --- a/tensorflow/tools/ci_build/README.md +++ b/tensorflow/tools/ci_build/README.md @@ -53,10 +53,10 @@ tensorflow/tools/ci_build/ci_build.sh CPU bazel test //tensorflow/... tensorflow/tools/ci_build/ci_build.sh GPU bazel test -c opt --config=cuda //tensorflow/... # build pip with gpu support -tensorflow/tools/ci_build/ci_build.sh GPU tensorflow/tools/ci_build/builds/pip.sh GPU +tensorflow/tools/ci_build/ci_build.sh GPU tensorflow/tools/ci_build/builds/pip.sh GPU -c opt --config=cuda # build and run gpu tests using python 3 -CI_DOCKER_EXTRA_PARAMS="-e CI_BUILD_PYTHON=python3" tensorflow/tools/ci_build/ci_build.sh GPU tensorflow/tools/ci_build/builds/pip.sh GPU +CI_DOCKER_EXTRA_PARAMS="-e CI_BUILD_PYTHON=python3" tensorflow/tools/ci_build/ci_build.sh GPU tensorflow/tools/ci_build/builds/pip.sh GPU -c opt --config=cuda # build android example app tensorflow/tools/ci_build/ci_build.sh ANDROID tensorflow/tools/ci_build/builds/android.sh diff --git a/tensorflow/tools/ci_build/builds/pip.sh b/tensorflow/tools/ci_build/builds/pip.sh index 5527feee34..1399f1e58f 100755 --- a/tensorflow/tools/ci_build/builds/pip.sh +++ b/tensorflow/tools/ci_build/builds/pip.sh @@ -19,8 +19,7 @@ # The PIP installation is done using the --user flag. # # Usage: -# pip.sh CONTAINER_TYPE [--mavx] [--mavx2] -# [--test_tutorials] [--integration_tests] +# pip.sh CONTAINER_TYPE [--test_tutorials] [--integration_tests] [bazel flags] # # When executing the Python unit tests, the script obeys the shell # variables: TF_BUILD_BAZEL_CLEAN, TF_BUILD_INSTALL_EXTRA_PIP_PACKAGES, @@ -39,8 +38,7 @@ # If NO_TEST_USER_OPS has any non-empty and non-0 value, the testing of user- # defined ops against the installation will be skipped. # -# Use --mavx or --mavx2 to let bazel use --copt=-mavx or --copt=-mavx2 options -# while building the pip package, respectively. +# Any flags not listed in the usage above will be passed directly to Bazel. # # If the --test_tutorials flag is set, it will cause the script to run the # tutorial tests (see test_tutorials.sh) after the PIP @@ -49,6 +47,11 @@ # to run. # +# Helper function: Strip leading and trailing whitespaces +str_strip () { + echo -e "$1" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//' +} + # Fixed naming patterns for wheel (.whl) files given different python versions if [[ $(uname) == "Linux" ]]; then declare -A WHL_TAGS @@ -66,6 +69,7 @@ source "${SCRIPT_DIR}/builds_common.sh" # Get the command line arguments CONTAINER_TYPE=$( echo "$1" | tr '[:upper:]' '[:lower:]' ) +shift if [[ ! -z "${TF_BUILD_BAZEL_CLEAN}" ]] && \ [[ "${TF_BUILD_BAZEL_CLEAN}" != "0" ]]; then @@ -82,16 +86,14 @@ fi DO_TEST_TUTORIALS=0 DO_INTEGRATION_TESTS=0 -MAVX_FLAG="" +BAZEL_FLAGS="" while true; do if [[ "${1}" == "--test_tutorials" ]]; then DO_TEST_TUTORIALS=1 elif [[ "${1}" == "--integration_tests" ]]; then DO_INTEGRATION_TESTS=1 - elif [[ "${1}" == "--mavx" ]]; then - MAVX_FLAG="--copt=-mavx" - elif [[ "${1}" == "--mavx2" ]]; then - MAVX_FLAG="--copt=-mavx2" + else + BAZEL_FLAGS="${BAZEL_FLAGS} ${1}" fi shift @@ -100,18 +102,18 @@ while true; do fi done -if [[ ! -z "${MAVX_FLAG}" ]]; then - echo "Using MAVX flag: ${MAVX_FLAG}" -fi +BAZEL_FLAGS=$(str_strip "${BAZEL_FLAGS}") + +echo "Using Bazel flags: ${BAZEL_FLAGS}" PIP_BUILD_TARGET="//tensorflow/tools/pip_package:build_pip_package" GPU_FLAG="" if [[ ${CONTAINER_TYPE} == "cpu" ]] || \ [[ ${CONTAINER_TYPE} == "debian.jessie.cpu" ]]; then - bazel build -c opt ${MAVX_FLAG} ${PIP_BUILD_TARGET} || \ + bazel build ${BAZEL_FLAGS} ${PIP_BUILD_TARGET} || \ die "Build failed." elif [[ ${CONTAINER_TYPE} == "gpu" ]]; then - bazel build -c opt --config=cuda ${MAVX_FLAG} ${PIP_BUILD_TARGET} || \ + bazel build ${BAZEL_FLAGS} ${PIP_BUILD_TARGET} || \ die "Build failed." GPU_FLAG="--gpu" else diff --git a/tensorflow/tools/ci_build/ci_build.sh b/tensorflow/tools/ci_build/ci_build.sh index 3697fd46a0..f0fa8a9381 100755 --- a/tensorflow/tools/ci_build/ci_build.sh +++ b/tensorflow/tools/ci_build/ci_build.sh @@ -26,7 +26,7 @@ # directory as this script will be used. # # COMMAND: Command to be executed in the docker container, e.g., -# tensorflow/tools/ci_build/builds/pip.sh gpu +# tensorflow/tools/ci_build/builds/pip.sh gpu -c opt --config=cuda SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" source "${SCRIPT_DIR}/builds/builds_common.sh" diff --git a/tensorflow/tools/ci_build/ci_parameterized_build.sh b/tensorflow/tools/ci_build/ci_parameterized_build.sh index 5e2cf94059..c2ad03bc6f 100755 --- a/tensorflow/tools/ci_build/ci_parameterized_build.sh +++ b/tensorflow/tools/ci_build/ci_parameterized_build.sh @@ -83,7 +83,8 @@ # support for Google Cloud Platform (GCP), which is # enabled by default. # TF_BUILD_OPTIONS: -# (FASTBUILD | OPT | OPTDBG | MAVX | MAVX2) +# (FASTBUILD | OPT | OPTDBG | MAVX | MAVX2_FMA | MAVX_DBG | +# MAVX2_FMA_DBG) # Use the specified configurations when building. # When set, overrides TF_BUILD_IS_OPT and TF_BUILD_MAVX # options, as this will replace the two. @@ -304,11 +305,14 @@ else MAVX) OPT_FLAG="${OPT_FLAG} -c opt --copt=-mavx" ;; - MAVXDBG) + MAVX_DBG) OPT_FLAG="${OPT_FLAG} -c opt --copt=-g --copt=-mavx" ;; - MAVX2) - OPT_FLAG="${OPT_FLAG} -c opt --copt=-mavx2" + MAVX2_FMA) + OPT_FLAG="${OPT_FLAG} -c opt --copt=-mavx2 --copt=-mfma" + ;; + MAVX2_FMA_DBG) + OPT_FLAG="${OPT_FLAG} -c opt --copt=-g --copt=-mavx2 --copt=-mfma" ;; esac fi @@ -376,12 +380,7 @@ if [[ ${TF_BUILD_IS_PIP} == "pip" ]] || exit 0 fi - PIP_MAIN_CMD="${MAIN_CMD} ${PIP_CMD} ${CTYPE} ${EXTRA_AGRS}" - - # Add flag for mavx/mavx2 - if [[ ! -z "${TF_BUILD_MAVX}" ]]; then - PIP_MAIN_CMD="${PIP_MAIN_CMD} --${TF_BUILD_MAVX}" - fi + PIP_MAIN_CMD="${MAIN_CMD} ${PIP_CMD} ${CTYPE} ${EXTRA_ARGS} ${OPT_FLAG}" # Add flag for integration tests if [[ ! -z "${TF_BUILD_INTEGRATION_TESTS}" ]] && diff --git a/tensorflow/tools/ci_build/install/install_buildifier.sh b/tensorflow/tools/ci_build/install/install_buildifier.sh index 2f3470881a..5420934c6b 100755 --- a/tensorflow/tools/ci_build/install/install_buildifier.sh +++ b/tensorflow/tools/ci_build/install/install_buildifier.sh @@ -16,8 +16,9 @@ set -e BUILDIFIER_DIR="buildifier" -rm -rf ${BUILDIFIER_DIR} -git clone https://github.com/bazelbuild/buildifier.git ${BUILDIFIER_DIR} +mkdir ${BUILDIFIER_DIR} +curl -Ls https://github.com/bazelbuild/buildifier/archive/0.4.3.tar.gz | \ + tar -C "${BUILDIFIER_DIR}" --strip-components=1 -xz pushd ${BUILDIFIER_DIR} bazel build buildifier:buildifier --spawn_strategy=standalone --genrule_strategy=standalone diff --git a/tensorflow/tools/docker/parameterized_docker_build.sh b/tensorflow/tools/docker/parameterized_docker_build.sh index 35c1218470..886266caaf 100755 --- a/tensorflow/tools/docker/parameterized_docker_build.sh +++ b/tensorflow/tools/docker/parameterized_docker_build.sh @@ -266,7 +266,7 @@ fi IMG="${USER}/tensorflow:${FINAL_TAG}" echo "Building docker image with image name and tag: ${IMG}" -"${DOCKER_BINARY}" build --no-cache -t "${IMG}" -f "${DOCKERFILE}" "${TMP_DIR}" +"${DOCKER_BINARY}" build --no-cache --pull -t "${IMG}" -f "${DOCKERFILE}" "${TMP_DIR}" if [[ $? == "0" ]]; then echo "${DOCKER_BINARY} build of ${IMG} succeeded" else diff --git a/tensorflow/tools/graph_transforms/README.md b/tensorflow/tools/graph_transforms/README.md index 36a5c01a0c..73af8699b2 100644 --- a/tensorflow/tools/graph_transforms/README.md +++ b/tensorflow/tools/graph_transforms/README.md @@ -179,7 +179,7 @@ bazel-bin/tensorflow/tools/graph_transforms/transform_graph \ --outputs='softmax:0' \ --transforms='\ strip_unused_nodes(type=float, shape="1,299,299,3") \ -fold_constants \ +fold_constants(ignore_errors=true) \ fold_batch_norms \ fold_old_batch_norms\ ' @@ -715,7 +715,7 @@ shape arguments let you control the attributes of any new Placeholders that are created. Plain `type` and `shape` set global defaults, but if you have different inputs with varying characteristics, you'll need to pass in a list of arguments where the preceding name specifies what layer each applies to. For example, if -you had two inputs in1 and in2, you could call `strip_unused_node(name=in1, +you had two inputs in1 and in2, you could call `strip_unused_nodes(name=in1, type_for_name=int32, shape_for_name="2,3", name=in2, type_for_name=float, shape_for_name="1,10,10,3")`. diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py index ea4c7e9ff4..9b229704d6 100644 --- a/tensorflow/tools/pip_package/setup.py +++ b/tensorflow/tools/pip_package/setup.py @@ -29,7 +29,7 @@ from setuptools.dist import Distribution # This version string is semver compatible, but incompatible with pip. # For pip, we will remove all '-' characters from this string, and use the # result for pip. -_VERSION = '1.0.0-rc2' +_VERSION = '1.0.0' REQUIRED_PACKAGES = [ 'numpy >= 1.11.0', diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index 5a2fe99c67..66ff8843d8 100644 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -406,12 +406,16 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""): actual = "@zlib_archive//:zlib", ) - native.new_http_archive( + temp_workaround_http_archive( name = "nccl_archive", - url = "https://github.com/nvidia/nccl/archive/024d1e267845f2ed06f3e2e42476d50f04a00ee6.tar.gz", + urls = [ + "http://bazel-mirror.storage.googleapis.com/github.com/nvidia/nccl/archive/024d1e267845f2ed06f3e2e42476d50f04a00ee6.tar.gz", + "https://github.com/nvidia/nccl/archive/024d1e267845f2ed06f3e2e42476d50f04a00ee6.tar.gz", + ], sha256 = "6787f0eed88d52ee8e32956fa4947d92c139da469f1d8e311c307f27d641118e", strip_prefix = "nccl-024d1e267845f2ed06f3e2e42476d50f04a00ee6", build_file = str(Label("//third_party:nccl.BUILD")), + repository = tf_repo_name, ) java_import_external( diff --git a/third_party/curl.BUILD b/third_party/curl.BUILD index dde8e6cdb7..557c2885eb 100644 --- a/third_party/curl.BUILD +++ b/third_party/curl.BUILD @@ -204,13 +204,13 @@ cc_library( "lib/wildcard.h", "lib/x509asn1.h", ] + select({ - ":darwin": [ + "@//tensorflow:darwin": [ "lib/vtls/darwinssl.c", ], - ":ios": [ + "@//tensorflow:ios": [ "lib/vtls/darwinssl.c", ], - ":windows": [ + "@//tensorflow:windows": [ "lib/asyn-thread.c", "lib/inet_ntop.c", "lib/system_win32.c", @@ -231,7 +231,7 @@ cc_library( "include/curl/typecheck-gcc.h", ], copts = select({ - ":windows": [ + "@//tensorflow:windows": [ "/I%prefix%/curl/lib", "/DHAVE_CONFIG_H", "/DCURL_DISABLE_FTP", @@ -255,10 +255,10 @@ cc_library( "-Wno-string-plus-int", ], }) + select({ - ":darwin": [ + "@//tensorflow:darwin": [ "-fno-constant-cfstrings", ], - ":windows": [ + "@//tensorflow:windows": [ # See curl.h for discussion of write size and Windows "/DCURL_MAX_WRITE_SIZE=16384", ], @@ -268,17 +268,17 @@ cc_library( }), includes = ["include"], linkopts = select({ - ":android": [ + "@//tensorflow:android": [ "-pie", ], - ":darwin": [ + "@//tensorflow:darwin": [ "-Wl,-framework", "-Wl,CoreFoundation", "-Wl,-framework", "-Wl,Security", ], - ":ios": [], - ":windows": [ + "@//tensorflow:ios": [], + "@//tensorflow:windows": [ "ws2_32.lib", ], "//conditions:default": [ @@ -289,8 +289,8 @@ cc_library( deps = [ "@zlib_archive//:zlib", ] + select({ - ":ios": [], - ":windows": [], + "@//tensorflow:ios": [], + "@//tensorflow:windows": [], "//conditions:default": [ "@boringssl//:ssl", ], @@ -386,7 +386,7 @@ cc_binary( "src/tool_xattr.h", ], copts = select({ - ":windows": [ + "@//tensorflow:windows": [ "/I%prefix%/curl/lib", "/DHAVE_CONFIG_H", "/DCURL_DISABLE_LIBCURL_OPTION", @@ -657,23 +657,3 @@ genrule( "EOF", ]), ) - -config_setting( - name = "ios", - values = {"crosstool_top": "//tools/osx/crosstool:crosstool"}, -) - -config_setting( - name = "darwin", - values = {"cpu": "darwin"}, -) - -config_setting( - name = "windows", - values = {"cpu": "x64_windows_msvc"}, -) - -config_setting( - name = "android", - values = {"crosstool_top": "//external:android/crosstool"}, -) diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX2.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX2.h index 98deb1742e..078be83e0d 100644 --- a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX2.h +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX2.h @@ -11,6 +11,13 @@ typedef struct Packet32q8i { Packet32q8i(__m256i val) : val(val) {} } Packet32q8i; +typedef struct Packet16q16i { + __m256i val; + operator __m256i() const { return val; } + Packet16q16i(); + Packet16q16i(__m256i val) : val(val) {} +} Packet16q16i; + typedef struct Packet32q8u { __m256i val; operator __m256i() const { return val; } @@ -32,6 +39,13 @@ typedef struct Packet16q8u { Packet16q8u(__m128i val) : val(val) {} } Packet16q8u; +typedef struct Packet8q16i { + __m128i val; + operator __m128i() const { return val; } + Packet8q16i(); + Packet8q16i(__m128i val) : val(val) {} +} Packet8q16i; + typedef struct Packet8q32i { __m256i val; operator __m256i() const { return val; } @@ -92,6 +106,28 @@ struct packet_traits : default_packet_traits { }; }; template <> +struct packet_traits : default_packet_traits { + typedef Packet16q16i type; + typedef Packet8q16i half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 16, + }; + enum { + HasAdd = 0, + HasSub = 0, + HasMul = 0, + HasNegate = 0, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 1, + HasMax = 1, + HasConj = 0, + HasSetLinear = 0 + }; +}; +template <> struct packet_traits : default_packet_traits { typedef Packet8q32i type; typedef Packet4q32i half; @@ -122,6 +158,12 @@ struct unpacket_traits { enum { size = 32, alignment=Aligned32 }; }; template <> +struct unpacket_traits { + typedef QInt16 type; + typedef Packet8q16i half; + enum { size = 16, alignment=Aligned32 }; +}; +template <> struct unpacket_traits { typedef QUInt8 type; typedef Packet16q8u half; @@ -146,6 +188,11 @@ EIGEN_STRONG_INLINE Packet32q8u ploadu(const QUInt8* from) { reinterpret_cast(from)); } template <> +EIGEN_STRONG_INLINE Packet16q16i ploadu(const QInt16* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256( + reinterpret_cast(from)); +} +template <> EIGEN_STRONG_INLINE Packet8q32i ploadu(const QInt32* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256( reinterpret_cast(from)); @@ -163,6 +210,11 @@ EIGEN_STRONG_INLINE Packet32q8u pload(const QUInt8* from) { reinterpret_cast(from)); } template <> +EIGEN_STRONG_INLINE Packet16q16i pload(const QInt16* from) { + EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256( + reinterpret_cast(from)); +} +template <> EIGEN_STRONG_INLINE Packet8q32i pload(const QInt32* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256( reinterpret_cast(from)); @@ -180,6 +232,11 @@ EIGEN_STRONG_INLINE void pstoreu(QUInt8* to, const Packet32q8u& from) { reinterpret_cast<__m256i*>(to), from.val); } template <> +EIGEN_STRONG_INLINE void pstoreu(QInt16* to, const Packet16q16i& from) { + EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256( + reinterpret_cast<__m256i*>(to), from.val); +} +template <> EIGEN_STRONG_INLINE void pstoreu(QInt32* to, const Packet8q32i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256( reinterpret_cast<__m256i*>(to), from.val); @@ -192,6 +249,11 @@ EIGEN_STRONG_INLINE void pstore(QInt32* to, const Packet8q32i& from) { from.val); } template <> +EIGEN_STRONG_INLINE void pstore(QInt16* to, const Packet16q16i& from) { + EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to), + from.val); +} +template <> EIGEN_STRONG_INLINE void pstore(QUInt8* to, const Packet32q8u& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to), from.val); @@ -208,6 +270,10 @@ EIGEN_STRONG_INLINE QInt32 pfirst(const Packet8q32i& a) { return _mm_cvtsi128_si32(_mm256_castsi256_si128(a)); } template <> +EIGEN_STRONG_INLINE QInt16 pfirst(const Packet16q16i& a) { + return _mm256_extract_epi16(a.val, 0); +} +template <> EIGEN_STRONG_INLINE QUInt8 pfirst(const Packet32q8u& a) { return static_cast(_mm256_extract_epi8(a.val, 0)); } @@ -237,6 +303,10 @@ EIGEN_STRONG_INLINE Packet8q32i padd(const Packet8q32i& a, return _mm256_add_epi32(a.val, b.val); } template <> +EIGEN_STRONG_INLINE Packet16q16i pset1(const QInt16& from) { + return _mm256_set1_epi16(from.value); +} +template <> EIGEN_STRONG_INLINE Packet8q32i psub(const Packet8q32i& a, const Packet8q32i& b) { return _mm256_sub_epi32(a.val, b.val); @@ -264,6 +334,17 @@ EIGEN_STRONG_INLINE Packet8q32i pmax(const Packet8q32i& a, return _mm256_max_epi32(a.val, b.val); } +template <> +EIGEN_STRONG_INLINE Packet16q16i pmin(const Packet16q16i& a, + const Packet16q16i& b) { + return _mm256_min_epi16(a.val, b.val); +} +template <> +EIGEN_STRONG_INLINE Packet16q16i pmax(const Packet16q16i& a, + const Packet16q16i& b) { + return _mm256_max_epi16(a.val, b.val); +} + template <> EIGEN_STRONG_INLINE Packet32q8u pmin(const Packet32q8u& a, const Packet32q8u& b) { @@ -304,6 +385,23 @@ EIGEN_STRONG_INLINE QInt32 predux_max(const Packet8q32i& a) { _mm256_max_epi32(tmp, _mm256_shuffle_epi32(tmp, 1))); } +template <> +EIGEN_STRONG_INLINE QInt16 predux_min(const Packet16q16i& a) { + __m256i tmp = _mm256_min_epi16(a, _mm256_permute2f128_si256(a, a, 1)); + tmp = + _mm256_min_epi16(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2))); + tmp = _mm256_min_epi16(tmp, _mm256_shuffle_epi32(tmp, 1)); + return std::min(_mm256_extract_epi16(tmp, 0), _mm256_extract_epi16(tmp, 1)); +} +template <> +EIGEN_STRONG_INLINE QInt16 predux_max(const Packet16q16i& a) { + __m256i tmp = _mm256_max_epi16(a, _mm256_permute2f128_si256(a, a, 1)); + tmp = + _mm256_max_epi16(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2))); + tmp = _mm256_max_epi16(tmp, _mm256_shuffle_epi32(tmp, 1)); + return std::max(_mm256_extract_epi16(tmp, 0), _mm256_extract_epi16(tmp, 1)); +} + template <> EIGEN_STRONG_INLINE QUInt8 predux_min(const Packet32q8u& a) { __m256i tmp = _mm256_min_epu8(a, _mm256_permute2f128_si256(a, a, 1)); diff --git a/third_party/mkl/BUILD b/third_party/mkl/BUILD index ddaf29a086..7e95ebd355 100644 --- a/third_party/mkl/BUILD +++ b/third_party/mkl/BUILD @@ -8,12 +8,17 @@ config_setting( visibility = ["//visibility:public"], ) +load( + "//third_party/mkl:build_defs.bzl", + "if_mkl", +) + cc_library( name = "intel_binary_blob", - srcs = [ - "libiomp5.so", + srcs = if_mkl([ "libmklml_intel.so", - ], + "libiomp5.so", + ]), includes = ["."], visibility = ["//visibility:public"], ) diff --git a/third_party/nccl.BUILD b/third_party/nccl.BUILD index bb460a05e0..06b9b8ff68 100644 --- a/third_party/nccl.BUILD +++ b/third_party/nccl.BUILD @@ -43,6 +43,24 @@ cc_library( "-Iexternal/nccl_archive/src", "-O3", ] + cuda_default_copts(), + linkopts = select({ + "@%ws%//tensorflow:android": [ + "-pie", + ], + "@%ws%//tensorflow:darwin": [ + "-Wl,-framework", + "-Wl,CoreFoundation", + "-Wl,-framework", + "-Wl,Security", + ], + "@%ws%//tensorflow:ios": [], + "@%ws%//tensorflow:windows": [ + "ws2_32.lib", + ], + "//conditions:default": [ + "-lrt", + ], + }), visibility = ["//visibility:public"], deps = ["@local_config_cuda//cuda:cuda_headers"], ) diff --git a/third_party/sycl/crosstool/computecpp.tpl b/third_party/sycl/crosstool/computecpp.tpl index a5e6b9fe93..66dd9aea7b 100755 --- a/third_party/sycl/crosstool/computecpp.tpl +++ b/third_party/sycl/crosstool/computecpp.tpl @@ -26,9 +26,7 @@ def main(): if(output_file_index == 1): # we are linking - return subprocess.call([CPU_CXX_COMPILER] + compiler_flags) - - compiler_flags = compiler_flags + ['-D_GLIBCXX_USE_CXX11_ABI=0', '-DEIGEN_USE_SYCL=1'] + return subprocess.call([CPU_CXX_COMPILER] + compiler_flags + ['-Wl,--no-undefined']) # find what we compile compiling_cpp = 0 @@ -38,6 +36,28 @@ def main(): if(compited_file_name.endswith(('.cc', '.c++', '.cpp', '.CPP', '.C', '.cxx'))): compiling_cpp = 1; + compiler_flags = compiler_flags + ['-D_GLIBCXX_USE_CXX11_ABI=0', '-DEIGEN_USE_SYCL=1', '-DTENSORFLOW_USE_SYCL', '-DEIGEN_HAS_C99_MATH'] + + if(compiling_cpp == 1): + # create a blacklist of folders that will be skipped when compiling with ComputeCpp + _skip = ["external", "llvm", ".cu.cc"] + # if compiling external project skip computecpp + if any(_folder in _skip for _folder in output_file_name): + return subprocess.call([CPU_CXX_COMPILER] + compiler_flags) + + if(compiling_cpp == 1): + # this is an optimisation that will check if compiled file has to be compiled with ComputeCpp + + _tmp_flags = [flag for flag in compiler_flags if not flag.startswith(('-o', output_file_name))] + # create preprocessed of the file + _cmd = " ".join([CPU_CXX_COMPILER] + _tmp_flags + ["-E"]) + # check if it has parallel_for< in it + _cmd += " | grep \".parallel_for\" > /dev/null" + ps = subprocess.call(_cmd, shell=True) + # if not call CXX compiler + if(ps != 0): + return subprocess.call([CPU_CXX_COMPILER] + compiler_flags) + if(compiling_cpp == 1): filename, file_extension = os.path.splitext(output_file_name) bc_out = filename + '.sycl' @@ -52,9 +72,12 @@ def main(): # dont want that in case of compiling with computecpp first host_compiler_flags = [flag for flag in compiler_flags if not flag.startswith(('-MF', '-MD',)) - if not '.d' in flag] + if not '.d' in flag + ] + + host_compiler_flags[host_compiler_flags.index('-c')] = "--include" - host_compiler_flags = ['-D_GLIBCXX_USE_CXX11_ABI=0', '-DTENSORFLOW_USE_SYCL', '-Wno-unused-variable', '-I', COMPUTECPP_INCLUDE, '--include', bc_out] + host_compiler_flags + host_compiler_flags = ['-xc++', '-D_GLIBCXX_USE_CXX11_ABI=0', '-DTENSORFLOW_USE_SYCL', '-Wno-unused-variable', '-I', COMPUTECPP_INCLUDE, '-c', bc_out] + host_compiler_flags x = subprocess.call([CPU_CXX_COMPILER] + host_compiler_flags) return x else: diff --git a/tools/bazel.rc.template b/tools/bazel.rc.template index 48c9f5aa3f..3622b9423c 100644 --- a/tools/bazel.rc.template +++ b/tools/bazel.rc.template @@ -7,7 +7,7 @@ build:sycl --crosstool_top=@local_config_sycl//crosstool:toolchain build:sycl --define=using_sycl=true build:sycl_asan --crosstool_top=@local_config_sycl//crosstool:toolchain -build:sycl_asan --define=using_sycl=true --copt -fno-omit-frame-pointer --copt -fsanitize-coverage=3 --copt -fsanitize=address --copt -DGPR_NO_DIRECT_SYSCALLS --linkopt -fPIC --linkopt -lasan +build:sycl_asan --define=using_sycl=true --copt -fno-omit-frame-pointer --copt -fsanitize-coverage=3 --copt -DGPR_NO_DIRECT_SYSCALLS --linkopt -fPIC --linkopt -fsanitize=address build --force_python=py$PYTHON_MAJOR_VERSION build --host_force_python=py$PYTHON_MAJOR_VERSION -- cgit v1.2.3