aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
-rw-r--r--README.md11
-rw-r--r--RELEASE.md2
-rw-r--r--tensorflow/cc/training/coordinator.h2
-rw-r--r--tensorflow/compiler/tf2xla/kernels/binary_ops.cc37
-rw-r--r--tensorflow/compiler/tf2xla/kernels/cwise_ops.h6
-rw-r--r--tensorflow/compiler/tf2xla/kernels/relu_op.cc43
-rw-r--r--tensorflow/compiler/xla/client/computation_builder.cc5
-rw-r--r--tensorflow/compiler/xla/client/computation_builder.h5
-rw-r--r--tensorflow/contrib/cmake/external/eigen.cmake1
-rw-r--r--tensorflow/contrib/cmake/tf_tests.cmake1
-rw-r--r--tensorflow/contrib/hvx/README.md26
-rw-r--r--tensorflow/contrib/learn/python/learn/estimators/dnn.py11
-rw-r--r--tensorflow/contrib/rnn/python/ops/lstm_ops.py9
-rw-r--r--tensorflow/contrib/training/python/training/evaluation.py3
-rw-r--r--tensorflow/core/BUILD5
-rw-r--r--tensorflow/core/common_runtime/sycl/sycl_allocator.cc7
-rw-r--r--tensorflow/core/common_runtime/sycl/sycl_allocator.h11
-rw-r--r--tensorflow/core/common_runtime/sycl/sycl_device.cc2
-rw-r--r--tensorflow/core/common_runtime/sycl/sycl_device.h9
-rw-r--r--tensorflow/core/common_runtime/sycl/sycl_device_context.cc256
-rw-r--r--tensorflow/core/common_runtime/sycl/sycl_device_context.h6
-rw-r--r--tensorflow/core/common_runtime/sycl/sycl_device_factory.cc12
-rw-r--r--tensorflow/core/distributed_runtime/master_session.cc1
-rw-r--r--tensorflow/core/framework/register_types_traits.h19
-rw-r--r--tensorflow/core/kernels/BUILD15
-rw-r--r--tensorflow/core/kernels/cast_op.cc56
-rw-r--r--tensorflow/core/kernels/cast_op_impl.h29
-rw-r--r--tensorflow/core/kernels/cast_op_impl_bool.cc10
-rw-r--r--tensorflow/core/kernels/cast_op_impl_double.cc10
-rw-r--r--tensorflow/core/kernels/cast_op_impl_float.cc10
-rw-r--r--tensorflow/core/kernels/cast_op_impl_int32.cc10
-rw-r--r--tensorflow/core/kernels/cast_op_impl_int64.cc12
-rw-r--r--tensorflow/core/kernels/cast_op_test.cc14
-rw-r--r--tensorflow/core/kernels/concat_lib.h8
-rw-r--r--tensorflow/core/kernels/concat_lib_cpu.cc19
-rw-r--r--tensorflow/core/kernels/concat_lib_cpu.h35
-rw-r--r--tensorflow/core/kernels/concat_op.cc50
-rw-r--r--tensorflow/core/kernels/constant_op.cc18
-rw-r--r--tensorflow/core/kernels/control_flow_ops.cc24
-rw-r--r--tensorflow/core/kernels/cwise_op_acos.cc1
-rw-r--r--tensorflow/core/kernels/cwise_op_add_1.cc15
-rw-r--r--tensorflow/core/kernels/cwise_op_asin.cc1
-rw-r--r--tensorflow/core/kernels/cwise_op_atan.cc1
-rw-r--r--tensorflow/core/kernels/cwise_op_ceil.cc1
-rw-r--r--tensorflow/core/kernels/cwise_op_cos.cc1
-rw-r--r--tensorflow/core/kernels/cwise_op_div.cc12
-rw-r--r--tensorflow/core/kernels/cwise_op_equal_to_1.cc12
-rw-r--r--tensorflow/core/kernels/cwise_op_expm1.cc3
-rw-r--r--tensorflow/core/kernels/cwise_op_floor.cc1
-rw-r--r--tensorflow/core/kernels/cwise_op_floor_div.cc21
-rw-r--r--tensorflow/core/kernels/cwise_op_floor_mod.cc10
-rw-r--r--tensorflow/core/kernels/cwise_op_greater.cc14
-rw-r--r--tensorflow/core/kernels/cwise_op_greater_equal.cc11
-rw-r--r--tensorflow/core/kernels/cwise_op_isfinite.cc1
-rw-r--r--tensorflow/core/kernels/cwise_op_isinf.cc1
-rw-r--r--tensorflow/core/kernels/cwise_op_isnan.cc1
-rw-r--r--tensorflow/core/kernels/cwise_op_less.cc10
-rw-r--r--tensorflow/core/kernels/cwise_op_less_equal.cc12
-rw-r--r--tensorflow/core/kernels/cwise_op_log.cc1
-rw-r--r--tensorflow/core/kernels/cwise_op_log1p.cc1
-rw-r--r--tensorflow/core/kernels/cwise_op_maximum.cc15
-rw-r--r--tensorflow/core/kernels/cwise_op_minimum.cc12
-rw-r--r--tensorflow/core/kernels/cwise_op_mul_1.cc8
-rw-r--r--tensorflow/core/kernels/cwise_op_pow.cc1
-rw-r--r--tensorflow/core/kernels/cwise_op_reciprocal.cc6
-rw-r--r--tensorflow/core/kernels/cwise_op_round.cc4
-rw-r--r--tensorflow/core/kernels/cwise_op_rsqrt.cc1
-rw-r--r--tensorflow/core/kernels/cwise_op_select.cc59
-rw-r--r--tensorflow/core/kernels/cwise_op_sigmoid.cc6
-rw-r--r--tensorflow/core/kernels/cwise_op_sign.cc13
-rw-r--r--tensorflow/core/kernels/cwise_op_sin.cc1
-rw-r--r--tensorflow/core/kernels/cwise_op_sqrt.cc3
-rw-r--r--tensorflow/core/kernels/cwise_op_square.cc1
-rw-r--r--tensorflow/core/kernels/cwise_op_tan.cc1
-rw-r--r--tensorflow/core/kernels/cwise_op_tanh.cc1
-rw-r--r--tensorflow/core/kernels/cwise_ops_gradients.h15
-rw-r--r--tensorflow/core/kernels/cwise_ops_test.cc51
-rw-r--r--tensorflow/core/kernels/debug_ops.cc2
-rw-r--r--tensorflow/core/kernels/dense_update_ops.cc1
-rw-r--r--tensorflow/core/kernels/fill_functor.cc2
-rw-r--r--tensorflow/core/kernels/function_ops.cc28
-rw-r--r--tensorflow/core/kernels/matmul_op.cc60
-rw-r--r--tensorflow/core/kernels/mkl_matmul_op.cc217
-rw-r--r--tensorflow/core/kernels/pack_op.cc1
-rw-r--r--tensorflow/core/kernels/pad_op.cc29
-rw-r--r--tensorflow/core/kernels/reduction_ops_common.h25
-rw-r--r--tensorflow/core/kernels/reduction_ops_max.cc23
-rw-r--r--tensorflow/core/kernels/reduction_ops_mean.cc13
-rw-r--r--tensorflow/core/kernels/reduction_ops_min.cc23
-rw-r--r--tensorflow/core/kernels/reduction_ops_prod.cc24
-rw-r--r--tensorflow/core/kernels/reduction_ops_sum.cc1
-rw-r--r--tensorflow/core/kernels/relu_op.cc29
-rw-r--r--tensorflow/core/kernels/relu_op.h4
-rw-r--r--tensorflow/core/kernels/resize_bicubic_op.cc2
-rw-r--r--tensorflow/core/kernels/reverse_op.cc35
-rw-r--r--tensorflow/core/kernels/scatter_op.cc4
-rw-r--r--tensorflow/core/kernels/sequence_ops.cc9
-rw-r--r--tensorflow/core/kernels/shape_ops.cc83
-rw-r--r--tensorflow/core/kernels/softmax_op.cc24
-rw-r--r--tensorflow/core/kernels/stage_op.cc6
-rw-r--r--tensorflow/core/kernels/strided_slice_op.cc67
-rw-r--r--tensorflow/core/kernels/strided_slice_op_impl.h14
-rw-r--r--tensorflow/core/kernels/tile_ops.cc31
-rw-r--r--tensorflow/core/kernels/tile_ops_cpu_impl.h4
-rw-r--r--tensorflow/core/kernels/training_ops.cc2
-rw-r--r--tensorflow/core/kernels/transpose_functor_cpu.cc1
-rw-r--r--tensorflow/core/kernels/transpose_op.cc29
-rw-r--r--tensorflow/core/kernels/transpose_op.h11
-rw-r--r--tensorflow/core/kernels/unpack_op.cc1
-rw-r--r--tensorflow/core/kernels/variable_ops.cc3
-rw-r--r--tensorflow/core/kernels/xent_op.cc26
-rw-r--r--tensorflow/core/ops/math_grad_test.cc8
-rw-r--r--tensorflow/core/ops/ops.pbtxt53
-rw-r--r--tensorflow/core/public/version.h2
-rw-r--r--tensorflow/docs_src/programmers_guide/meta_graph.md2
-rw-r--r--tensorflow/examples/image_retraining/retrain.py3
-rw-r--r--tensorflow/examples/learn/mnist.py4
-rw-r--r--tensorflow/examples/learn/text_classification.py9
-rw-r--r--tensorflow/examples/tutorials/word2vec/word2vec_basic.py2
-rw-r--r--tensorflow/examples/udacity/README.md7
-rw-r--r--tensorflow/python/client/device_lib_test.py2
-rw-r--r--tensorflow/python/framework/test_util.py21
-rw-r--r--tensorflow/python/kernel_tests/stage_op_test.py4
-rw-r--r--tensorflow/python/kernel_tests/variables_test.py8
-rw-r--r--tensorflow/python/kernel_tests/xent_op_test.py35
-rw-r--r--tensorflow/python/ops/array_ops.py6
-rw-r--r--tensorflow/python/ops/clip_ops.py6
-rw-r--r--tensorflow/python/ops/nn_grad.py26
-rw-r--r--tensorflow/python/ops/rnn.py2
-rw-r--r--tensorflow/python/ops/special_math_ops.py54
-rw-r--r--tensorflow/python/ops/special_math_ops_test.py14
-rw-r--r--tensorflow/python/ops/variables.py7
-rw-r--r--tensorflow/python/platform/test.py10
-rw-r--r--tensorflow/stream_executor/cuda/cuda_diagnostics.cc8
-rw-r--r--tensorflow/tools/ci_build/README.md4
-rwxr-xr-xtensorflow/tools/ci_build/builds/pip.sh30
-rwxr-xr-xtensorflow/tools/ci_build/ci_build.sh2
-rwxr-xr-xtensorflow/tools/ci_build/ci_parameterized_build.sh19
-rwxr-xr-xtensorflow/tools/ci_build/install/install_buildifier.sh5
-rwxr-xr-xtensorflow/tools/docker/parameterized_docker_build.sh2
-rw-r--r--tensorflow/tools/graph_transforms/README.md4
-rw-r--r--tensorflow/tools/pip_package/setup.py2
-rw-r--r--tensorflow/workspace.bzl8
-rw-r--r--third_party/curl.BUILD46
-rw-r--r--third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX2.h98
-rw-r--r--third_party/mkl/BUILD11
-rw-r--r--third_party/nccl.BUILD18
-rwxr-xr-xthird_party/sycl/crosstool/computecpp.tpl33
-rw-r--r--tools/bazel.rc.template2
149 files changed, 2122 insertions, 428 deletions
diff --git a/README.md b/README.md
index 2dc3dcb3e6..c99e9350dd 100644
--- a/README.md
+++ b/README.md
@@ -33,11 +33,12 @@ and discussion.**
People who are a little more adventurous can also try our nightly binaries:
-* Linux CPU-only: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.0.0rc2-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave)) / [Python 3.4](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.0.0rc2-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/)) / [Python 3.5](https://ci.tensorflow.org/view/Nightly/job/nightly-python35-linux-cpu/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.0.0rc2-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-python35-linux-cpu/))
-* Linux GPU: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.0.0rc2-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/)) / [Python 3.4](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.0.0rc2-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/)) / [Python 3.5](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.0.0rc2-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/))
-* Mac CPU-only: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.0.0rc2-py2-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac-slave/)) / [Python 3](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.0.0rc2-py3-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac-slave/))
-* Mac GPU: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-mac/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.0.0rc2-py2-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-mac/)) / [Python 3](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-mac/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.0.0rc2-py3-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-mac/))
-* [Android](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-android/TF_BUILD_CONTAINER_TYPE=ANDROID,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=NO_PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=android-slave/lastSuccessfulBuild/artifact/bazel-out/local_linux/bin/tensorflow/examples/android/tensorflow_demo.apk) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-android/TF_BUILD_CONTAINER_TYPE=ANDROID,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=NO_PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=android-slave/))
+* Linux CPU-only: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.0.0-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave)) / [Python 3.5](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.0.0-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave))
+* Linux GPU: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.0.0-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/)) / [Python 3.5](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.0.0-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/))
+* Mac CPU-only: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.0.0-py2-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac-slave/)) / [Python 3](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.0.0-py3-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac-slave/))
+* Mac GPU: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-mac/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.0.0-py2-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-mac/)) / [Python 3](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-mac/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.0.0-py3-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-mac/))
+* Windows CPU-only: [Python 3.5 64-bit](https://ci.tensorflow.org/view/Nightly/job/nightly-win/DEVICE=cpu,OS=windows/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tensorflow-1.0.0-cp35-cp35m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-win/DEVICE=cpu,OS=windows/))
+* Windows GPU: [Python 3.5 64-bit](https://ci.tensorflow.org/view/Nightly/job/nightly-win/DEVICE=gpu,OS=windows/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tensorflow_gpu-1.0.0-cp35-cp35m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-win/DEVICE=gpu,OS=windows/))
* Android: [demo APK](https://ci.tensorflow.org/view/Nightly/job/nightly-android/lastSuccessfulBuild/artifact/out/tensorflow_demo.apk), [native libs](http://ci.tensorflow.org/view/Nightly/job/nightly-android/lastSuccessfulBuild/artifact/out/native/)
([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-android/))
diff --git a/RELEASE.md b/RELEASE.md
index e9f0361511..f7f5e37bd7 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -89,6 +89,7 @@ To help you upgrade your existing TensorFlow Python code to match the API change
* Change arg order for `{softmax,sparse_softmax,sigmoid}_cross_entropy_with_logits` to be (labels, predictions), and force use of named args.
## Bug Fixes and Other Changes
+* Numerous C++ API updates.
* New op: `parallel_stack`.
* Introducing common tf io compression options constants for
RecordReader/RecordWriter.
@@ -127,6 +128,7 @@ To help you upgrade your existing TensorFlow Python code to match the API change
* `tf.divide` now honors the name field.
* Make metrics weight broadcasting more strict.
* Add new queue-like `StagingArea` and new ops: `stage` and `unstage`.
+* Enable inplace update ops for strings on CPU. Speed up string concat.
## Thanks to our Contributors
diff --git a/tensorflow/cc/training/coordinator.h b/tensorflow/cc/training/coordinator.h
index dbcf072015..1b107e2d06 100644
--- a/tensorflow/cc/training/coordinator.h
+++ b/tensorflow/cc/training/coordinator.h
@@ -115,8 +115,6 @@ class Coordinator {
std::vector<std::unique_ptr<RunnerInterface>> runners_
GUARDED_BY(runners_lock_);
- std::atomic<int> num_runners_to_cancel_;
-
TF_DISALLOW_COPY_AND_ASSIGN(Coordinator);
};
diff --git a/tensorflow/compiler/tf2xla/kernels/binary_ops.cc b/tensorflow/compiler/tf2xla/kernels/binary_ops.cc
index 39c5567f80..1f9ac029c7 100644
--- a/tensorflow/compiler/tf2xla/kernels/binary_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/binary_ops.cc
@@ -127,32 +127,21 @@ XLA_MAKE_BINARY(GreaterEqual, b->Ge(lhs, rhs, extend_dimensions));
XLA_MAKE_BINARY(Less, b->Lt(lhs, rhs, extend_dimensions));
XLA_MAKE_BINARY(LessEqual, b->Le(lhs, rhs, extend_dimensions));
-#undef XLA_MAKE_BINARY
+// Non-linear ops
+XLA_MAKE_BINARY(SigmoidGrad,
+ b->Mul(b->Mul(rhs, lhs),
+ b->Sub(XlaHelpers::One(b, input_type(0)), lhs)));
-#define XLA_MAKE_BINARY_MAP(Name, HLO) \
- class Name##Op : public XlaBinaryMapOp { \
- public: \
- explicit Name##Op(OpKernelConstruction* ctx) : XlaBinaryMapOp(ctx) {} \
- void BuildMapLambda(xla::ComputationBuilder* b, \
- const xla::ComputationDataHandle& lhs, \
- const xla::ComputationDataHandle& rhs) override { \
- HLO; \
- } \
- }; \
- REGISTER_XLA_OP(#Name, Name##Op)
+XLA_MAKE_BINARY(SoftplusGrad,
+ b->Div(lhs, b->Add(b->Exp(b->Neg(rhs)),
+ XlaHelpers::One(b, input_type(1)))));
+
+XLA_MAKE_BINARY(TanhGrad, b->Mul(rhs, b->Sub(XlaHelpers::One(b, input_type(0)),
+ b->Mul(lhs, lhs))));
-XLA_MAKE_BINARY_MAP(Pow, b->Pow(lhs, rhs));
-XLA_MAKE_BINARY_MAP(SigmoidGrad,
- b->Mul(b->Mul(rhs, lhs),
- b->Sub(XlaHelpers::One(b, input_type(0)), lhs)));
-XLA_MAKE_BINARY_MAP(SoftplusGrad,
- b->Div(lhs, b->Add(b->Exp(b->Neg(rhs)),
- XlaHelpers::One(b, input_type(1)))));
-XLA_MAKE_BINARY_MAP(TanhGrad,
- b->Mul(rhs, b->Sub(XlaHelpers::One(b, input_type(0)),
- b->Mul(lhs, lhs))));
-
-#undef XLA_MAKE_BINARY_MAP
+XLA_MAKE_BINARY(Pow, b->Pow(lhs, rhs, extend_dimensions));
+
+#undef XLA_MAKE_BINARY
} // namespace
} // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/cwise_ops.h b/tensorflow/compiler/tf2xla/kernels/cwise_ops.h
index f0687c1d4b..ba38693325 100644
--- a/tensorflow/compiler/tf2xla/kernels/cwise_ops.h
+++ b/tensorflow/compiler/tf2xla/kernels/cwise_ops.h
@@ -32,9 +32,7 @@ namespace tensorflow {
// description of the operation; and Computation adds the
// implementation of the operation to a xla::ComputationBuilder. For most
// arithmetic Ops XLA handles the broadcasting automatically given the input
-// tensors. Ops like ReluGrad that need to map a scalar function over the inputs
-// can use the XlaBinaryMapOp subclass below which handles manual
-// broadcasting of the inputs.
+// tensors.
class XlaBinaryOp : public XlaOpKernel {
public:
explicit XlaBinaryOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
@@ -83,6 +81,8 @@ class XlaBinaryOp : public XlaOpKernel {
// virtual methods to override: description is a textual description
// of the mapped function; and BuildMapLambda adds the
// implementation of the lambda to a xla::ComputationBuilder.
+// Operations may have better performance if implemented as graphs of
+// element-wise tensor operations.
class XlaBinaryMapOp : public XlaBinaryOp {
public:
explicit XlaBinaryMapOp(OpKernelConstruction* ctx) : XlaBinaryOp(ctx) {}
diff --git a/tensorflow/compiler/tf2xla/kernels/relu_op.cc b/tensorflow/compiler/tf2xla/kernels/relu_op.cc
index 2dea4032c0..d1b857c22a 100644
--- a/tensorflow/compiler/tf2xla/kernels/relu_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/relu_op.cc
@@ -50,36 +50,37 @@ class Relu6Op : public XlaOpKernel {
}
};
-// A subclass of a XlaBinaryMapOp must build the lambda computation
-// that describes the (scalar,scalar)->scalar function to apply to
-// each element of the input. We have to use XlaBinaryMapOp instead of
-// XlaBinaryOp here because XLA Select does not do automatic
-// broadcasting.
-class ReluGradOp : public XlaBinaryMapOp {
+class ReluGradOp : public XlaOpKernel {
public:
- explicit ReluGradOp(OpKernelConstruction* ctx) : XlaBinaryMapOp(ctx) {}
+ explicit ReluGradOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {}
// Return the lhs (incoming gradient) if the rhs (input feature) > 0,
// otherwise return 0.
- void BuildMapLambda(xla::ComputationBuilder* b,
- const xla::ComputationDataHandle& gradient,
- const xla::ComputationDataHandle& feature) override {
- const auto zero = XlaHelpers::Zero(b, input_type(0));
- b->Select(b->Gt(feature, zero), gradient, zero);
+ void Compile(XlaOpKernelContext* ctx) {
+ xla::ComputationBuilder* b = ctx->builder();
+ const TensorShape shape = ctx->InputShape(0);
+ const auto zero =
+ b->Broadcast(XlaHelpers::Zero(b, input_type(0)), shape.dim_sizes());
+ const auto pred = b->Gt(ctx->Input(1), zero);
+ ctx->SetOutput(0, b->Select(pred, ctx->Input(0), zero));
}
};
-class Relu6GradOp : public XlaBinaryMapOp {
+class Relu6GradOp : public XlaOpKernel {
public:
- explicit Relu6GradOp(OpKernelConstruction* ctx) : XlaBinaryMapOp(ctx) {}
+ explicit Relu6GradOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {}
// Return the lhs (incoming gradient) if the rhs (input feature) > 0,
// otherwise return 0.
- void BuildMapLambda(xla::ComputationBuilder* b,
- const xla::ComputationDataHandle& gradient,
- const xla::ComputationDataHandle& feature) override {
- const auto zero = XlaHelpers::Zero(b, input_type(0));
- auto six = XlaHelpers::IntegerLiteral(b, input_type(0), 6);
- b->Select(b->LogicalAnd(b->Lt(feature, six), b->Gt(feature, zero)),
- gradient, zero);
+ void Compile(XlaOpKernelContext* ctx) {
+ xla::ComputationBuilder* b = ctx->builder();
+ const TensorShape shape = ctx->InputShape(0);
+ const auto zero =
+ b->Broadcast(XlaHelpers::Zero(b, input_type(0)), shape.dim_sizes());
+ const auto six = b->Broadcast(
+ XlaHelpers::IntegerLiteral(b, input_type(0), 6), shape.dim_sizes());
+ auto out = b->Select(
+ b->LogicalAnd(b->Lt(ctx->Input(1), six), b->Gt(ctx->Input(1), zero)),
+ ctx->Input(0), zero);
+ ctx->SetOutput(0, out);
}
};
diff --git a/tensorflow/compiler/xla/client/computation_builder.cc b/tensorflow/compiler/xla/client/computation_builder.cc
index ae7695ade5..2ee6b51c18 100644
--- a/tensorflow/compiler/xla/client/computation_builder.cc
+++ b/tensorflow/compiler/xla/client/computation_builder.cc
@@ -1010,8 +1010,9 @@ ComputationDataHandle ComputationBuilder::SqrtF32(
}
ComputationDataHandle ComputationBuilder::Pow(
- const ComputationDataHandle& lhs, const ComputationDataHandle& rhs) {
- return BinaryOp(BINOP_POW, lhs, rhs, /*broadcast_dimensions=*/{});
+ const ComputationDataHandle& lhs, const ComputationDataHandle& rhs,
+ tensorflow::gtl::ArraySlice<int64> broadcast_dimensions) {
+ return BinaryOp(BINOP_POW, lhs, rhs, broadcast_dimensions);
}
ComputationDataHandle ComputationBuilder::ConvertElementType(
diff --git a/tensorflow/compiler/xla/client/computation_builder.h b/tensorflow/compiler/xla/client/computation_builder.h
index a49e5a8843..98649ba02f 100644
--- a/tensorflow/compiler/xla/client/computation_builder.h
+++ b/tensorflow/compiler/xla/client/computation_builder.h
@@ -504,8 +504,9 @@ class ComputationBuilder {
ComputationDataHandle SquareF32(const ComputationDataHandle& operand);
// Enqueues a lhs^rhs computation onto the computation.
- ComputationDataHandle Pow(const ComputationDataHandle& lhs,
- const ComputationDataHandle& rhs);
+ ComputationDataHandle Pow(
+ const ComputationDataHandle& lhs, const ComputationDataHandle& rhs,
+ tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
// Enqueues an operator that tests if the operand's values are finite, i.e.,
// not Inf or NaN. Defined only for floating-point types. Returns an array of
diff --git a/tensorflow/contrib/cmake/external/eigen.cmake b/tensorflow/contrib/cmake/external/eigen.cmake
index a2f5774b9f..7dd7dd4070 100644
--- a/tensorflow/contrib/cmake/external/eigen.cmake
+++ b/tensorflow/contrib/cmake/external/eigen.cmake
@@ -36,4 +36,5 @@ ExternalProject_Add(eigen
-DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF
-DCMAKE_INSTALL_PREFIX:STRING=${eigen_INSTALL}
-DINCLUDE_INSTALL_DIR:STRING=${CMAKE_CURRENT_BINARY_DIR}/external/eigen_archive
+ -DBUILD_TESTING:BOOL=OFF
)
diff --git a/tensorflow/contrib/cmake/tf_tests.cmake b/tensorflow/contrib/cmake/tf_tests.cmake
index 84107706ca..7762328cfe 100644
--- a/tensorflow/contrib/cmake/tf_tests.cmake
+++ b/tensorflow/contrib/cmake/tf_tests.cmake
@@ -163,6 +163,7 @@ if (tensorflow_BUILD_PYTHON_TESTS)
"${tensorflow_source_dir}/tensorflow/python/kernel_tests/py_func_test.py"
# training tests
"${tensorflow_source_dir}/tensorflow/python/training/basic_session_run_hooks_test.py" # Needs tf.contrib fix.
+ "${tensorflow_source_dir}/tensorflow/python/training/evaluation_test.py" # Needs tf.contrib fix.
"${tensorflow_source_dir}/tensorflow/python/training/localhost_cluster_performance_test.py" # Needs portpicker.
"${tensorflow_source_dir}/tensorflow/python/training/monitored_session_test.py" # Needs tf.contrib fix.
"${tensorflow_source_dir}/tensorflow/python/training/saver_large_variable_test.py" # Overflow error.
diff --git a/tensorflow/contrib/hvx/README.md b/tensorflow/contrib/hvx/README.md
index ef9c8b2e8e..6f64cbee3f 100644
--- a/tensorflow/contrib/hvx/README.md
+++ b/tensorflow/contrib/hvx/README.md
@@ -108,3 +108,29 @@ Finally, you can run the inference tests on your device.
adb shell 'LD_LIBRARY_PATH=/data/local/tmp:$LD_LIBRARY_PATH' \
"/data/local/tmp/hexagon_graph_execution"
```
+
+#### Troubleshooting
+If you're using the Open-Q 820 Snapdragon development kit, you may run into an issue with running the executable due to a missing testsig library. From the Hexagon SDK documentation: *Dynamic shared objects are required to be digitally signed and then authenticated at runtime before they are allowed to be loaded and executed.* Generating a testsig library is necessary to run the unsigned sample library built from this project.
+
+If the lack of a testsig library is your problem, you will see errors of the type:
+`vendor/qcom/proprietary/adsprpc/src/fastrpc_apps_user.c:169::error: -1: 0 == (nErr = remotectl_open(name, (int*)ph, dlerrstr, sizeof(dlerrstr), &dlerr))`
+appearing in adb logcat.
+
+There are several ways to create the testsig library, the only prerequisite is Python and the correct version of the Hexagon-SDK. The following steps is one way to create this library:
+1. Run adb as root: `adb root`
+2. Run the command `adb shell cat /sys/devices/soc0/serial_number`
+3. Convert the decimal number you get as output to hex
+4. Run the python script: `python ${QUALCOMM_SDK}/tools/elfsigner/elfsigner.py -t $(SERIAL_NUMBER_HEX_VALUE)`
+5. The output of the python script is a shared library stored in ${QUALCOMM_SDK}/tools/elfsigner/output/testsig-$(SERIAL_NUMBER_HEX_VALUE).so
+6. Push the shared library to your device:
+```
+adb root
+adb wait-for-device
+adb remount
+adb wait-for-device
+adb shell mkdir /system/lib/rfsa
+adb shell mkdir /system/lib/rfsa/adsp
+adb push ${QUALCOMM_SDK}/tools/elfsigner/output/testsig-$(SERIAL_NUMBER_HEX_VALUE).so /system/lib/rfsa/adsp/
+```
+
+After rebooting your device, you should be able to run the sample application.
diff --git a/tensorflow/contrib/learn/python/learn/estimators/dnn.py b/tensorflow/contrib/learn/python/learn/estimators/dnn.py
index bfe1cd0aed..f9ba6711e6 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/dnn.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/dnn.py
@@ -748,15 +748,16 @@ class _DNNEstimator(estimator.Estimator):
To create a _DNNEstimator for binary classification, where
estimator = _DNNEstimator(
feature_columns=[sparse_feature_a_emb, sparse_feature_b_emb],
- head=head=head_lib._multi_class__head(n_classes=2),
+ head=head_lib._multi_class__head(n_classes=2),
hidden_units=[1024, 512, 256])
If your label is keyed with "y" in your labels dict, and weights are keyed
with "w" in features dict, and you want to enable centered bias,
- head=head_lib._multi_class__head(n_classes=2,
- label_name="x"
- weight_column_name="w",
- enable_centered_bias=True)
+ head = head_lib._multi_class__head(
+ n_classes=2,
+ label_name="x",
+ weight_column_name="w",
+ enable_centered_bias=True)
estimator = _DNNEstimator(
feature_columns=[sparse_feature_a_emb, sparse_feature_b_emb],
head=head,
diff --git a/tensorflow/contrib/rnn/python/ops/lstm_ops.py b/tensorflow/contrib/rnn/python/ops/lstm_ops.py
index 2e6f2ac05c..c1ec46d763 100644
--- a/tensorflow/contrib/rnn/python/ops/lstm_ops.py
+++ b/tensorflow/contrib/rnn/python/ops/lstm_ops.py
@@ -362,7 +362,7 @@ class LSTMBlockCell(core_rnn_cell.RNNCell):
@property
def state_size(self):
- return (self._num_units,) * 2
+ return core_rnn_cell.LSTMStateTuple(self._num_units, self._num_units)
@property
def output_size(self):
@@ -401,7 +401,8 @@ class LSTMBlockCell(core_rnn_cell.RNNCell):
forget_bias=self._forget_bias,
use_peephole=self._use_peephole)
- return (h, (cs, h))
+ new_state = core_rnn_cell.LSTMStateTuple(cs, h)
+ return h, new_state
class LSTMBlockWrapper(fused_rnn_cell.FusedRNNCell):
@@ -544,7 +545,9 @@ class LSTMBlockWrapper(fused_rnn_cell.FusedRNNCell):
# Input was a list, so return a list
outputs = array_ops.unstack(outputs)
- return outputs, (final_cell_state, final_output)
+ final_state = core_rnn_cell.LSTMStateTuple(final_cell_state,
+ final_output)
+ return outputs, final_state
def _gather_states(self, data, indices, batch_size):
"""Produce `out`, s.t. out(i, j) = data(indices(i), i, j)."""
diff --git a/tensorflow/contrib/training/python/training/evaluation.py b/tensorflow/contrib/training/python/training/evaluation.py
index c14f4ee440..212f2c7a7b 100644
--- a/tensorflow/contrib/training/python/training/evaluation.py
+++ b/tensorflow/contrib/training/python/training/evaluation.py
@@ -410,8 +410,7 @@ def evaluate_repeatedly(checkpoint_dir,
'%Y-%m-%d-%H:%M:%S', time.gmtime()))
num_evaluations += 1
- reached_max = num_evaluations >= max_number_of_evaluations
- if max_number_of_evaluations and reached_max:
+ if max_number_of_evaluations is not None and num_evaluations >= max_number_of_evaluations:
return final_ops_hook.final_ops_values
logging.info('Timed-out waiting for a checkpoint.')
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 2f8bacca85..b59b749182 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -117,6 +117,8 @@ load(
"//third_party/mkl:build_defs.bzl",
"if_mkl",
)
+load("@local_config_sycl//sycl:build_defs.bzl", "if_sycl")
+
# -----------------------------------------------------------------------------
# Public targets
@@ -686,6 +688,8 @@ cc_library(
"//tensorflow/core/kernels:array_not_windows",
"//tensorflow/core/kernels:math_not_windows",
"//tensorflow/core/kernels:quantized_ops",
+ ]) + if_mkl([
+ "//tensorflow/core/kernels:mkl_ops",
]),
)
@@ -745,6 +749,7 @@ cc_library(
":proto_text",
":protos_all_cc",
":shape_inference_testutil",
+ ":sycl_runtime",
":tensor_testutil",
":test",
"//tensorflow/core/kernels:constant_op",
diff --git a/tensorflow/core/common_runtime/sycl/sycl_allocator.cc b/tensorflow/core/common_runtime/sycl/sycl_allocator.cc
index 0d238276f4..b7ef9361e9 100644
--- a/tensorflow/core/common_runtime/sycl/sycl_allocator.cc
+++ b/tensorflow/core/common_runtime/sycl/sycl_allocator.cc
@@ -25,6 +25,9 @@ string SYCLAllocator::Name() { return "device:SYCL"; }
void *SYCLAllocator::AllocateRaw(size_t alignment, size_t num_bytes) {
assert(device_);
+ if (num_bytes == 0) {
+ return device_->allocate(1);
+ }
auto p = device_->allocate(num_bytes);
return p;
}
@@ -42,6 +45,6 @@ void SYCLAllocator::EnterLameDuckMode() {
}
}
-} // namespace tensorflow
+} // namespace tensorflow
-#endif // TENSORFLOW_USE_SYCL
+#endif // TENSORFLOW_USE_SYCL
diff --git a/tensorflow/core/common_runtime/sycl/sycl_allocator.h b/tensorflow/core/common_runtime/sycl/sycl_allocator.h
index c896f7f603..15d9ab41a4 100644
--- a/tensorflow/core/common_runtime/sycl/sycl_allocator.h
+++ b/tensorflow/core/common_runtime/sycl/sycl_allocator.h
@@ -27,8 +27,8 @@ limitations under the License.
namespace tensorflow {
class SYCLAllocator : public Allocator {
-public:
- SYCLAllocator(Eigen::QueueInterface* device) : device_(device) {}
+ public:
+ SYCLAllocator(Eigen::QueueInterface *device) : device_(device) {}
virtual ~SYCLAllocator() override;
string Name() override;
void *AllocateRaw(size_t alignment, size_t num_bytes) override;
@@ -36,11 +36,12 @@ public:
void EnterLameDuckMode();
virtual bool ShouldAllocateEmptyTensors() override final { return true; }
-private:
+
+ private:
Eigen::QueueInterface *device_; // not owned
TF_DISALLOW_COPY_AND_ASSIGN(SYCLAllocator);
};
-} // namespace tensorflow
+} // namespace tensorflow
-#endif // TENSORFLOW_COMMON_RUNTIME_SYCL_SYCL_ALLOCATOR_H_
+#endif // TENSORFLOW_COMMON_RUNTIME_SYCL_SYCL_ALLOCATOR_H_
diff --git a/tensorflow/core/common_runtime/sycl/sycl_device.cc b/tensorflow/core/common_runtime/sycl/sycl_device.cc
index 0abe25c373..2c2185b2c0 100644
--- a/tensorflow/core/common_runtime/sycl/sycl_device.cc
+++ b/tensorflow/core/common_runtime/sycl/sycl_device.cc
@@ -23,7 +23,7 @@ limitations under the License.
namespace tensorflow {
-static std::unordered_set<SYCLDevice*> live_devices;
+static std::unordered_set<SYCLDevice *> live_devices;
static bool first_time = true;
void ShutdownSycl() {
diff --git a/tensorflow/core/common_runtime/sycl/sycl_device.h b/tensorflow/core/common_runtime/sycl/sycl_device.h
index b5a72d9476..a5c7c5f0ec 100644
--- a/tensorflow/core/common_runtime/sycl/sycl_device.h
+++ b/tensorflow/core/common_runtime/sycl/sycl_device.h
@@ -34,10 +34,11 @@ class SYCLDevice : public LocalDevice {
Bytes memory_limit, const DeviceLocality &locality,
const string &physical_device_desc, SYCLSelector sycl_selector,
Allocator *cpu_allocator)
- : LocalDevice(options, Device::BuildDeviceAttributes(
- name, DEVICE_SYCL, memory_limit, locality,
- physical_device_desc),
- nullptr),
+ : LocalDevice(
+ options,
+ Device::BuildDeviceAttributes(name, DEVICE_SYCL, memory_limit,
+ locality, physical_device_desc),
+ nullptr),
cpu_allocator_(cpu_allocator),
sycl_queue_(new Eigen::QueueInterface(sycl_selector)),
sycl_device_(new Eigen::SyclDevice(sycl_queue_)),
diff --git a/tensorflow/core/common_runtime/sycl/sycl_device_context.cc b/tensorflow/core/common_runtime/sycl/sycl_device_context.cc
index a6be9195d4..1c868f5606 100644
--- a/tensorflow/core/common_runtime/sycl/sycl_device_context.cc
+++ b/tensorflow/core/common_runtime/sycl/sycl_device_context.cc
@@ -17,8 +17,8 @@ limitations under the License.
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
-#include "tensorflow/core/common_runtime/sycl/sycl_device_context.h"
#include "tensorflow/core/common_runtime/dma_helper.h"
+#include "tensorflow/core/common_runtime/sycl/sycl_device_context.h"
namespace tensorflow {
@@ -31,68 +31,68 @@ void SYCLDeviceContext::CopyCPUTensorToDevice(const Tensor *cpu_tensor,
const void *src_ptr = DMAHelper::base(cpu_tensor);
void *dst_ptr = DMAHelper::base(device_tensor);
switch (cpu_tensor->dtype()) {
- case DT_FLOAT:
- device->eigen_sycl_device()->memcpyHostToDevice(
- static_cast<float *>(dst_ptr), static_cast<const float *>(src_ptr),
- total_bytes);
- break;
- case DT_DOUBLE:
- device->eigen_sycl_device()->memcpyHostToDevice(
- static_cast<double *>(dst_ptr), static_cast<const double *>(src_ptr),
- total_bytes);
- break;
- case DT_INT32:
- device->eigen_sycl_device()->memcpyHostToDevice(
- static_cast<int32 *>(dst_ptr), static_cast<const int32 *>(src_ptr),
- total_bytes);
- break;
- case DT_INT64:
- device->eigen_sycl_device()->memcpyHostToDevice(
- static_cast<int64 *>(dst_ptr), static_cast<const int64 *>(src_ptr),
- total_bytes);
- break;
- case DT_HALF:
- device->eigen_sycl_device()->memcpyHostToDevice(
- static_cast<Eigen::half *>(dst_ptr),
- static_cast<const Eigen::half *>(src_ptr), total_bytes);
- break;
- case DT_COMPLEX64:
- device->eigen_sycl_device()->memcpyHostToDevice(
- static_cast<std::complex<float> *>(dst_ptr),
- static_cast<const std::complex<float> *>(src_ptr), total_bytes);
- break;
- case DT_COMPLEX128:
- device->eigen_sycl_device()->memcpyHostToDevice(
- static_cast<std::complex<double> *>(dst_ptr),
- static_cast<const std::complex<double> *>(src_ptr), total_bytes);
- break;
- case DT_INT8:
- device->eigen_sycl_device()->memcpyHostToDevice(
- static_cast<int8 *>(dst_ptr), static_cast<const int8 *>(src_ptr),
- total_bytes);
- break;
- case DT_INT16:
- device->eigen_sycl_device()->memcpyHostToDevice(
- static_cast<int16 *>(dst_ptr), static_cast<const int16 *>(src_ptr),
- total_bytes);
- break;
- case DT_UINT8:
- device->eigen_sycl_device()->memcpyHostToDevice(
- static_cast<uint8 *>(dst_ptr), static_cast<const uint8 *>(src_ptr),
- total_bytes);
- break;
- case DT_UINT16:
- device->eigen_sycl_device()->memcpyHostToDevice(
- static_cast<uint16 *>(dst_ptr), static_cast<const uint16 *>(src_ptr),
- total_bytes);
- break;
- case DT_BOOL:
- device->eigen_sycl_device()->memcpyHostToDevice(
- static_cast<bool *>(dst_ptr), static_cast<const bool *>(src_ptr),
- total_bytes);
- break;
- default:
- assert(false && "unsupported type");
+ case DT_FLOAT:
+ device->eigen_sycl_device()->memcpyHostToDevice(
+ static_cast<float *>(dst_ptr), static_cast<const float *>(src_ptr),
+ total_bytes);
+ break;
+ case DT_DOUBLE:
+ device->eigen_sycl_device()->memcpyHostToDevice(
+ static_cast<double *>(dst_ptr),
+ static_cast<const double *>(src_ptr), total_bytes);
+ break;
+ case DT_INT32:
+ device->eigen_sycl_device()->memcpyHostToDevice(
+ static_cast<int32 *>(dst_ptr), static_cast<const int32 *>(src_ptr),
+ total_bytes);
+ break;
+ case DT_INT64:
+ device->eigen_sycl_device()->memcpyHostToDevice(
+ static_cast<int64 *>(dst_ptr), static_cast<const int64 *>(src_ptr),
+ total_bytes);
+ break;
+ case DT_HALF:
+ device->eigen_sycl_device()->memcpyHostToDevice(
+ static_cast<Eigen::half *>(dst_ptr),
+ static_cast<const Eigen::half *>(src_ptr), total_bytes);
+ break;
+ case DT_COMPLEX64:
+ device->eigen_sycl_device()->memcpyHostToDevice(
+ static_cast<std::complex<float> *>(dst_ptr),
+ static_cast<const std::complex<float> *>(src_ptr), total_bytes);
+ break;
+ case DT_COMPLEX128:
+ device->eigen_sycl_device()->memcpyHostToDevice(
+ static_cast<std::complex<double> *>(dst_ptr),
+ static_cast<const std::complex<double> *>(src_ptr), total_bytes);
+ break;
+ case DT_INT8:
+ device->eigen_sycl_device()->memcpyHostToDevice(
+ static_cast<int8 *>(dst_ptr), static_cast<const int8 *>(src_ptr),
+ total_bytes);
+ break;
+ case DT_INT16:
+ device->eigen_sycl_device()->memcpyHostToDevice(
+ static_cast<int16 *>(dst_ptr), static_cast<const int16 *>(src_ptr),
+ total_bytes);
+ break;
+ case DT_UINT8:
+ device->eigen_sycl_device()->memcpyHostToDevice(
+ static_cast<uint8 *>(dst_ptr), static_cast<const uint8 *>(src_ptr),
+ total_bytes);
+ break;
+ case DT_UINT16:
+ device->eigen_sycl_device()->memcpyHostToDevice(
+ static_cast<uint16 *>(dst_ptr),
+ static_cast<const uint16 *>(src_ptr), total_bytes);
+ break;
+ case DT_BOOL:
+ device->eigen_sycl_device()->memcpyHostToDevice(
+ static_cast<bool *>(dst_ptr), static_cast<const bool *>(src_ptr),
+ total_bytes);
+ break;
+ default:
+ assert(false && "unsupported type");
}
}
device->eigen_sycl_device()->synchronize();
@@ -106,71 +106,71 @@ void SYCLDeviceContext::CopyDeviceTensorToCPU(const Tensor *device_tensor,
StatusCallback done) {
const int64 total_bytes = device_tensor->TotalBytes();
if (total_bytes > 0) {
- const void* src_ptr = DMAHelper::base(device_tensor);
- void* dst_ptr = DMAHelper::base(cpu_tensor);
+ const void *src_ptr = DMAHelper::base(device_tensor);
+ void *dst_ptr = DMAHelper::base(cpu_tensor);
switch (device_tensor->dtype()) {
- case DT_FLOAT:
- device->eigen_sycl_device()->memcpyDeviceToHost(
- static_cast<float *>(dst_ptr), static_cast<const float *>(src_ptr),
- total_bytes);
- break;
- case DT_DOUBLE:
- device->eigen_sycl_device()->memcpyDeviceToHost(
- static_cast<double *>(dst_ptr), static_cast<const double *>(src_ptr),
- total_bytes);
- break;
- case DT_INT32:
- device->eigen_sycl_device()->memcpyDeviceToHost(
- static_cast<int32 *>(dst_ptr), static_cast<const int32 *>(src_ptr),
- total_bytes);
- break;
- case DT_INT64:
- device->eigen_sycl_device()->memcpyDeviceToHost(
- static_cast<int64 *>(dst_ptr), static_cast<const int64 *>(src_ptr),
- total_bytes);
- break;
- case DT_HALF:
- device->eigen_sycl_device()->memcpyDeviceToHost(
- static_cast<Eigen::half *>(dst_ptr),
- static_cast<const Eigen::half *>(src_ptr), total_bytes);
- break;
- case DT_COMPLEX64:
- device->eigen_sycl_device()->memcpyDeviceToHost(
- static_cast<std::complex<float> *>(dst_ptr),
- static_cast<const std::complex<float> *>(src_ptr), total_bytes);
- break;
- case DT_COMPLEX128:
- device->eigen_sycl_device()->memcpyDeviceToHost(
- static_cast<std::complex<double> *>(dst_ptr),
- static_cast<const std::complex<double> *>(src_ptr), total_bytes);
- break;
- case DT_INT8:
- device->eigen_sycl_device()->memcpyDeviceToHost(
- static_cast<int8 *>(dst_ptr), static_cast<const int8 *>(src_ptr),
- total_bytes);
- break;
- case DT_INT16:
- device->eigen_sycl_device()->memcpyDeviceToHost(
- static_cast<int16 *>(dst_ptr), static_cast<const int16 *>(src_ptr),
- total_bytes);
- break;
- case DT_UINT8:
- device->eigen_sycl_device()->memcpyDeviceToHost(
- static_cast<uint8 *>(dst_ptr), static_cast<const uint8 *>(src_ptr),
- total_bytes);
- break;
- case DT_UINT16:
- device->eigen_sycl_device()->memcpyDeviceToHost(
- static_cast<uint16 *>(dst_ptr), static_cast<const uint16 *>(src_ptr),
- total_bytes);
- break;
- case DT_BOOL:
- device->eigen_sycl_device()->memcpyDeviceToHost(
- static_cast<bool *>(dst_ptr), static_cast<const bool *>(src_ptr),
- total_bytes);
- break;
- default:
- assert(false && "unsupported type");
+ case DT_FLOAT:
+ device->eigen_sycl_device()->memcpyDeviceToHost(
+ static_cast<float *>(dst_ptr), static_cast<const float *>(src_ptr),
+ total_bytes);
+ break;
+ case DT_DOUBLE:
+ device->eigen_sycl_device()->memcpyDeviceToHost(
+ static_cast<double *>(dst_ptr),
+ static_cast<const double *>(src_ptr), total_bytes);
+ break;
+ case DT_INT32:
+ device->eigen_sycl_device()->memcpyDeviceToHost(
+ static_cast<int32 *>(dst_ptr), static_cast<const int32 *>(src_ptr),
+ total_bytes);
+ break;
+ case DT_INT64:
+ device->eigen_sycl_device()->memcpyDeviceToHost(
+ static_cast<int64 *>(dst_ptr), static_cast<const int64 *>(src_ptr),
+ total_bytes);
+ break;
+ case DT_HALF:
+ device->eigen_sycl_device()->memcpyDeviceToHost(
+ static_cast<Eigen::half *>(dst_ptr),
+ static_cast<const Eigen::half *>(src_ptr), total_bytes);
+ break;
+ case DT_COMPLEX64:
+ device->eigen_sycl_device()->memcpyDeviceToHost(
+ static_cast<std::complex<float> *>(dst_ptr),
+ static_cast<const std::complex<float> *>(src_ptr), total_bytes);
+ break;
+ case DT_COMPLEX128:
+ device->eigen_sycl_device()->memcpyDeviceToHost(
+ static_cast<std::complex<double> *>(dst_ptr),
+ static_cast<const std::complex<double> *>(src_ptr), total_bytes);
+ break;
+ case DT_INT8:
+ device->eigen_sycl_device()->memcpyDeviceToHost(
+ static_cast<int8 *>(dst_ptr), static_cast<const int8 *>(src_ptr),
+ total_bytes);
+ break;
+ case DT_INT16:
+ device->eigen_sycl_device()->memcpyDeviceToHost(
+ static_cast<int16 *>(dst_ptr), static_cast<const int16 *>(src_ptr),
+ total_bytes);
+ break;
+ case DT_UINT8:
+ device->eigen_sycl_device()->memcpyDeviceToHost(
+ static_cast<uint8 *>(dst_ptr), static_cast<const uint8 *>(src_ptr),
+ total_bytes);
+ break;
+ case DT_UINT16:
+ device->eigen_sycl_device()->memcpyDeviceToHost(
+ static_cast<uint16 *>(dst_ptr),
+ static_cast<const uint16 *>(src_ptr), total_bytes);
+ break;
+ case DT_BOOL:
+ device->eigen_sycl_device()->memcpyDeviceToHost(
+ static_cast<bool *>(dst_ptr), static_cast<const bool *>(src_ptr),
+ total_bytes);
+ break;
+ default:
+ assert(false && "unsupported type");
}
}
device->eigen_sycl_device()->synchronize();
@@ -178,4 +178,4 @@ void SYCLDeviceContext::CopyDeviceTensorToCPU(const Tensor *device_tensor,
}
} // namespace tensorflow
-#endif // TENSORFLOW_USE_SYCL
+#endif // TENSORFLOW_USE_SYCL
diff --git a/tensorflow/core/common_runtime/sycl/sycl_device_context.h b/tensorflow/core/common_runtime/sycl/sycl_device_context.h
index 1f7ad543d9..0f8f17b805 100644
--- a/tensorflow/core/common_runtime/sycl/sycl_device_context.h
+++ b/tensorflow/core/common_runtime/sycl/sycl_device_context.h
@@ -26,7 +26,7 @@ limitations under the License.
namespace tensorflow {
class SYCLDeviceContext : public DeviceContext {
-public:
+ public:
SYCLDeviceContext() {}
~SYCLDeviceContext() override {}
@@ -40,6 +40,6 @@ public:
StatusCallback done) override;
};
-} // namespace tensorflow
+} // namespace tensorflow
-#endif // TENSORFLOW_COMMON_RUNTIME_SYCL_SYCL_DEVICE_CONTEXT_H_
+#endif // TENSORFLOW_COMMON_RUNTIME_SYCL_SYCL_DEVICE_CONTEXT_H_
diff --git a/tensorflow/core/common_runtime/sycl/sycl_device_factory.cc b/tensorflow/core/common_runtime/sycl/sycl_device_factory.cc
index 51eb4973d8..a643fc7258 100644
--- a/tensorflow/core/common_runtime/sycl/sycl_device_factory.cc
+++ b/tensorflow/core/common_runtime/sycl/sycl_device_factory.cc
@@ -21,7 +21,7 @@ limitations under the License.
namespace tensorflow {
class SYCLDeviceFactory : public DeviceFactory {
-public:
+ public:
Status CreateDevices(const SessionOptions &options, const string &name_prefix,
std::vector<Device *> *devices) override {
int n = 1;
@@ -31,10 +31,10 @@ public:
}
for (int i = 0; i < n; i++) {
string name = strings::StrCat(name_prefix, "/device:SYCL:", i);
- devices->push_back(new SYCLDevice(options, name, Bytes(256 << 20),
- DeviceLocality(),
- SYCLDevice::GetShortDeviceDescription(),
- cl::sycl::gpu_selector(), cpu_allocator()));
+ devices->push_back(
+ new SYCLDevice(options, name, Bytes(256 << 20), DeviceLocality(),
+ SYCLDevice::GetShortDeviceDescription(),
+ cl::sycl::gpu_selector(), cpu_allocator()));
}
return Status::OK();
}
@@ -43,4 +43,4 @@ public:
REGISTER_LOCAL_DEVICE_FACTORY("SYCL", SYCLDeviceFactory, 200);
}
-#endif // TENSORFLOW_USE_SYCL
+#endif // TENSORFLOW_USE_SYCL
diff --git a/tensorflow/core/distributed_runtime/master_session.cc b/tensorflow/core/distributed_runtime/master_session.cc
index 9bb92ad7ef..41ea0267b2 100644
--- a/tensorflow/core/distributed_runtime/master_session.cc
+++ b/tensorflow/core/distributed_runtime/master_session.cc
@@ -383,7 +383,6 @@ Status MasterSession::ReffedClientGraph::DoBuildPartitions(
// Partition the graph.
Status s;
- std::unordered_map<string, GraphDef> graph_partitions;
return Partition(popts, &client_graph_->graph, out_partitions);
}
diff --git a/tensorflow/core/framework/register_types_traits.h b/tensorflow/core/framework/register_types_traits.h
index 8f8d9fd08e..c1fe5517c6 100644
--- a/tensorflow/core/framework/register_types_traits.h
+++ b/tensorflow/core/framework/register_types_traits.h
@@ -21,6 +21,10 @@ limitations under the License.
typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;
+#ifdef TENSORFLOW_USE_SYCL
+typedef Eigen::SyclDevice SYCLDevice;
+#endif // TENSORFLOW_USE_SYCL
+
#include "tensorflow/core/framework/numeric_types.h"
#include "tensorflow/core/platform/types.h"
@@ -66,6 +70,17 @@ struct proxy_type_pod<GPUDevice, 2> {
typedef Eigen::half type;
};
+#ifdef TENSORFLOW_USE_SYCL
+template <>
+struct proxy_type_pod<SYCLDevice, 8> {
+ typedef double type;
+};
+template <>
+struct proxy_type_pod<SYCLDevice, 4> {
+ typedef float type;
+};
+#endif // TENSORFLOW_USE_SYCL
+
/// If POD we use proxy_type_pod, otherwise this maps to identiy.
template <typename Device, typename T>
struct proxy_type {
@@ -81,6 +96,10 @@ struct proxy_type {
TF_CALL_int8(m) TF_CALL_complex128(m)
#define TF_CALL_GPU_PROXY_TYPES(m) \
TF_CALL_double(m) TF_CALL_float(m) TF_CALL_half(m) TF_CALL_int32(m)
+#ifdef TENSORFLOW_USE_SYCL
+#define TF_CALL_SYCL_PROXY_TYPES(m) \
+ TF_CALL_double(m) TF_CALL_float(m) TF_CALL_int32(m)
+#endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow
#endif // TENSORFLOW_FRAMEWORK_REGISTER_TYPES_TRAITS_H_
diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index 2503b441f7..fb7d63767d 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -39,6 +39,10 @@ load(
"tf_proto_library",
"tf_kernel_tests_linkstatic",
)
+load(
+ "//third_party/mkl:build_defs.bzl",
+ "if_mkl",
+)
config_setting(
# Add "--define tensorflow_xsmm=1" to your build command to use libxsmm for
@@ -4294,6 +4298,17 @@ tf_cc_test(
],
)
+if_mkl(
+ tf_kernel_library(
+ name = "mkl_ops",
+ prefix = "mkl_matmul",
+ deps = [
+ ":math",
+ "//third_party/mkl:intel_binary_blob",
+ ],
+ ),
+)
+
# -----------------------------------------------------------------------------
# Google-internal targets. These must be at the end for syncrepo.
diff --git a/tensorflow/core/kernels/cast_op.cc b/tensorflow/core/kernels/cast_op.cc
index ab82c247d6..562934ed63 100644
--- a/tensorflow/core/kernels/cast_op.cc
+++ b/tensorflow/core/kernels/cast_op.cc
@@ -34,6 +34,9 @@ namespace tensorflow {
typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;
+#ifdef TENSORFLOW_USE_SYCL
+typedef Eigen::SyclDevice SYCLDevice;
+#endif // TENSORFLOW_USE_SYCL
#define CURRY_TYPES2(FN, arg0) \
FN(arg0, bool); \
@@ -206,6 +209,52 @@ REGISTER_CAST_GPU(bfloat16, float);
#undef REGISTER_CAST_GPU
#endif // GOOGLE_CUDA
+#ifdef TENSORFLOW_USE_SYCL
+class SyclCastOp : public CastOpBase {
+ public:
+ explicit SyclCastOp(OpKernelConstruction* ctx) : CastOpBase(ctx) {
+ OP_REQUIRES_OK(ctx, Prepare());
+ }
+
+ private:
+ Status Prepare() {
+ if (src_dtype_ == dst_dtype_) {
+ work_ = nullptr; // Identity
+ return Status::OK();
+ }
+ if (src_dtype_ == DT_BOOL) {
+ work_ = GetSyclCastFromBool(dst_dtype_);
+ } else if (src_dtype_ == DT_INT32) {
+ work_ = GetSyclCastFromInt32(dst_dtype_);
+ } else if (src_dtype_ == DT_INT64) {
+ work_ = GetSyclCastFromInt64(dst_dtype_);
+ } else if (src_dtype_ == DT_FLOAT) {
+ work_ = GetSyclCastFromFloat(dst_dtype_);
+ } else if (src_dtype_ == DT_DOUBLE) {
+ work_ = GetSyclCastFromDouble(dst_dtype_);
+ }
+
+ return work_ == nullptr ? Unimplemented() : Status::OK();
+ }
+};
+
+#define REGISTER_CAST_SYCL(srctype, dsttype) \
+ REGISTER_KERNEL_BUILDER(Name("Cast") \
+ .TypeConstraint<srctype>("SrcT") \
+ .TypeConstraint<dsttype>("DstT") \
+ .Device(DEVICE_SYCL), \
+ SyclCastOp)
+
+CURRY_TYPES2(REGISTER_CAST_SYCL, bool);
+CURRY_TYPES2(REGISTER_CAST_SYCL, int32);
+CURRY_TYPES2(REGISTER_CAST_SYCL, int64);
+CURRY_TYPES2(REGISTER_CAST_SYCL, float);
+CURRY_TYPES2(REGISTER_CAST_SYCL, double);
+
+#undef REGISTER_CAST_SYCL
+
+#endif // TENSORFLOW_USE_SYCL
+
#undef CURRY_TYPES2
// HostCast differs from Cast in that its input and output are in host memory.
@@ -213,5 +262,10 @@ REGISTER_KERNEL_BUILDER(Name("_HostCast").Device(DEVICE_CPU), CpuCastOp);
REGISTER_KERNEL_BUILDER(
Name("_HostCast").Device(DEVICE_GPU).HostMemory("x").HostMemory("y"),
CpuCastOp);
-
+#ifdef TENSORFLOW_USE_SYCL
+REGISTER_KERNEL_BUILDER(
+ Name("_HostCast").Device(DEVICE_SYCL).HostMemory("x").HostMemory("y"),
+ CpuCastOp);
+#endif // TENSORFLOW_USE_SYCL
} // end namespace tensorflow
+
diff --git a/tensorflow/core/kernels/cast_op_impl.h b/tensorflow/core/kernels/cast_op_impl.h
index cb7cc81937..1ee0796ac1 100644
--- a/tensorflow/core/kernels/cast_op_impl.h
+++ b/tensorflow/core/kernels/cast_op_impl.h
@@ -33,6 +33,16 @@ struct CastFunctor<Eigen::ThreadPoolDevice, O, I> {
}
};
+#ifdef TENSORFLOW_USE_SYCL
+template <typename O, typename I>
+struct CastFunctor<Eigen::SyclDevice, O, I> {
+ void operator()(const Eigen::SyclDevice& d, typename TTypes<O>::Flat o,
+ typename TTypes<I>::ConstFlat i) {
+ o.device(d) = i.template cast<O>();
+ }
+};
+#endif // TENSORFLOW_USE_SYCL
+
} // namespace functor
#define CURRY_TYPES3(FN, arg0, arg1) \
@@ -140,6 +150,25 @@ GetGpuCastFromBfloat(DataType dst_dtype);
#endif // GOOGLE_CUDA
+#ifdef TENSORFLOW_USE_SYCL
+std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
+GetSyclCastFromBool(DataType dst_dtype);
+
+std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
+GetSyclCastFromInt32(DataType dst_dtype);
+
+std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
+GetSyclCastFromInt64(DataType dst_dtype);
+
+std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
+GetSyclCastFromFloat(DataType dst_dtype);
+
+std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
+GetSyclCastFromDouble(DataType dst_dtype);
+
+#endif // TENSORFLOW_USE_SYCL
+
} // namespace tensorflow
#endif // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_CAST_OP_IMPL_H_
+
diff --git a/tensorflow/core/kernels/cast_op_impl_bool.cc b/tensorflow/core/kernels/cast_op_impl_bool.cc
index 92fee89a47..a13f163009 100644
--- a/tensorflow/core/kernels/cast_op_impl_bool.cc
+++ b/tensorflow/core/kernels/cast_op_impl_bool.cc
@@ -34,4 +34,14 @@ GetGpuCastFromBool(DataType dst_dtype) {
}
#endif // GOOGLE_CUDA
+#ifdef TENSORFLOW_USE_SYCL
+typedef Eigen::SyclDevice SYCLDevice;
+std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
+GetSyclCastFromBool(DataType dst_dtype) {
+ CURRY_TYPES3(CAST_CASE, SYCLDevice, bool);
+ return nullptr;
+}
+#endif // TENSORFLOW_USE_SYCL
+
} // namespace tensorflow
+
diff --git a/tensorflow/core/kernels/cast_op_impl_double.cc b/tensorflow/core/kernels/cast_op_impl_double.cc
index fd20061d21..fdc8d51158 100644
--- a/tensorflow/core/kernels/cast_op_impl_double.cc
+++ b/tensorflow/core/kernels/cast_op_impl_double.cc
@@ -34,4 +34,14 @@ GetGpuCastFromDouble(DataType dst_dtype) {
}
#endif // GOOGLE_CUDA
+#ifdef TENSORFLOW_USE_SYCL
+typedef Eigen::SyclDevice SYCLDevice;
+std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
+GetSyclCastFromDouble(DataType dst_dtype) {
+ CURRY_TYPES3(CAST_CASE, SYCLDevice, double);
+ return nullptr;
+}
+#endif // TENSORFLOW_USE_SYC
+
} // namespace tensorflow
+
diff --git a/tensorflow/core/kernels/cast_op_impl_float.cc b/tensorflow/core/kernels/cast_op_impl_float.cc
index 71e63fbff0..1241dcd8f2 100644
--- a/tensorflow/core/kernels/cast_op_impl_float.cc
+++ b/tensorflow/core/kernels/cast_op_impl_float.cc
@@ -49,4 +49,14 @@ GetGpuCastFromFloat(DataType dst_dtype) {
}
#endif // GOOGLE_CUDA
+#ifdef TENSORFLOW_USE_SYCL
+typedef Eigen::SyclDevice SYCLDevice;
+std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
+GetSyclCastFromFloat(DataType dst_dtype) {
+ CURRY_TYPES3(CAST_CASE, SYCLDevice, float);
+ return nullptr;
+}
+#endif // TENSORFLOW_USE_SYCL
+
} // namespace tensorflow
+
diff --git a/tensorflow/core/kernels/cast_op_impl_int32.cc b/tensorflow/core/kernels/cast_op_impl_int32.cc
index 0fc6e16afe..fca9cd60ec 100644
--- a/tensorflow/core/kernels/cast_op_impl_int32.cc
+++ b/tensorflow/core/kernels/cast_op_impl_int32.cc
@@ -34,4 +34,14 @@ GetGpuCastFromInt32(DataType dst_dtype) {
}
#endif // GOOGLE_CUDA
+#ifdef TENSORFLOW_USE_SYCL
+typedef Eigen::SyclDevice SYCLDevice;
+std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
+GetSyclCastFromInt32(DataType dst_dtype) {
+ CURRY_TYPES3(CAST_CASE, CPUDevice, int32);
+ return nullptr;
+}
+#endif // TENSORFLOW_USE_SYCL
+
} // namespace tensorflow
+
diff --git a/tensorflow/core/kernels/cast_op_impl_int64.cc b/tensorflow/core/kernels/cast_op_impl_int64.cc
index b5571b19a5..c0a543708d 100644
--- a/tensorflow/core/kernels/cast_op_impl_int64.cc
+++ b/tensorflow/core/kernels/cast_op_impl_int64.cc
@@ -19,6 +19,9 @@ namespace tensorflow {
typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;
+#ifdef TENSORFLOW_USE_SYCL
+typedef Eigen::SyclDevice SYCLDevice;
+#endif // TENSORFLOW_USE_SYCL
std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
GetCpuCastFromInt64(DataType dst_dtype) {
@@ -34,4 +37,13 @@ GetGpuCastFromInt64(DataType dst_dtype) {
}
#endif // GOOGLE_CUDA
+#ifdef TENSORFLOW_USE_SYCL
+typedef Eigen::SyclDevice SYCLDevice;
+std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
+GetSyclCastFromInt64(DataType dst_dtype) {
+ CURRY_TYPES3(CAST_CASE, SYCLDevice, int64);
+ return nullptr;
+}
+#endif // TENSORFLOW_USE_SYCL
+
} // namespace tensorflow
diff --git a/tensorflow/core/kernels/cast_op_test.cc b/tensorflow/core/kernels/cast_op_test.cc
index 5b7529bb8a..a106f287c1 100644
--- a/tensorflow/core/kernels/cast_op_test.cc
+++ b/tensorflow/core/kernels/cast_op_test.cc
@@ -105,7 +105,12 @@ static void BM_gpu_float_int64(int iters, int num) {
testing::BytesProcessed(static_cast<int64>(iters) * num *
(sizeof(float) + sizeof(int64)));
testing::UseRealTime();
+#if GOOGLE_CUDA
test::Benchmark("gpu", Cast<float, int64>(num)).Run(iters);
+#endif // GOOGLE_CUDA
+#ifdef TENSORFLOW_USE_SYCL
+ test::Benchmark("sycl", Cast<float, int64>(num)).Run(iters);
+#endif // TENSORFLOW_USE_SYCL
}
BENCHMARK(BM_gpu_float_int64)->Arg(64 << 10)->Arg(32 << 20);
@@ -123,7 +128,12 @@ static void BM_gpu_bool_float(int iters, int num) {
testing::BytesProcessed(static_cast<int64>(iters) * num *
(sizeof(bool) + sizeof(float)));
testing::UseRealTime();
+#if GOOGLE_CUDA
test::Benchmark("gpu", Cast<bool, float>(num)).Run(iters);
+#endif // GOOGLE_CUDA
+#ifdef TENSORFLOW_USE_SYCL
+ test::Benchmark("sycl", Cast<bool, float>(num)).Run(iters);
+#endif // TENSORFLOW_USE_SYCL
}
BENCHMARK(BM_gpu_bool_float)->Arg(64 << 10)->Arg(32 << 20);
@@ -168,7 +178,9 @@ static void BM_gpu_float_half(int iters, int num) {
testing::BytesProcessed(static_cast<int64>(iters) * num *
(sizeof(float) + sizeof(Eigen::half)));
testing::UseRealTime();
+#if GOOGLE_CUDA
test::Benchmark("gpu", Cast<float, Eigen::half>(num)).Run(iters);
+#endif // GOOGLE_CUDA
}
BENCHMARK(BM_gpu_float_half)->Arg(64 << 10)->Arg(32 << 20);
@@ -177,7 +189,9 @@ static void BM_gpu_half_float(int iters, int num) {
testing::BytesProcessed(static_cast<int64>(iters) * num *
(sizeof(float) + sizeof(Eigen::half)));
testing::UseRealTime();
+#if GOOGLE_CUDA
test::Benchmark("gpu", Cast<Eigen::half, float>(num)).Run(iters);
+#endif // GOOGLE_CUDA
}
BENCHMARK(BM_gpu_half_float)->Arg(64 << 10)->Arg(32 << 20);
diff --git a/tensorflow/core/kernels/concat_lib.h b/tensorflow/core/kernels/concat_lib.h
index cef873f804..14e6e1bc32 100644
--- a/tensorflow/core/kernels/concat_lib.h
+++ b/tensorflow/core/kernels/concat_lib.h
@@ -38,6 +38,14 @@ void ConcatGPU(
Tensor* output, typename TTypes<T, 2>::Tensor* output_flat);
#endif // GOOGLE_CUDA
+
+#ifdef TENSORFLOW_USE_SYCL
+template <typename T>
+void ConcatSYCL(const Eigen::SyclDevice& d,
+ const std::vector<
+ std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>& inputs,
+ typename TTypes<T, 2>::Matrix* output);
+#endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow
#endif // TENSORFLOW_KERNELS_CONCAT_LIB_H_
diff --git a/tensorflow/core/kernels/concat_lib_cpu.cc b/tensorflow/core/kernels/concat_lib_cpu.cc
index f83aed6aef..f89948350c 100644
--- a/tensorflow/core/kernels/concat_lib_cpu.cc
+++ b/tensorflow/core/kernels/concat_lib_cpu.cc
@@ -74,4 +74,23 @@ REGISTER(qint16)
REGISTER(qint32)
REGISTER(bfloat16)
+#ifdef TENSORFLOW_USE_SYCL
+template <typename T>
+void ConcatSYCL(const Eigen::SyclDevice& d,
+ const std::vector<
+ std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>& inputs,
+ typename TTypes<T, 2>::Matrix* output) {
+ ConcatSYCLImpl<T>(d, inputs, sizeof(T) /* cost_per_unit */, MemCpyCopier<T>(),
+ output);
+}
+#define REGISTER_SYCL(T) \
+ template void ConcatSYCL<T>( \
+ const Eigen::SyclDevice&, \
+ const std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>&, \
+ typename TTypes<T, 2>::Matrix* output);
+
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_SYCL)
+
+#undef REGISTER_SYCL
+#endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow
diff --git a/tensorflow/core/kernels/concat_lib_cpu.h b/tensorflow/core/kernels/concat_lib_cpu.h
index 9d37cafb4e..6a933efde4 100644
--- a/tensorflow/core/kernels/concat_lib_cpu.h
+++ b/tensorflow/core/kernels/concat_lib_cpu.h
@@ -126,4 +126,39 @@ void ConcatCPUImpl(
cost_per_unit, work);
}
+#ifdef TENSORFLOW_USE_SYCL
+template <typename T, typename ElementCopier>
+void ConcatSYCLImpl(
+ const Eigen::SyclDevice& d,
+ const std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>&
+ inputs,
+ int64 cost_per_unit, ElementCopier copier,
+ typename TTypes<T, 2>::Matrix* output) {
+ size_t num_inputs = inputs.size();
+
+ std::vector<ptrdiff_t> sizes;
+ sizes.reserve(num_inputs);
+ int64 row_size = 0;
+ for (const auto& input : inputs) {
+ sizes.push_back(input->dimension(1));
+ row_size += sizes.back();
+ }
+
+ T* out = &(*output)(0, 0);
+ std::vector<const T*> inp;
+ inp.reserve(num_inputs);
+ for (const auto& input : inputs) {
+ inp.push_back(&(*input)(0, 0));
+ }
+ const int64 dim0 = output->dimension(0);
+ for (int64 i = 0; i < dim0; ++i) {
+ for (int64 j = 0; j < num_inputs; ++j) {
+ auto size = sizes[j];
+ d.memcpy(out, inp[j], size * sizeof(T));
+ out += size;
+ inp[j] += size;
+ }
+ }
+}
+#endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow
diff --git a/tensorflow/core/kernels/concat_op.cc b/tensorflow/core/kernels/concat_op.cc
index e6dae5fa7e..9628a7efa4 100644
--- a/tensorflow/core/kernels/concat_op.cc
+++ b/tensorflow/core/kernels/concat_op.cc
@@ -35,6 +35,9 @@ typedef Eigen::ThreadPoolDevice CPUDevice;
#if GOOGLE_CUDA
typedef Eigen::GpuDevice GPUDevice;
#endif // GOOGLE_CUDA
+#ifdef TENSORFLOW_USE_SYCL
+typedef Eigen::SyclDevice SYCLDevice;
+#endif // TENSORFLOW_USE_SYCL
enum AxisArgumentName { NAME_IS_AXIS, NAME_IS_CONCAT_DIM };
@@ -134,6 +137,12 @@ class ConcatBaseOp : public OpKernel {
return;
}
#endif // GOOGLE_CUDA
+#ifdef TENSORFLOW_USE_SYCL
+ if (std::is_same<Device, SYCLDevice>::value) {
+ ConcatSYCL<T>(c->eigen_sycl_device(), inputs_flat, &output_flat);
+ return;
+ }
+#endif // TENSORFLOW_USE_SYCL
ConcatCPU<T>(c->device(), inputs_flat, &output_flat);
}
}
@@ -207,6 +216,39 @@ REGISTER_KERNEL_BUILDER(Name("ConcatV2")
#endif // GOOGLE_CUDA
+#ifdef TENSORFLOW_USE_SYCL
+#define REGISTER_SYCL(type) \
+ REGISTER_KERNEL_BUILDER(Name("Concat") \
+ .Device(DEVICE_SYCL) \
+ .TypeConstraint<type>("T") \
+ .HostMemory("concat_dim"), \
+ ConcatOp<SYCLDevice, type>) \
+ REGISTER_KERNEL_BUILDER(Name("ConcatV2") \
+ .Device(DEVICE_SYCL) \
+ .TypeConstraint<type>("T") \
+ .TypeConstraint<int32>("Tidx") \
+ .HostMemory("axis"), \
+ ConcatV2Op<SYCLDevice, type>)
+
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_SYCL);
+REGISTER_KERNEL_BUILDER(Name("Concat")
+ .Device(DEVICE_SYCL)
+ .TypeConstraint<int32>("T")
+ .HostMemory("concat_dim")
+ .HostMemory("values")
+ .HostMemory("output"),
+ ConcatOp<CPUDevice, int32>);
+REGISTER_KERNEL_BUILDER(Name("ConcatV2")
+ .Device(DEVICE_SYCL)
+ .TypeConstraint<int32>("T")
+ .TypeConstraint<int32>("Tidx")
+ .HostMemory("values")
+ .HostMemory("axis")
+ .HostMemory("output"),
+ ConcatV2Op<CPUDevice, int32>);
+#undef REGISTER_SYCL
+#endif // TENSORFLOW_USE_SYCL
+
class ConcatOffsetOp : public OpKernel {
public:
explicit ConcatOffsetOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
@@ -293,4 +335,12 @@ REGISTER_KERNEL_BUILDER(Name("ConcatOffset")
.HostMemory("offset"),
ConcatOffsetOp);
+#ifdef TENSORFLOW_USE_SYCL
+REGISTER_KERNEL_BUILDER(Name("ConcatOffset")
+ .Device(DEVICE_SYCL)
+ .HostMemory("concat_dim")
+ .HostMemory("shape")
+ .HostMemory("offset"),
+ ConcatOffsetOp);
+#endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow
diff --git a/tensorflow/core/kernels/constant_op.cc b/tensorflow/core/kernels/constant_op.cc
index 115a842d1c..d444ddec1d 100644
--- a/tensorflow/core/kernels/constant_op.cc
+++ b/tensorflow/core/kernels/constant_op.cc
@@ -57,7 +57,10 @@ REGISTER_KERNEL_BUILDER(Name("Const").Device(DEVICE_CPU), ConstantOp);
REGISTER_KERNEL_BUILDER( \
Name("Const").Device(DEVICE_SYCL).TypeConstraint<TYPE>("dtype"), \
ConstantOp);
-TF_CALL_NUMBER_TYPES(REGISTER_SYCL_KERNEL);
+REGISTER_SYCL_KERNEL(float);
+REGISTER_SYCL_KERNEL(double);
+REGISTER_SYCL_KERNEL(bool);
+REGISTER_SYCL_KERNEL(int64);
#undef REGISTER_SYCL_KERNEL
#endif
@@ -112,6 +115,17 @@ REGISTER_KERNEL_BUILDER(Name("Const")
HostConstantOp);
#endif
+#ifdef TENSORFLOW_USE_SYCL
+// A special GPU kernel for int32.
+// TODO(b/25387198): Also enable int32 in device memory. This kernel
+// registration requires all int32 inputs and outputs to be in host memory.
+REGISTER_KERNEL_BUILDER(Name("Const")
+ .Device(DEVICE_SYCL)
+ .HostMemory("output")
+ .TypeConstraint<int32>("dtype"),
+ HostConstantOp);
+#endif // TENSORFLOW_USE_SYCL
+
typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;
#ifdef TENSORFLOW_USE_SYCL
@@ -186,6 +200,7 @@ REGISTER_KERNEL(CPU, quint8);
#ifdef TENSORFLOW_USE_SYCL
REGISTER_KERNEL(SYCL, float)
+REGISTER_KERNEL(SYCL, double)
REGISTER_KERNEL_BUILDER(Name("Fill")
.Device(DEVICE_SYCL)
.TypeConstraint<int32>("T")
@@ -246,6 +261,7 @@ TF_CALL_POD_STRING_TYPES(REGISTER_CPU);
#ifdef TENSORFLOW_USE_SYCL
REGISTER_KERNEL(float, SYCL);
+REGISTER_KERNEL(bool, SYCL);
REGISTER_KERNEL_BUILDER(Name("ZerosLike")
.Device(DEVICE_SYCL)
.TypeConstraint<int32>("T")
diff --git a/tensorflow/core/kernels/control_flow_ops.cc b/tensorflow/core/kernels/control_flow_ops.cc
index 1a73a3d0f8..6a79be5a95 100644
--- a/tensorflow/core/kernels/control_flow_ops.cc
+++ b/tensorflow/core/kernels/control_flow_ops.cc
@@ -321,6 +321,30 @@ TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_REF_KERNEL);
#undef REGISTER_SYCL_KERNEL
#undef REGISTER_SYCL_REF_KERNEL
+#define REGISTER_SYCL_HOST_KERNEL(type) \
+ REGISTER_KERNEL_BUILDER(Name("Enter") \
+ .Device(DEVICE_SYCL) \
+ .HostMemory("data") \
+ .HostMemory("output") \
+ .TypeConstraint<type>("T"), \
+ EnterOp)
+
+#define REGISTER_SYCL_HOST_REF_KERNEL(type) \
+ REGISTER_KERNEL_BUILDER(Name("RefEnter") \
+ .Device(DEVICE_SYCL) \
+ .HostMemory("data") \
+ .HostMemory("output") \
+ .TypeConstraint<type>("T"), \
+ EnterOp)
+
+REGISTER_SYCL_HOST_KERNEL(int32);
+REGISTER_SYCL_HOST_REF_KERNEL(int32);
+REGISTER_SYCL_HOST_KERNEL(string);
+REGISTER_SYCL_HOST_REF_KERNEL(string);
+REGISTER_SYCL_HOST_KERNEL(ResourceHandle);
+
+#undef REGISTER_SYCL_HOST_KERNEL
+#undef REGISTER_SYCL_HOST_REF_KERNEL
#endif
// Special GPU kernels for int32 and string.
diff --git a/tensorflow/core/kernels/cwise_op_acos.cc b/tensorflow/core/kernels/cwise_op_acos.cc
index 1d2d815027..65801da3c7 100644
--- a/tensorflow/core/kernels/cwise_op_acos.cc
+++ b/tensorflow/core/kernels/cwise_op_acos.cc
@@ -26,6 +26,7 @@ REGISTER2(UnaryOp, CPU, "Acos", functor::acos, float, double);
.TypeConstraint<TYPE>("T"), \
UnaryOp<SYCLDevice, functor::acos<TYPE>>);
REGISTER_SYCL_KERNEL(float);
+REGISTER_SYCL_KERNEL(double);
#undef REGISTER_SYCL_KERNEL
#endif // TENSORFLOW_USE_SYCL
diff --git a/tensorflow/core/kernels/cwise_op_add_1.cc b/tensorflow/core/kernels/cwise_op_add_1.cc
index a6bff78694..f6e9b59cf8 100644
--- a/tensorflow/core/kernels/cwise_op_add_1.cc
+++ b/tensorflow/core/kernels/cwise_op_add_1.cc
@@ -18,7 +18,7 @@ limitations under the License.
namespace tensorflow {
REGISTER5(BinaryOp, CPU, "Add", functor::add, float, Eigen::half, double, int32,
int64);
-
+
#if TENSORFLOW_USE_SYCL
#define REGISTER_SYCL_KERNEL(TYPE) \
REGISTER_KERNEL_BUILDER( \
@@ -26,10 +26,19 @@ REGISTER5(BinaryOp, CPU, "Add", functor::add, float, Eigen::half, double, int32,
.Device(DEVICE_SYCL) \
.TypeConstraint<TYPE>("T"), \
BinaryOp<SYCLDevice, functor::add<TYPE>>);
- REGISTER_SYCL_KERNEL(float);
+REGISTER_SYCL_KERNEL(float);
+REGISTER_SYCL_KERNEL(double);
#undef REGISTER_SYCL_KERNEL
+
+REGISTER_KERNEL_BUILDER(Name("Add")
+ .Device(DEVICE_SYCL)
+ .HostMemory("x")
+ .HostMemory("y")
+ .HostMemory("z")
+ .TypeConstraint<int32>("T"),
+ BinaryOp<CPUDevice, functor::add<int32>>);
#endif // TENSORFLOW_USE_SYCL
-
+
#if GOOGLE_CUDA
REGISTER3(BinaryOp, GPU, "Add", functor::add, float, Eigen::half, double);
diff --git a/tensorflow/core/kernels/cwise_op_asin.cc b/tensorflow/core/kernels/cwise_op_asin.cc
index 92a22e90c4..c9ebfe759b 100644
--- a/tensorflow/core/kernels/cwise_op_asin.cc
+++ b/tensorflow/core/kernels/cwise_op_asin.cc
@@ -26,6 +26,7 @@ REGISTER2(UnaryOp, CPU, "Asin", functor::asin, float, double);
.TypeConstraint<TYPE>("T"), \
UnaryOp<SYCLDevice, functor::asin<TYPE>>);
REGISTER_SYCL_KERNEL(float);
+REGISTER_SYCL_KERNEL(double);
#undef REGISTER_SYCL_KERNEL
#endif // TENSORFLOW_USE_SYCL
diff --git a/tensorflow/core/kernels/cwise_op_atan.cc b/tensorflow/core/kernels/cwise_op_atan.cc
index 825e85283f..72645b303f 100644
--- a/tensorflow/core/kernels/cwise_op_atan.cc
+++ b/tensorflow/core/kernels/cwise_op_atan.cc
@@ -26,6 +26,7 @@ REGISTER2(UnaryOp, CPU, "Atan", functor::atan, float, double);
.TypeConstraint<TYPE>("T"), \
UnaryOp<SYCLDevice, functor::atan<TYPE>>);
REGISTER_SYCL_KERNEL(float);
+REGISTER_SYCL_KERNEL(double);
#undef REGISTER_SYCL_KERNEL
#endif // TENSORFLOW_USE_SYCL
diff --git a/tensorflow/core/kernels/cwise_op_ceil.cc b/tensorflow/core/kernels/cwise_op_ceil.cc
index c5a4aaf831..c74e10576d 100644
--- a/tensorflow/core/kernels/cwise_op_ceil.cc
+++ b/tensorflow/core/kernels/cwise_op_ceil.cc
@@ -26,6 +26,7 @@ REGISTER3(UnaryOp, CPU, "Ceil", functor::ceil, float, Eigen::half, double);
.TypeConstraint<TYPE>("T"), \
UnaryOp<SYCLDevice, functor::ceil<TYPE>>);
REGISTER_SYCL_KERNEL(float);
+REGISTER_SYCL_KERNEL(double);
#undef REGISTER_SYCL_KERNEL
#endif // TENSORFLOW_USE_SYCL
diff --git a/tensorflow/core/kernels/cwise_op_cos.cc b/tensorflow/core/kernels/cwise_op_cos.cc
index a758da5842..634c90adc6 100644
--- a/tensorflow/core/kernels/cwise_op_cos.cc
+++ b/tensorflow/core/kernels/cwise_op_cos.cc
@@ -27,6 +27,7 @@ REGISTER5(UnaryOp, CPU, "Cos", functor::cos, float, Eigen::half, double,
.TypeConstraint<TYPE>("T"), \
UnaryOp<SYCLDevice, functor::cos<TYPE>>);
REGISTER_SYCL_KERNEL(float);
+REGISTER_SYCL_KERNEL(double);
#undef REGISTER_SYCL_KERNEL
#endif // TENSORFLOW_USE_SYCL
diff --git a/tensorflow/core/kernels/cwise_op_div.cc b/tensorflow/core/kernels/cwise_op_div.cc
index 74d8faedb5..1e2300832f 100644
--- a/tensorflow/core/kernels/cwise_op_div.cc
+++ b/tensorflow/core/kernels/cwise_op_div.cc
@@ -37,8 +37,18 @@ REGISTER5(BinaryOp, CPU, "RealDiv", functor::div, float, Eigen::half, double,
.TypeConstraint<TYPE>("T"), \
BinaryOp<SYCLDevice, functor::div<TYPE>>);
REGISTER_SYCL_KERNEL(float)
-REGISTER_SYCL_KERNEL(int32)
+REGISTER_SYCL_KERNEL(double)
#undef REGISTER_SYCL_KERNEL
+// A special GPU kernel for int32.
+// TODO(b/25387198): Also enable int32 in device memory. This kernel
+// registration requires all int32 inputs and outputs to be in host memory.
+REGISTER_KERNEL_BUILDER(Name("Div")
+ .Device(DEVICE_SYCL)
+ .HostMemory("x")
+ .HostMemory("y")
+ .HostMemory("z")
+ .TypeConstraint<int32>("T"),
+ BinaryOp<CPUDevice, functor::safe_div<int32>>);
#endif // TENSORFLOW_USE_SYCL
#if GOOGLE_CUDA
REGISTER9(BinaryOp, GPU, "Div", functor::div, float, Eigen::half, double, uint8,
diff --git a/tensorflow/core/kernels/cwise_op_equal_to_1.cc b/tensorflow/core/kernels/cwise_op_equal_to_1.cc
index 7bd44abd39..93ea768836 100644
--- a/tensorflow/core/kernels/cwise_op_equal_to_1.cc
+++ b/tensorflow/core/kernels/cwise_op_equal_to_1.cc
@@ -34,4 +34,16 @@ REGISTER_KERNEL_BUILDER(Name("Equal")
BinaryOp<CPUDevice, functor::equal_to<int32>>);
#endif
+#ifdef TENSORFLOW_USE_SYCL
+REGISTER2(BinaryOp, SYCL, "Equal", functor::equal_to, float, double);
+
+REGISTER_KERNEL_BUILDER(Name("Equal")
+ .Device(DEVICE_SYCL)
+ .HostMemory("x")
+ .HostMemory("y")
+ .HostMemory("z")
+ .TypeConstraint<int32>("T"),
+ BinaryOp<CPUDevice, functor::equal_to<int32>>);
+#endif // TENSORFLOW_USE_SYCL
+
} // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_expm1.cc b/tensorflow/core/kernels/cwise_op_expm1.cc
index f1c53ca272..5573c2bcc2 100644
--- a/tensorflow/core/kernels/cwise_op_expm1.cc
+++ b/tensorflow/core/kernels/cwise_op_expm1.cc
@@ -21,4 +21,7 @@ REGISTER5(UnaryOp, CPU, "Expm1", functor::expm1, float, Eigen::half, double,
#if GOOGLE_CUDA
REGISTER3(UnaryOp, GPU, "Expm1", functor::expm1, float, Eigen::half, double);
#endif
+#ifdef TENSORFLOW_USE_SYCL
+REGISTER(UnaryOp, SYCL, "Expm1", functor::expm1, float);
+#endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_floor.cc b/tensorflow/core/kernels/cwise_op_floor.cc
index 129d754b82..59e32d7f6f 100644
--- a/tensorflow/core/kernels/cwise_op_floor.cc
+++ b/tensorflow/core/kernels/cwise_op_floor.cc
@@ -26,6 +26,7 @@ REGISTER3(UnaryOp, CPU, "Floor", functor::floor, float, Eigen::half, double);
.TypeConstraint<TYPE>("T"), \
UnaryOp<SYCLDevice, functor::floor<TYPE>>);
REGISTER_SYCL_KERNEL(float);
+REGISTER_SYCL_KERNEL(double);
#undef REGISTER_SYCL_KERNEL
#endif // TENSORFLOW_USE_SYCL
diff --git a/tensorflow/core/kernels/cwise_op_floor_div.cc b/tensorflow/core/kernels/cwise_op_floor_div.cc
index 8a600f8f95..fa81ef0872 100644
--- a/tensorflow/core/kernels/cwise_op_floor_div.cc
+++ b/tensorflow/core/kernels/cwise_op_floor_div.cc
@@ -21,17 +21,6 @@ REGISTER5(BinaryOp, CPU, "FloorDiv", functor::safe_floor_div, uint8, uint16,
REGISTER3(BinaryOp, CPU, "FloorDiv", functor::floor_div_real, float,
Eigen::half, double);
-#if TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL_KERNEL(TYPE) \
- REGISTER_KERNEL_BUILDER( \
- Name("FloorDiv") \
- .Device(DEVICE_SYCL) \
- .TypeConstraint<TYPE>("T"), \
- BinaryOp<SYCLDevice, functor::floor_div_real<TYPE>>);
-REGISTER_SYCL_KERNEL(float)
-#undef REGISTER_SYCL_KERNEL
-#endif // TENSORFLOW_USE_SYCL
-
#if GOOGLE_CUDA
REGISTER4(BinaryOp, GPU, "FloorDiv", functor::floor_div, uint8, uint16, int16,
int64);
@@ -51,4 +40,14 @@ REGISTER_KERNEL_BUILDER(Name("FloorDiv")
.TypeConstraint<int32>("T"),
BinaryOp<CPUDevice, functor::safe_floor_div<int32>>);
#endif
+
+#ifdef TENSORFLOW_USE_SYCL
+REGISTER_KERNEL_BUILDER(Name("FloorDiv")
+ .Device(DEVICE_SYCL)
+ .HostMemory("x")
+ .HostMemory("y")
+ .HostMemory("z")
+ .TypeConstraint<int32>("T"),
+ BinaryOp<CPUDevice, functor::safe_floor_div<int32>>);
+#endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_floor_mod.cc b/tensorflow/core/kernels/cwise_op_floor_mod.cc
index 4e641a8bb3..55f8a30461 100644
--- a/tensorflow/core/kernels/cwise_op_floor_mod.cc
+++ b/tensorflow/core/kernels/cwise_op_floor_mod.cc
@@ -31,4 +31,14 @@ REGISTER_KERNEL_BUILDER(Name("FloorMod")
.TypeConstraint<int32>("T"),
BinaryOp<CPUDevice, functor::safe_floor_mod<int32>>);
#endif
+
+#ifdef TENSORFLOW_USE_SYCL
+REGISTER_KERNEL_BUILDER(Name("FloorMod")
+ .Device(DEVICE_SYCL)
+ .HostMemory("x")
+ .HostMemory("y")
+ .HostMemory("z")
+ .TypeConstraint<int32>("T"),
+ BinaryOp<CPUDevice, functor::safe_floor_mod<int32>>);
+#endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_greater.cc b/tensorflow/core/kernels/cwise_op_greater.cc
index 8c9691d1ea..6b5a806aa2 100644
--- a/tensorflow/core/kernels/cwise_op_greater.cc
+++ b/tensorflow/core/kernels/cwise_op_greater.cc
@@ -33,5 +33,19 @@ REGISTER_KERNEL_BUILDER(Name("Greater")
.TypeConstraint<int32>("T"),
BinaryOp<CPUDevice, functor::greater<int32>>);
#endif
+#ifdef TENSORFLOW_USE_SYCL
+REGISTER(BinaryOp, SYCL, "Greater", functor::greater, float);
+
+// A special GPU kernel for int32.
+// TODO(b/25387198): Also enable int32 in device memory. This kernel
+// registration requires all int32 inputs and outputs to be in host memory.
+REGISTER_KERNEL_BUILDER(Name("Greater")
+ .Device(DEVICE_SYCL)
+ .HostMemory("x")
+ .HostMemory("y")
+ .HostMemory("z")
+ .TypeConstraint<int32>("T"),
+ BinaryOp<CPUDevice, functor::greater<int32>>);
+#endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_greater_equal.cc b/tensorflow/core/kernels/cwise_op_greater_equal.cc
index a6083cb9cd..ac21528256 100644
--- a/tensorflow/core/kernels/cwise_op_greater_equal.cc
+++ b/tensorflow/core/kernels/cwise_op_greater_equal.cc
@@ -34,4 +34,15 @@ REGISTER_KERNEL_BUILDER(Name("GreaterEqual")
BinaryOp<CPUDevice, functor::greater_equal<int32>>);
#endif
+#ifdef TENSORFLOW_USE_SYCL
+REGISTER(BinaryOp, SYCL, "GreaterEqual", functor::greater_equal, float);
+
+REGISTER_KERNEL_BUILDER(Name("GreaterEqual")
+ .Device(DEVICE_SYCL)
+ .HostMemory("x")
+ .HostMemory("y")
+ .HostMemory("z")
+ .TypeConstraint<int32>("T"),
+ BinaryOp<CPUDevice, functor::greater_equal<int32>>);
+#endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_isfinite.cc b/tensorflow/core/kernels/cwise_op_isfinite.cc
index 59976141c7..0faeffa95c 100644
--- a/tensorflow/core/kernels/cwise_op_isfinite.cc
+++ b/tensorflow/core/kernels/cwise_op_isfinite.cc
@@ -27,6 +27,7 @@ REGISTER3(UnaryOp, CPU, "IsFinite", functor::isfinite, float, Eigen::half,
.TypeConstraint<TYPE>("T"), \
UnaryOp<SYCLDevice, functor::isfinite<TYPE>>);
REGISTER_SYCL_KERNEL(float);
+REGISTER_SYCL_KERNEL(double);
#undef REGISTER_SYCL_KERNEL
#endif // TENSORFLOW_USE_SYCL
diff --git a/tensorflow/core/kernels/cwise_op_isinf.cc b/tensorflow/core/kernels/cwise_op_isinf.cc
index 675cb95b95..df63006b3f 100644
--- a/tensorflow/core/kernels/cwise_op_isinf.cc
+++ b/tensorflow/core/kernels/cwise_op_isinf.cc
@@ -26,6 +26,7 @@ REGISTER3(UnaryOp, CPU, "IsInf", functor::isinf, float, Eigen::half, double);
.TypeConstraint<TYPE>("T"), \
UnaryOp<SYCLDevice, functor::isinf<TYPE>>);
REGISTER_SYCL_KERNEL(float);
+REGISTER_SYCL_KERNEL(double);
#undef REGISTER_SYCL_KERNEL
#endif // TENSORFLOW_USE_SYCL
diff --git a/tensorflow/core/kernels/cwise_op_isnan.cc b/tensorflow/core/kernels/cwise_op_isnan.cc
index c394087ed8..e1cf7a8637 100644
--- a/tensorflow/core/kernels/cwise_op_isnan.cc
+++ b/tensorflow/core/kernels/cwise_op_isnan.cc
@@ -26,6 +26,7 @@ REGISTER3(UnaryOp, CPU, "IsNan", functor::isnan, float, Eigen::half, double);
.TypeConstraint<TYPE>("T"), \
UnaryOp<SYCLDevice, functor::isnan<TYPE>>);
REGISTER_SYCL_KERNEL(float);
+REGISTER_SYCL_KERNEL(double);
#undef REGISTER_SYCL_KERNEL
#endif // TENSORFLOW_USE_SYCL
diff --git a/tensorflow/core/kernels/cwise_op_less.cc b/tensorflow/core/kernels/cwise_op_less.cc
index 701007d637..a38f1024a9 100644
--- a/tensorflow/core/kernels/cwise_op_less.cc
+++ b/tensorflow/core/kernels/cwise_op_less.cc
@@ -33,5 +33,15 @@ REGISTER_KERNEL_BUILDER(Name("Less")
.TypeConstraint<int32>("T"),
BinaryOp<CPUDevice, functor::less<int32>>);
#endif
+#ifdef TENSORFLOW_USE_SYCL
+REGISTER3(BinaryOp, SYCL, "Less", functor::less, float, double, int64);
+REGISTER_KERNEL_BUILDER(Name("Less")
+ .Device(DEVICE_SYCL)
+ .HostMemory("x")
+ .HostMemory("y")
+ .HostMemory("z")
+ .TypeConstraint<int32>("T"),
+ BinaryOp<CPUDevice, functor::less<int32>>);
+#endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_less_equal.cc b/tensorflow/core/kernels/cwise_op_less_equal.cc
index 97fd1ae919..3a2cc2ae0e 100644
--- a/tensorflow/core/kernels/cwise_op_less_equal.cc
+++ b/tensorflow/core/kernels/cwise_op_less_equal.cc
@@ -34,4 +34,16 @@ REGISTER_KERNEL_BUILDER(Name("LessEqual")
BinaryOp<CPUDevice, functor::less_equal<int32>>);
#endif
+#ifdef TENSORFLOW_USE_SYCL
+REGISTER(BinaryOp, SYCL, "LessEqual", functor::less_equal, float);
+
+REGISTER_KERNEL_BUILDER(Name("LessEqual")
+ .Device(DEVICE_SYCL)
+ .HostMemory("x")
+ .HostMemory("y")
+ .HostMemory("z")
+ .TypeConstraint<int32>("T"),
+ BinaryOp<CPUDevice, functor::less_equal<int32>>);
+#endif // TENSORFLOW_USE_SYCL
+
} // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_log.cc b/tensorflow/core/kernels/cwise_op_log.cc
index 71c4588b3d..5e74e778c7 100644
--- a/tensorflow/core/kernels/cwise_op_log.cc
+++ b/tensorflow/core/kernels/cwise_op_log.cc
@@ -27,6 +27,7 @@ REGISTER5(UnaryOp, CPU, "Log", functor::log, float, Eigen::half, double,
.TypeConstraint<TYPE>("T"), \
UnaryOp<SYCLDevice, functor::log<TYPE>>);
REGISTER_SYCL_KERNEL(float);
+REGISTER_SYCL_KERNEL(double);
#undef REGISTER_SYCL_KERNEL
#endif // TENSORFLOW_USE_SYCL
diff --git a/tensorflow/core/kernels/cwise_op_log1p.cc b/tensorflow/core/kernels/cwise_op_log1p.cc
index 03ea3a0a89..edb821318e 100644
--- a/tensorflow/core/kernels/cwise_op_log1p.cc
+++ b/tensorflow/core/kernels/cwise_op_log1p.cc
@@ -27,6 +27,7 @@ REGISTER5(UnaryOp, CPU, "Log1p", functor::log1p, float, Eigen::half, double,
.TypeConstraint<TYPE>("T"), \
UnaryOp<SYCLDevice, functor::log1p<TYPE>>);
REGISTER_SYCL_KERNEL(float);
+REGISTER_SYCL_KERNEL(double);
#undef REGISTER_SYCL_KERNEL
#endif // TENSORFLOW_USE_SYCL
diff --git a/tensorflow/core/kernels/cwise_op_maximum.cc b/tensorflow/core/kernels/cwise_op_maximum.cc
index f93b5a8303..7311f25ec0 100644
--- a/tensorflow/core/kernels/cwise_op_maximum.cc
+++ b/tensorflow/core/kernels/cwise_op_maximum.cc
@@ -34,4 +34,19 @@ REGISTER_KERNEL_BUILDER(Name("Maximum")
BinaryOp<CPUDevice, functor::maximum<int32>>);
#endif
+#ifdef TENSORFLOW_USE_SYCL
+REGISTER(BinaryOp, SYCL, "Maximum", functor::maximum, float);
+
+// A special GPU kernel for int32.
+// TODO(b/25387198): Also enable int32 in device memory. This kernel
+// registration requires all int32 inputs and outputs to be in host memory.
+REGISTER_KERNEL_BUILDER(Name("Maximum")
+ .Device(DEVICE_SYCL)
+ .HostMemory("x")
+ .HostMemory("y")
+ .HostMemory("z")
+ .TypeConstraint<int32>("T"),
+ BinaryOp<CPUDevice, functor::maximum<int32>>);
+#endif // TENSORFLOW_USE_SYCL
+
} // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_minimum.cc b/tensorflow/core/kernels/cwise_op_minimum.cc
index 36800975a8..99e5a76620 100644
--- a/tensorflow/core/kernels/cwise_op_minimum.cc
+++ b/tensorflow/core/kernels/cwise_op_minimum.cc
@@ -34,4 +34,16 @@ REGISTER_KERNEL_BUILDER(Name("Minimum")
BinaryOp<CPUDevice, functor::minimum<int32>>);
#endif
+#ifdef TENSORFLOW_USE_SYCL
+REGISTER(BinaryOp, SYCL, "Minimum", functor::minimum, float);
+
+REGISTER_KERNEL_BUILDER(Name("Minimum")
+ .Device(DEVICE_SYCL)
+ .HostMemory("x")
+ .HostMemory("y")
+ .HostMemory("z")
+ .TypeConstraint<int32>("T"),
+ BinaryOp<CPUDevice, functor::minimum<int32>>);
+#endif // TENSORFLOW_USE_SYCL
+
} // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_mul_1.cc b/tensorflow/core/kernels/cwise_op_mul_1.cc
index e23fe6761d..5273522626 100644
--- a/tensorflow/core/kernels/cwise_op_mul_1.cc
+++ b/tensorflow/core/kernels/cwise_op_mul_1.cc
@@ -28,7 +28,15 @@ REGISTER5(BinaryOp, CPU, "Mul", functor::mul, float, Eigen::half, double,
.TypeConstraint<TYPE>("T"), \
BinaryOp<SYCLDevice, functor::mul<TYPE>>);
REGISTER_SYCL_KERNEL(float)
+REGISTER_SYCL_KERNEL(double)
#undef REGISTER_SYCL_KERNEL
+REGISTER_KERNEL_BUILDER(Name("Mul")
+ .Device(DEVICE_SYCL)
+ .HostMemory("x")
+ .HostMemory("y")
+ .HostMemory("z")
+ .TypeConstraint<int32>("T"),
+ BinaryOp<CPUDevice, functor::mul<int32>>);
#endif // TENSORFLOW_USE_SYCL
#if GOOGLE_CUDA
REGISTER4(BinaryOp, GPU, "Mul", functor::mul, float, Eigen::half, double,
diff --git a/tensorflow/core/kernels/cwise_op_pow.cc b/tensorflow/core/kernels/cwise_op_pow.cc
index 8eeba6ab14..f1780168e4 100644
--- a/tensorflow/core/kernels/cwise_op_pow.cc
+++ b/tensorflow/core/kernels/cwise_op_pow.cc
@@ -27,6 +27,7 @@ REGISTER7(BinaryOp, CPU, "Pow", functor::pow, float, Eigen::half, double, int32,
.TypeConstraint<TYPE>("T"), \
BinaryOp<SYCLDevice, functor::pow<TYPE>>);
REGISTER_SYCL_KERNEL(float);
+REGISTER_SYCL_KERNEL(double);
#undef REGISTER_SYCL_KERNEL
#endif // TENSORFLOW_USE_SYCL
diff --git a/tensorflow/core/kernels/cwise_op_reciprocal.cc b/tensorflow/core/kernels/cwise_op_reciprocal.cc
index d858a077f5..8c0e21f9cf 100644
--- a/tensorflow/core/kernels/cwise_op_reciprocal.cc
+++ b/tensorflow/core/kernels/cwise_op_reciprocal.cc
@@ -36,6 +36,9 @@ REGISTER5(UnaryOp, CPU, "Reciprocal", functor::inverse, float, Eigen::half,
REGISTER4(UnaryOp, GPU, "Reciprocal", functor::inverse, float, Eigen::half,
double, int64);
#endif
+#ifdef TENSORFLOW_USE_SYCL
+REGISTER(UnaryOp, SYCL, "Reciprocal", functor::inverse, float);
+#endif // TENSORFLOW_USE_SYCL
REGISTER5(SimpleBinaryOp, CPU, "ReciprocalGrad", functor::inverse_grad, float,
Eigen::half, double, complex64, complex128);
@@ -43,4 +46,7 @@ REGISTER5(SimpleBinaryOp, CPU, "ReciprocalGrad", functor::inverse_grad, float,
REGISTER3(SimpleBinaryOp, GPU, "ReciprocalGrad", functor::inverse_grad, float,
Eigen::half, double);
#endif
+#ifdef TENSORFLOW_USE_SYCL
+REGISTER(SimpleBinaryOp, SYCL, "ReciprocalGrad", functor::inverse_grad, float);
+#endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_round.cc b/tensorflow/core/kernels/cwise_op_round.cc
index 7a4482dbb2..e192f89782 100644
--- a/tensorflow/core/kernels/cwise_op_round.cc
+++ b/tensorflow/core/kernels/cwise_op_round.cc
@@ -20,9 +20,9 @@ REGISTER5(UnaryOp, CPU, "Round", functor::round, Eigen::half, float, double,
int32, int64);
#ifdef TENSORFLOW_USE_SYCL
-REGISTER(UnaryOp, SYCL, "Round", functor::round, float);
+REGISTER2(UnaryOp, SYCL, "Round", functor::round, float, double);
namespace functor {
-DEFINE_UNARY1(round, float);
+DEFINE_UNARY2(round, float, double);
} // namespace functor
#endif
diff --git a/tensorflow/core/kernels/cwise_op_rsqrt.cc b/tensorflow/core/kernels/cwise_op_rsqrt.cc
index 7dc96d47a6..f23725f48e 100644
--- a/tensorflow/core/kernels/cwise_op_rsqrt.cc
+++ b/tensorflow/core/kernels/cwise_op_rsqrt.cc
@@ -27,6 +27,7 @@ REGISTER5(UnaryOp, CPU, "Rsqrt", functor::rsqrt, float, Eigen::half, double,
.TypeConstraint<TYPE>("T"), \
UnaryOp<SYCLDevice, functor::rsqrt<TYPE>>);
REGISTER_SYCL_KERNEL(float);
+REGISTER_SYCL_KERNEL(double);
#undef REGISTER_SYCL_KERNEL
#endif // TENSORFLOW_USE_SYCL
diff --git a/tensorflow/core/kernels/cwise_op_select.cc b/tensorflow/core/kernels/cwise_op_select.cc
index add26b5ac8..709628da13 100644
--- a/tensorflow/core/kernels/cwise_op_select.cc
+++ b/tensorflow/core/kernels/cwise_op_select.cc
@@ -28,6 +28,10 @@ namespace tensorflow {
typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;
+#ifdef TENSORFLOW_USE_SYCL
+typedef Eigen::SyclDevice SYCLDevice;
+#endif // TENSORFLOW_USE_SYCL
+
template <typename Device, typename T>
class SelectOp : public OpKernel {
public:
@@ -169,12 +173,24 @@ REGISTER_SELECT_GPU(complex128);
#endif // GOOGLE_CUDA
+#ifdef TENSORFLOW_USE_SYCL
+// Registration of the SYCL implementations.
+#define REGISTER_SELECT_SYCL(type) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("Select").Device(DEVICE_SYCL).TypeConstraint<type>("T"), \
+ SelectOp<SYCLDevice, type>);
+
+REGISTER_SELECT_SYCL(float);
+REGISTER_SELECT_SYCL(int32);
+#undef REGISTER_SELECT_SYCL
+#endif // TENSORFLOW_USE_SYCL
+
namespace functor {
// CPU Specializations of Select functors.
-template <typename T>
-struct SelectFunctor<CPUDevice, T> {
- void operator()(const CPUDevice& d, typename TTypes<T>::Flat out,
+template <typename Device, typename T>
+struct SelectFunctorBase {
+ void operator()(const Device& d, typename TTypes<T>::Flat out,
typename TTypes<bool>::ConstFlat cond_flat,
typename TTypes<T>::ConstFlat then_flat,
typename TTypes<T>::ConstFlat else_flat) {
@@ -182,10 +198,18 @@ struct SelectFunctor<CPUDevice, T> {
}
};
-// CPU Specializations of Select functors with scalar
template <typename T>
-struct SelectScalarFunctor<CPUDevice, T> {
- void operator()(const CPUDevice& d, typename TTypes<T>::Flat out,
+struct SelectFunctor<CPUDevice, T>
+ : SelectFunctorBase<CPUDevice, T> {};
+#ifdef TENSORFLOW_USE_SYCL
+template <typename T>
+struct SelectFunctor<SYCLDevice, T>
+ : SelectFunctorBase<SYCLDevice, T> {};
+#endif // TENSORFLOW_USE_SYCL
+
+template <typename Device, typename T>
+struct SelectScalarFunctorBase {
+ void operator()(const Device& d, typename TTypes<T>::Flat out,
TTypes<bool>::ConstScalar cond,
typename TTypes<T>::ConstFlat then_flat,
typename TTypes<T>::ConstFlat else_flat) {
@@ -193,9 +217,19 @@ struct SelectScalarFunctor<CPUDevice, T> {
}
};
+// CPU Specializations of Select functors with scalar
template <typename T>
-struct BatchSelectFunctor<CPUDevice, T> {
- void operator()(const CPUDevice& d,
+struct SelectScalarFunctor<CPUDevice, T>
+ : SelectScalarFunctorBase<CPUDevice, T> {};
+#ifdef TENSORFLOW_USE_SYCL
+template <typename T>
+struct SelectScalarFunctor<SYCLDevice, T>
+ : SelectScalarFunctorBase<SYCLDevice, T> {};
+#endif // TENSORFLOW_USE_SYCL
+
+template <typename Device, typename T>
+struct BatchSelectFunctorBase {
+ void operator()(const Device& d,
typename TTypes<T>::Matrix output_flat_outer_dims,
TTypes<bool>::ConstVec cond_vec,
typename TTypes<T>::ConstMatrix then_flat_outer_dims,
@@ -220,6 +254,15 @@ struct BatchSelectFunctor<CPUDevice, T> {
}
};
+template <typename T>
+struct BatchSelectFunctor<CPUDevice, T>
+ : BatchSelectFunctorBase<CPUDevice, T> {};
+#ifdef TENSORFLOW_USE_SYCL
+template <typename T>
+struct BatchSelectFunctor<SYCLDevice, T>
+ : BatchSelectFunctorBase<SYCLDevice, T> {};
+#endif // TENSORFLOW_USE_SYCL
+
} // namespace functor
} // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_sigmoid.cc b/tensorflow/core/kernels/cwise_op_sigmoid.cc
index cc1f9b8f03..a76a088ac8 100644
--- a/tensorflow/core/kernels/cwise_op_sigmoid.cc
+++ b/tensorflow/core/kernels/cwise_op_sigmoid.cc
@@ -23,6 +23,9 @@ REGISTER5(UnaryOp, CPU, "Sigmoid", functor::sigmoid, float, Eigen::half, double,
REGISTER3(UnaryOp, GPU, "Sigmoid", functor::sigmoid, float, Eigen::half,
double);
#endif
+#ifdef TENSORFLOW_USE_SYCL
+REGISTER(UnaryOp, SYCL, "Sigmoid", functor::sigmoid, float);
+#endif // TENSORFLOW_USE_SYCL
REGISTER5(SimpleBinaryOp, CPU, "SigmoidGrad", functor::sigmoid_grad, float,
Eigen::half, double, complex64, complex128);
@@ -30,5 +33,8 @@ REGISTER5(SimpleBinaryOp, CPU, "SigmoidGrad", functor::sigmoid_grad, float,
REGISTER3(SimpleBinaryOp, GPU, "SigmoidGrad", functor::sigmoid_grad, float,
Eigen::half, double);
#endif
+#ifdef TENSORFLOW_USE_SYCL
+REGISTER(SimpleBinaryOp, SYCL, "SigmoidGrad", functor::sigmoid_grad, float);
+#endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_sign.cc b/tensorflow/core/kernels/cwise_op_sign.cc
index 568906612a..dedd414db5 100644
--- a/tensorflow/core/kernels/cwise_op_sign.cc
+++ b/tensorflow/core/kernels/cwise_op_sign.cc
@@ -33,4 +33,17 @@ REGISTER_KERNEL_BUILDER(Name("Sign")
UnaryOp<CPUDevice, functor::sign<int32>>);
#endif
+#ifdef TENSORFLOW_USE_SYCL
+REGISTER(UnaryOp, SYCL, "Sign", functor::sign, float);
+// A special GPU kernel for int32.
+// TODO(b/25387198): Also enable int32 in device memory. This kernel
+// registration requires all int32 inputs and outputs to be in host memory.
+REGISTER_KERNEL_BUILDER(Name("Sign")
+ .Device(DEVICE_SYCL)
+ .HostMemory("x")
+ .HostMemory("y")
+ .TypeConstraint<int32>("T"),
+ UnaryOp<CPUDevice, functor::sign<int32>>);
+#endif // TENSORFLOW_USE_SYCL
+
} // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_sin.cc b/tensorflow/core/kernels/cwise_op_sin.cc
index 8d0c0959f7..ab54c61b56 100644
--- a/tensorflow/core/kernels/cwise_op_sin.cc
+++ b/tensorflow/core/kernels/cwise_op_sin.cc
@@ -27,6 +27,7 @@ REGISTER5(UnaryOp, CPU, "Sin", functor::sin, float, Eigen::half, double,
.TypeConstraint<TYPE>("T"), \
UnaryOp<SYCLDevice, functor::sin<TYPE>>);
REGISTER_SYCL_KERNEL(float);
+REGISTER_SYCL_KERNEL(double);
#undef REGISTER_SYCL_KERNEL
#endif // TENSORFLOW_USE_SYC
diff --git a/tensorflow/core/kernels/cwise_op_sqrt.cc b/tensorflow/core/kernels/cwise_op_sqrt.cc
index 710001517b..55acf648db 100644
--- a/tensorflow/core/kernels/cwise_op_sqrt.cc
+++ b/tensorflow/core/kernels/cwise_op_sqrt.cc
@@ -27,8 +27,9 @@ REGISTER5(UnaryOp, CPU, "Sqrt", functor::sqrt, float, Eigen::half, double,
.TypeConstraint<TYPE>("T"), \
UnaryOp<SYCLDevice, functor::sqrt<TYPE>>);
REGISTER_SYCL_KERNEL(float);
+REGISTER_SYCL_KERNEL(double);
#undef REGISTER_SYCL_KERNEL
-#endif // TENSORFLOW_USE_SYC
+#endif // TENSORFLOW_USE_SYCL
#if GOOGLE_CUDA
REGISTER3(UnaryOp, GPU, "Sqrt", functor::sqrt, float, Eigen::half, double);
diff --git a/tensorflow/core/kernels/cwise_op_square.cc b/tensorflow/core/kernels/cwise_op_square.cc
index f867f127a7..afcacfec1c 100644
--- a/tensorflow/core/kernels/cwise_op_square.cc
+++ b/tensorflow/core/kernels/cwise_op_square.cc
@@ -27,6 +27,7 @@ REGISTER7(UnaryOp, CPU, "Square", functor::square, float, Eigen::half, double,
.TypeConstraint<TYPE>("T"), \
UnaryOp<SYCLDevice, functor::square<TYPE>>);
REGISTER_SYCL_KERNEL(float);
+REGISTER_SYCL_KERNEL(double);
#undef REGISTER_SYCL_KERNEL
#endif // TENSORFLOW_USE_SYC
diff --git a/tensorflow/core/kernels/cwise_op_tan.cc b/tensorflow/core/kernels/cwise_op_tan.cc
index ac49cad88f..9c850c9420 100644
--- a/tensorflow/core/kernels/cwise_op_tan.cc
+++ b/tensorflow/core/kernels/cwise_op_tan.cc
@@ -26,6 +26,7 @@ REGISTER2(UnaryOp, CPU, "Tan", functor::tan, float, double);
.TypeConstraint<TYPE>("T"), \
UnaryOp<SYCLDevice, functor::tan<TYPE>>);
REGISTER_SYCL_KERNEL(float);
+REGISTER_SYCL_KERNEL(double);
#undef REGISTER_SYCL_KERNEL
#endif // TENSORFLOW_USE_SYC
diff --git a/tensorflow/core/kernels/cwise_op_tanh.cc b/tensorflow/core/kernels/cwise_op_tanh.cc
index ae2c473e20..1dbc13061b 100644
--- a/tensorflow/core/kernels/cwise_op_tanh.cc
+++ b/tensorflow/core/kernels/cwise_op_tanh.cc
@@ -28,6 +28,7 @@ REGISTER5(UnaryOp, CPU, "Tanh", functor::tanh, float, Eigen::half, double,
.TypeConstraint<TYPE>("T"), \
UnaryOp<SYCLDevice, functor::tanh<TYPE>>);
REGISTER_SYCL_KERNEL(float);
+REGISTER_SYCL_KERNEL(double);
#undef REGISTER_SYCL_KERNEL
#endif // TENSORFLOW_USE_SYC
diff --git a/tensorflow/core/kernels/cwise_ops_gradients.h b/tensorflow/core/kernels/cwise_ops_gradients.h
index 671de380d3..77b330f589 100644
--- a/tensorflow/core/kernels/cwise_ops_gradients.h
+++ b/tensorflow/core/kernels/cwise_ops_gradients.h
@@ -171,6 +171,21 @@ struct SimpleBinaryFunctor<CPUDevice, Functor> {
}
};
+
+#ifdef TENSORFLOW_USE_SYCL
+// Partial specialization of BinaryFunctor for SYCL devices
+typedef Eigen::SyclDevice SYCLDevice;
+template <typename Functor>
+struct SimpleBinaryFunctor<SYCLDevice, Functor> {
+ void operator()(const SYCLDevice& d, typename Functor::tout_type out,
+ typename Functor::tin_type in0,
+ typename Functor::tin_type in1) {
+ out.device(d) = in0.binaryExpr(in1, typename Functor::func());
+ }
+};
+
+#endif // TENSORFLOW_USE_SYCL
+
template <typename T>
struct tanh_grad : base<T, Eigen::internal::scalar_tanh_gradient_op<T>> {};
diff --git a/tensorflow/core/kernels/cwise_ops_test.cc b/tensorflow/core/kernels/cwise_ops_test.cc
index 6250928aca..92018ec871 100644
--- a/tensorflow/core/kernels/cwise_ops_test.cc
+++ b/tensorflow/core/kernels/cwise_ops_test.cc
@@ -51,18 +51,38 @@ static int ColsFromArg(int arg) { return (arg % kRows); }
BENCHMARK(BM_##DEVICE##_##FUNC##_##TYPE)->Range(4 << 10, 1 << 20);
BM_UNARY(cpu, Floor, float, DT_FLOAT);
+#if GOOGLE_CUDA
BM_UNARY(gpu, Floor, float, DT_FLOAT);
+#endif // GOOGLE_CUDA
+#ifdef TENSORFLOW_USE_SYCL
+BM_UNARY(sycl, Floor, float, DT_FLOAT);
+#endif // TENSORFLOW_USE_SYCL
+
BM_UNARY(cpu, Floor, double, DT_DOUBLE);
+#if GOOGLE_CUDA
BM_UNARY(gpu, Floor, double, DT_DOUBLE);
+#endif // GOOGLE_CUDA
+#ifdef TENSORFLOW_USE_SYCL
+BM_UNARY(sycl, Floor, double, DT_DOUBLE);
+#endif // TENSORFLOW_USE_SYCL
+
BM_UNARY(cpu, Conj, std::complex<float>, DT_COMPLEX64);
+#if GOOGLE_CUDA
BM_UNARY(gpu, Conj, std::complex<float>, DT_COMPLEX64);
+#endif // GOOGLE_CUDA
BM_UNARY(cpu, Conj, std::complex<double>, DT_COMPLEX128);
+#if GOOGLE_CUDA
BM_UNARY(gpu, Conj, std::complex<double>, DT_COMPLEX128);
+#endif // GOOGLE_CUDA
BM_UNARY(cpu, Rint, double, DT_DOUBLE);
+#if GOOGLE_CUDA
BM_UNARY(gpu, Rint, double, DT_DOUBLE);
+#endif // GOOGLE_CUDA
BM_UNARY(cpu, Rint, float, DT_FLOAT);
+#if GOOGLE_CUDA
BM_UNARY(gpu, Rint, float, DT_FLOAT);
+#endif // GOOGLE_CUDA
// data func scalar.
static Graph* BinaryScalar(int num, const string& func) {
@@ -90,9 +110,20 @@ static Graph* BinaryScalar(int num, const string& func) {
->Arg(1048576);
BM_BINARY_SCALAR(cpu, Less);
+#if GOOGLE_CUDA
BM_BINARY_SCALAR(gpu, Less);
+#endif // GOOGLE_CUDA
+#ifdef TENSORFLOW_USE_SYCL
+BM_BINARY_SCALAR(sycl, Less);
+#endif // TENSORFLOW_USE_SYCL
+
BM_BINARY_SCALAR(cpu, Add);
+#if GOOGLE_CUDA
BM_BINARY_SCALAR(gpu, Add);
+#endif // GOOGLE_CUDA
+#ifdef TENSORFLOW_USE_SYCL
+BM_BINARY_SCALAR(sycl, Add);
+#endif // TENSORFLOW_USE_SYCL
#undef BM_BINARY_SCALAR
template <class T>
@@ -130,9 +161,13 @@ static Graph* BiasAdd(int rows, int cols, DataType type) {
using Eigen::half;
BM_BIAS_ADD_ALL(cpu, float, DT_FLOAT);
+#if GOOGLE_CUDA
BM_BIAS_ADD_ALL(gpu, float, DT_FLOAT);
+#endif // GOOGLE_CUDA
BM_BIAS_ADD_ALL(cpu, half, DT_HALF);
+#if GOOGLE_CUDA
BM_BIAS_ADD_ALL(gpu, half, DT_HALF);
+#endif // GOOGLE_CUDA
#undef BM_BIAS_ADD_ALL
#undef BM_BIAS_ADD
@@ -180,12 +215,18 @@ static Graph* BiasAddGrad(int rows, int cols, int channels, DataType type,
BM_BIAS_ADD_GRAD(DEVICE, FORMAT, C_TYPE, TF_TYPE, 4096, 4096, 1);
using Eigen::half;
+#if GOOGLE_CUDA
BM_BIAS_ADD_GRAD_ALL(gpu, NCHW, float, DT_FLOAT);
BM_BIAS_ADD_GRAD_ALL(gpu, NCHW, half, DT_HALF);
+#endif // GOOGLE_CUDA
BM_BIAS_ADD_GRAD_ALL(cpu, NHWC, float, DT_FLOAT);
+#if GOOGLE_CUDA
BM_BIAS_ADD_GRAD_ALL(gpu, NHWC, float, DT_FLOAT);
+#endif // GOOGLE_CUDA
BM_BIAS_ADD_GRAD_ALL(cpu, NHWC, half, DT_HALF);
+#if GOOGLE_CUDA
BM_BIAS_ADD_GRAD_ALL(gpu, NHWC, half, DT_HALF);
+#endif // GOOGLE_CUDA
#undef BM_BIAS_ADD_GRAD_ALL
#undef BM_BIAS_ADD_GRAD
@@ -223,7 +264,12 @@ static Graph* BcastAdd(int rows, int cols, int dim) {
BM_BCAST_ADD_ROW(DEVICE, 2048, 512); \
BM_BCAST_ADD_ROW(DEVICE, 4096, 512);
BM_BCAST_ADD_ROW_ALL(cpu);
+#if GOOGLE_CUDA
BM_BCAST_ADD_ROW_ALL(gpu);
+#endif // GOOGLE_CUDA
+#ifdef TENSORFLOW_USE_SYCL
+BM_BCAST_ADD_ROW_ALL(sycl);
+#endif // TENSORFLOW_USE_SYCL
#undef BM_BCAST_ADD_ROW_ALL
#undef BM_BCAST_ADD_ROW
@@ -244,7 +290,12 @@ BM_BCAST_ADD_ROW_ALL(gpu);
BM_BCAST_ADD_COL(DEVICE, 2048, 512); \
BM_BCAST_ADD_COL(DEVICE, 4096, 512);
BM_BCAST_ADD_COL_ALL(cpu);
+#if GOOGLE_CUDA
BM_BCAST_ADD_COL_ALL(gpu);
+#endif // GOOGLE_CUDA
+#ifdef TENSORFLOW_USE_SYCL
+BM_BCAST_ADD_COL_ALL(sycl);
+#endif // TENSORFLOW_USE_SYCL
#undef BM_BCAST_ADD_COL_ALL
#undef BM_BCAST_ADD_COL
diff --git a/tensorflow/core/kernels/debug_ops.cc b/tensorflow/core/kernels/debug_ops.cc
index e9ddd535ec..55a7657ea8 100644
--- a/tensorflow/core/kernels/debug_ops.cc
+++ b/tensorflow/core/kernels/debug_ops.cc
@@ -97,6 +97,7 @@ REGISTER_GPU_DEBUG_NAN_COUNT(double);
.TypeConstraint<type>("T"), \
DebugNanCountOp<type>);
REGISTER_GPU_DEBUG_NAN_COUNT(float);
+REGISTER_GPU_DEBUG_NAN_COUNT(double);
#endif // TENSORFLOW_USE_SYCL
// Register debug numeric summary ops.
@@ -133,6 +134,7 @@ TF_CALL_double(REGISTER_GPU_DEBUG_NUMERIC_SUMMARY_COUNT);
.TypeConstraint<type>("T"), \
DebugNumericSummaryOp<type>);
REGISTER_GPU_DEBUG_NUMERIC_SUMMARY_COUNT(float);
+REGISTER_GPU_DEBUG_NUMERIC_SUMMARY_COUNT(double);
#endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow
diff --git a/tensorflow/core/kernels/dense_update_ops.cc b/tensorflow/core/kernels/dense_update_ops.cc
index 42fe6e88c9..767f143727 100644
--- a/tensorflow/core/kernels/dense_update_ops.cc
+++ b/tensorflow/core/kernels/dense_update_ops.cc
@@ -152,6 +152,7 @@ typedef Eigen::SyclDevice SYCLDevice;
DenseUpdateOp<SYCLDevice, type, DenseUpdateType::SUB>);
REGISTER_SYCL_KERNEL(float);
+REGISTER_SYCL_KERNEL(double);
#undef REGISTER_SYCL_KERNEL
#endif
diff --git a/tensorflow/core/kernels/fill_functor.cc b/tensorflow/core/kernels/fill_functor.cc
index 08ec4baff3..0df8f9d3ed 100644
--- a/tensorflow/core/kernels/fill_functor.cc
+++ b/tensorflow/core/kernels/fill_functor.cc
@@ -62,6 +62,8 @@ void SetZeroFunctor<Eigen::SyclDevice, T>::operator()(
#define DEFINE_SETZERO_SYCL(T) \
template struct SetZeroFunctor<Eigen::SyclDevice, T>;
DEFINE_SETZERO_SYCL(float);
+DEFINE_SETZERO_SYCL(bool);
+DEFINE_SETZERO_SYCL(double);
#undef DEFINE_SETZERO_SYCL
#endif // TENSORFLOW_USE_SYCL
diff --git a/tensorflow/core/kernels/function_ops.cc b/tensorflow/core/kernels/function_ops.cc
index 9aa289c3c9..d08dec46d1 100644
--- a/tensorflow/core/kernels/function_ops.cc
+++ b/tensorflow/core/kernels/function_ops.cc
@@ -185,6 +185,34 @@ REGISTER_KERNEL_BUILDER(Name("_ArrayToList")
.TypeConstraint<int32>("T"),
PassOn);
+#ifdef TENSORFLOW_USE_SYCL
+#define REGISTER_SYCL_KERNELS(type) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("_ListToArray").Device(DEVICE_SYCL).TypeConstraint<type>("T"),\
+ PassOn); \
+ REGISTER_KERNEL_BUILDER( \
+ Name("_ArrayToList").Device(DEVICE_SYCL).TypeConstraint<type>("T"),\
+ PassOn);
+
+REGISTER_SYCL_KERNELS(float);
+REGISTER_SYCL_KERNELS(double);
+
+#undef REGISTER_SYCL_KERNELS
+
+REGISTER_KERNEL_BUILDER(Name("_ListToArray")
+ .Device(DEVICE_SYCL)
+ .HostMemory("input")
+ .HostMemory("output")
+ .TypeConstraint<int32>("T"),
+ PassOn);
+REGISTER_KERNEL_BUILDER(Name("_ArrayToList")
+ .Device(DEVICE_SYCL)
+ .HostMemory("input")
+ .HostMemory("output")
+ .TypeConstraint<int32>("T"),
+ PassOn);
+#endif // TENSORFLOW_USE_SYCL
+
class SymbolicGradientOp : public AsyncOpKernel {
public:
SymbolicGradientOp(OpKernelConstruction* ctx)
diff --git a/tensorflow/core/kernels/matmul_op.cc b/tensorflow/core/kernels/matmul_op.cc
index a2b0127fac..94fe22ed31 100644
--- a/tensorflow/core/kernels/matmul_op.cc
+++ b/tensorflow/core/kernels/matmul_op.cc
@@ -46,6 +46,9 @@ perftools::gputools::DeviceMemory<T> AsDeviceMemory(const T* cuda_memory) {
typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;
+#ifdef TENSORFLOW_USE_SYCL
+typedef Eigen::SyclDevice SYCLDevice;
+#endif // TENSORFLOW_USE_SYCL
template <typename Device, typename T, bool USE_CUBLAS>
struct LaunchMatMul;
@@ -118,27 +121,42 @@ bool ExplicitVectorMatrixOptimization<Eigen::half>(
return false;
}
-// On CPUs, we ignore USE_CUBLAS
-template <typename T>
-struct LaunchMatMulCPU {
+template <typename Device, typename T>
+struct LaunchMatMulBase {
static void launch(
OpKernelContext* ctx, OpKernel* kernel, const Tensor& a, const Tensor& b,
const Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1>& dim_pair,
Tensor* out) {
+#ifndef TENSORFLOW_USE_SYCL
// An explicit vector-matrix multiply is much better optimized than an
// implicit one and this is a bottleneck during non-batched inference.
bool was_vector = ExplicitVectorMatrixOptimization<T>(a, b, dim_pair, out);
if (!was_vector) {
- functor::MatMulFunctor<CPUDevice, T>()(ctx->eigen_device<CPUDevice>(),
+#endif // TENSORFLOW_USE_SYCL
+ functor::MatMulFunctor<Device, T>()(ctx->eigen_device<Device>(),
out->matrix<T>(), a.matrix<T>(),
b.matrix<T>(), dim_pair);
+#ifndef TENSORFLOW_USE_SYCL
}
+#endif // TENSORFLOW_USE_SYCL
}
};
+// On CPUs, we ignore USE_CUBLAS
+template <typename T>
+struct LaunchMatMulCPU : LaunchMatMulBase<CPUDevice, T> {};
+
template <typename T, bool USE_CUBLAS>
struct LaunchMatMul<CPUDevice, T, USE_CUBLAS> : public LaunchMatMulCPU<T> {};
+#ifdef TENSORFLOW_USE_SYCL
+template <typename T>
+struct LaunchMatMulSYCL : LaunchMatMulBase<SYCLDevice, T> {};
+
+template <typename T, bool USE_CUBLAS>
+struct LaunchMatMul<SYCLDevice, T, USE_CUBLAS> : public LaunchMatMulSYCL<T> {};
+#endif // TENSORFLOW_USE_SYCL
+
#if GOOGLE_CUDA
template <typename T>
@@ -256,6 +274,20 @@ struct MatMulFunctor<CPUDevice, T> {
}
};
+#ifdef TENSORFLOW_USE_SYCL
+// Partial specialization MatMulFunctor<Device=SYCLDevice, T>.
+template <typename T>
+struct MatMulFunctor<SYCLDevice, T> {
+ void operator()(
+ const SYCLDevice& d, typename MatMulTypes<T>::out_type out,
+ typename MatMulTypes<T>::in_type in0,
+ typename MatMulTypes<T>::in_type in1,
+ const Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1>& dim_pair) {
+ MatMul<SYCLDevice>(d, out, in0, in1, dim_pair);
+ }
+};
+#endif // TENSORFLOW_USE_SYCL
+
} // end namespace functor
#define REGISTER_CPU(T) \
@@ -276,6 +308,12 @@ struct MatMulFunctor<CPUDevice, T> {
.Label("cublas"), \
MatMulOp<GPUDevice, T, true /* cublas */>)
+#if defined (INTEL_MKL)
+// MKL does not support half and int32 types for matrix-multiplication, so
+// register the kernel to use default Eigen based implementations for these types
+TF_CALL_half(REGISTER_CPU);
+TF_CALL_int32(REGISTER_CPU);
+#else
TF_CALL_float(REGISTER_CPU);
TF_CALL_double(REGISTER_CPU);
TF_CALL_half(REGISTER_CPU);
@@ -283,6 +321,7 @@ TF_CALL_half(REGISTER_CPU);
TF_CALL_int32(REGISTER_CPU);
TF_CALL_complex64(REGISTER_CPU);
TF_CALL_complex128(REGISTER_CPU);
+#endif
#if GOOGLE_CUDA
TF_CALL_float(REGISTER_GPU);
@@ -294,4 +333,17 @@ TF_CALL_half(REGISTER_GPU);
#endif
#endif // GOOGLE_CUDA
+#ifdef TENSORFLOW_USE_SYCL
+#define REGISTER_SYCL(T) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("MatMul").Device(DEVICE_SYCL).TypeConstraint<T>("T"), \
+ MatMulOp<SYCLDevice, T, false /* xxblas */>); \
+ REGISTER_KERNEL_BUILDER(Name("MatMul") \
+ .Device(DEVICE_SYCL) \
+ .TypeConstraint<T>("T") \
+ .Label("eigen"), \
+ MatMulOp<SYCLDevice, T, false /* xxblas */>)
+TF_CALL_float(REGISTER_SYCL);
+
+#endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow
diff --git a/tensorflow/core/kernels/mkl_matmul_op.cc b/tensorflow/core/kernels/mkl_matmul_op.cc
new file mode 100644
index 0000000000..3ba28c13ed
--- /dev/null
+++ b/tensorflow/core/kernels/mkl_matmul_op.cc
@@ -0,0 +1,217 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// See docs in ../ops/math_ops.cc.
+
+// This file uses MKL CBLAS xGEMM for acceleration of TF Matrix-Matrix
+// Multiplication (MatMul) operations.
+// We currently register this kernel only for MKL supported data
+// types (float, double, complex64, complex128). The macro INTEL_MKL is defined
+// by the build system only when MKL is chosen as an option at configure stage
+// and when it is undefined at build time, this file becomes an empty
+// compilation unit
+
+#if defined(INTEL_MKL)
+
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/kernels/fill_functor.h"
+#include "third_party/mkl/include/mkl_cblas.h"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+
+template <typename Device, typename T, bool USE_CUBLAS>
+class MklMatMulOp : public OpKernel {
+ public:
+ explicit MklMatMulOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+ OP_REQUIRES_OK(ctx, ctx->GetAttr("transpose_a", &transpose_a_));
+ OP_REQUIRES_OK(ctx, ctx->GetAttr("transpose_b", &transpose_b_));
+ }
+
+ void Compute(OpKernelContext* ctx) override {
+ const Tensor& a = ctx->input(0);
+ const Tensor& b = ctx->input(1);
+
+ // Check that the dimensions of the two matrices are valid.
+ OP_REQUIRES(ctx, TensorShapeUtils::IsMatrix(a.shape()),
+ errors::InvalidArgument("In[0] is not a matrix"));
+ OP_REQUIRES(ctx, TensorShapeUtils::IsMatrix(b.shape()),
+ errors::InvalidArgument("In[1] is not a matrix"));
+ Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1> dim_pair;
+ dim_pair[0].first = transpose_a_ ? 0 : 1;
+ dim_pair[0].second = transpose_b_ ? 1 : 0;
+
+ OP_REQUIRES(ctx,
+ a.dim_size(dim_pair[0].first) == b.dim_size(dim_pair[0].second),
+ errors::InvalidArgument("Matrix size-incompatible: In[0]: ",
+ a.shape().DebugString(), ", In[1]: ",
+ b.shape().DebugString()));
+ int a_dim_remaining = 1 - dim_pair[0].first;
+ int b_dim_remaining = 1 - dim_pair[0].second;
+ TensorShape out_shape(
+ {a.dim_size(a_dim_remaining), b.dim_size(b_dim_remaining)});
+ Tensor* out = nullptr;
+ OP_REQUIRES_OK(ctx, ctx->allocate_output(0, out_shape, &out));
+
+ if (out->NumElements() == 0) {
+ // If a has shape [0, x] or b has shape [x, 0], the output shape
+ // is a 0-element matrix, so there is nothing to do.
+ return;
+ }
+
+ if (a.NumElements() == 0 || b.NumElements() == 0) {
+ // If a has shape [x, 0] and b has shape [0, y], the
+ // output shape is [x, y] where x and y are non-zero, so we fill
+ // the output with zeros.
+ functor::SetZeroFunctor<Device, T> f;
+ f(ctx->eigen_device<Device>(), out->flat<T>());
+ return;
+ }
+
+ const int m = a.dim_size(1 - dim_pair[0].first);
+ const int k = a.dim_size(dim_pair[0].first);
+ const int n = b.dim_size(1 - dim_pair[0].second);
+ bool transpose_a = dim_pair[0].first == 0;
+ bool transpose_b = dim_pair[0].second == 1;
+
+ auto a_ptr = (a.template flat<T>().data());
+ auto b_ptr = (b.template flat<T>().data());
+ auto c_ptr = (out->template flat<T>().data());
+
+ MklBlasGemm(transpose_a, transpose_b, m, n, k, a_ptr, transpose_a ? m : k,
+ b_ptr, transpose_b ? k : n, c_ptr, n);
+ }
+
+ private:
+ bool transpose_a_;
+ bool transpose_b_;
+
+ // --------------------------------------------------------------------------
+ //
+ // @brief Matrix-Matrix Multiplication with FP32 tensors, a, b, c using CBLAS
+ // interface. c = op(a) * op(b)
+ //
+ // @param transa Specifies the form of op(a) used in MatMul. If transa is
+ // true, then op(a) = a^T, otherwise op(a) = a
+ //
+ // @param transb Specifies the form of op(b) used in MatMul. If transb is
+ // true, then op(b) = b^T, otherwise op(b) = b
+ //
+ // @param m Specifies the number of rows of the matrix op(a) and of the
+ // matrix c. The value of m must be at least zero.
+ //
+ // @param n Specifies the number of columns of the matrix op(b) and the
+ // number of columns of the matrix c. The value of n must be at least zero.
+ //
+ // @param k Specifies the number of columns of the matrix op(a) and the
+ // number of rows of the matrix op(b)
+ //
+ // @param a Address of matrix a
+ //
+ // @param lda Leading dimension of 'a' matrix. This is set at calling site
+ // depending on transa parameter. Since TF uses row-major
+ // layout, leading dimension is the stride between consecutive rows
+ // lda = max(1,k) when transa is false, otherwise lda = max(1,m)
+ //
+ // @param b Address of matrix b
+ //
+ // @param ldb Leading dimension of 'b' matrix. This is set at calling site
+ // depending on transb parameter. Since TF uses row-major
+ // layout, leading dimension is the stride between consecutive rows
+ // ldb = max(1,n) when transb is false, otherwise ldb = max(1,k)
+ //
+ // @param c Address of matrix c
+ //
+ // @param ldc Leading dimension of 'c' matrix. Since TF uses row-major
+ // layout, leading dimension is the stride between consecutive rows, max(1,n)
+ //
+ // --------------------------------------------------------------------------
+ void MklBlasGemm(bool transa, bool transb, const int m, const int n,
+ const int k, const float* a, const int lda, const float* b,
+ const int ldb, float* c, const int ldc) {
+ // BLAS GEMM API defines Matrix Multiplication as c = alpha * op(a) * op(b)
+ // + beta * c.
+ // Since TF MatMul does not have parameters for alpha, beta, we set them to
+ // 1.0 and 0.0 respectively.
+ const float alpha = 1.0f;
+ const float beta = 0.0f;
+ cblas_sgemm(CblasRowMajor, transa ? CblasTrans : CblasNoTrans,
+ transb ? CblasTrans : CblasNoTrans, m, n, k, alpha, a, lda, b,
+ ldb, beta, c, ldc);
+ }
+
+ // Matrix-Matrix Multiplication with FP64 tensors. For detailed info about
+ // parameters, look at FP32 function description.
+ void MklBlasGemm(bool transa, bool transb, const int m, const int n,
+ const int k, const double* a, const int lda, const double* b,
+ const int ldb, double* c, const int ldc) {
+ const double alpha = 1.0;
+ const double beta = 0.0;
+ cblas_dgemm(CblasRowMajor, transa ? CblasTrans : CblasNoTrans,
+ transb ? CblasTrans : CblasNoTrans, m, n, k, alpha, a, lda, b,
+ ldb, beta, c, ldc);
+ }
+
+ // Matrix-Matrix Multiplication with Complex64 (std::complex<float>) tensors.
+ // For detailed info about parameters, look at FP32 function description.
+ void MklBlasGemm(bool transa, bool transb, const int m, const int n,
+ const int k, const std::complex<float>* a, const int lda,
+ const std::complex<float>* b, const int ldb,
+ std::complex<float>* c, int const ldc) {
+ const MKL_Complex8 alpha = {1.0f, 0.0f};
+ const MKL_Complex8 beta = {0.0f, 0.0f};
+ cblas_cgemm(CblasRowMajor, transa ? CblasTrans : CblasNoTrans,
+ transb ? CblasTrans : CblasNoTrans, m, n, k,
+ static_cast<const void*>(&alpha), static_cast<const void*>(a),
+ lda, static_cast<const void*>(b), ldb,
+ static_cast<const void*>(&beta), static_cast<void*>(c), ldc);
+ }
+
+ // Matrix-Matrix Multiplication with Complex128 (std::complex<double>)
+ // tensors. For detailed info about parameters, look at FP32 function
+ // description.
+ void MklBlasGemm(bool transa, bool transb, const int m, const int n,
+ const int k, const std::complex<double>* a, const int lda,
+ const std::complex<double>* b, const int ldb,
+ std::complex<double>* c, const int ldc) {
+ const MKL_Complex16 alpha = {1.0, 0.0};
+ const MKL_Complex16 beta = {0.0, 0.0};
+ cblas_zgemm(CblasRowMajor, transa ? CblasTrans : CblasNoTrans,
+ transb ? CblasTrans : CblasNoTrans, m, n, k,
+ static_cast<const void*>(&alpha), static_cast<const void*>(a),
+ lda, static_cast<const void*>(b), ldb,
+ static_cast<const void*>(&beta), static_cast<void*>(c), ldc);
+ }
+};
+
+#define REGISTER_CPU(T) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("MatMul").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
+ MklMatMulOp<CPUDevice, T, false /* cublas, ignored for CPU */>); \
+ REGISTER_KERNEL_BUILDER( \
+ Name("MatMul").Device(DEVICE_CPU).TypeConstraint<T>("T").Label("MKL"), \
+ MklMatMulOp<CPUDevice, T, false /* cublas, ignored for CPU */>)
+
+// TODO:Consider template specialization when adding/removing additional types
+TF_CALL_float(REGISTER_CPU);
+TF_CALL_double(REGISTER_CPU);
+TF_CALL_complex64(REGISTER_CPU);
+TF_CALL_complex128(REGISTER_CPU);
+
+} // namespace tensorflow
+#endif // INTEL_MKL
diff --git a/tensorflow/core/kernels/pack_op.cc b/tensorflow/core/kernels/pack_op.cc
index 4977ad1d7c..a6650f369b 100644
--- a/tensorflow/core/kernels/pack_op.cc
+++ b/tensorflow/core/kernels/pack_op.cc
@@ -167,6 +167,7 @@ REGISTER_KERNEL_BUILDER(Name("Pack")
PackOp<SYCLDevice, type>)
REGISTER_SYCL(float);
+REGISTER_SYCL(double);
#undef REGISTER_SYCL
// A special GPU kernel for int32.
diff --git a/tensorflow/core/kernels/pad_op.cc b/tensorflow/core/kernels/pad_op.cc
index bec2d02cb5..91984319c6 100644
--- a/tensorflow/core/kernels/pad_op.cc
+++ b/tensorflow/core/kernels/pad_op.cc
@@ -38,6 +38,9 @@ namespace tensorflow {
typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;
+#ifdef TENSORFLOW_USE_SYCL
+typedef Eigen::SyclDevice SYCLDevice;
+#endif // TENSORFLOW_USE_SYCL
template <typename Device, typename T>
class PadOp : public OpKernel {
@@ -199,4 +202,30 @@ REGISTER_KERNEL_BUILDER(Name("Pad")
PadOp<CPUDevice, int32>);
#endif
+#ifdef TENSORFLOW_USE_SYCL
+// Registration of the GPU implementations.
+#define REGISTER_SYCL_KERNEL(T) \
+ REGISTER_KERNEL_BUILDER(Name("Pad") \
+ .Device(DEVICE_SYCL) \
+ .TypeConstraint<T>("T") \
+ .TypeConstraint<int32>("Tpaddings") \
+ .HostMemory("paddings"), \
+ PadOp<SYCLDevice, T>)
+
+REGISTER_SYCL_KERNEL(float);
+REGISTER_SYCL_KERNEL(double);
+
+// A special GPU kernel for int32.
+// TODO(b/25387198): Also enable int32 in device memory. This kernel
+// registration requires all int32 inputs and outputs to be in host memory.
+REGISTER_KERNEL_BUILDER(Name("Pad")
+ .Device(DEVICE_SYCL)
+ .TypeConstraint<int32>("T")
+ .TypeConstraint<int32>("Tpaddings")
+ .HostMemory("input")
+ .HostMemory("paddings")
+ .HostMemory("output"),
+ PadOp<CPUDevice, int32>);
+#endif // TENSORFLOW_USE_SYCL
+
} // end namespace tensorflow
diff --git a/tensorflow/core/kernels/reduction_ops_common.h b/tensorflow/core/kernels/reduction_ops_common.h
index 1bc7e14187..0d309c2185 100644
--- a/tensorflow/core/kernels/reduction_ops_common.h
+++ b/tensorflow/core/kernels/reduction_ops_common.h
@@ -268,6 +268,31 @@ struct ReduceFunctor<CPUDevice, Reducer>
template <typename Reducer>
struct ReduceFunctor<SYCLDevice, Reducer>
: ReduceFunctorBase<SYCLDevice, Reducer>{};
+
+template <typename T>
+struct ReduceFunctor<SYCLDevice, Eigen::internal::MeanReducer<T> > {
+ template <typename OUT_T, typename IN_T, typename ReductionAxes>
+ static void Reduce(const SYCLDevice& d, OUT_T out, IN_T in,
+ const ReductionAxes& reduction_axes,
+ const Eigen::internal::MeanReducer<T>& reducer) {
+ typedef typename IN_T::Index Index;
+ // Eigen sum reductions are much faster on GPU than mean reductions:
+ // Simply trigger them by computing the sum of the weighted inputs.
+ Index num_coeffs_to_reduce = 1;
+ for (int i = 0; i < Eigen::internal::array_size<ReductionAxes>::value;
+ ++i) {
+ num_coeffs_to_reduce *= in.dimension(reduction_axes[i]);
+ }
+ T scale = T(1.0) / num_coeffs_to_reduce;
+ out.device(d) = (in * scale).sum(reduction_axes);
+ }
+
+ template <typename OUT_T>
+ static void FillIdentity(const SYCLDevice& d, OUT_T out,
+ const Eigen::internal::MeanReducer<T>& reducer) {
+ FillIdentityEigenImpl(d, out, reducer);
+ }
+};
#endif // TENSORFLOW_USE_SYCL
} // namespace functor
diff --git a/tensorflow/core/kernels/reduction_ops_max.cc b/tensorflow/core/kernels/reduction_ops_max.cc
index db86157c8e..5ab97d1eee 100644
--- a/tensorflow/core/kernels/reduction_ops_max.cc
+++ b/tensorflow/core/kernels/reduction_ops_max.cc
@@ -57,4 +57,27 @@ REGISTER_KERNEL_BUILDER(
#endif
+#ifdef TENSORFLOW_USE_SYCL
+#define REGISTER_SYCL_KERNELS(type) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("Max") \
+ .Device(DEVICE_SYCL) \
+ .TypeConstraint<type>("T") \
+ .TypeConstraint<int32>("Tidx") \
+ .HostMemory("reduction_indices"), \
+ ReductionOp<SYCLDevice, type, Eigen::internal::MaxReducer<type>>);
+REGISTER_SYCL_KERNELS(float);
+#undef REGISTER_SYCL_KERNELS
+
+REGISTER_KERNEL_BUILDER(
+ Name("Max")
+ .Device(DEVICE_SYCL)
+ .HostMemory("reduction_indices")
+ .HostMemory("input")
+ .HostMemory("output")
+ .TypeConstraint<int32>("T")
+ .TypeConstraint<int32>("Tidx"),
+ ReductionOp<CPUDevice, int32, Eigen::internal::MaxReducer<int32>>);
+#endif // TENSORFLOW_USE_SYCL
+
} // namespace tensorflow
diff --git a/tensorflow/core/kernels/reduction_ops_mean.cc b/tensorflow/core/kernels/reduction_ops_mean.cc
index fef3cd0699..e018cb55dd 100644
--- a/tensorflow/core/kernels/reduction_ops_mean.cc
+++ b/tensorflow/core/kernels/reduction_ops_mean.cc
@@ -44,4 +44,17 @@ REGISTER_GPU_KERNELS(double);
#endif
+#ifdef TENSORFLOW_USE_SYCL
+#define REGISTER_SYCL_KERNELS(type) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("Mean") \
+ .Device(DEVICE_SYCL) \
+ .TypeConstraint<type>("T") \
+ .TypeConstraint<int32>("Tidx") \
+ .HostMemory("reduction_indices"), \
+ ReductionOp<SYCLDevice, type, Eigen::internal::MeanReducer<type>>);
+REGISTER_SYCL_KERNELS(float);
+#undef REGISTER_SYCL_KERNELS
+#endif // TENSORFLOW_USE_SYCL
+
} // namespace tensorflow
diff --git a/tensorflow/core/kernels/reduction_ops_min.cc b/tensorflow/core/kernels/reduction_ops_min.cc
index c362bc8867..ec240421b9 100644
--- a/tensorflow/core/kernels/reduction_ops_min.cc
+++ b/tensorflow/core/kernels/reduction_ops_min.cc
@@ -57,4 +57,27 @@ REGISTER_KERNEL_BUILDER(
#endif
+#ifdef TENSORFLOW_USE_SYCL
+#define REGISTER_SYCL_KERNELS(type) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("Min") \
+ .Device(DEVICE_SYCL) \
+ .TypeConstraint<type>("T") \
+ .TypeConstraint<int32>("Tidx") \
+ .HostMemory("reduction_indices"), \
+ ReductionOp<SYCLDevice, type, Eigen::internal::MinReducer<type>>);
+REGISTER_SYCL_KERNELS(float);
+#undef REGISTER_SYCL_KERNELS
+
+REGISTER_KERNEL_BUILDER(
+ Name("Min")
+ .Device(DEVICE_SYCL)
+ .HostMemory("reduction_indices")
+ .HostMemory("input")
+ .HostMemory("output")
+ .TypeConstraint<int32>("T")
+ .TypeConstraint<int32>("Tidx"),
+ ReductionOp<CPUDevice, int32, Eigen::internal::MinReducer<int32>>);
+#endif // TENSORFLOW_USE_SYCL
+
} // namespace tensorflow
diff --git a/tensorflow/core/kernels/reduction_ops_prod.cc b/tensorflow/core/kernels/reduction_ops_prod.cc
index c6aff8c2ed..e04c655dab 100644
--- a/tensorflow/core/kernels/reduction_ops_prod.cc
+++ b/tensorflow/core/kernels/reduction_ops_prod.cc
@@ -45,4 +45,28 @@ REGISTER_GPU_KERNELS(double);
#endif
+#ifdef TENSORFLOW_USE_SYCL
+#define REGISTER_SYCL_KERNELS(type) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("Prod") \
+ .Device(DEVICE_SYCL) \
+ .TypeConstraint<type>("T") \
+ .TypeConstraint<int32>("Tidx") \
+ .HostMemory("reduction_indices"), \
+ ReductionOp<SYCLDevice, type, Eigen::internal::ProdReducer<type>>);
+REGISTER_SYCL_KERNELS(float);
+REGISTER_SYCL_KERNELS(double);
+#undef REGISTER_SYCL_KERNELS
+
+REGISTER_KERNEL_BUILDER(
+ Name("Prod")
+ .Device(DEVICE_SYCL)
+ .TypeConstraint<int32>("T")
+ .TypeConstraint<int32>("Tidx")
+ .HostMemory("input")
+ .HostMemory("output")
+ .HostMemory("reduction_indices"),
+ ReductionOp<CPUDevice, int32, Eigen::internal::ProdReducer<int32>>);
+#endif // TENSORFLOW_USE_SYCL
+
} // namespace tensorflow
diff --git a/tensorflow/core/kernels/reduction_ops_sum.cc b/tensorflow/core/kernels/reduction_ops_sum.cc
index 3aa38f418e..938ca66a0c 100644
--- a/tensorflow/core/kernels/reduction_ops_sum.cc
+++ b/tensorflow/core/kernels/reduction_ops_sum.cc
@@ -74,7 +74,6 @@ REGISTER_KERNEL_BUILDER(
.HostMemory("reduction_indices"), \
ReductionOp<SYCLDevice, type, Eigen::internal::SumReducer<type>>);
REGISTER_SYCL_KERNELS(float);
-REGISTER_SYCL_KERNELS(double);
#undef REGISTER_SYCL_KERNELS
// A special GPU kernel for int32.
diff --git a/tensorflow/core/kernels/relu_op.cc b/tensorflow/core/kernels/relu_op.cc
index f24a71ec8c..d70398bea5 100644
--- a/tensorflow/core/kernels/relu_op.cc
+++ b/tensorflow/core/kernels/relu_op.cc
@@ -29,6 +29,9 @@ namespace tensorflow {
typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;
+#ifdef TENSORFLOW_USE_SYCL
+typedef Eigen::SyclDevice SYCLDevice;
+#endif // TENSORFLOW_USE_SYCL
#define REGISTER_RELU_KERNELS(type) \
REGISTER_KERNEL_BUILDER( \
@@ -131,4 +134,30 @@ TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS);
#endif // GOOGLE_CUDA
+#ifdef TENSORFLOW_USE_SYCL
+// Registration of the GPU implementations.
+#define REGISTER_SYCL_KERNELS(type) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("Relu").Device(DEVICE_SYCL).TypeConstraint<type>("T"), \
+ ReluOp<SYCLDevice, type>); \
+ REGISTER_KERNEL_BUILDER( \
+ Name("ReluGrad").Device(DEVICE_SYCL).TypeConstraint<type>("T"), \
+ ReluGradOp<SYCLDevice, type>); \
+ REGISTER_KERNEL_BUILDER( \
+ Name("Relu6").Device(DEVICE_SYCL).TypeConstraint<type>("T"), \
+ Relu6Op<SYCLDevice, type>); \
+ REGISTER_KERNEL_BUILDER( \
+ Name("Relu6Grad").Device(DEVICE_SYCL).TypeConstraint<type>("T"), \
+ Relu6GradOp<SYCLDevice, type>); \
+ REGISTER_KERNEL_BUILDER( \
+ Name("Elu").Device(DEVICE_SYCL).TypeConstraint<type>("T"), \
+ EluOp<SYCLDevice, type>); \
+ REGISTER_KERNEL_BUILDER( \
+ Name("EluGrad").Device(DEVICE_SYCL).TypeConstraint<type>("T"), \
+ EluGradOp<SYCLDevice, type>)
+
+REGISTER_SYCL_KERNELS(float);
+#undef REGISTER_SYCL_KERNELS
+#endif // TENSORFLOW_USE_SYCL
+
} // namespace tensorflow
diff --git a/tensorflow/core/kernels/relu_op.h b/tensorflow/core/kernels/relu_op.h
index 365c6201a5..e2e0bd48dd 100644
--- a/tensorflow/core/kernels/relu_op.h
+++ b/tensorflow/core/kernels/relu_op.h
@@ -175,6 +175,10 @@ void EluGradOp<Device, T>::OperateNoTemplate(OpKernelContext* context,
} // namespace tensorflow
+#ifdef TENSORFLOW_USE_SYCL
+#undef EIGEN_USE_SYCL
+#endif // TENSORFLOW_USE_SYCL
+
#undef EIGEN_USE_THREADS
#endif // TENSORFLOW_KERNELS_RELU_OP_H_
diff --git a/tensorflow/core/kernels/resize_bicubic_op.cc b/tensorflow/core/kernels/resize_bicubic_op.cc
index 5df36ef4cd..c5c805c44f 100644
--- a/tensorflow/core/kernels/resize_bicubic_op.cc
+++ b/tensorflow/core/kernels/resize_bicubic_op.cc
@@ -130,7 +130,7 @@ class CachedInterpolation {
}
// We use 2 hands and walk through, copying from one to another where
// we already have values.
- // Invarient, new_indicies_hand <= cached_values_hand
+ // Invariant, new_indicies_hand <= cached_values_hand
const std::array<int64, 4> new_x_indices{{x_0, x_1, x_2, x_3}};
int cached_values_hand = 0;
int new_indicies_hand = 0;
diff --git a/tensorflow/core/kernels/reverse_op.cc b/tensorflow/core/kernels/reverse_op.cc
index ec3b27ed6f..24b3ba31b8 100644
--- a/tensorflow/core/kernels/reverse_op.cc
+++ b/tensorflow/core/kernels/reverse_op.cc
@@ -33,6 +33,9 @@ namespace tensorflow {
typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;
+#ifdef TENSORFLOW_USE_SYCL
+typedef Eigen::SyclDevice SYCLDevice;
+#endif // TENSORFLOW_USE_SYCL
namespace {
@@ -344,4 +347,36 @@ REGISTER_KERNEL_BUILDER(Name("ReverseV2")
ReverseV2Op<CPUDevice, int32>);
#endif // GOOGLE_CUDA
+#ifdef TENSORFLOW_USE_SYCL
+#define REGISTER_SYCL_KERNELS(T) \
+ REGISTER_KERNEL_BUILDER(Name("Reverse") \
+ .Device(DEVICE_SYCL) \
+ .TypeConstraint<T>("T") \
+ .HostMemory("dims"), \
+ ReverseOp<SYCLDevice, T>) \
+ REGISTER_KERNEL_BUILDER(Name("ReverseV2") \
+ .Device(DEVICE_SYCL) \
+ .TypeConstraint<T>("T") \
+ .TypeConstraint<int32>("Tidx") \
+ .HostMemory("axis"), \
+ ReverseV2Op<SYCLDevice, T>)
+TF_CALL_float(REGISTER_SYCL_KERNELS);
+
+REGISTER_KERNEL_BUILDER(Name("Reverse")
+ .Device(DEVICE_SYCL)
+ .TypeConstraint<int32>("T")
+ .HostMemory("tensor")
+ .HostMemory("dims")
+ .HostMemory("output"),
+ ReverseOp<CPUDevice, int32>);
+REGISTER_KERNEL_BUILDER(Name("ReverseV2")
+ .Device(DEVICE_SYCL)
+ .TypeConstraint<int32>("T")
+ .TypeConstraint<int32>("Tidx")
+ .HostMemory("tensor")
+ .HostMemory("axis")
+ .HostMemory("output"),
+ ReverseV2Op<CPUDevice, int32>);
+#endif // TENSORFLOW_USE_SYCL
+
} // namespace tensorflow
diff --git a/tensorflow/core/kernels/scatter_op.cc b/tensorflow/core/kernels/scatter_op.cc
index 827eb7dbca..51dad49cfe 100644
--- a/tensorflow/core/kernels/scatter_op.cc
+++ b/tensorflow/core/kernels/scatter_op.cc
@@ -180,8 +180,8 @@ TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_UPDATE_GPU);
#define REGISTER_SCATTER_UPDATE_SYCL(type) REGISTER_SCATTER_UPDATE(type, SYCL);
-REGISTER_SCATTER_ARITHEMTIC_SYCL(float);
-REGISTER_SCATTER_UPDATE_SYCL(float);
+TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_ARITHEMTIC_SYCL);
+TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_UPDATE_SYCL);
#undef REGISTER_SCATTER_ARITHEMTIC_SYCL
#undef REGISTER_SCATTER_UPDATE_SYCL
diff --git a/tensorflow/core/kernels/sequence_ops.cc b/tensorflow/core/kernels/sequence_ops.cc
index c24ecdf8b9..c8ea923020 100644
--- a/tensorflow/core/kernels/sequence_ops.cc
+++ b/tensorflow/core/kernels/sequence_ops.cc
@@ -92,9 +92,11 @@ class RangeOp : public OpKernel {
#ifdef TENSORFLOW_USE_SYCL
#define REGISTER_SYCL_KERNEL(T) REGISTER_KERNEL(DEVICE_SYCL, T)
TF_CALL_float(REGISTER_SYCL_KERNEL);
+TF_CALL_double(REGISTER_SYCL_KERNEL);
TF_CALL_int32(REGISTER_SYCL_KERNEL);
TF_CALL_int64(REGISTER_SYCL_KERNEL);
-#endif // TENSORFLOW_USE_SYCL
+#undef REGISTER_SYCL_KERNEL
+#endif // TENSORFLOW_USE_SYCL
TF_CALL_float(REGISTER_CPU_KERNEL);
TF_CALL_double(REGISTER_CPU_KERNEL);
@@ -170,4 +172,9 @@ TF_CALL_double(REGISTER_CPU_KERNEL);
TF_CALL_float(REGISTER_GPU_KERNEL);
TF_CALL_double(REGISTER_GPU_KERNEL);
+#ifdef TENSORFLOW_USE_SYCL
+#define REGISTER_SYCL_KERNEL(T) REGISTER_KERNEL(DEVICE_SYCL, T)
+TF_CALL_float(REGISTER_SYCL_KERNEL);
+TF_CALL_double(REGISTER_SYCL_KERNEL);
+#endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow
diff --git a/tensorflow/core/kernels/shape_ops.cc b/tensorflow/core/kernels/shape_ops.cc
index 6bc0b4560b..177a32464b 100644
--- a/tensorflow/core/kernels/shape_ops.cc
+++ b/tensorflow/core/kernels/shape_ops.cc
@@ -201,6 +201,7 @@ REGISTER_KERNEL_BUILDER(Name("Rank").Device(DEVICE_CPU).HostMemory("output"),
.HostMemory("output"), \
RankOp);
REGISTER_SYCL_KERNEL(float);
+REGISTER_SYCL_KERNEL(double);
#undef REGISTER_SYCL_KERNEL
// A special GPU kernel for int32 and bool.
@@ -297,6 +298,43 @@ REGISTER_KERNEL_BUILDER(Name("Size")
SizeOp<int64>);
#endif
+#ifdef TENSORFLOW_USE_SYCL
+#define REGISTER_SYCL_KERNEL(type) \
+ REGISTER_KERNEL_BUILDER(Name("Size") \
+ .Device(DEVICE_SYCL) \
+ .TypeConstraint<type>("T") \
+ .TypeConstraint<int32>("out_type") \
+ .HostMemory("output"), \
+ SizeOp<int32>); \
+ REGISTER_KERNEL_BUILDER(Name("Size") \
+ .Device(DEVICE_SYCL) \
+ .TypeConstraint<type>("T") \
+ .TypeConstraint<int64>("out_type") \
+ .HostMemory("output"), \
+ SizeOp<int64>);
+REGISTER_SYCL_KERNEL(float);
+REGISTER_SYCL_KERNEL(double);
+#undef REGISTER_SYCL_KERNEL
+
+// A special GPU kernel for int32.
+// TODO(b/25387198): Also enable int32 in device memory. This kernel
+// registration requires all int32 inputs and outputs to be in host memory.
+REGISTER_KERNEL_BUILDER(Name("Size")
+ .Device(DEVICE_SYCL)
+ .TypeConstraint<int32>("T")
+ .TypeConstraint<int32>("out_type")
+ .HostMemory("input")
+ .HostMemory("output"),
+ SizeOp<int32>);
+REGISTER_KERNEL_BUILDER(Name("Size")
+ .Device(DEVICE_SYCL)
+ .TypeConstraint<int32>("T")
+ .TypeConstraint<int64>("out_type")
+ .HostMemory("input")
+ .HostMemory("output"),
+ SizeOp<int64>);
+#endif // TENSORFLOW_USE_SYCL
+
// ExpandDims ------------------------------------
REGISTER_KERNEL_BUILDER(Name("ExpandDims")
.Device(DEVICE_CPU)
@@ -323,7 +361,30 @@ REGISTER_KERNEL_BUILDER(Name("ExpandDims")
.HostMemory("dim")
.HostMemory("output"),
ExpandDimsOp);
-#endif
+#endif // GOOGLE_CUDA
+
+#ifdef TENSORFLOW_USE_SYCL
+#define REGISTER_SYCL_KERNEL(type) \
+ REGISTER_KERNEL_BUILDER(Name("ExpandDims") \
+ .Device(DEVICE_SYCL) \
+ .TypeConstraint<type>("T") \
+ .TypeConstraint<int32>("Tdim") \
+ .HostMemory("dim"), \
+ ExpandDimsOp);
+REGISTER_SYCL_KERNEL(float)
+REGISTER_SYCL_KERNEL(double)
+
+#undef REGISTER_SYCL_KERNEL
+
+REGISTER_KERNEL_BUILDER(Name("ExpandDims")
+ .Device(DEVICE_SYCL)
+ .TypeConstraint<int32>("T")
+ .TypeConstraint<int32>("Tdim")
+ .HostMemory("input")
+ .HostMemory("dim")
+ .HostMemory("output"),
+ ExpandDimsOp);
+#endif // TENSORFLOW_USE_SYCL
// Squeeze ---------------------------------------
REGISTER_KERNEL_BUILDER(Name("Squeeze").Device(DEVICE_CPU), SqueezeOp);
@@ -347,4 +408,24 @@ REGISTER_KERNEL_BUILDER(Name("Squeeze")
SqueezeOp);
#endif
+#if TENSORFLOW_USE_SYCL
+#define REGISTER_SYCL_KERNEL(type) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("Squeeze").Device(DEVICE_SYCL).TypeConstraint<type>("T"),\
+ SqueezeOp);
+REGISTER_SYCL_KERNEL(float);
+REGISTER_SYCL_KERNEL(double);
+#undef REGISTER_SYCL_KERNEL
+
+// A special GPU kernel for int32.
+// TODO(b/25387198): Also enable int32 in device memory. This kernel
+// registration requires all int32 inputs and outputs to be in host memory.
+REGISTER_KERNEL_BUILDER(Name("Squeeze")
+ .Device(DEVICE_SYCL)
+ .TypeConstraint<int32>("T")
+ .HostMemory("input")
+ .HostMemory("output"),
+ SqueezeOp);
+#endif // TENSORFLOW_USE_SYCL
+
} // namespace tensorflow
diff --git a/tensorflow/core/kernels/softmax_op.cc b/tensorflow/core/kernels/softmax_op.cc
index c7ae93852f..de11de32f1 100644
--- a/tensorflow/core/kernels/softmax_op.cc
+++ b/tensorflow/core/kernels/softmax_op.cc
@@ -28,17 +28,27 @@ namespace tensorflow {
typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;
+#ifdef TENSORFLOW_USE_SYCL
+typedef Eigen::SyclDevice SYCLDevice;
+#endif // TENSORFLOW_USE_SYCL
// Partial specialization for a CPUDevice, that uses the Eigen implementation
// from SoftmaxEigenImpl.
namespace functor {
-template <typename T>
-struct SoftmaxFunctor<CPUDevice, T> {
- void operator()(const CPUDevice& d, typename TTypes<T>::ConstMatrix logits,
+template <typename Device, typename T>
+struct SoftmaxFunctorBase {
+ void operator()(const Device& d, typename TTypes<T>::ConstMatrix logits,
typename TTypes<T>::Matrix softmax, const bool log) {
- SoftmaxEigenImpl<CPUDevice, T>::Compute(d, logits, softmax, log);
+ SoftmaxEigenImpl<Device, T>::Compute(d, logits, softmax, log);
}
};
+template <typename T>
+struct SoftmaxFunctor<CPUDevice, T> : SoftmaxFunctorBase<CPUDevice, T> {};
+
+#ifdef TENSORFLOW_USE_SYCL
+template <typename T>
+struct SoftmaxFunctor<SYCLDevice, T> : SoftmaxFunctorBase<SYCLDevice, T> {};
+#endif // TENSORFLOW_USE_SYCL
} // namespace functor
#define REGISTER_CPU(T) \
@@ -76,4 +86,10 @@ REGISTER_KERNEL_BUILDER(
SoftmaxOp<GPUDevice, float>);
#endif // GOOGLE_CUDA
+#ifdef TENSORFLOW_USE_SYCL
+REGISTER_KERNEL_BUILDER(
+ Name("Softmax").Device(DEVICE_SYCL).TypeConstraint<float>("T"),
+ SoftmaxOp<SYCLDevice, float>);
+#endif // TENSORFLOW_USE_SYCL
+
} // namespace tensorflow
diff --git a/tensorflow/core/kernels/stage_op.cc b/tensorflow/core/kernels/stage_op.cc
index c18b992ea1..161ba89212 100644
--- a/tensorflow/core/kernels/stage_op.cc
+++ b/tensorflow/core/kernels/stage_op.cc
@@ -99,6 +99,9 @@ REGISTER_KERNEL_BUILDER(Name("Stage").Device(DEVICE_CPU), StageOp);
#if GOOGLE_CUDA
REGISTER_KERNEL_BUILDER(Name("Stage").Device(DEVICE_GPU), StageOp);
#endif
+#ifdef TENSORFLOW_USE_SYCL
+REGISTER_KERNEL_BUILDER(Name("Stage").Device(DEVICE_SYCL), StageOp);
+#endif // TENSORFLOW_USE_SYCL
class UnstageOp : public OpKernel {
public:
@@ -126,5 +129,8 @@ REGISTER_KERNEL_BUILDER(Name("Unstage").Device(DEVICE_CPU), UnstageOp);
#if GOOGLE_CUDA
REGISTER_KERNEL_BUILDER(Name("Unstage").Device(DEVICE_GPU), UnstageOp);
#endif
+#ifdef TENSORFLOW_USE_SYCL
+REGISTER_KERNEL_BUILDER(Name("Unstage").Device(DEVICE_SYCL), UnstageOp);
+#endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow
diff --git a/tensorflow/core/kernels/strided_slice_op.cc b/tensorflow/core/kernels/strided_slice_op.cc
index 6580d271d1..10593516f7 100644
--- a/tensorflow/core/kernels/strided_slice_op.cc
+++ b/tensorflow/core/kernels/strided_slice_op.cc
@@ -451,4 +451,71 @@ REGISTER_KERNEL_BUILDER(Name("StridedSliceAssign")
#undef REGISTER_GPU
#endif // GOOGLE_CUDA
+
+#ifdef TENSORFLOW_USE_SYCL
+#define REGISTER_SYCL(type) \
+ REGISTER_KERNEL_BUILDER(Name("StridedSlice") \
+ .Device(DEVICE_SYCL) \
+ .TypeConstraint<type>("T") \
+ .HostMemory("begin") \
+ .HostMemory("end") \
+ .HostMemory("strides") \
+ .TypeConstraint<int32>("Index"), \
+ StridedSliceOp<SYCLDevice, type>) \
+ REGISTER_KERNEL_BUILDER(Name("StridedSliceGrad") \
+ .Device(DEVICE_SYCL) \
+ .TypeConstraint<type>("T") \
+ .HostMemory("shape") \
+ .HostMemory("begin") \
+ .HostMemory("end") \
+ .HostMemory("strides") \
+ .TypeConstraint<int32>("Index"), \
+ StridedSliceGradOp<SYCLDevice, type>)\
+ REGISTER_KERNEL_BUILDER(Name("StridedSliceAssign") \
+ .Device(DEVICE_SYCL) \
+ .TypeConstraint<type>("T") \
+ .HostMemory("begin") \
+ .HostMemory("end") \
+ .HostMemory("strides") \
+ .TypeConstraint<int32>("Index"), \
+ StridedSliceAssignOp<SYCLDevice, type>)
+
+REGISTER_SYCL(float);
+REGISTER_SYCL(double);
+
+// A special GPU kernel for int32.
+// TODO(b/25387198): Also enable int32 in device memory. This kernel
+// registration requires all int32 inputs and outputs to be in host memory.
+REGISTER_KERNEL_BUILDER(Name("StridedSlice")
+ .Device(DEVICE_SYCL)
+ .TypeConstraint<int32>("T")
+ .TypeConstraint<int32>("Index")
+ .HostMemory("input")
+ .HostMemory("begin")
+ .HostMemory("end")
+ .HostMemory("strides")
+ .HostMemory("output"),
+ StridedSliceOp<CPUDevice, int32>);
+REGISTER_KERNEL_BUILDER(Name("StridedSliceGrad")
+ .Device(DEVICE_SYCL)
+ .TypeConstraint<int32>("T")
+ .TypeConstraint<int32>("Index")
+ .HostMemory("shape")
+ .HostMemory("begin")
+ .HostMemory("end")
+ .HostMemory("strides")
+ .HostMemory("dy")
+ .HostMemory("output"),
+ StridedSliceGradOp<CPUDevice, int32>);
+REGISTER_KERNEL_BUILDER(Name("StridedSliceAssign")
+ .Device(DEVICE_SYCL)
+ .TypeConstraint<int32>("T")
+ .TypeConstraint<int32>("Index")
+ .HostMemory("ref")
+ .HostMemory("begin")
+ .HostMemory("end")
+ .HostMemory("strides"),
+ StridedSliceAssignOp<CPUDevice, int32>)
+#undef REGISTER_SYCL
+#endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow
diff --git a/tensorflow/core/kernels/strided_slice_op_impl.h b/tensorflow/core/kernels/strided_slice_op_impl.h
index 6341964b4b..d820db348e 100644
--- a/tensorflow/core/kernels/strided_slice_op_impl.h
+++ b/tensorflow/core/kernels/strided_slice_op_impl.h
@@ -287,6 +287,20 @@ DECLARE_FOR_N_GPU(int32);
TF_CALL_ALL_TYPES(DECLARE_FOR_N_CPU);
DECLARE_FOR_N_CPU(bfloat16);
+#ifdef TENSORFLOW_USE_SYCL
+#define PREVENT_FOR_N_SYCL(T) \
+ PREVENT_INSTANTIATE(T, STRIDED_SLICE_INSTANTIATE_DIM)
+
+#define DECLARE_FOR_N_SYCL(T) \
+ INSTANTIATE(SYCLDevice, T, STRIDED_SLICE_INSTANTIATE_DIM)
+
+TF_CALL_SYCL_PROXY_TYPES(PREVENT_FOR_N_SYCL);
+TF_CALL_GPU_NUMBER_TYPES(DECLARE_FOR_N_SYCL);
+DECLARE_FOR_N_SYCL(int32);
+
+#undef DECLARE_FOR_N_SYCL
+#endif // TENSORFLOW_USE_SYCL
+
#undef INSTANTIATE
#undef DECLARE_FOR_N_CPU
#undef DECLARE_FOR_N_GPU
diff --git a/tensorflow/core/kernels/tile_ops.cc b/tensorflow/core/kernels/tile_ops.cc
index e55c8679e9..9822b021eb 100644
--- a/tensorflow/core/kernels/tile_ops.cc
+++ b/tensorflow/core/kernels/tile_ops.cc
@@ -260,6 +260,8 @@ TF_CALL_complex128(HANDLE_TYPE_NAME_GPU);
#ifdef TENSORFLOW_USE_SYCL
TF_CALL_float(HANDLE_TYPE_NAME_SYCL);
+TF_CALL_double(HANDLE_TYPE_NAME_SYCL);
+TF_CALL_int32(HANDLE_TYPE_NAME_SYCL);
#endif // TENSORFLOW_USE_SYCL
#undef HANDLE_TYPE_NAME_CPU
@@ -506,6 +508,16 @@ TF_CALL_complex64(HANDLE_TYPE_NAME_GPU);
TF_CALL_complex128(HANDLE_TYPE_NAME_GPU);
#endif // GOOGLE_CUDA
+#if TENSORFLOW_USE_SYCL
+#define HANDLE_TYPE_NAME_SYCL(T) \
+ HANDLE_CASE_DIM(SYCLDevice, T, DataTypeToEnum<T>::value);
+
+TF_CALL_float(HANDLE_TYPE_NAME_SYCL);
+TF_CALL_double(HANDLE_TYPE_NAME_SYCL);
+TF_CALL_int32(HANDLE_TYPE_NAME_SYCL);
+#undef HANDLE_TYPE_NAME_SYCL
+#endif // TENSORFLOW_USE_SYCL
+
#undef HANDLE_TYPE_NAME_CPU
#undef HANDLE_TYPE_NAME_GPU
#undef HANDLE_CASE_DIM
@@ -605,6 +617,25 @@ REGISTER_KERNEL_BUILDER(Name("Tile")
.TypeConstraint<int32>("Tmultiples")
.HostMemory("multiples"),
TileOp<SYCLDevice>);
+REGISTER_KERNEL_BUILDER(Name("Tile")
+ .Device(DEVICE_SYCL)
+ .TypeConstraint<double>("T")
+ .TypeConstraint<int32>("Tmultiples")
+ .HostMemory("multiples"),
+ TileOp<SYCLDevice>);
+
+REGISTER_KERNEL_BUILDER(Name("TileGrad")
+ .Device(DEVICE_SYCL)
+ .TypeConstraint<float>("T")
+ .TypeConstraint<int32>("Tmultiples")
+ .HostMemory("multiples"),
+ TileGradientOp<SYCLDevice>);
+REGISTER_KERNEL_BUILDER(Name("TileGrad")
+ .Device(DEVICE_SYCL)
+ .TypeConstraint<double>("T")
+ .TypeConstraint<int32>("Tmultiples")
+ .HostMemory("multiples"),
+ TileGradientOp<SYCLDevice>);
#endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow
diff --git a/tensorflow/core/kernels/tile_ops_cpu_impl.h b/tensorflow/core/kernels/tile_ops_cpu_impl.h
index 650c739ed5..f06cc5514c 100644
--- a/tensorflow/core/kernels/tile_ops_cpu_impl.h
+++ b/tensorflow/core/kernels/tile_ops_cpu_impl.h
@@ -70,6 +70,8 @@ typedef Eigen::SyclDevice SYCLDevice;
#define DEFINE_TYPE(T) DEFINE_DIM(T, CPU_PROVIDED_IXDIM)
TF_CALL_float(DEFINE_TYPE);
+TF_CALL_double(DEFINE_TYPE);
+TF_CALL_int32(DEFINE_TYPE);
#undef DEFINE_DIM
#undef DEFINE_TYPE
@@ -81,6 +83,8 @@ TF_CALL_float(DEFINE_TYPE);
#define DEFINE_TYPE(T) DEFINE_DIM(T, CPU_PROVIDED_IXDIM)
TF_CALL_float(DEFINE_TYPE);
+TF_CALL_double(DEFINE_TYPE);
+TF_CALL_int32(DEFINE_TYPE);
#undef DEFINE_DIM
#undef DEFINE_TYPE
diff --git a/tensorflow/core/kernels/training_ops.cc b/tensorflow/core/kernels/training_ops.cc
index 336c6b0ccc..5c2d371430 100644
--- a/tensorflow/core/kernels/training_ops.cc
+++ b/tensorflow/core/kernels/training_ops.cc
@@ -423,6 +423,7 @@ TF_CALL_double(REGISTER_CPU_KERNELS);
#ifdef TENSORFLOW_USE_SYCL
#define REGISTER_SYCL_KERNELS(T) REGISTER_KERNELS(SYCL, T);
TF_CALL_float(REGISTER_SYCL_KERNELS);
+TF_CALL_double(REGISTER_SYCL_KERNELS);
#undef REGISTER_SYCL_KERNELS
#endif
@@ -2355,6 +2356,7 @@ TF_CALL_double(REGISTER_CPU_KERNELS);
#define REGISTER_SYCL_KERNELS(T) REGISTER_KERNELS(SYCL, T);
TF_CALL_float(REGISTER_SYCL_KERNELS);
+TF_CALL_double(REGISTER_SYCL_KERNELS);
#endif
#if GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/transpose_functor_cpu.cc b/tensorflow/core/kernels/transpose_functor_cpu.cc
index 30b82f1843..3681b9a129 100644
--- a/tensorflow/core/kernels/transpose_functor_cpu.cc
+++ b/tensorflow/core/kernels/transpose_functor_cpu.cc
@@ -127,6 +127,7 @@ Status DoTranspose<SYCLDevice>(const SYCLDevice& d, const Tensor& in,
switch (in.dtype()) {
case DT_FLOAT:
+ case DT_DOUBLE:
case DT_INT32:
internal::Transpose<SYCLDevice, uint32>(d, in, perm, out);
break;
diff --git a/tensorflow/core/kernels/transpose_op.cc b/tensorflow/core/kernels/transpose_op.cc
index 67300c1e96..4d303f0173 100644
--- a/tensorflow/core/kernels/transpose_op.cc
+++ b/tensorflow/core/kernels/transpose_op.cc
@@ -82,6 +82,15 @@ REGISTER_KERNEL_BUILDER(Name("InvertPermutation")
.HostMemory("y"),
InvertPermutationOp);
+#ifdef TENSORFLOW_USE_SYCL
+REGISTER_KERNEL_BUILDER(Name("InvertPermutation")
+ .Device(DEVICE_SYCL)
+ .TypeConstraint<int32>("T")
+ .HostMemory("x")
+ .HostMemory("y"),
+ InvertPermutationOp);
+#endif // TENSORFLOW_USE_SYCL
+
// output = TransposeOp(T<any> input, T<int32> perm) takes a tensor
// of type T and rank N, and a permutation of 0, 1, ..., N-1. It
// shuffles the dimensions of the input tensor according to permutation.
@@ -201,4 +210,24 @@ TF_CALL_POD_TYPES(REGISTER);
#undef REGISTER
#endif
+#ifdef TENSORFLOW_USE_SYCL
+Status TransposeSyclOp::DoTranspose(OpKernelContext* ctx, const Tensor& in,
+ gtl::ArraySlice<int32> perm, Tensor* out) {
+ typedef Eigen::SyclDevice SYCLDevice;
+ return ::tensorflow::DoTranspose(ctx->eigen_device<SYCLDevice>(), in, perm,
+ out);
+}
+#define REGISTER(T) \
+ REGISTER_KERNEL_BUILDER(Name("Transpose") \
+ .Device(DEVICE_SYCL) \
+ .TypeConstraint<T>("T") \
+ .TypeConstraint<int32>("Tperm") \
+ .HostMemory("perm"), \
+ TransposeSyclOp);
+REGISTER(float);
+REGISTER(bool);
+REGISTER(int32);
+#undef REGISTER
+#endif
+
} // namespace tensorflow
diff --git a/tensorflow/core/kernels/transpose_op.h b/tensorflow/core/kernels/transpose_op.h
index 3b209c0ccc..5f40bcecc1 100644
--- a/tensorflow/core/kernels/transpose_op.h
+++ b/tensorflow/core/kernels/transpose_op.h
@@ -50,6 +50,17 @@ class TransposeGpuOp : public TransposeOp {
gtl::ArraySlice<int32> perm, Tensor* out) override;
};
+#ifdef TENSORFLOW_USE_SYCL
+class TransposeSyclOp : public TransposeOp {
+ public:
+ explicit TransposeSyclOp(OpKernelConstruction* ctx) : TransposeOp(ctx) {}
+
+ protected:
+ Status DoTranspose(OpKernelContext* ctx, const Tensor& in,
+ gtl::ArraySlice<int32> perm, Tensor* out) override;
+};
+#endif // TENSORFLOW_USE_SYCL
+
} // namespace tensorflow
#endif // TENSORFLOW_KERNELS_TRANSPOSE_OP_H_
diff --git a/tensorflow/core/kernels/unpack_op.cc b/tensorflow/core/kernels/unpack_op.cc
index 2a14fa3265..e4c79ae17b 100644
--- a/tensorflow/core/kernels/unpack_op.cc
+++ b/tensorflow/core/kernels/unpack_op.cc
@@ -160,6 +160,7 @@ REGISTER_KERNEL_BUILDER(Name("Unpack")
UnpackOp<SYCLDevice, type>)
REGISTER_SYCL(float);
+REGISTER_SYCL(double);
#undef REGISTER_SYCL
// A special SYCL kernel for int32.
diff --git a/tensorflow/core/kernels/variable_ops.cc b/tensorflow/core/kernels/variable_ops.cc
index 34e227156d..7a4d9dc650 100644
--- a/tensorflow/core/kernels/variable_ops.cc
+++ b/tensorflow/core/kernels/variable_ops.cc
@@ -58,8 +58,9 @@ REGISTER_KERNEL_BUILDER(Name("IsVariableInitialized").Device(DEVICE_CPU),
IsVariableInitializedOp);
REGISTER_SYCL_KERNEL(float);
+REGISTER_SYCL_KERNEL(double);
#undef REGISTER_SYCL_KERNEL
-#endif
+#endif // TENSORFLOW_USE_SYCL
#if GOOGLE_CUDA
// Only register 'Variable' on GPU for the subset of types also supported by
diff --git a/tensorflow/core/kernels/xent_op.cc b/tensorflow/core/kernels/xent_op.cc
index 26f4fb2a2e..dc21cee3a8 100644
--- a/tensorflow/core/kernels/xent_op.cc
+++ b/tensorflow/core/kernels/xent_op.cc
@@ -28,6 +28,9 @@ namespace tensorflow {
typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;
+#ifdef TENSORFLOW_USE_SYCL
+typedef Eigen::SyclDevice SYCLDevice;
+#endif // TENSORFLOW_USE_SYCL
template <typename Device, typename T>
class SoftmaxXentWithLogitsOp : public OpKernel {
@@ -74,17 +77,25 @@ class SoftmaxXentWithLogitsOp : public OpKernel {
// Partial specialization for a CPUDevice, that uses the Eigen implementation
// from XentEigenImpl.
namespace functor {
-template <typename T>
-struct XentFunctor<CPUDevice, T> {
- void operator()(const CPUDevice& d, typename TTypes<T>::ConstMatrix logits,
+template <typename Device, typename T>
+struct XentFunctorBase {
+ void operator()(const Device& d, typename TTypes<T>::ConstMatrix logits,
typename TTypes<T>::ConstMatrix labels,
typename TTypes<T>::Matrix scratch,
typename TTypes<T>::Vec loss,
typename TTypes<T>::Matrix backprop) {
- XentEigenImpl<CPUDevice, T>::Compute(d, logits, labels, scratch, loss,
+ XentEigenImpl<Device, T>::Compute(d, logits, labels, scratch, loss,
backprop);
}
};
+
+template <typename T>
+struct XentFunctor<CPUDevice, T> : XentFunctorBase<CPUDevice, T> {};
+
+#ifdef TENSORFLOW_USE_SYCL
+template <typename T>
+struct XentFunctor<SYCLDevice, T> : XentFunctorBase<SYCLDevice, T> {};
+#endif // TENSORFLOW_USE_SYCL
} // namespace functor
#define REGISTER_CPU(T) \
@@ -111,4 +122,11 @@ REGISTER_KERNEL_BUILDER(Name("SoftmaxCrossEntropyWithLogits")
SoftmaxXentWithLogitsOp<GPUDevice, double>);
#endif // GOOGLE_CUDA
+#ifdef TENSORFLOW_USE_SYCL
+REGISTER_KERNEL_BUILDER(Name("SoftmaxCrossEntropyWithLogits")
+ .Device(DEVICE_SYCL)
+ .TypeConstraint<float>("T"),
+ SoftmaxXentWithLogitsOp<SYCLDevice, float>);
+#endif // TENSORFLOW_USE_SYCL
+
} // namespace tensorflow
diff --git a/tensorflow/core/ops/math_grad_test.cc b/tensorflow/core/ops/math_grad_test.cc
index 2def59ff04..8670ca307c 100644
--- a/tensorflow/core/ops/math_grad_test.cc
+++ b/tensorflow/core/ops/math_grad_test.cc
@@ -390,7 +390,7 @@ class TestOp : public OpKernel {
REGISTER_KERNEL_BUILDER(Name("TestOpWithNoGrad").Device(DEVICE_CPU), TestOp);
#ifdef TENSORFLOW_USE_SYCL
REGISTER_KERNEL_BUILDER(Name("TestOpWithNoGrad").Device(DEVICE_SYCL), TestOp);
-#endif // TENSORFLOW_USE_SYCL
+#endif // TENSORFLOW_USE_SYCL
TEST_F(MathGradTest, Error_Reporting) {
auto x = test::AsTensor<float>({-3.f});
@@ -707,6 +707,8 @@ TEST_F(MathGradTest, Pow) {
}
}
+//TODO{lukeiwanski}: Implement Complex Pow for SYCL
+#ifndef TENSORFLOW_USE_SYCL
TEST_F(MathGradTest, ComplexPow) {
auto x = test::AsTensor<complex64>({0.f, 2.f, -2.f}, TensorShape({3}));
auto y = test::AsTensor<complex64>({2.f, 2.f, 2.f}, TensorShape({3}));
@@ -725,6 +727,7 @@ TEST_F(MathGradTest, ComplexPow) {
dy, test::AsTensor<complex64>({h(0.f, 2.f), h(2.f, 2.f), h(-2.f, 2.f)},
TensorShape({3})));
}
+#endif // TENSORFLOW_USE_SYCL
TEST_F(MathGradTest, Maximum) {
auto x = test::AsTensor<float>({-3.f, -2.f, -1.f, 1.f, 2.f, 3.f},
@@ -886,6 +889,8 @@ TEST_F(MathGradTest, MatMul_11) {
test::ExpectClose(dy, MatMul(dz, true, x, true));
}
+//TODO{lukeiwanski}: Implement BatchMatMul for SYCL
+#ifndef TENSORFLOW_USE_SYCL
TEST_F(MathGradTest, BatchMatMul_00) {
auto x = test::AsTensor<float>({1.f, 2.f, 3.f, 4.f, 5.f, 6.f},
TensorShape({1, 2, 3}));
@@ -933,6 +938,7 @@ TEST_F(MathGradTest, BatchMatMul_11) {
test::ExpectClose(dx, BatchMatMul(y, true, dz, true));
test::ExpectClose(dy, BatchMatMul(dz, true, x, true));
}
+#endif // TENSORFLOW_USE_SYCL
TEST_F(MathGradTest, Sum_dim0) {
auto x = test::AsTensor<float>({-3.f, -2.f, -1.f, 1.f, 2.f, 3.f},
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index c9d648f8b9..9180170e20 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -25367,6 +25367,59 @@ op {
description: "Read [the section on\nSegmentation](../../api_docs/python/math_ops.md#segmentation) for an explanation\nof segments.\n\nComputes a tensor such that\n`(output[i] = sum_{j...} data[j...]` where the sum is over tuples `j...` such\nthat `segment_ids[j...] == i`. Unlike `SegmentSum`, `segment_ids`\nneed not be sorted and need not cover all values in the full\nrange of valid values.\n\nIf the sum is empty for a given segment ID `i`, `output[i] = 0`.\n\n`num_segments` should equal the number of distinct segment IDs.\n\n<div style=\"width:70%; margin:auto; margin-bottom:10px; margin-top:20px;\">\n<img style=\"width:100%\" src=\"../../images/UnsortedSegmentSum.png\" alt>\n</div>"
}
op {
+ name: "UnsortedSegmentSum"
+ input_arg {
+ name: "data"
+ type_attr: "T"
+ }
+ input_arg {
+ name: "segment_ids"
+ description: "A tensor whose shape is a prefix of `data.shape`."
+ type_attr: "Tindices"
+ }
+ input_arg {
+ name: "num_segments"
+ type: DT_INT32
+ }
+ output_arg {
+ name: "output"
+ description: "Has same shape as data, except for the first `segment_ids.rank`\ndimensions, which are replaced with a single dimension which has size\n`num_segments`."
+ type_attr: "T"
+ }
+ attr {
+ name: "T"
+ type: "type"
+ allowed_values {
+ list {
+ type: DT_FLOAT
+ type: DT_DOUBLE
+ type: DT_INT64
+ type: DT_INT32
+ type: DT_UINT8
+ type: DT_UINT16
+ type: DT_INT16
+ type: DT_INT8
+ type: DT_QINT8
+ type: DT_QUINT8
+ type: DT_QINT32
+ type: DT_HALF
+ }
+ }
+ }
+ attr {
+ name: "Tindices"
+ type: "type"
+ allowed_values {
+ list {
+ type: DT_INT32
+ type: DT_INT64
+ }
+ }
+ }
+ summary: "Computes the max along segments of a tensor."
+ description: "Read [the section on\nSegmentation](../../api_docs/python/math_ops.md#segmentation) for an explanation\nof segments.\n\nComputes a tensor such that\n\\\\(output_i = \\sum_j data_j\\\\) where sum is over `j` such\nthat `segment_ids[j] == i`. Unlike `SegmentSum`, `segment_ids`\nneed not be sorted and need not cover all values in the full\n range of valid values.\n\nIf the sum is empty for a given segment ID `i`, `output[i] = 0`.\n\n`num_segments` should equal the number of distinct segment IDs.\n\n<div style=\"width:70%; margin:auto; margin-bottom:10px; margin-top:20px;\">\n<img style=\"width:100%\" src=\"../../images/UnsortedSegmentSum.png\" alt>\n</div>"
+}
+op {
name: "Unstage"
output_arg {
name: "values"
diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h
index cd0a9436d9..81d49684a8 100644
--- a/tensorflow/core/public/version.h
+++ b/tensorflow/core/public/version.h
@@ -20,7 +20,7 @@ limitations under the License.
#define TF_MAJOR_VERSION 1
#define TF_MINOR_VERSION 0
-#define TF_PATCH_VERSION 0-rc2
+#define TF_PATCH_VERSION 0
// TF_VERSION_SUFFIX is non-empty for pre-releases (e.g. "-alpha", "-alpha.1",
// "-beta", "-rc", "-rc.1")
diff --git a/tensorflow/docs_src/programmers_guide/meta_graph.md b/tensorflow/docs_src/programmers_guide/meta_graph.md
index add4379d7d..fa4cee8700 100644
--- a/tensorflow/docs_src/programmers_guide/meta_graph.md
+++ b/tensorflow/docs_src/programmers_guide/meta_graph.md
@@ -277,7 +277,7 @@ Here are some of the typical usage models:
* Retrieve Hyper Parameters
```Python
- filename = ".".join([tf.latest_checkpoint(train_dir), "meta"])
+ filename = ".".join([tf.train.latest_checkpoint(train_dir), "meta"])
tf.train.import_meta_graph(filename)
hparams = tf.get_collection("hparams")
```
diff --git a/tensorflow/examples/image_retraining/retrain.py b/tensorflow/examples/image_retraining/retrain.py
index e612eb7424..575dea5584 100644
--- a/tensorflow/examples/image_retraining/retrain.py
+++ b/tensorflow/examples/image_retraining/retrain.py
@@ -893,7 +893,8 @@ def main(_):
print('=== MISCLASSIFIED TEST IMAGES ===')
for i, test_filename in enumerate(test_filenames):
if predictions[i] != test_ground_truth[i].argmax():
- print('%70s %s' % (test_filename, image_lists.keys()[predictions[i]]))
+ print('%70s %s' % (test_filename,
+ list(image_lists.keys())[predictions[i]]))
# Write out the trained graph and labels with the weights stored as constants.
output_graph_def = graph_util.convert_variables_to_constants(
diff --git a/tensorflow/examples/learn/mnist.py b/tensorflow/examples/learn/mnist.py
index 6e5fe7891b..15cf4b91dd 100644
--- a/tensorflow/examples/learn/mnist.py
+++ b/tensorflow/examples/learn/mnist.py
@@ -46,13 +46,13 @@ def conv_model(feature, target, mode):
# First conv layer will compute 32 features for each 5x5 patch
with tf.variable_scope('conv_layer1'):
- h_conv1 = layers.convolution(
+ h_conv1 = layers.convolution2d(
feature, 32, kernel_size=[5, 5], activation_fn=tf.nn.relu)
h_pool1 = max_pool_2x2(h_conv1)
# Second conv layer will compute 64 features for each 5x5 patch.
with tf.variable_scope('conv_layer2'):
- h_conv2 = layers.convolution(
+ h_conv2 = layers.convolution2d(
h_pool1, 64, kernel_size=[5, 5], activation_fn=tf.nn.relu)
h_pool2 = max_pool_2x2(h_conv2)
# reshape tensor into a batch of vectors
diff --git a/tensorflow/examples/learn/text_classification.py b/tensorflow/examples/learn/text_classification.py
index a3a5f9e3e9..c3d00a11b9 100644
--- a/tensorflow/examples/learn/text_classification.py
+++ b/tensorflow/examples/learn/text_classification.py
@@ -104,8 +104,13 @@ def main(unused_argv):
# Process vocabulary
vocab_processor = learn.preprocessing.VocabularyProcessor(MAX_DOCUMENT_LENGTH)
- x_train = np.array(list(vocab_processor.fit_transform(x_train)))
- x_test = np.array(list(vocab_processor.transform(x_test)))
+
+ x_transform_train = vocab_processor.fit_transform(x_train)
+ x_transform_test = vocab_processor.transform(x_test)
+
+ x_train = np.array(list(x_transform_train))
+ x_test = np.array(list(x_transform_test))
+
n_words = len(vocab_processor.vocabulary_)
print('Total words: %d' % n_words)
diff --git a/tensorflow/examples/tutorials/word2vec/word2vec_basic.py b/tensorflow/examples/tutorials/word2vec/word2vec_basic.py
index 8dcd3bf37a..25800c109e 100644
--- a/tensorflow/examples/tutorials/word2vec/word2vec_basic.py
+++ b/tensorflow/examples/tutorials/word2vec/word2vec_basic.py
@@ -181,7 +181,7 @@ with graph.as_default():
valid_embeddings, normalized_embeddings, transpose_b=True)
# Add variable initializer.
- init = tf.global_variables_initializer()
+ init = tf.initialize_all_variables()
# Step 5: Begin training.
num_steps = 100001
diff --git a/tensorflow/examples/udacity/README.md b/tensorflow/examples/udacity/README.md
index 143a75a3e9..6faad294c2 100644
--- a/tensorflow/examples/udacity/README.md
+++ b/tensorflow/examples/udacity/README.md
@@ -6,7 +6,7 @@ Course information can be found at https://www.udacity.com/course/deep-learning-
Running the Docker container from the Google Cloud repository
-------------------------------------------------------------
- docker run -p 8888:8888 --name tensorflow-udacity -it gcr.io/tensorflow/udacity-assignments:0.6.0
+ docker run -p 8888:8888 --name tensorflow-udacity -it gcr.io/tensorflow/udacity-assignments:1.0.0
Note that if you ever exit the container, you can return to it using:
@@ -94,10 +94,10 @@ This will allow you to save work and have access to generated files on the host
Pushing a Google Cloud release
------------------------------
- V=0.6.0
+ V=1.0.0
docker tag $USER/assignments gcr.io/tensorflow/udacity-assignments:$V
gcloud docker push gcr.io/tensorflow/udacity-assignments
- docker tag -f $USER/assignments gcr.io/tensorflow/udacity-assignments:latest
+ docker tag $USER/assignments gcr.io/tensorflow/udacity-assignments:latest
gcloud docker push gcr.io/tensorflow/udacity-assignments
History
@@ -109,3 +109,4 @@ History
* 0.4.0: Move notMMNIST data for Google Cloud.
* 0.5.0: Actually use 0.7.1 release.
* 0.6.0: Update to TF 0.10.0, add libjpeg (for Pillow).
+* 1.0.0: Update to TF 1.0.0 release.
diff --git a/tensorflow/python/client/device_lib_test.py b/tensorflow/python/client/device_lib_test.py
index 561ce09099..7bba10efac 100644
--- a/tensorflow/python/client/device_lib_test.py
+++ b/tensorflow/python/client/device_lib_test.py
@@ -34,7 +34,7 @@ class DeviceLibTest(test_util.TensorFlowTestCase):
# GPU test
if test.is_gpu_available():
self.assertGreater(len(devices), 1)
- self.assertTrue("GPU" in [d.device_type for d in devices])
+ self.assertTrue("GPU" in [d.device_type for d in devices] or "SYCL" in [d.device_type for d in devices])
if __name__ == "__main__":
diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py
index f2fd687adf..3ea7e547ee 100644
--- a/tensorflow/python/framework/test_util.py
+++ b/tensorflow/python/framework/test_util.py
@@ -44,7 +44,14 @@ from tensorflow.python.platform import googletest
from tensorflow.python.platform import tf_logging as logging
from tensorflow.python.util import compat
from tensorflow.python.util.protobuf import compare
+from tensorflow.python.client import device_lib
+def gpu_device_name():
+ """Returns the name of a GPU device if available or the empty string."""
+ for x in device_lib.list_local_devices():
+ if x.device_type == 'GPU' or x.device_type == 'SYCL':
+ return x.name
+ return ''
def assert_ops_in_graph(expected_ops, graph):
"""Assert all expected operations are found.
@@ -301,7 +308,12 @@ class TensorFlowTestCase(googletest.TestCase):
sess = self._cached_session
with sess.graph.as_default(), sess.as_default():
if force_gpu:
- with sess.graph.device("/gpu:0"):
+ # Use the name of an actual device if one is detected, or '/gpu:0'
+ # otherwise
+ gpu_name = gpu_device_name()
+ if len(gpu_name) == 0:
+ gpu_name = '/gpu:0'
+ with sess.graph.device(gpu_name):
yield sess
elif use_gpu:
yield sess
@@ -311,7 +323,12 @@ class TensorFlowTestCase(googletest.TestCase):
else:
with session.Session(graph=graph, config=prepare_config(config)) as sess:
if force_gpu:
- with sess.graph.device("/gpu:0"):
+ # Use the name of an actual device if one is detected, or '/gpu:0'
+ # otherwise
+ gpu_name = gpu_device_name()
+ if len(gpu_name) == 0:
+ gpu_name = '/gpu:0'
+ with sess.graph.device(gpu_name):
yield sess
elif use_gpu:
yield sess
diff --git a/tensorflow/python/kernel_tests/stage_op_test.py b/tensorflow/python/kernel_tests/stage_op_test.py
index ae26dcb5a9..81eee48d2e 100644
--- a/tensorflow/python/kernel_tests/stage_op_test.py
+++ b/tensorflow/python/kernel_tests/stage_op_test.py
@@ -46,7 +46,7 @@ class StageTest(test.TestCase):
with ops.device('/cpu:0'):
x = array_ops.placeholder(dtypes.float32)
v = 2. * (array_ops.zeros([128, 128]) + x)
- with ops.device('/gpu:0'):
+ with ops.device(test.gpu_device_name()):
stager = data_flow_ops.StagingArea([dtypes.float32, dtypes.float32])
stage = stager.put([x, v])
z, y = stager.get()
@@ -62,7 +62,7 @@ class StageTest(test.TestCase):
with ops.device('/cpu:0'):
x = array_ops.placeholder(dtypes.float32)
v = 2. * (array_ops.zeros([128, 128]) + x)
- with ops.device('/gpu:0'):
+ with ops.device(test.gpu_device_name()):
stager = data_flow_ops.StagingArea(
[dtypes.float32, dtypes.float32],
shapes=[[], [128, 128]],
diff --git a/tensorflow/python/kernel_tests/variables_test.py b/tensorflow/python/kernel_tests/variables_test.py
index 11b350a99e..48be8cff4f 100644
--- a/tensorflow/python/kernel_tests/variables_test.py
+++ b/tensorflow/python/kernel_tests/variables_test.py
@@ -46,11 +46,13 @@ class VariablesTestCase(test.TestCase):
self.assertEqual("Variable:0", var0.name)
self.assertEqual([], var0.get_shape())
self.assertEqual([], var0.get_shape())
+ self.assertEqual([], var0.shape)
var1 = variables.Variable(1.1)
self.assertEqual("Variable_1:0", var1.name)
self.assertEqual([], var1.get_shape())
self.assertEqual([], var1.get_shape())
+ self.assertEqual([], var1.shape)
with self.assertRaisesOpError("Attempting to use uninitialized value"):
var0.eval()
@@ -69,11 +71,13 @@ class VariablesTestCase(test.TestCase):
self.assertEqual("rnd:0", rnd.name)
self.assertEqual([3, 6], rnd.get_shape())
self.assertEqual([3, 6], rnd.get_shape())
+ self.assertEqual([3, 6], rnd.shape)
dep = variables.Variable(rnd.initialized_value(), name="dep")
self.assertEqual("dep:0", dep.name)
self.assertEqual([3, 6], dep.get_shape())
self.assertEqual([3, 6], dep.get_shape())
+ self.assertEqual([3, 6], dep.shape)
# Currently have to set the shape manually for Add.
added_val = rnd.initialized_value() + dep.initialized_value() + 2.0
@@ -83,6 +87,7 @@ class VariablesTestCase(test.TestCase):
self.assertEqual("depdep:0", depdep.name)
self.assertEqual([3, 6], depdep.get_shape())
self.assertEqual([3, 6], depdep.get_shape())
+ self.assertEqual([3, 6], depdep.shape)
variables.global_variables_initializer().run()
@@ -375,6 +380,7 @@ class VariablesTestCase(test.TestCase):
v1 = variables.Variable(initializer, dtype=dtypes.float32)
self.assertEqual(shape, v1.get_shape())
+ self.assertEqual(shape, v1.shape)
self.assertAllClose(value, v1.initial_value.eval())
with self.assertRaises(errors_impl.FailedPreconditionError):
v1.eval()
@@ -382,6 +388,7 @@ class VariablesTestCase(test.TestCase):
v2 = variables.Variable(
math_ops.negative(v1.initialized_value()), dtype=dtypes.float32)
self.assertEqual(v1.get_shape(), v2.get_shape())
+ self.assertEqual(v1.shape, v2.shape)
self.assertAllClose(np.negative(value), v2.initial_value.eval())
# Once v2.initial_value.eval() has been called, v1 has effectively been
@@ -532,6 +539,7 @@ class PartitionedVariableTest(test.TestCase):
self.assertEqual(2, num_partitions)
self.assertEqual([v0, v1], iterated_partitions)
self.assertEqual([2], concatenated.get_shape())
+ self.assertEqual([2], concatenated.shape)
def testPartitionedVariableFailures(self):
with ops.Graph().as_default():
diff --git a/tensorflow/python/kernel_tests/xent_op_test.py b/tensorflow/python/kernel_tests/xent_op_test.py
index d037ceac61..4b3dadc112 100644
--- a/tensorflow/python/kernel_tests/xent_op_test.py
+++ b/tensorflow/python/kernel_tests/xent_op_test.py
@@ -157,7 +157,7 @@ class XentTest(test.TestCase):
np.array([[0., 0., 0., 1.], [0., .5, .5, 0.]]).astype(np.float64))
def testGradient(self):
- with self.test_session():
+ with self.test_session() as sess:
l = constant_op.constant(
[0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.5, 0.0, 0.5],
shape=[3, 4],
@@ -171,14 +171,21 @@ class XentTest(test.TestCase):
x = nn_ops.softmax_cross_entropy_with_logits(labels=l, logits=f,
name="xent")
err = gradient_checker.compute_gradient_error(f, [3, 4], x, [3])
+
+ # Check that no extra computation performed. When only first derivative is requested,
+ # second derivative must not be computed. So when there is no second derivative,
+ # there is no `BatchMatMul` op in the graph.
+ op_names = [op.op_def.name for op in sess.graph.get_operations() if op.op_def]
+ self.assertNotIn('BatchMatMul', op_names)
+
print("cross entropy gradient err = ", err)
self.assertLess(err, 5e-8)
def testSecondGradient(self):
- with self.test_session():
- l = constant_op.constant([0.0, 0.0, 1.0, 0.0,
- 1.0, 0.0, 0.0, 0.0,
- 0.0, 0.5, 0.0, 0.5], shape=[12],
+ with self.test_session() as sess:
+ l = constant_op.constant([0.0, 0.0, 1.0/3, 0.0,
+ 1.0/3, 0.0, 0.0, 0.0,
+ 0.0, 0.5/3, 0.0, 0.5/3], shape=[12],
dtype=dtypes.float64, name="l")
f = constant_op.constant([0.1, 0.2, 0.3, 0.4,
0.1, 0.4, 0.9, 1.6,
@@ -186,13 +193,19 @@ class XentTest(test.TestCase):
dtype=dtypes.float64, name="f")
x = nn_ops.softmax_cross_entropy_with_logits(labels=l, logits=f,
name="xent")
- loss = math_ops.reduce_mean(x)
+ loss = math_ops.reduce_sum(x)
- # Taking ths second gradient should fail, since it is not
- # yet supported.
- with self.assertRaisesRegexp(LookupError,
- "explicitly disabled"):
- _ = gradients_impl.hessians(loss, [f])
+ gradients = gradients_impl.gradients(loss, [f])[0]
+
+ err = gradient_checker.compute_gradient_error(f, [12], gradients, [12])
+
+ # Check that second derivative is calculated.
+ # (it is equivalent to being `BatchMatMul` op in the graph because of implementation of xentropy grad)
+ op_names = [op.op_def.name for op in sess.graph.get_operations() if op.op_def]
+ self.assertIn('BatchMatMul', op_names)
+
+ print("cross entropy hessian err = ", err)
+ self.assertLess(err, 5e-8)
def testWrapper(self):
features = np.array(
diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py
index c64813962a..34380951af 100644
--- a/tensorflow/python/ops/array_ops.py
+++ b/tensorflow/python/ops/array_ops.py
@@ -586,7 +586,7 @@ def strided_slice(input_,
`foo[::-1]` reverses a tensor with shape 8.
- If the ith bit of `ellipsis_mask`, as many unspecified dimensions
+ If the ith bit of `ellipsis_mask` is non-zero, as many unspecified dimensions
as needed will be inserted between other dimensions. Only one
non-zero bit is allowed in `ellipsis_mask`.
@@ -594,7 +594,7 @@ def strided_slice(input_,
equivalent to `foo[3:5,:,:,4:5]` and
`foo[3:5,...]` is equivalent to `foo[3:5,:,:,:]`.
- If the ith bit of `new_axis_mask` is one, then a `begin`,
+ If the ith bit of `new_axis_mask` is one, then `begin`,
`end`, and `stride` are ignored and a new length 1 dimension is
added at this point in the output tensor.
@@ -1545,7 +1545,7 @@ def sparse_placeholder(dtype, shape=None, name=None):
x: (indices, values, shape)})) # Will succeed.
sp = tf.SparseTensor(indices=indices, values=values, dense_shape=shape)
- sp_value = sp.eval(session)
+ sp_value = sp.eval(session=sess)
print(sess.run(y, feed_dict={x: sp_value})) # Will succeed.
```
diff --git a/tensorflow/python/ops/clip_ops.py b/tensorflow/python/ops/clip_ops.py
index bda7212c8a..3dc0ac34c8 100644
--- a/tensorflow/python/ops/clip_ops.py
+++ b/tensorflow/python/ops/clip_ops.py
@@ -41,8 +41,10 @@ def clip_by_value(t, clip_value_min, clip_value_max,
Args:
t: A `Tensor`.
- clip_value_min: A 0-D (scalar) `Tensor`. The minimum value to clip by.
- clip_value_max: A 0-D (scalar) `Tensor`. The maximum value to clip by.
+ clip_value_min: A 0-D (scalar) `Tensor`, or a `Tensor` with the same shape
+ as `t`. The minimum value to clip by.
+ clip_value_max: A 0-D (scalar) `Tensor`, or a `Tensor` with the same shape
+ as `t`. The maximum value to clip by.
name: A name for the operation (optional).
Returns:
diff --git a/tensorflow/python/ops/nn_grad.py b/tensorflow/python/ops/nn_grad.py
index 397c522dbe..41e8877d72 100644
--- a/tensorflow/python/ops/nn_grad.py
+++ b/tensorflow/python/ops/nn_grad.py
@@ -321,21 +321,25 @@ def _BroadcastMul(vec, mat):
@ops.RegisterGradient("SoftmaxCrossEntropyWithLogits")
-def _SoftmaxCrossEntropyWithLogitsGrad(op, grad_0, _):
+def _SoftmaxCrossEntropyWithLogitsGrad(op, grad_loss, grad_grad):
"""Gradient function for SoftmaxCrossEntropyWithLogits."""
- # grad_0 is the backprop for cost, and we multiply it with the gradients
+ # grad_loss is the backprop for cost, and we multiply it with the gradients
# (which is output[1])
+ # grad_grad is the backprop for softmax gradient.
# There is no gradient for the labels
#
- # Currently there is no way to take the second derivative of this op
- # due to the fused implementation's interaction with tf.gradients(),
- # so we make sure we prevent silently incorrect results by raising
- # an error if the second derivative is requested via prevent_gradient.
- softmax_grad_without_gradient = array_ops.prevent_gradient(
- op.outputs[1], message="Currently there is no way to take the second "
- "derivative of softmax_cross_entropy_with_logits due to the fused "
- " implementation's interaction with tf.gradients()")
- return _BroadcastMul(grad_0, softmax_grad_without_gradient), None
+ # Second derivative is just softmax derivative w.r.t. logits.
+ softmax_grad = op.outputs[1]
+ grad = _BroadcastMul(grad_loss, softmax_grad)
+
+ if grad_grad.op.type not in ('ZerosLike', 'Zeros'):
+ logits = op.inputs[0]
+ softmax = nn_ops.softmax(logits)
+
+ grad += ((grad_grad - array_ops.squeeze(math_ops.matmul(grad_grad[:, None, :],
+ softmax[:, :, None]), axis=1)) * softmax)
+
+ return grad, None
@ops.RegisterGradient("SparseSoftmaxCrossEntropyWithLogits")
diff --git a/tensorflow/python/ops/rnn.py b/tensorflow/python/ops/rnn.py
index 06ae3589a2..658c9f79ee 100644
--- a/tensorflow/python/ops/rnn.py
+++ b/tensorflow/python/ops/rnn.py
@@ -515,7 +515,7 @@ def dynamic_rnn(cell, inputs, sequence_length=None, initial_state=None,
state = initial_state
else:
if not dtype:
- raise ValueError("If no initial_state is provided, dtype must be.")
+ raise ValueError("If there is no initial_state, you must give a dtype.")
state = cell.zero_state(batch_size, dtype)
def _assert_has_shape(x, shape):
diff --git a/tensorflow/python/ops/special_math_ops.py b/tensorflow/python/ops/special_math_ops.py
index bf4d198209..5a8eb432d1 100644
--- a/tensorflow/python/ops/special_math_ops.py
+++ b/tensorflow/python/ops/special_math_ops.py
@@ -318,44 +318,28 @@ def _einsum_reduction(t0, t0_axis_labels, t1, t1_axis_labels, axes_to_sum):
# into a single axis, and combine multiple summed axes into a
# single axis.
- t0_shape = tuple(x.value for x in t0.get_shape())
+ t0_shape = _get_shape(t0)
num_broadcast_elements_t0 = _total_size(
t0_shape[len(preserved_axes):-len(axes_to_sum)])
num_summed_elements = _total_size(t0_shape[-len(axes_to_sum):])
- new_shape = t0_shape[:len(preserved_axes)] + (num_broadcast_elements_t0,
- num_summed_elements)
+ new_shape = (t0_shape[:len(preserved_axes)]
+ + [num_broadcast_elements_t0, num_summed_elements])
t0 = _reshape_if_necessary(t0, new_shape)
- t1_shape = tuple(x.value for x in t1.get_shape())
+ t1_shape = _get_shape(t1)
num_broadcast_elements_t1 = _total_size(
t1_shape[len(preserved_axes)+len(axes_to_sum):])
- new_shape = t1_shape[:len(preserved_axes)] + (num_summed_elements,
- num_broadcast_elements_t1)
+ new_shape = (t1_shape[:len(preserved_axes)]
+ + [num_summed_elements, num_broadcast_elements_t1])
t1 = _reshape_if_necessary(t1, new_shape)
product = math_ops.matmul(t0, t1)
# Undo compaction of broadcast axes
uncompacted_shape = (
- t0_shape[:len(preserved_axes)+len(broadcast_axes[0])] +
- t1_shape[len(t1_shape)-len(broadcast_axes[1]):]
+ t0_shape[:len(preserved_axes)+len(broadcast_axes[0])]
+ + t1_shape[len(t1_shape)-len(broadcast_axes[1]):]
)
-
- # Check the number of None values and replace them with Tensors containing
- # corresponding dimensions if there exist two or more None values
- num_none_dims = sum(1 for d in uncompacted_shape if d is None)
- if num_none_dims > 1:
- uncompacted_shape = list(uncompacted_shape)
- for i in xrange(len(uncompacted_shape)):
- if uncompacted_shape[i] is None:
- if i < len(preserved_axes) + len(broadcast_axes[0]):
- uncompacted_shape[i] = array_ops.shape(inputs[0])[i]
- else:
- idx = (i - len(preserved_axes) - len(broadcast_axes[0])
- + len(t1_shape) - len(broadcast_axes[1]))
- uncompacted_shape[i] = array_ops.shape(inputs[1])[idx]
- uncompacted_shape = tuple(uncompacted_shape)
-
product = _reshape_if_necessary(product, uncompacted_shape)
product_axes = (
@@ -386,13 +370,27 @@ def _reshape_if_necessary(tensor, new_shape):
return array_ops.reshape(tensor, new_shape)
+def _get_shape(tensor):
+ """Like get_shape().as_list(), but explicitly queries the shape of a tensor
+ if necessary to ensure that the returned value contains no unknown value."""
+
+ shape = tensor.get_shape().as_list()
+ none_indices = [i for i, d in enumerate(shape) if d is None]
+ if none_indices:
+ # Query the shape if shape contains None values
+ shape_tensor = array_ops.shape(tensor)
+ for i in none_indices:
+ shape[i] = shape_tensor[i]
+ return shape
+
def _total_size(shape_values):
- """Given list of tensor shape values, returns total size or -1 if unknown."""
+ """Given list of tensor shape values, returns total size.
+ If shape_values contains tensor values (which are results of
+ array_ops.shape), then it returns a scalar tensor.
+ If not, it returns an integer."""
+
result = 1
for val in shape_values:
- if val is None:
- return -1
- assert isinstance(val, int)
result *= val
return result
diff --git a/tensorflow/python/ops/special_math_ops_test.py b/tensorflow/python/ops/special_math_ops_test.py
index 3d289bcc9a..c792d32277 100644
--- a/tensorflow/python/ops/special_math_ops_test.py
+++ b/tensorflow/python/ops/special_math_ops_test.py
@@ -318,7 +318,19 @@ class EinsumTest(test.TestCase):
m1: [3, 2],
}
np.testing.assert_almost_equal(
- [[7]], sess.run(out, feed_dict=feed_dict))
+ [[7]], sess.run(out, feed_dict=feed_dict))
+
+ with ops.Graph().as_default():
+ m0 = array_ops.placeholder(dtypes.int32, shape=(None, 2, None, 2))
+ m1 = array_ops.placeholder(dtypes.int32, shape=(None, 2))
+ out = special_math_ops.einsum('ijkl,ij->ikl', m0, m1)
+ with session.Session() as sess:
+ feed_dict = {
+ m0: [[[[1, 2]], [[2, 1]]]],
+ m1: [[3, 2]],
+ }
+ np.testing.assert_almost_equal(
+ [[[7, 8]]], sess.run(out, feed_dict=feed_dict))
if __name__ == '__main__':
diff --git a/tensorflow/python/ops/variables.py b/tensorflow/python/ops/variables.py
index 5a1a43b5d5..ee1e361093 100644
--- a/tensorflow/python/ops/variables.py
+++ b/tensorflow/python/ops/variables.py
@@ -723,7 +723,8 @@ class Variable(object):
"""The `Graph` of this variable."""
return self._variable.graph
- def get_shape(self):
+ @property
+ def shape(self):
"""The `TensorShape` of this variable.
Returns:
@@ -731,6 +732,10 @@ class Variable(object):
"""
return self._variable.get_shape()
+ def get_shape(self):
+ """Alias of Variable.shape."""
+ return self.shape
+
def to_proto(self, export_scope=None):
"""Converts a `Variable` to a `VariableDef` protocol buffer.
diff --git a/tensorflow/python/platform/test.py b/tensorflow/python/platform/test.py
index 0001eb65e1..452b8f5d3b 100644
--- a/tensorflow/python/platform/test.py
+++ b/tensorflow/python/platform/test.py
@@ -42,6 +42,7 @@ from tensorflow.python.util.all_util import remove_undocumented
# pylint: disable=unused-import
from tensorflow.python.framework.test_util import assert_equal_graph_def
from tensorflow.python.framework.test_util import TensorFlowTestCase as TestCase
+from tensorflow.python.framework.test_util import gpu_device_name
from tensorflow.python.ops.gradient_checker import compute_gradient_error
from tensorflow.python.ops.gradient_checker import compute_gradient
@@ -107,15 +108,6 @@ def is_gpu_available(cuda_only=False):
return any((x.device_type == 'GPU' or x.device_type == 'SYCL')
for x in _device_lib.list_local_devices())
-
-def gpu_device_name():
- """Returns the name of a GPU device if available or the empty string."""
- for x in _device_lib.list_local_devices():
- if x.device_type == 'GPU' or x.device_type == 'SYCL':
- return x.name
- return ''
-
-
_allowed_symbols = [
# We piggy-back googletest documentation.
'Benchmark',
diff --git a/tensorflow/stream_executor/cuda/cuda_diagnostics.cc b/tensorflow/stream_executor/cuda/cuda_diagnostics.cc
index def28053f4..01ce67252e 100644
--- a/tensorflow/stream_executor/cuda/cuda_diagnostics.cc
+++ b/tensorflow/stream_executor/cuda/cuda_diagnostics.cc
@@ -57,7 +57,7 @@ namespace cuda {
#ifdef __APPLE__
static const CFStringRef kDriverKextIdentifier = CFSTR("com.nvidia.CUDA");
-#else
+#elif !defined(PLATFORM_WINDOWS)
static const char *kDriverVersionPath = "/proc/driver/nvidia/version";
#endif
@@ -341,6 +341,12 @@ port::StatusOr<DriverVersion> Diagnostician::FindKernelDriverVersion() {
CFStringGetCStringPtr(kDriverKextIdentifier, kCFStringEncodingUTF8))
};
return status;
+#elif defined(PLATFORM_WINDOWS)
+ auto status =
+ port::Status{port::error::UNIMPLEMENTED,
+ "kernel reported driver version not implemented on Windows"
+ };
+ return status;
#else
FILE *driver_version_file = fopen(kDriverVersionPath, "r");
if (driver_version_file == nullptr) {
diff --git a/tensorflow/tools/ci_build/README.md b/tensorflow/tools/ci_build/README.md
index 5c90fceaf7..eede1f5fac 100644
--- a/tensorflow/tools/ci_build/README.md
+++ b/tensorflow/tools/ci_build/README.md
@@ -53,10 +53,10 @@ tensorflow/tools/ci_build/ci_build.sh CPU bazel test //tensorflow/...
tensorflow/tools/ci_build/ci_build.sh GPU bazel test -c opt --config=cuda //tensorflow/...
# build pip with gpu support
-tensorflow/tools/ci_build/ci_build.sh GPU tensorflow/tools/ci_build/builds/pip.sh GPU
+tensorflow/tools/ci_build/ci_build.sh GPU tensorflow/tools/ci_build/builds/pip.sh GPU -c opt --config=cuda
# build and run gpu tests using python 3
-CI_DOCKER_EXTRA_PARAMS="-e CI_BUILD_PYTHON=python3" tensorflow/tools/ci_build/ci_build.sh GPU tensorflow/tools/ci_build/builds/pip.sh GPU
+CI_DOCKER_EXTRA_PARAMS="-e CI_BUILD_PYTHON=python3" tensorflow/tools/ci_build/ci_build.sh GPU tensorflow/tools/ci_build/builds/pip.sh GPU -c opt --config=cuda
# build android example app
tensorflow/tools/ci_build/ci_build.sh ANDROID tensorflow/tools/ci_build/builds/android.sh
diff --git a/tensorflow/tools/ci_build/builds/pip.sh b/tensorflow/tools/ci_build/builds/pip.sh
index 5527feee34..1399f1e58f 100755
--- a/tensorflow/tools/ci_build/builds/pip.sh
+++ b/tensorflow/tools/ci_build/builds/pip.sh
@@ -19,8 +19,7 @@
# The PIP installation is done using the --user flag.
#
# Usage:
-# pip.sh CONTAINER_TYPE [--mavx] [--mavx2]
-# [--test_tutorials] [--integration_tests]
+# pip.sh CONTAINER_TYPE [--test_tutorials] [--integration_tests] [bazel flags]
#
# When executing the Python unit tests, the script obeys the shell
# variables: TF_BUILD_BAZEL_CLEAN, TF_BUILD_INSTALL_EXTRA_PIP_PACKAGES,
@@ -39,8 +38,7 @@
# If NO_TEST_USER_OPS has any non-empty and non-0 value, the testing of user-
# defined ops against the installation will be skipped.
#
-# Use --mavx or --mavx2 to let bazel use --copt=-mavx or --copt=-mavx2 options
-# while building the pip package, respectively.
+# Any flags not listed in the usage above will be passed directly to Bazel.
#
# If the --test_tutorials flag is set, it will cause the script to run the
# tutorial tests (see test_tutorials.sh) after the PIP
@@ -49,6 +47,11 @@
# to run.
#
+# Helper function: Strip leading and trailing whitespaces
+str_strip () {
+ echo -e "$1" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//'
+}
+
# Fixed naming patterns for wheel (.whl) files given different python versions
if [[ $(uname) == "Linux" ]]; then
declare -A WHL_TAGS
@@ -66,6 +69,7 @@ source "${SCRIPT_DIR}/builds_common.sh"
# Get the command line arguments
CONTAINER_TYPE=$( echo "$1" | tr '[:upper:]' '[:lower:]' )
+shift
if [[ ! -z "${TF_BUILD_BAZEL_CLEAN}" ]] && \
[[ "${TF_BUILD_BAZEL_CLEAN}" != "0" ]]; then
@@ -82,16 +86,14 @@ fi
DO_TEST_TUTORIALS=0
DO_INTEGRATION_TESTS=0
-MAVX_FLAG=""
+BAZEL_FLAGS=""
while true; do
if [[ "${1}" == "--test_tutorials" ]]; then
DO_TEST_TUTORIALS=1
elif [[ "${1}" == "--integration_tests" ]]; then
DO_INTEGRATION_TESTS=1
- elif [[ "${1}" == "--mavx" ]]; then
- MAVX_FLAG="--copt=-mavx"
- elif [[ "${1}" == "--mavx2" ]]; then
- MAVX_FLAG="--copt=-mavx2"
+ else
+ BAZEL_FLAGS="${BAZEL_FLAGS} ${1}"
fi
shift
@@ -100,18 +102,18 @@ while true; do
fi
done
-if [[ ! -z "${MAVX_FLAG}" ]]; then
- echo "Using MAVX flag: ${MAVX_FLAG}"
-fi
+BAZEL_FLAGS=$(str_strip "${BAZEL_FLAGS}")
+
+echo "Using Bazel flags: ${BAZEL_FLAGS}"
PIP_BUILD_TARGET="//tensorflow/tools/pip_package:build_pip_package"
GPU_FLAG=""
if [[ ${CONTAINER_TYPE} == "cpu" ]] || \
[[ ${CONTAINER_TYPE} == "debian.jessie.cpu" ]]; then
- bazel build -c opt ${MAVX_FLAG} ${PIP_BUILD_TARGET} || \
+ bazel build ${BAZEL_FLAGS} ${PIP_BUILD_TARGET} || \
die "Build failed."
elif [[ ${CONTAINER_TYPE} == "gpu" ]]; then
- bazel build -c opt --config=cuda ${MAVX_FLAG} ${PIP_BUILD_TARGET} || \
+ bazel build ${BAZEL_FLAGS} ${PIP_BUILD_TARGET} || \
die "Build failed."
GPU_FLAG="--gpu"
else
diff --git a/tensorflow/tools/ci_build/ci_build.sh b/tensorflow/tools/ci_build/ci_build.sh
index 3697fd46a0..f0fa8a9381 100755
--- a/tensorflow/tools/ci_build/ci_build.sh
+++ b/tensorflow/tools/ci_build/ci_build.sh
@@ -26,7 +26,7 @@
# directory as this script will be used.
#
# COMMAND: Command to be executed in the docker container, e.g.,
-# tensorflow/tools/ci_build/builds/pip.sh gpu
+# tensorflow/tools/ci_build/builds/pip.sh gpu -c opt --config=cuda
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "${SCRIPT_DIR}/builds/builds_common.sh"
diff --git a/tensorflow/tools/ci_build/ci_parameterized_build.sh b/tensorflow/tools/ci_build/ci_parameterized_build.sh
index 5e2cf94059..c2ad03bc6f 100755
--- a/tensorflow/tools/ci_build/ci_parameterized_build.sh
+++ b/tensorflow/tools/ci_build/ci_parameterized_build.sh
@@ -83,7 +83,8 @@
# support for Google Cloud Platform (GCP), which is
# enabled by default.
# TF_BUILD_OPTIONS:
-# (FASTBUILD | OPT | OPTDBG | MAVX | MAVX2)
+# (FASTBUILD | OPT | OPTDBG | MAVX | MAVX2_FMA | MAVX_DBG |
+# MAVX2_FMA_DBG)
# Use the specified configurations when building.
# When set, overrides TF_BUILD_IS_OPT and TF_BUILD_MAVX
# options, as this will replace the two.
@@ -304,11 +305,14 @@ else
MAVX)
OPT_FLAG="${OPT_FLAG} -c opt --copt=-mavx"
;;
- MAVXDBG)
+ MAVX_DBG)
OPT_FLAG="${OPT_FLAG} -c opt --copt=-g --copt=-mavx"
;;
- MAVX2)
- OPT_FLAG="${OPT_FLAG} -c opt --copt=-mavx2"
+ MAVX2_FMA)
+ OPT_FLAG="${OPT_FLAG} -c opt --copt=-mavx2 --copt=-mfma"
+ ;;
+ MAVX2_FMA_DBG)
+ OPT_FLAG="${OPT_FLAG} -c opt --copt=-g --copt=-mavx2 --copt=-mfma"
;;
esac
fi
@@ -376,12 +380,7 @@ if [[ ${TF_BUILD_IS_PIP} == "pip" ]] ||
exit 0
fi
- PIP_MAIN_CMD="${MAIN_CMD} ${PIP_CMD} ${CTYPE} ${EXTRA_AGRS}"
-
- # Add flag for mavx/mavx2
- if [[ ! -z "${TF_BUILD_MAVX}" ]]; then
- PIP_MAIN_CMD="${PIP_MAIN_CMD} --${TF_BUILD_MAVX}"
- fi
+ PIP_MAIN_CMD="${MAIN_CMD} ${PIP_CMD} ${CTYPE} ${EXTRA_ARGS} ${OPT_FLAG}"
# Add flag for integration tests
if [[ ! -z "${TF_BUILD_INTEGRATION_TESTS}" ]] &&
diff --git a/tensorflow/tools/ci_build/install/install_buildifier.sh b/tensorflow/tools/ci_build/install/install_buildifier.sh
index 2f3470881a..5420934c6b 100755
--- a/tensorflow/tools/ci_build/install/install_buildifier.sh
+++ b/tensorflow/tools/ci_build/install/install_buildifier.sh
@@ -16,8 +16,9 @@
set -e
BUILDIFIER_DIR="buildifier"
-rm -rf ${BUILDIFIER_DIR}
-git clone https://github.com/bazelbuild/buildifier.git ${BUILDIFIER_DIR}
+mkdir ${BUILDIFIER_DIR}
+curl -Ls https://github.com/bazelbuild/buildifier/archive/0.4.3.tar.gz | \
+ tar -C "${BUILDIFIER_DIR}" --strip-components=1 -xz
pushd ${BUILDIFIER_DIR}
bazel build buildifier:buildifier --spawn_strategy=standalone --genrule_strategy=standalone
diff --git a/tensorflow/tools/docker/parameterized_docker_build.sh b/tensorflow/tools/docker/parameterized_docker_build.sh
index 35c1218470..886266caaf 100755
--- a/tensorflow/tools/docker/parameterized_docker_build.sh
+++ b/tensorflow/tools/docker/parameterized_docker_build.sh
@@ -266,7 +266,7 @@ fi
IMG="${USER}/tensorflow:${FINAL_TAG}"
echo "Building docker image with image name and tag: ${IMG}"
-"${DOCKER_BINARY}" build --no-cache -t "${IMG}" -f "${DOCKERFILE}" "${TMP_DIR}"
+"${DOCKER_BINARY}" build --no-cache --pull -t "${IMG}" -f "${DOCKERFILE}" "${TMP_DIR}"
if [[ $? == "0" ]]; then
echo "${DOCKER_BINARY} build of ${IMG} succeeded"
else
diff --git a/tensorflow/tools/graph_transforms/README.md b/tensorflow/tools/graph_transforms/README.md
index 36a5c01a0c..73af8699b2 100644
--- a/tensorflow/tools/graph_transforms/README.md
+++ b/tensorflow/tools/graph_transforms/README.md
@@ -179,7 +179,7 @@ bazel-bin/tensorflow/tools/graph_transforms/transform_graph \
--outputs='softmax:0' \
--transforms='\
strip_unused_nodes(type=float, shape="1,299,299,3") \
-fold_constants \
+fold_constants(ignore_errors=true) \
fold_batch_norms \
fold_old_batch_norms\
'
@@ -715,7 +715,7 @@ shape arguments let you control the attributes of any new Placeholders that are
created. Plain `type` and `shape` set global defaults, but if you have different
inputs with varying characteristics, you'll need to pass in a list of arguments
where the preceding name specifies what layer each applies to. For example, if
-you had two inputs in1 and in2, you could call `strip_unused_node(name=in1,
+you had two inputs in1 and in2, you could call `strip_unused_nodes(name=in1,
type_for_name=int32, shape_for_name="2,3", name=in2, type_for_name=float,
shape_for_name="1,10,10,3")`.
diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py
index ea4c7e9ff4..9b229704d6 100644
--- a/tensorflow/tools/pip_package/setup.py
+++ b/tensorflow/tools/pip_package/setup.py
@@ -29,7 +29,7 @@ from setuptools.dist import Distribution
# This version string is semver compatible, but incompatible with pip.
# For pip, we will remove all '-' characters from this string, and use the
# result for pip.
-_VERSION = '1.0.0-rc2'
+_VERSION = '1.0.0'
REQUIRED_PACKAGES = [
'numpy >= 1.11.0',
diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index 5a2fe99c67..66ff8843d8 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -406,12 +406,16 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
actual = "@zlib_archive//:zlib",
)
- native.new_http_archive(
+ temp_workaround_http_archive(
name = "nccl_archive",
- url = "https://github.com/nvidia/nccl/archive/024d1e267845f2ed06f3e2e42476d50f04a00ee6.tar.gz",
+ urls = [
+ "http://bazel-mirror.storage.googleapis.com/github.com/nvidia/nccl/archive/024d1e267845f2ed06f3e2e42476d50f04a00ee6.tar.gz",
+ "https://github.com/nvidia/nccl/archive/024d1e267845f2ed06f3e2e42476d50f04a00ee6.tar.gz",
+ ],
sha256 = "6787f0eed88d52ee8e32956fa4947d92c139da469f1d8e311c307f27d641118e",
strip_prefix = "nccl-024d1e267845f2ed06f3e2e42476d50f04a00ee6",
build_file = str(Label("//third_party:nccl.BUILD")),
+ repository = tf_repo_name,
)
java_import_external(
diff --git a/third_party/curl.BUILD b/third_party/curl.BUILD
index dde8e6cdb7..557c2885eb 100644
--- a/third_party/curl.BUILD
+++ b/third_party/curl.BUILD
@@ -204,13 +204,13 @@ cc_library(
"lib/wildcard.h",
"lib/x509asn1.h",
] + select({
- ":darwin": [
+ "@//tensorflow:darwin": [
"lib/vtls/darwinssl.c",
],
- ":ios": [
+ "@//tensorflow:ios": [
"lib/vtls/darwinssl.c",
],
- ":windows": [
+ "@//tensorflow:windows": [
"lib/asyn-thread.c",
"lib/inet_ntop.c",
"lib/system_win32.c",
@@ -231,7 +231,7 @@ cc_library(
"include/curl/typecheck-gcc.h",
],
copts = select({
- ":windows": [
+ "@//tensorflow:windows": [
"/I%prefix%/curl/lib",
"/DHAVE_CONFIG_H",
"/DCURL_DISABLE_FTP",
@@ -255,10 +255,10 @@ cc_library(
"-Wno-string-plus-int",
],
}) + select({
- ":darwin": [
+ "@//tensorflow:darwin": [
"-fno-constant-cfstrings",
],
- ":windows": [
+ "@//tensorflow:windows": [
# See curl.h for discussion of write size and Windows
"/DCURL_MAX_WRITE_SIZE=16384",
],
@@ -268,17 +268,17 @@ cc_library(
}),
includes = ["include"],
linkopts = select({
- ":android": [
+ "@//tensorflow:android": [
"-pie",
],
- ":darwin": [
+ "@//tensorflow:darwin": [
"-Wl,-framework",
"-Wl,CoreFoundation",
"-Wl,-framework",
"-Wl,Security",
],
- ":ios": [],
- ":windows": [
+ "@//tensorflow:ios": [],
+ "@//tensorflow:windows": [
"ws2_32.lib",
],
"//conditions:default": [
@@ -289,8 +289,8 @@ cc_library(
deps = [
"@zlib_archive//:zlib",
] + select({
- ":ios": [],
- ":windows": [],
+ "@//tensorflow:ios": [],
+ "@//tensorflow:windows": [],
"//conditions:default": [
"@boringssl//:ssl",
],
@@ -386,7 +386,7 @@ cc_binary(
"src/tool_xattr.h",
],
copts = select({
- ":windows": [
+ "@//tensorflow:windows": [
"/I%prefix%/curl/lib",
"/DHAVE_CONFIG_H",
"/DCURL_DISABLE_LIBCURL_OPTION",
@@ -657,23 +657,3 @@ genrule(
"EOF",
]),
)
-
-config_setting(
- name = "ios",
- values = {"crosstool_top": "//tools/osx/crosstool:crosstool"},
-)
-
-config_setting(
- name = "darwin",
- values = {"cpu": "darwin"},
-)
-
-config_setting(
- name = "windows",
- values = {"cpu": "x64_windows_msvc"},
-)
-
-config_setting(
- name = "android",
- values = {"crosstool_top": "//external:android/crosstool"},
-)
diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX2.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX2.h
index 98deb1742e..078be83e0d 100644
--- a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX2.h
+++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX2.h
@@ -11,6 +11,13 @@ typedef struct Packet32q8i {
Packet32q8i(__m256i val) : val(val) {}
} Packet32q8i;
+typedef struct Packet16q16i {
+ __m256i val;
+ operator __m256i() const { return val; }
+ Packet16q16i();
+ Packet16q16i(__m256i val) : val(val) {}
+} Packet16q16i;
+
typedef struct Packet32q8u {
__m256i val;
operator __m256i() const { return val; }
@@ -32,6 +39,13 @@ typedef struct Packet16q8u {
Packet16q8u(__m128i val) : val(val) {}
} Packet16q8u;
+typedef struct Packet8q16i {
+ __m128i val;
+ operator __m128i() const { return val; }
+ Packet8q16i();
+ Packet8q16i(__m128i val) : val(val) {}
+} Packet8q16i;
+
typedef struct Packet8q32i {
__m256i val;
operator __m256i() const { return val; }
@@ -92,6 +106,28 @@ struct packet_traits<QUInt8> : default_packet_traits {
};
};
template <>
+struct packet_traits<QInt16> : default_packet_traits {
+ typedef Packet16q16i type;
+ typedef Packet8q16i half;
+ enum {
+ Vectorizable = 1,
+ AlignedOnScalar = 1,
+ size = 16,
+ };
+ enum {
+ HasAdd = 0,
+ HasSub = 0,
+ HasMul = 0,
+ HasNegate = 0,
+ HasAbs = 0,
+ HasAbs2 = 0,
+ HasMin = 1,
+ HasMax = 1,
+ HasConj = 0,
+ HasSetLinear = 0
+ };
+};
+template <>
struct packet_traits<QInt32> : default_packet_traits {
typedef Packet8q32i type;
typedef Packet4q32i half;
@@ -122,6 +158,12 @@ struct unpacket_traits<Packet32q8i> {
enum { size = 32, alignment=Aligned32 };
};
template <>
+struct unpacket_traits<Packet16q16i> {
+ typedef QInt16 type;
+ typedef Packet8q16i half;
+ enum { size = 16, alignment=Aligned32 };
+};
+template <>
struct unpacket_traits<Packet32q8u> {
typedef QUInt8 type;
typedef Packet16q8u half;
@@ -146,6 +188,11 @@ EIGEN_STRONG_INLINE Packet32q8u ploadu<Packet32q8u>(const QUInt8* from) {
reinterpret_cast<const __m256i*>(from));
}
template <>
+EIGEN_STRONG_INLINE Packet16q16i ploadu<Packet16q16i>(const QInt16* from) {
+ EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(
+ reinterpret_cast<const __m256i*>(from));
+}
+template <>
EIGEN_STRONG_INLINE Packet8q32i ploadu<Packet8q32i>(const QInt32* from) {
EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(
reinterpret_cast<const __m256i*>(from));
@@ -163,6 +210,11 @@ EIGEN_STRONG_INLINE Packet32q8u pload<Packet32q8u>(const QUInt8* from) {
reinterpret_cast<const __m256i*>(from));
}
template <>
+EIGEN_STRONG_INLINE Packet16q16i pload<Packet16q16i>(const QInt16* from) {
+ EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256(
+ reinterpret_cast<const __m256i*>(from));
+}
+template <>
EIGEN_STRONG_INLINE Packet8q32i pload<Packet8q32i>(const QInt32* from) {
EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256(
reinterpret_cast<const __m256i*>(from));
@@ -180,6 +232,11 @@ EIGEN_STRONG_INLINE void pstoreu<QUInt8>(QUInt8* to, const Packet32q8u& from) {
reinterpret_cast<__m256i*>(to), from.val);
}
template <>
+EIGEN_STRONG_INLINE void pstoreu<QInt16>(QInt16* to, const Packet16q16i& from) {
+ EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(
+ reinterpret_cast<__m256i*>(to), from.val);
+}
+template <>
EIGEN_STRONG_INLINE void pstoreu<QInt32>(QInt32* to, const Packet8q32i& from) {
EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(
reinterpret_cast<__m256i*>(to), from.val);
@@ -192,6 +249,11 @@ EIGEN_STRONG_INLINE void pstore<QInt32>(QInt32* to, const Packet8q32i& from) {
from.val);
}
template <>
+EIGEN_STRONG_INLINE void pstore<QInt16>(QInt16* to, const Packet16q16i& from) {
+ EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to),
+ from.val);
+}
+template <>
EIGEN_STRONG_INLINE void pstore<QUInt8>(QUInt8* to, const Packet32q8u& from) {
EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to),
from.val);
@@ -208,6 +270,10 @@ EIGEN_STRONG_INLINE QInt32 pfirst<Packet8q32i>(const Packet8q32i& a) {
return _mm_cvtsi128_si32(_mm256_castsi256_si128(a));
}
template <>
+EIGEN_STRONG_INLINE QInt16 pfirst<Packet16q16i>(const Packet16q16i& a) {
+ return _mm256_extract_epi16(a.val, 0);
+}
+template <>
EIGEN_STRONG_INLINE QUInt8 pfirst<Packet32q8u>(const Packet32q8u& a) {
return static_cast<uint8_t>(_mm256_extract_epi8(a.val, 0));
}
@@ -237,6 +303,10 @@ EIGEN_STRONG_INLINE Packet8q32i padd<Packet8q32i>(const Packet8q32i& a,
return _mm256_add_epi32(a.val, b.val);
}
template <>
+EIGEN_STRONG_INLINE Packet16q16i pset1<Packet16q16i>(const QInt16& from) {
+ return _mm256_set1_epi16(from.value);
+}
+template <>
EIGEN_STRONG_INLINE Packet8q32i psub<Packet8q32i>(const Packet8q32i& a,
const Packet8q32i& b) {
return _mm256_sub_epi32(a.val, b.val);
@@ -265,6 +335,17 @@ EIGEN_STRONG_INLINE Packet8q32i pmax<Packet8q32i>(const Packet8q32i& a,
}
template <>
+EIGEN_STRONG_INLINE Packet16q16i pmin<Packet16q16i>(const Packet16q16i& a,
+ const Packet16q16i& b) {
+ return _mm256_min_epi16(a.val, b.val);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16q16i pmax<Packet16q16i>(const Packet16q16i& a,
+ const Packet16q16i& b) {
+ return _mm256_max_epi16(a.val, b.val);
+}
+
+template <>
EIGEN_STRONG_INLINE Packet32q8u pmin<Packet32q8u>(const Packet32q8u& a,
const Packet32q8u& b) {
return _mm256_min_epu8(a.val, b.val);
@@ -305,6 +386,23 @@ EIGEN_STRONG_INLINE QInt32 predux_max<Packet8q32i>(const Packet8q32i& a) {
}
template <>
+EIGEN_STRONG_INLINE QInt16 predux_min<Packet16q16i>(const Packet16q16i& a) {
+ __m256i tmp = _mm256_min_epi16(a, _mm256_permute2f128_si256(a, a, 1));
+ tmp =
+ _mm256_min_epi16(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
+ tmp = _mm256_min_epi16(tmp, _mm256_shuffle_epi32(tmp, 1));
+ return std::min(_mm256_extract_epi16(tmp, 0), _mm256_extract_epi16(tmp, 1));
+}
+template <>
+EIGEN_STRONG_INLINE QInt16 predux_max<Packet16q16i>(const Packet16q16i& a) {
+ __m256i tmp = _mm256_max_epi16(a, _mm256_permute2f128_si256(a, a, 1));
+ tmp =
+ _mm256_max_epi16(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
+ tmp = _mm256_max_epi16(tmp, _mm256_shuffle_epi32(tmp, 1));
+ return std::max(_mm256_extract_epi16(tmp, 0), _mm256_extract_epi16(tmp, 1));
+}
+
+template <>
EIGEN_STRONG_INLINE QUInt8 predux_min<Packet32q8u>(const Packet32q8u& a) {
__m256i tmp = _mm256_min_epu8(a, _mm256_permute2f128_si256(a, a, 1));
tmp =
diff --git a/third_party/mkl/BUILD b/third_party/mkl/BUILD
index ddaf29a086..7e95ebd355 100644
--- a/third_party/mkl/BUILD
+++ b/third_party/mkl/BUILD
@@ -8,12 +8,17 @@ config_setting(
visibility = ["//visibility:public"],
)
+load(
+ "//third_party/mkl:build_defs.bzl",
+ "if_mkl",
+)
+
cc_library(
name = "intel_binary_blob",
- srcs = [
- "libiomp5.so",
+ srcs = if_mkl([
"libmklml_intel.so",
- ],
+ "libiomp5.so",
+ ]),
includes = ["."],
visibility = ["//visibility:public"],
)
diff --git a/third_party/nccl.BUILD b/third_party/nccl.BUILD
index bb460a05e0..06b9b8ff68 100644
--- a/third_party/nccl.BUILD
+++ b/third_party/nccl.BUILD
@@ -43,6 +43,24 @@ cc_library(
"-Iexternal/nccl_archive/src",
"-O3",
] + cuda_default_copts(),
+ linkopts = select({
+ "@%ws%//tensorflow:android": [
+ "-pie",
+ ],
+ "@%ws%//tensorflow:darwin": [
+ "-Wl,-framework",
+ "-Wl,CoreFoundation",
+ "-Wl,-framework",
+ "-Wl,Security",
+ ],
+ "@%ws%//tensorflow:ios": [],
+ "@%ws%//tensorflow:windows": [
+ "ws2_32.lib",
+ ],
+ "//conditions:default": [
+ "-lrt",
+ ],
+ }),
visibility = ["//visibility:public"],
deps = ["@local_config_cuda//cuda:cuda_headers"],
)
diff --git a/third_party/sycl/crosstool/computecpp.tpl b/third_party/sycl/crosstool/computecpp.tpl
index a5e6b9fe93..66dd9aea7b 100755
--- a/third_party/sycl/crosstool/computecpp.tpl
+++ b/third_party/sycl/crosstool/computecpp.tpl
@@ -26,9 +26,7 @@ def main():
if(output_file_index == 1):
# we are linking
- return subprocess.call([CPU_CXX_COMPILER] + compiler_flags)
-
- compiler_flags = compiler_flags + ['-D_GLIBCXX_USE_CXX11_ABI=0', '-DEIGEN_USE_SYCL=1']
+ return subprocess.call([CPU_CXX_COMPILER] + compiler_flags + ['-Wl,--no-undefined'])
# find what we compile
compiling_cpp = 0
@@ -38,6 +36,28 @@ def main():
if(compited_file_name.endswith(('.cc', '.c++', '.cpp', '.CPP', '.C', '.cxx'))):
compiling_cpp = 1;
+ compiler_flags = compiler_flags + ['-D_GLIBCXX_USE_CXX11_ABI=0', '-DEIGEN_USE_SYCL=1', '-DTENSORFLOW_USE_SYCL', '-DEIGEN_HAS_C99_MATH']
+
+ if(compiling_cpp == 1):
+ # create a blacklist of folders that will be skipped when compiling with ComputeCpp
+ _skip = ["external", "llvm", ".cu.cc"]
+ # if compiling external project skip computecpp
+ if any(_folder in _skip for _folder in output_file_name):
+ return subprocess.call([CPU_CXX_COMPILER] + compiler_flags)
+
+ if(compiling_cpp == 1):
+ # this is an optimisation that will check if compiled file has to be compiled with ComputeCpp
+
+ _tmp_flags = [flag for flag in compiler_flags if not flag.startswith(('-o', output_file_name))]
+ # create preprocessed of the file
+ _cmd = " ".join([CPU_CXX_COMPILER] + _tmp_flags + ["-E"])
+ # check if it has parallel_for< in it
+ _cmd += " | grep \".parallel_for\" > /dev/null"
+ ps = subprocess.call(_cmd, shell=True)
+ # if not call CXX compiler
+ if(ps != 0):
+ return subprocess.call([CPU_CXX_COMPILER] + compiler_flags)
+
if(compiling_cpp == 1):
filename, file_extension = os.path.splitext(output_file_name)
bc_out = filename + '.sycl'
@@ -52,9 +72,12 @@ def main():
# dont want that in case of compiling with computecpp first
host_compiler_flags = [flag for flag in compiler_flags
if not flag.startswith(('-MF', '-MD',))
- if not '.d' in flag]
+ if not '.d' in flag
+ ]
+
+ host_compiler_flags[host_compiler_flags.index('-c')] = "--include"
- host_compiler_flags = ['-D_GLIBCXX_USE_CXX11_ABI=0', '-DTENSORFLOW_USE_SYCL', '-Wno-unused-variable', '-I', COMPUTECPP_INCLUDE, '--include', bc_out] + host_compiler_flags
+ host_compiler_flags = ['-xc++', '-D_GLIBCXX_USE_CXX11_ABI=0', '-DTENSORFLOW_USE_SYCL', '-Wno-unused-variable', '-I', COMPUTECPP_INCLUDE, '-c', bc_out] + host_compiler_flags
x = subprocess.call([CPU_CXX_COMPILER] + host_compiler_flags)
return x
else:
diff --git a/tools/bazel.rc.template b/tools/bazel.rc.template
index 48c9f5aa3f..3622b9423c 100644
--- a/tools/bazel.rc.template
+++ b/tools/bazel.rc.template
@@ -7,7 +7,7 @@ build:sycl --crosstool_top=@local_config_sycl//crosstool:toolchain
build:sycl --define=using_sycl=true
build:sycl_asan --crosstool_top=@local_config_sycl//crosstool:toolchain
-build:sycl_asan --define=using_sycl=true --copt -fno-omit-frame-pointer --copt -fsanitize-coverage=3 --copt -fsanitize=address --copt -DGPR_NO_DIRECT_SYSCALLS --linkopt -fPIC --linkopt -lasan
+build:sycl_asan --define=using_sycl=true --copt -fno-omit-frame-pointer --copt -fsanitize-coverage=3 --copt -DGPR_NO_DIRECT_SYSCALLS --linkopt -fPIC --linkopt -fsanitize=address
build --force_python=py$PYTHON_MAJOR_VERSION
build --host_force_python=py$PYTHON_MAJOR_VERSION