From 50b999a8336d19400ab75aea66fe46eca2f5fe0b Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 27 Jun 2017 16:33:00 -0700 Subject: Merge changes from github. PiperOrigin-RevId: 160344052 --- CONTRIBUTING.md | 7 +- ISSUE_TEMPLATE.md | 1 + README.md | 14 +- RELEASE.md | 2 + configure | 37 ++- tensorflow/BUILD | 1 + tensorflow/c/generate-pc.sh | 2 +- tensorflow/cc/BUILD | 21 +- tensorflow/cc/gradients/math_grad.cc | 26 ++ tensorflow/cc/gradients/math_grad_test.cc | 52 ++++ tensorflow/cc/gradients/nn_grad.cc | 13 + tensorflow/cc/gradients/nn_grad_test.cc | 13 + tensorflow/compiler/jit/BUILD | 5 +- tensorflow/compiler/jit/kernels/BUILD | 1 + tensorflow/compiler/plugin/BUILD | 4 +- tensorflow/compiler/plugin/executor/BUILD | 34 ++ tensorflow/compiler/plugin/executor/compiler.cc | 122 ++++++++ tensorflow/compiler/plugin/executor/compiler.h | 62 ++++ tensorflow/compiler/plugin/executor/device.cc | 60 ++++ tensorflow/compiler/plugin/executor/executable.cc | 147 +++++++++ tensorflow/compiler/plugin/executor/executable.h | 65 ++++ tensorflow/compiler/plugin/executor/executor.cc | 135 ++++++++ tensorflow/compiler/plugin/executor/executor.h | 213 +++++++++++++ tensorflow/compiler/plugin/executor/platform.cc | 125 ++++++++ tensorflow/compiler/plugin/executor/platform.h | 83 +++++ tensorflow/compiler/plugin/executor/platform_id.h | 31 ++ .../compiler/plugin/executor/transfer_manager.cc | 187 +++++++++++ .../compiler/plugin/executor/transfer_manager.h | 77 +++++ tensorflow/compiler/tests/BUILD | 18 +- tensorflow/compiler/tests/ftrl_test.py | 2 +- .../compiler/tf2xla/kernels/batch_matmul_op.cc | 6 +- .../compiler/tf2xla/kernels/batchtospace_op.cc | 3 +- .../compiler/tf2xla/kernels/depthwise_conv_ops.cc | 19 +- tensorflow/compiler/tf2xla/kernels/diag_op.cc | 8 +- .../compiler/tf2xla/kernels/dynamic_stitch_op.cc | 4 +- tensorflow/compiler/tf2xla/kernels/slice_op.cc | 4 +- tensorflow/compiler/tf2xla/kernels/split_op.cc | 14 +- .../compiler/tf2xla/kernels/strided_slice_op.cc | 44 +-- .../compiler/tf2xla/kernels/tensor_array_ops.cc | 8 +- tensorflow/compiler/tf2xla/kernels/unpack_op.cc | 4 +- .../compiler/xla/client/computation_builder.cc | 6 +- .../compiler/xla/client/computation_builder.h | 4 +- tensorflow/compiler/xla/literal_util.cc | 10 +- tensorflow/compiler/xla/literal_util_test.cc | 57 ++++ .../compiler/xla/service/algebraic_simplifier.cc | 7 +- .../xla/service/algebraic_simplifier_test.cc | 8 +- .../compiler/xla/service/buffer_assignment_test.cc | 12 +- .../compiler/xla/service/buffer_liveness_test.cc | 2 +- .../compiler/xla/service/compile_only_service.h | 2 +- .../compiler/xla/service/computation_placer.cc | 9 +- .../compiler/xla/service/computation_placer.h | 6 +- .../compiler/xla/service/elemental_ir_emitter.cc | 17 +- .../compiler/xla/service/gpu/pad_insertion.cc | 11 +- .../xla/service/hlo_constant_folding_test.cc | 3 +- tensorflow/compiler/xla/service/hlo_instruction.cc | 11 +- tensorflow/compiler/xla/service/hlo_instruction.h | 16 +- .../xla/service/hlo_rematerialization_test.cc | 16 +- tensorflow/compiler/xla/service/shape_inference.cc | 28 +- tensorflow/compiler/xla/service/shape_inference.h | 3 +- .../compiler/xla/service/shape_inference_test.cc | 33 +- .../xla/service/tuple_points_to_analysis_test.cc | 2 +- .../compiler/xla/service/user_computation.cc | 6 +- .../xla/tests/array_elementwise_ops_test.cc | 2 +- .../compiler/xla/tests/dot_operation_test.cc | 4 +- tensorflow/compiler/xla/tests/fusion_test.cc | 2 +- .../xla/tests/multidimensional_slice_test.cc | 4 +- tensorflow/compiler/xla/tests/params_test.cc | 2 +- tensorflow/compiler/xla/tests/slice_test.cc | 51 +-- tensorflow/compiler/xla/tests/while_test.cc | 3 +- tensorflow/compiler/xla/util.h | 18 +- tensorflow/compiler/xla/xla_data.proto | 3 +- .../android/TensorFlowInferenceInterface.java | 20 +- .../contrib/cloud/kernels/bigquery_reader_ops.cc | 2 +- .../cloud/python/ops/bigquery_reader_ops.py | 2 +- tensorflow/contrib/cmake/CMakeLists.txt | 2 +- tensorflow/contrib/data/README.md | 2 +- .../python/ops/bijectors/affine_impl.py | 4 +- .../distributions/python/ops/relaxed_bernoulli.py | 2 +- .../distributions/python/ops/sample_stats.py | 2 +- .../python/ops/vector_laplace_linear_operator.py | 2 +- tensorflow/contrib/graph_editor/transform.py | 2 +- tensorflow/contrib/keras/python/keras/backend.py | 6 +- .../contrib/keras/python/keras/layers/core.py | 2 +- .../contrib/keras/python/keras/layers/recurrent.py | 2 +- .../contrib/keras/python/keras/models_test.py | 2 +- tensorflow/contrib/layers/BUILD | 2 +- tensorflow/contrib/layers/__init__.py | 6 + tensorflow/contrib/layers/python/layers/layers.py | 221 ++++++++++++- .../contrib/layers/python/layers/layers_test.py | 150 ++++++++- .../learn/python/learn/estimators/debug_test.py | 4 +- .../contrib/learn/python/learn/estimators/head.py | 2 +- .../learn/python/learn/estimators/linear_test.py | 20 +- .../learn/python/learn/estimators/model_fn_test.py | 10 +- .../contrib/learn/python/learn/experiment.py | 6 +- tensorflow/contrib/learn/python/learn/models.py | 4 +- .../linalg/python/ops/linear_operator_test_util.py | 2 +- tensorflow/contrib/lookup/lookup_ops.py | 4 +- tensorflow/contrib/makefile/build_with_docker.sh | 2 +- .../contrib/makefile/compile_android_protobuf.sh | 4 +- .../contrib/makefile/compile_ios_protobuf.sh | 17 +- .../contrib/makefile/compile_ios_tensorflow.sh | 4 +- tensorflow/contrib/makefile/compile_pi_protobuf.sh | 6 +- tensorflow/contrib/makefile/tf_op_files.txt | 1 + tensorflow/contrib/ndlstm/python/lstm2d.py | 54 +++- tensorflow/contrib/ndlstm/python/lstm2d_test.py | 8 + tensorflow/contrib/remote_fused_graph/pylib/BUILD | 1 + tensorflow/contrib/rnn/python/ops/rnn_cell.py | 6 +- .../python/kernel_tests/beam_search_ops_test.py | 4 +- tensorflow/contrib/seq2seq/python/ops/decoder.py | 6 +- tensorflow/contrib/slim/README.md | 2 +- tensorflow/contrib/tfprof/README.md | 23 ++ .../tools/tfprof/internal/run_metadata_test.py | 2 +- tensorflow/contrib/tpu/python/tpu/tpu_estimator.py | 4 +- .../contrib/training/python/training/evaluation.py | 2 +- tensorflow/contrib/verbs/rdma.cc | 55 +++- tensorflow/contrib/verbs/rdma_rendezvous_mgr.cc | 41 ++- tensorflow/contrib/verbs/verbs_util.cc | 34 ++ tensorflow/contrib/verbs/verbs_util.h | 10 + tensorflow/core/BUILD | 7 +- tensorflow/core/common_runtime/direct_session.cc | 4 +- tensorflow/core/debug/grpc_session_debug_test.cc | 5 +- .../distributed_runtime/worker_cache_logger.cc | 54 ++-- .../core/distributed_runtime/worker_cache_logger.h | 8 + tensorflow/core/framework/graph_def_util.h | 2 +- tensorflow/core/framework/op.h | 1 - tensorflow/core/framework/op_kernel.h | 2 +- tensorflow/core/framework/tensor.h | 2 +- tensorflow/core/graph/mkl_layout_pass.cc | 277 ++++++++++++----- tensorflow/core/graph/mkl_layout_pass_test.cc | 252 ++++++++++++++- .../core/grappler/costs/virtual_scheduler_test.cc | 2 +- tensorflow/core/grappler/grappler_item.h | 1 + .../core/grappler/optimizers/auto_parallel.cc | 5 + .../core/grappler/optimizers/auto_parallel.h | 1 + .../core/grappler/optimizers/auto_parallel_test.cc | 42 +-- .../core/grappler/optimizers/layout_optimizer.cc | 2 +- tensorflow/core/kernels/BUILD | 16 +- tensorflow/core/kernels/adjust_contrast_op.cc | 24 ++ .../kernels/adjust_contrast_op_benchmark_test.cc | 5 + tensorflow/core/kernels/colorspace_op.cc | 15 + tensorflow/core/kernels/control_flow_ops.cc | 136 +++++--- tensorflow/core/kernels/cwise_op_add_2.cc | 7 +- tensorflow/core/kernels/cwise_op_cosh.cc | 37 +++ tensorflow/core/kernels/cwise_op_gpu_add.cu.cc | 3 +- tensorflow/core/kernels/cwise_op_gpu_cosh.cu.cc | 26 ++ tensorflow/core/kernels/cwise_op_gpu_sinh.cu.cc | 26 ++ tensorflow/core/kernels/cwise_op_invert.cc | 2 +- tensorflow/core/kernels/cwise_op_sinh.cc | 37 +++ .../core/kernels/cwise_op_squared_difference.cc | 13 + tensorflow/core/kernels/cwise_ops.h | 6 + tensorflow/core/kernels/dynamic_stitch_op.cc | 27 +- tensorflow/core/kernels/map_stage_op.cc | 7 +- tensorflow/core/kernels/meta_support.cc | 5 +- .../core/kernels/mkl_conv_grad_filter_ops.cc | 69 +++-- tensorflow/core/kernels/mkl_conv_ops.cc | 51 ++- tensorflow/core/kernels/mkl_lrn_op.cc | 112 +++---- tensorflow/core/kernels/mkl_relu_op.cc | 30 +- tensorflow/core/kernels/mkl_tfconv_op.cc | 54 +++- tensorflow/core/kernels/non_max_suppression_op.cc | 36 ++- tensorflow/core/kernels/priority_queue.cc | 2 +- tensorflow/core/kernels/shape_ops.cc | 56 ++-- tensorflow/core/kernels/slice_op.cc | 258 +++++++++++++++- tensorflow/core/kernels/sparse_reduce_op.cc | 341 +++++++++++++++++++++ tensorflow/core/kernels/sparse_reduce_sum_op.cc | 305 ------------------ tensorflow/core/kernels/stack_ops.cc | 68 +++- tensorflow/core/kernels/tile_functor.h | 115 +++++++ tensorflow/core/kernels/tile_functor_cpu.cc | 85 +++++ tensorflow/core/kernels/tile_functor_gpu.cu.cc | 101 ++++++ tensorflow/core/kernels/tile_ops.cc | 76 ++--- tensorflow/core/kernels/tile_ops_cpu_impl.h | 35 +-- tensorflow/core/kernels/tile_ops_gpu_impl.h | 1 - tensorflow/core/kernels/tile_ops_impl.h | 25 +- tensorflow/core/kernels/topk_op.cc | 2 +- tensorflow/core/kernels/transpose_functor.h | 7 + tensorflow/core/kernels/transpose_op.cc | 5 +- .../kernels/typed_conditional_accumulator_base.h | 2 +- tensorflow/core/lib/gtl/optional.h | 2 +- tensorflow/core/ops/math_grad.cc | 20 ++ tensorflow/core/ops/math_grad_test.cc | 20 ++ tensorflow/core/ops/math_ops.cc | 8 + tensorflow/core/ops/nn_ops.cc | 2 + tensorflow/core/ops/ops.pbtxt | 75 +++++ tensorflow/core/ops/sparse_ops.cc | 69 +++++ tensorflow/core/platform/cloud/retrying_utils.cc | 2 +- tensorflow/core/protobuf/worker.proto | 2 +- tensorflow/core/public/version.h | 2 +- tensorflow/core/util/mkl_util.h | 38 ++- .../docs_src/api_guides/python/contrib.losses.md | 10 +- tensorflow/docs_src/api_guides/python/math_ops.md | 2 + tensorflow/docs_src/extend/estimators.md | 5 +- tensorflow/docs_src/get_started/get_started.md | 5 +- tensorflow/docs_src/get_started/mnist/beginners.md | 2 +- tensorflow/docs_src/get_started/mnist/mechanics.md | 2 +- .../docs_src/get_started/tensorboard_histograms.md | 2 + tensorflow/docs_src/install/install_c.md | 2 +- tensorflow/docs_src/install/install_go.md | 2 +- tensorflow/docs_src/install/install_java.md | 18 +- tensorflow/docs_src/install/install_linux.md | 26 +- tensorflow/docs_src/install/install_mac.md | 14 +- tensorflow/docs_src/install/install_sources.md | 4 +- tensorflow/docs_src/install/install_windows.md | 13 +- .../docs_src/performance/performance_guide.md | 8 +- tensorflow/docs_src/performance/quantization.md | 13 +- tensorflow/docs_src/tutorials/seq2seq.md | 13 +- tensorflow/docs_src/tutorials/wide.md | 18 +- tensorflow/docs_src/tutorials/wide_and_deep.md | 6 +- tensorflow/docs_src/tutorials/word2vec.md | 2 +- tensorflow/examples/android/build.gradle | 2 +- tensorflow/examples/image_retraining/retrain.py | 43 ++- tensorflow/examples/label_image/README.md | 23 +- tensorflow/examples/label_image/label_image.py | 132 ++++++++ tensorflow/examples/learn/examples_test.sh | 2 +- tensorflow/examples/tutorials/mnist/mnist.py | 2 +- .../examples/tutorials/word2vec/word2vec_basic.py | 16 +- tensorflow/go/README.md | 29 -- tensorflow/go/genop/generate.sh | 2 +- tensorflow/go/op/wrappers.go | 30 ++ tensorflow/go/shape.go | 2 +- tensorflow/go/tensor.go | 2 +- tensorflow/java/BUILD | 26 +- tensorflow/java/build_defs.bzl | 11 +- tensorflow/java/maven/libtensorflow/pom.xml | 4 +- tensorflow/java/maven/proto/pom.xml | 6 +- tensorflow/java/maven/release.sh | 2 +- tensorflow/java/maven/run_inside_container.sh | 2 +- .../java/src/main/java/org/tensorflow/Input.java | 48 +++ .../src/main/java/org/tensorflow/Operation.java | 73 ++++- .../main/java/org/tensorflow/OperationBuilder.java | 2 +- .../java/src/main/java/org/tensorflow/Output.java | 35 ++- .../java/src/main/java/org/tensorflow/Session.java | 4 +- .../src/main/java/org/tensorflow/op/NameScope.java | 146 +++++++++ .../src/main/java/org/tensorflow/op/Scope.java | 165 ++++++++++ tensorflow/java/src/main/native/operation_jni.cc | 18 ++ tensorflow/java/src/main/native/operation_jni.h | 11 + .../test/java/org/tensorflow/OperationTest.java | 112 +++++++ .../src/test/java/org/tensorflow/SessionTest.java | 6 +- .../src/test/java/org/tensorflow/TensorTest.java | 2 +- .../src/test/java/org/tensorflow/op/ScopeTest.java | 270 ++++++++++++++++ tensorflow/python/BUILD | 3 + tensorflow/python/debug/cli/analyzer_cli.py | 2 +- tensorflow/python/debug/wrappers/framework_test.py | 2 +- .../python/estimator/canned/dnn_linear_combined.py | 2 +- tensorflow/python/framework/test_util.py | 21 +- tensorflow/python/kernel_tests/BUILD | 10 - tensorflow/python/kernel_tests/basic_gpu_test.py | 2 + tensorflow/python/kernel_tests/cwise_ops_test.py | 15 + tensorflow/python/kernel_tests/fft_ops_test.py | 63 ++-- .../python/kernel_tests/map_stage_op_test.py | 93 +++--- .../python/kernel_tests/record_input_test.py | 14 +- tensorflow/python/kernel_tests/shape_ops_test.py | 18 +- tensorflow/python/kernel_tests/sparse_ops_test.py | 46 ++- tensorflow/python/kernel_tests/stage_op_test.py | 87 +++--- tensorflow/python/lib/io/py_record_writer.cc | 8 + tensorflow/python/lib/io/py_record_writer.h | 1 + tensorflow/python/lib/io/py_record_writer.i | 1 + tensorflow/python/lib/io/tf_record.py | 5 + tensorflow/python/ops/array_ops.py | 30 +- tensorflow/python/ops/data_flow_ops.py | 33 +- .../python/ops/distributions/special_math.py | 2 +- tensorflow/python/ops/gradients_impl.py | 52 ++-- tensorflow/python/ops/lookup_ops.py | 6 +- tensorflow/python/ops/math_grad.py | 18 ++ tensorflow/python/ops/math_ops.py | 7 +- tensorflow/python/ops/rnn_cell_impl.py | 4 +- tensorflow/python/ops/sparse_ops.py | 86 ++++++ tensorflow/python/ops/standard_ops.py | 2 +- tensorflow/python/ops/tensor_array_ops.py | 2 +- tensorflow/python/ops/variable_scope.py | 5 + .../tools/print_selective_registration_header.py | 2 +- tensorflow/python/training/input.py | 7 +- tensorflow/python/training/input_test.py | 7 + tensorflow/python/training/momentum.py | 2 +- tensorflow/tensorflow.bzl | 2 +- tensorflow/tools/api/golden/tensorflow.pbtxt | 16 + .../tensorflow.python_io.-t-f-record-writer.pbtxt | 4 + tensorflow/tools/ci_build/builds/pip.sh | 27 +- tensorflow/tools/ci_build/ci_build.sh | 10 +- .../tools/ci_build/ci_parameterized_build.sh | 17 +- tensorflow/tools/ci_build/ci_sanity.sh | 24 +- .../tools/ci_build/install/install_pip_packages.sh | 4 +- .../tools/ci_build/install/install_proto3.sh | 6 +- .../install/install_python3.5_pip_packages.sh | 2 +- .../ci_build/protobuf/protobuf_optimized_pip.sh | 2 +- .../tools/ci_build/windows/bazel/bazel_test_lib.sh | 2 +- .../tools/ci_build/windows/cpu/cmake/run_py.bat | 5 +- .../tools/ci_build/windows/gpu/cmake/run_py.bat | 5 +- tensorflow/tools/dist_test/local_test.sh | 6 +- tensorflow/tools/docker/README.md | 28 +- .../tools/docker/parameterized_docker_build.sh | 44 ++- tensorflow/tools/gcs_test/gcs_smoke_wrapper.sh | 11 +- tensorflow/tools/git/gen_git_source.sh | 2 +- tensorflow/tools/graph_transforms/README.md | 2 +- .../graph_transforms/quantize_weights_test.cc | 4 +- .../tools/lib_package/libtensorflow_java_test.sh | 2 +- tensorflow/tools/lib_package/libtensorflow_test.sh | 2 +- tensorflow/tools/pip_package/setup.py | 2 +- tensorflow/tools/tfprof/README.md | 6 +- tensorflow/tools/tfprof/g3doc/command_line.md | 2 +- tensorflow/workspace.bzl | 24 +- third_party/gpus/cuda_configure.bzl | 4 +- third_party/py/BUILD.tpl | 12 + third_party/py/python_configure.bzl | 56 +++- third_party/sycl/crosstool/computecpp.tpl | 6 +- third_party/sycl/sycl/BUILD.tpl | 2 +- tools/tf_env_collect.sh | 143 +++++---- 304 files changed, 6784 insertions(+), 1689 deletions(-) create mode 100644 tensorflow/compiler/plugin/executor/BUILD create mode 100644 tensorflow/compiler/plugin/executor/compiler.cc create mode 100644 tensorflow/compiler/plugin/executor/compiler.h create mode 100644 tensorflow/compiler/plugin/executor/device.cc create mode 100644 tensorflow/compiler/plugin/executor/executable.cc create mode 100644 tensorflow/compiler/plugin/executor/executable.h create mode 100644 tensorflow/compiler/plugin/executor/executor.cc create mode 100644 tensorflow/compiler/plugin/executor/executor.h create mode 100644 tensorflow/compiler/plugin/executor/platform.cc create mode 100644 tensorflow/compiler/plugin/executor/platform.h create mode 100644 tensorflow/compiler/plugin/executor/platform_id.h create mode 100644 tensorflow/compiler/plugin/executor/transfer_manager.cc create mode 100644 tensorflow/compiler/plugin/executor/transfer_manager.h create mode 100644 tensorflow/core/kernels/cwise_op_cosh.cc create mode 100644 tensorflow/core/kernels/cwise_op_gpu_cosh.cu.cc create mode 100644 tensorflow/core/kernels/cwise_op_gpu_sinh.cu.cc create mode 100644 tensorflow/core/kernels/cwise_op_sinh.cc create mode 100644 tensorflow/core/kernels/sparse_reduce_op.cc delete mode 100644 tensorflow/core/kernels/sparse_reduce_sum_op.cc create mode 100644 tensorflow/core/kernels/tile_functor.h create mode 100644 tensorflow/core/kernels/tile_functor_cpu.cc create mode 100644 tensorflow/core/kernels/tile_functor_gpu.cu.cc create mode 100644 tensorflow/examples/label_image/label_image.py create mode 100644 tensorflow/java/src/main/java/org/tensorflow/Input.java create mode 100644 tensorflow/java/src/main/java/org/tensorflow/op/NameScope.java create mode 100644 tensorflow/java/src/main/java/org/tensorflow/op/Scope.java create mode 100644 tensorflow/java/src/test/java/org/tensorflow/op/ScopeTest.java diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index c78b6b1a15..43abdaafbf 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -159,7 +159,12 @@ There are two ways to run TensorFlow unit tests. bazel test ${flags} //tensorflow/python/... ``` -2. Using Docker and TensorFlow's CI scripts. +2. Using [Docker](www.docker.com) and TensorFlow's CI scripts. + + ```bash + # Install Docker first, then this will build and run cpu tests + tensorflow/tools/ci_build/ci_build.sh CPU bazel test //tensorflow/... + ``` See [TensorFlow Builds](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/tools/ci_build) for details. diff --git a/ISSUE_TEMPLATE.md b/ISSUE_TEMPLATE.md index 6f4c048ce8..4f1666c2e7 100644 --- a/ISSUE_TEMPLATE.md +++ b/ISSUE_TEMPLATE.md @@ -6,6 +6,7 @@ If you open a GitHub issue, here is our policy: 1. It must be a bug or a feature request. 2. The form below must be filled out. +3. It shouldn't be a TensorBoard issue. Those go [here](https://github.com/tensorflow/tensorboard/issues). **Here's why we have that policy**: TensorFlow developers respond to issues. We want to focus on work that benefits the whole community, e.g., fixing bugs and adding features. Support only helps individuals. GitHub also notifies thousands of people when issues are filed. We want them to see you communicating an interesting problem, rather than being redirected to Stack Overflow. diff --git a/README.md b/README.md index e7dbf57b25..abbead98a7 100644 --- a/README.md +++ b/README.md @@ -34,13 +34,13 @@ and discussion.** People who are a little more adventurous can also try our nightly binaries: -* Linux CPU-only: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.2.0rc2-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave)) / [Python 3.4](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.2.0rc2-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/)) / [Python 3.5](https://ci.tensorflow.org/view/Nightly/job/nightly-python35-linux-cpu/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.2.0rc2-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-python35-linux-cpu/)) -* Linux GPU: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.2.0rc2-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/)) / [Python 3.4](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.2.0rc2-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/)) / [Python 3.5](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.2.0rc2-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/)) -* Mac CPU-only: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.2.0rc2-py2-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac-slave/)) / [Python 3](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.2.0rc2-py3-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac-slave/)) -* Mac GPU: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-mac/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.2.0rc2-py2-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-mac/)) / [Python 3](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-mac/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.2.0rc2-py3-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-mac/)) -* Windows CPU-only: [Python 3.5 64-bit](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows,PY=35/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tensorflow-1.2.0rc2-cp35-cp35m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows,PY=35/)) / [Python 3.6 64-bit](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows,PY=36/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tensorflow-1.2.0rc2-cp36-cp36m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows,PY=36/)) -* Windows GPU: [Python 3.5 64-bit](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows-gpu,PY=35/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tensorflow_gpu-1.2.0rc2-cp35-cp35m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows-gpu,PY=35/)) / [Python 3.6 64-bit](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows-gpu,PY=36/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tensorflow_gpu-1.2.0rc2-cp36-cp36m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows-gpu,PY=36/)) -* Android: [demo APK](https://ci.tensorflow.org/view/Nightly/job/nightly-android/lastSuccessfulBuild/artifact/out/tensorflow_demo.apk), [native libs](http://ci.tensorflow.org/view/Nightly/job/nightly-android/lastSuccessfulBuild/artifact/out/native/) +* Linux CPU-only: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.2.0-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave)) / [Python 3.4](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.2.0-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/)) / [Python 3.5](https://ci.tensorflow.org/view/Nightly/job/nightly-python35-linux-cpu/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.2.0-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-python35-linux-cpu/)) +* Linux GPU: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.2.0-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/)) / [Python 3.4](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.2.0-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/)) / [Python 3.5](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.2.0-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/)) +* Mac CPU-only: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.2.0-py2-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac-slave/)) / [Python 3](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.2.0-py3-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac-slave/)) +* Mac GPU: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-mac/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.2.0-py2-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-mac/)) / [Python 3](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-mac/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.2.0-py3-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-mac/)) +* Windows CPU-only: [Python 3.5 64-bit](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows,PY=35/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tensorflow-1.2.0-cp35-cp35m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows,PY=35/)) / [Python 3.6 64-bit](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows,PY=36/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tensorflow-1.2.0-cp36-cp36m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows,PY=36/)) +* Windows GPU: [Python 3.5 64-bit](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows-gpu,PY=35/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tensorflow_gpu-1.2.0-cp35-cp35m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows-gpu,PY=35/)) / [Python 3.6 64-bit](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows-gpu,PY=36/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tensorflow_gpu-1.2.0-cp36-cp36m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows-gpu,PY=36/)) +* Android: [demo APK](https://ci.tensorflow.org/view/Nightly/job/nightly-android/lastSuccessfulBuild/artifact/out/tensorflow_demo.apk), [native libs](https://ci.tensorflow.org/view/Nightly/job/nightly-android/lastSuccessfulBuild/artifact/out/native/) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-android/)) #### *Try your first TensorFlow program* diff --git a/RELEASE.md b/RELEASE.md index d22c5c62fe..9875838d7e 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -113,6 +113,8 @@ checkpoints containing such RNN cells, in which case you can use the [checkpoint_convert script](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/rnn/python/tools/checkpoint_convert.py) to convert the variable names in your old checkpoints. +* Added `tf.contrib.kernel_methods` module with Ops and estimators for primal + (explicit) kernel methods in TensorFlow. ## Bug Fixes and Other Changes * In python, `Operation.get_attr` on type attributes returns the Python DType diff --git a/configure b/configure index e1aaddabda..1531b4d40a 100755 --- a/configure +++ b/configure @@ -162,8 +162,12 @@ bazel version > bazel.version curr_bazel_version=$(head -n 1 bazel.version | cut -d ' ' -f3) rm -f bazel.version + echo "You have bazel $curr_bazel_version installed." -if [ "$(version "$MIN_BAZEL_VERSION")" -gt "$(version "$curr_bazel_version")" ]; then +if [ -z "$curr_bazel_version" ]; then + echo "WARNING: current bazel installation is not a release version." + echo "Make sure you are running at least bazel $MIN_BAZEL_VERSION." +elif [ "$(version "$MIN_BAZEL_VERSION")" -gt "$(version "$curr_bazel_version")" ]; then echo "Please upgrade your bazel installation to version $MIN_BAZEL_VERSION or higher to build TensorFlow!" echo "Exiting..." exit 1 @@ -535,9 +539,9 @@ done # Set default CUDA version if not set if [ -z "$TF_CUDA_VERSION" ]; then TF_CUDA_VERSION="8.0" - export TF_CUDA_VERSION + export TF_CUDA_VERSION fi -write_action_env_to_bazelrc "TF_CUDA_VERSION" "$TF_CUDA_VERSION" +write_action_env_to_bazelrc "TF_CUDA_VERSION" "$TF_CUDA_VERSION" # Set up which gcc nvcc should use as the host compiler # No need to set this on Windows @@ -586,6 +590,9 @@ while true; do # Result returned from "read" will be used unexpanded. That make "~" unusable. # Going through one more level of expansion to handle that. CUDNN_INSTALL_PATH=`"${PYTHON_BIN_PATH}" -c "import os; print(os.path.realpath(os.path.expanduser('${CUDNN_INSTALL_PATH}')))"` + if is_windows; then + CUDNN_INSTALL_PATH="$(cygpath -m "$CUDNN_INSTALL_PATH")" + fi fi if [[ -z "$TF_CUDNN_VERSION" ]]; then @@ -652,16 +659,22 @@ write_action_env_to_bazelrc "TF_CUDNN_VERSION" "$TF_CUDNN_VERSION" # Configure the compute capabilities that TensorFlow builds for. # Since Cuda toolkit is not backward-compatible, this is not guaranteed to work. +function get_native_cuda_compute_capabilities { + device_query_bin="$CUDA_TOOLKIT_PATH/extras/demo_suite/deviceQuery" # Also works on Windows without .exe + "$device_query_bin" | grep 'Capability' | grep -o '[0-9]*\.[0-9]*' | sed ':a;{N;s/\n/,/};ba' + exit 0 # ensure that this function always exit success even if device detection fails, to prevent the whole configure from aborting +} while true; do fromuser="" - default_cuda_compute_capabilities="3.5,5.2" + native_cuda_compute_capabilities=$(get_native_cuda_compute_capabilities) + default_cuda_compute_capabilities=${native_cuda_compute_capabilities:-"3.5,5.2"} if [ -z "$TF_CUDA_COMPUTE_CAPABILITIES" ]; then cat << EOF Please specify a list of comma-separated Cuda compute capabilities you want to build with. You can find the compute capability of your device at: https://developer.nvidia.com/cuda-gpus. Please note that each additional compute capability significantly increases your build time and binary size. EOF - read -p "[Default is: \"3.5,5.2\"]: " TF_CUDA_COMPUTE_CAPABILITIES + read -p "[Default is: \"$default_cuda_compute_capabilities\"]: " TF_CUDA_COMPUTE_CAPABILITIES fromuser=1 fi if [ -z "$TF_CUDA_COMPUTE_CAPABILITIES" ]; then @@ -832,17 +845,17 @@ while true; do if [ -e "$MPI_HOME/include" ] && [ -e "$MPI_HOME/lib" ]; then break fi - + echo "Invalid path to the MPI Toolkit. ${MPI_HOME}/include or ${MPI_HOME}/lib cannot be found." if [ -z "$fromuser" ]; then exit 1 fi # Retry - MPI_HOME="" + MPI_HOME="" done - - + + if [ "$TF_NEED_MPI" == "1" ]; then write_to_bazelrc 'build --define with_mpi_support=true' @@ -850,11 +863,11 @@ if [ "$TF_NEED_MPI" == "1" ]; then ln -sf "${MPI_HOME}/include/mpi.h" third_party/mpi/mpi.h - #Determine if we use OpenMPI or MVAPICH, these require different header files + #Determine if we use OpenMPI or MVAPICH, these require different header files #to be included here to make bazel dependency checker happy if [ -e "${MPI_HOME}/include/mpi_portable_platform.h" ]; then - #OpenMPI + #OpenMPI ln -sf "${MPI_HOME}/include/mpi_portable_platform.h" third_party/mpi/ sed -i -e "s/MPI_LIB_IS_OPENMPI=False/MPI_LIB_IS_OPENMPI=True/" third_party/mpi/mpi.bzl else @@ -864,7 +877,7 @@ if [ "$TF_NEED_MPI" == "1" ]; then sed -i -e "s/MPI_LIB_IS_OPENMPI=True/MPI_LIB_IS_OPENMPI=False/" third_party/mpi/mpi.bzl fi - + if [ -e "${MPI_HOME}/lib/libmpi.so" ]; then ln -sf "${MPI_HOME}/lib/libmpi.so" third_party/mpi/ else diff --git a/tensorflow/BUILD b/tensorflow/BUILD index 62ae5ae78c..6e76db02de 100644 --- a/tensorflow/BUILD +++ b/tensorflow/BUILD @@ -208,6 +208,7 @@ filegroup( "//tensorflow/compiler/jit/kernels:all_files", "//tensorflow/compiler/jit/legacy_flags:all_files", "//tensorflow/compiler/jit/ops:all_files", + "//tensorflow/compiler/plugin/executor:all_files", "//tensorflow/compiler/tests:all_files", "//tensorflow/compiler/tf2xla:all_files", "//tensorflow/compiler/tf2xla/cc:all_files", diff --git a/tensorflow/c/generate-pc.sh b/tensorflow/c/generate-pc.sh index 73d427d9b2..02a6a58b61 100755 --- a/tensorflow/c/generate-pc.sh +++ b/tensorflow/c/generate-pc.sh @@ -26,7 +26,7 @@ usage() { [ $# == 0 ] && usage && exit 0 # read the options -ARGS=`getopt -o p:v:h --long prefix:,version:,help -n $0 -- "$@"` +ARGS=$(getopt -o p:v:h --long prefix:,version:,help -n $0 -- "$@") eval set -- "$ARGS" # extract options and their arguments into variables. diff --git a/tensorflow/cc/BUILD b/tensorflow/cc/BUILD index a884f11d48..9801add1da 100644 --- a/tensorflow/cc/BUILD +++ b/tensorflow/cc/BUILD @@ -472,10 +472,23 @@ cc_binary( name = "tutorials_example_trainer", srcs = ["tutorials/example_trainer.cc"], copts = tf_copts(), - linkopts = [ - "-lpthread", - "-lm", - ], + linkopts = select({ + "//tensorflow:windows": [], + "//tensorflow:windows_msvc": [], + "//tensorflow:darwin": [ + "-lm", + "-lpthread", + ], + "//tensorflow:ios": [ + "-lm", + "-lpthread", + ], + "//conditions:default": [ + "-lm", + "-lpthread", + "-lrt", + ], + }), deps = [ ":cc_ops", "//tensorflow/core:core_cpu", diff --git a/tensorflow/cc/gradients/math_grad.cc b/tensorflow/cc/gradients/math_grad.cc index 8c1a01f518..71d9a8ed7b 100644 --- a/tensorflow/cc/gradients/math_grad.cc +++ b/tensorflow/cc/gradients/math_grad.cc @@ -162,6 +162,32 @@ Status Log1pGrad(const Scope& scope, const Operation& op, } REGISTER_GRADIENT_OP("Log1p", Log1pGrad); +Status SinhGrad(const Scope& scope, const Operation& op, + const std::vector& grad_inputs, + std::vector* grad_outputs) { + // y = sinh(x) + // dy/dx = cosh(x) + auto dydx = Cosh(scope, op.input(0)); + // grad(x) = grad(y) * conj(dy/dx) + grad_outputs->push_back( + Mul(scope, grad_inputs[0], ConjugateHelper(scope, dydx))); + return scope.status(); +} +REGISTER_GRADIENT_OP("Sinh", SinhGrad); + +Status CoshGrad(const Scope& scope, const Operation& op, + const std::vector& grad_inputs, + std::vector* grad_outputs) { + // y = cosh(x) + // dy/dx = sinh(x) + auto dydx = Sinh(scope, op.input(0)); + // grad(x) = grad(y) * conj(dy/dx) + grad_outputs->push_back( + Mul(scope, grad_inputs[0], ConjugateHelper(scope, dydx))); + return scope.status(); +} +REGISTER_GRADIENT_OP("Cosh", CoshGrad); + Status TanhGrad(const Scope& scope, const Operation& op, const std::vector& grad_inputs, std::vector* grad_outputs) { diff --git a/tensorflow/cc/gradients/math_grad_test.cc b/tensorflow/cc/gradients/math_grad_test.cc index de6baa1769..1653b04378 100644 --- a/tensorflow/cc/gradients/math_grad_test.cc +++ b/tensorflow/cc/gradients/math_grad_test.cc @@ -45,6 +45,8 @@ class CWiseUnaryGradTest : public ::testing::Test { EXPM1, LOG, LOG1P, + SINH, + COSH, TANH, SIGMOID, SIGN, @@ -111,6 +113,12 @@ class CWiseUnaryGradTest : public ::testing::Test { case LOG1P: y = Log1p(scope_, x); break; + case SINH: + y = Sinh(scope_, x); + break; + case COSH: + y = Cosh(scope_, x); + break; case TANH: y = Tanh(scope_, x); break; @@ -337,6 +345,50 @@ TEST_F(CWiseUnaryGradTest, Log1p_Complex) { TestCWiseGrad(LOG1P, x_fn, dy_fn, dx_fn); } +TEST_F(CWiseUnaryGradTest, Sinh) { + auto x_fn = [this](const int i) { return RV({0, -1, 1, -2, 2, -3, 3}); }; + auto dy_fn = [this](const float x) { return x + RV({-2, 2, -3, 3, -4, 4}); }; + auto dx_fn = [this](const float x, const float dy) { + return dy * std::cosh(x); + }; + TestCWiseGrad(SINH, x_fn, dy_fn, dx_fn); +} + +TEST_F(CWiseUnaryGradTest, Sinh_Complex) { + auto x_fn = [this](const int i) { + return CRV({{1, 0}, {0, 1}, {2, -1}, {1, 2}, {3, 4}}); + }; + auto dy_fn = [this](const complex64& x) { + return x + CRV({{-2, 2}, {-3, 3}, {1, -4}}); + }; + auto dx_fn = [this](const complex64& x, const complex64& dy) { + return dy * conjugate(std::cosh(x)); + }; + TestCWiseGrad(SINH, x_fn, dy_fn, dx_fn); +} + +TEST_F(CWiseUnaryGradTest, Cosh) { + auto x_fn = [this](const int i) { return RV({0, -1, 1, -2, 2, -3, 3}); }; + auto dy_fn = [this](const float x) { return x + RV({-2, 2, -3, 3, -4, 4}); }; + auto dx_fn = [this](const float x, const float dy) { + return dy * std::sinh(x); + }; + TestCWiseGrad(COSH, x_fn, dy_fn, dx_fn); +} + +TEST_F(CWiseUnaryGradTest, Cosh_Complex) { + auto x_fn = [this](const int i) { + return CRV({{1, 0}, {0, 1}, {2, -1}, {1, 2}, {3, 4}}); + }; + auto dy_fn = [this](const complex64& x) { + return x + CRV({{-2, 2}, {-3, 3}, {1, -4}}); + }; + auto dx_fn = [this](const complex64& x, const complex64& dy) { + return dy * conjugate(std::sinh(x)); + }; + TestCWiseGrad(COSH, x_fn, dy_fn, dx_fn); +} + TEST_F(CWiseUnaryGradTest, Tanh) { auto x_fn = [this](const int i) { return RV({0, -1, 1, -2, 2, -3, 3}); }; auto dy_fn = [this](const float x) { return x + RV({-2, 2, -3, 3, -4, 4}); }; diff --git a/tensorflow/cc/gradients/nn_grad.cc b/tensorflow/cc/gradients/nn_grad.cc index 5e5203d090..952b2015ed 100644 --- a/tensorflow/cc/gradients/nn_grad.cc +++ b/tensorflow/cc/gradients/nn_grad.cc @@ -46,6 +46,19 @@ Status SoftmaxGrad(const Scope& scope, const Operation& op, } REGISTER_GRADIENT_OP("Softmax", SoftmaxGrad); +Status LogSoftmaxGrad(const Scope& scope, const Operation& op, + const std::vector& grad_inputs, + std::vector* grad_outputs) { + + auto softmax = Exp(scope, op.output(0)); + auto sum = Sum(scope, grad_inputs[0], {1}, Sum::KeepDims(true)); + auto mul = Mul(scope, sum, softmax); + auto dx = Sub(scope, grad_inputs[0], mul); + grad_outputs->push_back(dx); + return scope.status(); +} +REGISTER_GRADIENT_OP("LogSoftmax", LogSoftmaxGrad); + Status ReluGradHelper(const Scope& scope, const Operation& op, const std::vector& grad_inputs, std::vector* grad_outputs) { diff --git a/tensorflow/cc/gradients/nn_grad_test.cc b/tensorflow/cc/gradients/nn_grad_test.cc index 70c9bd4e08..daa87546ec 100644 --- a/tensorflow/cc/gradients/nn_grad_test.cc +++ b/tensorflow/cc/gradients/nn_grad_test.cc @@ -57,6 +57,19 @@ TEST_F(NNGradTest, SoftmaxGrad) { RunTest(x, shape, y, shape); } +TEST_F(NNGradTest, LogSoftmaxGrad) { + TensorShape shape({5, 3}); + auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(shape)); + auto y = LogSoftmax(scope_, x); + // Avoid numerical instability when computing finite differences. + Tensor x_init_value = test::AsTensor( + {-0.9f, -0.7f, -0.5f, -0.3f, -0.1f, + 0.1f, 0.3f, 0.5f, 0.7f, 0.8f, + -0.1f, 0.1f, 0.1f, 0.1f, 1.2f}, + {5, 3}); + RunTest(x, x_init_value, y, shape); +} + TEST_F(NNGradTest, ReluGrad) { TensorShape shape({5, 2}); auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(shape)); diff --git a/tensorflow/compiler/jit/BUILD b/tensorflow/compiler/jit/BUILD index 306e704415..7ebd842218 100644 --- a/tensorflow/compiler/jit/BUILD +++ b/tensorflow/compiler/jit/BUILD @@ -15,7 +15,10 @@ package_group( ) package( - default_visibility = [":internal"], + default_visibility = [ + ":internal", + "//tensorflow/compiler/plugin/executor:__pkg__", + ], ) load("//tensorflow:tensorflow.bzl", "cc_header_only_library") diff --git a/tensorflow/compiler/jit/kernels/BUILD b/tensorflow/compiler/jit/kernels/BUILD index ed204b8182..97f3512a6c 100644 --- a/tensorflow/compiler/jit/kernels/BUILD +++ b/tensorflow/compiler/jit/kernels/BUILD @@ -2,6 +2,7 @@ licenses(["notice"]) # Apache 2.0 package( default_visibility = [ + "//tensorflow/compiler/plugin/executor:__pkg__", "//tensorflow/compiler/tf2xla:internal", ], ) diff --git a/tensorflow/compiler/plugin/BUILD b/tensorflow/compiler/plugin/BUILD index 4badd3a589..8c2e9a7c81 100644 --- a/tensorflow/compiler/plugin/BUILD +++ b/tensorflow/compiler/plugin/BUILD @@ -32,5 +32,7 @@ package( cc_library( name = "plugin", - deps = [], + deps = [ + "//tensorflow/compiler/plugin/executor:plugin_lib", + ], ) diff --git a/tensorflow/compiler/plugin/executor/BUILD b/tensorflow/compiler/plugin/executor/BUILD new file mode 100644 index 0000000000..2e5875705f --- /dev/null +++ b/tensorflow/compiler/plugin/executor/BUILD @@ -0,0 +1,34 @@ +licenses(["restricted"]) + +package(default_visibility = ["//visibility:public"]) + +cc_library( + name = "plugin_lib", + srcs = glob([ + "*.cc", + ]), + hdrs = glob([ + "*.h", + ]), + deps = [ + "//tensorflow/compiler/jit:xla_device", + "//tensorflow/compiler/jit:xla_jit_headers_lib", + "//tensorflow/compiler/tf2xla:xla_compiler", + "//tensorflow/compiler/xla:xla_headers_lib", + "//tensorflow/compiler/xla/service", + "//third_party/eigen3", + "@local_config_cuda//cuda:cuda_headers", + "@protobuf//:protobuf_headers", + ], +) + +filegroup( + name = "all_files", + srcs = glob( + ["**/*"], + exclude = [ + "**/METADATA", + "**/OWNERS", + ], + ), +) diff --git a/tensorflow/compiler/plugin/executor/compiler.cc b/tensorflow/compiler/plugin/executor/compiler.cc new file mode 100644 index 0000000000..3a84f08c00 --- /dev/null +++ b/tensorflow/compiler/plugin/executor/compiler.cc @@ -0,0 +1,122 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include +#include + +#include "tensorflow/compiler/plugin/executor/compiler.h" +#include "tensorflow/compiler/plugin/executor/executable.h" + +#include "tensorflow/compiler/xla/service/algebraic_simplifier.h" +#include "tensorflow/compiler/xla/service/flatten_call_graph.h" +#include "tensorflow/compiler/xla/service/hlo_constant_folding.h" +#include "tensorflow/compiler/xla/service/hlo_cse.h" +#include "tensorflow/compiler/xla/service/hlo_dce.h" +#include "tensorflow/compiler/xla/service/hlo_pass_fix.h" +#include "tensorflow/compiler/xla/service/hlo_pass_pipeline.h" +#include "tensorflow/compiler/xla/service/hlo_subcomputation_unification.h" +#include "tensorflow/compiler/xla/service/inliner.h" +#include "tensorflow/compiler/xla/service/reshape_mover.h" +#include "tensorflow/compiler/xla/status_macros.h" + +#include "tensorflow/stream_executor/lib/initialize.h" +#include "tensorflow/stream_executor/lib/strcat.h" + +#include "tensorflow/core/lib/core/errors.h" + +namespace se = ::perftools::gputools; +namespace sep = ::perftools::gputools::executorplugin; +namespace port = ::perftools::gputools::port; + +namespace xla { +namespace executorplugin { + +/* + * Run optimization passes on the module. The graph is transformed by + * each pass in the optimization pipeline. The service subdirectory + * contains useful optimization passes. + */ +Status ExecutorCompiler::RunHloOptimization(HloModule* hlo_module) { + HloPassPipeline pipeline("Executor"); + pipeline.AddPass(); + pipeline.AddPass(); + pipeline.AddPass(false); + + pipeline.AddPass>( + false, [](const Shape&, const Shape&) { return false; }); + pipeline.AddPass(); + pipeline.AddPass(); + pipeline.AddPass(true); + + pipeline.AddPass(); + pipeline.AddPass(); + return pipeline.Run(hlo_module).status(); +} + +StatusOr> ExecutorCompiler::Compile( + std::unique_ptr hlo_module, + se::StreamExecutor* stream_exec) { + TF_RET_CHECK(stream_exec != nullptr); + + VLOG(1) << "Generate graph " << hlo_module->name(); + + TF_RETURN_IF_ERROR(RunHloOptimization(hlo_module.get())); + + // Typically you would visit the HLO graph, building up a compiled equivalent + // In this case we are using an Hlo evaluator at execution time, so we don't + // need to compile anything + + // Create executable from only the Hlo module + std::unique_ptr executable; + executable.reset(new ExecutorExecutable(std::move(hlo_module))); + + return std::move(executable); +} + +StatusOr>> ExecutorCompiler::Compile( + std::vector> hlo_modules, + std::vector stream_execs) { + + return tensorflow::errors::Unimplemented( + "Compilation of multiple HLO modules is not supported on Executor."); +} + +StatusOr>> +ExecutorCompiler::CompileAheadOfTime( + std::vector> hlo_modules, + const AotCompilationOptions& aot_options) { + + return tensorflow::errors::InvalidArgument( + "AOT compilation not supported on Executor"); +} + +se::Platform::Id ExecutorCompiler::PlatformId() const { + return sep::kExecutorPlatformId; +} + +HloCostAnalysis::ShapeSizeFunction +ExecutorCompiler::ShapeSizeBytesFunction() const { + return ExecutorExecutable::ShapeSizeBytes; +} + + +} // namespace executorplugin +} // namespace xla + +REGISTER_MODULE_INITIALIZER(executor_compiler, { + xla::Compiler::RegisterCompilerFactory(sep::kExecutorPlatformId, []() { + return xla::MakeUnique(); + }); +}); diff --git a/tensorflow/compiler/plugin/executor/compiler.h b/tensorflow/compiler/plugin/executor/compiler.h new file mode 100644 index 0000000000..d318eefc49 --- /dev/null +++ b/tensorflow/compiler/plugin/executor/compiler.h @@ -0,0 +1,62 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_EXECUTOR_COMPILER_H_ +#define TENSORFLOW_COMPILER_EXECUTOR_COMPILER_H_ + +#include + +#include "tensorflow/compiler/xla/service/compiler.h" +#include "tensorflow/compiler/xla/service/executable.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/hlo_module_config.h" + +#include "tensorflow/compiler/plugin/executor/platform_id.h" + +namespace xla { +namespace executorplugin { + +class ExecutorCompiler : public Compiler { + public: + ExecutorCompiler() {} + ~ExecutorCompiler() override {} + + StatusOr> Compile( + std::unique_ptr hlo_module, + perftools::gputools::StreamExecutor* stream_exec) override; + + StatusOr>> Compile( + std::vector> hlo_module, + std::vector stream_exec) override; + + StatusOr>> + CompileAheadOfTime( + std::vector> module, + const AotCompilationOptions& options) override; + + HloCostAnalysis::ShapeSizeFunction ShapeSizeBytesFunction() const override; + + perftools::gputools::Platform::Id PlatformId() const override; + + private: + Status RunHloOptimization(HloModule* hlo_module); + + TF_DISALLOW_COPY_AND_ASSIGN(ExecutorCompiler); +}; + +} // namespace executorplugin +} // namespace xla + +#endif // TENSORFLOW_COMPILER_EXECUTOR_COMPILER_H_ diff --git a/tensorflow/compiler/plugin/executor/device.cc b/tensorflow/compiler/plugin/executor/device.cc new file mode 100644 index 0000000000..bbc39dc03f --- /dev/null +++ b/tensorflow/compiler/plugin/executor/device.cc @@ -0,0 +1,60 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/jit/kernels/xla_device_launch_op.h" +#include "tensorflow/compiler/jit/xla_device.h" +#include "tensorflow/compiler/jit/xla_device_ops.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" + +namespace tensorflow { + +const char* const DEVICE_XLA_EXEC = "XLA_EXEC"; +const char* const DEVICE_EXEC_XLA_JIT = "XLA_EXEC_JIT"; + +constexpr std::array kExecAllTypes = { + {DT_INT32, DT_FLOAT, DT_BOOL, DT_DOUBLE, DT_INT64}}; + +class XlaExaDeviceFactory : public DeviceFactory { + public: + Status CreateDevices(const SessionOptions& options, const string& name_prefix, + std::vector* devices) override; +}; + +Status XlaExaDeviceFactory::CreateDevices(const SessionOptions& options, + const string& name_prefix, + std::vector* devices) { + static XlaDeviceOpRegistrations* registrations = + RegisterXlaDeviceKernels(DEVICE_XLA_EXEC, DEVICE_EXEC_XLA_JIT); + (void)registrations; + + std::unique_ptr device; + TF_RETURN_IF_ERROR(XlaDevice::Create("Executor", DEVICE_XLA_EXEC, 0, + DEVICE_EXEC_XLA_JIT, options, + name_prefix, &device)); + devices->push_back(device.release()); + return Status::OK(); +} + +REGISTER_LOCAL_DEVICE_FACTORY(DEVICE_XLA_EXEC, XlaExaDeviceFactory, 110); + +// Kernel registrations + +static bool OpFilter(KernelDef* kdef) { return true; } + +REGISTER_XLA_LAUNCH_KERNEL(DEVICE_XLA_EXEC, XlaDeviceLaunchOp, kExecAllTypes); +REGISTER_XLA_DEVICE_KERNELS(DEVICE_XLA_EXEC, kExecAllTypes); +REGISTER_XLA_BACKEND(DEVICE_EXEC_XLA_JIT, kExecAllTypes, OpFilter); + +} // namespace tensorflow diff --git a/tensorflow/compiler/plugin/executor/executable.cc b/tensorflow/compiler/plugin/executor/executable.cc new file mode 100644 index 0000000000..79eea9af3f --- /dev/null +++ b/tensorflow/compiler/plugin/executor/executable.cc @@ -0,0 +1,147 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/plugin/executor/executable.h" +#include "tensorflow/compiler/plugin/executor/executor.h" + +#include "tensorflow/compiler/xla/service/hlo_evaluator.h" + +#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/shape_util.h" + +namespace se = ::perftools::gputools; +namespace sep = ::perftools::gputools::executorplugin; + +namespace xla { +namespace executorplugin { + +ExecutorExecutable::ExecutorExecutable(std::unique_ptr hlo_module) + : Executable(std::move(hlo_module), ShapeSizeBytes) {} + +ExecutorExecutable::~ExecutorExecutable() {} + +static se::DeviceMemoryBase AllocateSingleOutput(sep::ExecutorExecutor* executor, + const Literal& literal) { + int64 size(xla::ShapeUtil::ByteSizeOf(literal.shape())); + void* buf = executor->Allocate(size); + const void* src = literal.InternalData(); + memcpy(buf, src, size); + return se::DeviceMemoryBase(buf, size); +} + +static se::DeviceMemoryBase AllocateOutputBuffer(sep::ExecutorExecutor* executor, + const Literal& literal) { + const Shape& shape = literal.shape(); + if (shape.element_type() != xla::TUPLE) { + return AllocateSingleOutput(executor, literal); + } else { + int64 size(xla::ShapeUtil::ByteSizeOf(shape, sizeof(void*))); + void** buf = reinterpret_cast(executor->Allocate(size)); + for (int64 n = 0; n < xla::ShapeUtil::TupleElementCount(shape); n++) { + se::DeviceMemoryBase out = + AllocateSingleOutput(executor, literal.tuple_literals(n)); + *buf++ = out.opaque(); + } + + return se::DeviceMemoryBase(buf, size); + } +} + +StatusOr ExecutorExecutable::ExecuteOnStream( + const ServiceExecutableRunOptions* run_options, + tensorflow::gtl::ArraySlice arguments, + HloExecutionProfile* hlo_execution_profile) { + se::Stream* stream = run_options->stream(); + + VLOG(1) << "Execute " << module().name(); + if (VLOG_IS_ON(2)) { + for (const auto& a : arguments) { + VLOG(2) << "-- argument " << a.opaque(); + } + } + + uint64 start_micros = tensorflow::Env::Default()->NowMicros(); + + HloComputation* computation = module().entry_computation(); + if (computation->num_parameters() != arguments.size()) { + return tensorflow::errors::Internal( + "Mismatch between argument count and graph parameter count."); + } + + // Create the arguments as an vector of XLA literals + std::vector> arg_literals; + std::vector arg_literals_ptrs; + for (int64 p = 0; p < computation->num_parameters(); p++) { + // Create the input literal for the parameter + HloInstruction* param = computation->parameter_instruction(p); + arg_literals.emplace_back(Literal::CreateFromShape(param->shape())); + arg_literals_ptrs.push_back(arg_literals.back().get()); + + // Copy in the data from the stream_executor buffers + void* buffer = arg_literals.back().get()->MutableInternalData(); + memcpy(buffer, arguments[p].opaque(), + ShapeUtil::ByteSizeOf(param->shape())); + } + + // Execute the graph using the evaluator + HloEvaluator evaluator; + std::unique_ptr output; + TF_ASSIGN_OR_RETURN(output, + evaluator.Evaluate(computation, arg_literals_ptrs)); + + // Copy the result into the return buffer + perftools::gputools::StreamExecutor* executor(stream->parent()); + sep::ExecutorExecutor* executorExecutor( + static_cast(executor->implementation())); + + se::DeviceMemoryBase ret = + AllocateOutputBuffer(executorExecutor, *(output.get())); + + uint64 end_micros = tensorflow::Env::Default()->NowMicros(); + + { + tensorflow::mutex_lock lock(mutex_); + const double nanoseconds = (end_micros - start_micros) * 1000.0; + execution_profile_.set_compute_time_ns(std::max(nanoseconds, 1.0)); + } + + return ret; +} + +StatusOr> ExecutorExecutable::ExecuteOnStream( + const ServiceExecutableRunOptions* run_options, + tensorflow::gtl::ArraySlice arguments, + HloExecutionProfile* hlo_execution_profile) { + return tensorflow::errors::Unimplemented( + "ExecuteOnStream is not yet supported on Executor."); +} + +StatusOr ExecutorExecutable::ExecuteAsyncOnStream( + const ServiceExecutableRunOptions* run_options, + tensorflow::gtl::ArraySlice arguments) { + return tensorflow::errors::Unimplemented( + "ExecuteAsyncOnStream is not yet supported on Executor."); +} + +/*static*/ int64 ExecutorExecutable::ShapeSizeBytes(const Shape& shape) { + if (ShapeUtil::IsOpaque(shape)) { + return sizeof(void*); + } + return ShapeUtil::ByteSizeOf(shape, sizeof(void*)); +} + + +} // namespace executorplugin +} // namespace xla diff --git a/tensorflow/compiler/plugin/executor/executable.h b/tensorflow/compiler/plugin/executor/executable.h new file mode 100644 index 0000000000..ba3d4da21d --- /dev/null +++ b/tensorflow/compiler/plugin/executor/executable.h @@ -0,0 +1,65 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_EXECUTOR_DRIVER_EXECUTOR_EXECUTABLE_H_ +#define TENSORFLOW_COMPILER_EXECUTOR_DRIVER_EXECUTOR_EXECUTABLE_H_ + +#include +#include +#include +#include +#include + +#include "tensorflow/compiler/xla/service/executable.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/hlo_module_config.h" + +#include "tensorflow/stream_executor/lib/status.h" +#include "tensorflow/stream_executor/lib/statusor.h" + +namespace xla { +namespace executorplugin { + +class ExecutorExecutable : public Executable { + public: + ExecutorExecutable(std::unique_ptr hlo_module); + ~ExecutorExecutable() override; + + StatusOr ExecuteOnStream( + const ServiceExecutableRunOptions* run_options, + tensorflow::gtl::ArraySlice + arguments, + HloExecutionProfile* hlo_execution_profile) override; + + StatusOr> ExecuteOnStream( + const ServiceExecutableRunOptions* run_options, + tensorflow::gtl::ArraySlice arguments, + HloExecutionProfile* hlo_execution_profile) override; + + StatusOr ExecuteAsyncOnStream( + const ServiceExecutableRunOptions* run_options, + tensorflow::gtl::ArraySlice + arguments) override; + + static int64 ShapeSizeBytes(const Shape& shape); + + private: + TF_DISALLOW_COPY_AND_ASSIGN(ExecutorExecutable); +}; + +} // namespace executorplugin +} // namespace xla + +#endif // TENSORFLOW_COMPILER_EXECUTOR_DRIVER_EXECUTOR_EXECUTABLE_H_ diff --git a/tensorflow/compiler/plugin/executor/executor.cc b/tensorflow/compiler/plugin/executor/executor.cc new file mode 100644 index 0000000000..e72c2711f7 --- /dev/null +++ b/tensorflow/compiler/plugin/executor/executor.cc @@ -0,0 +1,135 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/plugin/executor/executor.h" +#include "tensorflow/compiler/plugin/executor/platform_id.h" + +#include "tensorflow/compiler/xla/status_macros.h" + +#include +#include + +namespace se = ::perftools::gputools; + +namespace perftools { +namespace gputools { +namespace executorplugin { + +host::HostStream *AsExecutorStream(Stream *stream) { + DCHECK(stream != nullptr); + return dynamic_cast(stream->implementation()); +} + +ExecutorExecutor::ExecutorExecutor(const PluginConfig &plugin_config) + : plugin_config_(plugin_config) {} + +ExecutorExecutor::~ExecutorExecutor() {} + +void *ExecutorExecutor::Allocate(uint64 size) { + void *buf = new char[size]; + return buf; +} + +void *ExecutorExecutor::AllocateSubBuffer(DeviceMemoryBase *parent, + uint64 offset_bytes, + uint64 size_bytes) { + return parent + offset_bytes; +} + +void ExecutorExecutor::Deallocate(DeviceMemoryBase *mem) { + if (!mem->is_sub_buffer()) { + delete[] static_cast(mem->opaque()); + } +} + +bool ExecutorExecutor::Memcpy(Stream *stream, void *host_dst, + const DeviceMemoryBase &dev_src, uint64 size) { + AsExecutorStream(stream)->EnqueueTask([this, host_dst, dev_src, size]() { + port::Status ok = SynchronousMemcpy(host_dst, dev_src, size); + }); + return true; +} + +bool ExecutorExecutor::Memcpy(Stream *stream, DeviceMemoryBase *dev_dst, + const void *host_src, uint64 size) { + AsExecutorStream(stream)->EnqueueTask([this, dev_dst, host_src, size]() { + port::Status ok = SynchronousMemcpy(dev_dst, host_src, size); + }); + return true; +} + +port::Status ExecutorExecutor::SynchronousMemcpy(DeviceMemoryBase *dev_dst, + const void *host_src, + uint64 size) { + memcpy(dev_dst->opaque(), host_src, size); + return port::Status::OK(); +} + +port::Status ExecutorExecutor::SynchronousMemcpy(void *host_dst, + const DeviceMemoryBase &dev_src, + uint64 size) { + memcpy(host_dst, dev_src.opaque(), size); + return port::Status::OK(); +} + +bool ExecutorExecutor::HostCallback(Stream *stream, + std::function callback) { + AsExecutorStream(stream)->EnqueueTask(callback); + return true; +} + +bool ExecutorExecutor::CreateStreamDependency(Stream *dependent, Stream *other) { + AsExecutorStream(dependent)->EnqueueTask( + [other]() { other->BlockHostUntilDone(); }); + AsExecutorStream(dependent)->BlockUntilDone(); + return true; +} + +bool ExecutorExecutor::StartTimer(Stream *stream, Timer *timer) { + dynamic_cast(timer->implementation())->Start(stream); + return true; +} + +bool ExecutorExecutor::StopTimer(Stream *stream, Timer *timer) { + dynamic_cast(timer->implementation())->Stop(stream); + return true; +} + +bool ExecutorExecutor::BlockHostUntilDone(Stream *stream) { + AsExecutorStream(stream)->BlockUntilDone(); + return true; +} + +DeviceDescription *ExecutorExecutor::PopulateDeviceDescription() const { + internal::DeviceDescriptionBuilder builder; + + builder.set_device_address_bits(64); + + builder.set_name("Executor"); + builder.set_device_vendor("VectorName"); + builder.set_platform_version("1.0"); + builder.set_driver_version("1.0"); + builder.set_runtime_version("1.0"); + builder.set_pci_bus_id("1"); + builder.set_device_memory_size(static_cast(4) * 1024 * 1024 * 1024); + builder.set_clock_rate_ghz(static_cast(CLOCKS_PER_SEC) / 1e9); + + auto built = builder.Build(); + return built.release(); +} + +} // namespace executorplugin +} // namespace gputools +} // namespace perftools diff --git a/tensorflow/compiler/plugin/executor/executor.h b/tensorflow/compiler/plugin/executor/executor.h new file mode 100644 index 0000000000..32fdb157e4 --- /dev/null +++ b/tensorflow/compiler/plugin/executor/executor.h @@ -0,0 +1,213 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Declares the ExecutorExecutor class, which is a CPU-only implementation of +// the StreamExecutor interface. For now, this is used for testing and to +// examine the performance of host-based StreamExecutor code. +#ifndef TENSORFLOW_COMPILER_EXECUTOR_STREAM_EXECUTOR_EXECUTOR_EXECUTOR_H_ +#define TENSORFLOW_COMPILER_EXECUTOR_STREAM_EXECUTOR_EXECUTOR_EXECUTOR_H_ + +#include "tensorflow/stream_executor/host/host_stream.h" +#include "tensorflow/stream_executor/host/host_timer.h" + +#include "tensorflow/compiler/xla/shape_util.h" + +#include "tensorflow/stream_executor/blas.h" +#include "tensorflow/stream_executor/lib/error.h" +#include "tensorflow/stream_executor/lib/status.h" +#include "tensorflow/stream_executor/lib/statusor.h" +#include "tensorflow/stream_executor/rng.h" +#include "tensorflow/stream_executor/stream_executor.h" +#include "tensorflow/stream_executor/stream_executor_internal.h" + +#include +#include + +namespace perftools { +namespace gputools { +namespace executorplugin { + +using Args = tensorflow::gtl::ArraySlice; + +class ExecutorExecutor : public internal::StreamExecutorInterface { + public: + explicit ExecutorExecutor(const PluginConfig &plugin_config); + ~ExecutorExecutor() override; + + port::Status Init(int device_ordinal, DeviceOptions device_options) override { + return port::Status::OK(); + } + + bool GetKernel(const MultiKernelLoaderSpec &spec, + KernelBase *kernel) override { + return false; + } + bool Launch(Stream *stream, const ThreadDim &thread_dims, + const BlockDim &block_dims, const KernelBase &kernel, + const KernelArgsArrayBase &args) override { + return false; + } + + void *Allocate(uint64 size) override; + void *AllocateSubBuffer(DeviceMemoryBase *mem, uint64 offset_bytes, + uint64 size_bytes) override; + void Deallocate(DeviceMemoryBase *mem) override; + + void *HostMemoryAllocate(uint64 size) override { return new char[size]; } + void HostMemoryDeallocate(void *mem) override { + delete[] static_cast(mem); + } + bool HostMemoryRegister(void *mem, uint64 size) override { return true; } + bool HostMemoryUnregister(void *mem) override { return true; } + + bool Memcpy(Stream *stream, void *host_dst, const DeviceMemoryBase &pop_src, + uint64 size) override; + bool Memcpy(Stream *stream, DeviceMemoryBase *pop_dst, const void *host_src, + uint64 size) override; + bool MemcpyDeviceToDevice(Stream *stream, DeviceMemoryBase *pop_dst, + const DeviceMemoryBase &host_src, + uint64 size) override { + return false; + } + + bool MemZero(Stream *stream, DeviceMemoryBase *location, + uint64 size) override { + return false; + } + bool Memset(Stream *stream, DeviceMemoryBase *location, uint8 pattern, + uint64 size) override { + return false; + } + bool Memset32(Stream *stream, DeviceMemoryBase *location, uint32 pattern, + uint64 size) override { + return false; + } + + // No "synchronize all activity" implemented for this platform at the moment. + bool SynchronizeAllActivity() override { return false; } + bool SynchronousMemZero(DeviceMemoryBase *location, uint64 size) override { + return false; + } + + bool SynchronousMemSet(DeviceMemoryBase *location, int value, + uint64 size) override { + return false; + } + + port::Status SynchronousMemcpy(DeviceMemoryBase *pop_dst, + const void *host_src, uint64 size) override; + port::Status SynchronousMemcpy(void *host_dst, + const DeviceMemoryBase &pop_src, + uint64 size) override; + port::Status SynchronousMemcpyDeviceToDevice(DeviceMemoryBase *pop_dst, + const DeviceMemoryBase &pop_src, + uint64 size) override { + return port::Status{port::error::UNIMPLEMENTED, ""}; + } + + bool HostCallback(Stream *stream, std::function callback) override; + + port::Status AllocateEvent(Event *event) override { + return port::Status{port::error::UNIMPLEMENTED, ""}; + } + + port::Status DeallocateEvent(Event *event) override { + return port::Status{port::error::UNIMPLEMENTED, ""}; + } + + port::Status RecordEvent(Stream *stream, Event *event) override { + return port::Status{port::error::UNIMPLEMENTED, ""}; + } + + port::Status WaitForEvent(Stream *stream, Event *event) override { + return port::Status{port::error::UNIMPLEMENTED, ""}; + } + + Event::Status PollForEventStatus(Event *event) override { + return Event::Status::kError; + } + + bool AllocateStream(Stream *stream) override { return true; } + void DeallocateStream(Stream *stream) override {} + bool CreateStreamDependency(Stream *dependent, Stream *other) override; + + bool AllocateTimer(Timer *timer) override { return true; } + void DeallocateTimer(Timer *timer) override {} + bool StartTimer(Stream *stream, Timer *timer) override; + bool StopTimer(Stream *stream, Timer *timer) override; + + bool BlockHostUntilDone(Stream *stream) override; + + int PlatformDeviceCount() override { return 1; } + + bool DeviceMemoryUsage(int64 *free, int64 *total) const override { + return false; + } + + DeviceDescription *PopulateDeviceDescription() const override; + + port::Status EnablePeerAccessTo(StreamExecutorInterface *other) override { + return port::Status::OK(); + } + + bool CanEnablePeerAccessTo(StreamExecutorInterface *other) override { + return true; + } + + SharedMemoryConfig GetDeviceSharedMemoryConfig() override { + return SharedMemoryConfig::kDefault; + } + + port::Status SetDeviceSharedMemoryConfig(SharedMemoryConfig config) override { + return port::Status{port::error::UNIMPLEMENTED, + "Shared memory not supported"}; + } + + std::unique_ptr CreateEventImplementation() + override { + return nullptr; + } + + std::unique_ptr CreateKernelImplementation() + override { + return nullptr; + } + + std::unique_ptr GetStreamImplementation() + override { + return std::unique_ptr(new host::HostStream()); + } + + std::unique_ptr GetTimerImplementation() override { + return std::unique_ptr(new host::HostTimer()); + } + + port::StatusOr ExecuteGraph(const xla::Shape &shape, + Args args); + + private: + DeviceMemoryBase AllocateSingleOutput(const xla::Shape &shape); + + port::StatusOr AllocateOutputBuffer( + const xla::Shape &shape); + + const PluginConfig plugin_config_; +}; + +} // namespace executorplugin +} // namespace gputools +} // namespace perftools + +#endif // TENSORFLOW_COMPILER_EXECUTOR_STREAM_EXECUTOR_EXECUTOR_EXECUTOR_H_ diff --git a/tensorflow/compiler/plugin/executor/platform.cc b/tensorflow/compiler/plugin/executor/platform.cc new file mode 100644 index 0000000000..2f339f04a7 --- /dev/null +++ b/tensorflow/compiler/plugin/executor/platform.cc @@ -0,0 +1,125 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/plugin/executor/platform.h" +#include "tensorflow/compiler/plugin/executor/executor.h" +#include "tensorflow/compiler/plugin/executor/platform_id.h" + +#include "tensorflow/stream_executor/lib/error.h" +#include "tensorflow/stream_executor/lib/initialize.h" +#include "tensorflow/stream_executor/lib/ptr_util.h" +#include "tensorflow/stream_executor/lib/status.h" +#include "tensorflow/stream_executor/lib/status_macros.h" +#include "tensorflow/stream_executor/lib/stringprintf.h" + +namespace se = ::perftools::gputools; +namespace sep = ::perftools::gputools::executorplugin; + +namespace perftools { +namespace gputools { +namespace executorplugin { + +PLATFORM_DEFINE_ID(kExecutorPlatformId); + +ExecutorPlatform::ExecutorPlatform() : name_("Executor") {} + +ExecutorPlatform::~ExecutorPlatform() {} + +Platform::Id ExecutorPlatform::id() const { return kExecutorPlatformId; } + +int ExecutorPlatform::VisibleDeviceCount() const { return 1; } + +const string& ExecutorPlatform::Name() const { return name_; } + +port::StatusOr ExecutorPlatform::ExecutorForDevice( + int ordinal) { + StreamExecutorConfig config; + config.ordinal = ordinal; + config.plugin_config = PluginConfig(); + config.device_options = DeviceOptions::Default(); + return GetExecutor(config); +} + +port::StatusOr +ExecutorPlatform::ExecutorForDeviceWithPluginConfig( + int device_ordinal, const PluginConfig& plugin_config) { + StreamExecutorConfig config; + config.ordinal = device_ordinal; + config.plugin_config = plugin_config; + config.device_options = DeviceOptions::Default(); + return GetExecutor(config); +} + +port::StatusOr ExecutorPlatform::GetExecutor( + const StreamExecutorConfig& config) { + mutex_lock lock(executors_mutex_); + + port::StatusOr status = executor_cache_.Get(config); + if (status.ok()) { + return status.ValueOrDie(); + } + + port::StatusOr> executor = + GetUncachedExecutor(config); + if (!executor.ok()) { + return executor.status(); + } + + StreamExecutor* naked_executor = executor.ValueOrDie().get(); + SE_RETURN_IF_ERROR( + executor_cache_.Insert(config, executor.ConsumeValueOrDie())); + return naked_executor; +} + +port::StatusOr> +ExecutorPlatform::GetUncachedExecutor(const StreamExecutorConfig& config) { + auto executor = port::MakeUnique( + this, port::MakeUnique(config.plugin_config)); + auto init_status = executor->Init(config.ordinal, config.device_options); + if (!init_status.ok()) { + return port::Status{ + port::error::INTERNAL, + port::Printf( + "failed initializing StreamExecutor for device ordinal %d: %s", + config.ordinal, init_status.ToString().c_str())}; + } + + return std::move(executor); +} + +void ExecutorPlatform::RegisterTraceListener( + std::unique_ptr listener) { + LOG(FATAL) << "not yet implemented: register executor trace listener"; +} + +void ExecutorPlatform::UnregisterTraceListener(TraceListener* listener) { + LOG(FATAL) << "not yet implemented: unregister executor trace listener"; +} + +static void InitializeExecutorPlatform() { + std::unique_ptr platform(new sep::ExecutorPlatform); + SE_CHECK_OK(se::MultiPlatformManager::RegisterPlatform(std::move(platform))); +} + +} // namespace executorplugin +} // namespace gputools +} // namespace perftools + +REGISTER_MODULE_INITIALIZER(executor_platform, sep::InitializeExecutorPlatform()); + +DECLARE_MODULE_INITIALIZER(multi_platform_manager); +// Note that module initialization sequencing is not supported in the +// open-source project, so this will be a no-op there. +REGISTER_MODULE_INITIALIZER_SEQUENCE(executor_platform, multi_platform_manager); diff --git a/tensorflow/compiler/plugin/executor/platform.h b/tensorflow/compiler/plugin/executor/platform.h new file mode 100644 index 0000000000..c252a589d4 --- /dev/null +++ b/tensorflow/compiler/plugin/executor/platform.h @@ -0,0 +1,83 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_EXECUTOR_STREAM_EXECUTOR_EXECUTOR_PLATFORM_H_ +#define TENSORFLOW_COMPILER_EXECUTOR_STREAM_EXECUTOR_EXECUTOR_PLATFORM_H_ + +#include +#include +#include + +#include "tensorflow/stream_executor/executor_cache.h" +#include "tensorflow/stream_executor/lib/statusor.h" +#include "tensorflow/stream_executor/multi_platform_manager.h" +#include "tensorflow/stream_executor/platform.h" +#include "tensorflow/stream_executor/platform/mutex.h" +#include "tensorflow/stream_executor/platform/port.h" +#include "tensorflow/stream_executor/platform/thread_annotations.h" +#include "tensorflow/stream_executor/stream_executor_pimpl.h" +#include "tensorflow/stream_executor/trace_listener.h" + +namespace perftools { +namespace gputools { +namespace executorplugin { + +class ExecutorPlatform : public Platform { + public: + ExecutorPlatform(); + ~ExecutorPlatform() override; + + Platform::Id id() const override; + + // Device count is less clear-cut for CPUs than accelerators. This call + // currently returns the number of thread units in the host, as reported by + // base::NumCPUs(). + int VisibleDeviceCount() const override; + + const string& Name() const override; + + port::StatusOr ExecutorForDevice(int ordinal) override; + + port::StatusOr ExecutorForDeviceWithPluginConfig( + int ordinal, const PluginConfig& config) override; + + port::StatusOr GetExecutor( + const StreamExecutorConfig& config) override; + + port::StatusOr> GetUncachedExecutor( + const StreamExecutorConfig& config) override; + + void RegisterTraceListener(std::unique_ptr listener) override; + + void UnregisterTraceListener(TraceListener* listener) override; + + private: + // This platform's name. + string name_; + + // mutex that guards the ordinal-to-executor map. + mutable mutex executors_mutex_; + + // Cache of created StreamExecutors. + ExecutorCache executor_cache_; + + SE_DISALLOW_COPY_AND_ASSIGN(ExecutorPlatform); +}; + +} // namespace executorplugin +} // namespace gputools +} // namespace perftools + +#endif // TENSORFLOW_COMPILER_EXECUTOR_STREAM_EXECUTOR_EXECUTOR_PLATFORM_H_ diff --git a/tensorflow/compiler/plugin/executor/platform_id.h b/tensorflow/compiler/plugin/executor/platform_id.h new file mode 100644 index 0000000000..8d2b29a3e4 --- /dev/null +++ b/tensorflow/compiler/plugin/executor/platform_id.h @@ -0,0 +1,31 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_STREAM_EXECUTOR_EXECUTOR_PLATFORM_ID_H_ +#define TENSORFLOW_STREAM_EXECUTOR_EXECUTOR_PLATFORM_ID_H_ + +#include "tensorflow/stream_executor/platform.h" + +namespace perftools { +namespace gputools { +namespace executorplugin { + +extern const Platform::Id kExecutorPlatformId; + +} // namespace executorplugin +} // namespace gputools +} // namespace perftools + +#endif // TENSORFLOW_STREAM_EXECUTOR_EXECUTOR_PLATFORM_ID_H_ diff --git a/tensorflow/compiler/plugin/executor/transfer_manager.cc b/tensorflow/compiler/plugin/executor/transfer_manager.cc new file mode 100644 index 0000000000..51c5deeea5 --- /dev/null +++ b/tensorflow/compiler/plugin/executor/transfer_manager.cc @@ -0,0 +1,187 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/plugin/executor/transfer_manager.h" +#include "tensorflow/compiler/plugin/executor/platform_id.h" + +#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/status_macros.h" +#include "tensorflow/compiler/xla/statusor.h" +#include "tensorflow/compiler/xla/types.h" +#include "tensorflow/compiler/xla/util.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/stream_executor_no_cuda.h" + +#include +#include +#include + +namespace sep = ::perftools::gputools::executorplugin; + +namespace xla { +namespace executorplugin { + +ExecutorTransferManager::ExecutorTransferManager() {} + +se::Platform::Id ExecutorTransferManager::PlatformId() const { + return se::executorplugin::kExecutorPlatformId; +} + +Status ExecutorTransferManager::TransferLiteralFromDevice( + se::StreamExecutor* executor, const se::DeviceMemoryBase& source, + const Shape& device_shape, const Shape& literal_shape, Literal* literal) { + TF_RET_CHECK(ShapeUtil::Compatible(device_shape, literal_shape)); + + // Tuples are a special case and contain one or more shapes inside of them to + // an arbitrary nesting depth. + if (device_shape.element_type() == TUPLE) { + *literal->mutable_shape() = literal_shape; + TF_ASSIGN_OR_RETURN( + std::vector element_buffers, + ShallowCopyTupleFromDevice(executor, source, device_shape)); + TF_RET_CHECK(element_buffers.size() == + ShapeUtil::TupleElementCount(device_shape)); + for (int64 i = 0; i < element_buffers.size(); ++i) { + const Shape& element_device_shape = device_shape.tuple_shapes(i); + const Shape& element_literal_shape = literal_shape.tuple_shapes(i); + Literal* element_literal = literal->add_tuple_literals(); + // Recursively call TransferFromDevice to copy over the data in the + // element array. + TF_RETURN_IF_ERROR(TransferLiteralFromDevice( + executor, element_buffers[i], element_device_shape, + element_literal_shape, element_literal)); + } + return Status::OK(); + } + + *literal->mutable_shape() = device_shape; + literal->Reserve(ShapeUtil::ElementsIn(device_shape)); + TF_RETURN_IF_ERROR(TransferBufferFromDevice( + executor, source, ShapeUtil::ByteSizeOf(device_shape), + literal->MutableInternalData())); + if (!ShapeUtil::Equal(literal_shape, device_shape)) { + literal->Swap( + literal->Relayout(literal_shape.layout()).get()); + } + TF_RET_CHECK(ShapeUtil::Equal(literal_shape, literal->shape())); + return Status::OK(); +} + +StatusOr> +ExecutorTransferManager::ShallowCopyTupleFromDevice( + se::StreamExecutor* executor, const se::DeviceMemoryBase& source, + const Shape& shape) { + TF_RET_CHECK(ShapeUtil::IsTuple(shape)); + + std::vector element_pointers(ShapeUtil::TupleElementCount(shape), + nullptr); + int64 tuple_size = ShapeUtil::ByteSizeOf(shape, sizeof(void*)); + auto copy_status = executor->SynchronousMemcpyD2H(source, tuple_size, + element_pointers.data()); + if (!copy_status.ok()) { + return AddStatus( + Status(static_cast(copy_status.code()), + copy_status.error_message()), + "failed transfer of tuple buffer " + ShapeUtil::HumanString(shape)); + } + + // Create a DeviceMemoryBase from each void* pointer. + std::vector destination; + for (int i = 0; i < element_pointers.size(); ++i) { + if (element_pointers[i] == nullptr && + !ShapeUtil::HasZeroElements(shape.tuple_shapes(i))) { + return FailedPrecondition("tuple contains nullptr at element %d", i); + } + int64 buffer_size = + ShapeUtil::ByteSizeOf(shape.tuple_shapes(i), sizeof(void*)); + destination.emplace_back(element_pointers[i], buffer_size); + } + return std::move(destination); +} + +Status ExecutorTransferManager::TransferLiteralToDevice( + se::StreamExecutor* executor, const Literal& literal, + se::DeviceMemoryBase* destination) { + const Shape& shape = literal.shape(); + + if (ShapeUtil::IsTuple(literal.shape())) { + std::vector tuple_elements_on_device; + for (const Literal& tuple_element : literal.tuple_literals()) { + se::DeviceMemoryBase allocation = executor->AllocateArray( + GetByteSizeRequirement(tuple_element.shape())); + TF_RETURN_IF_ERROR( + TransferLiteralToDevice(executor, tuple_element, &allocation)); + tuple_elements_on_device.push_back(allocation.opaque()); + } + return TransferBufferToDevice( + executor, tuple_elements_on_device.size() * sizeof(void*), + tuple_elements_on_device.data(), destination); + } + + return TransferBufferToDevice(executor, GetByteSizeRequirement(shape), + literal.InternalData(), + destination); +} + +Status ExecutorTransferManager::TransferLiteralToInfeed( + se::StreamExecutor* executor, const Literal& literal) { + const Shape& shape = literal.shape(); + VLOG(1) << "transferring literal shape to infeed: " + << ShapeUtil::HumanString(shape); + + return Status::OK(); +} + +Status ExecutorTransferManager::TransferBufferToInfeed( + se::StreamExecutor* executor, int64 size, const void* source) { + return Unimplemented("Transfer to Infeed"); +} + +Status ExecutorTransferManager::TransferLiteralFromOutfeed( + perftools::gputools::StreamExecutor* executor, const Shape& literal_shape, + Literal* literal) { + const Shape& shape = literal->shape(); + VLOG(1) << "transferring literal shape from outfeed: " + << ShapeUtil::HumanString(shape); + + return Status::OK(); +} + +Status ExecutorTransferManager::ResetDevices( + tensorflow::gtl::ArraySlice + executors) { + return Unimplemented("Device reset not supported"); +} + +int64 ExecutorTransferManager::GetByteSizeRequirement(const Shape& shape) { + return ShapeUtil::ByteSizeOf(shape, sizeof(void*)); +} + +} // namespace executorplugin +} // namespace xla + +static std::unique_ptr CreateExecutorTransferManager() { + return xla::MakeUnique(); +} + +static bool InitModule() { + xla::TransferManager::RegisterTransferManager(sep::kExecutorPlatformId, + &CreateExecutorTransferManager); + return true; +} +static bool module_initialized = InitModule(); diff --git a/tensorflow/compiler/plugin/executor/transfer_manager.h b/tensorflow/compiler/plugin/executor/transfer_manager.h new file mode 100644 index 0000000000..7a42e5a2d7 --- /dev/null +++ b/tensorflow/compiler/plugin/executor/transfer_manager.h @@ -0,0 +1,77 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_EXECUTOR_DRIVER_EXECUTOR_TRANSFER_MANAGER_H_ +#define TENSORFLOW_COMPILER_EXECUTOR_DRIVER_EXECUTOR_TRANSFER_MANAGER_H_ + +#include "tensorflow/compiler/xla/service/transfer_manager.h" +#include "tensorflow/compiler/xla/statusor.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/stream_executor_no_cuda.h" +#include "tensorflow/core/platform/types.h" + +#include + +namespace se = ::perftools::gputools; + +namespace xla { +namespace executorplugin { + +class ExecutorTransferManager : public TransferManager { + public: + ExecutorTransferManager(); + + ~ExecutorTransferManager() override {} + + se::Platform::Id PlatformId() const override; + + StatusOr> ShallowCopyTupleFromDevice( + se::StreamExecutor* executor, const se::DeviceMemoryBase& source, + const Shape& shape) override; + + Status TransferLiteralFromDevice(se::StreamExecutor* executor, + const se::DeviceMemoryBase& source, + const Shape& device_shape, + const Shape& literal_shape, + Literal* literal) override; + + Status TransferLiteralToDevice(se::StreamExecutor* executor, + const Literal& literal, + se::DeviceMemoryBase* destination) override; + + Status TransferLiteralToInfeed(se::StreamExecutor* executor, + const Literal& literal) override; + + Status TransferBufferToInfeed(se::StreamExecutor* executor, + int64 size, const void* source) override; + + Status TransferLiteralFromOutfeed(se::StreamExecutor* executor, + const Shape& literal_shape, + Literal* literal) override; + + Status ResetDevices( + tensorflow::gtl::ArraySlice executors) override; + + int64 GetByteSizeRequirement(const Shape& shape) override; + + private: + TF_DISALLOW_COPY_AND_ASSIGN(ExecutorTransferManager); +}; + +} // namespace executorplugin +} // namespace xla + +#endif // TENSORFLOW_COMPILER_EXECUTOR_DRIVER_EXECUTOR_TRANSFER_MANAGER_H_ diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD index 044857d422..432b24756d 100644 --- a/tensorflow/compiler/tests/BUILD +++ b/tensorflow/compiler/tests/BUILD @@ -175,6 +175,11 @@ tf_xla_py_test( name = "slice_ops_test", size = "small", srcs = ["slice_ops_test.py"], + # TODO(b/62962492): Test fails with assertion error. + tags = [ + "manual", + "notap", + ], deps = [ ":xla_test", "//tensorflow/python:array_ops", @@ -456,6 +461,11 @@ cuda_py_test( "//tensorflow/python:math_ops", "//tensorflow/python:nn_ops", ], + # TODO(b/62961789): Test fails with SIGABRT + tags = [ + "manual", + "notap", + ], ) cc_library( @@ -524,8 +534,12 @@ cuda_py_test( # --dump_graph_dir, and the config file was written by hand. # # Run the following to build a minimal benchmark of the computation on Android: -# $ bazel build -c opt --config=android_arm \ -# third_party/tensorflow/compiler/tests:lstm_layer_inference_benchmark +# $ bazel build -c opt --cxxopt='-std=c++11' --linkopt='-lm' \ +# --cpu=armeabi-v7a \ +# --host_crosstool_top=@bazel_tools//tools/cpp:toolchain \ +# --crosstool_top=//external:android/crosstool \ +# //tensorflow/compiler/tests:lstm_layer_inference_benchmark + # # Currently the resulting binary size is ~190KB tf_library( diff --git a/tensorflow/compiler/tests/ftrl_test.py b/tensorflow/compiler/tests/ftrl_test.py index 6b328fb618..a75a5cd2cf 100644 --- a/tensorflow/compiler/tests/ftrl_test.py +++ b/tensorflow/compiler/tests/ftrl_test.py @@ -218,7 +218,7 @@ class FtrlOptimizerTest(XLATestCase): self.assertAllClose(np.array([-0.24059935, -0.46829352]), var0.eval()) self.assertAllClose(np.array([-0.02406147, -0.04830509]), var1.eval()) - # When variables are intialized with Zero, FTRL-Proximal has two properties: + # When variables are initialized with Zero, FTRL-Proximal has two properties: # 1. Without L1&L2 but with fixed learning rate, FTRL-Proximal is identical # with GradientDescent. # 2. Without L1&L2 but with adaptive learning rate, FTRL-Proximal is idential diff --git a/tensorflow/compiler/tf2xla/kernels/batch_matmul_op.cc b/tensorflow/compiler/tf2xla/kernels/batch_matmul_op.cc index f752fb3ae2..16b778bca4 100644 --- a/tensorflow/compiler/tf2xla/kernels/batch_matmul_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/batch_matmul_op.cc @@ -94,12 +94,14 @@ class BatchMatMulOp : public XlaOpKernel { // Slice off individual matrices and reshape to 2D tensors. auto x_slice = builder->Slice( x_flat, {i, 0, 0}, - {i + 1, x_shape.dim_size(ndims - 2), x_shape.dim_size(ndims - 1)}); + {i + 1, x_shape.dim_size(ndims - 2), x_shape.dim_size(ndims - 1)}, + {1, 1, 1}); x_slice = builder->Reshape( x_slice, {x_shape.dim_size(ndims - 2), x_shape.dim_size(ndims - 1)}); auto y_slice = builder->Slice( y_flat, {i, 0, 0}, - {i + 1, y_shape.dim_size(ndims - 2), y_shape.dim_size(ndims - 1)}); + {i + 1, y_shape.dim_size(ndims - 2), y_shape.dim_size(ndims - 1)}, + {1, 1, 1}); y_slice = builder->Reshape( y_slice, {y_shape.dim_size(ndims - 2), y_shape.dim_size(ndims - 1)}); diff --git a/tensorflow/compiler/tf2xla/kernels/batchtospace_op.cc b/tensorflow/compiler/tf2xla/kernels/batchtospace_op.cc index 47d2d747e6..21d3e64872 100644 --- a/tensorflow/compiler/tf2xla/kernels/batchtospace_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/batchtospace_op.cc @@ -125,6 +125,7 @@ void BatchToSpace(XlaOpKernelContext* ctx, // input_shape[M+1], ..., input_shape[N-1]] std::vector start_indices(input_rank, 0); std::vector end_indices = reshaped_permuted_shape; + std::vector strides(input_rank, 1); for (int i = 0; i < block_rank; ++i) { int64 crop_start = crops.Get({i, 0}); int64 crop_end = crops.Get({i, 1}); @@ -139,7 +140,7 @@ void BatchToSpace(XlaOpKernelContext* ctx, " end: ", crop_end, " size ", reshaped_permuted_shape[1 + i])); } xla::ComputationDataHandle output = - b->Slice(reshaped_permuted, start_indices, end_indices); + b->Slice(reshaped_permuted, start_indices, end_indices, strides); ctx->SetOutput(0, output); } diff --git a/tensorflow/compiler/tf2xla/kernels/depthwise_conv_ops.cc b/tensorflow/compiler/tf2xla/kernels/depthwise_conv_ops.cc index 92b371cc4e..852d2a966e 100644 --- a/tensorflow/compiler/tf2xla/kernels/depthwise_conv_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/depthwise_conv_ops.cc @@ -172,15 +172,14 @@ class DepthwiseConv2dNativeOp : public XlaOpKernel { } else { // These will be used to define the bounds of each slice. // Within the loop, the input_channel index will be modified. - gtl::InlinedVector filter_begin; - gtl::InlinedVector filter_limits; - gtl::InlinedVector input_begin; - gtl::InlinedVector input_limits; + gtl::InlinedVector filter_begin(4, 0); + gtl::InlinedVector filter_limits(4); + gtl::InlinedVector input_begin(4, 0); + gtl::InlinedVector input_limits(4); + gtl::InlinedVector strides(4, 1); for (int i = 0; i < 4; ++i) { - filter_begin.push_back(0); - filter_limits.push_back(filter_shape.dim_size(i)); - input_begin.push_back(0); - input_limits.push_back(input_shape.dim_size(i)); + filter_limits[i] = filter_shape.dim_size(i); + input_limits[i] = input_shape.dim_size(i); } std::vector strides_for_tla{strides_[1], strides_[2]}; @@ -209,9 +208,9 @@ class DepthwiseConv2dNativeOp : public XlaOpKernel { input_limits[3] = i + 1; xla::ComputationDataHandle filter_slice = - b.Slice(filter, filter_begin, filter_limits); + b.Slice(filter, filter_begin, filter_limits, strides); xla::ComputationDataHandle input_slice = - b.Slice(input, input_begin, input_limits); + b.Slice(input, input_begin, input_limits, strides); convs.push_back(b.ConvWithGeneralDimensions( input_slice, filter_slice, strides_for_tla, xla_padding, dims)); } diff --git a/tensorflow/compiler/tf2xla/kernels/diag_op.cc b/tensorflow/compiler/tf2xla/kernels/diag_op.cc index 74994d8961..ec5017f6ab 100644 --- a/tensorflow/compiler/tf2xla/kernels/diag_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/diag_op.cc @@ -125,7 +125,7 @@ class DiagPartOp : public XlaOpKernel { diag = builder->Reshape(diag, {new_size, new_size + 1}); // Slices out the first column and reshapes to the final shape. - diag = builder->Slice(diag, {0, 0}, {new_size, 1}); + diag = builder->Slice(diag, {0, 0}, {new_size, 1}, {1, 1}); diag = builder->Reshape(diag, new_dims); ctx->SetOutput(0, diag); @@ -224,8 +224,9 @@ class MatrixDiagPartOp : public XlaOpKernel { } else if (actual_size > target_size) { std::vector start(flattened_dims.size(), 0); std::vector limits(flattened_dims.begin(), flattened_dims.end()); + std::vector strides(flattened_dims.size(), 1); limits[flattened_dims.size() - 1] = target_size; - diag = builder->Slice(diag, start, limits); + diag = builder->Slice(diag, start, limits, strides); } // Reshape so the target values are in the first position of the last @@ -238,8 +239,9 @@ class MatrixDiagPartOp : public XlaOpKernel { // Slices out the first column and reshapes to the final shape. std::vector start(dims.size(), 0); std::vector limits(dims.begin(), dims.end()); + std::vector strides(dims.size(), 1); limits[last_dim] = 1; - diag = builder->Slice(diag, start, limits); + diag = builder->Slice(diag, start, limits, strides); // Collapses away the last dimension. dims.pop_back(); diff --git a/tensorflow/compiler/tf2xla/kernels/dynamic_stitch_op.cc b/tensorflow/compiler/tf2xla/kernels/dynamic_stitch_op.cc index faa7ef0ef9..0330e34c98 100644 --- a/tensorflow/compiler/tf2xla/kernels/dynamic_stitch_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/dynamic_stitch_op.cc @@ -156,6 +156,8 @@ class DynamicStitchOp : public XlaOpKernel { indices0_shape.dims()); std::vector slice_limit(1 + data0_shape.dims() - indices0_shape.dims()); + std::vector stride(1 + data0_shape.dims() - + indices0_shape.dims(), 1); for (int d = indices0_shape.dims(); d < data0_shape.dims(); d++) { slice_limit[1 + d - indices0_shape.dims()] = data0_shape.dim_size(d); } @@ -168,7 +170,7 @@ class DynamicStitchOp : public XlaOpKernel { // And place it in the concat list in the place indicated by // the index. to_concat[index_num] = - ctx->builder()->Slice(expression, slice_start, slice_limit); + ctx->builder()->Slice(expression, slice_start, slice_limit, stride); } ctx->SetOutput(0, ctx->builder()->ConcatInDim(to_concat, 0)); diff --git a/tensorflow/compiler/tf2xla/kernels/slice_op.cc b/tensorflow/compiler/tf2xla/kernels/slice_op.cc index 51c97d85d7..482c54a40c 100644 --- a/tensorflow/compiler/tf2xla/kernels/slice_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/slice_op.cc @@ -54,7 +54,9 @@ class SliceOp : public XlaOpKernel { for (int i = 0; i < begin.size(); ++i) { limits.push_back(begin[i] + size[i]); } - ctx->SetOutput(0, ctx->builder()->Slice(ctx->Input(0), begin, limits)); + std::vector strides(begin.size(), 1); + ctx->SetOutput(0, ctx->builder()->Slice(ctx->Input(0), begin, limits, + strides)); } private: diff --git a/tensorflow/compiler/tf2xla/kernels/split_op.cc b/tensorflow/compiler/tf2xla/kernels/split_op.cc index 017f3a110e..44ee81461e 100644 --- a/tensorflow/compiler/tf2xla/kernels/split_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/split_op.cc @@ -77,14 +77,14 @@ class SplitOp : public XlaOpKernel { // The vectors we will use to define the slice. The entry for the // split dimensions varies for each output. - std::vector begin; - std::vector limits; + std::vector begin(input_shape.dims(), 0); + std::vector limits(input_shape.dims()); + std::vector strides(input_shape.dims(), 1); for (int i = 0; i < input_shape.dims(); ++i) { // Initially set up the limits to be the full size of the input: // the split dimension is filled in below. int64 dim = input_shape.dim_size(i); - begin.push_back(0); - limits.push_back(dim); + limits[i] = dim; } auto input = ctx->Input(1); @@ -94,7 +94,7 @@ class SplitOp : public XlaOpKernel { // Slice out the ith split from the split dimension. begin[split_dim] = i * slice_size; limits[split_dim] = (i + 1) * slice_size; - ctx->SetOutput(i, ctx->builder()->Slice(input, begin, limits)); + ctx->SetOutput(i, ctx->builder()->Slice(input, begin, limits, strides)); } } }; @@ -188,7 +188,7 @@ class SplitVOp : public XlaOpKernel { std::vector begin(input_shape.dims(), 0); auto dim_sizes = input_shape.dim_sizes(); std::vector limits(dim_sizes.begin(), dim_sizes.end()); - + std::vector strides(input_shape.dims(), 1); for (int i = 0; i < num_split; ++i) { TensorShape output_shape(input_shape); int slice_size = split_sizes_vec[i]; @@ -196,7 +196,7 @@ class SplitVOp : public XlaOpKernel { // Slice out the ith split from the split dimension. limits[split_dim] = begin[split_dim] + slice_size; - ctx->SetOutput(i, ctx->builder()->Slice(input, begin, limits)); + ctx->SetOutput(i, ctx->builder()->Slice(input, begin, limits, strides)); begin[split_dim] = limits[split_dim]; } } diff --git a/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc b/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc index 8037e90791..6af4bd0496 100644 --- a/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc @@ -72,55 +72,29 @@ class StridedSliceOp : public XlaOpKernel { &dummy, &dummy, &dummy, &begin, &end, &strides)); gtl::InlinedVector dimensions_to_reverse; - gtl::InlinedVector slice_begin, slice_end; - bool simple_strides = true; + gtl::InlinedVector slice_begin, slice_end, slice_strides; + for (int i = 0; i < begin.size(); ++i) { - simple_strides &= (std::abs(strides[i]) == 1); if (strides[i] > 0) { slice_begin.push_back(begin[i]); slice_end.push_back(end[i]); + slice_strides.push_back(strides[i]); } else { // Negative stride: swap begin and end, add 1 because the interval // is semi-open, and mark the dimension to be reversed. - slice_begin.push_back(end[i] + 1); - slice_end.push_back(begin[i] + 1); + slice_begin.push_back(input_shape.dim_size(i) - begin[i] - 1); + slice_end.push_back(input_shape.dim_size(i) - end[i] - 1); + slice_strides.push_back(-strides[i]); dimensions_to_reverse.push_back(i); } } - xla::ComputationDataHandle slice = - ctx->builder()->Slice(ctx->Input(0), slice_begin, slice_end); + + xla::ComputationDataHandle slice = ctx->Input(0); if (!dimensions_to_reverse.empty()) { slice = ctx->builder()->Rev(slice, dimensions_to_reverse); } - // If at least one of the strides is > 1 (or < -1) then use Slice - // to pull out each of the strided slices, and Concat to put them - // together again. - if (!simple_strides) { - // Re-adjust the begin and end now that the periphery has been - // sliced away. - for (int d = 0; d < strides.size(); ++d) { - slice_end[d] -= slice_begin[d]; - slice_begin[d] = 0; - } - - for (int d = 0; d < strides.size(); ++d) { - int64 stride = std::abs(strides[d]); - if (stride > 1) { - std::vector to_concat; - int64 end = slice_end[d]; - for (int64 i = 0; i < end; i += stride) { - slice_begin[d] = i; - slice_end[d] = i + 1; - to_concat.push_back( - ctx->builder()->Slice(slice, slice_begin, slice_end)); - } - slice = ctx->builder()->ConcatInDim(to_concat, d); - slice_begin[d] = 0; - slice_end[d] = to_concat.size(); - } - } - } + slice = ctx->builder()->Slice(slice, slice_begin, slice_end, slice_strides); slice = ctx->builder()->Reshape(slice, final_shape.dim_sizes()); ctx->SetOutput(0, slice); diff --git a/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc b/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc index 598b341002..9367c1ef22 100644 --- a/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc @@ -318,7 +318,7 @@ class TensorArrayGatherOp : public XlaOpKernel { for (int i = 0; i < num_indices; ++i) { // Slices the i-th index out of `indices`, and pads it with zeros in the // minor dimensions to form an index into the TensorArray storage. - auto index = b->Slice(indices, {i}, {i + 1}); + auto index = b->Slice(indices, {i}, {i + 1}, {1}); // start_indices of the DynamicSlice are [index, 0, 0, ..., 0]. auto start_indices = PadIndexWithZeros(b, index, ta_shape.dims() - 1); @@ -381,16 +381,18 @@ class TensorArrayScatterOp : public XlaOpKernel { std::vector value_starts(value_shape.dims(), 0); auto value_ends = value_shape.dim_sizes(); + std::vector value_strides(value_shape.dims(), 1); + // For every (index, value) pair, update the corresponding TensorArray // storage. for (int i = 0; i < num_indices; ++i) { // Slice out part of the value. value_starts[0] = i; value_ends[0] = i + 1; - auto slice = b->Slice(value, value_starts, value_ends); + auto slice = b->Slice(value, value_starts, value_ends, value_strides); // start_indices of the DynamicUpdateSlice are [index, 0, 0, ..., 0]. - auto index = b->Slice(indices, {i}, {i + 1}); + auto index = b->Slice(indices, {i}, {i + 1}, {1}); auto start_indices = PadIndexWithZeros(b, index, elem_shape.dims()); ta = DynamicAddSlice(b, ta, slice, slice_dims, start_indices); } diff --git a/tensorflow/compiler/tf2xla/kernels/unpack_op.cc b/tensorflow/compiler/tf2xla/kernels/unpack_op.cc index a5ce78e520..f87586ba57 100644 --- a/tensorflow/compiler/tf2xla/kernels/unpack_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/unpack_op.cc @@ -66,6 +66,7 @@ class UnpackOp : public XlaOpKernel { std::vector start_indices(input_shape.dims(), 0); std::vector limit_indices(input_shape.dims()); + std::vector strides(input_shape.dims(), 1); for (int i = 0; i < input_shape.dims(); ++i) { limit_indices[i] = input_shape.dim_size(i); } @@ -73,7 +74,8 @@ class UnpackOp : public XlaOpKernel { for (int i = 0; i < num; ++i) { start_indices[axis] = i; limit_indices[axis] = i + 1; - auto slice = ctx->builder()->Slice(input, start_indices, limit_indices); + auto slice = ctx->builder()->Slice(input, start_indices, limit_indices, + strides); // Reshape to drop the 'axis' dimension. auto result = ctx->builder()->Reshape(slice, output_shape.dim_sizes()); ctx->SetOutput(i, result); diff --git a/tensorflow/compiler/xla/client/computation_builder.cc b/tensorflow/compiler/xla/client/computation_builder.cc index 735a69d596..dcc313707b 100644 --- a/tensorflow/compiler/xla/client/computation_builder.cc +++ b/tensorflow/compiler/xla/client/computation_builder.cc @@ -256,7 +256,8 @@ void ComputationBuilder::CheckSameShape(const ComputationDataHandle& lhs, ComputationDataHandle ComputationBuilder::Slice( const ComputationDataHandle& operand, tensorflow::gtl::ArraySlice start_indices, - tensorflow::gtl::ArraySlice limit_indices) { + tensorflow::gtl::ArraySlice limit_indices, + tensorflow::gtl::ArraySlice stride) { if (!first_error_.ok() || !PrepareComputation().ok()) { return ComputationDataHandle(); } @@ -269,6 +270,9 @@ ComputationDataHandle ComputationBuilder::Slice( for (int64 index : limit_indices) { request.add_limit_indices(index); } + for (int64 index : stride) { + request.add_stride(index); + } OpRequest op_request; *op_request.mutable_computation() = computation_.handle(); *op_request.mutable_slice_request() = request; diff --git a/tensorflow/compiler/xla/client/computation_builder.h b/tensorflow/compiler/xla/client/computation_builder.h index 5dceb03281..b411346459 100644 --- a/tensorflow/compiler/xla/client/computation_builder.h +++ b/tensorflow/compiler/xla/client/computation_builder.h @@ -211,9 +211,11 @@ class ComputationBuilder { // // Note that "limit" means up-to-but-not-including; i.e. [start, limit) in 1D // range notation. + // The stride parameter determines the stride over the slice ComputationDataHandle Slice(const ComputationDataHandle& operand, tensorflow::gtl::ArraySlice start_indices, - tensorflow::gtl::ArraySlice limit_indices); + tensorflow::gtl::ArraySlice limit_indices, + tensorflow::gtl::ArraySlice stride); // Enqueues a slice operation onto the computation that slices the 'operand' // from dynamic start indices which are passed in 'start_indices'. diff --git a/tensorflow/compiler/xla/literal_util.cc b/tensorflow/compiler/xla/literal_util.cc index 72eec35fc4..ec643e037f 100644 --- a/tensorflow/compiler/xla/literal_util.cc +++ b/tensorflow/compiler/xla/literal_util.cc @@ -1215,11 +1215,7 @@ void Literal::Resize(int64 num_elements, double value) { template <> void Literal::Resize(int64 num_elements, half value) { CHECK_EQ(ShapeUtil::ElementsIn(shape()), num_elements); - mutable_f16s()->resize(num_elements * sizeof(half)); - auto data = GetMutableArraySlice(); - for (int i = 0; i < num_elements; i++) { - data[i] = value; - } + mutable_f16s()->resize(num_elements, value); } template @@ -1262,7 +1258,7 @@ LiteralProto Literal::ToProto() const { case F16: *proto.mutable_f16s() = string(reinterpret_cast(f16s_.data()), - f16s_.size() / sizeof(half)); + f16s_.size() * sizeof(half)); break; case F32: CopyToRepeatedField(proto.mutable_f32s(), f32s()); @@ -1318,7 +1314,7 @@ void Literal::CopyFromProto(const LiteralProto& literal_proto) { const string& s(literal_proto.f16s()); CHECK_EQ(0, s.size() % sizeof(half)); f16s_ = std::vector(s.size() / sizeof(half)); - memcpy(f16s_.data(), s.data(), s.size() / sizeof(half)); + memcpy(f16s_.data(), s.data(), s.size()); break; } case F32: diff --git a/tensorflow/compiler/xla/literal_util_test.cc b/tensorflow/compiler/xla/literal_util_test.cc index 11f25fb847..5976e83950 100644 --- a/tensorflow/compiler/xla/literal_util_test.cc +++ b/tensorflow/compiler/xla/literal_util_test.cc @@ -949,5 +949,62 @@ TEST_F(LiteralUtilTest, CopyFromProto_Bool) { } } +// Note that f16 is currently stored in a byte array in little endian byte order +TEST_F(LiteralUtilTest, ToProto_f16) { + half h1(1.0f); + half h2(2.0f); + + auto m = Literal::CreateR2({{h1, h2}, {h2, h1}}); + Literal* l = m.get(); + EXPECT_EQ(4, ShapeUtil::ElementsIn(l->shape())); + EXPECT_EQ(4, l->f16s().size()); + EXPECT_EQ(4, l->f16s_size()); + + LiteralProto p = l->ToProto(); + EXPECT_EQ(4, ShapeUtil::ElementsIn(p.shape())); + EXPECT_EQ(8, p.f16s().size()); + const char* d = p.f16s().data(); + EXPECT_EQ(d[0], 0); + EXPECT_EQ(d[1], 0x3C); + EXPECT_EQ(d[2], 0); + EXPECT_EQ(d[3], 0x40); + EXPECT_EQ(d[4], 0); + EXPECT_EQ(d[5], 0x40); + EXPECT_EQ(d[6], 0); + EXPECT_EQ(d[7], 0x3C); +} + +// Note that f16 is currently stored in a byte array in little endian byte order +TEST_F(LiteralUtilTest, CopyFromProto_f16) { + half h1(1.0f); + half h2(2.0f); + + const char half_vals[8] = { + 0x00, 0x3C, 0x00, 0x40, 0x00, 0x40, 0x00, 0x3C + }; + LiteralProto p; + p.mutable_shape()->set_element_type(F16); + p.mutable_shape()->clear_dimensions(); + p.mutable_shape()->add_dimensions(4); + p.clear_f16s(); + p.set_f16s(half_vals, 8); + + + Literal literal(p); + ASSERT_EQ(4, literal.f16s_size()); + ASSERT_EQ(h1, literal.f16s(0)); + ASSERT_EQ(h2, literal.f16s(1)); + ASSERT_EQ(h2, literal.f16s(2)); + ASSERT_EQ(h1, literal.f16s(3)); + + const std::vector& r = literal.f16s(); + ASSERT_EQ(4, r.size()); + ASSERT_EQ(h1, r[0]); + ASSERT_EQ(h2, r[1]); + ASSERT_EQ(h2, r[2]); + ASSERT_EQ(h1, r[3]); +} + + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.cc b/tensorflow/compiler/xla/service/algebraic_simplifier.cc index 0187c09d7b..5709ac3067 100644 --- a/tensorflow/compiler/xla/service/algebraic_simplifier.cc +++ b/tensorflow/compiler/xla/service/algebraic_simplifier.cc @@ -855,6 +855,7 @@ Status AlgebraicSimplifierVisitor::HandlePad(HloInstruction* pad) { // Second, construct the slice instruction to perform the negative padding. std::vector start_indices; std::vector end_indices; + std::vector strides; for (int64 i = 0; i < pad->padding_config().dimensions_size(); ++i) { const PaddingConfig::PaddingConfigDimension& padding_dimension = pad->padding_config().dimensions(i); @@ -868,16 +869,18 @@ Status AlgebraicSimplifierVisitor::HandlePad(HloInstruction* pad) { } start_indices.push_back(start); end_indices.push_back(end); + strides.push_back(1); } // Verify that the slice shape matches the pad shape. TF_ASSIGN_OR_RETURN(Shape inferred_slice_shape, ShapeInference::InferSliceShape( - nonzero_pad_shape, start_indices, end_indices)); + nonzero_pad_shape, start_indices, end_indices, + strides)); TF_RET_CHECK(ShapeUtil::Compatible(inferred_slice_shape, pad->shape())); std::unique_ptr slice = HloInstruction::CreateSlice( - pad->shape(), nonzero_pad, start_indices, end_indices); + pad->shape(), nonzero_pad, start_indices, end_indices, strides); return ReplaceWithNewInstruction(pad, std::move(slice)); } diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc index 0792006ddb..7e52c8fb0c 100644 --- a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc +++ b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc @@ -520,7 +520,7 @@ TEST_F(AlgebraicSimplifierTest, RemoveEmptyConcatenateOperands) { HloInstruction::CreateConstant(Literal::CreateR1({}))); HloInstruction* empty_slice = builder.AddInstruction(HloInstruction::CreateSlice( - ShapeUtil::MakeShape(F32, {0}), param1, {42}, {42})); + ShapeUtil::MakeShape(F32, {0}), param1, {42}, {42}, {1})); Shape result_shape = ShapeUtil::MakeShape(F32, {3 * kParamLength}); builder.AddInstruction(HloInstruction::CreateConcatenate( result_shape, {empty_literal, param0, param0, empty_slice, param1}, 0)); @@ -551,7 +551,7 @@ TEST_F(AlgebraicSimplifierTest, OnlyEmptyConcatenateOperands) { HloInstruction::CreateConstant(Literal::CreateR1({}))); HloInstruction* empty_slice = builder.AddInstruction(HloInstruction::CreateSlice( - ShapeUtil::MakeShape(F32, {0}), param0, {42}, {42})); + ShapeUtil::MakeShape(F32, {0}), param0, {42}, {42}, {1})); Shape result_shape = ShapeUtil::MakeShape(F32, {0}); builder.AddInstruction(HloInstruction::CreateConcatenate( result_shape, {empty_literal, empty_slice}, 0)); @@ -1132,7 +1132,7 @@ TEST_F(AlgebraicSimplifierTest, RemoveNoopSlice) { 0, ShapeUtil::MakeShape(F32, {dim0, dim1}), "param")); builder.AddInstruction(HloInstruction::CreateSlice( ShapeUtil::MakeShape(F32, {dim0, dim1}), param, /*start_indices=*/{0, 0}, - /*limit_indices=*/{dim0, dim1})); + /*limit_indices=*/{dim0, dim1}, /*slices=*/{1, 1})); HloModule module(TestName()); HloComputation* computation = module.AddEntryComputation(builder.Build()); @@ -1537,7 +1537,7 @@ TEST_F(AlgebraicSimplifierTest, ScalarBroadcastToSlice) { Shape slice_shape = ShapeUtil::MakeShape(F32, {2, 2, 3, 3}); HloInstruction* slice = builder.AddInstruction(HloInstruction::CreateSlice( - slice_shape, broadcast, {0, 1, 2, 3}, {2, 3, 5, 6})); + slice_shape, broadcast, {0, 1, 2, 3}, {2, 3, 5, 6}, {1, 1, 1, 1})); HloModule module(TestName()); auto computation = module.AddEntryComputation(builder.Build()); diff --git a/tensorflow/compiler/xla/service/buffer_assignment_test.cc b/tensorflow/compiler/xla/service/buffer_assignment_test.cc index c498b86dd4..56568fd446 100644 --- a/tensorflow/compiler/xla/service/buffer_assignment_test.cc +++ b/tensorflow/compiler/xla/service/buffer_assignment_test.cc @@ -731,7 +731,7 @@ TEST_F(BufferAssignmentTest, ReuseNonOperandBuffer) { auto negate = builder.AddInstruction( HloInstruction::CreateUnary(f32vec100_, HloOpcode::kNegate, param0)); auto slice = builder.AddInstruction( - HloInstruction::CreateSlice(f32vec10_, negate, {0}, {10})); + HloInstruction::CreateSlice(f32vec10_, negate, {0}, {10}, {1})); auto broadcast = builder.AddInstruction( HloInstruction::CreateBroadcast(f32a100x10_, slice, {1})); @@ -763,7 +763,7 @@ TEST_F(BufferAssignmentTest, NoReuseLiveBuffer) { auto negate = builder.AddInstruction( HloInstruction::CreateUnary(f32vec100_, HloOpcode::kNegate, param0)); auto slice = builder.AddInstruction( - HloInstruction::CreateSlice(f32vec10_, negate, {0}, {10})); + HloInstruction::CreateSlice(f32vec10_, negate, {0}, {10}, {1})); auto broadcast = builder.AddInstruction( HloInstruction::CreateBroadcast(f32a100x10_, slice, {1})); builder.AddInstruction(HloInstruction::CreateTuple({negate, broadcast})); @@ -800,7 +800,7 @@ TEST_F(BufferAssignmentTest, NoReuseAliasedBuffer) { auto tuple_element = builder.AddInstruction( HloInstruction::CreateGetTupleElement(f32vec100_, tuple, 0)); auto slice = builder.AddInstruction( - HloInstruction::CreateSlice(f32vec10_, tuple_element, {0}, {10})); + HloInstruction::CreateSlice(f32vec10_, tuple_element, {0}, {10}, {1})); auto broadcast = builder.AddInstruction( HloInstruction::CreateBroadcast(f32a100x10_, slice, {1})); builder.AddInstruction(HloInstruction::CreateTuple({tuple, broadcast})); @@ -835,7 +835,7 @@ TEST_F(BufferAssignmentTest, DoNotReuseOversizedOutputBuffer) { HloInstruction::CreateUnary(f32vec100_, HloOpcode::kNegate, param0)); // Slice output is 10 elements. auto slice = builder.AddInstruction( - HloInstruction::CreateSlice(f32vec10_, negate, {0}, {10})); + HloInstruction::CreateSlice(f32vec10_, negate, {0}, {10}, {1})); // Broadcast output is 40 elements. auto broadcast = builder.AddInstruction(HloInstruction::CreateBroadcast( ShapeUtil::MakeShape(F32, {10, 4}), slice, {0})); @@ -867,7 +867,7 @@ TEST_F(BufferAssignmentTest, ReuseOutputBufferIfExactlySized) { auto negate = builder.AddInstruction( HloInstruction::CreateUnary(f32vec100_, HloOpcode::kNegate, param0)); auto slice = builder.AddInstruction( - HloInstruction::CreateSlice(f32vec10_, negate, {0}, {10})); + HloInstruction::CreateSlice(f32vec10_, negate, {0}, {10}, {1})); // Broadcast output is 40 elements. auto broadcast = builder.AddInstruction(HloInstruction::CreateBroadcast( ShapeUtil::MakeShape(F32, {10, 10}), slice, {0})); @@ -904,7 +904,7 @@ TEST_F(BufferAssignmentTest, DoNotReuseOversizedOutputBufferInTuple) { HloInstruction::CreateUnary(f32vec100_, HloOpcode::kNegate, param0)); // Slice output is 10 elements. auto slice = builder.AddInstruction( - HloInstruction::CreateSlice(f32vec10_, negate, {0}, {10})); + HloInstruction::CreateSlice(f32vec10_, negate, {0}, {10}, {1})); // Broadcast output is 40 elements. auto broadcast = builder.AddInstruction(HloInstruction::CreateBroadcast( ShapeUtil::MakeShape(F32, {10, 4}), slice, {0})); diff --git a/tensorflow/compiler/xla/service/buffer_liveness_test.cc b/tensorflow/compiler/xla/service/buffer_liveness_test.cc index a31e9b1782..a5f7cc0aeb 100644 --- a/tensorflow/compiler/xla/service/buffer_liveness_test.cc +++ b/tensorflow/compiler/xla/service/buffer_liveness_test.cc @@ -588,7 +588,7 @@ class FusedDynamicUpdateSliceLivenessTest : public BufferLivenessTest { if (update_uses_tuple_element1) { // Create a slice instruction as an additional user of 'gte1'. slice = builder.AddInstruction( - HloInstruction::CreateSlice(update_shape, gte1, {0}, {3})); + HloInstruction::CreateSlice(update_shape, gte1, {0}, {3}, {1})); update = builder.AddInstruction(HloInstruction::CreateBinary( update_shape, HloOpcode::kAdd, update, slice)); } diff --git a/tensorflow/compiler/xla/service/compile_only_service.h b/tensorflow/compiler/xla/service/compile_only_service.h index dd00c58240..0a1911cbd1 100644 --- a/tensorflow/compiler/xla/service/compile_only_service.h +++ b/tensorflow/compiler/xla/service/compile_only_service.h @@ -55,7 +55,7 @@ class CompileOnlyService : public Service { // Override Service methods that require or imply the existence of an // execute backend. Note that this does not include TransferToClient, as - // computing contants produces global data that we may wish to transfer. + // computing constants produces global data that we may wish to transfer. tensorflow::Status Execute(const ExecuteRequest* arg, ExecuteResponse* result) override { return Unimplemented("CompileOnlyService does not support execution."); diff --git a/tensorflow/compiler/xla/service/computation_placer.cc b/tensorflow/compiler/xla/service/computation_placer.cc index cdf277581f..cdfa30dd9a 100644 --- a/tensorflow/compiler/xla/service/computation_placer.cc +++ b/tensorflow/compiler/xla/service/computation_placer.cc @@ -49,17 +49,18 @@ Status DeviceAssignment::Serialize(DeviceAssignmentProto* proto) const { return Status::OK(); } -/* static */ StatusOr DeviceAssignment::Deserialize( - const DeviceAssignmentProto& proto) { +/* static */ StatusOr> +DeviceAssignment::Deserialize(const DeviceAssignmentProto& proto) { TF_RET_CHECK(proto.computation_devices_size() == proto.computation_count()); - DeviceAssignment assignment(proto.replica_count(), proto.computation_count()); + auto assignment = MakeUnique(proto.replica_count(), + proto.computation_count()); for (int computation = 0; computation < proto.computation_count(); ++computation) { const auto& computation_device = proto.computation_devices(computation); TF_RET_CHECK(computation_device.replica_device_ids_size() == proto.replica_count()); for (int replica = 0; replica < proto.replica_count(); ++replica) { - assignment(replica, computation) = + (*assignment)(replica, computation) = computation_device.replica_device_ids(replica); } } diff --git a/tensorflow/compiler/xla/service/computation_placer.h b/tensorflow/compiler/xla/service/computation_placer.h index 4d26d6bb85..7d9abcd100 100644 --- a/tensorflow/compiler/xla/service/computation_placer.h +++ b/tensorflow/compiler/xla/service/computation_placer.h @@ -49,7 +49,11 @@ class DeviceAssignment : public Array2D { // Protocol buffer serialization and deserialization. Status Serialize(DeviceAssignmentProto* proto) const; - static StatusOr Deserialize( + + // Return a std::unique_ptr instead of a DeviceAssignment + // directly because one of the supported TF platforms (mac) does not compile + // due to a StatusOr of an incomplete type (DeviceAssignment). + static StatusOr> Deserialize( const DeviceAssignmentProto& proto); }; diff --git a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc index 5b21ae3d2a..db0a8b36cd 100644 --- a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc +++ b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc @@ -949,9 +949,20 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeElementGenerator( const IrArray::Index& index) -> StatusOr { IrArray::Index sliced_index(index.size()); for (int i = 0; i < index.size(); ++i) { - sliced_index[i] = ir_builder_->CreateAdd( - index[i], llvm::ConstantInt::get(index[i]->getType(), - hlo->slice_starts(i))); + int64 stride = hlo->slice_stride(i); + if (stride != 1) { + sliced_index[i] = ir_builder_->CreateAdd( + ir_builder_->CreateMul( + index[i], llvm::ConstantInt::get(index[i]->getType(), + stride)), + llvm::ConstantInt::get(index[i]->getType(), + hlo->slice_starts(i))); + } else { + sliced_index[i] = ir_builder_->CreateAdd( + index[i], + llvm::ConstantInt::get(index[i]->getType(), + hlo->slice_starts(i))); + } } return operand_to_generator.at(hlo->operand(0))(sliced_index); }; diff --git a/tensorflow/compiler/xla/service/gpu/pad_insertion.cc b/tensorflow/compiler/xla/service/gpu/pad_insertion.cc index 4e130de311..b8c6162084 100644 --- a/tensorflow/compiler/xla/service/gpu/pad_insertion.cc +++ b/tensorflow/compiler/xla/service/gpu/pad_insertion.cc @@ -80,6 +80,7 @@ HloInstruction* MaybePaddedAndSlicedInput( std::vector start_indices(input->shape().dimensions_size(), 0); std::vector limit_indices(input->shape().dimensions().begin(), input->shape().dimensions().end()); + std::vector strides(input->shape().dimensions_size(), 1); for (size_t i = 0; i < conv_dnums.spatial_dimensions().size(); ++i) { int64 dim = conv_dnums.spatial_dimensions(i); // If dimension "dim" has negative padding, increase the start index or @@ -92,9 +93,9 @@ HloInstruction* MaybePaddedAndSlicedInput( input = computation->AddInstruction(HloInstruction::CreateSlice( ShapeInference::InferSliceShape(input->shape(), start_indices, - limit_indices) + limit_indices, strides) .ConsumeValueOrDie(), - input, start_indices, limit_indices)); + input, start_indices, limit_indices, strides)); } return input; @@ -354,6 +355,8 @@ bool PadInsertion::CanonicalizeBackwardInputConvolution( std::vector limit_indices( new_backward_conv->shape().dimensions().begin(), new_backward_conv->shape().dimensions().end()); + std::vector strides(new_backward_conv->shape().dimensions_size(), + 1LL); for (size_t i = 0; i < backward_conv->window().dimensions_size(); ++i) { int64 padding_low = backward_conv->window().dimensions(i).padding_low(); int64 padding_high = backward_conv->window().dimensions(i).padding_high(); @@ -373,13 +376,13 @@ bool PadInsertion::CanonicalizeBackwardInputConvolution( // Replace the old backward convolution with the slice. CHECK(ShapeUtil::Compatible( ShapeInference::InferSliceShape(new_backward_conv->shape(), start_indices, - limit_indices) + limit_indices, strides) .ConsumeValueOrDie(), backward_conv->shape())); TF_CHECK_OK(computation->ReplaceWithNewInstruction( backward_conv, HloInstruction::CreateSlice(backward_conv->shape(), new_backward_conv, - start_indices, limit_indices))); + start_indices, limit_indices, strides))); return true; } diff --git a/tensorflow/compiler/xla/service/hlo_constant_folding_test.cc b/tensorflow/compiler/xla/service/hlo_constant_folding_test.cc index a643bc4076..1c60b06ddd 100644 --- a/tensorflow/compiler/xla/service/hlo_constant_folding_test.cc +++ b/tensorflow/compiler/xla/service/hlo_constant_folding_test.cc @@ -147,6 +147,7 @@ TEST_F(HloConstantFoldingTest, Slice) { const int64 dimensions[] = {11, 8, 7, 5, 9}; const int64 slice_start[] = {4, 2, 3, 1, 5}; const int64 slice_limits[] = {10, 8, 6, 5, 9}; + const int64 slice_strides[] = {1, 1, 1, 1, 1}; TF_ASSIGN_OR_ASSERT_OK(auto literal, LiteralTestUtil::CreateRandomLiteral( ShapeUtil::MakeShape(F32, dimensions), 0.0, 1.0)); @@ -154,7 +155,7 @@ TEST_F(HloConstantFoldingTest, Slice) { HloInstruction::CreateConstant(std::move(literal))); Shape shape = ShapeUtil::MakeShape(F32, {6, 6, 3, 4, 4}); builder.AddInstruction(HloInstruction::CreateSlice( - shape, literal_instruction, slice_start, slice_limits)); + shape, literal_instruction, slice_start, slice_limits, slice_strides)); auto module = CreateNewModule(); auto computation = module->AddEntryComputation(builder.Build()); diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc index 7c60fc79ed..4b38ecd5de 100644 --- a/tensorflow/compiler/xla/service/hlo_instruction.cc +++ b/tensorflow/compiler/xla/service/hlo_instruction.cc @@ -306,11 +306,13 @@ HloInstruction::CreateCrossReplicaSum(const Shape& shape, /* static */ std::unique_ptr HloInstruction::CreateSlice( const Shape& shape, HloInstruction* operand, tensorflow::gtl::ArraySlice start_indices, - tensorflow::gtl::ArraySlice limit_indices) { + tensorflow::gtl::ArraySlice limit_indices, + tensorflow::gtl::ArraySlice strides) { auto instruction = WrapUnique(new HloInstruction(HloOpcode::kSlice, shape)); instruction->AppendOperand(operand); instruction->slice_starts_.assign(start_indices.begin(), start_indices.end()); instruction->slice_limits_.assign(limit_indices.begin(), limit_indices.end()); + instruction->slice_strides_.assign(strides.begin(), strides.end()); return instruction; } @@ -852,7 +854,8 @@ std::unique_ptr HloInstruction::CloneWithNewOperands( return CreateReshape(shape, new_operands[0]); case HloOpcode::kSlice: CHECK_EQ(new_operands.size(), 1); - return CreateSlice(shape, new_operands[0], slice_starts_, slice_limits_); + return CreateSlice(shape, new_operands[0], slice_starts_, slice_limits_, + slice_strides_); case HloOpcode::kDynamicSlice: return CreateDynamicSlice(shape, new_operands[0], new_operands[1], dynamic_slice_sizes_); @@ -1672,6 +1675,8 @@ string HloInstruction::ToCategory() const { case FusionKind::kConvBackwardFilter: case FusionKind::kConvBackwardInput: return "convolution fusion"; + case FusionKind::kCustom: + return "custom fusion"; } } @@ -2339,6 +2344,8 @@ string ToString(HloInstruction::FusionKind kind) { return "kConvBackwardFilter"; case HloInstruction::FusionKind::kConvBackwardInput: return "kConvBackwardInput"; + case HloInstruction::FusionKind::kCustom: + return "kCustom"; } } diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h index 5885261e8f..bf50542cc0 100644 --- a/tensorflow/compiler/xla/service/hlo_instruction.h +++ b/tensorflow/compiler/xla/service/hlo_instruction.h @@ -63,6 +63,9 @@ class HloInstruction { kTransposeDot, // Fused into a dot with transposed operands. kConvBackwardFilter, // Fused into a backward filter convolution. kConvBackwardInput, // Fused into a backward input convolution. + + kCustom, // Custom category for backend-specific fusions that + // do not match any of the more specific ones. }; ~HloInstruction(); @@ -174,7 +177,8 @@ class HloInstruction { static std::unique_ptr CreateSlice( const Shape& shape, HloInstruction* operand, tensorflow::gtl::ArraySlice start_indices, - tensorflow::gtl::ArraySlice limit_indices); + tensorflow::gtl::ArraySlice limit_indices, + tensorflow::gtl::ArraySlice strides); // Creates a slice instruction, where the first operand is sliced by // start indices specified in the second operand, and by size specfied in @@ -662,6 +666,15 @@ class HloInstruction { return slice_limits_; } + // Returns the stride in the given dimension for a slice node. + // + // Precondition: opcode() == HloOpcode::kSlice + int64 slice_stride(int64 dimension) const { + CHECK_EQ(HloOpcode::kSlice, opcode_); + return slice_strides_[dimension]; + } + const std::vector& slice_strides() const { return slice_strides_; } + // Returns the size of the slice in the given dimension for a dynamic // slice node. // @@ -907,6 +920,7 @@ class HloInstruction { // Describes the [begin, end) index range for a slice. std::vector slice_starts_; std::vector slice_limits_; + std::vector slice_strides_; // The bit sizes for a reduce-precision operation. int32 exponent_bits_; diff --git a/tensorflow/compiler/xla/service/hlo_rematerialization_test.cc b/tensorflow/compiler/xla/service/hlo_rematerialization_test.cc index 8a1e705711..1a861cd16b 100644 --- a/tensorflow/compiler/xla/service/hlo_rematerialization_test.cc +++ b/tensorflow/compiler/xla/service/hlo_rematerialization_test.cc @@ -67,7 +67,8 @@ class HloRematerializationTest : public HloTestBase { /*dimension=*/0)); auto slice_1 = builder.AddInstruction(HloInstruction::CreateSlice( vec1_shape_, concat_1, /*start_indices=*/{0}, - /*limit_indices=*/{1})); + /*limit_indices=*/{1}, + /*strides=*/{1})); auto concat_2 = builder.AddInstruction(HloInstruction::CreateConcatenate( ShapeUtil::MakeShape(xla::F32, {1025}), {bcast, slice_1}, /*dimension=*/0)); @@ -75,7 +76,8 @@ class HloRematerializationTest : public HloTestBase { // which is necessary to use this computation in a while. builder.AddInstruction(HloInstruction::CreateSlice(vec1_shape_, concat_2, /*start_indices=*/{0}, - /*limit_indices=*/{1})); + /*limit_indices=*/{1}, + /*strides=*/{1})); return builder.Build(); } @@ -103,7 +105,8 @@ class HloRematerializationTest : public HloTestBase { HloInstruction::CreateBroadcast(vec1024_shape_, param, {})); auto slice_1 = builder.AddInstruction( HloInstruction::CreateSlice(vec1_shape_, bcast, /*start_indices=*/{0}, - /*limit_indices=*/{1})); + /*limit_indices=*/{1}, + /*strides=*/{1})); auto while_inst = builder.AddInstruction(HloInstruction::CreateWhile( vec1_shape_, while_cond, while_body, slice_1)); auto concat = builder.AddInstruction(HloInstruction::CreateConcatenate( @@ -111,7 +114,8 @@ class HloRematerializationTest : public HloTestBase { /*dimension=*/0)); builder.AddInstruction(HloInstruction::CreateSlice(vec1_shape_, concat, /*start_indices=*/{0}, - /*limit_indices=*/{1})); + /*limit_indices=*/{1}, + /*strides=*/{1})); return builder.Build(); } @@ -353,7 +357,7 @@ TEST_F(HloRematerializationTest, InstructionRematerializedMultipleTimes) { /*dimension=*/0)); builder.AddInstruction(HloInstruction::CreateSlice( vec1024_shape_, concat, /*start_indices=*/{0}, - /*limit_indices=*/{1024})); + /*limit_indices=*/{1024}, /*slices=*/{1})); subcomputation = module->AddEmbeddedComputation(builder.Build()); } @@ -469,7 +473,7 @@ TEST_P(IndirectUseTest, IndirectUseNotRematerialized) { /*dimension=*/0)); builder.AddInstruction(HloInstruction::CreateSlice( vec1024_shape_, concat, /*start_indices=*/{0}, - /*limit_indices=*/{1024})); + /*limit_indices=*/{1024}, /*slices=*/{1})); subcomputation = module->AddEmbeddedComputation(builder.Build()); } diff --git a/tensorflow/compiler/xla/service/shape_inference.cc b/tensorflow/compiler/xla/service/shape_inference.cc index b332709995..5e4df9ddd6 100644 --- a/tensorflow/compiler/xla/service/shape_inference.cc +++ b/tensorflow/compiler/xla/service/shape_inference.cc @@ -1135,7 +1135,8 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( /* static */ StatusOr ShapeInference::InferSliceShape( const Shape& arg, tensorflow::gtl::ArraySlice starts, - tensorflow::gtl::ArraySlice limits) { + tensorflow::gtl::ArraySlice limits, + tensorflow::gtl::ArraySlice strides) { TF_RETURN_IF_ERROR(ExpectNotTupleOrOpaque(arg, "operand of slice")); VLOG(2) << tensorflow::strings::Printf( "slicing shape %s starts={%s} limits={%s}", @@ -1158,13 +1159,13 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( for (int64 dimension = 0; dimension < starts.size(); ++dimension) { int64 start_index = starts[dimension]; int64 limit_index = limits[dimension]; + int64 stride = strides[dimension]; if (start_index < 0) { return InvalidArgument("negative start index to slice: %lld", start_index); } - if (limit_index < 0) { - return InvalidArgument("negative limit index to slice: %lld", - limit_index); + if (stride == 0) { + return InvalidArgument("Zero stride"); } if (limit_index > arg.dimensions(dimension)) { return InvalidArgument( @@ -1172,18 +1173,21 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( "size (%lld)", limit_index, arg.dimensions(dimension)); } - if (start_index > limit_index) { - return InvalidArgument( - "limit index (%lld) must be greater or equal to " - "start index (%lld) in slice", - limit_index, start_index); - } VLOG(2) << tensorflow::strings::Printf("starts[%lld] = %lld", dimension, start_index); VLOG(2) << tensorflow::strings::Printf("limits[%lld] = %lld", dimension, limit_index); - - sizes.push_back(limits[dimension] - starts[dimension]); + if (stride > 0) { + if (start_index > limit_index) { + return InvalidArgument( + "limit index (%lld) must be greater or equal to " + "start index (%lld) in slice with positive stride", + limit_index, start_index); + } + sizes.push_back((limit_index - start_index + stride - 1) / stride); + } else { + return InvalidArgument("Negative strides not supported"); + } } return ShapeUtil::MakeShape(arg.element_type(), sizes); diff --git a/tensorflow/compiler/xla/service/shape_inference.h b/tensorflow/compiler/xla/service/shape_inference.h index 55c60e149d..42e4c7d39d 100644 --- a/tensorflow/compiler/xla/service/shape_inference.h +++ b/tensorflow/compiler/xla/service/shape_inference.h @@ -116,7 +116,8 @@ class ShapeInference { // e.g. slice f32[32x32] 0:16 0:16 -> f32[16x16] static StatusOr InferSliceShape( const Shape& arg, tensorflow::gtl::ArraySlice starts, - tensorflow::gtl::ArraySlice limits); + tensorflow::gtl::ArraySlice limits, + tensorflow::gtl::ArraySlice strides); // Infers the shape produced by a dynamic slice operation of size specified // in 'slice_sizes', with dynamic start indices shape 'start_indices_shape'. diff --git a/tensorflow/compiler/xla/service/shape_inference_test.cc b/tensorflow/compiler/xla/service/shape_inference_test.cc index 7cff042a48..8c731ae297 100644 --- a/tensorflow/compiler/xla/service/shape_inference_test.cc +++ b/tensorflow/compiler/xla/service/shape_inference_test.cc @@ -682,16 +682,43 @@ TEST_F(ReduceShapeInferenceTest, ErrorElementTypeVsApplyType) { TEST_F(ShapeInferenceTest, InferSliceShapeRank2) { Shape matrix_shape = ShapeUtil::MakeShape(F32, {128, 64}); auto inferred_status = - ShapeInference::InferSliceShape(matrix_shape, {32, 0}, {64, 64}); + ShapeInference::InferSliceShape(matrix_shape, {32, 0}, {64, 64}, {1, 1}); ASSERT_IS_OK(inferred_status.status()); Shape inferred = inferred_status.ValueOrDie(); ASSERT_TRUE(ShapeUtil::Equal(ShapeUtil::MakeShape(F32, {32, 64}), inferred)); } +TEST_F(ShapeInferenceTest, InferSliceShapeRank2WithStrides) { + Shape matrix_shape = ShapeUtil::MakeShape(F32, {128, 64}); + auto inferred_status = + ShapeInference::InferSliceShape(matrix_shape, {32, 0}, {64, 64}, {2, 4}); + ASSERT_IS_OK(inferred_status.status()); + Shape inferred = inferred_status.ValueOrDie(); + ASSERT_TRUE(ShapeUtil::Equal(ShapeUtil::MakeShape(F32, {16, 16}), inferred)); +} + +TEST_F(ShapeInferenceTest, InferSliceShapeRank2WithStridesNotIntegral) { + Shape matrix_shape = ShapeUtil::MakeShape(F32, {128, 64}); + auto inferred_status = + ShapeInference::InferSliceShape(matrix_shape, {15, 0}, {20, 13}, {2, 4}); + ASSERT_IS_OK(inferred_status.status()); + Shape inferred = inferred_status.ValueOrDie(); + ASSERT_TRUE(ShapeUtil::Equal(ShapeUtil::MakeShape(F32, {3, 4}), inferred)); +} + +TEST_F(ShapeInferenceTest, InferInvalidStride) { + Shape matrix_shape = ShapeUtil::MakeShape(F32, {128, 64}); + auto inferred_status = + ShapeInference::InferSliceShape(matrix_shape, {127, 0}, {129, 2}, {0, 1}); + ASSERT_FALSE(inferred_status.ok()); + ASSERT_EQ(tensorflow::error::INVALID_ARGUMENT, + inferred_status.status().code()); +} + TEST_F(ShapeInferenceTest, InferOobSliceShapeRank2) { Shape matrix_shape = ShapeUtil::MakeShape(F32, {128, 64}); auto inferred_status = - ShapeInference::InferSliceShape(matrix_shape, {127, 0}, {129, 2}); + ShapeInference::InferSliceShape(matrix_shape, {127, 0}, {129, 2}, {1, 1}); ASSERT_FALSE(inferred_status.ok()); ASSERT_EQ(tensorflow::error::INVALID_ARGUMENT, inferred_status.status().code()); @@ -700,7 +727,7 @@ TEST_F(ShapeInferenceTest, InferOobSliceShapeRank2) { TEST_F(ShapeInferenceTest, InferSliceShapeRank1) { Shape vector_shape = ShapeUtil::MakeShape(F32, {17}); auto inferred_status = - ShapeInference::InferSliceShape(vector_shape, {2}, {4}); + ShapeInference::InferSliceShape(vector_shape, {2}, {4}, {1}); ASSERT_TRUE(inferred_status.ok()); Shape inferred = inferred_status.ValueOrDie(); ASSERT_TRUE(ShapeUtil::Equal(inferred, ShapeUtil::MakeShape(F32, {2}))); diff --git a/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc b/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc index d25e5adee3..cd79e63caf 100644 --- a/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc +++ b/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc @@ -584,7 +584,7 @@ class FusionPointsToAnalysisTest : public TuplePointsToAnalysisTest { if (add_additional_gte0_user) { // Create 'slice' as an additional user of 'input'. auto slice = builder.AddInstruction( - HloInstruction::CreateSlice(update_shape, input, {0}, {3})); + HloInstruction::CreateSlice(update_shape, input, {0}, {3}, {1})); // Modify 'update' to take 'slice' output. update = builder.AddInstruction(HloInstruction::CreateBinary( update_shape, HloOpcode::kAdd, update, slice)); diff --git a/tensorflow/compiler/xla/service/user_computation.cc b/tensorflow/compiler/xla/service/user_computation.cc index 1f6e789379..92b8c7bb21 100644 --- a/tensorflow/compiler/xla/service/user_computation.cc +++ b/tensorflow/compiler/xla/service/user_computation.cc @@ -744,7 +744,8 @@ StatusOr UserComputation::AddSliceInstruction( Shape new_shape, ShapeInference::InferSliceShape( operand->output_shape(), AsInt64Slice(slice_request.start_indices()), - AsInt64Slice(slice_request.limit_indices()))); + AsInt64Slice(slice_request.limit_indices()), + AsInt64Slice(slice_request.stride()))); ComputationDataHandle handle = CreateComputationDataHandle(); @@ -2393,7 +2394,8 @@ void ComputationLowerer::Visit( hlo_instruction = add_instruction(HloInstruction::CreateSlice( request.output_shape(), operand, AsInt64Slice(slice_request.start_indices()), - AsInt64Slice(slice_request.limit_indices()))); + AsInt64Slice(slice_request.limit_indices()), + AsInt64Slice(slice_request.stride()))); break; } diff --git a/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc b/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc index bb7fbad000..024988743c 100644 --- a/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc +++ b/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc @@ -1853,7 +1853,7 @@ TEST_F(ArrayElementwiseOpTest, ImplictBroadcastInFusedExpressions) { auto x = builder.Parameter(0, x_literal->shape(), "x"); auto y = builder.Parameter(1, y_literal->shape(), "y"); - auto slice = builder.Slice(x, {1}, {2}); + auto slice = builder.Slice(x, {1}, {2}, {1}); builder.Sub(slice, y); ComputeAndCompareR1(&builder, {-2, -3}, {x_data.get(), y_data.get()}, diff --git a/tensorflow/compiler/xla/tests/dot_operation_test.cc b/tensorflow/compiler/xla/tests/dot_operation_test.cc index 7abef6a27b..63a630f9e5 100644 --- a/tensorflow/compiler/xla/tests/dot_operation_test.cc +++ b/tensorflow/compiler/xla/tests/dot_operation_test.cc @@ -365,9 +365,9 @@ XLA_TEST_F(DotOperationTest, BatchMatMul) { std::vector out_slices; for (int i = 0; i < 4; ++i) { // Slice off individual matrices and reshape to 2D tensors. - auto x_slice = builder.Slice(x_flat, {i, 0, 0}, {i + 1, 2, 2}); + auto x_slice = builder.Slice(x_flat, {i, 0, 0}, {i + 1, 2, 2}, {1, 1, 1}); x_slice = builder.Reshape(x_slice, {0, 1, 2}, {2, 2}); - auto y_slice = builder.Slice(y_flat, {i, 0, 0}, {i + 1, 2, 2}); + auto y_slice = builder.Slice(y_flat, {i, 0, 0}, {i + 1, 2, 2}, {1, 1, 1}); y_slice = builder.Reshape(y_slice, {0, 1, 2}, {2, 2}); auto out = builder.Dot(x_slice, y_slice); diff --git a/tensorflow/compiler/xla/tests/fusion_test.cc b/tensorflow/compiler/xla/tests/fusion_test.cc index c8b91eafc7..7803d234fd 100644 --- a/tensorflow/compiler/xla/tests/fusion_test.cc +++ b/tensorflow/compiler/xla/tests/fusion_test.cc @@ -210,7 +210,7 @@ XLA_TEST_F(FusionTest, Test) { HloInstruction::CreateTernary(ShapeUtil::MakeShape(F32, {2, 3}), HloOpcode::kSelect, const10, add8, const9)); auto slice12 = builder.AddInstruction(HloInstruction::CreateSlice( - ShapeUtil::MakeShape(F32, {2, 1}), select11, {0, 1}, {2, 2})); + ShapeUtil::MakeShape(F32, {2, 1}), select11, {0, 1}, {2, 2}, {1, 1})); // CreateFusionInstruction needs the `instructions_to_fuse` argument in // reverse topological order, so the first element in `instructions_to_fuse` // must be the root. diff --git a/tensorflow/compiler/xla/tests/multidimensional_slice_test.cc b/tensorflow/compiler/xla/tests/multidimensional_slice_test.cc index df3d4fa21d..56c15e5ff7 100644 --- a/tensorflow/compiler/xla/tests/multidimensional_slice_test.cc +++ b/tensorflow/compiler/xla/tests/multidimensional_slice_test.cc @@ -36,7 +36,7 @@ XLA_TEST_F(SliceTest, Slice2D) { ComputationBuilder builder(client_, "slice_2d"); auto original = builder.ConstantR2( {{1.0, 2.0, 3.0}, {4.0, 5.0, 6.0}, {7.0, 8.0, 9.0}, {10.0, 11.0, 12.0}}); - builder.Slice(original, {2, 1}, {4, 3}); + builder.Slice(original, {2, 1}, {4, 3}, {1, 1}); Array2D expected({{8.0f, 9.0f}, {11.0f, 12.0f}}); ComputeAndCompareR2(&builder, expected, {}, ErrorSpec(0.000001)); @@ -47,7 +47,7 @@ XLA_TEST_F(SliceTest, Slice3D) { Array3D array_3d( {{{1.0f, 2.0f}, {3.0f, 4.0f}}, {{5.0f, 6.0f}, {7.0f, 8.0f}}}); auto original = builder.ConstantR3FromArray3D(array_3d); - builder.Slice(original, {0, 0, 1}, {2, 1, 2}); + builder.Slice(original, {0, 0, 1}, {2, 1, 2}, {1, 1, 1}); Array3D expected_3d({{{2.0f}}, {{6.0f}}}); ComputeAndCompareR3(&builder, expected_3d, {}, ErrorSpec(0.000001)); diff --git a/tensorflow/compiler/xla/tests/params_test.cc b/tensorflow/compiler/xla/tests/params_test.cc index 2065e9e813..a7692fceb4 100644 --- a/tensorflow/compiler/xla/tests/params_test.cc +++ b/tensorflow/compiler/xla/tests/params_test.cc @@ -325,7 +325,7 @@ XLA_TEST_F(ParamsTest, R2_2x2_TryToPassReverseLayoutToParameter) { ComputationBuilder builder(client_, TestName()); auto input = builder.Parameter(0, original, "input"); // Use the slice operator to get an off-diagonal element. - builder.Slice(input, {0, 1}, {1, 2}); + builder.Slice(input, {0, 1}, {1, 2}, {1, 1}); std::unique_ptr data = client_->TransferToServer(*literal).ConsumeValueOrDie(); diff --git a/tensorflow/compiler/xla/tests/slice_test.cc b/tensorflow/compiler/xla/tests/slice_test.cc index 97120df0c5..5e7d475662 100644 --- a/tensorflow/compiler/xla/tests/slice_test.cc +++ b/tensorflow/compiler/xla/tests/slice_test.cc @@ -44,7 +44,7 @@ class SliceTest : public ClientLibraryTestBase { ComputationBuilder builder(client_, TestName()); auto original = builder.ConstantR1(constant); - builder.Slice(original, {2}, {4}); + builder.Slice(original, {2}, {4}, {1}); const std::vector expected = {static_cast(2), static_cast(3)}; @@ -55,7 +55,7 @@ class SliceTest : public ClientLibraryTestBase { XLA_TEST_F(SliceTest, SliceZeroToZeroF32) { ComputationBuilder builder(client_, TestName()); auto original = builder.ConstantR1({}); - builder.Slice(original, {0}, {0}); + builder.Slice(original, {0}, {0}, {1}); ComputeAndCompareR1(&builder, {}, {}); } @@ -64,7 +64,7 @@ XLA_TEST_F(SliceTest, SliceTenToZeroF32) { ComputationBuilder builder(client_, TestName()); std::vector constant(10, 0.3); auto original = builder.ConstantR1(constant); - builder.Slice(original, {7}, {7}); + builder.Slice(original, {7}, {7}, {1}); ComputeAndCompareR1(&builder, {}, {}); } @@ -87,7 +87,7 @@ TEST_F(SliceTest, SliceTenToTen) { ComputationBuilder builder(client_, TestName()); auto original = builder.ConstantR1(values); - builder.Slice(original, {0}, {10}); + builder.Slice(original, {0}, {10}, {1}); ComputeAndCompareR1(&builder, values, {}, ErrorSpec(0.000001)); } @@ -98,7 +98,7 @@ TEST_F(SliceTest, SliceLastFourOf1024) { ComputationBuilder builder(client_, TestName()); auto original = builder.ConstantR1(values); - builder.Slice(original, {1024 - 4}, {1024}); + builder.Slice(original, {1024 - 4}, {1024}, {1}); const std::vector expected = {1020, 1021, 1022, 1023}; ComputeAndCompareR1(&builder, expected, {}, ErrorSpec(0.000001)); @@ -112,7 +112,7 @@ TEST_F(SliceTest, DISABLED_SliceUnaligned1024In4096Values) { ComputationBuilder builder(client_, TestName()); auto original = builder.ConstantR1(values); - builder.Slice(original, {7}, {7 + 1024}); + builder.Slice(original, {7}, {7 + 1024}, {1}); std::vector expected(1024); std::iota(values.begin(), values.end(), 7.0); @@ -122,7 +122,7 @@ TEST_F(SliceTest, DISABLED_SliceUnaligned1024In4096Values) { XLA_TEST_F(SliceTest, Slice0x0to0x0F32) { ComputationBuilder builder(client_, TestName()); auto original = builder.ConstantR2FromArray2D(Array2D(0, 0)); - builder.Slice(original, {0, 0}, {0, 0}); + builder.Slice(original, {0, 0}, {0, 0}, {1, 1}); ComputeAndCompareR2(&builder, Array2D(0, 0), {}); } @@ -130,7 +130,7 @@ XLA_TEST_F(SliceTest, Slice0x0to0x0F32) { XLA_TEST_F(SliceTest, Slice0x20to0x5F32) { ComputationBuilder builder(client_, TestName()); auto original = builder.ConstantR2FromArray2D(Array2D(0, 20)); - builder.Slice(original, {0, 15}, {0, 20}); + builder.Slice(original, {0, 15}, {0, 20}, {1, 1}); ComputeAndCompareR2(&builder, Array2D(0, 5), {}); } @@ -138,7 +138,7 @@ XLA_TEST_F(SliceTest, Slice0x20to0x5F32) { XLA_TEST_F(SliceTest, Slice3x0to2x0F32) { ComputationBuilder builder(client_, TestName()); auto original = builder.ConstantR2FromArray2D(Array2D(3, 0)); - builder.Slice(original, {1, 0}, {3, 0}); + builder.Slice(original, {1, 0}, {3, 0}, {1, 1}); ComputeAndCompareR2(&builder, Array2D(2, 0), {}); } @@ -153,7 +153,7 @@ XLA_TEST_F(SliceTest, SliceQuadrantOf256x256) { ComputationBuilder builder(client_, TestName()); auto original = builder.ConstantR2FromArray2D(values); - builder.Slice(original, {128, 128}, {256, 256}); + builder.Slice(original, {128, 128}, {256, 256}, {1, 1}); Array2D expected(128, 128); for (int row = 0; row < 128; ++row) { @@ -171,7 +171,7 @@ TEST_F(SliceTest, Slice_1x4096_To_1x1024) { ComputationBuilder builder(client_, TestName()); auto original = builder.ConstantR2FromArray2D(values); - builder.Slice(original, {0, 3072}, {1, 4096}); + builder.Slice(original, {0, 3072}, {1, 4096}, {1, 1}); Array2D expected(1, 1024); std::iota(expected.data(), expected.data() + 1024, 3072.0); @@ -192,7 +192,7 @@ TEST_F(SliceTest, Slice_16x4_To_16x2) { } ComputationBuilder builder(client_, TestName()); auto original = builder.ConstantR2FromArray2D(values); - builder.Slice(original, {0, 0}, {16, 2}); + builder.Slice(original, {0, 0}, {16, 2}, {1, 1}); ComputeAndCompareR2(&builder, expected, {}, ErrorSpec(0.000001)); } @@ -204,7 +204,7 @@ TEST_F(SliceTest, SliceR4ThreeDimsMiddleMinor) { ReferenceUtil::Slice4D(values, {{1, 0, 8, 0}}, {{2, 2, 16, 128}}); ComputationBuilder builder(client_, TestName()); auto original = builder.ConstantR4FromArray4D(values); - builder.Slice(original, {1, 0, 8, 0}, {2, 2, 16, 128}); + builder.Slice(original, {1, 0, 8, 0}, {2, 2, 16, 128}, {1, 1, 1, 1}); ComputeAndCompareR4(&builder, *expected, {}, ErrorSpec(0.000001)); } @@ -213,6 +213,7 @@ struct R2Spec { int64 input_dim1; std::array slice_starts; std::array slice_limits; + std::array slice_strides; Layout layout; }; @@ -228,7 +229,7 @@ TEST_P(SliceR2Test, DoIt) { ComputationBuilder builder(client_, TestName()); auto a = builder.ConstantR2FromArray2D(input); - builder.Slice(a, spec.slice_starts, spec.slice_limits); + builder.Slice(a, spec.slice_starts, spec.slice_limits, spec.slice_strides); std::unique_ptr> expected = ReferenceUtil::Slice2D(input, spec.slice_starts, spec.slice_limits); @@ -239,19 +240,23 @@ TEST_P(SliceR2Test, DoIt) { INSTANTIATE_TEST_CASE_P( SliceR2TestInstantiation, SliceR2Test, ::testing::Values( - R2Spec {4, 12, {{0, 3}}, {{4, 6}}, LayoutUtil::MakeLayout({0, 1})}, - R2Spec {4, 12, {{0, 3}}, {{4, 6}}, LayoutUtil::MakeLayout({1, 0})}, - R2Spec {16, 4, {{0, 2}}, {{16, 4}}, LayoutUtil::MakeLayout({0, 1})}, - R2Spec {16, 4, {{0, 2}}, {{16, 4}}, LayoutUtil::MakeLayout({1, 0})}, - R2Spec {256, 400, {{0, 300}}, {{256, 400}}, + R2Spec {4, 12, {{0, 3}}, {{4, 6}}, {{1, 1}}, + LayoutUtil::MakeLayout({0, 1})}, + R2Spec {4, 12, {{0, 3}}, {{4, 6}}, {{1, 1}}, LayoutUtil::MakeLayout({1, 0})}, - R2Spec {500, 400, {{111, 123}}, {{300, 257}}, + R2Spec {16, 4, {{0, 2}}, {{16, 4}}, {{1, 1}}, + LayoutUtil::MakeLayout({0, 1})}, + R2Spec {16, 4, {{0, 2}}, {{16, 4}}, {{1, 1}}, LayoutUtil::MakeLayout({1, 0})}, - R2Spec {500, 400, {{111, 123}}, {{300, 400}}, + R2Spec {256, 400, {{0, 300}}, {{256, 400}}, {{1, 1}}, LayoutUtil::MakeLayout({1, 0})}, - R2Spec {384, 512, {{128, 256}}, {{256, 384}}, + R2Spec {500, 400, {{111, 123}}, {{300, 257}}, {{1, 1}}, LayoutUtil::MakeLayout({1, 0})}, - R2Spec {357, 512, {{111, 256}}, {{301, 384}}, + R2Spec {500, 400, {{111, 123}}, {{300, 400}}, {{1, 1}}, + LayoutUtil::MakeLayout({1, 0})}, + R2Spec {384, 512, {{128, 256}}, {{256, 384}}, {{1, 1}}, + LayoutUtil::MakeLayout({1, 0})}, + R2Spec {357, 512, {{111, 256}}, {{301, 384}}, {{1, 1}}, LayoutUtil::MakeLayout({1, 0})} ) ); diff --git a/tensorflow/compiler/xla/tests/while_test.cc b/tensorflow/compiler/xla/tests/while_test.cc index ccd2a95658..afa7d871c0 100644 --- a/tensorflow/compiler/xla/tests/while_test.cc +++ b/tensorflow/compiler/xla/tests/while_test.cc @@ -666,7 +666,8 @@ TEST_F(WhileTest, WhileWithPrngScalarResult) { auto build_condition = [this, v6s32](int count) { ComputationBuilder builder(client_, TestName()); auto prev = builder.Reshape( - builder.Slice(builder.Parameter(0, v6s32, "prev"), {0}, {1}), {0}, {}); + builder.Slice(builder.Parameter(0, v6s32, "prev"), {0}, {1}, {1}), {0}, + {}); builder.Gt(builder.ConstantR0(count), prev); return builder.Build().ConsumeValueOrDie(); }; diff --git a/tensorflow/compiler/xla/util.h b/tensorflow/compiler/xla/util.h index 42d5c1d155..31f0c3147e 100644 --- a/tensorflow/compiler/xla/util.h +++ b/tensorflow/compiler/xla/util.h @@ -195,16 +195,24 @@ bool IsPermutation(tensorflow::gtl::ArraySlice permutation, int64 rank); // 2. permutation.size() == input.size(). template