-rwxr-xr-x  configure  22
-rw-r--r--  tensorflow/compiler/jit/BUILD  4
-rw-r--r--  tensorflow/compiler/xla/service/BUILD  13
-rw-r--r--  tensorflow/compiler/xla/service/shape_inference.cc  6
-rw-r--r--  tensorflow/compiler/xla/tests/dynamic_ops_test.cc  17
-rw-r--r--  tensorflow/contrib/cmake/CMakeLists.txt  4
-rw-r--r--  tensorflow/contrib/cmake/external/lmdb.cmake  60
-rw-r--r--  tensorflow/contrib/cmake/patches/lmdb/CMakeLists.txt  26
-rw-r--r--  tensorflow/contrib/keras/python/keras/datasets/boston_housing.py  5
-rw-r--r--  tensorflow/contrib/keras/python/keras/layers/convolutional_test.py  23
-rw-r--r--  tensorflow/contrib/keras/python/keras/layers/pooling_test.py  21
-rw-r--r--  tensorflow/contrib/mpi/mpi_rendezvous_mgr.cc  2
-rw-r--r--  tensorflow/core/BUILD  17
-rw-r--r--  tensorflow/core/common_runtime/gpu/process_state.cc  2
-rw-r--r--  tensorflow/core/common_runtime/kernel_benchmark_testlib.cc  9
-rw-r--r--  tensorflow/core/common_runtime/kernel_benchmark_testlib.h  7
-rw-r--r--  tensorflow/core/common_runtime/shape_refiner_test.cc  2
-rw-r--r--  tensorflow/core/debug/debug_io_utils.cc  38
-rw-r--r--  tensorflow/core/debug/debug_io_utils.h  8
-rw-r--r--  tensorflow/core/debug/debug_io_utils_test.cc  28
-rw-r--r--  tensorflow/core/debug/grpc_session_debug_test.cc  10
-rw-r--r--  tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc  2
-rw-r--r--  tensorflow/core/distributed_runtime/worker.cc  2
-rw-r--r--  tensorflow/core/framework/shape_inference.cc  8
-rw-r--r--  tensorflow/core/framework/shape_inference.h  25
-rw-r--r--  tensorflow/core/graph/graph_constructor.cc  4
-rw-r--r--  tensorflow/core/grappler/costs/BUILD  8
-rw-r--r--  tensorflow/core/grappler/costs/graph_properties.cc  6
-rw-r--r--  tensorflow/core/grappler/costs/graph_properties.h  1
-rw-r--r--  tensorflow/core/grappler/costs/graph_properties_test.cc  48
-rw-r--r--  tensorflow/core/grappler/optimizers/layout_optimizer.cc  66
-rw-r--r--  tensorflow/core/grappler/optimizers/layout_optimizer_test.cc  83
-rw-r--r--  tensorflow/core/grappler/utils.cc  11
-rw-r--r--  tensorflow/core/grappler/utils.h  6
-rw-r--r--  tensorflow/core/kernels/BUILD  25
-rw-r--r--  tensorflow/core/kernels/cuda_solvers.cc  4
-rw-r--r--  tensorflow/core/kernels/debug_ops_test.cc  21
-rw-r--r--  tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc  260
-rw-r--r--  tensorflow/core/kernels/hexagon/i_graph_transfer_ops_definitions.h  2
-rw-r--r--  tensorflow/core/kernels/iterator_ops.cc  2
-rwxr-xr-x  tensorflow/core/kernels/lmdb_reader_op.cc  134
-rw-r--r--  tensorflow/core/kernels/matrix_band_part_op.cc  4
-rw-r--r--  tensorflow/core/kernels/matrix_band_part_op_gpu.cu.cc  1
-rw-r--r--  tensorflow/core/kernels/matrix_diag_op.cc  6
-rw-r--r--  tensorflow/core/kernels/matrix_diag_op_gpu.cu.cc  1
-rw-r--r--  tensorflow/core/kernels/matrix_set_diag_op.cc  21
-rw-r--r--  tensorflow/core/kernels/matrix_set_diag_op.h  17
-rw-r--r--  tensorflow/core/kernels/matrix_set_diag_op_gpu.cu.cc  1
-rw-r--r--  tensorflow/core/kernels/sendrecv_ops.cc  52
-rw-r--r--  tensorflow/core/kernels/sendrecv_ops.h  2
-rw-r--r--  tensorflow/core/kernels/sendrecv_ops_test.cc  74
-rw-r--r--  tensorflow/core/lib/io/inputbuffer_test.cc  6
-rw-r--r--  tensorflow/core/lib/io/zlib_inputstream.cc  18
-rw-r--r--  tensorflow/core/lib/lmdb/testdata/data.mdb  bin 0 -> 20480 bytes
-rw-r--r--  tensorflow/core/ops/array_ops_test.cc  6
-rw-r--r--  tensorflow/core/ops/io_ops.cc  15
-rw-r--r--  tensorflow/core/ops/ops.pbtxt  27
-rw-r--r--  tensorflow/python/debug/BUILD  16
-rw-r--r--  tensorflow/python/debug/lib/debug_data.py  809
-rw-r--r--  tensorflow/python/debug/lib/debug_data_test.py  140
-rw-r--r--  tensorflow/python/debug/lib/session_debug_multi_gpu_test.py  93
-rw-r--r--  tensorflow/python/debug/lib/session_debug_testlib.py  47
-rw-r--r--  tensorflow/python/framework/importer.py  1
-rw-r--r--  tensorflow/python/kernel_tests/BUILD  3
-rw-r--r--  tensorflow/python/kernel_tests/basic_gpu_test.py  2
-rw-r--r--  tensorflow/python/kernel_tests/diag_op_test.py  78
-rw-r--r--  tensorflow/python/kernel_tests/reader_ops_test.py  43
-rw-r--r--  tensorflow/python/layers/convolutional.py  40
-rw-r--r--  tensorflow/python/layers/pooling.py  17
-rw-r--r--  tensorflow/python/ops/hidden_ops.txt  1
-rw-r--r--  tensorflow/python/ops/io_ops.py  20
-rw-r--r--  tensorflow/python/ops/script_ops.py  5
-rw-r--r--  tensorflow/python/training/adam.py  10
-rw-r--r--  tensorflow/python/util/tf_should_use.py  93
-rw-r--r--  tensorflow/python/util/tf_should_use_test.py  33
-rw-r--r--  tensorflow/stream_executor/cuda/cuda_diagnostics.cc  2
-rw-r--r--  tensorflow/stream_executor/stream_executor_internal.h  18
-rw-r--r--  tensorflow/tensorboard/backend/application_test.py  3
-rw-r--r--  tensorflow/tensorboard/backend/event_processing/BUILD  12
-rw-r--r--  tensorflow/tensorboard/backend/event_processing/event_accumulator_test.py  10
-rw-r--r--  tensorflow/tensorboard/backend/event_processing/plugin_asset_util_test.py  119
-rw-r--r--  tensorflow/tensorboard/plugins/scalars/scalars_demo.py  4
-rw-r--r--  tensorflow/tensorboard/plugins/text/text_plugin.py  3
-rw-r--r--  tensorflow/tools/ci_build/Dockerfile.gpu  7
-rw-r--r--  tensorflow/tools/ci_build/Dockerfile.gpu_clang  2
-rwxr-xr-x  tensorflow/tools/ci_build/install/install_pip_packages.sh  3
-rwxr-xr-x  tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh  3
-rw-r--r--  tensorflow/tools/docker/Dockerfile.devel-gpu  2
-rw-r--r--  tensorflow/tools/docker/Dockerfile.gpu  2
-rw-r--r--  tensorflow/tools/docs/parser.py  2
-rw-r--r--  tensorflow/tools/lib_package/BUILD  2
-rw-r--r--  tensorflow/tools/pip_package/BUILD  1
-rw-r--r--  tensorflow/tools/pip_package/pip_smoke_test.py  1
-rw-r--r--  tensorflow/tools/pip_package/setup.py  1
-rw-r--r--  tensorflow/tools/test/system_info_lib.py  2
-rw-r--r--  tensorflow/workspace.bzl  11
-rw-r--r--  third_party/lmdb.BUILD  25
97 files changed, 2122 insertions, 857 deletions
diff --git a/configure b/configure
index a365fe14ed..3a26f73dfa 100755
--- a/configure
+++ b/configure
@@ -466,7 +466,7 @@ done
while true; do
# Configure the Cuda SDK version to use.
if [ -z "$TF_CUDA_VERSION" ]; then
- read -p "Please specify the CUDA SDK version you want to use, e.g. 7.0. [Leave empty to use system default]: " TF_CUDA_VERSION
+ read -p "Please specify the CUDA SDK version you want to use, e.g. 7.0. [Leave empty to default to CUDA 8.0]: " TF_CUDA_VERSION
fi
fromuser=""
@@ -509,7 +509,6 @@ while true; do
export CUDA_TOOLKIT_PATH
write_action_env_to_bazelrc "CUDA_TOOLKIT_PATH" "$CUDA_TOOLKIT_PATH"
export TF_CUDA_VERSION
- write_action_env_to_bazelrc "TF_CUDA_VERSION" "$TF_CUDA_VERSION"
break
fi
echo "Invalid path to CUDA $TF_CUDA_VERSION toolkit. ${CUDA_TOOLKIT_PATH}/${CUDA_RT_LIB_PATH} cannot be found"
@@ -522,6 +521,13 @@ while true; do
CUDA_TOOLKIT_PATH=""
done
+# Set default CUDA version if not set
+if [ -z "$TF_CUDA_VERSION" ]; then
+ TF_CUDA_VERSION="8.0"
+ export TF_CUDA_VERSION
+fi
+write_action_env_to_bazelrc "TF_CUDA_VERSION" "$TF_CUDA_VERSION"
+
# Set up which gcc nvcc should use as the host compiler
# No need to set this on Windows
while [[ "$TF_CUDA_CLANG" != "1" ]] && ! is_windows && true; do
@@ -555,7 +561,7 @@ done
while true; do
# Configure the cuDNN version to use.
if [ -z "$TF_CUDNN_VERSION" ]; then
- read -p "Please specify the cuDNN version you want to use. [Leave empty to use system default]: " TF_CUDNN_VERSION
+ read -p "Please specify the cuDNN version you want to use. [Leave empty to default to cuDNN 6.0]: " TF_CUDNN_VERSION
fi
fromuser=""
@@ -605,7 +611,6 @@ while true; do
CUDNN_PATH_FROM_LDCONFIG="$($LDCONFIG_BIN -p | sed -n 's/.*libcudnn.so .* => \(.*\)/\1/p')"
if [ -e "${CUDNN_PATH_FROM_LDCONFIG}${TF_CUDNN_EXT}" ]; then
export TF_CUDNN_VERSION
- write_action_env_to_bazelrc "TF_CUDNN_VERSION" "$TF_CUDNN_VERSION"
export CUDNN_INSTALL_PATH="$(dirname ${CUDNN_PATH_FROM_LDCONFIG})"
write_action_env_to_bazelrc "CUDNN_INSTALL_PATH" "$CUDNN_INSTALL_PATH"
break
@@ -626,6 +631,13 @@ while true; do
CUDNN_INSTALL_PATH=""
done
+# Set default CUDNN version if not set
+if [ -z "$TF_CUDNN_VERSION" ]; then
+ TF_CUDNN_VERSION="6"
+ export TF_CUDNN_VERSION
+fi
+write_action_env_to_bazelrc "TF_CUDNN_VERSION" "$TF_CUDNN_VERSION"
+
# Configure the compute capabilities that TensorFlow builds for.
# Since Cuda toolkit is not backward-compatible, this is not guaranteed to work.
while true; do
@@ -844,6 +856,4 @@ if [ "$TF_NEED_MPI" == "1" ]; then
fi
-# TODO(gunan): Remove once bazel correctly handles changes in remote repositories.
-bazel clean
echo "Configuration finished"
diff --git a/tensorflow/compiler/jit/BUILD b/tensorflow/compiler/jit/BUILD
index 277e3c9906..5f857191da 100644
--- a/tensorflow/compiler/jit/BUILD
+++ b/tensorflow/compiler/jit/BUILD
@@ -84,7 +84,7 @@ cc_library(
"//tensorflow/compiler/jit/kernels:xla_device_launch_op",
"//tensorflow/compiler/tf2xla:xla_compiler",
"//tensorflow/compiler/tf2xla/kernels:xla_ops",
- "//tensorflow/compiler/xla/service:cpu_plugin",
+ "//tensorflow/compiler/xla/service:cpu_plugin", # buildcleaner: keep
"//tensorflow/core:core_cpu_internal",
"//tensorflow/core:lib",
],
@@ -101,7 +101,7 @@ cc_library(
"//tensorflow/compiler/jit/kernels:xla_device_launch_op",
"//tensorflow/compiler/tf2xla:xla_compiler",
"//tensorflow/compiler/tf2xla/kernels:xla_ops",
- "//tensorflow/compiler/xla/service:gpu_plugin",
+ "//tensorflow/compiler/xla/service:gpu_plugin", # buildcleaner: keep
"//tensorflow/core:core_cpu_internal",
"//tensorflow/core:lib",
],
diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD
index 8fa7d044d1..3f93a7a511 100644
--- a/tensorflow/compiler/xla/service/BUILD
+++ b/tensorflow/compiler/xla/service/BUILD
@@ -679,7 +679,6 @@ cc_test(
srcs = ["buffer_liveness_test.cc"],
deps = [
":buffer_liveness",
- ":cpu_plugin",
":hlo",
"//tensorflow/compiler/xla:shape_util",
"//tensorflow/compiler/xla:types",
@@ -725,7 +724,6 @@ cc_test(
":call_graph",
":computation_tracker",
":copy_insertion",
- ":cpu_plugin",
":flatten_call_graph",
":hlo",
":hlo_ordering",
@@ -790,7 +788,6 @@ cc_test(
name = "hlo_ordering_test",
srcs = ["hlo_ordering_test.cc"],
deps = [
- ":cpu_plugin",
":hlo",
":hlo_ordering",
"//tensorflow/compiler/xla:shape_util",
@@ -860,7 +857,6 @@ cc_test(
srcs = ["algebraic_simplifier_test.cc"],
deps = [
":algebraic_simplifier",
- ":cpu_plugin",
":hlo",
":hlo_matchers",
":hlo_pass",
@@ -928,7 +924,6 @@ cc_test(
name = "inliner_test",
srcs = ["inliner_test.cc"],
deps = [
- ":cpu_plugin",
":hlo",
":hlo_matchers",
":inliner",
@@ -1085,7 +1080,6 @@ cc_test(
name = "hlo_computation_test",
srcs = ["hlo_computation_test.cc"],
deps = [
- ":cpu_plugin",
":hlo",
":hlo_matchers",
"//tensorflow/compiler/xla:literal_util",
@@ -1116,7 +1110,6 @@ cc_test(
name = "hlo_module_test",
srcs = ["hlo_module_test.cc"],
deps = [
- ":cpu_plugin",
":hlo",
"//tensorflow/compiler/xla:literal_util",
"//tensorflow/compiler/xla:shape_util",
@@ -1256,7 +1249,6 @@ cc_test(
srcs = ["copy_insertion_test.cc"],
deps = [
":copy_insertion",
- ":cpu_plugin",
":hlo",
":hlo_matchers",
":tuple_points_to_analysis",
@@ -1321,7 +1313,6 @@ cc_test(
name = "hlo_rematerialization_test",
srcs = ["hlo_rematerialization_test.cc"],
deps = [
- ":cpu_plugin",
":hlo",
":hlo_matchers",
":hlo_ordering",
@@ -1338,7 +1329,6 @@ cc_test(
name = "hlo_dce_test",
srcs = ["hlo_dce_test.cc"],
deps = [
- ":cpu_plugin",
":hlo",
":hlo_dce",
"//tensorflow/compiler/xla:literal_util",
@@ -1361,7 +1351,6 @@ cc_test(
deps = [
":algebraic_simplifier",
":computation_layout",
- ":cpu_plugin",
":hlo",
":hlo_matchers",
":layout_assignment",
@@ -1434,7 +1423,6 @@ cc_test(
name = "hlo_cse_test",
srcs = ["hlo_cse_test.cc"],
deps = [
- ":cpu_plugin",
":hlo",
":hlo_cse",
":hlo_matchers",
@@ -1470,7 +1458,6 @@ cc_test(
name = "hlo_constant_folding_test",
srcs = ["hlo_constant_folding_test.cc"],
deps = [
- ":cpu_plugin",
":hlo",
":hlo_constant_folding",
":hlo_matchers",
diff --git a/tensorflow/compiler/xla/service/shape_inference.cc b/tensorflow/compiler/xla/service/shape_inference.cc
index 0840fdf930..afe1a54d3e 100644
--- a/tensorflow/compiler/xla/service/shape_inference.cc
+++ b/tensorflow/compiler/xla/service/shape_inference.cc
@@ -1103,7 +1103,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
for (int64 dim = 0; dim < slice_sizes.size(); ++dim) {
const int64 input_dim_size = operand_shape.dimensions(dim);
const int64 slice_dim_size = slice_sizes[dim];
- if (slice_dim_size <= 0) {
+ if (slice_dim_size < 0) {
return InvalidArgument("negative size index to dynamic slice: %lld",
slice_dim_size);
}
@@ -1175,9 +1175,9 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
for (int64 dim = 0; dim < ShapeUtil::Rank(operand_shape); ++dim) {
const int64 input_dim_size = operand_shape.dimensions(dim);
const int64 update_dim_size = update_shape.dimensions(dim);
- if (update_dim_size <= 0) {
+ if (update_dim_size < 0) {
return InvalidArgument(
- "size index %lld to dynamic update slice must be > 0",
+ "size index %lld to dynamic update slice must be >= 0",
update_dim_size);
}
if (update_dim_size > input_dim_size) {
diff --git a/tensorflow/compiler/xla/tests/dynamic_ops_test.cc b/tensorflow/compiler/xla/tests/dynamic_ops_test.cc
index cdb4498f4e..ea7908f55c 100644
--- a/tensorflow/compiler/xla/tests/dynamic_ops_test.cc
+++ b/tensorflow/compiler/xla/tests/dynamic_ops_test.cc
@@ -58,6 +58,8 @@ class DynamicSliceTest : public ClientLibraryTestBase {
// Slice at dimension boundaries, but with sizes that cause indices to wrap.
RunR1<IndexT>({0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0}, {6}, {4},
{6.0, 7.0, 0.0, 1.0});
+ // Zero element slice.
+ RunR1<IndexT>({0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0}, {2}, {0}, {});
}
template <typename IndexT>
@@ -75,6 +77,12 @@ class DynamicSliceTest : public ClientLibraryTestBase {
RunR2<IndexT>({{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}, {7.0f, 8.0f, 9.0f}},
{1, 1}, {3, 3},
{{5.0f, 6.0f, 4.0f}, {8.0f, 9.0f, 7.0f}, {2.0f, 3.0f, 1.0f}});
+ // Zero element slice: 2x0.
+ RunR2<IndexT>({{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}, {7.0f, 8.0f, 9.0f}},
+ {0, 0}, {2, 0}, {{}, {}});
+ // Zero element slice: 0x2.
+ RunR2<IndexT>({{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}, {7.0f, 8.0f, 9.0f}},
+ {0, 0}, {0, 2}, Array2D<float>(0, 2));
}
template <typename IndexT>
@@ -200,6 +208,10 @@ class DynamicUpdateSliceTest : public ClientLibraryTestBase {
RunR1<IndexT>({0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0},
{8.0, 9.0, 10.0}, {6},
{0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 8.0, 9.0});
+ // Zero-sized update.
+ RunR1<IndexT>({0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0},
+ {}, {2},
+ {0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0});
// clang-format on
}
@@ -226,6 +238,11 @@ class DynamicUpdateSliceTest : public ClientLibraryTestBase {
{{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}, {7.0f, 8.0f, 9.0f}},
{{10.0f, 11.0f}}, {2, 2},
{{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}, {7.0f, 8.0f, 10.0f}});
+ // Zero-sized update.
+ RunR2<IndexT>(
+ {{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}, {7.0f, 8.0f, 9.0f}},
+ {{}}, {2, 1},
+ {{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}, {7.0f, 8.0f, 9.0f}});
// clang-format on
}
diff --git a/tensorflow/contrib/cmake/CMakeLists.txt b/tensorflow/contrib/cmake/CMakeLists.txt
index 2c770e75dd..9ffe08eded 100644
--- a/tensorflow/contrib/cmake/CMakeLists.txt
+++ b/tensorflow/contrib/cmake/CMakeLists.txt
@@ -113,6 +113,7 @@ include(zlib)
include(gif)
include(png)
include(jpeg)
+include(lmdb)
include(eigen)
include(gemmlowp)
include(jsoncpp)
@@ -129,6 +130,7 @@ set(tensorflow_EXTERNAL_LIBRARIES
${gif_STATIC_LIBRARIES}
${png_STATIC_LIBRARIES}
${jpeg_STATIC_LIBRARIES}
+ ${lmdb_STATIC_LIBRARIES}
${jsoncpp_STATIC_LIBRARIES}
${farmhash_STATIC_LIBRARIES}
${fft2d_STATIC_LIBRARIES}
@@ -140,6 +142,7 @@ set(tensorflow_EXTERNAL_DEPENDENCIES
gif_copy_headers_to_destination
png_copy_headers_to_destination
jpeg_copy_headers_to_destination
+ lmdb_copy_headers_to_destination
jsoncpp
farmhash_copy_headers_to_destination
highwayhash_copy_headers_to_destination
@@ -158,6 +161,7 @@ include_directories(
${gif_INCLUDE_DIR}
${png_INCLUDE_DIR}
${jpeg_INCLUDE_DIR}
+ ${lmdb_INCLUDE_DIR}
${eigen_INCLUDE_DIRS}
${gemmlowp_INCLUDE_DIR}
${jsoncpp_INCLUDE_DIR}
diff --git a/tensorflow/contrib/cmake/external/lmdb.cmake b/tensorflow/contrib/cmake/external/lmdb.cmake
new file mode 100644
index 0000000000..28ec833bab
--- /dev/null
+++ b/tensorflow/contrib/cmake/external/lmdb.cmake
@@ -0,0 +1,60 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+include (ExternalProject)
+
+set(lmdb_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/external/lmdb)
+set(lmdb_URL http://mirror.bazel.build/github.com/LMDB/lmdb/archive/LMDB_0.9.19.tar.gz)
+set(lmdb_HASH SHA256=108532fb94c6f227558d45be3f3347b52539f0f58290a7bb31ec06c462d05326)
+set(lmdb_BUILD ${CMAKE_BINARY_DIR}/lmdb/src/lmdb)
+set(lmdb_INSTALL ${CMAKE_BINARY_DIR}/lmdb/install)
+
+ExternalProject_Add(lmdb
+ PREFIX lmdb
+ URL ${lmdb_URL}
+ URL_HASH ${lmdb_HASH}
+ PATCH_COMMAND ${CMAKE_COMMAND} -E copy_if_different
+ ${CMAKE_CURRENT_SOURCE_DIR}/patches/lmdb/CMakeLists.txt ${lmdb_BUILD}
+ INSTALL_DIR ${lmdb_INSTALL}
+ DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
+ CMAKE_CACHE_ARGS
+ -DCMAKE_BUILD_TYPE:STRING=Release
+ -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF
+ -DCMAKE_INSTALL_PREFIX:STRING=${lmdb_INSTALL}
+ -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
+)
+
+if(WIN32)
+ set(lmdb_STATIC_LIBRARIES ${lmdb_INSTALL}/lib/lmdb.lib)
+else()
+ set(lmdb_STATIC_LIBRARIES ${lmdb_INSTALL}/lib/liblmdb.a)
+endif()
+
+set(lmdb_HEADERS
+ "${lmdb_INSTALL}/include/lmdb.h"
+ "${lmdb_INSTALL}/include/midl.h"
+)
+
+## put lmdb includes in the directory where they are expected
+add_custom_target(lmdb_create_destination_dir
+ COMMAND ${CMAKE_COMMAND} -E make_directory ${lmdb_INCLUDE_DIR}
+ DEPENDS lmdb)
+
+add_custom_target(lmdb_copy_headers_to_destination
+ DEPENDS lmdb_create_destination_dir)
+
+foreach(header_file ${lmdb_HEADERS})
+ add_custom_command(TARGET lmdb_copy_headers_to_destination PRE_BUILD
+ COMMAND ${CMAKE_COMMAND} -E copy_if_different ${header_file} ${lmdb_INCLUDE_DIR}/)
+endforeach()
diff --git a/tensorflow/contrib/cmake/patches/lmdb/CMakeLists.txt b/tensorflow/contrib/cmake/patches/lmdb/CMakeLists.txt
new file mode 100644
index 0000000000..19fa607a10
--- /dev/null
+++ b/tensorflow/contrib/cmake/patches/lmdb/CMakeLists.txt
@@ -0,0 +1,26 @@
+cmake_minimum_required(VERSION 2.8.3)
+
+project(liblmdb)
+
+set(LIBLMDB_SRCS
+ "libraries/liblmdb/mdb.c"
+ "libraries/liblmdb/midl.c"
+)
+
+set(LIBLMDB_INCLUDES
+ "libraries/liblmdb/lmdb.h"
+ "libraries/liblmdb/midl.h"
+)
+
+include_directories("${CMAKE_CURRENT_SOURCE_DIR}")
+
+add_library(lmdb ${LIBLMDB_SRCS})
+
+install(TARGETS lmdb
+ RUNTIME DESTINATION bin COMPONENT RuntimeLibraries
+ LIBRARY DESTINATION lib COMPONENT RuntimeLibraries
+ ARCHIVE DESTINATION lib COMPONENT Development)
+
+foreach(LIBLMDB_INCLUDE ${LIBLMDB_INCLUDES})
+ install(FILES ${LIBLMDB_INCLUDE} DESTINATION include COMPONENT Development)
+endforeach()
diff --git a/tensorflow/contrib/keras/python/keras/datasets/boston_housing.py b/tensorflow/contrib/keras/python/keras/datasets/boston_housing.py
index ac0f0fd422..36b20451ff 100644
--- a/tensorflow/contrib/keras/python/keras/datasets/boston_housing.py
+++ b/tensorflow/contrib/keras/python/keras/datasets/boston_housing.py
@@ -37,8 +37,11 @@ def load_data(path='boston_housing.npz', seed=113, test_split=0.2):
Tuple of Numpy arrays: `(x_train, y_train), (x_test, y_test)`.
"""
assert 0 <= test_split < 1
+ fh = 'f553886a1f8d56431e820c5b82552d9d95cfcb96d1e678153f8839538947dff5'
path = get_file(
- path, origin='https://s3.amazonaws.com/keras-datasets/boston_housing.npz')
+ path,
+ origin='https://s3.amazonaws.com/keras-datasets/boston_housing.npz',
+ file_hash=fh)
f = np.load(path)
x = f['x']
y = f['y']
diff --git a/tensorflow/contrib/keras/python/keras/layers/convolutional_test.py b/tensorflow/contrib/keras/python/keras/layers/convolutional_test.py
index 590cdc6504..7e567d3fb0 100644
--- a/tensorflow/contrib/keras/python/keras/layers/convolutional_test.py
+++ b/tensorflow/contrib/keras/python/keras/layers/convolutional_test.py
@@ -114,16 +114,19 @@ class Conv2DTest(test.TestCase):
continue
with self.test_session(use_gpu=True):
- testing_utils.layer_test(
- keras.layers.Conv2D,
- kwargs={
- 'filters': filters,
- 'kernel_size': kernel_size,
- 'padding': padding,
- 'strides': strides,
- 'data_format': 'channels_first'
- },
- input_shape=(num_samples, stack_size, num_row, num_col))
+ # Only runs on GPU with CUDA, channels_first is not supported on CPU.
+ # TODO(b/62340061): Support channels_first on CPU.
+ if test.is_gpu_available(cuda_only=True):
+ testing_utils.layer_test(
+ keras.layers.Conv2D,
+ kwargs={
+ 'filters': filters,
+ 'kernel_size': kernel_size,
+ 'padding': padding,
+ 'strides': strides,
+ 'data_format': 'channels_first'
+ },
+ input_shape=(num_samples, stack_size, num_row, num_col))
def test_convolution_2d_regularization(self):
# regularizers
diff --git a/tensorflow/contrib/keras/python/keras/layers/pooling_test.py b/tensorflow/contrib/keras/python/keras/layers/pooling_test.py
index bbf695a1ba..d8a6a1673b 100644
--- a/tensorflow/contrib/keras/python/keras/layers/pooling_test.py
+++ b/tensorflow/contrib/keras/python/keras/layers/pooling_test.py
@@ -100,15 +100,18 @@ class Pooling2DTest(test.TestCase):
'padding': 'valid',
'pool_size': (3, 3)},
input_shape=(3, 5, 6, 4))
- testing_utils.layer_test(
- keras.layers.AveragePooling2D,
- kwargs={
- 'strides': (1, 1),
- 'padding': 'valid',
- 'pool_size': (2, 2),
- 'data_format': 'channels_first'
- },
- input_shape=(3, 4, 5, 6))
+ # Only runs on GPU with CUDA, channels_first is not supported on CPU.
+ # TODO(b/62340061): Support channels_first on CPU.
+ if test.is_gpu_available(cuda_only=True):
+ testing_utils.layer_test(
+ keras.layers.AveragePooling2D,
+ kwargs={
+ 'strides': (1, 1),
+ 'padding': 'valid',
+ 'pool_size': (2, 2),
+ 'data_format': 'channels_first'
+ },
+ input_shape=(3, 4, 5, 6))
class Pooling3DTest(test.TestCase):
diff --git a/tensorflow/contrib/mpi/mpi_rendezvous_mgr.cc b/tensorflow/contrib/mpi/mpi_rendezvous_mgr.cc
index 2e9544b0b5..e97e8d0163 100644
--- a/tensorflow/contrib/mpi/mpi_rendezvous_mgr.cc
+++ b/tensorflow/contrib/mpi/mpi_rendezvous_mgr.cc
@@ -257,7 +257,7 @@ void MPIRendezvousMgr::AddRequest(RecvTensorRequest request,
this->QueueSendRequest(req);
// Wait for the notification that indicates the tensor has been
- // succesfully transmitted to the remote process. Only needed if we
+ // successfully transmitted to the remote process. Only needed if we
// have not parsed the tensor to proto
if (doOptimalTransfer) mpi_send_call->n_.WaitForNotification();
}; // done_cb
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 1cc712c2e1..b76226f2c9 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -1894,7 +1894,6 @@ cc_library(
":framework",
":lib",
":lib_internal",
- ":test",
],
)
@@ -1909,7 +1908,7 @@ cc_library(
deps = [
":lib",
":lib_internal",
- ":test",
+ ":test", # buildcleaner: keep
"//tensorflow/core/platform/default/build_config:test_main",
],
alwayslink = 1,
@@ -2887,6 +2886,20 @@ filegroup(
)
filegroup(
+ name = "lmdb_testdata",
+ testonly = 1,
+ srcs = [
+ # A simple key-value store:
+ # 0 : 'a'
+ # 1 : 'b'
+ # ...
+ # 9 : 'j'
+ "lib/lmdb/testdata/data.mdb",
+ ],
+ visibility = ["//visibility:public"],
+)
+
+filegroup(
name = "example_parser_configuration_testdata",
srcs = [
"example/testdata/parse_example_graph_def.pbtxt",
diff --git a/tensorflow/core/common_runtime/gpu/process_state.cc b/tensorflow/core/common_runtime/gpu/process_state.cc
index 0e21e37fd3..7a1c10d900 100644
--- a/tensorflow/core/common_runtime/gpu/process_state.cc
+++ b/tensorflow/core/common_runtime/gpu/process_state.cc
@@ -127,7 +127,7 @@ Allocator* ProcessState::GetGPUAllocator(const GPUOptions& options, int gpu_id,
gpu_platform->ExecutorForDevice(gpu_id).ValueOrDie();
int bus_id = se->GetDeviceDescription().numa_node();
if (bus_id >= 0 && bus_id < static_cast<int64>(gpu_visitors_.size())) {
- for (auto v : gpu_visitors_[bus_id]) {
+ for (const auto& v : gpu_visitors_[bus_id]) {
gpu_allocators_[gpu_id]->AddAllocVisitor(v);
}
}
diff --git a/tensorflow/core/common_runtime/kernel_benchmark_testlib.cc b/tensorflow/core/common_runtime/kernel_benchmark_testlib.cc
index 7b5cc1c5cb..4a5b88d5fd 100644
--- a/tensorflow/core/common_runtime/kernel_benchmark_testlib.cc
+++ b/tensorflow/core/common_runtime/kernel_benchmark_testlib.cc
@@ -39,7 +39,8 @@ namespace tensorflow {
namespace test {
Benchmark::Benchmark(const string& device, Graph* g,
- const SessionOptions* options, Graph* init) {
+ const SessionOptions* options, Graph* init,
+ Rendezvous* rendez) {
SessionOptions default_options;
if (!options) {
options = &default_options;
@@ -61,7 +62,11 @@ Benchmark::Benchmark(const string& device, Graph* g,
pool_->Schedule(closure);
};
- rendez_ = NewLocalRendezvous();
+ if (rendez == nullptr) {
+ rendez_ = NewLocalRendezvous();
+ } else {
+ rendez_ = rendez;
+ }
const int graph_def_version = g->versions().producer();
diff --git a/tensorflow/core/common_runtime/kernel_benchmark_testlib.h b/tensorflow/core/common_runtime/kernel_benchmark_testlib.h
index 278a6b3f9f..3a7b3a5ace 100644
--- a/tensorflow/core/common_runtime/kernel_benchmark_testlib.h
+++ b/tensorflow/core/common_runtime/kernel_benchmark_testlib.h
@@ -35,10 +35,11 @@ namespace test {
class Benchmark {
public:
- // "device" must be either "cpu" or "gpu". Takes ownership of "g"
- // and "init".
+ // "device" must be either "cpu" or "gpu". Takes ownership of "g",
+ // "init", and one reference on "rendez" (if not null).
Benchmark(const string& device, Graph* g,
- const SessionOptions* options = nullptr, Graph* init = nullptr);
+ const SessionOptions* options = nullptr, Graph* init = nullptr,
+ Rendezvous* rendez = nullptr);
~Benchmark();
// Executes the graph for "iters" times.
diff --git a/tensorflow/core/common_runtime/shape_refiner_test.cc b/tensorflow/core/common_runtime/shape_refiner_test.cc
index 466b779e9b..b705bd74c2 100644
--- a/tensorflow/core/common_runtime/shape_refiner_test.cc
+++ b/tensorflow/core/common_runtime/shape_refiner_test.cc
@@ -557,7 +557,7 @@ TEST(ShapeRefinerTest, ConstantValueAsShape_PackInt32) {
.Finalize(root.graph(), &result));
ShapeRefiner m(TF_GRAPH_DEF_VERSION, OpRegistry::Global());
- for (auto input : inputs) {
+ for (const auto& input : inputs) {
TF_ASSERT_OK(m.AddNode(input.node()));
}
TF_ASSERT_OK(m.AddNode(pack.node()));
diff --git a/tensorflow/core/debug/debug_io_utils.cc b/tensorflow/core/debug/debug_io_utils.cc
index 25847a20a4..54366ce249 100644
--- a/tensorflow/core/debug/debug_io_utils.cc
+++ b/tensorflow/core/debug/debug_io_utils.cc
@@ -119,6 +119,18 @@ Status PublishEncodedGraphDefInChunks(const string& encoded_graph_def,
} // namespace
+// static
+const char* const DebugIO::kMetadataFilePrefix = "_tfdbg_";
+
+// static
+const char* const DebugIO::kCoreMetadataTag = "core_metadata_";
+
+// static
+const char* const DebugIO::kDeviceTag = "device_";
+
+// static
+const char* const DebugIO::kGraphTag = "graph_";
+
DebugNodeKey::DebugNodeKey(const string& device_name, const string& node_name,
const int32 output_slot, const string& debug_op)
: device_name(device_name),
@@ -126,7 +138,8 @@ DebugNodeKey::DebugNodeKey(const string& device_name, const string& node_name,
output_slot(output_slot),
debug_op(debug_op),
debug_node_name(
- strings::StrCat(node_name, ":", output_slot, ":", debug_op)) {}
+ strings::StrCat(node_name, ":", output_slot, ":", debug_op)),
+ device_path(DeviceNameToDevicePath(device_name)) {}
Status ReadEventFromFile(const string& dump_file_path, Event* event) {
Env* env(Env::Default());
@@ -158,6 +171,15 @@ Status ReadEventFromFile(const string& dump_file_path, Event* event) {
}
// static
+const string DebugNodeKey::DeviceNameToDevicePath(const string& device_name) {
+ return strings::StrCat(
+ DebugIO::kMetadataFilePrefix, DebugIO::kDeviceTag,
+ str_util::StringReplace(
+ str_util::StringReplace(device_name, ":", "_", true), "/", ",",
+ true));
+}
+
+// static
const char* const DebugIO::kFileURLScheme = "file://";
// static
const char* const DebugIO::kGrpcURLScheme = "grpc://";
@@ -236,7 +258,8 @@ Status DebugIO::PublishDebugMetadata(
const string core_metadata_path = AppendTimestampToFilePath(
io::JoinPath(
dump_root_dir,
- strings::StrCat("_tfdbg_core_metadata_", "sessionrun",
+ strings::StrCat(DebugIO::kMetadataFilePrefix,
+ DebugIO::kCoreMetadataTag, "sessionrun",
strings::Printf("%.14lld", session_run_index))),
Env::Default()->NowMicros());
status.Update(DebugFileIO::DumpEventProtoToFile(
@@ -325,10 +348,11 @@ Status DebugIO::PublishGraph(const Graph& graph, const string& device_name,
Status status = Status::OK();
for (const string& debug_url : debug_urls) {
if (debug_url.find(kFileURLScheme) == 0) {
- const string dump_root_dir = debug_url.substr(strlen(kFileURLScheme));
- // TODO(cais): (b/38325442) Serialize the GraphDef to a directory that
- // reflects the device name.
- const string file_name = strings::StrCat("_tfdbg_graph_", now_micros);
+ const string dump_root_dir =
+ io::JoinPath(debug_url.substr(strlen(kFileURLScheme)),
+ DebugNodeKey::DeviceNameToDevicePath(device_name));
+ const string file_name = strings::StrCat(DebugIO::kMetadataFilePrefix,
+ DebugIO::kGraphTag, now_micros);
status.Update(
DebugFileIO::DumpEventProtoToFile(event, dump_root_dir, file_name));
@@ -437,7 +461,7 @@ string DebugFileIO::GetDumpFilePath(const string& dump_root_dir,
const DebugNodeKey& debug_node_key,
const uint64 wall_time_us) {
return AppendTimestampToFilePath(
- io::JoinPath(dump_root_dir,
+ io::JoinPath(dump_root_dir, debug_node_key.device_path,
strings::StrCat(debug_node_key.node_name, "_",
debug_node_key.output_slot, "_",
debug_node_key.debug_op)),
diff --git a/tensorflow/core/debug/debug_io_utils.h b/tensorflow/core/debug/debug_io_utils.h
index f3e76cc0ee..69d8c7bd4e 100644
--- a/tensorflow/core/debug/debug_io_utils.h
+++ b/tensorflow/core/debug/debug_io_utils.h
@@ -44,11 +44,14 @@ struct DebugNodeKey {
DebugNodeKey(const string& device_name, const string& node_name,
const int32 output_slot, const string& debug_op);
+ static const string DeviceNameToDevicePath(const string& device_name);
+
const string device_name;
const string node_name;
const int32 output_slot;
const string debug_op;
const string debug_node_name;
+ const string device_path;
};
class DebugIO {
@@ -136,6 +139,11 @@ class DebugIO {
static Status CloseDebugURL(const string& debug_url);
+ static const char* const kMetadataFilePrefix;
+ static const char* const kCoreMetadataTag;
+ static const char* const kDeviceTag;
+ static const char* const kGraphTag;
+
static const char* const kFileURLScheme;
static const char* const kGrpcURLScheme;
};
diff --git a/tensorflow/core/debug/debug_io_utils_test.cc b/tensorflow/core/debug/debug_io_utils_test.cc
index 406bcae07f..77039aa4ab 100644
--- a/tensorflow/core/debug/debug_io_utils_test.cc
+++ b/tensorflow/core/debug/debug_io_utils_test.cc
@@ -19,6 +19,7 @@ limitations under the License.
#include "tensorflow/core/lib/core/notification.h"
#include "tensorflow/core/lib/core/status_test_util.h"
#include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/lib/io/path.h"
#include "tensorflow/core/lib/strings/str_util.h"
#include "tensorflow/core/platform/env.h"
#include "tensorflow/core/util/event.pb.h"
@@ -47,6 +48,18 @@ class DebugIOUtilsTest : public ::testing::Test {
std::unique_ptr<Tensor> tensor_b_;
};
+TEST_F(DebugIOUtilsTest, ConstructDebugNodeKey) {
+ DebugNodeKey debug_node_key("/job:worker/replica:1/task:0/gpu:2",
+ "hidden_1/MatMul", 0, "DebugIdentity");
+ EXPECT_EQ("/job:worker/replica:1/task:0/gpu:2", debug_node_key.device_name);
+ EXPECT_EQ("hidden_1/MatMul", debug_node_key.node_name);
+ EXPECT_EQ(0, debug_node_key.output_slot);
+ EXPECT_EQ("DebugIdentity", debug_node_key.debug_op);
+ EXPECT_EQ("hidden_1/MatMul:0:DebugIdentity", debug_node_key.debug_node_name);
+ EXPECT_EQ("_tfdbg_device_,job_worker,replica_1,task_0,gpu_2",
+ debug_node_key.device_path);
+}
+
TEST_F(DebugIOUtilsTest, DumpFloatTensorToFileSunnyDay) {
Initialize();
@@ -138,10 +151,14 @@ TEST_F(DebugIOUtilsTest, DumpTensorToFileCannotCreateDirectory) {
// First, create the file at the path.
const string test_dir = testing::TmpDir();
- const string txt_file_name = strings::StrCat(test_dir, "/baz");
-
- if (!env_->FileExists(test_dir).ok()) {
- ASSERT_TRUE(env_->CreateDir(test_dir).ok());
+ const string kDeviceName = "/job:localhost/replica:0/task:0/cpu:0";
+ const DebugNodeKey kDebugNodeKey(kDeviceName, "baz/tensor_a", 0,
+ "DebugIdentity");
+ const string txt_file_dir =
+ io::JoinPath(test_dir, DebugNodeKey::DeviceNameToDevicePath(kDeviceName));
+ const string txt_file_name = io::JoinPath(txt_file_dir, "baz");
+ if (!env_->FileExists(txt_file_dir).ok()) {
+ ASSERT_TRUE(env_->RecursivelyCreateDir(txt_file_dir).ok());
}
ASSERT_EQ(error::Code::NOT_FOUND, env_->FileExists(txt_file_name).code());
@@ -157,8 +174,7 @@ TEST_F(DebugIOUtilsTest, DumpTensorToFileCannotCreateDirectory) {
// Second, try to dump the tensor to a path that requires "baz" to be a
// directory, which should lead to an error.
- const DebugNodeKey kDebugNodeKey("/job:localhost/replica:0/task:0/cpu:0",
- "baz/tensor_a", 0, "DebugIdentity");
+
const uint64 wall_time = env_->NowMicros();
string dump_file_name;
diff --git a/tensorflow/core/debug/grpc_session_debug_test.cc b/tensorflow/core/debug/grpc_session_debug_test.cc
index 6c68729410..9584d8b9f3 100644
--- a/tensorflow/core/debug/grpc_session_debug_test.cc
+++ b/tensorflow/core/debug/grpc_session_debug_test.cc
@@ -187,7 +187,10 @@ TEST_F(GrpcSessionDebugTest, FileDebugURL) {
IsSingleFloatValue(outputs[0], 4.0);
std::vector<Tensor> dumped_tensors;
- LoadTensorDumps("n", &dumped_tensors);
+ LoadTensorDumps(io::JoinPath(DebugNodeKey::DeviceNameToDevicePath(
+ cluster->devices()[0].name()),
+ "n"),
+ &dumped_tensors);
if (i == 0 || i == 5) {
ASSERT_EQ(0, dumped_tensors.size());
@@ -267,7 +270,10 @@ TEST_F(GrpcSessionDebugTest, MultiDevices_String) {
TF_CHECK_OK(session->Close());
std::vector<Tensor> dumped_tensors;
- LoadTensorDumps("n", &dumped_tensors);
+ LoadTensorDumps(
+ io::JoinPath(DebugNodeKey::DeviceNameToDevicePath(a_dev.name()),
+ "n"),
+ &dumped_tensors);
ASSERT_EQ(1, dumped_tensors.size());
ASSERT_EQ(TensorShape({2, 2}), dumped_tensors[0].shape());
for (size_t i = 0; i < 4; ++i) {
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc b/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc
index 2b1a47a93f..1e8c30bad5 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc
@@ -173,7 +173,7 @@ class GrpcRemoteWorker : public WorkerInterface {
}
IssueRequest(req_copy ? req_copy : request, response, recvtensor_,
- std::move(*cb_to_use), call_opts);
+ *cb_to_use, call_opts);
}
void LoggingAsync(const LoggingRequest* request, LoggingResponse* response,
diff --git a/tensorflow/core/distributed_runtime/worker.cc b/tensorflow/core/distributed_runtime/worker.cc
index 32ea0cfaa4..16e450abb0 100644
--- a/tensorflow/core/distributed_runtime/worker.cc
+++ b/tensorflow/core/distributed_runtime/worker.cc
@@ -258,7 +258,7 @@ void Worker::DoPartialRunGraph(CallOptions* opts,
}
}
if (request->is_last_partial_run()) {
- partial_run_mgr_.PartialRunDone(step_id, std::move(finish), s);
+ partial_run_mgr_.PartialRunDone(step_id, finish, s);
} else {
finish(s);
}
diff --git a/tensorflow/core/framework/shape_inference.cc b/tensorflow/core/framework/shape_inference.cc
index a4597080f0..1f9e98551f 100644
--- a/tensorflow/core/framework/shape_inference.cc
+++ b/tensorflow/core/framework/shape_inference.cc
@@ -885,14 +885,6 @@ Status InferenceContext::AttachContext(const Status& status) {
strings::StrCat(status.error_message(), error_context));
}
-ShapeHandle InferenceContext::input_handle_shape(int idx) {
- if (input_handle_shapes_and_types_[idx] == nullptr) {
- input_handle_shapes_and_types_[idx].reset(
- new std::vector<ShapeAndType>{{UnknownShape(), DT_INVALID}});
- }
- return (*input_handle_shapes_and_types_[idx])[0].shape;
-}
-
bool InferenceContext::MergeHandleShapesAndTypes(
const std::vector<ShapeAndType>& shapes_and_types,
std::vector<ShapeAndType>* to_update) {
diff --git a/tensorflow/core/framework/shape_inference.h b/tensorflow/core/framework/shape_inference.h
index baeab93e30..119bed4071 100644
--- a/tensorflow/core/framework/shape_inference.h
+++ b/tensorflow/core/framework/shape_inference.h
@@ -491,37 +491,12 @@ class InferenceContext {
return input_handle_shapes_and_types_[idx].get();
}
- // DEPRECATED: use input_handle_shapes_and_types.
- ShapeHandle input_handle_shape(int idx);
- // DEPRECATED: use input_handle_shapes_and_types.
- DataType input_handle_dtype(int idx) const {
- if (input_handle_shapes_and_types_[idx] == nullptr) {
- return DT_INVALID;
- } else {
- DCHECK_EQ(input_handle_shapes_and_types_[idx]->size(), 1);
- return (*input_handle_shapes_and_types_[idx])[0].dtype;
- }
- }
-
void set_output_handle_shapes_and_types(
int idx, const std::vector<ShapeAndType>& shapes_and_types) {
output_handle_shapes_and_types_[idx].reset(
new std::vector<ShapeAndType>(shapes_and_types));
}
- // DEPRECATED: use output_handle_shapes_and_types.
- ShapeHandle output_handle_shape(int idx) {
- return output_handle_shapes_and_types_[idx] == nullptr
- ? UnknownShape()
- : (*output_handle_shapes_and_types_[idx])[0].shape;
- }
- // DEPRECATED: use output_handle_shapes_and_types.
- DataType output_handle_dtype(int idx) const {
- return output_handle_shapes_and_types_[idx] == nullptr
- ? DT_INVALID
- : (*output_handle_shapes_and_types_[idx])[0].dtype;
- }
-
// Note that shape functions should usually call MakeShapeFromShapeTensor,
// as it does more analysis to provide partial shapes.
//
diff --git a/tensorflow/core/graph/graph_constructor.cc b/tensorflow/core/graph/graph_constructor.cc
index 28ebf7e8c3..8d433fc8c6 100644
--- a/tensorflow/core/graph/graph_constructor.cc
+++ b/tensorflow/core/graph/graph_constructor.cc
@@ -435,7 +435,7 @@ Status GraphConstructor::MakeNode(const NodeDef& node_def, Node** node) {
Status GraphConstructor::ValidateShape(Node* node) {
if (!opts_.importing) return Status::OK();
TF_RETURN_IF_ERROR(refiner_->AddNode(node));
- // For nodes with the _output_shapes atttribute, override the shape.
+ // For nodes with the _output_shapes attribute, override the shape.
std::vector<TensorShapeProto> shape_attrs;
const char* kAttrName = "_output_shapes";
if (!GetNodeAttr(node->attrs(), kAttrName, &shape_attrs).ok()) {
@@ -481,7 +481,7 @@ Status GraphConstructor::ValidateShape(Node* node) {
"MutableHashTableOfTensors", "Mutex", "CuckooTable", "IndexTable",
"WholeFileReader", "TextLineReader", "FixedLengthRecordReader",
"TFRecordReader", "IdentityReader", "RefSwitch", "RefEnter",
- "RefNextIteration", "RefMerge", "RefIdentity",
+ "RefNextIteration", "RefMerge", "RefIdentity", "LMDBReader",
// To be removed after 2017/04/24.
"ConditionalAccumulator", "SparseConditionalAccumulator", "Table",
};
diff --git a/tensorflow/core/grappler/costs/BUILD b/tensorflow/core/grappler/costs/BUILD
index d40e66cd16..bebd5d7bce 100644
--- a/tensorflow/core/grappler/costs/BUILD
+++ b/tensorflow/core/grappler/costs/BUILD
@@ -185,21 +185,13 @@ cc_test(
name = "virtual_scheduler_test",
srcs = ["virtual_scheduler_test.cc"],
deps = [
- ":graph_properties",
- ":utils",
":virtual_placer",
":virtual_scheduler",
"//tensorflow/cc:cc_ops",
- "//tensorflow/core:framework",
- "//tensorflow/core:protos_all_cc",
"//tensorflow/core:tensorflow",
"//tensorflow/core:test",
"//tensorflow/core:test_main",
- "//tensorflow/core/grappler:grappler_item",
- "//tensorflow/core/grappler:utils",
- "//tensorflow/core/grappler/clusters:utils",
"//tensorflow/core/grappler/clusters:virtual_cluster",
- "//tensorflow/core/grappler/costs:cost_estimator",
],
)
diff --git a/tensorflow/core/grappler/costs/graph_properties.cc b/tensorflow/core/grappler/costs/graph_properties.cc
index 721cf535b9..5c0515803d 100644
--- a/tensorflow/core/grappler/costs/graph_properties.cc
+++ b/tensorflow/core/grappler/costs/graph_properties.cc
@@ -218,9 +218,13 @@ Status GraphProperties::InferDynamically(Cluster* cluster) {
TF_RETURN_IF_ERROR(
cluster->Run(item_.graph, item_.feed, item_.fetch, &metadata));
+ return InferFromCostGraph(metadata.cost_graph());
+}
+
+Status GraphProperties::InferFromCostGraph(const CostGraphDef& cost_graph) {
std::unordered_map<string, const CostGraphDef::Node*> name_to_cost;
std::unordered_map<string, const NodeDef*> name_to_node; // Empty
- for (auto& node : metadata.cost_graph().node()) {
+ for (auto& node : cost_graph.node()) {
name_to_cost[node.name()] = &node;
std::vector<OpInfo::TensorProperties> output_properties;
diff --git a/tensorflow/core/grappler/costs/graph_properties.h b/tensorflow/core/grappler/costs/graph_properties.h
index d2b466175b..f2f6ad19a7 100644
--- a/tensorflow/core/grappler/costs/graph_properties.h
+++ b/tensorflow/core/grappler/costs/graph_properties.h
@@ -36,6 +36,7 @@ class GraphProperties {
Status InferStatically();
Status InferDynamically(Cluster* cluster);
+ Status InferFromCostGraph(const CostGraphDef& cost_graph);
bool HasOutputProperties(const string& name) const;
std::vector<OpInfo::TensorProperties> GetInputProperties(
diff --git a/tensorflow/core/grappler/costs/graph_properties_test.cc b/tensorflow/core/grappler/costs/graph_properties_test.cc
index 6eca083184..bff5f7acc5 100644
--- a/tensorflow/core/grappler/costs/graph_properties_test.cc
+++ b/tensorflow/core/grappler/costs/graph_properties_test.cc
@@ -149,6 +149,54 @@ TEST_F(GraphPropertiesTest, DynamicProperties) {
}
}
+TEST_F(GraphPropertiesTest, Variables) {
+ GrapplerItem item;
+ TF_CHECK_OK(NodeDefBuilder("Var", "Variable")
+ .Attr("dtype", DT_FLOAT)
+ .Attr("shape", TensorShape({3, 7}))
+ .Finalize(item.graph.add_node()));
+ item.fetch.push_back("Var");
+
+ Tensor initial_val(DT_FLOAT, TensorShape({3, 7}));
+ TF_CHECK_OK(NodeDefBuilder("InitialVal", "Const")
+ .Attr("dtype", DT_FLOAT)
+ .Attr("value", initial_val)
+ .Finalize(item.graph.add_node()));
+ TF_CHECK_OK(NodeDefBuilder("InitVar", "Assign")
+ .Input("Var", 0, DT_FLOAT_REF)
+ .Input("InitialVal", 0, DT_FLOAT)
+ .Finalize(item.graph.add_node()));
+ item.init_ops.push_back("InitVar");
+
+ {
+ GraphProperties static_properties(item);
+ TF_CHECK_OK(static_properties.InferStatically());
+
+ const auto props = static_properties.GetOutputProperties("Var");
+ EXPECT_EQ(1, props.size());
+ const OpInfo::TensorProperties& prop = props[0];
+ EXPECT_EQ(DT_FLOAT_REF, prop.dtype());
+ EXPECT_FALSE(prop.shape().unknown_rank());
+ EXPECT_EQ(2, prop.shape().dim_size());
+ EXPECT_EQ(3, prop.shape().dim(0).size());
+ EXPECT_EQ(7, prop.shape().dim(1).size());
+ }
+ {
+ TF_CHECK_OK(cluster_->Initialize(item));
+ GraphProperties dynamic_properties(item);
+ TF_CHECK_OK(dynamic_properties.InferDynamically(cluster_.get()));
+
+ const auto props = dynamic_properties.GetOutputProperties("Var");
+ EXPECT_EQ(1, props.size());
+ const OpInfo::TensorProperties& prop = props[0];
+ EXPECT_EQ(DT_FLOAT_REF, prop.dtype());
+ EXPECT_FALSE(prop.shape().unknown_rank());
+ EXPECT_EQ(2, prop.shape().dim_size());
+ EXPECT_EQ(3, prop.shape().dim(0).size());
+ EXPECT_EQ(7, prop.shape().dim(1).size());
+ }
+}
+
TEST_F(GraphPropertiesTest, VarHandles) {
GrapplerItem item;
TF_CHECK_OK(NodeDefBuilder("Var", "VarHandleOp")
diff --git a/tensorflow/core/grappler/optimizers/layout_optimizer.cc b/tensorflow/core/grappler/optimizers/layout_optimizer.cc
index c42218e447..eb1e521fce 100644
--- a/tensorflow/core/grappler/optimizers/layout_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/layout_optimizer.cc
@@ -245,20 +245,20 @@ class NodeProcessor {
virtual Status AddLayoutTransposeToInputs() {
std::vector<int> input_pos = GetInputPos();
for (const auto& pos : input_pos) {
- string node_name_NHWCToNCHW = strings::StrCat(
- kTransposeNHWCToNCHW, "-", node_->name(), "-", node_->input(pos));
+ string base_name = strings::StrCat(node_->name(), "-", node_->input(pos));
+ string node_name =
+ AddPrefixToNodeName(base_name, kTransposeNHWCToNCHW, "-");
auto input_node = node_map_->GetNode(node_->input(pos));
int output_pos = NodePosition(node_->input(pos));
TF_RETURN_IF_ERROR(HasAttribute(*node_, "T"));
TF_RETURN_IF_ERROR(HasAttribute(*input_node, "_output_shapes"));
AddNodeTranspose(
- node_name_NHWCToNCHW, node_->input(pos), node_->attr().at("T").type(),
+ node_name, node_->input(pos), node_->attr().at("T").type(),
input_node->attr().at("_output_shapes").list().shape(output_pos),
true);
- node_map_->UpdateOutput(node_->input(pos), node_->name(),
- node_name_NHWCToNCHW);
- node_map_->AddOutput(node_name_NHWCToNCHW, node_->name());
- *node_->mutable_input(pos) = node_name_NHWCToNCHW;
+ node_map_->UpdateOutput(node_->input(pos), node_->name(), node_name);
+ node_map_->AddOutput(node_name, node_->name());
+ *node_->mutable_input(pos) = node_name;
}
return Status::OK();
}
@@ -266,9 +266,10 @@ class NodeProcessor {
virtual Status AddLayoutTransposeToOutputs() {
auto outputs = node_map_->GetOutputs(node_->name());
for (const auto& output : outputs) {
- string node_name_NCHWToNHWC = strings::StrCat(
- kTransposeNCHWToNHWC, "-", node_->name(), "-", output->name());
- // TODO (yaozhang): handle the rare case where node A is connected to more
+ string base_name = strings::StrCat(node_->name(), "-", output->name());
+ string node_name =
+ AddPrefixToNodeName(base_name, kTransposeNCHWToNHWC, "-");
+ // TODO(yaozhang): handle the rare case where node A is connected to more
// than one input of node B.
auto it = std::find_if(output->mutable_input()->begin(),
output->mutable_input()->end(),
@@ -290,13 +291,12 @@ class NodeProcessor {
}
TF_RETURN_IF_ERROR(HasAttribute(*node_, "T"));
TF_RETURN_IF_ERROR(HasAttribute(*node_, "_output_shapes"));
- AddNodeTranspose(
- node_name_NCHWToNHWC, node_->name(), node_->attr().at("T").type(),
- node_->attr().at("_output_shapes").list().shape(0), false);
- *it = node_name_NCHWToNHWC;
- node_map_->UpdateOutput(node_->name(), output->name(),
- node_name_NCHWToNHWC);
- node_map_->AddOutput(node_name_NCHWToNHWC, output->name());
+ AddNodeTranspose(node_name, node_->name(), node_->attr().at("T").type(),
+ node_->attr().at("_output_shapes").list().shape(0),
+ false);
+ *it = node_name;
+ node_map_->UpdateOutput(node_->name(), output->name(), node_name);
+ node_map_->AddOutput(node_name, output->name());
}
return Status::OK();
}
@@ -468,7 +468,13 @@ class Conv2DBackpropInputProcessor : public Conv2DProcessor {
Status CustomizedProcessing() override {
NodeDef* node = node_map_->GetNode(node_->input(0));
- return UpdateAttrValue(node);
+ NodeDef* added_node = graph_->add_node();
+ *added_node = *node;
+ string node_name =
+ AddPrefixToNodeName(node->name(), "LayoutOptimizer", "-");
+ added_node->set_name(node_name);
+ node_map_->AddNode(node_name, added_node);
+ return UpdateAttrValue(added_node);
}
};
@@ -621,9 +627,11 @@ class BinaryOpProcessor : public AgnosticNodeProcessor {
Status CustomizedProcessing() override {
if (is_4d_with_vector_) {
- string suffix = strings::StrCat("-", node_->name(), "-", node_->input(1));
- string reshape_node_name = strings::StrCat(kReshapeNHWCToNCHW, suffix);
- string shape_const_node_name = strings::StrCat(kReshapeConst, suffix);
+ string base_name = strings::StrCat(node_->name(), "-", node_->input(1));
+ string reshape_node_name =
+ AddPrefixToNodeName(base_name, kReshapeNHWCToNCHW, "-");
+ string shape_const_node_name =
+ AddPrefixToNodeName(base_name, kReshapeConst, "-");
auto input_node = node_map_->GetNode(node_->input(1));
TF_RETURN_IF_ERROR(HasAttribute(*input_node, "_output_shapes"));
int vector_size =
@@ -710,15 +718,15 @@ class SliceProcessor : public AgnosticNodeProcessor {
Status CustomizedProcessing() override {
// Skip the first input, which is the data to be sliced.
for (int i = 1; i < node_->input_size(); i++) {
- string node_name_NHWCToNCHW =
- strings::StrCat(kPermVecNHWCToNCHW, "-", node_->name(), "-input", i);
+ string base_name = strings::StrCat(node_->name(), "-input", i);
+ string node_name =
+ AddPrefixToNodeName(base_name, kPermVecNHWCToNCHW, "-");
TF_RETURN_IF_ERROR(HasAttribute(*node_, "Index"));
- AddNodePermVec(node_name_NHWCToNCHW, node_->input(i),
+ AddNodePermVec(node_name, node_->input(i),
node_->attr().at("Index").type(), true);
- node_map_->UpdateOutput(node_->input(i), node_->name(),
- node_name_NHWCToNCHW);
- node_map_->AddOutput(node_name_NHWCToNCHW, node_->name());
- *node_->mutable_input(i) = node_name_NHWCToNCHW;
+ node_map_->UpdateOutput(node_->input(i), node_->name(), node_name);
+ node_map_->AddOutput(node_name, node_->name());
+ *node_->mutable_input(i) = node_name;
}
return Status::OK();
}
@@ -795,7 +803,7 @@ class SliceProcessorConcatOffset : public AgnosticNodeProcessor {
"input 1 of ConcatOffset"));
}
// Need to process if the channel is at dimension 3, which indicates the
- // NHWC format is being used. As mutiple Slice nodes may share the same
+ // NHWC format is being used. As multiple Slice nodes may share the same
// ConcatOffset node, the NHWC to NCHW conversion may have already
// been performed when processing other Slice nodes.
TF_RETURN_IF_ERROR(HasAttribute(*axis_node, "value"));
diff --git a/tensorflow/core/grappler/optimizers/layout_optimizer_test.cc b/tensorflow/core/grappler/optimizers/layout_optimizer_test.cc
index be38ca1a69..566ea1d87d 100644
--- a/tensorflow/core/grappler/optimizers/layout_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/layout_optimizer_test.cc
@@ -36,8 +36,8 @@ void AddOutputShape(Node* node, const TensorShape& shape) {
class LayoutOptimizerTest : public ::testing::Test {
protected:
- Output SimpleConv(tensorflow::Scope* s, int input_size, int filter_size,
- const string& padding) {
+ Output SimpleConv2D(tensorflow::Scope* s, int input_size, int filter_size,
+ const string& padding) {
int batch_size = 128;
int input_height = input_size;
int input_width = input_size;
@@ -65,11 +65,80 @@ class LayoutOptimizerTest : public ::testing::Test {
AddOutputShape(conv.node(), input_shape);
return conv;
}
+
+ Output SimpleConv2DBackpropInput(tensorflow::Scope* s, int input_size,
+ int filter_size, const string& padding) {
+ int batch_size = 128;
+ int input_height = input_size;
+ int input_width = input_size;
+ int input_depth = 3;
+ int filter_count = 2;
+ int stride = 1;
+ TensorShape input_sizes_shape({4});
+ Tensor input_data(DT_INT32, input_sizes_shape);
+ test::FillValues<int>(&input_data,
+ {batch_size, input_height, input_width, input_depth});
+ Output input_sizes =
+ ops::Const(s->WithOpName("InputSizes"), Input::Initializer(input_data));
+ AddOutputShape(input_sizes.node(), input_sizes_shape);
+
+ TensorShape filter_shape(
+ {filter_size, filter_size, input_depth, filter_count});
+ Tensor filter_data(DT_FLOAT, filter_shape);
+ test::FillIota<float>(&filter_data, 1.0f);
+ Output filter =
+ ops::Const(s->WithOpName("Filter"), Input::Initializer(filter_data));
+ AddOutputShape(filter.node(), filter_shape);
+
+ int output_height = input_height;
+ int output_width = input_width;
+ TensorShape output_shape(
+ {batch_size, output_height, output_width, filter_count});
+ Tensor output_data(DT_FLOAT, output_shape);
+ test::FillIota<float>(&output_data, 1.0f);
+ Output output =
+ ops::Const(s->WithOpName("Output"), Input::Initializer(output_data));
+ AddOutputShape(output.node(), output_shape);
+
+ Output conv_backprop_input = ops::Conv2DBackpropInput(
+ s->WithOpName("Conv2DBackpropInput"), input_sizes, filter, output,
+ {1, stride, stride, 1}, padding);
+ TensorShape input_shape(
+ {batch_size, input_height, input_width, input_depth});
+ AddOutputShape(conv_backprop_input.node(), input_shape);
+ return conv_backprop_input;
+ }
+
+ Tensor GetAttrValue(const NodeDef& node) {
+ Tensor tensor;
+ CHECK(tensor.FromProto(node.attr().at({"value"}).tensor()));
+ return tensor;
+ }
};
+TEST_F(LayoutOptimizerTest, Conv2DBackpropInput) {
+ tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+ auto conv = SimpleConv2DBackpropInput(&s, 7, 2, "SAME");
+ Output fetch = ops::Identity(s.WithOpName("Fetch"), {conv});
+ GrapplerItem item;
+ TF_CHECK_OK(s.ToGraphDef(&item.graph));
+ LayoutOptimizer optimizer;
+ optimizer.set_num_gpus(1);
+ GraphDef output;
+ Status status = optimizer.Optimize(nullptr, item, &output);
+ NodeMap node_map(&output);
+ auto input_sizes_node = node_map.GetNode(
+ AddPrefixToNodeName("InputSizes", "LayoutOptimizer", "-"));
+ CHECK(input_sizes_node);
+ auto input_sizes = GetAttrValue(*input_sizes_node);
+ Tensor input_sizes_expected(DT_INT32, {4});
+ test::FillValues<int>(&input_sizes_expected, {128, 3, 7, 7});
+ test::ExpectTensorEqual<int>(input_sizes_expected, input_sizes);
+}
+
TEST_F(LayoutOptimizerTest, FilterSizeIsOne) {
tensorflow::Scope s = tensorflow::Scope::NewRootScope();
- auto conv = SimpleConv(&s, 2, 1, "SAME");
+ auto conv = SimpleConv2D(&s, 2, 1, "SAME");
Output fetch = ops::Identity(s.WithOpName("Fetch"), {conv});
GrapplerItem item;
TF_CHECK_OK(s.ToGraphDef(&item.graph));
@@ -84,7 +153,7 @@ TEST_F(LayoutOptimizerTest, FilterSizeIsOne) {
TEST_F(LayoutOptimizerTest, FilterSizeNotOne) {
tensorflow::Scope s = tensorflow::Scope::NewRootScope();
- auto conv = SimpleConv(&s, 2, 1, "SAME");
+ auto conv = SimpleConv2D(&s, 2, 1, "SAME");
Output fetch = ops::Identity(s.WithOpName("Fetch"), {conv});
GrapplerItem item;
TF_CHECK_OK(s.ToGraphDef(&item.graph));
@@ -99,7 +168,7 @@ TEST_F(LayoutOptimizerTest, FilterSizeNotOne) {
TEST_F(LayoutOptimizerTest, EqualSizeWithValidPadding) {
tensorflow::Scope s = tensorflow::Scope::NewRootScope();
- auto conv = SimpleConv(&s, 2, 2, "VALID");
+ auto conv = SimpleConv2D(&s, 2, 2, "VALID");
Output fetch = ops::Identity(s.WithOpName("Fetch"), {conv});
GrapplerItem item;
TF_CHECK_OK(s.ToGraphDef(&item.graph));
@@ -114,7 +183,7 @@ TEST_F(LayoutOptimizerTest, EqualSizeWithValidPadding) {
TEST_F(LayoutOptimizerTest, EqualSizeWithSamePadding) {
tensorflow::Scope s = tensorflow::Scope::NewRootScope();
- auto conv = SimpleConv(&s, 2, 2, "SAME");
+ auto conv = SimpleConv2D(&s, 2, 2, "SAME");
Output fetch = ops::Identity(s.WithOpName("Fetch"), {conv});
GrapplerItem item;
TF_CHECK_OK(s.ToGraphDef(&item.graph));
@@ -129,7 +198,7 @@ TEST_F(LayoutOptimizerTest, EqualSizeWithSamePadding) {
TEST_F(LayoutOptimizerTest, NotEqualSizeWithValidPadding) {
tensorflow::Scope s = tensorflow::Scope::NewRootScope();
- auto conv = SimpleConv(&s, 2, 3, "VALID");
+ auto conv = SimpleConv2D(&s, 2, 3, "VALID");
Output fetch = ops::Identity(s.WithOpName("Fetch"), {conv});
GrapplerItem item;
TF_CHECK_OK(s.ToGraphDef(&item.graph));
diff --git a/tensorflow/core/grappler/utils.cc b/tensorflow/core/grappler/utils.cc
index b7a04f4423..cc3cbdacb4 100644
--- a/tensorflow/core/grappler/utils.cc
+++ b/tensorflow/core/grappler/utils.cc
@@ -98,13 +98,18 @@ int NodePosition(const string& name) {
return position;
}
-string AddPrefixToNodeName(const string& name, const string& prefix) {
+string AddPrefixToNodeName(const string& name, const string& prefix,
+ const string& delimiter) {
if (!name.empty()) {
if (name[0] == '^') {
- return strings::StrCat("^", prefix, "/", name.substr(1));
+ return strings::StrCat("^", prefix, delimiter, name.substr(1));
}
}
- return strings::StrCat(prefix, "/", name);
+ return strings::StrCat(prefix, delimiter, name);
+}
+
+string AddPrefixToNodeName(const string& name, const string& prefix) {
+ return AddPrefixToNodeName(name, prefix, "/");
}
bool ExecuteWithTimeout(std::function<void()> fn, const int64 timeout_in_ms,
diff --git a/tensorflow/core/grappler/utils.h b/tensorflow/core/grappler/utils.h
index 5a3c0614e7..f86fc608c5 100644
--- a/tensorflow/core/grappler/utils.h
+++ b/tensorflow/core/grappler/utils.h
@@ -60,7 +60,11 @@ int NodePosition(const string& name);
// Returns the node name and position in a single call.
string ParseNodeName(const string& name, int* position);
-// Add a prefix to a node name
+// Add a prefix to a node name with a custom delimiter.
+string AddPrefixToNodeName(const string& name, const string& prefix,
+ const string& delimiter);
+
+// Add a prefix to a node name.
string AddPrefixToNodeName(const string& name, const string& prefix);
// Executes a 'fn' in the 'thread_pool'. The method waits for the configured
diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index 1e5bc0ceab..ed0379bb3f 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -1904,6 +1904,7 @@ cc_library(
deps = [
":fixed_length_record_reader_op",
":identity_reader_op",
+ ":lmdb_reader_op",
":matching_files_op",
":reader_ops",
":restore_op",
@@ -1939,6 +1940,14 @@ tf_kernel_library(
)
tf_kernel_library(
+ name = "lmdb_reader_op",
+ prefix = "lmdb_reader_op",
+ deps = IO_DEPS + [
+ "@lmdb",
+ ],
+)
+
+tf_kernel_library(
name = "matching_files_op",
prefix = "matching_files_op",
deps = IO_DEPS,
@@ -3170,6 +3179,21 @@ tf_kernel_library(
deps = REQUIRED_DEPS,
)
+tf_cc_test(
+ name = "sendrecv_ops_test",
+ srcs = ["sendrecv_ops_test.cc"],
+ linkstatic = tf_kernel_tests_linkstatic(), # Required for benchmarking
+ deps = [
+ ":ops_testutil",
+ ":ops_util",
+ ":sendrecv_ops",
+ "//tensorflow/core:framework",
+ "//tensorflow/core:test",
+ "//tensorflow/core:test_main",
+ "//tensorflow/core:testlib",
+ ],
+)
+
cc_library(
name = "sparse",
deps = [
@@ -4313,6 +4337,7 @@ filegroup(
# not used on Android. Those ops also do not compile if included,
# unless we add the additional deps they need.
"tf_record_reader_op.*",
+ "lmdb_reader_op.*",
"string_to_hash_bucket_op.*",
"sdca_ops.*",
"sdca_internal.*",
diff --git a/tensorflow/core/kernels/cuda_solvers.cc b/tensorflow/core/kernels/cuda_solvers.cc
index 73446fbe05..914627992b 100644
--- a/tensorflow/core/kernels/cuda_solvers.cc
+++ b/tensorflow/core/kernels/cuda_solvers.cc
@@ -153,7 +153,7 @@ Status CudaSolver::CopyLapackInfoToHostAsync(
info_checker_callback) const {
std::vector<HostLapackInfo> host_lapack_infos;
if (dev_lapack_infos.empty()) {
- info_checker_callback(Status::OK(), std::move(host_lapack_infos));
+ info_checker_callback(Status::OK(), host_lapack_infos);
return Status::OK();
}
@@ -174,7 +174,7 @@ Status CudaSolver::CopyLapackInfoToHostAsync(
auto wrapped_info_checker_callback =
[info_checker_callback](std::vector<HostLapackInfo> host_lapack_infos) {
Status status;
- for (auto host_lapack_info : host_lapack_infos) {
+ for (const auto& host_lapack_info : host_lapack_infos) {
for (int i = 0; i < host_lapack_info.size() && status.ok(); ++i) {
const int info_value = (host_lapack_info.data())[i];
if (info_value != 0) {
diff --git a/tensorflow/core/kernels/debug_ops_test.cc b/tensorflow/core/kernels/debug_ops_test.cc
index 037272e009..1830b2a178 100644
--- a/tensorflow/core/kernels/debug_ops_test.cc
+++ b/tensorflow/core/kernels/debug_ops_test.cc
@@ -28,6 +28,7 @@ limitations under the License.
#include "tensorflow/core/framework/types.pb.h"
#include "tensorflow/core/kernels/ops_testutil.h"
#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/io/path.h"
#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/platform/env.h"
#include "tensorflow/core/platform/test.h"
@@ -94,7 +95,22 @@ TEST_F(DebugIdentityOpTest, Int32Success_6_FileURLs) {
ASSERT_TRUE(env_->FileExists(dump_roots[i]).ok());
ASSERT_TRUE(env_->IsDirectory(dump_roots[i]).ok());
- DIR* dir = opendir(dump_roots[i].c_str());
+ std::vector<string> device_roots;
+ DIR* dir0 = opendir(dump_roots[i].c_str());
+ struct dirent* ent0;
+ const string kDeviceDirPrefix =
+ strings::StrCat(DebugIO::kMetadataFilePrefix, DebugIO::kDeviceTag);
+ while ((ent0 = readdir(dir0)) != nullptr) {
+ if (!strncmp(ent0->d_name, kDeviceDirPrefix.c_str(),
+ kDeviceDirPrefix.size())) {
+ device_roots.push_back(io::JoinPath(dump_roots[i], ent0->d_name));
+ }
+ }
+ ASSERT_EQ(1, device_roots.size());
+ closedir(dir0);
+
+ const string& device_root = device_roots[0];
+ DIR* dir = opendir(device_root.c_str());
struct dirent* ent;
int dump_files_found = 0;
while ((ent = readdir(dir)) != nullptr) {
@@ -102,8 +118,7 @@ TEST_F(DebugIdentityOpTest, Int32Success_6_FileURLs) {
dump_files_found++;
// Try reading the file into a Event proto.
- const string dump_file_path =
- strings::StrCat(dump_roots[i], "/", ent->d_name);
+ const string dump_file_path = io::JoinPath(device_root, ent->d_name);
std::fstream ifs(dump_file_path, std::ios::in | std::ios::binary);
Event event;
event.ParseFromIstream(&ifs);
diff --git a/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc b/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc
index 7303c97c0e..0d4feb7c1d 100644
--- a/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc
@@ -34,6 +34,19 @@ namespace tensorflow {
using Eigen::GpuDevice;
+// Returns whether the depthwise convolution forward pass can be performed using the
+// faster ('Small') variant of the kernel.
+EIGEN_DEVICE_FUNC bool CanLaunchDepthwiseConv2dGPUSmall(
+ const DepthwiseArgs args) {
+ return args.depth_multiplier == 1 && args.stride == 1 && args.in_rows <= 16 &&
+ args.in_cols <= 16 && args.in_rows == args.out_rows &&
+ args.in_cols == args.out_cols && args.pad_rows >= 0 &&
+ args.pad_rows < args.filter_rows && args.pad_cols >= 0 &&
+ args.pad_cols < args.filter_cols &&
+ args.filter_rows * args.filter_cols <=
+ (args.in_rows + 1) / 2 * args.in_cols;
+}
+
// A Cuda kernel to compute the depthwise convolution forward pass
// in NHWC format.
template <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
@@ -118,7 +131,8 @@ __global__ void __launch_bounds__(1024, 2)
// CUDA kernel to compute the depthwise convolution forward pass in NCHW format,
// tailored for small images up to 16x16. Stride and depth multiplier must be 1.
-// Padding must be 'SAME', which allows to reuse the index computation.
+// Padding must be 'SAME', which allows reusing the index computation. Only
+// use this kernel if CanLaunchDepthwiseConv2dGPUSmall(args) returns true.
// Tiles of the input and filter tensors are loaded into shared memory before
// performing the convolution. Each thread handles two elements per iteration,
// one each in the lower and upper half of a tile.
@@ -126,6 +140,7 @@ template <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
bool kKnownEvenRows>
__global__ __launch_bounds__(1024, 2) void DepthwiseConv2dGPUKernelNHWCSmall(
const DepthwiseArgs args, const T* input, const T* filter, T* output) {
+ assert(CanLaunchDepthwiseConv2dGPUSmall(args));
// Holds block plus halo and filter data for blockDim.x depths.
extern __shared__ __align__(sizeof(T)) unsigned char shared_memory[];
T* const shared_data = reinterpret_cast<T*>(shared_memory);
@@ -143,16 +158,15 @@ __global__ __launch_bounds__(1024, 2) void DepthwiseConv2dGPUKernelNHWCSmall(
// Fixed blockDim.x, corresponding to Pascal's global load granularity of 32B.
const int block_slices = 8;
- const int block_cols = blockDim.y;
const int block_rows = blockDim.z;
// These values are the same for all threads and could
// be precomputed on the CPU.
- const int block_size = block_rows * block_cols * block_slices;
+ const int block_size = block_rows * in_cols * block_slices;
const int in_row_size = in_cols * in_depth;
const int in_size = in_rows * in_row_size;
const int in_increment = (in_cols - 1) * block_slices;
- const int filter_size = filter_rows * filter_cols;
+ const int filter_pixels = filter_rows * filter_cols;
const int tile_cols = in_cols + filter_cols - 1;
const int even_rows = kKnownEvenRows || (1 & ~in_rows);
const int tile_rows = in_rows + filter_rows - even_rows;
@@ -170,7 +184,7 @@ __global__ __launch_bounds__(1024, 2) void DepthwiseConv2dGPUKernelNHWCSmall(
const int thread_row = threadIdx.z;
// Position in block.
- const int thread_pix = thread_row * block_cols + thread_col;
+ const int thread_pix = thread_row * in_cols + thread_col;
const int thread_idx = thread_pix * block_slices + thread_depth;
// Initialize tile, in particular the padding.
@@ -192,7 +206,7 @@ __global__ __launch_bounds__(1024, 2) void DepthwiseConv2dGPUKernelNHWCSmall(
const int max_depth = in_depth - thread_depth;
const int filter_write_offset =
- thread_pix < filter_size ? tile_size + thread_idx : 0;
+ thread_pix < filter_pixels ? tile_size + thread_idx : 0;
const int filter_read_offset = tile_size + thread_depth;
const bool skip_second =
!kKnownEvenRows && thread_row + (in_rows & 1) == block_rows;
@@ -377,56 +391,198 @@ __global__ void __launch_bounds__(1024, 2)
}
}
-template <typename T, int kKnownFilterWidth, int kKnownFilterHeight>
-bool TryLaunchDepthwiseConv2dGPUSmall(const GpuDevice& d,
- const DepthwiseArgs args, const T* input,
- const T* filter, T* output,
- TensorFormat data_format) {
- if (data_format != FORMAT_NHWC || args.depth_multiplier != 1 ||
- args.stride != 1 || args.in_rows > 16 || args.in_cols > 16 ||
- args.in_rows != args.out_rows || args.in_cols != args.out_cols ||
- args.pad_rows < 0 || args.pad_rows >= args.filter_rows ||
- args.pad_cols < 0 || args.pad_cols >= args.filter_cols) {
- return false;
+// CUDA kernel to compute the depthwise convolution forward pass in NCHW format,
+// tailored for small images up to 16x16. Stride and depth multiplier must be 1.
+// Padding must be 'SAME', which allows reusing the index computation. Only
+// use this kernel if CanLaunchDepthwiseConv2dGPUSmall(args) returns true.
+// Tiles of the input and filter tensors are loaded into shared memory before
+// performing the convolution. Each thread handles two elements per iteration,
+// one each in the lower and upper half of a tile.
+template <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
+ bool kKnownEvenRows>
+__global__ __launch_bounds__(1024, 2) void DepthwiseConv2dGPUKernelNCHWSmall(
+ const DepthwiseArgs args, const T* input, const T* filter, T* output) {
+ assert(CanLaunchDepthwiseConv2dGPUSmall(args));
+ // Holds block plus halo and filter data for blockDim.z depths.
+ extern __shared__ __align__(sizeof(T)) unsigned char shared_memory[];
+ T* const shared_data = reinterpret_cast<T*>(shared_memory);
+
+ const int batches = args.batch;
+ const int in_rows = args.in_rows;
+ const int in_cols = args.in_cols;
+ const int in_depth = args.in_depth;
+ const int filter_rows =
+ kKnownFilterHeight < 0 ? args.filter_rows : kKnownFilterHeight;
+ const int filter_cols =
+ kKnownFilterWidth < 0 ? args.filter_cols : kKnownFilterWidth;
+ const int pad_rows = args.pad_rows;
+ const int pad_cols = args.pad_cols;
+
+ // Fixed blockDim.z, tailored for maximum grid size for images of size 16x16.
+ const int block_rows = blockDim.y;
+ const int block_slices = 8;
+
+ // These values are the same for all threads and could
+ // be precomputed on the CPU.
+ const int block_pixels = in_cols * block_rows;
+ const int block_size = block_pixels * block_slices;
+ const int in_pixels = in_cols * in_rows;
+ const int in_increment = in_cols - 1;
+ const int filter_pixels = filter_rows * filter_cols;
+ const int tile_cols = in_cols + filter_cols - 1;
+ const int even_rows = kKnownEvenRows || (1 & ~in_rows);
+ const int tile_rows = in_rows + filter_rows - even_rows;
+ const int tile_pixels = tile_cols * tile_rows;
+ const int tile_size = tile_pixels * block_slices;
+ const int tile_offset = block_rows * tile_cols;
+ const int pad_offset = pad_rows * tile_cols + pad_cols;
+ const int in_slices = in_depth * batches;
+ const int in_blocks = (in_slices + block_slices - 1) / block_slices;
+
+ const int thread_col = threadIdx.x;
+ const int thread_row = threadIdx.y;
+ const int thread_depth = threadIdx.z;
+
+ // Position in block.
+ const int thread_pix = thread_row * in_cols + thread_col;
+ const int thread_idx = thread_depth * block_pixels + thread_pix;
+
+ // Initialize tile, in particular the padding.
+ for (int i = thread_idx; i < tile_size; i += block_size) {
+ shared_data[i] = T(0);
}
+ __syncthreads();
- const int block_rows = (args.in_rows + 1) / 2;
- if (args.filter_rows * args.filter_cols > args.in_cols * block_rows) {
- return false;
+ // Position in tensors.
+ const int tensor_idx = thread_depth * in_pixels + thread_pix;
+
+ // Position in (padded) shared memory.
+ const int data_pix = thread_row * tile_cols + thread_col;
+ const int data_idx = thread_depth * tile_pixels + data_pix;
+
+ // Position in shared memory, offset by pad_rows / pad_cols.
+ const int tile_idx = data_idx + pad_offset;
+
+ // Filter is always in HWCK format, irrespective of the input/output format.
+ const int filter_pix = thread_idx / block_slices;
+ const int filter_depth = thread_idx % block_slices;
+ const int filter_idx = filter_pix * in_depth;
+
+ const int max_slice = in_slices - thread_depth;
+ const int filter_write_offset =
+ filter_pix < filter_pixels ? tile_size + thread_idx : 0;
+ const int filter_read_offset = tile_size + thread_depth;
+ const bool skip_second =
+ !kKnownEvenRows && thread_row + (in_rows & 1) == block_rows;
+
+ for (int b = blockIdx.x; b < in_blocks; b += gridDim.x) {
+ const int slice = b * block_slices;
+
+ const int inout_offset = slice * in_pixels + tensor_idx;
+ const bool slice_in_range = slice < max_slice;
+
+ if (slice_in_range) {
+ const T* const in_ptr = inout_offset + input;
+ T* const tile_ptr = tile_idx + shared_data;
+ tile_ptr[0] = ldg(in_ptr);
+ if (!skip_second) {
+ tile_ptr[tile_offset] = ldg(block_pixels + in_ptr);
+ }
+ }
+
+ if (filter_write_offset != 0) {
+ const int filter_offset = filter_idx + (slice + filter_depth) % in_depth;
+ shared_data[filter_write_offset] = ldg(filter_offset + filter);
+ }
+
+ // Note: the condition to reach this is uniform across the entire block.
+ __syncthreads();
+
+ if (slice_in_range) {
+ T sum1 = 0;
+ T sum2 = 0;
+ int shared_offset = data_idx;
+ const T* filter_ptr = filter_read_offset + shared_data;
+ UNROLL for (int r = 0; r < filter_rows; ++r) {
+ UNROLL for (int c = 0; c < filter_cols; ++c) {
+ const T filter_value = *filter_ptr;
+ const T* const tile_ptr = shared_offset + shared_data;
+ sum1 += filter_value * tile_ptr[0];
+ sum2 += filter_value * tile_ptr[tile_offset];
+ ++shared_offset;
+ filter_ptr += block_slices;
+ }
+ shared_offset += in_increment;
+ }
+ T* const out_ptr = inout_offset + output;
+ out_ptr[0] = sum1;
+ if (!skip_second) {
+ out_ptr[block_pixels] = sum2;
+ }
+ }
+
+ // Note: the condition to reach this is uniform across the entire block.
+ __syncthreads();
}
+}
+template <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
+ bool kKnownEvenRows>
+void LaunchDepthwiseConv2dGPUSmall(const GpuDevice& d, const DepthwiseArgs args,
+ const T* input, const T* filter, T* output,
+ TensorFormat data_format) {
+ const int block_rows = (args.in_rows + 1) / 2;
+ const int block_slices = 8;
const int tile_cols = args.in_cols + args.filter_cols - 1;
const int tile_rows = block_rows * 2 + args.filter_rows - 1;
- const int tile_size = tile_rows * tile_cols;
- const int filter_size = args.filter_rows * args.filter_cols;
- dim3 block_dim = dim3(8, args.in_cols, block_rows);
- const int shared_memory_size =
- block_dim.x * (tile_size + filter_size) * sizeof(T);
+ const int tile_pixels = tile_rows * tile_cols;
+ const int filter_pixels = args.filter_rows * args.filter_cols;
+ const int shared_memory_size =
+ block_slices * (tile_pixels + filter_pixels) * sizeof(T);
const int num_outputs =
args.batch * args.out_rows * args.out_cols * args.out_depth;
- if (args.in_rows & 1) {
+
+ if (data_format == FORMAT_NHWC) {
+ dim3 block_dim = dim3(block_slices, args.in_cols, block_rows);
CudaLaunchConfig config = GetCudaLaunchConfig(
num_outputs, d,
DepthwiseConv2dGPUKernelNHWCSmall<T, kKnownFilterWidth,
- kKnownFilterHeight, false>,
+ kKnownFilterHeight, kKnownEvenRows>,
shared_memory_size, block_dim.x * block_dim.y * block_dim.z);
DepthwiseConv2dGPUKernelNHWCSmall<T, kKnownFilterWidth, kKnownFilterHeight,
- false>
+ kKnownEvenRows>
<<<config.block_count, block_dim, shared_memory_size, d.stream()>>>(
args, input, filter, output);
- } else {
+ } else if (data_format == FORMAT_NCHW) {
+ dim3 block_dim = dim3(args.in_cols, block_rows, block_slices);
CudaLaunchConfig config = GetCudaLaunchConfig(
num_outputs, d,
- DepthwiseConv2dGPUKernelNHWCSmall<T, kKnownFilterWidth,
- kKnownFilterHeight, true>,
+ DepthwiseConv2dGPUKernelNCHWSmall<T, kKnownFilterWidth,
+ kKnownFilterHeight, kKnownEvenRows>,
shared_memory_size, block_dim.x * block_dim.y * block_dim.z);
- DepthwiseConv2dGPUKernelNHWCSmall<T, kKnownFilterWidth, kKnownFilterHeight,
- true>
+ DepthwiseConv2dGPUKernelNCHWSmall<T, kKnownFilterWidth, kKnownFilterHeight,
+ kKnownEvenRows>
<<<config.block_count, block_dim, shared_memory_size, d.stream()>>>(
args, input, filter, output);
+ } else {
+ assert(false);
+ }
+}
+
+template <typename T, int kKnownFilterWidth, int kKnownFilterHeight>
+void LaunchDepthwiseConv2dGPUSmall(const GpuDevice& d, const DepthwiseArgs args,
+ const T* input, const T* filter, T* output,
+ TensorFormat data_format) {
+ if (args.in_rows & 1) {
+ LaunchDepthwiseConv2dGPUSmall<T, kKnownFilterWidth, kKnownFilterHeight,
+ /*kKnownEvenRows=*/false>(
+ d, args, input, filter, output, data_format);
+ } else {
+ LaunchDepthwiseConv2dGPUSmall<T, kKnownFilterWidth, kKnownFilterHeight,
+ /*kKnownEvenRows=*/true>(
+ d, args, input, filter, output, data_format);
}
- return true;
}
template <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
@@ -434,9 +590,9 @@ template <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
void LaunchDepthwiseConv2dGPU(const GpuDevice& d, const DepthwiseArgs args,
const T* input, const T* filter, T* output,
TensorFormat data_format) {
- if (TryLaunchDepthwiseConv2dGPUSmall<T, kKnownFilterWidth,
- kKnownFilterHeight>(
- d, args, input, filter, output, data_format)) {
+ if (CanLaunchDepthwiseConv2dGPUSmall(args)) {
+ LaunchDepthwiseConv2dGPUSmall<T, kKnownFilterWidth, kKnownFilterHeight>(
+ d, args, input, filter, output, data_format);
return;
}
const int num_outputs =
@@ -588,16 +744,15 @@ __launch_bounds__(1024, 2) void DepthwiseConv2dBackpropInputGPUKernelNHWCSmall(
// Fixed blockDim.x, corresponding to Pascal's global load granularity of 32B.
const int block_slices = 8;
- const int block_cols = blockDim.y;
const int block_rows = blockDim.z;
// These values are the same for all threads and could
// be precomputed on the CPU.
- const int block_size = block_rows * block_cols * block_slices;
+ const int block_size = block_rows * in_cols * block_slices;
const int in_row_size = in_cols * in_depth;
const int in_size = in_rows * in_row_size;
const int in_increment = (in_cols - 1) * block_slices;
- const int filter_size = filter_rows * filter_cols;
+ const int filter_pixels = filter_rows * filter_cols;
const int tile_cols = in_cols + filter_cols - 1;
const int even_rows = kKnownEvenRows || (1 & ~in_rows);
const int tile_rows = in_rows + filter_rows - even_rows;
@@ -615,7 +770,7 @@ __launch_bounds__(1024, 2) void DepthwiseConv2dBackpropInputGPUKernelNHWCSmall(
const int thread_row = threadIdx.z;
// Position in block.
- const int thread_pix = thread_row * block_cols + thread_col;
+ const int thread_pix = thread_row * in_cols + thread_col;
const int thread_idx = thread_pix * block_slices + thread_depth;
// Initialize tile, in particular the padding.
@@ -637,9 +792,9 @@ __launch_bounds__(1024, 2) void DepthwiseConv2dBackpropInputGPUKernelNHWCSmall(
const int max_depth = in_depth - thread_depth;
const int filter_write_offset =
- thread_pix < filter_size ? tile_size + thread_idx : 0;
+ thread_pix < filter_pixels ? tile_size + thread_idx : 0;
const int filter_read_offset =
- tile_size + filter_size * block_slices + thread_depth;
+ tile_size + filter_pixels * block_slices + thread_depth;
const bool skip_second =
!kKnownEvenRows && thread_row + (in_rows & 1) == block_rows;
@@ -786,11 +941,11 @@ bool TryLaunchDepthwiseConv2dBackpropInputGPUSmall(
const int tile_cols = args.in_cols + args.filter_cols - 1;
const int tile_rows = block_rows * 2 + args.filter_rows - 1;
- const int tile_size = tile_rows * tile_cols;
- const int filter_size = args.filter_rows * args.filter_cols;
+ const int tile_pixels = tile_rows * tile_cols;
+ const int filter_pixels = args.filter_rows * args.filter_cols;
dim3 block_dim = dim3(8, args.in_cols, block_rows);
const int shared_memory_size =
- block_dim.x * (tile_size + filter_size) * sizeof(T);
+ block_dim.x * (tile_pixels + filter_pixels) * sizeof(T);
const int num_in_backprop =
args.batch * args.in_rows * args.in_cols * args.in_depth;
@@ -1007,16 +1162,15 @@ __launch_bounds__(1024, 2) void DepthwiseConv2dBackpropFilterGPUKernelNHWCSmall(
// Fixed blockDim.x, corresponding to Pascal's global load granularity of 32B.
const int block_slices = 8;
- const int block_cols = blockDim.y;
const int block_rows = blockDim.z;
// These values are the same for all threads and could
// be precomputed on the CPU.
- const int block_size = block_rows * block_cols * block_slices;
+ const int block_size = block_rows * in_cols * block_slices;
const int in_row_size = in_cols * in_depth;
const int in_size = in_rows * in_row_size;
const int in_increment = (in_cols - 1) * block_slices;
- const int filter_size = filter_rows * filter_cols;
+ const int filter_pixels = filter_rows * filter_cols;
const int tile_cols = in_cols + filter_cols - 1;
const int tile_rows = 2 * block_rows + filter_rows - 1;
const int tile_row_size = tile_cols * block_slices;
@@ -1028,14 +1182,14 @@ __launch_bounds__(1024, 2) void DepthwiseConv2dBackpropFilterGPUKernelNHWCSmall(
const int tensor_offset = block_rows * in_row_size;
const int accum_pixels = 32;
const int accum_increment = accum_pixels * block_slices;
- const int accum_size = filter_size * accum_increment;
+ const int accum_size = filter_pixels * accum_increment;
const int thread_depth = threadIdx.x;
const int thread_col = threadIdx.y;
const int thread_row = threadIdx.z;
// Position in block.
- const int thread_pix = thread_row * block_cols + thread_col;
+ const int thread_pix = thread_row * in_cols + thread_col;
const int thread_idx = thread_pix * block_slices + thread_depth;
// Initialize tile, in particular the padding and accumulator.
@@ -1247,11 +1401,11 @@ bool TryLaunchDepthwiseConv2dBackpropFilterGPUSmall(
const int block_rows = (args.in_rows + 1) / 2 + rows_mask & ~rows_mask;
const int tile_cols = args.in_cols + args.filter_cols - 1;
const int tile_rows = block_rows * 2 + args.filter_rows - 1;
- const int tile_size = tile_rows * tile_cols;
+ const int tile_pixels = tile_rows * tile_cols;
const int accum_size = args.filter_rows * args.filter_cols * 32;
dim3 block_dim = dim3(8, args.in_cols, block_rows);
const int shared_memory_size =
- block_dim.x * (tile_size + accum_size) * sizeof(T);
+ block_dim.x * (tile_pixels + accum_size) * sizeof(T);
if (block_rows > args.in_rows ||
args.filter_rows * args.filter_cols > args.in_cols * block_rows ||
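
To make the launch parameters above concrete, the shared-memory budget of LaunchDepthwiseConv2dGPUSmall can be worked out by hand for one eligible shape; the constants below simply mirror the arithmetic in the hunks above for a hypothetical 16x16 float input with a 3x3 filter (which satisfies CanLaunchDepthwiseConv2dGPUSmall, since 3 * 3 <= (16 + 1) / 2 * 16):

    // Illustrative sizing only; mirrors LaunchDepthwiseConv2dGPUSmall.
    constexpr int kInRows = 16, kInCols = 16, kFilterRows = 3, kFilterCols = 3;
    constexpr int kBlockSlices = 8;                              // fixed in the kernels
    constexpr int kBlockRows = (kInRows + 1) / 2;                // 8
    constexpr int kTileCols = kInCols + kFilterCols - 1;         // 18
    constexpr int kTileRows = kBlockRows * 2 + kFilterRows - 1;  // 18
    constexpr int kTilePixels = kTileRows * kTileCols;           // 324
    constexpr int kFilterPixels = kFilterRows * kFilterCols;     // 9
    constexpr int kSharedMemoryBytes =
        kBlockSlices * (kTilePixels + kFilterPixels) * static_cast<int>(sizeof(float));
    static_assert(kSharedMemoryBytes == 10656, "one block's tile plus filter staging");

At roughly 10.4 KB per block this stays well below the 48 KB per-block shared-memory limit of the Pascal-class GPUs the kernels are tuned for, which is what makes the single-pass tiled approach viable for images up to 16x16.
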
diff --git a/tensorflow/core/kernels/hexagon/i_graph_transfer_ops_definitions.h b/tensorflow/core/kernels/hexagon/i_graph_transfer_ops_definitions.h
index 3d6f493a9c..d5b4cf7451 100644
--- a/tensorflow/core/kernels/hexagon/i_graph_transfer_ops_definitions.h
+++ b/tensorflow/core/kernels/hexagon/i_graph_transfer_ops_definitions.h
@@ -48,4 +48,4 @@ class IGraphTransferOpsDefinitions {
} // namespace tensorflow
-#endif // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_HEXAGON_I_GRAPH_TRANSFER_OPS_DEFINITIONS_H
+#endif // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_HEXAGON_I_GRAPH_TRANSFER_OPS_DEFINITIONS_H_
diff --git a/tensorflow/core/kernels/iterator_ops.cc b/tensorflow/core/kernels/iterator_ops.cc
index fa3f3a4db6..51f11f9d2e 100644
--- a/tensorflow/core/kernels/iterator_ops.cc
+++ b/tensorflow/core/kernels/iterator_ops.cc
@@ -246,7 +246,7 @@ class OneShotIteratorOp : public OpKernel {
n.Notify();
});
n.WaitForNotification();
- OP_REQUIRES_OK(ctx, std::move(factory_status));
+ OP_REQUIRES_OK(ctx, factory_status);
OP_REQUIRES(
ctx,
return_values.size() == 1 &&
diff --git a/tensorflow/core/kernels/lmdb_reader_op.cc b/tensorflow/core/kernels/lmdb_reader_op.cc
new file mode 100755
index 0000000000..23cabe7b54
--- /dev/null
+++ b/tensorflow/core/kernels/lmdb_reader_op.cc
@@ -0,0 +1,134 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "lmdb.h"
+#include "tensorflow/core/framework/reader_op_kernel.h"
+#include "tensorflow/core/framework/reader_base.h"
+#include "tensorflow/core/lib/core/errors.h"
+
+#include <sys/stat.h>
+
+namespace tensorflow {
+
+inline void MDB_CHECK(int mdb_status) {
+ CHECK_EQ(mdb_status, MDB_SUCCESS) << mdb_strerror(mdb_status);
+}
+
+class LMDBReader : public ReaderBase {
+ public:
+ LMDBReader(const string& node_name, Env* env)
+ : ReaderBase(strings::StrCat("LMDBReader '", node_name, "'")),
+ env_(env),
+ mdb_env_(nullptr),
+ mdb_dbi_(0),
+ mdb_txn_(nullptr),
+ mdb_cursor_(nullptr) {}
+
+ Status OnWorkStartedLocked() override {
+ MDB_CHECK(mdb_env_create(&mdb_env_));
+ int flags = MDB_RDONLY | MDB_NOTLS;
+
+ // Check if the LMDB filename is actually a file instead of a directory.
+ // If so, set appropriate flags so we can open it.
+ struct stat source_stat;
+ if (stat(current_work().c_str(), &source_stat) == 0 &&
+ (source_stat.st_mode & S_IFREG)) {
+ flags |= MDB_NOSUBDIR;
+ }
+
+ MDB_CHECK(mdb_env_open(mdb_env_, current_work().c_str(), flags, 0664));
+ MDB_CHECK(mdb_txn_begin(mdb_env_, nullptr, MDB_RDONLY, &mdb_txn_));
+ MDB_CHECK(mdb_dbi_open(mdb_txn_, nullptr, 0, &mdb_dbi_));
+
+ return Status::OK();
+ }
+
+ Status OnWorkFinishedLocked() override {
+ if (mdb_env_ != nullptr) {
+ if (mdb_cursor_) {
+ mdb_cursor_close(mdb_cursor_);
+ }
+ mdb_txn_abort(mdb_txn_);
+ mdb_dbi_close(mdb_env_, mdb_dbi_);
+ mdb_env_close(mdb_env_);
+ mdb_env_ = nullptr;
+ }
+ return Status::OK();
+ }
+
+ Status ReadLocked(string* key, string* value, bool* produced,
+ bool* at_end) override {
+ if (mdb_cursor_ == nullptr) {
+ MDB_CHECK(mdb_cursor_open(mdb_txn_, mdb_dbi_, &mdb_cursor_));
+ if (Seek(MDB_FIRST) == false) {
+ *at_end = true;
+ return Status::OK();
+ }
+ }
+ else {
+ if (Seek(MDB_NEXT) == false) {
+ *at_end = true;
+ return Status::OK();
+ }
+ }
+ *key = string(static_cast<const char*>(mdb_key_.mv_data),
+ mdb_key_.mv_size);
+ *value = string(static_cast<const char*>(mdb_value_.mv_data),
+ mdb_value_.mv_size);
+ *produced = true;
+ return Status::OK();
+ }
+
+ Status ResetLocked() override {
+ CHECK_EQ(Seek(MDB_FIRST), true);
+ return ReaderBase::ResetLocked();
+ }
+
+ private:
+ bool Seek(MDB_cursor_op op) {
+ CHECK_NOTNULL(mdb_cursor_);
+ int mdb_status = mdb_cursor_get(mdb_cursor_, &mdb_key_, &mdb_value_, op);
+ if (mdb_status == MDB_NOTFOUND) {
+ return false;
+ } else {
+ MDB_CHECK(mdb_status);
+ return true;
+ }
+ }
+
+ Env* const env_;
+ MDB_env* mdb_env_;
+ MDB_dbi mdb_dbi_;
+
+ MDB_txn* mdb_txn_;
+ MDB_cursor* mdb_cursor_;
+ MDB_val mdb_key_, mdb_value_;
+};
+
+class LMDBReaderOp : public ReaderOpKernel {
+ public:
+ explicit LMDBReaderOp(OpKernelConstruction* context)
+ : ReaderOpKernel(context) {
+ Env* env = context->env();
+ SetReaderFactory([this, env]() {
+ return new LMDBReader(name(), env);
+ });
+ }
+};
+
+REGISTER_KERNEL_BUILDER(Name("LMDBReader").Device(DEVICE_CPU),
+ LMDBReaderOp);
+
+}
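
OnWorkStartedLocked, ReadLocked and OnWorkFinishedLocked together walk the standard LMDB read-only cursor lifecycle; flattened into a single sequence (and reusing the MDB_CHECK helper defined above), the same calls look roughly like this standalone sketch:

    #include <cstdio>
    #include "lmdb.h"

    // Standalone illustration of the env -> txn -> dbi -> cursor sequence the
    // reader wraps; 'path' plays the role of current_work().
    void DumpLmdbKeys(const char* path) {
      MDB_env* env = nullptr;
      MDB_CHECK(mdb_env_create(&env));
      // MDB_NOSUBDIR would be OR'ed in here when 'path' is a plain file.
      MDB_CHECK(mdb_env_open(env, path, MDB_RDONLY | MDB_NOTLS, 0664));

      MDB_txn* txn = nullptr;
      MDB_dbi dbi;
      MDB_CHECK(mdb_txn_begin(env, nullptr, MDB_RDONLY, &txn));
      MDB_CHECK(mdb_dbi_open(txn, nullptr, 0, &dbi));

      MDB_cursor* cursor = nullptr;
      MDB_CHECK(mdb_cursor_open(txn, dbi, &cursor));
      MDB_val key, value;
      // MDB_FIRST positions the cursor, MDB_NEXT advances; MDB_NOTFOUND ends the scan.
      for (int rc = mdb_cursor_get(cursor, &key, &value, MDB_FIRST);
           rc != MDB_NOTFOUND;
           rc = mdb_cursor_get(cursor, &key, &value, MDB_NEXT)) {
        MDB_CHECK(rc);
        std::printf("%.*s\n", static_cast<int>(key.mv_size),
                    static_cast<const char*>(key.mv_data));
      }

      mdb_cursor_close(cursor);
      mdb_txn_abort(txn);   // read-only transaction: abort is the normal teardown
      mdb_dbi_close(env, dbi);
      mdb_env_close(env);
    }
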
diff --git a/tensorflow/core/kernels/matrix_band_part_op.cc b/tensorflow/core/kernels/matrix_band_part_op.cc
index f119cb1902..894b0113c2 100644
--- a/tensorflow/core/kernels/matrix_band_part_op.cc
+++ b/tensorflow/core/kernels/matrix_band_part_op.cc
@@ -83,7 +83,7 @@ class MatrixBandPartOp : public OpKernel {
REGISTER_KERNEL_BUILDER( \
Name("MatrixBandPart").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
MatrixBandPartOp<CPUDevice, type>);
-TF_CALL_NUMBER_TYPES(REGISTER_MATRIX_BAND_PART);
+TF_CALL_POD_TYPES(REGISTER_MATRIX_BAND_PART);
#undef REGISTER_MATRIX_BAND_PART
// Registration of the deprecated kernel.
@@ -143,6 +143,7 @@ namespace functor {
extern template struct MatrixBandPart<GPUDevice, T>;
TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPEC);
+TF_CALL_bool(DECLARE_GPU_SPEC);
TF_CALL_complex64(DECLARE_GPU_SPEC);
TF_CALL_complex128(DECLARE_GPU_SPEC);
} // namespace functor
@@ -156,6 +157,7 @@ TF_CALL_complex128(DECLARE_GPU_SPEC);
.HostMemory("num_upper"), \
MatrixBandPartOp<GPUDevice, type>);
TF_CALL_GPU_NUMBER_TYPES(REGISTER_MATRIX_BAND_PART_GPU);
+TF_CALL_bool(REGISTER_MATRIX_BAND_PART_GPU);
TF_CALL_complex64(REGISTER_MATRIX_BAND_PART_GPU);
TF_CALL_complex128(REGISTER_MATRIX_BAND_PART_GPU);
#undef REGISTER_MATRIX_BAND_PART_GPU
diff --git a/tensorflow/core/kernels/matrix_band_part_op_gpu.cu.cc b/tensorflow/core/kernels/matrix_band_part_op_gpu.cu.cc
index fab3fb4a1f..ccc10ebada 100644
--- a/tensorflow/core/kernels/matrix_band_part_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/matrix_band_part_op_gpu.cu.cc
@@ -29,6 +29,7 @@ typedef Eigen::GpuDevice GPUDevice;
template struct functor::MatrixBandPart<GPUDevice, T>;
TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPEC);
+TF_CALL_bool(DEFINE_GPU_SPEC);
TF_CALL_complex64(DEFINE_GPU_SPEC);
TF_CALL_complex128(DEFINE_GPU_SPEC);
diff --git a/tensorflow/core/kernels/matrix_diag_op.cc b/tensorflow/core/kernels/matrix_diag_op.cc
index bc193357ad..75c49baaa8 100644
--- a/tensorflow/core/kernels/matrix_diag_op.cc
+++ b/tensorflow/core/kernels/matrix_diag_op.cc
@@ -123,7 +123,7 @@ class MatrixDiagOp : public OpKernel {
REGISTER_KERNEL_BUILDER( \
Name("MatrixDiagPart").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
MatrixDiagPartOp<CPUDevice, type>);
-TF_CALL_NUMBER_TYPES(REGISTER_MATRIX_DIAG);
+TF_CALL_POD_TYPES(REGISTER_MATRIX_DIAG);
#undef REGISTER_MATRIX_DIAG
// Registration of the deprecated kernel.
@@ -136,7 +136,7 @@ TF_CALL_NUMBER_TYPES(REGISTER_MATRIX_DIAG);
.Device(DEVICE_CPU) \
.TypeConstraint<type>("T"), \
MatrixDiagPartOp<CPUDevice, type>);
-TF_CALL_NUMBER_TYPES(REGISTER_BATCH_MATRIX_DIAG);
+TF_CALL_POD_TYPES(REGISTER_BATCH_MATRIX_DIAG);
#undef REGISTER_BATCH_MATRIX_DIAG
// Implementation of the functor specialization for CPU.
@@ -187,6 +187,7 @@ namespace functor {
extern template struct MatrixDiagPart<GPUDevice, T>;
TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPEC);
+TF_CALL_bool(DECLARE_GPU_SPEC);
TF_CALL_complex64(DECLARE_GPU_SPEC);
TF_CALL_complex128(DECLARE_GPU_SPEC);
@@ -201,6 +202,7 @@ TF_CALL_complex128(DECLARE_GPU_SPEC);
Name("MatrixDiagPart").Device(DEVICE_GPU).TypeConstraint<type>("T"), \
MatrixDiagPartOp<GPUDevice, type>);
TF_CALL_GPU_NUMBER_TYPES(REGISTER_MATRIX_DIAG_GPU);
+TF_CALL_bool(REGISTER_MATRIX_DIAG_GPU);
TF_CALL_complex64(REGISTER_MATRIX_DIAG_GPU);
TF_CALL_complex128(REGISTER_MATRIX_DIAG_GPU);
#undef REGISTER_MATRIX_DIAG_GPU
diff --git a/tensorflow/core/kernels/matrix_diag_op_gpu.cu.cc b/tensorflow/core/kernels/matrix_diag_op_gpu.cu.cc
index 14774d5404..cfb1fa10fc 100644
--- a/tensorflow/core/kernels/matrix_diag_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/matrix_diag_op_gpu.cu.cc
@@ -31,6 +31,7 @@ typedef Eigen::GpuDevice GPUDevice;
template struct functor::MatrixDiagPart<GPUDevice, T>;
TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPEC);
+TF_CALL_bool(DEFINE_GPU_SPEC);
TF_CALL_complex64(DEFINE_GPU_SPEC);
TF_CALL_complex128(DEFINE_GPU_SPEC);
diff --git a/tensorflow/core/kernels/matrix_set_diag_op.cc b/tensorflow/core/kernels/matrix_set_diag_op.cc
index cbb2b68b7f..3397af56bc 100644
--- a/tensorflow/core/kernels/matrix_set_diag_op.cc
+++ b/tensorflow/core/kernels/matrix_set_diag_op.cc
@@ -100,7 +100,7 @@ class MatrixSetDiagOp : public OpKernel {
REGISTER_KERNEL_BUILDER( \
Name("MatrixSetDiag").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
MatrixSetDiagOp<CPUDevice, type>);
-TF_CALL_NUMBER_TYPES(REGISTER_MATRIX_SET_DIAG);
+TF_CALL_POD_TYPES(REGISTER_MATRIX_SET_DIAG);
#undef REGISTER_MATRIX_SET_DIAG
// Registration of the deprecated kernel.
@@ -109,7 +109,7 @@ TF_CALL_NUMBER_TYPES(REGISTER_MATRIX_SET_DIAG);
REGISTER_KERNEL_BUILDER( \
Name("BatchMatrixSetDiag").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
MatrixSetDiagOp<CPUDevice, type>);
-TF_CALL_NUMBER_TYPES(REGISTER_BATCH_MATRIX_SET_DIAG);
+TF_CALL_POD_TYPES(REGISTER_BATCH_MATRIX_SET_DIAG);
#undef REGISTER_BATCH_MATRIX_SET_DIAG
namespace functor {
@@ -131,6 +131,21 @@ struct MatrixSetDiag<CPUDevice, T> {
}
};
+template <>
+struct MatrixSetDiag<CPUDevice, bool> {
+ static void Compute(const CPUDevice& d, TTypes<bool, 3>::ConstTensor input,
+ TTypes<bool, 2>::ConstTensor diag,
+ TTypes<bool>::Scalar scratch,
+ TTypes<bool, 3>::Tensor output) {
+ output.device(d) = input;
+ for (int64 r = 0; r < output.dimension(0); ++r) {
+ for (int64 d = 0; d < diag.dimension(1); ++d) {
+ output(r, d, d) = diag(r, d);
+ }
+ }
+ }
+};
+
} // namespace functor
#if GOOGLE_CUDA
@@ -147,6 +162,7 @@ namespace functor {
extern template struct MatrixSetDiag<GPUDevice, T>;
TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPEC);
+TF_CALL_bool(DECLARE_GPU_SPEC);
TF_CALL_complex64(DECLARE_GPU_SPEC);
TF_CALL_complex128(DECLARE_GPU_SPEC);
@@ -158,6 +174,7 @@ TF_CALL_complex128(DECLARE_GPU_SPEC);
Name("MatrixSetDiag").Device(DEVICE_GPU).TypeConstraint<type>("T"), \
MatrixSetDiagOp<GPUDevice, type>);
TF_CALL_GPU_NUMBER_TYPES(REGISTER_MATRIX_SET_DIAG_GPU);
+TF_CALL_bool(REGISTER_MATRIX_SET_DIAG_GPU);
TF_CALL_complex64(REGISTER_MATRIX_SET_DIAG_GPU);
TF_CALL_complex128(REGISTER_MATRIX_SET_DIAG_GPU);
#undef REGISTER_MATRIX_SET_DIAG_GPU
diff --git a/tensorflow/core/kernels/matrix_set_diag_op.h b/tensorflow/core/kernels/matrix_set_diag_op.h
index 8ba2f3756a..63e5650bf0 100644
--- a/tensorflow/core/kernels/matrix_set_diag_op.h
+++ b/tensorflow/core/kernels/matrix_set_diag_op.h
@@ -71,6 +71,23 @@ struct MatrixSetDiag {
}
};
+template <typename Device>
+struct MatrixSetDiag<Device, bool> {
+ EIGEN_ALWAYS_INLINE static void Compute(const Device& d,
+ TTypes<bool, 3>::ConstTensor input,
+ TTypes<bool, 2>::ConstTensor diag,
+ TTypes<bool>::Scalar scratch,
+ TTypes<bool, 3>::Tensor output) {
+ output.device(d) = input;
+ generator::OverwriteDiagGenerator<bool> generator(diag, output);
+ // Use all() to force the generation to aggregate to the scalar
+ // output scratch. This in turn forces each element of the
+ // generator to execute. The side effect of the execution is to
+ // update the diagonal components of output with diag.
+ scratch.device(d) = diag.generate(generator).all();
+ }
+};
+
} // namespace functor
} // namespace tensorflow
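
The comment above is the crux of the bool specialization: Eigen only invokes a generator's operator() when the generated expression is actually consumed, and the generic functor relies on a numeric reduction to force that evaluation, which does not work for bool, so all() plays that role here. A self-contained sketch of the same generate-then-reduce-for-side-effects pattern (MarkVisited is a hypothetical generator, unrelated to OverwriteDiagGenerator):

    #include <unsupported/Eigen/CXX11/Tensor>

    // A generator whose only purpose is its side effect: every coefficient that
    // Eigen evaluates flips one entry of 'visited'.
    struct MarkVisited {
      explicit MarkVisited(Eigen::Tensor<bool, 1>* visited) : visited_(visited) {}
      bool operator()(const Eigen::array<Eigen::Index, 1>& coords) const {
        (*visited_)(coords[0]) = true;  // the side effect we want forced
        return true;
      }
      Eigen::Tensor<bool, 1>* visited_;
    };

    void ForceGeneratorEvaluation() {
      Eigen::Tensor<bool, 1> source(4), visited(4);
      source.setConstant(true);
      visited.setConstant(false);
      // Nothing runs until the expression is consumed; reducing it with all()
      // into a rank-0 scratch tensor evaluates the generator at every coordinate.
      Eigen::Tensor<bool, 0> scratch = source.generate(MarkVisited(&visited)).all();
      (void)scratch;  // 'visited' is now all true.
    }
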
diff --git a/tensorflow/core/kernels/matrix_set_diag_op_gpu.cu.cc b/tensorflow/core/kernels/matrix_set_diag_op_gpu.cu.cc
index bd097ff328..8e41ce5860 100644
--- a/tensorflow/core/kernels/matrix_set_diag_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/matrix_set_diag_op_gpu.cu.cc
@@ -29,6 +29,7 @@ typedef Eigen::GpuDevice GPUDevice;
template struct functor::MatrixSetDiag<GPUDevice, T>;
TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPEC);
+TF_CALL_bool(DEFINE_GPU_SPEC);
TF_CALL_complex64(DEFINE_GPU_SPEC);
TF_CALL_complex128(DEFINE_GPU_SPEC);
diff --git a/tensorflow/core/kernels/sendrecv_ops.cc b/tensorflow/core/kernels/sendrecv_ops.cc
index 1c7d50e161..4c656bd74b 100644
--- a/tensorflow/core/kernels/sendrecv_ops.cc
+++ b/tensorflow/core/kernels/sendrecv_ops.cc
@@ -52,17 +52,16 @@ SendOp::SendOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
OP_REQUIRES_OK(ctx, ctx->GetAttr("tensor_name", &tensor_name));
key_prefix_ = GetRendezvousKeyPrefix(send_device, recv_device,
send_device_incarnation, tensor_name);
+ // The vast majority of Send nodes are outside any loop context, so
+ // proactively cache the rendezvous key for the top-level.
+ GetRendezvousKey(key_prefix_, {0, 0}, &parsed_key_.buf_);
+ OP_REQUIRES_OK(ctx, Rendezvous::ParseKey(parsed_key_.buf_, &parsed_key_));
}
void SendOp::Compute(OpKernelContext* ctx) {
OP_REQUIRES(
ctx, ctx->rendezvous() != nullptr,
errors::Internal("Op kernel context needs to provide a rendezvous."));
- Rendezvous::ParsedKey parsed;
- GetRendezvousKey(key_prefix_, ctx->frame_iter(), &parsed.buf_);
- VLOG(2) << "Send " << parsed.buf_;
-
- OP_REQUIRES_OK(ctx, Rendezvous::ParseKey(parsed.buf_, &parsed));
// The device context may be passed between the Send/Recv
// boundary, so that the device context used to produce the Tensor
@@ -71,8 +70,24 @@ void SendOp::Compute(OpKernelContext* ctx) {
Rendezvous::Args args;
args.device_context = ctx->op_device_context();
args.alloc_attrs = ctx->input_alloc_attr(0);
- OP_REQUIRES_OK(ctx, ctx->rendezvous()->Send(parsed, args, ctx->input(0),
- ctx->is_input_dead()));
+
+ if (ctx->frame_iter() == FrameAndIter(0, 0)) {
+ // Use the cached rendezvous key.
+ VLOG(2) << "Send " << parsed_key_.buf_;
+ OP_REQUIRES_OK(ctx,
+ ctx->rendezvous()->Send(parsed_key_, args, ctx->input(0),
+ ctx->is_input_dead()));
+ } else {
+ Rendezvous::ParsedKey in_loop_parsed;
+ GetRendezvousKey(key_prefix_, ctx->frame_iter(), &in_loop_parsed.buf_);
+ VLOG(2) << "Send " << in_loop_parsed.buf_;
+ OP_REQUIRES_OK(ctx,
+ Rendezvous::ParseKey(in_loop_parsed.buf_, &in_loop_parsed));
+
+ OP_REQUIRES_OK(ctx,
+ ctx->rendezvous()->Send(in_loop_parsed, args, ctx->input(0),
+ ctx->is_input_dead()));
+ }
}
REGISTER_KERNEL_BUILDER(Name("_Send").Device(DEVICE_CPU), SendOp);
@@ -101,17 +116,16 @@ RecvOp::RecvOp(OpKernelConstruction* ctx) : AsyncOpKernel(ctx) {
OP_REQUIRES_OK(ctx, ctx->GetAttr("tensor_name", &tensor_name));
key_prefix_ = GetRendezvousKeyPrefix(send_device, recv_device,
send_device_incarnation, tensor_name);
+ // The vast majority of Recv nodes are outside any loop context, so
+ // proactively cache the rendezvous key for the top-level.
+ GetRendezvousKey(key_prefix_, {0, 0}, &parsed_key_.buf_);
+ OP_REQUIRES_OK(ctx, Rendezvous::ParseKey(parsed_key_.buf_, &parsed_key_));
}
void RecvOp::ComputeAsync(OpKernelContext* ctx, DoneCallback done) {
OP_REQUIRES(
ctx, ctx->rendezvous() != nullptr,
errors::Internal("Op kernel context needs to provide a rendezvous."));
- Rendezvous::ParsedKey parsed;
- GetRendezvousKey(key_prefix_, ctx->frame_iter(), &parsed.buf_);
- VLOG(2) << "Recv " << parsed.buf_;
-
- OP_REQUIRES_OK_ASYNC(ctx, Rendezvous::ParseKey(parsed.buf_, &parsed), done);
Rendezvous::Args args;
args.device_context = ctx->op_device_context();
@@ -136,7 +150,19 @@ void RecvOp::ComputeAsync(OpKernelContext* ctx, DoneCallback done) {
done();
},
std::move(done), _1, _2, _3, _4, _5);
- ctx->rendezvous()->RecvAsync(parsed, args, std::move(done_cb));
+
+ if (ctx->frame_iter() == FrameAndIter(0, 0)) {
+ VLOG(2) << "Recv " << parsed_key_.buf_;
+ ctx->rendezvous()->RecvAsync(parsed_key_, args, std::move(done_cb));
+ } else {
+ Rendezvous::ParsedKey in_loop_parsed;
+ GetRendezvousKey(key_prefix_, ctx->frame_iter(), &in_loop_parsed.buf_);
+ VLOG(2) << "Recv " << in_loop_parsed.buf_;
+ OP_REQUIRES_OK_ASYNC(
+ ctx, Rendezvous::ParseKey(in_loop_parsed.buf_, &in_loop_parsed), done);
+
+ ctx->rendezvous()->RecvAsync(in_loop_parsed, args, std::move(done_cb));
+ }
}
REGISTER_KERNEL_BUILDER(Name("_Recv").Device(DEVICE_CPU), RecvOp);
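
The caching is only sound for the root frame because GetRendezvousKey folds the (frame_id, iter_id) pair from ctx->frame_iter() into the key string; inside a while_loop each iteration yields a different key, so those calls still take the parse-per-call path shown in the else branches above.
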
diff --git a/tensorflow/core/kernels/sendrecv_ops.h b/tensorflow/core/kernels/sendrecv_ops.h
index 6e91422682..67867e3308 100644
--- a/tensorflow/core/kernels/sendrecv_ops.h
+++ b/tensorflow/core/kernels/sendrecv_ops.h
@@ -28,6 +28,7 @@ class SendOp : public OpKernel {
private:
string key_prefix_;
+ Rendezvous::ParsedKey parsed_key_;
TF_DISALLOW_COPY_AND_ASSIGN(SendOp);
};
@@ -39,6 +40,7 @@ class RecvOp : public AsyncOpKernel {
private:
string key_prefix_;
+ Rendezvous::ParsedKey parsed_key_;
TF_DISALLOW_COPY_AND_ASSIGN(RecvOp);
};
diff --git a/tensorflow/core/kernels/sendrecv_ops_test.cc b/tensorflow/core/kernels/sendrecv_ops_test.cc
new file mode 100644
index 0000000000..092a29f2f3
--- /dev/null
+++ b/tensorflow/core/kernels/sendrecv_ops_test.cc
@@ -0,0 +1,74 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+
+namespace tensorflow {
+
+namespace {
+
+// Implement a trivial version of the Rendezvous interface, to avoid
+// clouding the benchmark results with the time spent in the various
+// implementations, and to avoid the duplicate-send or duplicate-recv
+// errors that would arise from running either benchmark in a loop.
+class DummyRendezvous : public Rendezvous {
+ Status Send(const ParsedKey& key, const Args& args, const Tensor& val,
+ const bool is_dead) override {
+ return Status::OK();
+ }
+ void RecvAsync(const ParsedKey& key, const Args& args,
+ DoneCallback done) override {
+ static Tensor* t = new Tensor(DT_FLOAT, TensorShape({0}));
+ done(Status::OK(), args, args, *t, false);
+ }
+ void StartAbort(const Status& status) override {}
+};
+
+static Graph* Send() {
+ Graph* g = new Graph(OpRegistry::Global());
+ Tensor in0(DT_FLOAT, TensorShape({0}));
+ test::graph::Send(g, test::graph::Constant(g, in0), "T", "/cpu:0", 1,
+ "/cpu:0");
+ test::graph::Recv(g, "T", "float", "/cpu:0", 1, "/cpu:0");
+ return g;
+}
+
+static Graph* Recv() {
+ Graph* g = new Graph(OpRegistry::Global());
+ test::graph::Recv(g, "T", "float", "/cpu:0", 1, "/cpu:0");
+ return g;
+}
+
+static void BM_Send(int iters) {
+ testing::UseRealTime();
+ testing::ItemsProcessed(static_cast<int64>(iters));
+ test::Benchmark("cpu", Send(), nullptr, nullptr, new DummyRendezvous)
+ .Run(iters);
+}
+BENCHMARK(BM_Send);
+
+static void BM_Recv(int iters) {
+ testing::UseRealTime();
+ testing::ItemsProcessed(static_cast<int64>(iters));
+ test::Benchmark("cpu", Recv(), nullptr, nullptr, new DummyRendezvous)
+ .Run(iters);
+}
+BENCHMARK(BM_Recv);
+
+} // namespace
+} // namespace tensorflow
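
Like the other kernel micro-benchmarks in this directory, BM_Send and BM_Recv only run when the benchmark filter flag is supplied to the test binary, typically via something like bazel run -c opt //tensorflow/core/kernels:sendrecv_ops_test -- --benchmarks=all (invocation shown for illustration); a plain bazel test of the target exercises only the empty gtest portion.
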
diff --git a/tensorflow/core/lib/io/inputbuffer_test.cc b/tensorflow/core/lib/io/inputbuffer_test.cc
index 4571df9a98..6771697a16 100644
--- a/tensorflow/core/lib/io/inputbuffer_test.cc
+++ b/tensorflow/core/lib/io/inputbuffer_test.cc
@@ -298,9 +298,9 @@ TEST(InputBuffer, ReadVarint32) {
// Generates data.
std::vector<uint32> data;
uint32 i = 0;
- for (; i < (1 << 10); i += 1) data.push_back(i);
- for (; i < (1 << 15); i += 5) data.push_back(i);
- for (; i < (1 << 31); i += 132817) data.push_back(i);
+ for (; i < (1U << 10); i += 1) data.push_back(i);
+ for (; i < (1U << 15); i += 5) data.push_back(i);
+ for (; i < (1U << 31); i += 132817) data.push_back(i);
data.push_back(std::numeric_limits<uint32>::max());
// Writes the varints.
diff --git a/tensorflow/core/lib/io/zlib_inputstream.cc b/tensorflow/core/lib/io/zlib_inputstream.cc
index 3de157a1fc..4999d5cc90 100644
--- a/tensorflow/core/lib/io/zlib_inputstream.cc
+++ b/tensorflow/core/lib/io/zlib_inputstream.cc
@@ -58,16 +58,14 @@ void ZlibInputStream::InitZlibBuffer() {
z_stream_->avail_in = 0;
int status = inflateInit2(z_stream_.get(), zlib_options_.window_bits);
- if (status != Z_OK) {
- LOG(FATAL) << "inflateInit failed with status " << status;
- z_stream_.reset(nullptr);
- } else {
- z_stream_->next_in = z_stream_input_.get();
- z_stream_->next_out = z_stream_output_.get();
- next_unread_byte_ = reinterpret_cast<char*>(z_stream_output_.get());
- z_stream_->avail_in = 0;
- z_stream_->avail_out = output_buffer_capacity_;
- }
+
+ CHECK_EQ(status, Z_OK) << "inflateInit failed with status " << status;
+
+ z_stream_->next_in = z_stream_input_.get();
+ z_stream_->next_out = z_stream_output_.get();
+ next_unread_byte_ = reinterpret_cast<char*>(z_stream_output_.get());
+ z_stream_->avail_in = 0;
+ z_stream_->avail_out = output_buffer_capacity_;
}
Status ZlibInputStream::ReadFromStream() {
diff --git a/tensorflow/core/lib/lmdb/testdata/data.mdb b/tensorflow/core/lib/lmdb/testdata/data.mdb
new file mode 100644
index 0000000000..3ea75699cb
--- /dev/null
+++ b/tensorflow/core/lib/lmdb/testdata/data.mdb
Binary files differ
diff --git a/tensorflow/core/ops/array_ops_test.cc b/tensorflow/core/ops/array_ops_test.cc
index a7b4422bab..39a3b42a87 100644
--- a/tensorflow/core/ops/array_ops_test.cc
+++ b/tensorflow/core/ops/array_ops_test.cc
@@ -175,7 +175,11 @@ TEST(ArrayOpsTest, Identity_ShapeFnHandles) {
TF_ASSERT_OK(c.construction_status());
ASSERT_TRUE(op_reg_data->shape_inference_fn != nullptr);
TF_ASSERT_OK(c.Run(op_reg_data->shape_inference_fn));
- EXPECT_TRUE(c.output_handle_dtype(0) == DT_BOOL);
+
+ const auto* shapes_and_types = c.output_handle_shapes_and_types(0);
+ ASSERT_TRUE(shapes_and_types != nullptr);
+ ASSERT_EQ(1, shapes_and_types->size());
+ EXPECT_EQ((*shapes_and_types)[0].dtype, DT_BOOL);
}
TEST(ArrayOpsTest, Diag_ShapeFn) {
diff --git a/tensorflow/core/ops/io_ops.cc b/tensorflow/core/ops/io_ops.cc
index 0bce6fc0ea..b7afab84a1 100644
--- a/tensorflow/core/ops/io_ops.cc
+++ b/tensorflow/core/ops/io_ops.cc
@@ -520,6 +520,21 @@ shared_name: If non-empty, this reader is named in the given bucket
with this shared_name. Otherwise, the node name is used instead.
)doc");
+REGISTER_OP("LMDBReader")
+ .Output("reader_handle: Ref(string)")
+ .Attr("container: string = ''")
+ .Attr("shared_name: string = ''")
+ .SetIsStateful()
+ .SetShapeFn(TwoElementOutput)
+ .Doc(R"doc(
+A Reader that outputs the records from an LMDB file.
+reader_handle: The handle to reference the Reader.
+container: If non-empty, this reader is placed in the given container.
+ Otherwise, a default container is used.
+shared_name: If non-empty, this reader is named in the given bucket
+ with this shared_name. Otherwise, the node name is used instead.
+)doc");
+
// TODO(cwhipkey): mark this deprecated in favor of V2.
REGISTER_OP("IdentityReader")
.Output("reader_handle: Ref(string)")
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index d82e98beb9..997b7de3d9 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -26050,6 +26050,33 @@ op {
is_stateful: true
}
op {
+ name: "LMDBReader"
+ output_arg {
+ name: "reader_handle"
+ description: "The handle to reference the Reader."
+ type: DT_STRING
+ is_ref: true
+ }
+ attr {
+ name: "container"
+ type: "string"
+ default_value {
+ s: ""
+ }
+ description: "If non-empty, this reader is placed in the given container.\nOtherwise, a default container is used."
+ }
+ attr {
+ name: "shared_name"
+ type: "string"
+ default_value {
+ s: ""
+ }
+ description: "If non-empty, this reader is named in the given bucket\nwith this shared_name. Otherwise, the node name is used instead."
+ }
+  summary: "A Reader that outputs the records from an LMDB database."
+ is_stateful: true
+}
+op {
name: "TakeDataset"
input_arg {
name: "input_dataset"
diff --git a/tensorflow/python/debug/BUILD b/tensorflow/python/debug/BUILD
index 615ca85ab7..39446b6ca2 100644
--- a/tensorflow/python/debug/BUILD
+++ b/tensorflow/python/debug/BUILD
@@ -546,6 +546,22 @@ cuda_py_test(
tags = ["notsan"],
)
+cuda_py_test(
+ name = "session_debug_multi_gpu_test",
+ size = "small",
+ srcs = ["lib/session_debug_multi_gpu_test.py"],
+ additional_deps = [
+ ":debug_data",
+ ":debug_utils",
+ "//tensorflow/python:client",
+ "//tensorflow/python:framework_for_generated_wrappers",
+ "//tensorflow/python:framework_test_lib",
+ "//tensorflow/python:math_ops",
+ "//tensorflow/python:platform_test",
+ "//tensorflow/python:variables",
+ ],
+)
+
py_test(
name = "debugger_cli_common_test",
size = "small",
diff --git a/tensorflow/python/debug/lib/debug_data.py b/tensorflow/python/debug/lib/debug_data.py
index 35e9fef29f..748e5d78ba 100644
--- a/tensorflow/python/debug/lib/debug_data.py
+++ b/tensorflow/python/debug/lib/debug_data.py
@@ -19,10 +19,12 @@ from __future__ import division
from __future__ import print_function
import collections
+import glob
import json
import os
import numpy as np
+import six
from six.moves import xrange # pylint: disable=redefined-builtin
from tensorflow.core.framework import graph_pb2
@@ -32,9 +34,12 @@ from tensorflow.python.framework import tensor_util
from tensorflow.python.platform import gfile
+# TODO(cais): Tie these string constants in with C++?
METADATA_FILE_PREFIX = "_tfdbg_"
CORE_METADATA_TAG = "core_metadata_"
GRAPH_FILE_TAG = "graph_"
+DEVICE_TAG = "device_"
+
FETCHES_INFO_FILE_TAG = "fetches_info_"
FEED_KEYS_INFO_FILE_TAG = "feed_keys_info_"
@@ -158,10 +163,6 @@ def parse_node_or_tensor_name(name):
return name, None
-def _is_core_metadata_file(file_name):
- return file_name.startswith(METADATA_FILE_PREFIX + CORE_METADATA_TAG)
-
-
def _is_graph_file(file_name):
return file_name.startswith(METADATA_FILE_PREFIX + GRAPH_FILE_TAG)
@@ -344,6 +345,28 @@ def extract_core_metadata_from_event_proto(event):
json_metadata["target_nodes"])
+def device_name_to_device_path(device_name):
+ """Convert device name to device path."""
+ device_name_items = device_name.split("/")
+ device_name_items = [item.replace(":", "_") for item in device_name_items]
+ return METADATA_FILE_PREFIX + DEVICE_TAG + ",".join(device_name_items)
+
+
+def device_path_to_device_name(device_dir):
+ """Parse device name from device path.
+
+ Args:
+ device_dir: (str) a directory name for the device.
+
+ Returns:
+ (str) parsed device name.
+ """
+ path_items = os.path.basename(device_dir)[
+ len(METADATA_FILE_PREFIX) + len(DEVICE_TAG):].split(",")
+ return "/".join([
+ path_item.replace("_", ":", 1) for path_item in path_items])
+
+
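+
+Concretely, for an illustrative legacy-style device name the round trip between the two helpers above looks like:
+
+    /job:localhost/replica:0/task:0/gpu:0
+        <->  _tfdbg_device_,job_localhost,replica_0,task_0,gpu_0
+
+Every ':' in a path component becomes '_' on the way in, only the first '_' of each component is turned back into ':' on the way out, and ',' serves as the separator (the leading ',' comes from the empty component in front of the device name's first '/'). This device-directory prefix is the same one the updated debug_ops_test.cc scans for above.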
class DebugTensorDatum(object):
"""A single tensor dumped by TensorFlow Debugger (tfdbg).
@@ -360,13 +383,17 @@ class DebugTensorDatum(object):
"""`DebugTensorDatum` constructor.
Args:
- dump_root: (`str`) Debug dump root directory.
+ dump_root: (`str`) Debug dump root directory. This path should not include
+ the path component that represents the device name (see also below).
debug_dump_rel_path: (`str`) Path to a debug dump file, relative to the
- `dump_root`. For example, suppose the debug dump root
- directory is `/tmp/tfdbg_1` and the dump file is at
- `/tmp/tfdbg_1/ns_1/node_a_0_DebugIdentity_123456789`, then
- the value of the debug_dump_rel_path should be
- `ns_1/node_a_0_DebugIdenity_1234456789`.
+ `dump_root`. The first item of this relative path is assumed to be
+ a path representing the name of the device that the Tensor belongs to.
+ See `device_path_to_device_name` for more details on the device path.
+ For example, suppose the debug dump root
+ directory is `/tmp/tfdbg_1` and the dump file is at
+        `/tmp/tfdbg_1/<device_path>/ns_1/node_a_0_DebugIdentity_123456789`,
+ then the value of the debug_dump_rel_path should be
+        `<device_path>/ns_1/node_a_0_DebugIdentity_123456789`.
Raises:
ValueError: If the base file name of the dump file does not conform to
@@ -374,15 +401,13 @@ class DebugTensorDatum(object):
`node_name`_`output_slot`_`debug_op`_`timestamp`
"""
- base = os.path.basename(debug_dump_rel_path)
-
+ path_components = os.path.normpath(debug_dump_rel_path).split(os.sep)
+ self._device_name = device_path_to_device_name(path_components[0])
+ base = path_components[-1]
if base.count("_") < 3:
raise ValueError(
"Dump file path does not conform to the naming pattern: %s" % base)
- # TODO(cais): Add hostname and pid to support dumps from distributed
- # sessions.
-
self._extended_timestamp = base.split("_")[-1]
# It may include an index suffix at the end if file path collision happened
# due to identical timestamps.
@@ -395,31 +420,23 @@ class DebugTensorDatum(object):
self._debug_op = base.split("_")[-2]
self._output_slot = int(base.split("_")[-3])
- namespace = os.path.dirname(debug_dump_rel_path).replace("\\", "/")
node_base_name = "_".join(base.split("_")[:-3])
- if not namespace or namespace == ".":
- self._node_name = node_base_name
- else:
- self._node_name = namespace + "/" + node_base_name
+ self._node_name = "/".join(path_components[1:-1] + [node_base_name])
self._file_path = os.path.join(dump_root, debug_dump_rel_path)
self._dump_size_bytes = (gfile.Stat(self._file_path).length if
gfile.Exists(self._file_path) else None)
- self._run_fetches_info = None
- self._run_feed_keys_info = None
-
def __str__(self):
- return "{DebugTensorDatum: %s:%d @ %s @ %d}" % (self.node_name,
- self.output_slot,
- self.debug_op,
- self.timestamp)
+ return "{DebugTensorDatum (%s) %s:%d @ %s @ %d}" % (self.device_name,
+ self.node_name,
+ self.output_slot,
+ self.debug_op,
+ self.timestamp)
def __repr__(self):
return self.__str__()
- # TODO(cais): (b/38325442) Add device name information to this class.
-
def get_tensor(self):
"""Get tensor from the dump (`Event`) file.
@@ -465,6 +482,16 @@ class DebugTensorDatum(object):
return self._debug_op
@property
+ def device_name(self):
+ """Name of the device that the tensor belongs to.
+
+ Returns:
+ (`str`) device name.
+ """
+
+ return self._device_name
+
+ @property
def node_name(self):
"""Name of the node from which the tensor value was dumped.
@@ -529,6 +556,8 @@ class WatchKeyDoesNotExistInDebugDumpDirError(ValueError):
pass
+# TODO(cais): This class is getting too large in line count. Refactor to make it
+# smaller and easier to maintain.
class DebugDumpDir(object):
"""Data set from a debug-dump directory on filesystem.
@@ -548,23 +577,54 @@ class DebugDumpDir(object):
Raises:
IOError: If dump_root does not exist as a directory.
+ ValueError: If more than one core metadata file is found under the dump
+ root directory.
"""
if not gfile.IsDirectory(dump_root):
raise IOError("Dump root directory %s does not exist" % dump_root)
- self._core_metadata = None
- self._load_dumps(dump_root)
- self._create_tensor_watch_maps()
- self._load_partition_graphs(partition_graphs, validate)
+ self._core_metadata = []
+
+ # Find the list of devices.
+ self._dump_root = dump_root
+
+ self._load_core_metadata()
+ self._load_fetches_info()
+ self._load_feeds_info()
+ self._load_all_device_dumps(partition_graphs, validate)
self._python_graph = None
- def _load_dumps(self, dump_root):
- """Load `DebugTensorDatum` instances from the dump root.
+ def _load_all_device_dumps(self, partition_graphs, validate):
+ """Load the dump data for all devices."""
+ device_dirs = glob.glob(os.path.join(
+ self._dump_root, METADATA_FILE_PREFIX + DEVICE_TAG + "*"))
+
+ self._device_names = []
+ self._t0s = {}
+ self._dump_tensor_data = {}
+ self._dump_graph_file_paths = {}
+ self._debug_watches = {}
+ self._watch_key_to_devices = {}
+ self._watch_key_to_datum = {}
+ self._watch_key_to_rel_time = {}
+ self._watch_key_to_dump_size_bytes = {}
+ for device_dir in device_dirs:
+ device_name = device_path_to_device_name(device_dir)
+ self._device_names.append(device_name)
+ self._load_device_dumps(device_name, device_dir)
+ self._load_partition_graphs(partition_graphs, validate)
+ self._calculate_t0()
+
+ for device_name in self._device_names:
+ self._create_tensor_watch_maps(device_name)
- Populates a list of `DebugTensorDatum` instance and sorts the list by
- ascending timestamp.
+ def _load_device_dumps(self, device_name, device_root):
+ """Load `DebugTensorDatum` instances from the dump root of a given device.
+
+ Populates a map {device_name: a list of `DebugTensorDatum`}, where the list
+ is sorted by ascending timestamp.
This sorting order reflects the order in which the TensorFlow executor
processed the nodes of the graph. It is (one of many possible) topological
@@ -584,55 +644,67 @@ class DebugDumpDir(object):
graphs may not be available, e.g., when the run errors out.
Args:
- dump_root: (`str`) Dump root directory.
- """
+ device_name: (`str`) name of the device.
+ device_root: (`str`) dump root directory of the given device.
- self._dump_root = dump_root
- self._dump_tensor_data = []
- self._dump_graph_file_paths = []
+ Raises:
+ ValueError: If GraphDef for the device is not available.
+ """
- self._debug_watches = collections.defaultdict(
+ self._dump_tensor_data[device_name] = []
+ self._debug_watches[device_name] = collections.defaultdict(
lambda: collections.defaultdict(set))
- for root, _, files in gfile.Walk(self._dump_root):
+ for root, _, files in gfile.Walk(device_root):
for f in files:
- if f.startswith(METADATA_FILE_PREFIX):
- if _is_core_metadata_file(f):
- self._load_core_metadata(os.path.join(self._dump_root, root, f))
-
- if _is_graph_file(f):
- self._dump_graph_file_paths.append(
- os.path.join(self._dump_root, root, f))
-
- if _is_run_fetches_info_file(f):
- self._run_fetches_info = _load_log_message_from_event_file(
- os.path.join(root, f))
-
- if _is_run_feed_keys_info_file(f):
- self._run_feed_keys_info = _load_log_message_from_event_file(
- os.path.join(root, f))
-
- continue
-
- datum = self._dump_file_name_to_datum(root, f)
- self._dump_tensor_data.append(datum)
-
- self._debug_watches[datum.node_name][datum.output_slot].add(
- datum.debug_op)
-
- self._dump_tensor_data = sorted(
- self._dump_tensor_data, key=lambda x: x.extended_timestamp)
-
- if self._dump_tensor_data:
- self._t0 = self._dump_tensor_data[0].timestamp
+ if _is_graph_file(f):
+ self._dump_graph_file_paths[device_name] = os.path.join(
+ device_root, root, f)
+ else:
+ datum = self._dump_file_name_to_datum(root, f)
+ self._dump_tensor_data[device_name].append(datum)
+ self._debug_watches[device_name][datum.node_name][
+ datum.output_slot].add(datum.debug_op)
+
+ self._dump_tensor_data[device_name] = sorted(
+ self._dump_tensor_data[device_name],
+ key=lambda x: x.extended_timestamp)
+
+ if self._dump_tensor_data[device_name]:
+ self._t0s[device_name] = self._dump_tensor_data[device_name][0].timestamp
else:
- self._t0 = None
-
- def _load_core_metadata(self, event_file_path):
- event = event_pb2.Event()
- with gfile.Open(event_file_path, "rb") as f:
- event.ParseFromString(f.read())
- self._core_metadata = extract_core_metadata_from_event_proto(event)
+ self._t0s[device_name] = None
+
+ def _calculate_t0(self):
+ """Calculate the first timestamp across all devices."""
+ t0s = [t0 for t0 in six.itervalues(self._t0s) if t0 is not None]
+ self._t0 = min(t0s) if t0s else None
+
+ def _load_core_metadata(self):
+ core_metadata_files = glob.glob(os.path.join(
+ self._dump_root, METADATA_FILE_PREFIX + CORE_METADATA_TAG + "*"))
+ for core_metadata_file in core_metadata_files:
+ with gfile.Open(core_metadata_file, "rb") as f:
+ event = event_pb2.Event()
+ event.ParseFromString(f.read())
+ self._core_metadata.append(
+ extract_core_metadata_from_event_proto(event))
+
+ def _load_fetches_info(self):
+ fetches_info_files = glob.glob(os.path.join(
+ self._dump_root, METADATA_FILE_PREFIX + FETCHES_INFO_FILE_TAG + "*"))
+ self._run_fetches_info = []
+ for fetches_info_file in fetches_info_files:
+ self._run_fetches_info.append(
+ _load_log_message_from_event_file(fetches_info_file))
+
+ def _load_feeds_info(self):
+ feeds_info_files = glob.glob(os.path.join(
+ self._dump_root, METADATA_FILE_PREFIX + FEED_KEYS_INFO_FILE_TAG + "*"))
+ self._run_feed_keys_info = []
+ for feeds_info_file in feeds_info_files:
+ self._run_feed_keys_info.append(
+ _load_log_message_from_event_file(feeds_info_file))
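For orientation, the dump root these loaders walk is assumed to look roughly like the following after a run on one CPU and one GPU (names are assembled from the prefix/tag constants used in the globs above; the timestamp is illustrative):

    <dump_root>/
      <METADATA_FILE_PREFIX><CORE_METADATA_TAG>...       one Event file per Session.run call
      <METADATA_FILE_PREFIX><FETCHES_INFO_FILE_TAG>...
      <METADATA_FILE_PREFIX><FEED_KEYS_INFO_FILE_TAG>...
      <METADATA_FILE_PREFIX><DEVICE_TAG>,job_localhost,replica_0,task_0,cpu_0/
        <one graph file recognized by _is_graph_file()>
        v_0_DebugIdentity_1472563253536385               one file per dumped tensor
      <METADATA_FILE_PREFIX><DEVICE_TAG>,job_localhost,replica_0,task_0,gpu_0/
        ...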
def _dump_file_name_to_datum(self, dir_name, file_name):
"""Obtain a DebugTensorDatum from the directory and file name.
@@ -648,34 +720,39 @@ class DebugDumpDir(object):
# Calculate the relative path of the dump file with respect to the root.
debug_dump_rel_path = os.path.join(
os.path.relpath(dir_name, self._dump_root), file_name)
-
return DebugTensorDatum(self._dump_root, debug_dump_rel_path)
- def _create_tensor_watch_maps(self):
+ def _create_tensor_watch_maps(self, device_name):
"""Create maps from tensor watch keys to datum and to timestamps.
Create a map from watch key (tensor name + debug op) to `DebugTensorDatum`
item. Also make a map from watch key to relative timestamp.
"relative" means (absolute timestamp - t0).
+
+ Args:
+ device_name: (str) name of the device.
"""
- self._watch_key_to_datum = {}
- self._watch_key_to_rel_time = {}
- self._watch_key_to_dump_size_bytes = {}
- for datum in self._dump_tensor_data:
- if datum.watch_key not in self._watch_key_to_datum:
- self._watch_key_to_datum[datum.watch_key] = [datum]
- self._watch_key_to_rel_time[datum.watch_key] = [
- datum.timestamp - self._t0
- ]
- self._watch_key_to_dump_size_bytes[datum.watch_key] = [
- datum.dump_size_bytes
- ]
+ self._watch_key_to_datum[device_name] = {}
+ self._watch_key_to_rel_time[device_name] = {}
+ self._watch_key_to_dump_size_bytes[device_name] = {}
+ for datum in self._dump_tensor_data[device_name]:
+ if datum.watch_key not in self._watch_key_to_devices:
+ self._watch_key_to_devices[datum.watch_key] = {device_name}
+ else:
+ self._watch_key_to_devices[datum.watch_key].add(device_name)
+
+ if datum.watch_key not in self._watch_key_to_datum[device_name]:
+ self._watch_key_to_datum[device_name][datum.watch_key] = [datum]
+ self._watch_key_to_rel_time[device_name][datum.watch_key] = [
+ datum.timestamp - self._t0]
+ self._watch_key_to_dump_size_bytes[device_name][datum.watch_key] = [
+ datum.dump_size_bytes]
else:
- self._watch_key_to_datum[datum.watch_key].append(datum)
- self._watch_key_to_rel_time[datum.watch_key].append(datum.timestamp -
- self._t0)
- self._watch_key_to_dump_size_bytes[datum.watch_key].append(
+ self._watch_key_to_datum[device_name][datum.watch_key].append(datum)
+ self._watch_key_to_rel_time[device_name][datum.watch_key].append(
+ datum.timestamp - self._t0)
+ self._watch_key_to_dump_size_bytes[device_name][datum.watch_key].append(
datum.dump_size_bytes)
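The per-device maps built above end up with the following shape (device and watch-key names are illustrative):

    # self._watch_key_to_datum = {
    #     "/job:localhost/replica:0/task:0/gpu:0": {
    #         "v:0:DebugIdentity": [<DebugTensorDatum>, ...],
    #     },
    # }
    # _watch_key_to_rel_time and _watch_key_to_dump_size_bytes use the same
    # two-level {device_name: {watch_key: [...]}} layout, while
    # _watch_key_to_devices maps each watch key to the set of device names on
    # which it was dumped.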
def set_python_graph(self, python_graph):
@@ -733,22 +810,32 @@ class DebugDumpDir(object):
`output_names`: Names of the output (fetched) Tensors.
`target_nodes`: Names of the target nodes.
If the core metadata have not been loaded, `None`.
+      If more than one core metadata file exists, returns a list of the
+      `namedtuple`s described above.
"""
- return self._core_metadata
+ output = self._core_metadata
+ return output[0] if len(output) == 1 else output
@property
def dumped_tensor_data(self):
- return self._dump_tensor_data
+ """Retrieve dumped tensor data."""
+ if len(self.devices()) == 1:
+ return self._dump_tensor_data[self.devices()[0]]
+ else:
+ all_devices_data = six.itervalues(self._dump_tensor_data)
+ data = []
+ for device_data in all_devices_data:
+ data.extend(device_data)
+ return sorted(data, key=lambda x: x.extended_timestamp)
@property
def t0(self):
- """Absolute timestamp of the first dumped tensor.
+ """Absolute timestamp of the first dumped tensor across all devices.
Returns:
(`int`) absolute timestamp of the first dumped tensor, in microseconds.
"""
-
return self._t0
@property
@@ -756,10 +843,10 @@ class DebugDumpDir(object):
"""Total number of dumped tensors in the dump root directory.
Returns:
- (`int`) total number of dumped tensors in the dump root directory.
+ (`int`) The total number of dumped tensors in the dump root directory.
"""
-
- return len(self._dump_tensor_data)
+ return sum(len(self._dump_tensor_data[device_name])
+ for device_name in self._dump_tensor_data)
def _load_partition_graphs(self, partition_graphs, validate):
"""Load and process partition graphs.
@@ -770,56 +857,73 @@ class DebugDumpDir(object):
tensor dumps.
Args:
- partition_graphs: Partition graphs executed by the TensorFlow runtime,
- represented as repeated fields of GraphDef.
- If no partition_graph is available, use None.
+ partition_graphs: A repeated field of GraphDefs representing the
+ partition graphs executed by the TensorFlow runtime.
validate: (`bool`) Whether the dump files are to be validated against the
partition graphs.
- """
- if partition_graphs:
- self._partition_graphs = partition_graphs
- elif self._dump_graph_file_paths:
- # In case partition graphs are not available from arguments, load them
- # from the dump directory.
- self._partition_graphs = [
- _load_graph_def_from_event_file(dump_file_path)
- for dump_file_path in self._dump_graph_file_paths
- ]
- else:
- self._partition_graphs = None
- return
+ Raises:
+      ValueError: If the partition GraphDef of one or more devices fails to
+        be loaded.
+ """
self._node_attributes = {}
-
self._node_inputs = {}
self._node_ctrl_inputs = {}
-
self._node_recipients = {}
self._node_ctrl_recipients = {}
-
- self._devices = []
self._node_devices = {}
self._node_op_types = {}
+ self._copy_send_nodes = {}
+
+ self._partition_graphs = {}
+ for device_name in self._device_names:
+ partition_graph = None
+ if device_name in self._dump_graph_file_paths:
+ partition_graph = _load_graph_def_from_event_file(
+ self._dump_graph_file_paths[device_name])
+ else:
+ partition_graph = self._find_partition_graph(partition_graphs,
+ device_name)
+
+ if partition_graph:
+ self._partition_graphs[device_name] = partition_graph
- self._copy_send_nodes = []
+ self._node_attributes[device_name] = {}
+ self._node_inputs[device_name] = {}
+ self._node_ctrl_inputs[device_name] = {}
+ self._node_recipients[device_name] = {}
+ self._node_ctrl_recipients[device_name] = {}
+ self._node_op_types[device_name] = {}
+ self._copy_send_nodes[device_name] = []
- for pg in self._partition_graphs:
- for node in pg.node:
- self._process_partition_graph_node(node)
+ if partition_graph:
+ for node in partition_graph.node:
+ self._process_partition_graph_node(device_name, node)
- self._prune_non_control_edges_of_debug_ops()
- self._prune_control_edges_of_debug_ops()
+ self._prune_non_control_edges_of_debug_ops(device_name)
+ self._prune_control_edges_of_debug_ops(device_name)
- self._populate_recipient_maps()
+ self._populate_recipient_maps(device_name)
- if validate:
- self._validate_dump_with_graphs()
+ if device_name in self._partition_graphs and validate:
+ self._validate_dump_with_graphs(device_name)
+
+ def _find_partition_graph(self, partition_graphs, device_name):
+ if partition_graphs is None:
+ return None
+ else:
+ for graph_def in partition_graphs:
+ for node_def in graph_def.node:
+ if node_def.device == device_name:
+ return graph_def
+ return None
- def _process_partition_graph_node(self, node):
+ def _process_partition_graph_node(self, device_name, node):
"""Process a node from the partition graphs.
Args:
+ device_name: (str) device name.
node: (NodeDef) A partition-graph node to be processed.
Raises:
@@ -833,84 +937,91 @@ class DebugDumpDir(object):
(watched_node_name, watched_output_slot, _,
debug_op) = parse_debug_node_name(node.name)
- self._debug_watches[watched_node_name][watched_output_slot].add(
- debug_op)
+ self._debug_watches[device_name][watched_node_name][
+ watched_output_slot].add(debug_op)
return
- if node.name in self._node_inputs:
- raise ValueError("Duplicate node name: '%s'" % node.name)
+ if node.name in self._node_inputs[device_name]:
+ raise ValueError("Duplicate node name on device %s: '%s'" %
+ (device_name, node.name))
- self._node_attributes[node.name] = node.attr
+ self._node_attributes[device_name][node.name] = node.attr
- if node.device not in self._devices and node.device:
- self._devices.append(node.device)
+ self._node_inputs[device_name][node.name] = []
+ self._node_ctrl_inputs[device_name][node.name] = []
+ self._node_recipients[device_name][node.name] = []
+ self._node_ctrl_recipients[device_name][node.name] = []
- self._node_inputs[node.name] = []
- self._node_ctrl_inputs[node.name] = []
- self._node_recipients[node.name] = []
- self._node_ctrl_recipients[node.name] = []
-
- self._node_devices[node.name] = node.device
- self._node_op_types[node.name] = node.op
+ if node.name not in self._node_devices:
+ self._node_devices[node.name] = set()
+ self._node_devices[node.name].add(node.device)
+ self._node_op_types[device_name][node.name] = node.op
for inp in node.input:
if is_copy_node(inp) and (node.op == "_Send" or node.op == "_Retval"):
- self._copy_send_nodes.append(node.name)
+ self._copy_send_nodes[device_name].append(node.name)
if inp.startswith("^"):
cinp = inp[1:]
- self._node_ctrl_inputs[node.name].append(cinp)
+ self._node_ctrl_inputs[device_name][node.name].append(cinp)
else:
- self._node_inputs[node.name].append(inp)
+ self._node_inputs[device_name][node.name].append(inp)
- def _prune_nodes_from_input_and_recipient_maps(self, nodes_to_prune):
+ def _prune_nodes_from_input_and_recipient_maps(self,
+ device_name,
+ nodes_to_prune):
"""Prune nodes out of input and recipient maps.
Args:
+ device_name: (`str`) device name.
nodes_to_prune: (`list` of `str`) Names of the nodes to be pruned.
"""
for node in nodes_to_prune:
- del self._node_inputs[node]
- del self._node_ctrl_inputs[node]
- del self._node_recipients[node]
- del self._node_ctrl_recipients[node]
+ del self._node_inputs[device_name][node]
+ del self._node_ctrl_inputs[device_name][node]
+ del self._node_recipients[device_name][node]
+ del self._node_ctrl_recipients[device_name][node]
- def _prune_non_control_edges_of_debug_ops(self):
+ def _prune_non_control_edges_of_debug_ops(self, device_name):
"""Prune (non-control) edges related to debug ops.
Prune the Copy ops and associated _Send ops inserted by the debugger out
from the non-control inputs and output recipients map. Replace the inputs
and recipients with original ones.
+
+ Args:
+ device_name: (`str`) device name.
"""
copy_nodes = []
- for node in self._node_inputs:
- if node in self._copy_send_nodes:
+ for node in self._node_inputs[device_name]:
+ if node in self._copy_send_nodes[device_name]:
continue
if is_copy_node(node):
copy_nodes.append(node)
- inputs = self._node_inputs[node]
+ inputs = self._node_inputs[device_name][node]
for i in xrange(len(inputs)):
inp = inputs[i]
if is_copy_node(inp):
# Find the input to the Copy node, which should be the original
# input to the node.
- orig_inp = self._node_inputs[inp][0]
+ orig_inp = self._node_inputs[device_name][inp][0]
inputs[i] = orig_inp
- self._prune_nodes_from_input_and_recipient_maps(copy_nodes)
- self._prune_nodes_from_input_and_recipient_maps(self._copy_send_nodes)
+ self._prune_nodes_from_input_and_recipient_maps(device_name, copy_nodes)
+ self._prune_nodes_from_input_and_recipient_maps(
+ device_name, self._copy_send_nodes[device_name])
- def _prune_control_edges_of_debug_ops(self):
+ def _prune_control_edges_of_debug_ops(self, device_name):
"""Prune control edges related to the debug ops."""
- for node in self._node_ctrl_inputs:
- ctrl_inputs = self._node_ctrl_inputs[node]
+ for node in self._node_ctrl_inputs[device_name]:
+ ctrl_inputs = self._node_ctrl_inputs[device_name][node]
debug_op_inputs = []
for ctrl_inp in ctrl_inputs:
if is_debug_node(ctrl_inp):
@@ -918,33 +1029,36 @@ class DebugDumpDir(object):
for debug_op_inp in debug_op_inputs:
ctrl_inputs.remove(debug_op_inp)
- def _populate_recipient_maps(self):
+ def _populate_recipient_maps(self, device_name):
"""Populate the map from node name to recipient(s) of its output(s)."""
- for node in self._node_inputs:
- inputs = self._node_inputs[node]
+ for node in self._node_inputs[device_name]:
+ inputs = self._node_inputs[device_name][node]
for inp in inputs:
inp = get_node_name(inp)
- if inp not in self._node_recipients:
- self._node_recipients[inp] = []
- self._node_recipients[inp].append(node)
+ if inp not in self._node_recipients[device_name]:
+ self._node_recipients[device_name][inp] = []
+ self._node_recipients[device_name][inp].append(node)
- for node in self._node_ctrl_inputs:
- ctrl_inputs = self._node_ctrl_inputs[node]
+ for node in self._node_ctrl_inputs[device_name]:
+ ctrl_inputs = self._node_ctrl_inputs[device_name][node]
for ctrl_inp in ctrl_inputs:
- if ctrl_inp in self._copy_send_nodes:
+ if ctrl_inp in self._copy_send_nodes[device_name]:
continue
- if ctrl_inp not in self._node_ctrl_recipients:
- self._node_ctrl_recipients[ctrl_inp] = []
- self._node_ctrl_recipients[ctrl_inp].append(node)
+ if ctrl_inp not in self._node_ctrl_recipients[device_name]:
+ self._node_ctrl_recipients[device_name][ctrl_inp] = []
+ self._node_ctrl_recipients[device_name][ctrl_inp].append(node)
- def _validate_dump_with_graphs(self):
+ def _validate_dump_with_graphs(self, device_name):
"""Validate the dumped tensor data against the partition graphs.
Only the watched nodes are validated by this method, because tfdbg allows
clients to watch only a subset of the nodes.
+ Args:
+ device_name: (`str`) device name.
+
Raises:
LookupError: If the partition graphs have not been loaded yet.
ValueError: If dumps contain node names not found in partition graph.
@@ -952,33 +1066,35 @@ class DebugDumpDir(object):
input relations on the partition graphs.
"""
- if not self._partition_graphs:
- raise LookupError("No partition graphs loaded.")
+ if not self._partition_graphs[device_name]:
+ raise LookupError(
+ "No partition graphs loaded for device %s" % device_name)
# Verify that the node names in the dump data are all present in the
# partition graphs.
- for datum in self._dump_tensor_data:
- if datum.node_name not in self._node_inputs:
- raise ValueError("Node name '%s' is not found in partition graphs." %
- datum.node_name)
+ for datum in self._dump_tensor_data[device_name]:
+ if datum.node_name not in self._node_inputs[device_name]:
+ raise ValueError("Node name '%s' is not found in partition graphs of "
+ "device %s." % (datum.node_name, device_name))
pending_inputs = {}
- for node in self._node_inputs:
+ for node in self._node_inputs[device_name]:
pending_inputs[node] = []
- inputs = self._node_inputs[node]
+ inputs = self._node_inputs[device_name][node]
for inp in inputs:
inp_node = get_node_name(inp)
inp_output_slot = get_output_slot(inp)
# Inputs from Enter and NextIteration nodes are not validated because
# DebugNodeInserter::InsertNodes() in the debugger core skips creating
# control edges from debug ops watching these types of nodes.
- if (inp_node in self._debug_watches and
- inp_output_slot in self._debug_watches[inp_node] and
- self._node_op_types.get(inp) not in ("Enter", "NextIteration") and
+ if (inp_node in self._debug_watches[device_name] and
+ inp_output_slot in self._debug_watches[device_name][inp_node] and
+ self._node_op_types[device_name].get(inp) not in (
+ "Enter", "NextIteration") and
(inp_node, inp_output_slot) not in pending_inputs[node]):
pending_inputs[node].append((inp_node, inp_output_slot))
- for i, datum in enumerate(self._dump_tensor_data):
+ for i, datum in enumerate(self._dump_tensor_data[device_name]):
node = datum.node_name
slot = datum.output_slot
# In some cases (e.g., system clocks with insufficient precision),
@@ -986,13 +1102,13 @@ class DebugDumpDir(object):
# following check examines this possibility and avoids raising an error if
# that is the case.
if not self._satisfied_at_timestamp(
- pending_inputs[node], datum.timestamp, start_i=i + 1):
+ device_name, pending_inputs[node], datum.timestamp, start_i=i + 1):
raise ValueError("Causality violated in timing relations of debug "
"dumps: %s (%d): "
"these input(s) are not satisfied: %s" %
(node, datum.timestamp, repr(pending_inputs[node])))
- recipients = self._node_recipients[node]
+ recipients = self._node_recipients[device_name][node]
for recipient in recipients:
recipient_pending_inputs = pending_inputs[recipient]
if (node, slot) in recipient_pending_inputs:
@@ -1004,12 +1120,13 @@ class DebugDumpDir(object):
del recipient_pending_inputs[
recipient_pending_inputs.index((node, slot))]
- def _satisfied_at_timestamp(self, pending, timestamp, start_i=0):
+ def _satisfied_at_timestamp(self, device_name, pending, timestamp, start_i=0):
"""Determine whether pending inputs are satisfied at given timestamp.
Note: This method mutates the input argument "pending".
Args:
+ device_name: (str) device name.
pending: A list of 2-tuple (node_name, output_slot): the dependencies to
check.
timestamp: (int) the timestamp in question.
@@ -1023,7 +1140,7 @@ class DebugDumpDir(object):
if not pending:
return True
- for datum in self._dump_tensor_data[start_i:]:
+ for datum in self._dump_tensor_data[device_name][start_i:]:
if datum.timestamp > timestamp:
break
if (datum.timestamp == timestamp and
@@ -1042,7 +1159,7 @@ class DebugDumpDir(object):
"""Get the partition graphs.
Returns:
- Partition graphs as repeated fields of GraphDef.
+ Partition graphs as a list of GraphDef.
Raises:
LookupError: If no partition graphs have been loaded.
@@ -1051,50 +1168,90 @@ class DebugDumpDir(object):
if self._partition_graphs is None:
raise LookupError("No partition graphs have been loaded.")
- return self._partition_graphs
+ return self._partition_graphs.values()
@property
def run_fetches_info(self):
"""Get a str representation of the fetches used in the Session.run() call.
Returns:
- If the information is available, a `str` obtained from `repr(fetches)`.
+ If the information is available from one `Session.run` call, a `str`
+ obtained from `repr(fetches)`.
+ If the information is available from multiple `Session.run` calls, a
+ `list` of `str` from `repr(fetches)`.
If the information is not available, `None`.
"""
- return self._run_fetches_info
+ output = self._run_fetches_info
+ return output[0] if len(output) == 1 else output
@property
def run_feed_keys_info(self):
"""Get a str representation of the feed_dict used in the Session.run() call.
Returns:
- If the information is available, a `str` obtained from `repr(feed_dict)`.
+ If the information is available from one `Session.run` call, a `str`
+ obtained from `repr(feed_dict)`.
+ If the information is available from multiple `Session.run` calls, a
+ `list` of `str` obtained from `repr(feed_dict)`.
If the information is not available, `None`.
"""
- return self._run_feed_keys_info
+ output = self._run_feed_keys_info
+ return output[0] if len(output) == 1 else output
+
+ def _infer_device_name(self, device_name, node_name):
+ if device_name is None:
+ if len(self.devices()) == 1:
+ return self.devices()[0]
+ else:
+ if node_name in self._node_devices:
+ if len(self._node_devices[node_name]) == 1:
+ return list(self._node_devices[node_name])[0]
+ else:
+ raise ValueError(
+ "There are multiple (%d) devices with nodes named '%s' but "
+ "device_name is not specified." %
+ (len(self._node_devices[node_name]), node_name))
+ else:
+ raise ValueError("None of the %d devices has a node named '%s'." %
+ (len(self._device_names), node_name))
+ else:
+ return device_name
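Illustrative behavior of the inference above, assuming a dump with devices cpu:0 and gpu:0 and a node "u0" placed only on gpu:0 (names made up):

    # dump._infer_device_name(None, "v")        -> the sole device name, if only
    #                                              one device directory exists
    # dump._infer_device_name(None, "u0")       -> ".../gpu:0", the one device
    #                                              hosting "u0"; ValueError if
    #                                              several devices host a node
    #                                              of that name
    # dump._infer_device_name(".../gpu:0", "u0") -> ".../gpu:0", passed through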
- def nodes(self):
+ def nodes(self, device_name=None):
"""Get a list of all nodes from the partition graphs.
+ Args:
+ device_name: (`str`) name of device. If there is only one device, this
+        argument is optional.
+
Returns:
All nodes' names, as a list of str.
Raises:
LookupError: If no partition graphs have been loaded.
+ ValueError: If there are multiple devices, but device_name is not
+ specified.
"""
-
if self._partition_graphs is None:
raise LookupError("No partition graphs have been loaded.")
+ if device_name is None:
+ if len(self.devices()) == 1:
+ device_name = self.devices()[0]
+ else:
+ raise ValueError(
+ "There are multiple (%d) devices, but "
+ "device_name is not specified." % len(self.devices()))
+ return [node_name for node_name in self._node_inputs[device_name]]
- return [node_name for node_name in self._node_inputs]
-
- def node_attributes(self, node_name):
+ def node_attributes(self, node_name, device_name=None):
"""Get the attributes of a node.
Args:
node_name: Name of the node in question.
+ device_name: (`str`) name of the device. If there is only one device or if
+        node_name exists on only one device, this argument is optional.
Returns:
Attributes of the node.
@@ -1103,22 +1260,25 @@ class DebugDumpDir(object):
LookupError: If no partition graphs have been loaded.
ValueError: If no node named node_name exists.
"""
-
if self._partition_graphs is None:
raise LookupError("No partition graphs have been loaded.")
- if node_name in self._node_attributes:
- return self._node_attributes[node_name]
+ device_name = self._infer_device_name(device_name, node_name)
+ if node_name in self._node_attributes[device_name]:
+ return self._node_attributes[device_name][node_name]
else:
- raise ValueError("No node named \"%s\" exists." % node_name)
+ raise ValueError("No node named \"%s\" exists on device %s." % (
+ node_name, device_name))
- def node_inputs(self, node_name, is_control=False):
+ def node_inputs(self, node_name, is_control=False, device_name=None):
"""Get the inputs of given node according to partition graphs.
Args:
node_name: Name of the node.
is_control: (`bool`) Whether control inputs, rather than non-control
inputs, are to be returned.
+ device_name: (`str`) name of the device. If there is only one device or if
+        node_name exists on only one device, this argument is optional.
Returns:
(`list` of `str`) inputs to the node, as a list of node names.
@@ -1133,21 +1293,27 @@ class DebugDumpDir(object):
raise LookupError(
"Node inputs are not loaded from partition graphs yet.")
- if node_name not in self._node_inputs:
- raise ValueError("Node '%s' does not exist in partition graphs." %
- node_name)
+ device_name = self._infer_device_name(device_name, node_name)
+ if node_name not in self._node_inputs[device_name]:
+ raise ValueError("Node '%s' does not exist in the partition graph of "
+ "device %s." % (node_name, device_name))
if is_control:
- return self._node_ctrl_inputs[node_name]
+ return self._node_ctrl_inputs[device_name][node_name]
else:
- return self._node_inputs[node_name]
+ return self._node_inputs[device_name][node_name]
- def transitive_inputs(self, node_name, include_control=True):
+ def transitive_inputs(self,
+ node_name,
+ include_control=True,
+ device_name=None):
"""Get the transitive inputs of given node according to partition graphs.
Args:
- node_name: Name of the node
+ node_name: Name of the node.
include_control: Include control inputs (True by default).
+ device_name: (`str`) name of the device. If there is only one device or if
+        node_name exists on only one device, this argument is optional.
Returns:
(`list` of `str`) all transitive inputs to the node, as a list of node
@@ -1163,9 +1329,11 @@ class DebugDumpDir(object):
raise LookupError(
"Node inputs are not loaded from partition graphs yet.")
- if node_name not in self._node_inputs:
- raise ValueError("Node '%s' does not exist in partition graphs." %
- node_name)
+ device_name = self._infer_device_name(device_name, node_name)
+ if node_name not in self._node_inputs[device_name]:
+ raise ValueError(
+ "Node '%s' does not exist in the partition graph of device %s." %
+ (node_name, device_name))
inputs = []
@@ -1186,21 +1354,21 @@ class DebugDumpDir(object):
# Stop the tracing at a Merge op, as it is generally impossible to infer
# outside the runtime which input to the Merge op is alive.
- if self._node_op_types[node] == "Merge":
+ if self._node_op_types[device_name][node] == "Merge":
return
if node in visited_nodes:
return
visited_nodes.append(node)
- for inp in self._node_inputs[node]:
+ for inp in self._node_inputs[device_name][node]:
if inp == node_name:
continue
inputs.append(inp)
trace_inputs(inp)
if include_control:
- for ctrl_inp in self._node_ctrl_inputs[node]:
+ for ctrl_inp in self._node_ctrl_inputs[device_name][node]:
if ctrl_inp == node_name:
continue
inputs.append(ctrl_inp)
@@ -1210,13 +1378,15 @@ class DebugDumpDir(object):
return inputs
- def node_recipients(self, node_name, is_control=False):
+ def node_recipients(self, node_name, is_control=False, device_name=None):
"""Get recipient of the given node's output according to partition graphs.
Args:
node_name: (`str`) name of the node.
is_control: (`bool`) whether control outputs, rather than non-control
outputs, are to be returned.
+ device_name: (`str`) name of the device. If there is only one device or if
+        node_name exists on only one device, this argument is optional.
Returns:
(`list` of `str`) all inputs to the node, as a list of node names.
@@ -1231,58 +1401,67 @@ class DebugDumpDir(object):
raise LookupError(
"Node recipients are not loaded from partition graphs yet.")
- if node_name not in self._node_recipients:
- raise ValueError("Node '%s' does not exist in partition graphs." %
- node_name)
+ device_name = self._infer_device_name(device_name, node_name)
+ if node_name not in self._node_recipients[device_name]:
+ raise ValueError(
+ "Node '%s' does not exist in the partition graph of device %s." %
+ (node_name, device_name))
if is_control:
- return self._node_ctrl_recipients[node_name]
+ return self._node_ctrl_recipients[device_name][node_name]
else:
- return self._node_recipients[node_name]
+ return self._node_recipients[device_name][node_name]
def devices(self):
- """Get the list of devices.
+ """Get the list of device names.
Returns:
(`list` of `str`) names of the devices.
-
- Raises:
- LookupError: If node inputs and control inputs have not been loaded
- from partition graphs yet.
"""
- if self._partition_graphs is None:
- raise LookupError("Devices are not loaded from partition graphs yet.")
-
- return self._devices
+ return self._device_names
- def node_exists(self, node_name):
+ def node_exists(self, node_name, device_name=None):
"""Test if a node exists in the partition graphs.
Args:
node_name: (`str`) name of the node to be checked.
+ device_name: optional device name. If None, will search for the node
+ on all available devices. Otherwise, search for the node only on
+ the given device.
Returns:
A boolean indicating whether the node exists.
Raises:
LookupError: If no partition graphs have been loaded yet.
+ ValueError: If device_name is specified but cannot be found.
"""
if self._node_inputs is None:
raise LookupError(
"Nodes have not been loaded from partition graphs yet.")
- return node_name in self._node_inputs
+ if (device_name is not None) and device_name not in self._node_inputs:
+ raise ValueError(
+ "The specified device_name '%s' cannot be found." % device_name)
+
+    node_inputs_all_devices = (
+        six.itervalues(self._node_inputs) if device_name is None
+        else (self._node_inputs[device_name],))
+
+    return any(node_name in node_inputs
+               for node_inputs in node_inputs_all_devices)
def node_device(self, node_name):
- """Get the device of a node.
+    """Get the names of the devices that have nodes with the specified name.
Args:
node_name: (`str`) name of the node.
Returns:
- (`str`) name of the device on which the node is placed.
+ (`str` or `list` of `str`) name of the device(s) on which the node of the
+ given name is found. Returns a `str` if there is only one such device,
+        otherwise a `list` of `str`.
Raises:
LookupError: If node inputs and control inputs have not been loaded
@@ -1298,13 +1477,16 @@ class DebugDumpDir(object):
raise ValueError("Node '%s' does not exist in partition graphs." %
node_name)
- return self._node_devices[node_name]
+ output = list(self._node_devices[node_name])
+ return output[0] if len(output) == 1 else output
- def node_op_type(self, node_name):
+ def node_op_type(self, node_name, device_name=None):
"""Get the op type of given node.
Args:
node_name: (`str`) name of the node.
+ device_name: (`str`) name of the device. If there is only one device or if
+        node_name exists on only one device, this argument is optional.
Returns:
(`str`) op type of the node.
@@ -1319,17 +1501,21 @@ class DebugDumpDir(object):
raise LookupError(
"Node op types are not loaded from partition graphs yet.")
- if node_name not in self._node_op_types:
- raise ValueError("Node '%s' does not exist in partition graphs." %
- node_name)
+ device_name = self._infer_device_name(device_name, node_name)
+ if node_name not in self._node_op_types[device_name]:
+ raise ValueError(
+ "Node '%s' does not exist in the partition graph of device '%s'. " %
+ (node_name, device_name))
- return self._node_op_types[node_name]
+ return self._node_op_types[device_name][node_name]
- def debug_watch_keys(self, node_name):
+ def debug_watch_keys(self, node_name, device_name=None):
"""Get all tensor watch keys of given node according to partition graphs.
Args:
node_name: (`str`) name of the node.
+ device_name: (`str`) name of the device. If there is only one device or if
+        node_name exists on only one device, this argument is optional.
Returns:
(`list` of `str`) all debug tensor watch keys. Returns an empty list if
@@ -1340,35 +1526,61 @@ class DebugDumpDir(object):
partition graphs yet.
"""
- if node_name not in self._debug_watches:
+ try:
+ device_name = self._infer_device_name(device_name, node_name)
+ except ValueError:
+ return []
+
+ if node_name not in self._debug_watches[device_name]:
return []
watch_keys = []
- for watched_slot in self._debug_watches[node_name]:
- debug_ops = self._debug_watches[node_name][watched_slot]
+ for watched_slot in self._debug_watches[device_name][node_name]:
+ debug_ops = self._debug_watches[device_name][node_name][watched_slot]
for debug_op in debug_ops:
watch_keys.append(
_get_tensor_watch_key(node_name, watched_slot, debug_op))
return watch_keys
- def watch_key_to_data(self, debug_watch_key):
+ def watch_key_to_data(self, debug_watch_key, device_name=None):
"""Get all `DebugTensorDatum` instances corresponding to a debug watch key.
Args:
debug_watch_key: (`str`) debug watch key.
+ device_name: (`str`) name of the device. If there is only one device or if
+        the specified debug_watch_key exists on only one device, this argument
+ is optional.
Returns:
A list of `DebugTensorDatum` instances that correspond to the debug watch
key. If the watch key does not exist, returns an empty list.
Raises:
- ValueError: If the debug watch key does not exist.
+ ValueError: If there are multiple devices that have the debug_watch_key,
+ but device_name is not specified.
"""
+ if device_name is None:
+ matching_device_names = [
+ name for name in self._watch_key_to_datum
+ if debug_watch_key in self._watch_key_to_datum[name]]
+ if not matching_device_names:
+ return []
+ elif len(matching_device_names) == 1:
+ device_name = matching_device_names[0]
+ else:
+ raise ValueError(
+ "The debug watch key '%s' exists on multiple (%d) devices, but "
+ "device name is not specified." %
+ (debug_watch_key, len(matching_device_names)))
+    elif device_name not in self._watch_key_to_datum:
+      raise ValueError(
+          "There is no device named '%s' with debug watch keys." %
+ device_name)
- return self._watch_key_to_datum.get(debug_watch_key, [])
+ return self._watch_key_to_datum[device_name].get(debug_watch_key, [])
- def find(self, predicate, first_n=0):
+ def find(self, predicate, first_n=0, device_name=None):
"""Find dumped tensor data by a certain predicate.
Args:
@@ -1386,6 +1598,7 @@ class DebugDumpDir(object):
      first_n: (`int`) return only the first n `DebugTensorDatum` instances (in
        time order) for which the predicate returns True. To return all the
        `DebugTensorDatum` instances, let first_n be <= 0.
+ device_name: optional device name.
Returns:
A list of all `DebugTensorDatum` objects in this `DebugDumpDir` object
@@ -1394,22 +1607,31 @@ class DebugDumpDir(object):
"""
matched_data = []
- for datum in self._dump_tensor_data:
- if predicate(datum, datum.get_tensor()):
- matched_data.append(datum)
+ for device in (self._dump_tensor_data if device_name is None
+                   else (device_name,)):
+ for datum in self._dump_tensor_data[device]:
+ if predicate(datum, datum.get_tensor()):
+ matched_data.append(datum)
- if first_n > 0 and len(matched_data) >= first_n:
- break
+ if first_n > 0 and len(matched_data) >= first_n:
+ return matched_data
return matched_data
- def get_tensor_file_paths(self, node_name, output_slot, debug_op):
+ def get_tensor_file_paths(self,
+ node_name,
+ output_slot,
+ debug_op,
+ device_name=None):
"""Get the file paths from a debug-dumped tensor.
Args:
node_name: (`str`) name of the node that the tensor is produced by.
output_slot: (`int`) output slot index of tensor.
debug_op: (`str`) name of the debug op.
+ device_name: (`str`) name of the device. If there is only one device or if
+        the specified debug_watch_key exists on only one device, this argument
+ is optional.
Returns:
List of file path(s) loaded. This is a list because each debugged tensor
@@ -1420,14 +1642,17 @@ class DebugDumpDir(object):
the debug-dump data.
"""
+ device_name = self._infer_device_name(device_name, node_name)
watch_key = _get_tensor_watch_key(node_name, output_slot, debug_op)
- if watch_key not in self._watch_key_to_datum:
+ if watch_key not in self._watch_key_to_datum[device_name]:
raise WatchKeyDoesNotExistInDebugDumpDirError(
- "Watch key \"%s\" does not exist in the debug dump" % watch_key)
+ "Watch key \"%s\" does not exist in the debug dump of device %s" %
+ (watch_key, device_name))
- return [datum.file_path for datum in self._watch_key_to_datum[watch_key]]
+ return [datum.file_path for datum in
+ self._watch_key_to_datum[device_name][watch_key]]
- def get_tensors(self, node_name, output_slot, debug_op):
+ def get_tensors(self, node_name, output_slot, debug_op, device_name=None):
"""Get the tensor value from for a debug-dumped tensor.
The tensor may be dumped multiple times in the dump root directory, so a
@@ -1437,6 +1662,9 @@ class DebugDumpDir(object):
node_name: (`str`) name of the node that the tensor is produced by.
output_slot: (`int`) output slot index of tensor.
debug_op: (`str`) name of the debug op.
+ device_name: (`str`) name of the device. If there is only one device or if
+        the specified debug_watch_key exists on only one device, this argument
+ is optional.
Returns:
List of tensors (`numpy.ndarray`) loaded from the debug-dump file(s).
@@ -1447,13 +1675,20 @@ class DebugDumpDir(object):
"""
watch_key = _get_tensor_watch_key(node_name, output_slot, debug_op)
- if watch_key not in self._watch_key_to_datum:
+ try:
+ device_name = self._infer_device_name(device_name, node_name)
+ return [datum.get_tensor() for datum in
+ self._watch_key_to_datum[device_name][watch_key]]
+ except (ValueError, KeyError):
raise WatchKeyDoesNotExistInDebugDumpDirError(
- "Watch key \"%s\" does not exist in the debug dump" % watch_key)
-
- return [datum.get_tensor() for datum in self._watch_key_to_datum[watch_key]]
-
- def get_rel_timestamps(self, node_name, output_slot, debug_op):
+ "Watch key \"%s\" does not exist in the debug dump of device %s" %
+ (watch_key, device_name))
+
+ def get_rel_timestamps(self,
+ node_name,
+ output_slot,
+ debug_op,
+ device_name=None):
"""Get the relative timestamp from for a debug-dumped tensor.
Relative timestamp means (absolute timestamp - `t0`), where `t0` is the
@@ -1465,6 +1700,9 @@ class DebugDumpDir(object):
node_name: (`str`) name of the node that the tensor is produced by.
output_slot: (`int`) output slot index of tensor.
debug_op: (`str`) name of the debug op.
+ device_name: (`str`) name of the device. If there is only one device or if
+        the specified debug_watch_key exists on only one device, this argument
+ is optional.
Returns:
(`list` of `int`) list of relative timestamps.
@@ -1474,14 +1712,20 @@ class DebugDumpDir(object):
exist in the debug dump data.
"""
+ device_name = self._infer_device_name(device_name, node_name)
watch_key = _get_tensor_watch_key(node_name, output_slot, debug_op)
- if watch_key not in self._watch_key_to_datum:
+ if watch_key not in self._watch_key_to_datum[device_name]:
raise WatchKeyDoesNotExistInDebugDumpDirError(
"Watch key \"%s\" does not exist in the debug dump" % watch_key)
- return self._watch_key_to_rel_time[watch_key]
+ # TODO(cais): Figure out whether this should be relative to the global t0.
+ return self._watch_key_to_rel_time[device_name][watch_key]
- def get_dump_sizes_bytes(self, node_name, output_slot, debug_op):
+ def get_dump_sizes_bytes(self,
+ node_name,
+ output_slot,
+ debug_op,
+ device_name=None):
"""Get the sizes of the dump files for a debug-dumped tensor.
Unit of the file size: byte.
@@ -1490,6 +1734,9 @@ class DebugDumpDir(object):
node_name: (`str`) name of the node that the tensor is produced by.
output_slot: (`int`) output slot index of tensor.
debug_op: (`str`) name of the debug op.
+ device_name: (`str`) name of the device. If there is only one device or if
+        the specified debug_watch_key exists on only one device, this argument
+ is optional.
Returns:
(`list` of `int`): list of dump file sizes in bytes.
@@ -1499,12 +1746,14 @@ class DebugDumpDir(object):
exist in the debug dump data.
"""
+ device_name = self._infer_device_name(device_name, node_name)
watch_key = _get_tensor_watch_key(node_name, output_slot, debug_op)
- if watch_key not in self._watch_key_to_datum:
+ if watch_key not in self._watch_key_to_datum[device_name]:
raise WatchKeyDoesNotExistInDebugDumpDirError(
- "Watch key \"%s\" does not exist in the debug dump" % watch_key)
+ "Watch key \"%s\" does not exist in the debug dump of device %s" %
+ (watch_key, device_name))
- return self._watch_key_to_dump_size_bytes[watch_key]
+ return self._watch_key_to_dump_size_bytes[device_name][watch_key]
def node_traceback(self, element_name):
"""Try to retrieve the Python traceback of node's construction.
diff --git a/tensorflow/python/debug/lib/debug_data_test.py b/tensorflow/python/debug/lib/debug_data_test.py
index dc45e8df6c..dd621a5afc 100644
--- a/tensorflow/python/debug/lib/debug_data_test.py
+++ b/tensorflow/python/debug/lib/debug_data_test.py
@@ -23,12 +23,29 @@ import tempfile
import numpy as np
+from tensorflow.core.framework import graph_pb2
from tensorflow.core.framework import tensor_pb2
from tensorflow.python.debug.lib import debug_data
from tensorflow.python.framework import test_util
from tensorflow.python.platform import googletest
+class DeviceNamePathConversionTest(test_util.TensorFlowTestCase):
+
+ def testDeviceNameToDevicePath(self):
+ self.assertEqual(
+ debug_data.METADATA_FILE_PREFIX + debug_data.DEVICE_TAG +
+ ",job_ps,replica_1,task_2,cpu_0",
+ debug_data.device_name_to_device_path("/job:ps/replica:1/task:2/cpu:0"))
+
+ def testDevicePathToDeviceName(self):
+ self.assertEqual(
+ "/job:ps/replica:1/task:2/cpu:0",
+ debug_data.device_path_to_device_name(
+ debug_data.METADATA_FILE_PREFIX + debug_data.DEVICE_TAG +
+ ",job_ps,replica_1,task_2,cpu_0"))
+
+
class ParseNodeOrTensorNameTest(test_util.TensorFlowTestCase):
def testParseNodeName(self):
@@ -163,7 +180,10 @@ class DebugTensorDatumTest(test_util.TensorFlowTestCase):
def testDebugDatum(self):
dump_root = "/tmp/tfdbg_1"
- debug_dump_rel_path = "ns1/ns2/node_a_1_2_DebugIdentity_1472563253536385"
+ debug_dump_rel_path = (
+ debug_data.METADATA_FILE_PREFIX + debug_data.DEVICE_TAG +
+ ",job_localhost,replica_0,task_0,cpu_0" +
+ "/ns1/ns2/node_a_1_2_DebugIdentity_1472563253536385")
datum = debug_data.DebugTensorDatum(dump_root, debug_dump_rel_path)
@@ -175,16 +195,18 @@ class DebugTensorDatumTest(test_util.TensorFlowTestCase):
self.assertEqual("ns1/ns2/node_a_1:2:DebugIdentity", datum.watch_key)
self.assertEqual(
os.path.join(dump_root, debug_dump_rel_path), datum.file_path)
- self.assertEqual("{DebugTensorDatum: %s:%d @ %s @ %d}" % (datum.node_name,
- datum.output_slot,
- datum.debug_op,
- datum.timestamp),
- str(datum))
- self.assertEqual("{DebugTensorDatum: %s:%d @ %s @ %d}" % (datum.node_name,
- datum.output_slot,
- datum.debug_op,
- datum.timestamp),
- repr(datum))
+ self.assertEqual(
+ "{DebugTensorDatum (/job:localhost/replica:0/task:0/cpu:0) "
+ "%s:%d @ %s @ %d}" % (datum.node_name,
+ datum.output_slot,
+ datum.debug_op,
+ datum.timestamp), str(datum))
+ self.assertEqual(
+ "{DebugTensorDatum (/job:localhost/replica:0/task:0/cpu:0) "
+ "%s:%d @ %s @ %d}" % (datum.node_name,
+ datum.output_slot,
+ datum.debug_op,
+ datum.timestamp), repr(datum))
def testDumpSizeBytesIsNoneForNonexistentFilePath(self):
dump_root = "/tmp/tfdbg_1"
@@ -204,18 +226,112 @@ class DebugDumpDirTest(test_util.TensorFlowTestCase):
# Tear down temporary dump directory.
shutil.rmtree(self._dump_root)
+ def _makeDataDirWithMultipleDevicesAndDuplicateNodeNames(self):
+ cpu_0_dir = os.path.join(
+ self._dump_root,
+ debug_data.METADATA_FILE_PREFIX + debug_data.DEVICE_TAG +
+ ",job_localhost,replica_0,task_0,cpu_0")
+ gpu_0_dir = os.path.join(
+ self._dump_root,
+ debug_data.METADATA_FILE_PREFIX + debug_data.DEVICE_TAG +
+ ",job_localhost,replica_0,task_0,gpu_0")
+ gpu_1_dir = os.path.join(
+ self._dump_root,
+ debug_data.METADATA_FILE_PREFIX + debug_data.DEVICE_TAG +
+ ",job_localhost,replica_0,task_0,gpu_1")
+ os.makedirs(cpu_0_dir)
+ os.makedirs(gpu_0_dir)
+ os.makedirs(gpu_1_dir)
+ open(os.path.join(
+ cpu_0_dir, "node_foo_1_2_DebugIdentity_1472563253536386"), "wb")
+ open(os.path.join(
+ gpu_0_dir, "node_foo_1_2_DebugIdentity_1472563253536385"), "wb")
+ open(os.path.join(
+ gpu_1_dir, "node_foo_1_2_DebugIdentity_1472563253536387"), "wb")
+
def testDebugDumpDir_nonexistentDumpRoot(self):
with self.assertRaisesRegexp(IOError, "does not exist"):
debug_data.DebugDumpDir(tempfile.mktemp() + "_foo")
def testDebugDumpDir_invalidFileNamingPattern(self):
# File name with too few underscores should lead to an exception.
- open(os.path.join(self._dump_root, "node1_DebugIdentity_1234"), "wb")
+ device_dir = os.path.join(
+ self._dump_root,
+ debug_data.METADATA_FILE_PREFIX + debug_data.DEVICE_TAG +
+ ",job_localhost,replica_0,task_0,cpu_0")
+ os.makedirs(device_dir)
+ open(os.path.join(device_dir, "node1_DebugIdentity_1234"), "wb")
with self.assertRaisesRegexp(ValueError,
"does not conform to the naming pattern"):
debug_data.DebugDumpDir(self._dump_root)
+ def testDebugDumpDir_validDuplicateNodeNamesWithMultipleDevices(self):
+ self._makeDataDirWithMultipleDevicesAndDuplicateNodeNames()
+
+ graph_cpu_0 = graph_pb2.GraphDef()
+ node = graph_cpu_0.node.add()
+ node.name = "node_foo_1"
+ node.op = "FooOp"
+ node.device = "/job:localhost/replica:0/task:0/cpu:0"
+ graph_gpu_0 = graph_pb2.GraphDef()
+ node = graph_gpu_0.node.add()
+ node.name = "node_foo_1"
+ node.op = "FooOp"
+ node.device = "/job:localhost/replica:0/task:0/gpu:0"
+ graph_gpu_1 = graph_pb2.GraphDef()
+ node = graph_gpu_1.node.add()
+ node.name = "node_foo_1"
+ node.op = "FooOp"
+ node.device = "/job:localhost/replica:0/task:0/gpu:1"
+
+ dump_dir = debug_data.DebugDumpDir(
+ self._dump_root,
+ partition_graphs=[graph_cpu_0, graph_gpu_0, graph_gpu_1])
+
+ self.assertItemsEqual(
+ ["/job:localhost/replica:0/task:0/cpu:0",
+ "/job:localhost/replica:0/task:0/gpu:0",
+ "/job:localhost/replica:0/task:0/gpu:1"], dump_dir.devices())
+ self.assertEqual(1472563253536385, dump_dir.t0)
+ self.assertEqual(3, dump_dir.size)
+
+ with self.assertRaisesRegexp(
+ ValueError,
+ r"There are multiple \(3\) devices, but device_name is not specified"):
+ dump_dir.nodes()
+ self.assertItemsEqual(
+ ["node_foo_1"],
+ dump_dir.nodes(device_name="/job:localhost/replica:0/task:0/cpu:0"))
+
+ def testDuplicateNodeNamesInGraphDefOfSingleDeviceRaisesException(self):
+ self._makeDataDirWithMultipleDevicesAndDuplicateNodeNames()
+ graph_cpu_0 = graph_pb2.GraphDef()
+ node = graph_cpu_0.node.add()
+ node.name = "node_foo_1"
+ node.op = "FooOp"
+ node.device = "/job:localhost/replica:0/task:0/cpu:0"
+ graph_gpu_0 = graph_pb2.GraphDef()
+ node = graph_gpu_0.node.add()
+ node.name = "node_foo_1"
+ node.op = "FooOp"
+ node.device = "/job:localhost/replica:0/task:0/gpu:0"
+ graph_gpu_1 = graph_pb2.GraphDef()
+ node = graph_gpu_1.node.add()
+ node.name = "node_foo_1"
+ node.op = "FooOp"
+ node.device = "/job:localhost/replica:0/task:0/gpu:1"
+ node = graph_gpu_1.node.add() # Here is the duplicate.
+ node.name = "node_foo_1"
+ node.op = "FooOp"
+ node.device = "/job:localhost/replica:0/task:0/gpu:1"
+
+ with self.assertRaisesRegexp(
+ ValueError, r"Duplicate node name on device "):
+ debug_data.DebugDumpDir(
+ self._dump_root,
+ partition_graphs=[graph_cpu_0, graph_gpu_0, graph_gpu_1])
+
def testDebugDumpDir_emptyDumpDir(self):
dump_dir = debug_data.DebugDumpDir(self._dump_root)
diff --git a/tensorflow/python/debug/lib/session_debug_multi_gpu_test.py b/tensorflow/python/debug/lib/session_debug_multi_gpu_test.py
new file mode 100644
index 0000000000..b0dc25851c
--- /dev/null
+++ b/tensorflow/python/debug/lib/session_debug_multi_gpu_test.py
@@ -0,0 +1,93 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for debugger functionalities under multiple (i.e., >1) GPUs."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import shutil
+import tempfile
+
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.client import device_lib
+from tensorflow.python.client import session
+from tensorflow.python.debug.lib import debug_data
+from tensorflow.python.debug.lib import debug_utils
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import googletest
+
+
+class SessionDebugMultiGPUTest(test_util.TensorFlowTestCase):
+
+ def setUp(self):
+ self._dump_root = tempfile.mkdtemp()
+
+ def tearDown(self):
+ ops.reset_default_graph()
+
+ # Tear down temporary dump directory.
+ if os.path.isdir(self._dump_root):
+ shutil.rmtree(self._dump_root)
+
+ def testMultiGPUSessionRun(self):
+ local_devices = device_lib.list_local_devices()
+ gpu_device_names = []
+ for device in local_devices:
+ if device.device_type == "GPU":
+ gpu_device_names.append(device.name)
+ gpu_device_names = sorted(gpu_device_names)
+
+ if len(gpu_device_names) < 2:
+ self.skipTest(
+          "This test requires at least 2 GPUs, but only %d are available." %
+ len(gpu_device_names))
+
+ with session.Session() as sess:
+ v = variables.Variable([10.0, 15.0], dtype=dtypes.float32, name="v")
+ with ops.device(gpu_device_names[0]):
+ u0 = math_ops.add(v, v, name="u0")
+ with ops.device(gpu_device_names[1]):
+ u1 = math_ops.multiply(v, v, name="u1")
+ w = math_ops.subtract(u1, u0, name="w")
+
+ sess.run(v.initializer)
+
+ run_options = config_pb2.RunOptions(output_partition_graphs=True)
+ debug_utils.watch_graph(run_options, sess.graph,
+ debug_urls="file://" + self._dump_root)
+ run_metadata = config_pb2.RunMetadata()
+ self.assertAllClose(
+ [80.0, 195.0],
+ sess.run(w, options=run_options, run_metadata=run_metadata))
+
+ debug_dump_dir = debug_data.DebugDumpDir(
+ self._dump_root, partition_graphs=run_metadata.partition_graphs)
+ self.assertEqual(3, len(debug_dump_dir.devices()))
+ self.assertAllClose(
+ [10.0, 15.0], debug_dump_dir.get_tensors("v", 0, "DebugIdentity")[0])
+ self.assertAllClose(
+ [20.0, 30.0], debug_dump_dir.get_tensors("u0", 0, "DebugIdentity")[0])
+ self.assertAllClose(
+ [100.0, 225.0],
+ debug_dump_dir.get_tensors("u1", 0, "DebugIdentity")[0])
+
+
+if __name__ == "__main__":
+ googletest.main()
diff --git a/tensorflow/python/debug/lib/session_debug_testlib.py b/tensorflow/python/debug/lib/session_debug_testlib.py
index 1b4444ef5a..a14ff1af3c 100644
--- a/tensorflow/python/debug/lib/session_debug_testlib.py
+++ b/tensorflow/python/debug/lib/session_debug_testlib.py
@@ -250,8 +250,12 @@ class SessionDebugTestBase(test_util.TensorFlowTestCase):
self.assertIn(results.v.op.type, results.dump.node_op_type(results.v_name))
self.assertIn(results.w.op.type, results.dump.node_op_type(results.w_name))
- with self.assertRaisesRegexp(
- ValueError, "Node 'foo_bar' does not exist in partition graphs."):
+ if test_util.gpu_device_name():
+ expected_error_regexp = r"None of the .* devices has a node named "
+ else:
+ expected_error_regexp = (
+ r"Node \'foo_bar\' does not exist in the partition graph of device")
+ with self.assertRaisesRegexp(ValueError, expected_error_regexp):
results.dump.node_op_type("foo_bar")
def testDumpStringTensorsWorks(self):
@@ -437,9 +441,11 @@ class SessionDebugTestBase(test_util.TensorFlowTestCase):
# Verify dump files
self.assertTrue(os.path.isdir(self._dump_root))
- self.assertTrue(os.path.isdir(os.path.join(self._dump_root, u_namespace)))
- self.assertTrue(
- os.path.isdir(os.path.join(self._dump_root, v_namespace, "v")))
+ u_glob_out = glob.glob(os.path.join(self._dump_root, "*", u_namespace))
+ v_glob_out = glob.glob(os.path.join(
+ self._dump_root, "*", v_namespace, "v"))
+ self.assertTrue(os.path.isdir(u_glob_out[0]))
+ self.assertTrue(os.path.isdir(v_glob_out[0]))
dump = debug_data.DebugDumpDir(
self._dump_root, partition_graphs=run_metadata.partition_graphs)
@@ -689,7 +695,11 @@ class SessionDebugTestBase(test_util.TensorFlowTestCase):
u_read_name = u_name + "/read"
# Test node name list lookup of the DebugDumpDir object.
- node_names = dump.nodes()
+ if test_util.gpu_device_name():
+ node_names = dump.nodes(
+ device_name="/job:localhost/replica:0/task:0/gpu:0")
+ else:
+ node_names = dump.nodes()
self.assertTrue(u_name in node_names)
self.assertTrue(u_read_name in node_names)
@@ -699,7 +709,11 @@ class SessionDebugTestBase(test_util.TensorFlowTestCase):
self.assertEqual(1, len(u_attr["shape"].shape.dim))
self.assertEqual(2, u_attr["shape"].shape.dim[0].size)
- with self.assertRaisesRegexp(ValueError, "No node named \"foo\" exists"):
+ if test_util.gpu_device_name():
+ expected_error_regexp = r"None of the .* devices has a node named "
+ else:
+ expected_error_regexp = r"No node named \"foo\" exists"
+ with self.assertRaisesRegexp(ValueError, expected_error_regexp):
dump.node_attributes("foo")
def testGraphStructureLookupGivesDebugWatchKeys(self):
@@ -722,7 +736,6 @@ class SessionDebugTestBase(test_util.TensorFlowTestCase):
self.assertEqual(0, u_data[0].output_slot)
self.assertEqual("DebugIdentity", u_data[0].debug_op)
self.assertGreaterEqual(u_data[0].timestamp, 0)
-
self.assertEqual([], dump.watch_key_to_data("foo"))
def testGraphStructureLookupGivesNodeInputsAndRecipients(self):
@@ -753,12 +766,13 @@ class SessionDebugTestBase(test_util.TensorFlowTestCase):
self.assertEqual([], dump.node_recipients(w_name, is_control=True))
# Test errors raised on invalid node names.
- with self.assertRaisesRegexp(ValueError,
- "does not exist in partition graphs"):
+ if test_util.gpu_device_name():
+ expected_error_regexp = r"None of the .* devices has a node named "
+ else:
+ expected_error_regexp = "does not exist in the partition graph of device "
+ with self.assertRaisesRegexp(ValueError, expected_error_regexp):
dump.node_inputs(u_name + "foo")
-
- with self.assertRaisesRegexp(ValueError,
- "does not exist in partition graphs"):
+ with self.assertRaisesRegexp(ValueError, expected_error_regexp):
dump.node_recipients(u_name + "foo")
# Test transitive_inputs().
@@ -769,8 +783,7 @@ class SessionDebugTestBase(test_util.TensorFlowTestCase):
self.assertEqual(
set([u_name, u_read_name, v_name]), set(dump.transitive_inputs(w_name)))
- with self.assertRaisesRegexp(ValueError,
- "does not exist in partition graphs"):
+ with self.assertRaisesRegexp(ValueError, expected_error_regexp):
dump.transitive_inputs(u_name + "foo")
def testGraphStructureLookupWithoutPartitionGraphsDoesNotErrorOut(self):
@@ -1067,10 +1080,12 @@ class SessionDebugTestBase(test_util.TensorFlowTestCase):
y = array_ops.squeeze(ph, name="mismatch/y")
run_options = config_pb2.RunOptions(output_partition_graphs=True)
+ run_metadata = config_pb2.RunMetadata()
debug_utils.watch_graph(
run_options, sess.graph, debug_urls=self._debug_urls(), global_step=1)
- sess.run(x, feed_dict={ph: np.array([[7.0, 8.0]])}, options=run_options)
+ sess.run(x, feed_dict={ph: np.array([[7.0, 8.0]])}, options=run_options,
+ run_metadata=run_metadata)
dump1 = debug_data.DebugDumpDir(self._dump_root)
self.assertEqual(1, dump1.core_metadata.global_step)
self.assertGreaterEqual(dump1.core_metadata.session_run_index, 0)
diff --git a/tensorflow/python/framework/importer.py b/tensorflow/python/framework/importer.py
index ffce6ce4c8..c2fc7e3af9 100644
--- a/tensorflow/python/framework/importer.py
+++ b/tensorflow/python/framework/importer.py
@@ -437,6 +437,7 @@ def import_graph_def(graph_def, input_map=None, return_elements=None,
'WholeFileReader', 'TextLineReader',
'FixedLengthRecordReader',
'TFRecordReader', 'IdentityReader',
+ 'LMDBReader',
'RefSwitch', 'RefEnter', 'RefNextIteration',
'RefMerge', 'RefIdentity']:
pass
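For context, the LMDBReader added to this list is assumed to follow the same ReaderBase interface as the other readers named here; a hedged usage sketch (the .mdb path is illustrative):

    import tensorflow as tf

    filename_queue = tf.train.string_input_producer(["data.mdb"], num_epochs=1)
    reader = tf.LMDBReader()
    key, value = reader.read(filename_queue)  # one (key, value) record per call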
diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD
index 79c7905427..242c396acb 100644
--- a/tensorflow/python/kernel_tests/BUILD
+++ b/tensorflow/python/kernel_tests/BUILD
@@ -947,7 +947,7 @@ cuda_py_test(
tags = ["notsan"],
)
-tf_py_test(
+cuda_py_test(
name = "diag_op_test",
size = "medium",
srcs = ["diag_op_test.py"],
@@ -979,6 +979,7 @@ tf_py_test(
"//tensorflow/python:util",
"//tensorflow/python:variables",
],
+ data = ["//tensorflow/core:lmdb_testdata"],
)
cuda_py_test(
diff --git a/tensorflow/python/kernel_tests/basic_gpu_test.py b/tensorflow/python/kernel_tests/basic_gpu_test.py
index dbbc2de811..013aa1ba8a 100644
--- a/tensorflow/python/kernel_tests/basic_gpu_test.py
+++ b/tensorflow/python/kernel_tests/basic_gpu_test.py
@@ -129,7 +129,7 @@ class MathBuiltinUnaryTest(test.TestCase):
for dtype in [np.float32]:
self._testDtype(dtype, use_gpu=True)
- def testFloorDevide(self):
+ def testFloorDivide(self):
x = (1 + np.linspace(0, 5, np.prod([1, 3, 2]))).astype(np.float32).reshape(
[1, 3, 2])
y = (1 + np.linspace(0, 5, np.prod([1, 3, 2]))).astype(np.float32).reshape(
diff --git a/tensorflow/python/kernel_tests/diag_op_test.py b/tensorflow/python/kernel_tests/diag_op_test.py
index 4744e68051..f0b7885732 100644
--- a/tensorflow/python/kernel_tests/diag_op_test.py
+++ b/tensorflow/python/kernel_tests/diag_op_test.py
@@ -39,21 +39,23 @@ class MatrixDiagTest(test.TestCase):
self.assertEqual((3, 3), v_diag.get_shape())
self.assertAllEqual(v_diag.eval(), mat)
- def testBatchVector(self):
+ def _testBatchVector(self, dtype):
with self.test_session(use_gpu=True):
- v_batch = np.array([[1.0, 2.0, 3.0],
- [4.0, 5.0, 6.0]])
- mat_batch = np.array(
- [[[1.0, 0.0, 0.0],
- [0.0, 2.0, 0.0],
- [0.0, 0.0, 3.0]],
- [[4.0, 0.0, 0.0],
- [0.0, 5.0, 0.0],
- [0.0, 0.0, 6.0]]])
+ v_batch = np.array([[1.0, 0.0, 3.0], [4.0, 5.0, 6.0]]).astype(dtype)
+ mat_batch = np.array([[[1.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 3.0]],
+ [[4.0, 0.0, 0.0], [0.0, 5.0, 0.0],
+ [0.0, 0.0, 6.0]]]).astype(dtype)
v_batch_diag = array_ops.matrix_diag(v_batch)
self.assertEqual((2, 3, 3), v_batch_diag.get_shape())
self.assertAllEqual(v_batch_diag.eval(), mat_batch)
+ def testBatchVector(self):
+ self._testBatchVector(np.float32)
+ self._testBatchVector(np.float64)
+ self._testBatchVector(np.int32)
+ self._testBatchVector(np.int64)
+ self._testBatchVector(np.bool)
+
def testInvalidShape(self):
with self.assertRaisesRegexp(ValueError, "must be at least rank 1"):
array_ops.matrix_diag(0)
@@ -108,29 +110,29 @@ class MatrixSetDiagTest(test.TestCase):
self.assertEqual((3, 2), output.get_shape())
self.assertAllEqual(expected, output.eval())
- def testSquareBatch(self):
+ def _testSquareBatch(self, dtype):
with self.test_session(use_gpu=True):
- v_batch = np.array([[-1.0, -2.0, -3.0],
- [-4.0, -5.0, -6.0]])
- mat_batch = np.array(
- [[[1.0, 0.0, 3.0],
- [0.0, 2.0, 0.0],
- [1.0, 0.0, 3.0]],
- [[4.0, 0.0, 4.0],
- [0.0, 5.0, 0.0],
- [2.0, 0.0, 6.0]]])
+ v_batch = np.array([[-1.0, 0.0, -3.0], [-4.0, -5.0, -6.0]]).astype(dtype)
+ mat_batch = np.array([[[1.0, 0.0, 3.0], [0.0, 2.0, 0.0], [1.0, 0.0, 3.0]],
+ [[4.0, 0.0, 4.0], [0.0, 5.0, 0.0],
+ [2.0, 0.0, 6.0]]]).astype(dtype)
+
+ mat_set_diag_batch = np.array([[[-1.0, 0.0, 3.0], [0.0, 0.0, 0.0],
+ [1.0, 0.0, -3.0]],
+ [[-4.0, 0.0, 4.0], [0.0, -5.0, 0.0],
+ [2.0, 0.0, -6.0]]]).astype(dtype)
- mat_set_diag_batch = np.array(
- [[[-1.0, 0.0, 3.0],
- [0.0, -2.0, 0.0],
- [1.0, 0.0, -3.0]],
- [[-4.0, 0.0, 4.0],
- [0.0, -5.0, 0.0],
- [2.0, 0.0, -6.0]]])
output = array_ops.matrix_set_diag(mat_batch, v_batch)
self.assertEqual((2, 3, 3), output.get_shape())
self.assertAllEqual(mat_set_diag_batch, output.eval())
+ def testSquareBatch(self):
+ self._testSquareBatch(np.float32)
+ self._testSquareBatch(np.float64)
+ self._testSquareBatch(np.int32)
+ self._testSquareBatch(np.int64)
+ self._testSquareBatch(np.bool)
+
def testRectangularBatch(self):
with self.test_session(use_gpu=True):
v_batch = np.array([[-1.0, -2.0],
@@ -220,22 +222,24 @@ class MatrixDiagPartTest(test.TestCase):
mat_diag = array_ops.matrix_diag_part(mat)
self.assertAllEqual(mat_diag.eval(), np.array([1.0, 4.0]))
- def testSquareBatch(self):
+ def _testSquareBatch(self, dtype):
with self.test_session(use_gpu=True):
- v_batch = np.array([[1.0, 2.0, 3.0],
- [4.0, 5.0, 6.0]])
- mat_batch = np.array(
- [[[1.0, 0.0, 0.0],
- [0.0, 2.0, 0.0],
- [0.0, 0.0, 3.0]],
- [[4.0, 0.0, 0.0],
- [0.0, 5.0, 0.0],
- [0.0, 0.0, 6.0]]])
+ v_batch = np.array([[1.0, 0.0, 3.0], [4.0, 5.0, 6.0]]).astype(dtype)
+ mat_batch = np.array([[[1.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 3.0]],
+ [[4.0, 0.0, 0.0], [0.0, 5.0, 0.0],
+ [0.0, 0.0, 6.0]]]).astype(dtype)
self.assertEqual(mat_batch.shape, (2, 3, 3))
mat_batch_diag = array_ops.matrix_diag_part(mat_batch)
self.assertEqual((2, 3), mat_batch_diag.get_shape())
self.assertAllEqual(mat_batch_diag.eval(), v_batch)
+ def testSquareBatch(self):
+ self._testSquareBatch(np.float32)
+ self._testSquareBatch(np.float64)
+ self._testSquareBatch(np.int32)
+ self._testSquareBatch(np.int64)
+ self._testSquareBatch(np.bool)
+
def testRectangularBatch(self):
with self.test_session(use_gpu=True):
v_batch = np.array([[1.0, 2.0],
diff --git a/tensorflow/python/kernel_tests/reader_ops_test.py b/tensorflow/python/kernel_tests/reader_ops_test.py
index 10f34751d0..0038f4bbf3 100644
--- a/tensorflow/python/kernel_tests/reader_ops_test.py
+++ b/tensorflow/python/kernel_tests/reader_ops_test.py
@@ -858,5 +858,48 @@ class AsyncReaderTest(test.TestCase):
output.append(sess.run(args))
+class LMDBReaderTest(test.TestCase):
+
+ def setUp(self):
+ super(LMDBReaderTest, self).setUp()
+
+ def testReadFromFile(self):
+ with self.test_session() as sess:
+ reader = io_ops.LMDBReader(name="test_read_from_file")
+ path = os.path.join("tensorflow", "core", "lib", "lmdb", "testdata",
+ "data.mdb")
+ queue = data_flow_ops.FIFOQueue(99, [dtypes.string], shapes=())
+ key, value = reader.read(queue)
+
+ queue.enqueue([path]).run()
+ queue.close().run()
+ for i in range(10):
+ k, v = sess.run([key, value])
+ self.assertAllEqual(compat.as_bytes(k), compat.as_bytes(str(i)))
+ self.assertAllEqual(compat.as_bytes(v), compat.as_bytes(str(chr(ord('a') + i))))
+
+ with self.assertRaisesOpError("is closed and has insufficient elements "
+ "\\(requested 1, current size 0\\)"):
+ k, v = sess.run([key, value])
+
+ def testReadFromFolder(self):
+ with self.test_session() as sess:
+ reader = io_ops.LMDBReader(name="test_read_from_folder")
+ path = os.path.join("tensorflow", "core", "lib", "lmdb", "testdata")
+ queue = data_flow_ops.FIFOQueue(99, [dtypes.string], shapes=())
+ key, value = reader.read(queue)
+
+ queue.enqueue([path]).run()
+ queue.close().run()
+ for i in range(10):
+ k, v = sess.run([key, value])
+ self.assertAllEqual(compat.as_bytes(k), compat.as_bytes(str(i)))
+ self.assertAllEqual(compat.as_bytes(v), compat.as_bytes(str(chr(ord('a') + i))))
+
+ with self.assertRaisesOpError("is closed and has insufficient elements "
+ "\\(requested 1, current size 0\\)"):
+ k, v = sess.run([key, value])
+
+
if __name__ == "__main__":
test.main()
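The LMDBReader tests above expect tensorflow/core/lib/lmdb/testdata/data.mdb to contain ten records whose keys are the strings "0" through "9" and whose values are "a" through "j". A database with that layout could be produced with the third-party py-lmdb package, roughly as in the sketch below; this is illustrative only and not necessarily how the checked-in test file was generated.

    # Illustrative only: build an LMDB database matching what LMDBReaderTest
    # expects, using the third-party py-lmdb package (pip install lmdb).
    import lmdb

    env = lmdb.open("testdata", map_size=1 << 20)
    with env.begin(write=True) as txn:
        for i in range(10):
            txn.put(str(i).encode("ascii"), chr(ord("a") + i).encode("ascii"))
    env.close()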
diff --git a/tensorflow/python/layers/convolutional.py b/tensorflow/python/layers/convolutional.py
index b61168695a..49dcd2370c 100644
--- a/tensorflow/python/layers/convolutional.py
+++ b/tensorflow/python/layers/convolutional.py
@@ -149,39 +149,13 @@ class _Conv(base.Layer):
self.built = True
def call(self, inputs):
- if (self.data_format == 'channels_first' and
- not framework.test_util.gpu_device_name()):
- # `nn.convolution` is not implemented on CPU for `channels_first` format.
- # In cases where we are most likely running on CPU using `channels_first`,
- # we reshape the inputs to use `channels_last` (and reshape them back
- # afterwards). This is a temporary fix; a better solution would be a fix
- # at the op level.
- # TODO(chollet): remove this when `nn.convolution` is feature-complete.
- data_format = 'channels_last'
- if self.rank == 1:
- inputs = array_ops.transpose(inputs, (0, 2, 1))
- elif self.rank == 2:
- inputs = array_ops.transpose(inputs, (0, 2, 3, 1))
- elif self.rank == 3:
- inputs = array_ops.transpose(inputs, (0, 2, 3, 4, 1))
- else:
- data_format = self.data_format
outputs = nn.convolution(
input=inputs,
filter=self.kernel,
dilation_rate=self.dilation_rate,
strides=self.strides,
padding=self.padding.upper(),
- data_format=utils.convert_data_format(data_format,
- self.rank + 2))
- if (self.data_format == 'channels_first' and
- not framework.test_util.gpu_device_name()):
- if self.rank == 1:
- outputs = array_ops.transpose(outputs, (0, 2, 1))
- elif self.rank == 2:
- outputs = array_ops.transpose(outputs, (0, 3, 1, 2))
- elif self.rank == 3:
- outputs = array_ops.transpose(outputs, (0, 4, 1, 2, 3))
+ data_format=utils.convert_data_format(self.data_format, self.rank + 2))
if self.bias is not None:
if self.data_format == 'channels_first':
@@ -202,18 +176,10 @@ class _Conv(base.Layer):
[outputs_shape[0], outputs_shape[1],
outputs_shape[2] * outputs_shape[3],
outputs_shape[4]])
- outputs_4d = nn.bias_add(
- outputs_4d,
- self.bias,
- data_format=utils.convert_data_format(self.data_format, 4))
+ outputs_4d = nn.bias_add(outputs_4d, self.bias, data_format='NCHW')
outputs = array_ops.reshape(outputs_4d, outputs_shape)
else:
- outputs = nn.bias_add(
- outputs,
- self.bias,
- data_format=utils.convert_data_format(self.data_format, 4))
- # Note that we passed rank=4 because bias_add will only accept
- # NHWC and NCWH even if the rank of the inputs is 3 or 5.
+ outputs = nn.bias_add(outputs, self.bias, data_format='NHWC')
if self.activation is not None:
return self.activation(outputs)
diff --git a/tensorflow/python/layers/pooling.py b/tensorflow/python/layers/pooling.py
index 6cd644b642..e903afa0a8 100644
--- a/tensorflow/python/layers/pooling.py
+++ b/tensorflow/python/layers/pooling.py
@@ -262,16 +262,7 @@ class _Pooling2D(base.Layer):
self.input_spec = base.InputSpec(ndim=4)
def call(self, inputs):
- if (self.data_format == 'channels_first' and
- not framework.test_util.gpu_device_name()):
- # `nn.convolution` is not implemented on CPU for `channels_first` format.
- # TODO(chollet): remove this when `nn.convolution` is feature-complete.
- data_format = 'channels_last'
- inputs = array_ops.transpose(inputs, (0, 2, 3, 1))
- else:
- data_format = self.data_format
-
- if data_format == 'channels_last':
+ if self.data_format == 'channels_last':
pool_shape = (1,) + self.pool_size + (1,)
strides = (1,) + self.strides + (1,)
else:
@@ -282,11 +273,7 @@ class _Pooling2D(base.Layer):
ksize=pool_shape,
strides=strides,
padding=self.padding.upper(),
- data_format=utils.convert_data_format(data_format, 4))
-
- if (self.data_format == 'channels_first' and
- not framework.test_util.gpu_device_name()):
- outputs = array_ops.transpose(outputs, (0, 3, 1, 2))
+ data_format=utils.convert_data_format(self.data_format, 4))
return outputs
def _compute_output_shape(self, input_shape):
diff --git a/tensorflow/python/ops/hidden_ops.txt b/tensorflow/python/ops/hidden_ops.txt
index 06adfc5066..553e0dc135 100644
--- a/tensorflow/python/ops/hidden_ops.txt
+++ b/tensorflow/python/ops/hidden_ops.txt
@@ -191,6 +191,7 @@ WholeFileReader
TextLineReaderV2
TFRecordReaderV2
WholeFileReaderV2
+LMDBReader
# linalg_ops
BatchCholesky
diff --git a/tensorflow/python/ops/io_ops.py b/tensorflow/python/ops/io_ops.py
index 68ecc219e4..8f5d0e5cd4 100644
--- a/tensorflow/python/ops/io_ops.py
+++ b/tensorflow/python/ops/io_ops.py
@@ -26,6 +26,7 @@ See the @{$python/io_ops} guide.
@@WholeFileReader
@@IdentityReader
@@TFRecordReader
+@@LMDBReader
@@FixedLengthRecordReader
@@decode_csv
@@decode_raw
@@ -443,6 +444,25 @@ class TFRecordReader(ReaderBase):
ops.NotDifferentiable("TFRecordReader")
+class LMDBReader(ReaderBase):
+ """A Reader that outputs the records from a LMDB file.
+
+ See ReaderBase for supported methods.
+ """
+ def __init__(self, name=None, options=None):
+ """Create a LMDBReader.
+
+ Args:
+ name: A name for the operation (optional).
+      options: An LMDBRecordOptions object (optional).
+ """
+ rr = gen_io_ops._lmdb_reader(name=name)
+ super(LMDBReader, self).__init__(rr)
+
+
+ops.NotDifferentiable("LMDBReader")
+
+
class IdentityReader(ReaderBase):
"""A Reader that outputs the queued work as both the key and value.
diff --git a/tensorflow/python/ops/script_ops.py b/tensorflow/python/ops/script_ops.py
index ebdb137fb9..fe532fa186 100644
--- a/tensorflow/python/ops/script_ops.py
+++ b/tensorflow/python/ops/script_ops.py
@@ -26,6 +26,7 @@ from __future__ import print_function
import threading
import numpy as np
+import six
from tensorflow.python import pywrap_tensorflow
from tensorflow.python.framework import function
@@ -81,6 +82,10 @@ class FuncRegistry(object):
if func is None:
raise ValueError("callback %s is not found" % token)
ret = func(*args)
+ # Strings seem to lead to a memory leak here if they're not wrapped in a
+ # list.
+ if isinstance(ret, six.binary_type):
+ ret = [ret]
# Ensures that we return either a single numpy array or a list of numpy
# arrays.
if isinstance(ret, (tuple, list)):
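The new six.binary_type branch covers callbacks that hand back a bare byte string instead of a numpy array or a list; a minimal sketch of that case (the function and tensor names here are illustrative):

    # A py_func callback returning a plain bytes object; the wrapping added
    # above turns it into a single-element list before conversion.
    import tensorflow as tf

    def make_greeting():
        return b"hello"

    greeting = tf.py_func(make_greeting, [], tf.string)

    with tf.Session() as sess:
        print(sess.run(greeting))  # b'hello'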
diff --git a/tensorflow/python/training/adam.py b/tensorflow/python/training/adam.py
index 5ae82524a3..459c735ea3 100644
--- a/tensorflow/python/training/adam.py
+++ b/tensorflow/python/training/adam.py
@@ -113,10 +113,14 @@ class AdamOptimizer(optimizer.Optimizer):
def _create_slots(self, var_list):
# Create the beta1 and beta2 accumulators on the same device as the first
- # variable.
+ # variable. Sort the var_list to make sure this device is consistent across
+ # workers (these need to go on the same PS, otherwise some updates are
+ # silently ignored).
+ first_var = min(var_list, key=lambda x: x.name)
+
if (self._beta1_power is None or
- self._beta1_power.graph is not var_list[0].graph):
- with ops.colocate_with(var_list[0]):
+ self._beta1_power.graph is not first_var.graph):
+ with ops.colocate_with(first_var):
self._beta1_power = variable_scope.variable(self._beta1,
name="beta1_power",
trainable=False)
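Picking min(var_list, key=...) rather than var_list[0] makes the colocation target independent of the order in which each worker happens to build its variable list; a small illustration with stand-in objects:

    # Two workers see the same variables in different orders; choosing the
    # minimum by name picks the same one on both, so the beta accumulators
    # land on the same device. Var is a stand-in for tf.Variable here.
    import collections

    Var = collections.namedtuple("Var", ["name"])
    worker_a = [Var("layer2/weights"), Var("layer1/weights")]
    worker_b = [Var("layer1/weights"), Var("layer2/weights")]

    assert min(worker_a, key=lambda v: v.name).name == "layer1/weights"
    assert min(worker_b, key=lambda v: v.name).name == "layer1/weights"
    # worker_a[0] and worker_b[0] would have disagreed.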
diff --git a/tensorflow/python/util/tf_should_use.py b/tensorflow/python/util/tf_should_use.py
index 88df3351e6..05c99856d2 100644
--- a/tensorflow/python/util/tf_should_use.py
+++ b/tensorflow/python/util/tf_should_use.py
@@ -17,14 +17,52 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
+import collections
import functools
+import itertools
import traceback
import types
+import six # pylint: disable=unused-import
+
+from backports import weakref # pylint: disable=g-bad-import-order
+
from tensorflow.python.platform import tf_logging
from tensorflow.python.util import tf_decorator
+class _RefInfoField(
+ collections.namedtuple(
+ '_RefInfoField', ('type_', 'repr_', 'creation_stack', 'object_used'))):
+ pass
+
+
+# Thread-safe up to int32max/2 thanks to python's GIL; and may be safe even for
+# higher values in Python 3.4+. We don't expect to ever count higher than this.
+# https://mail.python.org/pipermail/python-list/2005-April/342279.html
+_REF_ITER = itertools.count()
+
+# Dictionary mapping id(obj) => _RefInfoField.
+_REF_INFO = {}
+
+
+def _deleted(obj_id, fatal_error):
+ obj = _REF_INFO[obj_id]
+ del _REF_INFO[obj_id]
+ if not obj.object_used:
+ if fatal_error:
+ logger = tf_logging.fatal
+ else:
+ logger = tf_logging.error
+ logger(
+ '==================================\n'
+ 'Object was never used (type %s):\n%s\nIf you want to mark it as '
+ 'used call its "mark_used()" method.\nIt was originally created '
+ 'here:\n%s\n'
+ '==================================' %
+ (obj.type_, obj.repr_, obj.creation_stack))
+
+
def _add_should_use_warning(x, fatal_error=False):
"""Wraps object x so that if it is never used, a warning is logged.
@@ -39,14 +77,14 @@ def _add_should_use_warning(x, fatal_error=False):
"""
if x is None: # special corner case where x is None
return x
- has_been_used = getattr(x, '_tf_object_has_been_used', None)
- if has_been_used is not None:
- x._tf_object_has_been_used = has_been_used # pylint: disable=protected-access
+ if hasattr(x, '_tf_ref_id'): # this is already a TFShouldUseWarningWrapper
return x
def override_method(method):
def fn(self, *args, **kwargs):
- self._tf_object_has_been_used = True # pylint: disable=protected-access
+ # pylint: disable=protected-access
+ _REF_INFO[self._tf_ref_id] = _REF_INFO[self._tf_ref_id]._replace(
+ object_used=True)
return method(self, *args, **kwargs)
return fn
@@ -55,38 +93,36 @@ def _add_should_use_warning(x, fatal_error=False):
def __init__(self, true_self):
self.__dict__ = true_self.__dict__
- stack = [x.strip() for x in traceback.format_stack()]
+ stack = [s.strip() for s in traceback.format_stack()]
# Remove top three stack entries from adding the wrapper
- self._tf_object_creation_stack = '\n'.join(stack[:-3])
- self._tf_object_has_been_used = False
+ self.creation_stack = '\n'.join(stack[:-3])
+ self._tf_ref_id = next(_REF_ITER)
+ _REF_INFO[self._tf_ref_id] = _RefInfoField(
+ type_=type(x),
+ repr_=repr(x),
+ creation_stack=stack,
+ object_used=False)
+
+ # Create a finalizer for self, which will be called when self is
+      # garbage collected. We can't pass self to the finalizer directly,
+      # because that reference would keep the object alive and defeat
+      # garbage collection, so we track it via its Python id instead.
+ weakref.finalize(self, _deleted, self._tf_ref_id, fatal_error)
# Not sure why this pylint warning is being used; this is not an
# old class form.
# pylint: disable=super-on-old-class
def __getattribute__(self, name):
- if name != '_tf_object_has_been_used':
- self._tf_object_has_been_used = True
+ if name == '_tf_ref_id':
+ return super(TFShouldUseWarningWrapper, self).__getattribute__(name)
+ if self._tf_ref_id in _REF_INFO:
+ _REF_INFO[self._tf_ref_id] = _REF_INFO[self._tf_ref_id]._replace(
+ object_used=True)
return super(TFShouldUseWarningWrapper, self).__getattribute__(name)
- def __del__(self):
- if not self._tf_object_has_been_used:
- if fatal_error:
- logger = tf_logging.fatal
- else:
- logger = tf_logging.error
- logger(
- '==================================\n'
- 'Object was never used (type %s):\n%s\nIf you want to mark it as '
- 'used call its "mark_used()" method.\nIt was originally created '
- 'here:\n%s\n'
- '==================================' %
- (type(x), x, self._tf_object_creation_stack))
-
- if hasattr(super(TFShouldUseWarningWrapper, self), '__del__'):
- return super(TFShouldUseWarningWrapper, self).__del__()
-
def mark_used(self, *args, **kwargs):
- self._tf_object_has_been_used = True
+ _REF_INFO[self._tf_ref_id] = _REF_INFO[self._tf_ref_id]._replace(
+ object_used=True)
if hasattr(super(TFShouldUseWarningWrapper, self), 'mark_used'):
return super(TFShouldUseWarningWrapper, self).mark_used(*args, **kwargs)
# pylint: enable=super-on-old-class
@@ -102,7 +138,8 @@ def _add_should_use_warning(x, fatal_error=False):
wrapped = TFShouldUseWarningWrapper(x)
wrapped.__doc__ = x.__doc__ # functools.wraps fails on some objects.
- wrapped._tf_object_has_been_used = False # pylint: disable=protected-access
+ ref_id = wrapped._tf_ref_id # pylint: disable=protected-access
+ _REF_INFO[ref_id] = _REF_INFO[ref_id]._replace(object_used=False)
return wrapped
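The rewrite drops the __del__ method in favor of weakref.finalize plus a registry keyed by an integer id, so the wrapper never participates in a reference cycle and nothing lands in gc.garbage. A standalone sketch of the same pattern, using the Python 3 standard-library weakref (the patch itself pulls in backports.weakref for Python 2):

    # The finalizer closes over an integer id only, never the object itself,
    # so garbage collection is not blocked by a self-reference.
    import itertools
    import weakref

    _REF_ITER = itertools.count()
    _REF_INFO = {}  # ref_id -> bool: has the object been used?

    def _report_unused(ref_id):
        if not _REF_INFO.pop(ref_id):
            print("object %d was never used" % ref_id)

    class Tracked(object):
        def __init__(self):
            self._ref_id = next(_REF_ITER)
            _REF_INFO[self._ref_id] = False
            weakref.finalize(self, _report_unused, self._ref_id)

        def mark_used(self):
            _REF_INFO[self._ref_id] = True

    t = Tracked()
    del t  # the finalizer fires here and reports that t was never used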
diff --git a/tensorflow/python/util/tf_should_use_test.py b/tensorflow/python/util/tf_should_use_test.py
index 71d48e3dde..c826874400 100644
--- a/tensorflow/python/util/tf_should_use_test.py
+++ b/tensorflow/python/util/tf_should_use_test.py
@@ -20,6 +20,7 @@ from __future__ import division
from __future__ import print_function
import contextlib
+import gc
import sys
from tensorflow.python.framework import constant_op
@@ -45,7 +46,7 @@ def reroute_error(captured):
class TfShouldUseTest(test.TestCase):
def testAddShouldUseWarningWhenNotUsed(self):
- c = constant_op.constant(0, name='blah')
+ c = constant_op.constant(0, name='blah0')
captured = []
with reroute_error(captured):
def in_this_function():
@@ -53,44 +54,52 @@ class TfShouldUseTest(test.TestCase):
del h
in_this_function()
self.assertIn('Object was never used', '\n'.join(captured))
- self.assertIn('blah:0', '\n'.join(captured))
+ self.assertIn('blah0:0', '\n'.join(captured))
self.assertIn('in_this_function', '\n'.join(captured))
+ gc.collect()
+ self.assertFalse(gc.garbage)
- def _testAddShouldUseWarningWhenUsed(self, fn):
- c = constant_op.constant(0, name='blah')
+ def _testAddShouldUseWarningWhenUsed(self, fn, name):
+ c = constant_op.constant(0, name=name)
captured = []
with reroute_error(captured):
h = tf_should_use._add_should_use_warning(c)
fn(h)
del h
self.assertNotIn('Object was never used', '\n'.join(captured))
- self.assertNotIn('blah:0', '\n'.join(captured))
+ self.assertNotIn('%s:0' % name, '\n'.join(captured))
def testAddShouldUseWarningWhenUsedWithAdd(self):
def add(h):
_ = h + 1
- self._testAddShouldUseWarningWhenUsed(add)
+ self._testAddShouldUseWarningWhenUsed(add, name='blah_add')
+ gc.collect()
+ self.assertFalse(gc.garbage)
def testAddShouldUseWarningWhenUsedWithGetName(self):
def get_name(h):
_ = h.name
- self._testAddShouldUseWarningWhenUsed(get_name)
+ self._testAddShouldUseWarningWhenUsed(get_name, name='blah_get_name')
+ gc.collect()
+ self.assertFalse(gc.garbage)
def testShouldUseResult(self):
@tf_should_use.should_use_result
def return_const(value):
- return constant_op.constant(value, name='blah')
+ return constant_op.constant(value, name='blah2')
captured = []
with reroute_error(captured):
return_const(0.0)
self.assertIn('Object was never used', '\n'.join(captured))
- self.assertIn('blah:0', '\n'.join(captured))
+ self.assertIn('blah2:0', '\n'.join(captured))
self.assertIn('return_const', '\n'.join(captured))
+ gc.collect()
+ self.assertFalse(gc.garbage)
def testShouldUseResultWhenNotReallyUsed(self):
@tf_should_use.should_use_result
def return_const(value):
- return constant_op.constant(value, name='blah')
+ return constant_op.constant(value, name='blah3')
captured = []
with reroute_error(captured):
with self.test_session():
@@ -100,8 +109,10 @@ class TfShouldUseTest(test.TestCase):
v = constant_op.constant(1.0, name='meh')
v.eval()
self.assertIn('Object was never used', '\n'.join(captured))
- self.assertIn('blah:0', '\n'.join(captured))
+ self.assertIn('blah3:0', '\n'.join(captured))
self.assertIn('return_const', '\n'.join(captured))
+ gc.collect()
+ self.assertFalse(gc.garbage)
if __name__ == '__main__':
diff --git a/tensorflow/stream_executor/cuda/cuda_diagnostics.cc b/tensorflow/stream_executor/cuda/cuda_diagnostics.cc
index 9ea6474934..bf81b9c0ad 100644
--- a/tensorflow/stream_executor/cuda/cuda_diagnostics.cc
+++ b/tensorflow/stream_executor/cuda/cuda_diagnostics.cc
@@ -170,7 +170,7 @@ void Diagnostician::LogDiagnosticInformation() {
VLOG(1) << "LD_LIBRARY_PATH is: \"" << library_path << "\"";
std::vector<string> pieces = port::Split(library_path, ':');
- for (auto piece : pieces) {
+ for (const auto &piece : pieces) {
if (piece.empty()) {
continue;
}
diff --git a/tensorflow/stream_executor/stream_executor_internal.h b/tensorflow/stream_executor/stream_executor_internal.h
index 9d3ac4ed9e..802ef755eb 100644
--- a/tensorflow/stream_executor/stream_executor_internal.h
+++ b/tensorflow/stream_executor/stream_executor_internal.h
@@ -48,27 +48,9 @@ limitations under the License.
namespace perftools {
namespace gputools {
-class KernelBase;
class Stream;
class Timer;
-namespace blas {
-class BlasSupport;
-} // namespace blas
-
-namespace fft {
-class Support;
-} // namespace fft
-
-namespace rng {
-class RngSupport;
-} // namespace rng
-
-} // namespace gputools
-} // namespace perftools
-
-namespace perftools {
-namespace gputools {
namespace internal {
// Platform-dependent interface class for the generic Events interface, in
diff --git a/tensorflow/tensorboard/backend/application_test.py b/tensorflow/tensorboard/backend/application_test.py
index 87cfdbc1d8..63eb47e9d4 100644
--- a/tensorflow/tensorboard/backend/application_test.py
+++ b/tensorflow/tensorboard/backend/application_test.py
@@ -36,7 +36,6 @@ import tensorflow as tf
from werkzeug import serving
-from tensorflow.core.protobuf import meta_graph_pb2
from tensorflow.tensorboard import tensorboard
from tensorflow.tensorboard.backend import application
from tensorflow.tensorboard.backend.event_processing import event_multiplexer
@@ -221,7 +220,7 @@ class TensorboardServerTest(tf.test.TestCase):
node2.name = 'b'
node2.attr['very_large_attr'].s = b'a' * 2048 # 2 KB attribute
- meta_graph_def = meta_graph_pb2.MetaGraphDef(graph_def=graph_def)
+ meta_graph_def = tf.MetaGraphDef(graph_def=graph_def)
if self._only_use_meta_graph:
writer.add_meta_graph(meta_graph_def)
diff --git a/tensorflow/tensorboard/backend/event_processing/BUILD b/tensorflow/tensorboard/backend/event_processing/BUILD
index 1c3e094e76..dff2dbc90d 100644
--- a/tensorflow/tensorboard/backend/event_processing/BUILD
+++ b/tensorflow/tensorboard/backend/event_processing/BUILD
@@ -134,18 +134,6 @@ py_library(
],
)
-py_test(
- name = "plugin_asset_util_test",
- size = "small",
- srcs = ["plugin_asset_util_test.py"],
- srcs_version = "PY2AND3",
- deps = [
- ":event_multiplexer",
- ":plugin_asset_util",
- "//tensorflow:tensorflow_py",
- ],
-)
-
py_library(
name = "event_file_inspector",
srcs = ["event_file_inspector.py"],
diff --git a/tensorflow/tensorboard/backend/event_processing/event_accumulator_test.py b/tensorflow/tensorboard/backend/event_processing/event_accumulator_test.py
index 9efd64bd2e..4ce766f420 100644
--- a/tensorflow/tensorboard/backend/event_processing/event_accumulator_test.py
+++ b/tensorflow/tensorboard/backend/event_processing/event_accumulator_test.py
@@ -24,7 +24,6 @@ import six
from six.moves import xrange # pylint: disable=redefined-builtin
import tensorflow as tf
-from tensorflow.python.summary.writer.writer import SummaryToEventTransformer
from tensorflow.tensorboard.backend.event_processing import event_accumulator as ea
@@ -760,7 +759,8 @@ class MockingEventAccumulatorTest(EventAccumulatorTest):
def testTFSummaryScalar(self):
"""Verify processing of tf.summary.scalar."""
event_sink = _EventGenerator(self, zero_out_timestamps=True)
- writer = SummaryToEventTransformer(event_sink)
+ writer = tf.summary.FileWriter(self.get_temp_dir())
+ writer.event_writer = event_sink
with self.test_session() as sess:
ipt = tf.placeholder(tf.float32)
tf.summary.scalar('scalar1', ipt)
@@ -794,7 +794,8 @@ class MockingEventAccumulatorTest(EventAccumulatorTest):
def testTFSummaryImage(self):
"""Verify processing of tf.summary.image."""
event_sink = _EventGenerator(self, zero_out_timestamps=True)
- writer = SummaryToEventTransformer(event_sink)
+ writer = tf.summary.FileWriter(self.get_temp_dir())
+ writer.event_writer = event_sink
with self.test_session() as sess:
ipt = tf.ones([10, 4, 4, 3], tf.uint8)
# This is an interesting example, because the old tf.image_summary op
@@ -830,7 +831,8 @@ class MockingEventAccumulatorTest(EventAccumulatorTest):
def testTFSummaryTensor(self):
"""Verify processing of tf.summary.tensor."""
event_sink = _EventGenerator(self, zero_out_timestamps=True)
- writer = SummaryToEventTransformer(event_sink)
+ writer = tf.summary.FileWriter(self.get_temp_dir())
+ writer.event_writer = event_sink
with self.test_session() as sess:
tf.summary.tensor_summary('scalar', tf.constant(1.0))
tf.summary.tensor_summary('vector', tf.constant([1.0, 2.0, 3.0]))
diff --git a/tensorflow/tensorboard/backend/event_processing/plugin_asset_util_test.py b/tensorflow/tensorboard/backend/event_processing/plugin_asset_util_test.py
deleted file mode 100644
index 8fe5b31621..0000000000
--- a/tensorflow/tensorboard/backend/event_processing/plugin_asset_util_test.py
+++ /dev/null
@@ -1,119 +0,0 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os.path
-
-import tensorflow as tf
-
-from tensorflow.python.summary import plugin_asset
-
-from tensorflow.tensorboard.backend.event_processing import event_multiplexer
-from tensorflow.tensorboard.backend.event_processing import plugin_asset_util
-
-
-class GenericContentPlugin(plugin_asset.PluginAsset):
-
- def __init__(self):
- self.contents = "hello world"
-
- def assets(self):
- return {"contents.txt": self.contents}
-
-
-class PluginAlpha(GenericContentPlugin):
- plugin_name = "Alpha"
-
-
-class PluginBeta(GenericContentPlugin):
- plugin_name = "Beta"
-
-
-class PluginGamma(GenericContentPlugin):
- plugin_name = "Gamma"
-
-
-class PluginAssetUtilitiesTest(tf.test.TestCase):
-
- def testGetPluginDirectory(self):
- self.assertEqual(
- os.path.join("logdir", "plugins", "x"),
- plugin_asset_util.PluginDirectory("logdir", "x"))
-
- def testNonExistentDirectory(self):
- tempdir = self.get_temp_dir()
- fake_dir = os.path.join(tempdir, "nonexistent_dir")
- self.assertEqual([], plugin_asset_util.ListPlugins(fake_dir))
- self.assertEqual([], plugin_asset_util.ListAssets(fake_dir, "fake_plugin"))
- with self.assertRaises(KeyError):
- plugin_asset_util.RetrieveAsset(fake_dir, "fake_plugin", "fake_asset")
-
- def testSimplePluginCase(self):
- tempdir = self.get_temp_dir()
- with tf.Graph().as_default() as g:
- plugin_asset.get_plugin_asset(PluginAlpha)
- fw = tf.summary.FileWriter(tempdir)
- fw.add_graph(g)
- self.assertEqual(["Alpha"], plugin_asset_util.ListPlugins(tempdir))
- assets = plugin_asset_util.ListAssets(tempdir, "Alpha")
- self.assertEqual(["contents.txt"], assets)
- contents = plugin_asset_util.RetrieveAsset(tempdir, "Alpha", "contents.txt")
- self.assertEqual("hello world", contents)
-
- def testEventMultiplexerIntegration(self):
- tempdir = self.get_temp_dir()
- with tf.Graph().as_default() as g:
- plugin_instance = plugin_asset.get_plugin_asset(PluginAlpha)
- plugin_instance.contents = "graph one"
- plugin_asset.get_plugin_asset(PluginBeta)
-
- fw = tf.summary.FileWriter(os.path.join(tempdir, "one"))
- fw.add_graph(g)
- fw.close()
-
- with tf.Graph().as_default() as g:
- plugin_instance = plugin_asset.get_plugin_asset(PluginAlpha)
- plugin_instance.contents = "graph two"
- fw = tf.summary.FileWriter(os.path.join(tempdir, "two"))
- fw.add_graph(g)
- fw.close()
-
- multiplexer = event_multiplexer.EventMultiplexer()
- multiplexer.AddRunsFromDirectory(tempdir)
-
- self.assertEqual(
- multiplexer.PluginAssets("Alpha"),
- {"one": ["contents.txt"], "two": ["contents.txt"]})
- self.assertEqual(
- multiplexer.RetrievePluginAsset("one", "Alpha", "contents.txt"),
- "graph one")
- self.assertEqual(
- multiplexer.RetrievePluginAsset("one", "Beta", "contents.txt"),
- "hello world")
- self.assertEqual(
- multiplexer.RetrievePluginAsset("two", "Alpha", "contents.txt"),
- "graph two")
-
- self.assertEqual(
- multiplexer.PluginAssets("Beta"),
- {"one": ["contents.txt"], "two": []})
- self.assertEqual(multiplexer.PluginAssets("Gamma"), {"one": [], "two": []})
-
-
-if __name__ == "__main__":
- tf.test.main()
diff --git a/tensorflow/tensorboard/plugins/scalars/scalars_demo.py b/tensorflow/tensorboard/plugins/scalars/scalars_demo.py
index e9b2a6129d..f3195fd849 100644
--- a/tensorflow/tensorboard/plugins/scalars/scalars_demo.py
+++ b/tensorflow/tensorboard/plugins/scalars/scalars_demo.py
@@ -23,8 +23,6 @@ import os.path
from six.moves import xrange # pylint: disable=redefined-builtin
import tensorflow as tf
-from tensorflow.python.platform import app
-
# Directory into which to write tensorboard data.
LOGDIR = '/tmp/scalars_demo'
@@ -129,4 +127,4 @@ def main(unused_argv):
if __name__ == '__main__':
- app.run()
+ tf.app.run()
diff --git a/tensorflow/tensorboard/plugins/text/text_plugin.py b/tensorflow/tensorboard/plugins/text/text_plugin.py
index 2c11b80029..d0040e20be 100644
--- a/tensorflow/tensorboard/plugins/text/text_plugin.py
+++ b/tensorflow/tensorboard/plugins/text/text_plugin.py
@@ -35,7 +35,6 @@ import six
import tensorflow as tf
from werkzeug import wrappers
-from tensorflow.python.summary import text_summary
from tensorflow.tensorboard.backend import http_util
from tensorflow.tensorboard.plugins import base_plugin
@@ -256,7 +255,7 @@ class TextPlugin(base_plugin.TBPlugin):
def index_impl(self):
run_to_series = {}
- name = text_summary.TextSummaryPluginAsset.plugin_name
+ name = 'tensorboard_text'
run_to_assets = self.multiplexer.PluginAssets(name)
for run, assets in run_to_assets.items():
diff --git a/tensorflow/tools/ci_build/Dockerfile.gpu b/tensorflow/tools/ci_build/Dockerfile.gpu
index 68493965fa..5d18295f68 100644
--- a/tensorflow/tools/ci_build/Dockerfile.gpu
+++ b/tensorflow/tools/ci_build/Dockerfile.gpu
@@ -1,14 +1,15 @@
-FROM nvidia/cuda:8.0-cudnn5-devel-ubuntu14.04
+FROM nvidia/cuda:8.0-cudnn6-devel-ubuntu14.04
MAINTAINER Jan Prach <jendap@google.com>
# In the Ubuntu 14.04 images, cudnn is placed in system paths. Move them to
# /usr/local/cuda
-RUN cp /usr/include/cudnn.h /usr/local/cuda/include
-RUN cp /usr/lib/x86_64-linux-gnu/libcudnn* /usr/local/cuda/lib64
+RUN cp -P /usr/include/cudnn.h /usr/local/cuda/include
+RUN cp -P /usr/lib/x86_64-linux-gnu/libcudnn* /usr/local/cuda/lib64
# Copy and run the install scripts.
COPY install/*.sh /install/
+ARG DEBIAN_FRONTEND=noninteractive
RUN /install/install_bootstrap_deb_packages.sh
RUN add-apt-repository -y ppa:openjdk-r/ppa && \
add-apt-repository -y ppa:george-edison55/cmake-3.x
diff --git a/tensorflow/tools/ci_build/Dockerfile.gpu_clang b/tensorflow/tools/ci_build/Dockerfile.gpu_clang
index 00aaa9f760..c4342d17f5 100644
--- a/tensorflow/tools/ci_build/Dockerfile.gpu_clang
+++ b/tensorflow/tools/ci_build/Dockerfile.gpu_clang
@@ -1,4 +1,4 @@
-FROM nvidia/cuda:8.0-cudnn5-devel-ubuntu14.04
+FROM nvidia/cuda:8.0-cudnn6-devel-ubuntu14.04
MAINTAINER Ilya Biryukov <ibiryukov@google.com>
diff --git a/tensorflow/tools/ci_build/install/install_pip_packages.sh b/tensorflow/tools/ci_build/install/install_pip_packages.sh
index b8f9fc8453..8768852dc7 100755
--- a/tensorflow/tools/ci_build/install/install_pip_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_pip_packages.sh
@@ -85,3 +85,6 @@ pip2 install mock
pip2 install portpicker
pip3 install portpicker
+
+pip2 install backports.weakref==1.0rc1
+pip3 install backports.weakref==1.0rc1
diff --git a/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh b/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh
index e7e2d256cd..edfc4e3a98 100755
--- a/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh
@@ -89,3 +89,6 @@ pip3.5 install wheel==0.29.0
pip3.5 install portpicker
pip3.5 install werkzeug
+
+pip3.5 install backports.weakref==1.0rc1
+
diff --git a/tensorflow/tools/docker/Dockerfile.devel-gpu b/tensorflow/tools/docker/Dockerfile.devel-gpu
index d1de9eace5..d0a038a9db 100644
--- a/tensorflow/tools/docker/Dockerfile.devel-gpu
+++ b/tensorflow/tools/docker/Dockerfile.devel-gpu
@@ -1,4 +1,4 @@
-FROM nvidia/cuda:8.0-cudnn5-devel-ubuntu16.04
+FROM nvidia/cuda:8.0-cudnn6-devel-ubuntu16.04
MAINTAINER Craig Citro <craigcitro@google.com>
diff --git a/tensorflow/tools/docker/Dockerfile.gpu b/tensorflow/tools/docker/Dockerfile.gpu
index 88876421f5..3ba1e963f9 100644
--- a/tensorflow/tools/docker/Dockerfile.gpu
+++ b/tensorflow/tools/docker/Dockerfile.gpu
@@ -1,4 +1,4 @@
-FROM nvidia/cuda:8.0-cudnn5-devel-ubuntu16.04
+FROM nvidia/cuda:8.0-cudnn6-devel-ubuntu16.04
MAINTAINER Craig Citro <craigcitro@google.com>
diff --git a/tensorflow/tools/docs/parser.py b/tensorflow/tools/docs/parser.py
index 52a65e1c9d..7ae1d2abd9 100644
--- a/tensorflow/tools/docs/parser.py
+++ b/tensorflow/tools/docs/parser.py
@@ -897,7 +897,7 @@ class _ClassPageInfo(object):
@property
def guides(self):
- """Returns a markdown string containing backlinks to relevent api_guides."""
+ """Returns a markdown string containing backlinks to relevant api_guides."""
return self._guides
def set_guides(self, guides):
diff --git a/tensorflow/tools/lib_package/BUILD b/tensorflow/tools/lib_package/BUILD
index 1e36af93ea..51ba3b7a0b 100644
--- a/tensorflow/tools/lib_package/BUILD
+++ b/tensorflow/tools/lib_package/BUILD
@@ -97,6 +97,7 @@ genrule(
"@jemalloc//:COPYING",
"@jpeg//:LICENSE.md",
"@libxsmm_archive//:LICENSE",
+ "@lmdb//:LICENSE",
"@local_config_sycl//sycl:LICENSE.text",
"@png_archive//:LICENSE",
"@protobuf//:LICENSE",
@@ -126,6 +127,7 @@ genrule(
"@jemalloc//:COPYING",
"@jpeg//:LICENSE.md",
"@libxsmm_archive//:LICENSE",
+ "@lmdb//:LICENSE",
"@local_config_sycl//sycl:LICENSE.text",
"@png_archive//:LICENSE",
"@protobuf//:LICENSE",
diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD
index 0c2660ff37..09fe6c53cd 100644
--- a/tensorflow/tools/pip_package/BUILD
+++ b/tensorflow/tools/pip_package/BUILD
@@ -110,6 +110,7 @@ filegroup(
"@jemalloc//:COPYING",
"@jpeg//:LICENSE.md",
"@libxsmm_archive//:LICENSE",
+ "@lmdb//:LICENSE",
"@local_config_sycl//sycl:LICENSE.text",
"@nanopb_git//:LICENSE.txt",
"@org_html5lib//:LICENSE",
diff --git a/tensorflow/tools/pip_package/pip_smoke_test.py b/tensorflow/tools/pip_package/pip_smoke_test.py
index 0524d2f1aa..58a80fd98a 100644
--- a/tensorflow/tools/pip_package/pip_smoke_test.py
+++ b/tensorflow/tools/pip_package/pip_smoke_test.py
@@ -46,6 +46,7 @@ BLACKLIST = [
"//tensorflow/python:tf_optimizer",
"//tensorflow/python:compare_test_proto_py",
"//tensorflow/core:image_testdata",
+ "//tensorflow/core:lmdb_testdata",
"//tensorflow/core/kernels/cloud:bigquery_reader_ops",
"//tensorflow/python/feature_column:vocabulary_testdata",
"//tensorflow/python:framework/test_file_system.so",
diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py
index 79b8e5bfa3..eb76fb3ce3 100644
--- a/tensorflow/tools/pip_package/setup.py
+++ b/tensorflow/tools/pip_package/setup.py
@@ -39,6 +39,7 @@ REQUIRED_PACKAGES = [
'html5lib == 0.9999999', # identical to 1.0b8
'markdown == 2.2.0',
'bleach == 1.5.0',
+ 'backports.weakref == 1.0rc1',
]
project_name = 'tensorflow'
diff --git a/tensorflow/tools/test/system_info_lib.py b/tensorflow/tools/test/system_info_lib.py
index c352abaa54..0cc261591b 100644
--- a/tensorflow/tools/test/system_info_lib.py
+++ b/tensorflow/tools/test/system_info_lib.py
@@ -96,7 +96,7 @@ def gather_cpu_info():
cpu_info.cpu_info = info['brand']
cpu_info.num_cores = info['count']
cpu_info.mhz_per_cpu = info['hz_advertised_raw'][0] / 1.0e6
- l2_cache_size = re.match(r'(\d+)', str(info['l2_cache_size']))
+ l2_cache_size = re.match(r'(\d+)', str(info.get('l2_cache_size', '')))
if l2_cache_size:
# If a value is returned, it's in KB
cpu_info.cache_size['L2'] = int(l2_cache_size.group(0)) * 1024
diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index 18bc9a8277..d7dd16919a 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -508,6 +508,17 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
)
native.new_http_archive(
+ name = "lmdb",
+ urls = [
+ "http://mirror.bazel.build/github.com/LMDB/lmdb/archive/LMDB_0.9.19.tar.gz",
+ "https://github.com/LMDB/lmdb/archive/LMDB_0.9.19.tar.gz",
+ ],
+ sha256 = "108532fb94c6f227558d45be3f3347b52539f0f58290a7bb31ec06c462d05326",
+ strip_prefix = "lmdb-LMDB_0.9.19/libraries/liblmdb",
+ build_file = str(Label("//third_party:lmdb.BUILD")),
+ )
+
+ native.new_http_archive(
name = "jsoncpp_git",
urls = [
"http://mirror.bazel.build/github.com/open-source-parsers/jsoncpp/archive/11086dd6a7eba04289944367ca82cea71299ed70.tar.gz",
diff --git a/third_party/lmdb.BUILD b/third_party/lmdb.BUILD
new file mode 100644
index 0000000000..357e4a9335
--- /dev/null
+++ b/third_party/lmdb.BUILD
@@ -0,0 +1,25 @@
+# Description:
+# LMDB is the Lightning Memory-mapped Database.
+
+licenses(["notice"]) # OpenLDAP Public License
+
+exports_files(["LICENSE"])
+
+cc_library(
+ name = "lmdb",
+ srcs = [
+ "mdb.c",
+ "midl.c",
+ ],
+ hdrs = [
+ "lmdb.h",
+ "midl.h",
+ ],
+ copts = [
+ "-w",
+ ],
+ linkopts = [
+ "-lpthread",
+ ],
+ visibility = ["//visibility:public"],
+)