diff options
97 files changed, 2122 insertions, 857 deletions
@@ -466,7 +466,7 @@ done while true; do # Configure the Cuda SDK version to use. if [ -z "$TF_CUDA_VERSION" ]; then - read -p "Please specify the CUDA SDK version you want to use, e.g. 7.0. [Leave empty to use system default]: " TF_CUDA_VERSION + read -p "Please specify the CUDA SDK version you want to use, e.g. 7.0. [Leave empty to default to CUDA 8.0]: " TF_CUDA_VERSION fi fromuser="" @@ -509,7 +509,6 @@ while true; do export CUDA_TOOLKIT_PATH write_action_env_to_bazelrc "CUDA_TOOLKIT_PATH" "$CUDA_TOOLKIT_PATH" export TF_CUDA_VERSION - write_action_env_to_bazelrc "TF_CUDA_VERSION" "$TF_CUDA_VERSION" break fi echo "Invalid path to CUDA $TF_CUDA_VERSION toolkit. ${CUDA_TOOLKIT_PATH}/${CUDA_RT_LIB_PATH} cannot be found" @@ -522,6 +521,13 @@ while true; do CUDA_TOOLKIT_PATH="" done +# Set default CUDA version if not set +if [ -z "$TF_CUDA_VERSION" ]; then + TF_CUDA_VERSION="8.0" + export TF_CUDA_VERSION +fi +write_action_env_to_bazelrc "TF_CUDA_VERSION" "$TF_CUDA_VERSION" + # Set up which gcc nvcc should use as the host compiler # No need to set this on Windows while [[ "$TF_CUDA_CLANG" != "1" ]] && ! is_windows && true; do @@ -555,7 +561,7 @@ done while true; do # Configure the cuDNN version to use. if [ -z "$TF_CUDNN_VERSION" ]; then - read -p "Please specify the cuDNN version you want to use. [Leave empty to use system default]: " TF_CUDNN_VERSION + read -p "Please specify the cuDNN version you want to use. [Leave empty to default to cuDNN 6.0]: " TF_CUDNN_VERSION fi fromuser="" @@ -605,7 +611,6 @@ while true; do CUDNN_PATH_FROM_LDCONFIG="$($LDCONFIG_BIN -p | sed -n 's/.*libcudnn.so .* => \(.*\)/\1/p')" if [ -e "${CUDNN_PATH_FROM_LDCONFIG}${TF_CUDNN_EXT}" ]; then export TF_CUDNN_VERSION - write_action_env_to_bazelrc "TF_CUDNN_VERSION" "$TF_CUDNN_VERSION" export CUDNN_INSTALL_PATH="$(dirname ${CUDNN_PATH_FROM_LDCONFIG})" write_action_env_to_bazelrc "CUDNN_INSTALL_PATH" "$CUDNN_INSTALL_PATH" break @@ -626,6 +631,13 @@ while true; do CUDNN_INSTALL_PATH="" done +# Set default CUDNN version if not set +if [ -z "$TF_CUDNN_VERSION" ]; then + TF_CUDNN_VERSION="6" + export TF_CUDNN_VERSION +fi +write_action_env_to_bazelrc "TF_CUDNN_VERSION" "$TF_CUDNN_VERSION" + # Configure the compute capabilities that TensorFlow builds for. # Since Cuda toolkit is not backward-compatible, this is not guaranteed to work. while true; do @@ -844,6 +856,4 @@ if [ "$TF_NEED_MPI" == "1" ]; then fi -# TODO(gunan): Remove once bazel correctly handles changes in remote repositories. -bazel clean echo "Configuration finished" diff --git a/tensorflow/compiler/jit/BUILD b/tensorflow/compiler/jit/BUILD index 277e3c9906..5f857191da 100644 --- a/tensorflow/compiler/jit/BUILD +++ b/tensorflow/compiler/jit/BUILD @@ -84,7 +84,7 @@ cc_library( "//tensorflow/compiler/jit/kernels:xla_device_launch_op", "//tensorflow/compiler/tf2xla:xla_compiler", "//tensorflow/compiler/tf2xla/kernels:xla_ops", - "//tensorflow/compiler/xla/service:cpu_plugin", + "//tensorflow/compiler/xla/service:cpu_plugin", # buildcleaner: keep "//tensorflow/core:core_cpu_internal", "//tensorflow/core:lib", ], @@ -101,7 +101,7 @@ cc_library( "//tensorflow/compiler/jit/kernels:xla_device_launch_op", "//tensorflow/compiler/tf2xla:xla_compiler", "//tensorflow/compiler/tf2xla/kernels:xla_ops", - "//tensorflow/compiler/xla/service:gpu_plugin", + "//tensorflow/compiler/xla/service:gpu_plugin", # buildcleaner: keep "//tensorflow/core:core_cpu_internal", "//tensorflow/core:lib", ], diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD index 8fa7d044d1..3f93a7a511 100644 --- a/tensorflow/compiler/xla/service/BUILD +++ b/tensorflow/compiler/xla/service/BUILD @@ -679,7 +679,6 @@ cc_test( srcs = ["buffer_liveness_test.cc"], deps = [ ":buffer_liveness", - ":cpu_plugin", ":hlo", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:types", @@ -725,7 +724,6 @@ cc_test( ":call_graph", ":computation_tracker", ":copy_insertion", - ":cpu_plugin", ":flatten_call_graph", ":hlo", ":hlo_ordering", @@ -790,7 +788,6 @@ cc_test( name = "hlo_ordering_test", srcs = ["hlo_ordering_test.cc"], deps = [ - ":cpu_plugin", ":hlo", ":hlo_ordering", "//tensorflow/compiler/xla:shape_util", @@ -860,7 +857,6 @@ cc_test( srcs = ["algebraic_simplifier_test.cc"], deps = [ ":algebraic_simplifier", - ":cpu_plugin", ":hlo", ":hlo_matchers", ":hlo_pass", @@ -928,7 +924,6 @@ cc_test( name = "inliner_test", srcs = ["inliner_test.cc"], deps = [ - ":cpu_plugin", ":hlo", ":hlo_matchers", ":inliner", @@ -1085,7 +1080,6 @@ cc_test( name = "hlo_computation_test", srcs = ["hlo_computation_test.cc"], deps = [ - ":cpu_plugin", ":hlo", ":hlo_matchers", "//tensorflow/compiler/xla:literal_util", @@ -1116,7 +1110,6 @@ cc_test( name = "hlo_module_test", srcs = ["hlo_module_test.cc"], deps = [ - ":cpu_plugin", ":hlo", "//tensorflow/compiler/xla:literal_util", "//tensorflow/compiler/xla:shape_util", @@ -1256,7 +1249,6 @@ cc_test( srcs = ["copy_insertion_test.cc"], deps = [ ":copy_insertion", - ":cpu_plugin", ":hlo", ":hlo_matchers", ":tuple_points_to_analysis", @@ -1321,7 +1313,6 @@ cc_test( name = "hlo_rematerialization_test", srcs = ["hlo_rematerialization_test.cc"], deps = [ - ":cpu_plugin", ":hlo", ":hlo_matchers", ":hlo_ordering", @@ -1338,7 +1329,6 @@ cc_test( name = "hlo_dce_test", srcs = ["hlo_dce_test.cc"], deps = [ - ":cpu_plugin", ":hlo", ":hlo_dce", "//tensorflow/compiler/xla:literal_util", @@ -1361,7 +1351,6 @@ cc_test( deps = [ ":algebraic_simplifier", ":computation_layout", - ":cpu_plugin", ":hlo", ":hlo_matchers", ":layout_assignment", @@ -1434,7 +1423,6 @@ cc_test( name = "hlo_cse_test", srcs = ["hlo_cse_test.cc"], deps = [ - ":cpu_plugin", ":hlo", ":hlo_cse", ":hlo_matchers", @@ -1470,7 +1458,6 @@ cc_test( name = "hlo_constant_folding_test", srcs = ["hlo_constant_folding_test.cc"], deps = [ - ":cpu_plugin", ":hlo", ":hlo_constant_folding", ":hlo_matchers", diff --git a/tensorflow/compiler/xla/service/shape_inference.cc b/tensorflow/compiler/xla/service/shape_inference.cc index 0840fdf930..afe1a54d3e 100644 --- a/tensorflow/compiler/xla/service/shape_inference.cc +++ b/tensorflow/compiler/xla/service/shape_inference.cc @@ -1103,7 +1103,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( for (int64 dim = 0; dim < slice_sizes.size(); ++dim) { const int64 input_dim_size = operand_shape.dimensions(dim); const int64 slice_dim_size = slice_sizes[dim]; - if (slice_dim_size <= 0) { + if (slice_dim_size < 0) { return InvalidArgument("negative size index to dynamic slice: %lld", slice_dim_size); } @@ -1175,9 +1175,9 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( for (int64 dim = 0; dim < ShapeUtil::Rank(operand_shape); ++dim) { const int64 input_dim_size = operand_shape.dimensions(dim); const int64 update_dim_size = update_shape.dimensions(dim); - if (update_dim_size <= 0) { + if (update_dim_size < 0) { return InvalidArgument( - "size index %lld to dynamic update slice must be > 0", + "size index %lld to dynamic update slice must be >= 0", update_dim_size); } if (update_dim_size > input_dim_size) { diff --git a/tensorflow/compiler/xla/tests/dynamic_ops_test.cc b/tensorflow/compiler/xla/tests/dynamic_ops_test.cc index cdb4498f4e..ea7908f55c 100644 --- a/tensorflow/compiler/xla/tests/dynamic_ops_test.cc +++ b/tensorflow/compiler/xla/tests/dynamic_ops_test.cc @@ -58,6 +58,8 @@ class DynamicSliceTest : public ClientLibraryTestBase { // Slice at dimension boundaries, but with sizes that cause indices to wrap. RunR1<IndexT>({0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0}, {6}, {4}, {6.0, 7.0, 0.0, 1.0}); + // Zero element slice. + RunR1<IndexT>({0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0}, {2}, {0}, {}); } template <typename IndexT> @@ -75,6 +77,12 @@ class DynamicSliceTest : public ClientLibraryTestBase { RunR2<IndexT>({{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}, {7.0f, 8.0f, 9.0f}}, {1, 1}, {3, 3}, {{5.0f, 6.0f, 4.0f}, {8.0f, 9.0f, 7.0f}, {2.0f, 3.0f, 1.0f}}); + // Zero element slice: 2x0. + RunR2<IndexT>({{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}, {7.0f, 8.0f, 9.0f}}, + {0, 0}, {2, 0}, {{}, {}}); + // Zero element slice: 0x2. + RunR2<IndexT>({{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}, {7.0f, 8.0f, 9.0f}}, + {0, 0}, {0, 2}, Array2D<float>(0, 2)); } template <typename IndexT> @@ -200,6 +208,10 @@ class DynamicUpdateSliceTest : public ClientLibraryTestBase { RunR1<IndexT>({0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0}, {8.0, 9.0, 10.0}, {6}, {0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 8.0, 9.0}); + // Zero-sized update. + RunR1<IndexT>({0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0}, + {}, {2}, + {0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0}); // clang-format on } @@ -226,6 +238,11 @@ class DynamicUpdateSliceTest : public ClientLibraryTestBase { {{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}, {7.0f, 8.0f, 9.0f}}, {{10.0f, 11.0f}}, {2, 2}, {{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}, {7.0f, 8.0f, 10.0f}}); + // Zero-sized update. + RunR2<IndexT>( + {{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}, {7.0f, 8.0f, 9.0f}}, + {{}}, {2, 1}, + {{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}, {7.0f, 8.0f, 9.0f}}); // clang-format on } diff --git a/tensorflow/contrib/cmake/CMakeLists.txt b/tensorflow/contrib/cmake/CMakeLists.txt index 2c770e75dd..9ffe08eded 100644 --- a/tensorflow/contrib/cmake/CMakeLists.txt +++ b/tensorflow/contrib/cmake/CMakeLists.txt @@ -113,6 +113,7 @@ include(zlib) include(gif) include(png) include(jpeg) +include(lmdb) include(eigen) include(gemmlowp) include(jsoncpp) @@ -129,6 +130,7 @@ set(tensorflow_EXTERNAL_LIBRARIES ${gif_STATIC_LIBRARIES} ${png_STATIC_LIBRARIES} ${jpeg_STATIC_LIBRARIES} + ${lmdb_STATIC_LIBRARIES} ${jsoncpp_STATIC_LIBRARIES} ${farmhash_STATIC_LIBRARIES} ${fft2d_STATIC_LIBRARIES} @@ -140,6 +142,7 @@ set(tensorflow_EXTERNAL_DEPENDENCIES gif_copy_headers_to_destination png_copy_headers_to_destination jpeg_copy_headers_to_destination + lmdb_copy_headers_to_destination jsoncpp farmhash_copy_headers_to_destination highwayhash_copy_headers_to_destination @@ -158,6 +161,7 @@ include_directories( ${gif_INCLUDE_DIR} ${png_INCLUDE_DIR} ${jpeg_INCLUDE_DIR} + ${lmdb_INCLUDE_DIR} ${eigen_INCLUDE_DIRS} ${gemmlowp_INCLUDE_DIR} ${jsoncpp_INCLUDE_DIR} diff --git a/tensorflow/contrib/cmake/external/lmdb.cmake b/tensorflow/contrib/cmake/external/lmdb.cmake new file mode 100644 index 0000000000..28ec833bab --- /dev/null +++ b/tensorflow/contrib/cmake/external/lmdb.cmake @@ -0,0 +1,60 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +include (ExternalProject) + +set(lmdb_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/external/lmdb) +set(lmdb_URL http://mirror.bazel.build/github.com/LMDB/lmdb/archive/LMDB_0.9.19.tar.gz) +set(lmdb_HASH SHA256=108532fb94c6f227558d45be3f3347b52539f0f58290a7bb31ec06c462d05326) +set(lmdb_BUILD ${CMAKE_BINARY_DIR}/lmdb/src/lmdb) +set(lmdb_INSTALL ${CMAKE_BINARY_DIR}/lmdb/install) + +ExternalProject_Add(lmdb + PREFIX lmdb + URL ${lmdb_URL} + URL_HASH ${lmdb_HASH} + PATCH_COMMAND ${CMAKE_COMMAND} -E copy_if_different + ${CMAKE_CURRENT_SOURCE_DIR}/patches/lmdb/CMakeLists.txt ${lmdb_BUILD} + INSTALL_DIR ${lmdb_INSTALL} + DOWNLOAD_DIR "${DOWNLOAD_LOCATION}" + CMAKE_CACHE_ARGS + -DCMAKE_BUILD_TYPE:STRING=Release + -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF + -DCMAKE_INSTALL_PREFIX:STRING=${lmdb_INSTALL} + -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON +) + +if(WIN32) + set(lmdb_STATIC_LIBRARIES ${lmdb_INSTALL}/lib/lmdb.lib) +else() + set(lmdb_STATIC_LIBRARIES ${lmdb_INSTALL}/lib/liblmdb.a) +endif() + +set(lmdb_HEADERS + "${lmdb_INSTALL}/include/lmdb.h" + "${lmdb_INSTALL}/include/midl.h" +) + +## put lmdb includes in the directory where they are expected +add_custom_target(lmdb_create_destination_dir + COMMAND ${CMAKE_COMMAND} -E make_directory ${lmdb_INCLUDE_DIR} + DEPENDS lmdb) + +add_custom_target(lmdb_copy_headers_to_destination + DEPENDS lmdb_create_destination_dir) + +foreach(header_file ${lmdb_HEADERS}) + add_custom_command(TARGET lmdb_copy_headers_to_destination PRE_BUILD + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${header_file} ${lmdb_INCLUDE_DIR}/) +endforeach() diff --git a/tensorflow/contrib/cmake/patches/lmdb/CMakeLists.txt b/tensorflow/contrib/cmake/patches/lmdb/CMakeLists.txt new file mode 100644 index 0000000000..19fa607a10 --- /dev/null +++ b/tensorflow/contrib/cmake/patches/lmdb/CMakeLists.txt @@ -0,0 +1,26 @@ +cmake_minimum_required(VERSION 2.8.3) + +project(liblmdb) + +set(LIBLMDB_SRCS + "libraries/liblmdb/mdb.c" + "libraries/liblmdb/midl.c" +) + +set(LIBLMDB_INCLUDES + "libraries/liblmdb/lmdb.h" + "libraries/liblmdb/midl.h" +) + +include_directories("${CMAKE_CURRENT_SOURCE_DIR}") + +add_library(lmdb ${LIBLMDB_SRCS}) + +install(TARGETS lmdb + RUNTIME DESTINATION bin COMPONENT RuntimeLibraries + LIBRARY DESTINATION lib COMPONENT RuntimeLibraries + ARCHIVE DESTINATION lib COMPONENT Development) + +foreach(LIBLMDB_INCLUDE ${LIBLMDB_INCLUDES}) + install(FILES ${LIBLMDB_INCLUDE} DESTINATION include COMPONENT Development) +endforeach() diff --git a/tensorflow/contrib/keras/python/keras/datasets/boston_housing.py b/tensorflow/contrib/keras/python/keras/datasets/boston_housing.py index ac0f0fd422..36b20451ff 100644 --- a/tensorflow/contrib/keras/python/keras/datasets/boston_housing.py +++ b/tensorflow/contrib/keras/python/keras/datasets/boston_housing.py @@ -37,8 +37,11 @@ def load_data(path='boston_housing.npz', seed=113, test_split=0.2): Tuple of Numpy arrays: `(x_train, y_train), (x_test, y_test)`. """ assert 0 <= test_split < 1 + fh = 'f553886a1f8d56431e820c5b82552d9d95cfcb96d1e678153f8839538947dff5' path = get_file( - path, origin='https://s3.amazonaws.com/keras-datasets/boston_housing.npz') + path, + origin='https://s3.amazonaws.com/keras-datasets/boston_housing.npz', + file_hash=fh) f = np.load(path) x = f['x'] y = f['y'] diff --git a/tensorflow/contrib/keras/python/keras/layers/convolutional_test.py b/tensorflow/contrib/keras/python/keras/layers/convolutional_test.py index 590cdc6504..7e567d3fb0 100644 --- a/tensorflow/contrib/keras/python/keras/layers/convolutional_test.py +++ b/tensorflow/contrib/keras/python/keras/layers/convolutional_test.py @@ -114,16 +114,19 @@ class Conv2DTest(test.TestCase): continue with self.test_session(use_gpu=True): - testing_utils.layer_test( - keras.layers.Conv2D, - kwargs={ - 'filters': filters, - 'kernel_size': kernel_size, - 'padding': padding, - 'strides': strides, - 'data_format': 'channels_first' - }, - input_shape=(num_samples, stack_size, num_row, num_col)) + # Only runs on GPU with CUDA, channels_first is not supported on CPU. + # TODO(b/62340061): Support channels_first on CPU. + if test.is_gpu_available(cuda_only=True): + testing_utils.layer_test( + keras.layers.Conv2D, + kwargs={ + 'filters': filters, + 'kernel_size': kernel_size, + 'padding': padding, + 'strides': strides, + 'data_format': 'channels_first' + }, + input_shape=(num_samples, stack_size, num_row, num_col)) def test_convolution_2d_regularization(self): # regularizers diff --git a/tensorflow/contrib/keras/python/keras/layers/pooling_test.py b/tensorflow/contrib/keras/python/keras/layers/pooling_test.py index bbf695a1ba..d8a6a1673b 100644 --- a/tensorflow/contrib/keras/python/keras/layers/pooling_test.py +++ b/tensorflow/contrib/keras/python/keras/layers/pooling_test.py @@ -100,15 +100,18 @@ class Pooling2DTest(test.TestCase): 'padding': 'valid', 'pool_size': (3, 3)}, input_shape=(3, 5, 6, 4)) - testing_utils.layer_test( - keras.layers.AveragePooling2D, - kwargs={ - 'strides': (1, 1), - 'padding': 'valid', - 'pool_size': (2, 2), - 'data_format': 'channels_first' - }, - input_shape=(3, 4, 5, 6)) + # Only runs on GPU with CUDA, channels_first is not supported on CPU. + # TODO(b/62340061): Support channels_first on CPU. + if test.is_gpu_available(cuda_only=True): + testing_utils.layer_test( + keras.layers.AveragePooling2D, + kwargs={ + 'strides': (1, 1), + 'padding': 'valid', + 'pool_size': (2, 2), + 'data_format': 'channels_first' + }, + input_shape=(3, 4, 5, 6)) class Pooling3DTest(test.TestCase): diff --git a/tensorflow/contrib/mpi/mpi_rendezvous_mgr.cc b/tensorflow/contrib/mpi/mpi_rendezvous_mgr.cc index 2e9544b0b5..e97e8d0163 100644 --- a/tensorflow/contrib/mpi/mpi_rendezvous_mgr.cc +++ b/tensorflow/contrib/mpi/mpi_rendezvous_mgr.cc @@ -257,7 +257,7 @@ void MPIRendezvousMgr::AddRequest(RecvTensorRequest request, this->QueueSendRequest(req); // Wait for the notification that indicates the tensor has been - // succesfully transmitted to the remote process. Only needed if we + // successfully transmitted to the remote process. Only needed if we // have not parsed the tensor to proto if (doOptimalTransfer) mpi_send_call->n_.WaitForNotification(); }; // done_cb diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index 1cc712c2e1..b76226f2c9 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -1894,7 +1894,6 @@ cc_library( ":framework", ":lib", ":lib_internal", - ":test", ], ) @@ -1909,7 +1908,7 @@ cc_library( deps = [ ":lib", ":lib_internal", - ":test", + ":test", # buildcleaner: keep "//tensorflow/core/platform/default/build_config:test_main", ], alwayslink = 1, @@ -2887,6 +2886,20 @@ filegroup( ) filegroup( + name = "lmdb_testdata", + testonly = 1, + srcs = [ + # A simple key-value store: + # 0 : 'a' + # 1 : 'b' + # ... + # 9 : 'j' + "lib/lmdb/testdata/data.mdb", + ], + visibility = ["//visibility:public"], +) + +filegroup( name = "example_parser_configuration_testdata", srcs = [ "example/testdata/parse_example_graph_def.pbtxt", diff --git a/tensorflow/core/common_runtime/gpu/process_state.cc b/tensorflow/core/common_runtime/gpu/process_state.cc index 0e21e37fd3..7a1c10d900 100644 --- a/tensorflow/core/common_runtime/gpu/process_state.cc +++ b/tensorflow/core/common_runtime/gpu/process_state.cc @@ -127,7 +127,7 @@ Allocator* ProcessState::GetGPUAllocator(const GPUOptions& options, int gpu_id, gpu_platform->ExecutorForDevice(gpu_id).ValueOrDie(); int bus_id = se->GetDeviceDescription().numa_node(); if (bus_id >= 0 && bus_id < static_cast<int64>(gpu_visitors_.size())) { - for (auto v : gpu_visitors_[bus_id]) { + for (const auto& v : gpu_visitors_[bus_id]) { gpu_allocators_[gpu_id]->AddAllocVisitor(v); } } diff --git a/tensorflow/core/common_runtime/kernel_benchmark_testlib.cc b/tensorflow/core/common_runtime/kernel_benchmark_testlib.cc index 7b5cc1c5cb..4a5b88d5fd 100644 --- a/tensorflow/core/common_runtime/kernel_benchmark_testlib.cc +++ b/tensorflow/core/common_runtime/kernel_benchmark_testlib.cc @@ -39,7 +39,8 @@ namespace tensorflow { namespace test { Benchmark::Benchmark(const string& device, Graph* g, - const SessionOptions* options, Graph* init) { + const SessionOptions* options, Graph* init, + Rendezvous* rendez) { SessionOptions default_options; if (!options) { options = &default_options; @@ -61,7 +62,11 @@ Benchmark::Benchmark(const string& device, Graph* g, pool_->Schedule(closure); }; - rendez_ = NewLocalRendezvous(); + if (rendez == nullptr) { + rendez_ = NewLocalRendezvous(); + } else { + rendez_ = rendez; + } const int graph_def_version = g->versions().producer(); diff --git a/tensorflow/core/common_runtime/kernel_benchmark_testlib.h b/tensorflow/core/common_runtime/kernel_benchmark_testlib.h index 278a6b3f9f..3a7b3a5ace 100644 --- a/tensorflow/core/common_runtime/kernel_benchmark_testlib.h +++ b/tensorflow/core/common_runtime/kernel_benchmark_testlib.h @@ -35,10 +35,11 @@ namespace test { class Benchmark { public: - // "device" must be either "cpu" or "gpu". Takes ownership of "g" - // and "init". + // "device" must be either "cpu" or "gpu". Takes ownership of "g", + // "init", and one reference on "rendez" (if not null). Benchmark(const string& device, Graph* g, - const SessionOptions* options = nullptr, Graph* init = nullptr); + const SessionOptions* options = nullptr, Graph* init = nullptr, + Rendezvous* rendez = nullptr); ~Benchmark(); // Executes the graph for "iters" times. diff --git a/tensorflow/core/common_runtime/shape_refiner_test.cc b/tensorflow/core/common_runtime/shape_refiner_test.cc index 466b779e9b..b705bd74c2 100644 --- a/tensorflow/core/common_runtime/shape_refiner_test.cc +++ b/tensorflow/core/common_runtime/shape_refiner_test.cc @@ -557,7 +557,7 @@ TEST(ShapeRefinerTest, ConstantValueAsShape_PackInt32) { .Finalize(root.graph(), &result)); ShapeRefiner m(TF_GRAPH_DEF_VERSION, OpRegistry::Global()); - for (auto input : inputs) { + for (const auto& input : inputs) { TF_ASSERT_OK(m.AddNode(input.node())); } TF_ASSERT_OK(m.AddNode(pack.node())); diff --git a/tensorflow/core/debug/debug_io_utils.cc b/tensorflow/core/debug/debug_io_utils.cc index 25847a20a4..54366ce249 100644 --- a/tensorflow/core/debug/debug_io_utils.cc +++ b/tensorflow/core/debug/debug_io_utils.cc @@ -119,6 +119,18 @@ Status PublishEncodedGraphDefInChunks(const string& encoded_graph_def, } // namespace +// static +const char* const DebugIO::kMetadataFilePrefix = "_tfdbg_"; + +// static +const char* const DebugIO::kCoreMetadataTag = "core_metadata_"; + +// static +const char* const DebugIO::kDeviceTag = "device_"; + +// static +const char* const DebugIO::kGraphTag = "graph_"; + DebugNodeKey::DebugNodeKey(const string& device_name, const string& node_name, const int32 output_slot, const string& debug_op) : device_name(device_name), @@ -126,7 +138,8 @@ DebugNodeKey::DebugNodeKey(const string& device_name, const string& node_name, output_slot(output_slot), debug_op(debug_op), debug_node_name( - strings::StrCat(node_name, ":", output_slot, ":", debug_op)) {} + strings::StrCat(node_name, ":", output_slot, ":", debug_op)), + device_path(DeviceNameToDevicePath(device_name)) {} Status ReadEventFromFile(const string& dump_file_path, Event* event) { Env* env(Env::Default()); @@ -158,6 +171,15 @@ Status ReadEventFromFile(const string& dump_file_path, Event* event) { } // static +const string DebugNodeKey::DeviceNameToDevicePath(const string& device_name) { + return strings::StrCat( + DebugIO::kMetadataFilePrefix, DebugIO::kDeviceTag, + str_util::StringReplace( + str_util::StringReplace(device_name, ":", "_", true), "/", ",", + true)); +} + +// static const char* const DebugIO::kFileURLScheme = "file://"; // static const char* const DebugIO::kGrpcURLScheme = "grpc://"; @@ -236,7 +258,8 @@ Status DebugIO::PublishDebugMetadata( const string core_metadata_path = AppendTimestampToFilePath( io::JoinPath( dump_root_dir, - strings::StrCat("_tfdbg_core_metadata_", "sessionrun", + strings::StrCat(DebugIO::kMetadataFilePrefix, + DebugIO::kCoreMetadataTag, "sessionrun", strings::Printf("%.14lld", session_run_index))), Env::Default()->NowMicros()); status.Update(DebugFileIO::DumpEventProtoToFile( @@ -325,10 +348,11 @@ Status DebugIO::PublishGraph(const Graph& graph, const string& device_name, Status status = Status::OK(); for (const string& debug_url : debug_urls) { if (debug_url.find(kFileURLScheme) == 0) { - const string dump_root_dir = debug_url.substr(strlen(kFileURLScheme)); - // TODO(cais): (b/38325442) Serialize the GraphDef to a directory that - // reflects the device name. - const string file_name = strings::StrCat("_tfdbg_graph_", now_micros); + const string dump_root_dir = + io::JoinPath(debug_url.substr(strlen(kFileURLScheme)), + DebugNodeKey::DeviceNameToDevicePath(device_name)); + const string file_name = strings::StrCat(DebugIO::kMetadataFilePrefix, + DebugIO::kGraphTag, now_micros); status.Update( DebugFileIO::DumpEventProtoToFile(event, dump_root_dir, file_name)); @@ -437,7 +461,7 @@ string DebugFileIO::GetDumpFilePath(const string& dump_root_dir, const DebugNodeKey& debug_node_key, const uint64 wall_time_us) { return AppendTimestampToFilePath( - io::JoinPath(dump_root_dir, + io::JoinPath(dump_root_dir, debug_node_key.device_path, strings::StrCat(debug_node_key.node_name, "_", debug_node_key.output_slot, "_", debug_node_key.debug_op)), diff --git a/tensorflow/core/debug/debug_io_utils.h b/tensorflow/core/debug/debug_io_utils.h index f3e76cc0ee..69d8c7bd4e 100644 --- a/tensorflow/core/debug/debug_io_utils.h +++ b/tensorflow/core/debug/debug_io_utils.h @@ -44,11 +44,14 @@ struct DebugNodeKey { DebugNodeKey(const string& device_name, const string& node_name, const int32 output_slot, const string& debug_op); + static const string DeviceNameToDevicePath(const string& device_name); + const string device_name; const string node_name; const int32 output_slot; const string debug_op; const string debug_node_name; + const string device_path; }; class DebugIO { @@ -136,6 +139,11 @@ class DebugIO { static Status CloseDebugURL(const string& debug_url); + static const char* const kMetadataFilePrefix; + static const char* const kCoreMetadataTag; + static const char* const kDeviceTag; + static const char* const kGraphTag; + static const char* const kFileURLScheme; static const char* const kGrpcURLScheme; }; diff --git a/tensorflow/core/debug/debug_io_utils_test.cc b/tensorflow/core/debug/debug_io_utils_test.cc index 406bcae07f..77039aa4ab 100644 --- a/tensorflow/core/debug/debug_io_utils_test.cc +++ b/tensorflow/core/debug/debug_io_utils_test.cc @@ -19,6 +19,7 @@ limitations under the License. #include "tensorflow/core/lib/core/notification.h" #include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/lib/core/threadpool.h" +#include "tensorflow/core/lib/io/path.h" #include "tensorflow/core/lib/strings/str_util.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/util/event.pb.h" @@ -47,6 +48,18 @@ class DebugIOUtilsTest : public ::testing::Test { std::unique_ptr<Tensor> tensor_b_; }; +TEST_F(DebugIOUtilsTest, ConstructDebugNodeKey) { + DebugNodeKey debug_node_key("/job:worker/replica:1/task:0/gpu:2", + "hidden_1/MatMul", 0, "DebugIdentity"); + EXPECT_EQ("/job:worker/replica:1/task:0/gpu:2", debug_node_key.device_name); + EXPECT_EQ("hidden_1/MatMul", debug_node_key.node_name); + EXPECT_EQ(0, debug_node_key.output_slot); + EXPECT_EQ("DebugIdentity", debug_node_key.debug_op); + EXPECT_EQ("hidden_1/MatMul:0:DebugIdentity", debug_node_key.debug_node_name); + EXPECT_EQ("_tfdbg_device_,job_worker,replica_1,task_0,gpu_2", + debug_node_key.device_path); +} + TEST_F(DebugIOUtilsTest, DumpFloatTensorToFileSunnyDay) { Initialize(); @@ -138,10 +151,14 @@ TEST_F(DebugIOUtilsTest, DumpTensorToFileCannotCreateDirectory) { // First, create the file at the path. const string test_dir = testing::TmpDir(); - const string txt_file_name = strings::StrCat(test_dir, "/baz"); - - if (!env_->FileExists(test_dir).ok()) { - ASSERT_TRUE(env_->CreateDir(test_dir).ok()); + const string kDeviceName = "/job:localhost/replica:0/task:0/cpu:0"; + const DebugNodeKey kDebugNodeKey(kDeviceName, "baz/tensor_a", 0, + "DebugIdentity"); + const string txt_file_dir = + io::JoinPath(test_dir, DebugNodeKey::DeviceNameToDevicePath(kDeviceName)); + const string txt_file_name = io::JoinPath(txt_file_dir, "baz"); + if (!env_->FileExists(txt_file_dir).ok()) { + ASSERT_TRUE(env_->RecursivelyCreateDir(txt_file_dir).ok()); } ASSERT_EQ(error::Code::NOT_FOUND, env_->FileExists(txt_file_name).code()); @@ -157,8 +174,7 @@ TEST_F(DebugIOUtilsTest, DumpTensorToFileCannotCreateDirectory) { // Second, try to dump the tensor to a path that requires "baz" to be a // directory, which should lead to an error. - const DebugNodeKey kDebugNodeKey("/job:localhost/replica:0/task:0/cpu:0", - "baz/tensor_a", 0, "DebugIdentity"); + const uint64 wall_time = env_->NowMicros(); string dump_file_name; diff --git a/tensorflow/core/debug/grpc_session_debug_test.cc b/tensorflow/core/debug/grpc_session_debug_test.cc index 6c68729410..9584d8b9f3 100644 --- a/tensorflow/core/debug/grpc_session_debug_test.cc +++ b/tensorflow/core/debug/grpc_session_debug_test.cc @@ -187,7 +187,10 @@ TEST_F(GrpcSessionDebugTest, FileDebugURL) { IsSingleFloatValue(outputs[0], 4.0); std::vector<Tensor> dumped_tensors; - LoadTensorDumps("n", &dumped_tensors); + LoadTensorDumps(io::JoinPath(DebugNodeKey::DeviceNameToDevicePath( + cluster->devices()[0].name()), + "n"), + &dumped_tensors); if (i == 0 || i == 5) { ASSERT_EQ(0, dumped_tensors.size()); @@ -267,7 +270,10 @@ TEST_F(GrpcSessionDebugTest, MultiDevices_String) { TF_CHECK_OK(session->Close()); std::vector<Tensor> dumped_tensors; - LoadTensorDumps("n", &dumped_tensors); + LoadTensorDumps( + io::JoinPath(DebugNodeKey::DeviceNameToDevicePath(a_dev.name()), + "n"), + &dumped_tensors); ASSERT_EQ(1, dumped_tensors.size()); ASSERT_EQ(TensorShape({2, 2}), dumped_tensors[0].shape()); for (size_t i = 0; i < 4; ++i) { diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc b/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc index 2b1a47a93f..1e8c30bad5 100644 --- a/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc +++ b/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc @@ -173,7 +173,7 @@ class GrpcRemoteWorker : public WorkerInterface { } IssueRequest(req_copy ? req_copy : request, response, recvtensor_, - std::move(*cb_to_use), call_opts); + *cb_to_use, call_opts); } void LoggingAsync(const LoggingRequest* request, LoggingResponse* response, diff --git a/tensorflow/core/distributed_runtime/worker.cc b/tensorflow/core/distributed_runtime/worker.cc index 32ea0cfaa4..16e450abb0 100644 --- a/tensorflow/core/distributed_runtime/worker.cc +++ b/tensorflow/core/distributed_runtime/worker.cc @@ -258,7 +258,7 @@ void Worker::DoPartialRunGraph(CallOptions* opts, } } if (request->is_last_partial_run()) { - partial_run_mgr_.PartialRunDone(step_id, std::move(finish), s); + partial_run_mgr_.PartialRunDone(step_id, finish, s); } else { finish(s); } diff --git a/tensorflow/core/framework/shape_inference.cc b/tensorflow/core/framework/shape_inference.cc index a4597080f0..1f9e98551f 100644 --- a/tensorflow/core/framework/shape_inference.cc +++ b/tensorflow/core/framework/shape_inference.cc @@ -885,14 +885,6 @@ Status InferenceContext::AttachContext(const Status& status) { strings::StrCat(status.error_message(), error_context)); } -ShapeHandle InferenceContext::input_handle_shape(int idx) { - if (input_handle_shapes_and_types_[idx] == nullptr) { - input_handle_shapes_and_types_[idx].reset( - new std::vector<ShapeAndType>{{UnknownShape(), DT_INVALID}}); - } - return (*input_handle_shapes_and_types_[idx])[0].shape; -} - bool InferenceContext::MergeHandleShapesAndTypes( const std::vector<ShapeAndType>& shapes_and_types, std::vector<ShapeAndType>* to_update) { diff --git a/tensorflow/core/framework/shape_inference.h b/tensorflow/core/framework/shape_inference.h index baeab93e30..119bed4071 100644 --- a/tensorflow/core/framework/shape_inference.h +++ b/tensorflow/core/framework/shape_inference.h @@ -491,37 +491,12 @@ class InferenceContext { return input_handle_shapes_and_types_[idx].get(); } - // DEPRECATED: use input_handle_shapes_and_types. - ShapeHandle input_handle_shape(int idx); - // DEPRECATED: use input_handle_shapes_and_types. - DataType input_handle_dtype(int idx) const { - if (input_handle_shapes_and_types_[idx] == nullptr) { - return DT_INVALID; - } else { - DCHECK_EQ(input_handle_shapes_and_types_[idx]->size(), 1); - return (*input_handle_shapes_and_types_[idx])[0].dtype; - } - } - void set_output_handle_shapes_and_types( int idx, const std::vector<ShapeAndType>& shapes_and_types) { output_handle_shapes_and_types_[idx].reset( new std::vector<ShapeAndType>(shapes_and_types)); } - // DEPRECATED: use output_handle_shapes_and_types. - ShapeHandle output_handle_shape(int idx) { - return output_handle_shapes_and_types_[idx] == nullptr - ? UnknownShape() - : (*output_handle_shapes_and_types_[idx])[0].shape; - } - // DEPRECATED: use output_handle_shapes_and_types. - DataType output_handle_dtype(int idx) const { - return output_handle_shapes_and_types_[idx] == nullptr - ? DT_INVALID - : (*output_handle_shapes_and_types_[idx])[0].dtype; - } - // Note that shape functions should usually call MakeShapeFromShapeTensor, // as it does more analysis to provide partial shapes. // diff --git a/tensorflow/core/graph/graph_constructor.cc b/tensorflow/core/graph/graph_constructor.cc index 28ebf7e8c3..8d433fc8c6 100644 --- a/tensorflow/core/graph/graph_constructor.cc +++ b/tensorflow/core/graph/graph_constructor.cc @@ -435,7 +435,7 @@ Status GraphConstructor::MakeNode(const NodeDef& node_def, Node** node) { Status GraphConstructor::ValidateShape(Node* node) { if (!opts_.importing) return Status::OK(); TF_RETURN_IF_ERROR(refiner_->AddNode(node)); - // For nodes with the _output_shapes atttribute, override the shape. + // For nodes with the _output_shapes attribute, override the shape. std::vector<TensorShapeProto> shape_attrs; const char* kAttrName = "_output_shapes"; if (!GetNodeAttr(node->attrs(), kAttrName, &shape_attrs).ok()) { @@ -481,7 +481,7 @@ Status GraphConstructor::ValidateShape(Node* node) { "MutableHashTableOfTensors", "Mutex", "CuckooTable", "IndexTable", "WholeFileReader", "TextLineReader", "FixedLengthRecordReader", "TFRecordReader", "IdentityReader", "RefSwitch", "RefEnter", - "RefNextIteration", "RefMerge", "RefIdentity", + "RefNextIteration", "RefMerge", "RefIdentity", "LMDBReader", // To be removed after 2017/04/24. "ConditionalAccumulator", "SparseConditionalAccumulator", "Table", }; diff --git a/tensorflow/core/grappler/costs/BUILD b/tensorflow/core/grappler/costs/BUILD index d40e66cd16..bebd5d7bce 100644 --- a/tensorflow/core/grappler/costs/BUILD +++ b/tensorflow/core/grappler/costs/BUILD @@ -185,21 +185,13 @@ cc_test( name = "virtual_scheduler_test", srcs = ["virtual_scheduler_test.cc"], deps = [ - ":graph_properties", - ":utils", ":virtual_placer", ":virtual_scheduler", "//tensorflow/cc:cc_ops", - "//tensorflow/core:framework", - "//tensorflow/core:protos_all_cc", "//tensorflow/core:tensorflow", "//tensorflow/core:test", "//tensorflow/core:test_main", - "//tensorflow/core/grappler:grappler_item", - "//tensorflow/core/grappler:utils", - "//tensorflow/core/grappler/clusters:utils", "//tensorflow/core/grappler/clusters:virtual_cluster", - "//tensorflow/core/grappler/costs:cost_estimator", ], ) diff --git a/tensorflow/core/grappler/costs/graph_properties.cc b/tensorflow/core/grappler/costs/graph_properties.cc index 721cf535b9..5c0515803d 100644 --- a/tensorflow/core/grappler/costs/graph_properties.cc +++ b/tensorflow/core/grappler/costs/graph_properties.cc @@ -218,9 +218,13 @@ Status GraphProperties::InferDynamically(Cluster* cluster) { TF_RETURN_IF_ERROR( cluster->Run(item_.graph, item_.feed, item_.fetch, &metadata)); + return InferFromCostGraph(metadata.cost_graph()); +} + +Status GraphProperties::InferFromCostGraph(const CostGraphDef& cost_graph) { std::unordered_map<string, const CostGraphDef::Node*> name_to_cost; std::unordered_map<string, const NodeDef*> name_to_node; // Empty - for (auto& node : metadata.cost_graph().node()) { + for (auto& node : cost_graph.node()) { name_to_cost[node.name()] = &node; std::vector<OpInfo::TensorProperties> output_properties; diff --git a/tensorflow/core/grappler/costs/graph_properties.h b/tensorflow/core/grappler/costs/graph_properties.h index d2b466175b..f2f6ad19a7 100644 --- a/tensorflow/core/grappler/costs/graph_properties.h +++ b/tensorflow/core/grappler/costs/graph_properties.h @@ -36,6 +36,7 @@ class GraphProperties { Status InferStatically(); Status InferDynamically(Cluster* cluster); + Status InferFromCostGraph(const CostGraphDef& cost_graph); bool HasOutputProperties(const string& name) const; std::vector<OpInfo::TensorProperties> GetInputProperties( diff --git a/tensorflow/core/grappler/costs/graph_properties_test.cc b/tensorflow/core/grappler/costs/graph_properties_test.cc index 6eca083184..bff5f7acc5 100644 --- a/tensorflow/core/grappler/costs/graph_properties_test.cc +++ b/tensorflow/core/grappler/costs/graph_properties_test.cc @@ -149,6 +149,54 @@ TEST_F(GraphPropertiesTest, DynamicProperties) { } } +TEST_F(GraphPropertiesTest, Variables) { + GrapplerItem item; + TF_CHECK_OK(NodeDefBuilder("Var", "Variable") + .Attr("dtype", DT_FLOAT) + .Attr("shape", TensorShape({3, 7})) + .Finalize(item.graph.add_node())); + item.fetch.push_back("Var"); + + Tensor initial_val(DT_FLOAT, TensorShape({3, 7})); + TF_CHECK_OK(NodeDefBuilder("InitialVal", "Const") + .Attr("dtype", DT_FLOAT) + .Attr("value", initial_val) + .Finalize(item.graph.add_node())); + TF_CHECK_OK(NodeDefBuilder("InitVar", "Assign") + .Input("Var", 0, DT_FLOAT_REF) + .Input("InitialVal", 0, DT_FLOAT) + .Finalize(item.graph.add_node())); + item.init_ops.push_back("InitVar"); + + { + GraphProperties static_properties(item); + TF_CHECK_OK(static_properties.InferStatically()); + + const auto props = static_properties.GetOutputProperties("Var"); + EXPECT_EQ(1, props.size()); + const OpInfo::TensorProperties& prop = props[0]; + EXPECT_EQ(DT_FLOAT_REF, prop.dtype()); + EXPECT_FALSE(prop.shape().unknown_rank()); + EXPECT_EQ(2, prop.shape().dim_size()); + EXPECT_EQ(3, prop.shape().dim(0).size()); + EXPECT_EQ(7, prop.shape().dim(1).size()); + } + { + TF_CHECK_OK(cluster_->Initialize(item)); + GraphProperties dynamic_properties(item); + TF_CHECK_OK(dynamic_properties.InferDynamically(cluster_.get())); + + const auto props = dynamic_properties.GetOutputProperties("Var"); + EXPECT_EQ(1, props.size()); + const OpInfo::TensorProperties& prop = props[0]; + EXPECT_EQ(DT_FLOAT_REF, prop.dtype()); + EXPECT_FALSE(prop.shape().unknown_rank()); + EXPECT_EQ(2, prop.shape().dim_size()); + EXPECT_EQ(3, prop.shape().dim(0).size()); + EXPECT_EQ(7, prop.shape().dim(1).size()); + } +} + TEST_F(GraphPropertiesTest, VarHandles) { GrapplerItem item; TF_CHECK_OK(NodeDefBuilder("Var", "VarHandleOp") diff --git a/tensorflow/core/grappler/optimizers/layout_optimizer.cc b/tensorflow/core/grappler/optimizers/layout_optimizer.cc index c42218e447..eb1e521fce 100644 --- a/tensorflow/core/grappler/optimizers/layout_optimizer.cc +++ b/tensorflow/core/grappler/optimizers/layout_optimizer.cc @@ -245,20 +245,20 @@ class NodeProcessor { virtual Status AddLayoutTransposeToInputs() { std::vector<int> input_pos = GetInputPos(); for (const auto& pos : input_pos) { - string node_name_NHWCToNCHW = strings::StrCat( - kTransposeNHWCToNCHW, "-", node_->name(), "-", node_->input(pos)); + string base_name = strings::StrCat(node_->name(), "-", node_->input(pos)); + string node_name = + AddPrefixToNodeName(base_name, kTransposeNHWCToNCHW, "-"); auto input_node = node_map_->GetNode(node_->input(pos)); int output_pos = NodePosition(node_->input(pos)); TF_RETURN_IF_ERROR(HasAttribute(*node_, "T")); TF_RETURN_IF_ERROR(HasAttribute(*input_node, "_output_shapes")); AddNodeTranspose( - node_name_NHWCToNCHW, node_->input(pos), node_->attr().at("T").type(), + node_name, node_->input(pos), node_->attr().at("T").type(), input_node->attr().at("_output_shapes").list().shape(output_pos), true); - node_map_->UpdateOutput(node_->input(pos), node_->name(), - node_name_NHWCToNCHW); - node_map_->AddOutput(node_name_NHWCToNCHW, node_->name()); - *node_->mutable_input(pos) = node_name_NHWCToNCHW; + node_map_->UpdateOutput(node_->input(pos), node_->name(), node_name); + node_map_->AddOutput(node_name, node_->name()); + *node_->mutable_input(pos) = node_name; } return Status::OK(); } @@ -266,9 +266,10 @@ class NodeProcessor { virtual Status AddLayoutTransposeToOutputs() { auto outputs = node_map_->GetOutputs(node_->name()); for (const auto& output : outputs) { - string node_name_NCHWToNHWC = strings::StrCat( - kTransposeNCHWToNHWC, "-", node_->name(), "-", output->name()); - // TODO (yaozhang): handle the rare case where node A is connected to more + string base_name = strings::StrCat(node_->name(), "-", output->name()); + string node_name = + AddPrefixToNodeName(base_name, kTransposeNCHWToNHWC, "-"); + // TODO(yaozhang): handle the rare case where node A is connected to more // than one input of node B. auto it = std::find_if(output->mutable_input()->begin(), output->mutable_input()->end(), @@ -290,13 +291,12 @@ class NodeProcessor { } TF_RETURN_IF_ERROR(HasAttribute(*node_, "T")); TF_RETURN_IF_ERROR(HasAttribute(*node_, "_output_shapes")); - AddNodeTranspose( - node_name_NCHWToNHWC, node_->name(), node_->attr().at("T").type(), - node_->attr().at("_output_shapes").list().shape(0), false); - *it = node_name_NCHWToNHWC; - node_map_->UpdateOutput(node_->name(), output->name(), - node_name_NCHWToNHWC); - node_map_->AddOutput(node_name_NCHWToNHWC, output->name()); + AddNodeTranspose(node_name, node_->name(), node_->attr().at("T").type(), + node_->attr().at("_output_shapes").list().shape(0), + false); + *it = node_name; + node_map_->UpdateOutput(node_->name(), output->name(), node_name); + node_map_->AddOutput(node_name, output->name()); } return Status::OK(); } @@ -468,7 +468,13 @@ class Conv2DBackpropInputProcessor : public Conv2DProcessor { Status CustomizedProcessing() override { NodeDef* node = node_map_->GetNode(node_->input(0)); - return UpdateAttrValue(node); + NodeDef* added_node = graph_->add_node(); + *added_node = *node; + string node_name = + AddPrefixToNodeName(node->name(), "LayoutOptimizer", "-"); + added_node->set_name(node_name); + node_map_->AddNode(node_name, added_node); + return UpdateAttrValue(added_node); } }; @@ -621,9 +627,11 @@ class BinaryOpProcessor : public AgnosticNodeProcessor { Status CustomizedProcessing() override { if (is_4d_with_vector_) { - string suffix = strings::StrCat("-", node_->name(), "-", node_->input(1)); - string reshape_node_name = strings::StrCat(kReshapeNHWCToNCHW, suffix); - string shape_const_node_name = strings::StrCat(kReshapeConst, suffix); + string base_name = strings::StrCat(node_->name(), "-", node_->input(1)); + string reshape_node_name = + AddPrefixToNodeName(base_name, kReshapeNHWCToNCHW, "-"); + string shape_const_node_name = + AddPrefixToNodeName(base_name, kReshapeConst, "-"); auto input_node = node_map_->GetNode(node_->input(1)); TF_RETURN_IF_ERROR(HasAttribute(*input_node, "_output_shapes")); int vector_size = @@ -710,15 +718,15 @@ class SliceProcessor : public AgnosticNodeProcessor { Status CustomizedProcessing() override { // Skip the first input, which is the data to be sliced. for (int i = 1; i < node_->input_size(); i++) { - string node_name_NHWCToNCHW = - strings::StrCat(kPermVecNHWCToNCHW, "-", node_->name(), "-input", i); + string base_name = strings::StrCat(node_->name(), "-input", i); + string node_name = + AddPrefixToNodeName(base_name, kPermVecNHWCToNCHW, "-"); TF_RETURN_IF_ERROR(HasAttribute(*node_, "Index")); - AddNodePermVec(node_name_NHWCToNCHW, node_->input(i), + AddNodePermVec(node_name, node_->input(i), node_->attr().at("Index").type(), true); - node_map_->UpdateOutput(node_->input(i), node_->name(), - node_name_NHWCToNCHW); - node_map_->AddOutput(node_name_NHWCToNCHW, node_->name()); - *node_->mutable_input(i) = node_name_NHWCToNCHW; + node_map_->UpdateOutput(node_->input(i), node_->name(), node_name); + node_map_->AddOutput(node_name, node_->name()); + *node_->mutable_input(i) = node_name; } return Status::OK(); } @@ -795,7 +803,7 @@ class SliceProcessorConcatOffset : public AgnosticNodeProcessor { "input 1 of ConcatOffset")); } // Need to process if the channel is at dimension 3, which indicates the - // NHWC format is being used. As mutiple Slice nodes may share the same + // NHWC format is being used. As multiple Slice nodes may share the same // ConcatOffset node, the NHWC to NCHW conversion may have already // been performed when processing other Slice nodes. TF_RETURN_IF_ERROR(HasAttribute(*axis_node, "value")); diff --git a/tensorflow/core/grappler/optimizers/layout_optimizer_test.cc b/tensorflow/core/grappler/optimizers/layout_optimizer_test.cc index be38ca1a69..566ea1d87d 100644 --- a/tensorflow/core/grappler/optimizers/layout_optimizer_test.cc +++ b/tensorflow/core/grappler/optimizers/layout_optimizer_test.cc @@ -36,8 +36,8 @@ void AddOutputShape(Node* node, const TensorShape& shape) { class LayoutOptimizerTest : public ::testing::Test { protected: - Output SimpleConv(tensorflow::Scope* s, int input_size, int filter_size, - const string& padding) { + Output SimpleConv2D(tensorflow::Scope* s, int input_size, int filter_size, + const string& padding) { int batch_size = 128; int input_height = input_size; int input_width = input_size; @@ -65,11 +65,80 @@ class LayoutOptimizerTest : public ::testing::Test { AddOutputShape(conv.node(), input_shape); return conv; } + + Output SimpleConv2DBackpropInput(tensorflow::Scope* s, int input_size, + int filter_size, const string& padding) { + int batch_size = 128; + int input_height = input_size; + int input_width = input_size; + int input_depth = 3; + int filter_count = 2; + int stride = 1; + TensorShape input_sizes_shape({4}); + Tensor input_data(DT_INT32, input_sizes_shape); + test::FillValues<int>(&input_data, + {batch_size, input_height, input_width, input_depth}); + Output input_sizes = + ops::Const(s->WithOpName("InputSizes"), Input::Initializer(input_data)); + AddOutputShape(input_sizes.node(), input_sizes_shape); + + TensorShape filter_shape( + {filter_size, filter_size, input_depth, filter_count}); + Tensor filter_data(DT_FLOAT, filter_shape); + test::FillIota<float>(&filter_data, 1.0f); + Output filter = + ops::Const(s->WithOpName("Filter"), Input::Initializer(filter_data)); + AddOutputShape(filter.node(), filter_shape); + + int output_height = input_height; + int output_width = input_width; + TensorShape output_shape( + {batch_size, output_height, output_width, filter_count}); + Tensor output_data(DT_FLOAT, output_shape); + test::FillIota<float>(&output_data, 1.0f); + Output output = + ops::Const(s->WithOpName("Output"), Input::Initializer(output_data)); + AddOutputShape(output.node(), output_shape); + + Output conv_backprop_input = ops::Conv2DBackpropInput( + s->WithOpName("Conv2DBackpropInput"), input_sizes, filter, output, + {1, stride, stride, 1}, padding); + TensorShape input_shape( + {batch_size, input_height, input_width, input_depth}); + AddOutputShape(conv_backprop_input.node(), input_shape); + return conv_backprop_input; + } + + Tensor GetAttrValue(const NodeDef& node) { + Tensor tensor; + CHECK(tensor.FromProto(node.attr().at({"value"}).tensor())); + return tensor; + } }; +TEST_F(LayoutOptimizerTest, Conv2DBackpropInput) { + tensorflow::Scope s = tensorflow::Scope::NewRootScope(); + auto conv = SimpleConv2DBackpropInput(&s, 7, 2, "SAME"); + Output fetch = ops::Identity(s.WithOpName("Fetch"), {conv}); + GrapplerItem item; + TF_CHECK_OK(s.ToGraphDef(&item.graph)); + LayoutOptimizer optimizer; + optimizer.set_num_gpus(1); + GraphDef output; + Status status = optimizer.Optimize(nullptr, item, &output); + NodeMap node_map(&output); + auto input_sizes_node = node_map.GetNode( + AddPrefixToNodeName("InputSizes", "LayoutOptimizer", "-")); + CHECK(input_sizes_node); + auto input_sizes = GetAttrValue(*input_sizes_node); + Tensor input_sizes_expected(DT_INT32, {4}); + test::FillValues<int>(&input_sizes_expected, {128, 3, 7, 7}); + test::ExpectTensorEqual<int>(input_sizes_expected, input_sizes); +} + TEST_F(LayoutOptimizerTest, FilterSizeIsOne) { tensorflow::Scope s = tensorflow::Scope::NewRootScope(); - auto conv = SimpleConv(&s, 2, 1, "SAME"); + auto conv = SimpleConv2D(&s, 2, 1, "SAME"); Output fetch = ops::Identity(s.WithOpName("Fetch"), {conv}); GrapplerItem item; TF_CHECK_OK(s.ToGraphDef(&item.graph)); @@ -84,7 +153,7 @@ TEST_F(LayoutOptimizerTest, FilterSizeIsOne) { TEST_F(LayoutOptimizerTest, FilterSizeNotOne) { tensorflow::Scope s = tensorflow::Scope::NewRootScope(); - auto conv = SimpleConv(&s, 2, 1, "SAME"); + auto conv = SimpleConv2D(&s, 2, 1, "SAME"); Output fetch = ops::Identity(s.WithOpName("Fetch"), {conv}); GrapplerItem item; TF_CHECK_OK(s.ToGraphDef(&item.graph)); @@ -99,7 +168,7 @@ TEST_F(LayoutOptimizerTest, FilterSizeNotOne) { TEST_F(LayoutOptimizerTest, EqualSizeWithValidPadding) { tensorflow::Scope s = tensorflow::Scope::NewRootScope(); - auto conv = SimpleConv(&s, 2, 2, "VALID"); + auto conv = SimpleConv2D(&s, 2, 2, "VALID"); Output fetch = ops::Identity(s.WithOpName("Fetch"), {conv}); GrapplerItem item; TF_CHECK_OK(s.ToGraphDef(&item.graph)); @@ -114,7 +183,7 @@ TEST_F(LayoutOptimizerTest, EqualSizeWithValidPadding) { TEST_F(LayoutOptimizerTest, EqualSizeWithSamePadding) { tensorflow::Scope s = tensorflow::Scope::NewRootScope(); - auto conv = SimpleConv(&s, 2, 2, "SAME"); + auto conv = SimpleConv2D(&s, 2, 2, "SAME"); Output fetch = ops::Identity(s.WithOpName("Fetch"), {conv}); GrapplerItem item; TF_CHECK_OK(s.ToGraphDef(&item.graph)); @@ -129,7 +198,7 @@ TEST_F(LayoutOptimizerTest, EqualSizeWithSamePadding) { TEST_F(LayoutOptimizerTest, NotEqualSizeWithValidPadding) { tensorflow::Scope s = tensorflow::Scope::NewRootScope(); - auto conv = SimpleConv(&s, 2, 3, "VALID"); + auto conv = SimpleConv2D(&s, 2, 3, "VALID"); Output fetch = ops::Identity(s.WithOpName("Fetch"), {conv}); GrapplerItem item; TF_CHECK_OK(s.ToGraphDef(&item.graph)); diff --git a/tensorflow/core/grappler/utils.cc b/tensorflow/core/grappler/utils.cc index b7a04f4423..cc3cbdacb4 100644 --- a/tensorflow/core/grappler/utils.cc +++ b/tensorflow/core/grappler/utils.cc @@ -98,13 +98,18 @@ int NodePosition(const string& name) { return position; } -string AddPrefixToNodeName(const string& name, const string& prefix) { +string AddPrefixToNodeName(const string& name, const string& prefix, + const string& delimiter) { if (!name.empty()) { if (name[0] == '^') { - return strings::StrCat("^", prefix, "/", name.substr(1)); + return strings::StrCat("^", prefix, delimiter, name.substr(1)); } } - return strings::StrCat(prefix, "/", name); + return strings::StrCat(prefix, delimiter, name); +} + +string AddPrefixToNodeName(const string& name, const string& prefix) { + return AddPrefixToNodeName(name, prefix, "/"); } bool ExecuteWithTimeout(std::function<void()> fn, const int64 timeout_in_ms, diff --git a/tensorflow/core/grappler/utils.h b/tensorflow/core/grappler/utils.h index 5a3c0614e7..f86fc608c5 100644 --- a/tensorflow/core/grappler/utils.h +++ b/tensorflow/core/grappler/utils.h @@ -60,7 +60,11 @@ int NodePosition(const string& name); // Returns the node name and position in a single call. string ParseNodeName(const string& name, int* position); -// Add a prefix to a node name +// Add a prefix to a node name with a custom delimiter. +string AddPrefixToNodeName(const string& name, const string& prefix, + const string& delimiter); + +// Add a prefix to a node name. string AddPrefixToNodeName(const string& name, const string& prefix); // Executes a 'fn' in the 'thread_pool'. The method waits for the configured diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index 1e5bc0ceab..ed0379bb3f 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -1904,6 +1904,7 @@ cc_library( deps = [ ":fixed_length_record_reader_op", ":identity_reader_op", + ":lmdb_reader_op", ":matching_files_op", ":reader_ops", ":restore_op", @@ -1939,6 +1940,14 @@ tf_kernel_library( ) tf_kernel_library( + name = "lmdb_reader_op", + prefix = "lmdb_reader_op", + deps = IO_DEPS + [ + "@lmdb", + ], +) + +tf_kernel_library( name = "matching_files_op", prefix = "matching_files_op", deps = IO_DEPS, @@ -3170,6 +3179,21 @@ tf_kernel_library( deps = REQUIRED_DEPS, ) +tf_cc_test( + name = "sendrecv_ops_test", + srcs = ["sendrecv_ops_test.cc"], + linkstatic = tf_kernel_tests_linkstatic(), # Required for benchmarking + deps = [ + ":ops_testutil", + ":ops_util", + ":sendrecv_ops", + "//tensorflow/core:framework", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core:testlib", + ], +) + cc_library( name = "sparse", deps = [ @@ -4313,6 +4337,7 @@ filegroup( # not used on Android. Those ops also do not compile if included, # unless we add the additional deps they need. "tf_record_reader_op.*", + "lmdb_reader_op.*", "string_to_hash_bucket_op.*", "sdca_ops.*", "sdca_internal.*", diff --git a/tensorflow/core/kernels/cuda_solvers.cc b/tensorflow/core/kernels/cuda_solvers.cc index 73446fbe05..914627992b 100644 --- a/tensorflow/core/kernels/cuda_solvers.cc +++ b/tensorflow/core/kernels/cuda_solvers.cc @@ -153,7 +153,7 @@ Status CudaSolver::CopyLapackInfoToHostAsync( info_checker_callback) const { std::vector<HostLapackInfo> host_lapack_infos; if (dev_lapack_infos.empty()) { - info_checker_callback(Status::OK(), std::move(host_lapack_infos)); + info_checker_callback(Status::OK(), host_lapack_infos); return Status::OK(); } @@ -174,7 +174,7 @@ Status CudaSolver::CopyLapackInfoToHostAsync( auto wrapped_info_checker_callback = [info_checker_callback](std::vector<HostLapackInfo> host_lapack_infos) { Status status; - for (auto host_lapack_info : host_lapack_infos) { + for (const auto& host_lapack_info : host_lapack_infos) { for (int i = 0; i < host_lapack_info.size() && status.ok(); ++i) { const int info_value = (host_lapack_info.data())[i]; if (info_value != 0) { diff --git a/tensorflow/core/kernels/debug_ops_test.cc b/tensorflow/core/kernels/debug_ops_test.cc index 037272e009..1830b2a178 100644 --- a/tensorflow/core/kernels/debug_ops_test.cc +++ b/tensorflow/core/kernels/debug_ops_test.cc @@ -28,6 +28,7 @@ limitations under the License. #include "tensorflow/core/framework/types.pb.h" #include "tensorflow/core/kernels/ops_testutil.h" #include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/lib/io/path.h" #include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/test.h" @@ -94,7 +95,22 @@ TEST_F(DebugIdentityOpTest, Int32Success_6_FileURLs) { ASSERT_TRUE(env_->FileExists(dump_roots[i]).ok()); ASSERT_TRUE(env_->IsDirectory(dump_roots[i]).ok()); - DIR* dir = opendir(dump_roots[i].c_str()); + std::vector<string> device_roots; + DIR* dir0 = opendir(dump_roots[i].c_str()); + struct dirent* ent0; + const string kDeviceDirPrefix = + strings::StrCat(DebugIO::kMetadataFilePrefix, DebugIO::kDeviceTag); + while ((ent0 = readdir(dir0)) != nullptr) { + if (!strncmp(ent0->d_name, kDeviceDirPrefix.c_str(), + kDeviceDirPrefix.size())) { + device_roots.push_back(io::JoinPath(dump_roots[i], ent0->d_name)); + } + } + ASSERT_EQ(1, device_roots.size()); + closedir(dir0); + + const string& device_root = device_roots[0]; + DIR* dir = opendir(device_root.c_str()); struct dirent* ent; int dump_files_found = 0; while ((ent = readdir(dir)) != nullptr) { @@ -102,8 +118,7 @@ TEST_F(DebugIdentityOpTest, Int32Success_6_FileURLs) { dump_files_found++; // Try reading the file into a Event proto. - const string dump_file_path = - strings::StrCat(dump_roots[i], "/", ent->d_name); + const string dump_file_path = io::JoinPath(device_root, ent->d_name); std::fstream ifs(dump_file_path, std::ios::in | std::ios::binary); Event event; event.ParseFromIstream(&ifs); diff --git a/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc b/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc index 7303c97c0e..0d4feb7c1d 100644 --- a/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc +++ b/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc @@ -34,6 +34,19 @@ namespace tensorflow { using Eigen::GpuDevice; +// Returns whether depthwise convolution forward pass can be performed using the +// faster ('Small') variant of the kernel. +EIGEN_DEVICE_FUNC bool CanLaunchDepthwiseConv2dGPUSmall( + const DepthwiseArgs args) { + return args.depth_multiplier == 1 && args.stride == 1 && args.in_rows <= 16 && + args.in_cols <= 16 && args.in_rows == args.out_rows && + args.in_cols == args.out_cols && args.pad_rows >= 0 && + args.pad_rows < args.filter_rows && args.pad_cols >= 0 && + args.pad_cols < args.filter_cols && + args.filter_rows * args.filter_cols <= + (args.in_rows + 1) / 2 * args.in_cols; +} + // A Cuda kernel to compute the depthwise convolution forward pass // in NHWC format. template <typename T, int kKnownFilterWidth, int kKnownFilterHeight, @@ -118,7 +131,8 @@ __global__ void __launch_bounds__(1024, 2) // CUDA kernel to compute the depthwise convolution forward pass in NCHW format, // tailored for small images up to 16x16. Stride and depth multiplier must be 1. -// Padding must be 'SAME', which allows to reuse the index computation. +// Padding must be 'SAME', which allows to reuse the index computation. Only +// use this kernel if CanLaunchDepthwiseConv2dGPUSmall(args) returns true. // Tiles of the input and filter tensors are loaded into shared memory before // performing the convolution. Each thread handles two elements per iteration, // one each in the lower and upper half of a tile. @@ -126,6 +140,7 @@ template <typename T, int kKnownFilterWidth, int kKnownFilterHeight, bool kKnownEvenRows> __global__ __launch_bounds__(1024, 2) void DepthwiseConv2dGPUKernelNHWCSmall( const DepthwiseArgs args, const T* input, const T* filter, T* output) { + assert(CanLaunchDepthwiseConv2dGPUSmall(args)); // Holds block plus halo and filter data for blockDim.x depths. extern __shared__ __align__(sizeof(T)) unsigned char shared_memory[]; T* const shared_data = reinterpret_cast<T*>(shared_memory); @@ -143,16 +158,15 @@ __global__ __launch_bounds__(1024, 2) void DepthwiseConv2dGPUKernelNHWCSmall( // Fixed blockDim.x, corresponding to Pascal's global load granularity of 32B. const int block_slices = 8; - const int block_cols = blockDim.y; const int block_rows = blockDim.z; // These values are the same for all threads and could // be precomputed on the CPU. - const int block_size = block_rows * block_cols * block_slices; + const int block_size = block_rows * in_cols * block_slices; const int in_row_size = in_cols * in_depth; const int in_size = in_rows * in_row_size; const int in_increment = (in_cols - 1) * block_slices; - const int filter_size = filter_rows * filter_cols; + const int filter_pixels = filter_rows * filter_cols; const int tile_cols = in_cols + filter_cols - 1; const int even_rows = kKnownEvenRows || (1 & ~in_rows); const int tile_rows = in_rows + filter_rows - even_rows; @@ -170,7 +184,7 @@ __global__ __launch_bounds__(1024, 2) void DepthwiseConv2dGPUKernelNHWCSmall( const int thread_row = threadIdx.z; // Position in block. - const int thread_pix = thread_row * block_cols + thread_col; + const int thread_pix = thread_row * in_cols + thread_col; const int thread_idx = thread_pix * block_slices + thread_depth; // Initialize tile, in particular the padding. @@ -192,7 +206,7 @@ __global__ __launch_bounds__(1024, 2) void DepthwiseConv2dGPUKernelNHWCSmall( const int max_depth = in_depth - thread_depth; const int filter_write_offset = - thread_pix < filter_size ? tile_size + thread_idx : 0; + thread_pix < filter_pixels ? tile_size + thread_idx : 0; const int filter_read_offset = tile_size + thread_depth; const bool skip_second = !kKnownEvenRows && thread_row + (in_rows & 1) == block_rows; @@ -377,56 +391,198 @@ __global__ void __launch_bounds__(1024, 2) } } -template <typename T, int kKnownFilterWidth, int kKnownFilterHeight> -bool TryLaunchDepthwiseConv2dGPUSmall(const GpuDevice& d, - const DepthwiseArgs args, const T* input, - const T* filter, T* output, - TensorFormat data_format) { - if (data_format != FORMAT_NHWC || args.depth_multiplier != 1 || - args.stride != 1 || args.in_rows > 16 || args.in_cols > 16 || - args.in_rows != args.out_rows || args.in_cols != args.out_cols || - args.pad_rows < 0 || args.pad_rows >= args.filter_rows || - args.pad_cols < 0 || args.pad_cols >= args.filter_cols) { - return false; +// CUDA kernel to compute the depthwise convolution forward pass in NCHW format, +// tailored for small images up to 16x16. Stride and depth multiplier must be 1. +// Padding must be 'SAME', which allows to reuse the index computation. Only +// use this kernel if CanLaunchDepthwiseConv2dGPUSmall(args) returns true. +// Tiles of the input and filter tensors are loaded into shared memory before +// performing the convolution. Each thread handles two elements per iteration, +// one each in the lower and upper half of a tile. +template <typename T, int kKnownFilterWidth, int kKnownFilterHeight, + bool kKnownEvenRows> +__global__ __launch_bounds__(1024, 2) void DepthwiseConv2dGPUKernelNCHWSmall( + const DepthwiseArgs args, const T* input, const T* filter, T* output) { + assert(CanLaunchDepthwiseConv2dGPUSmall(args)); + // Holds block plus halo and filter data for blockDim.z depths. + extern __shared__ __align__(sizeof(T)) unsigned char shared_memory[]; + T* const shared_data = reinterpret_cast<T*>(shared_memory); + + const int batches = args.batch; + const int in_rows = args.in_rows; + const int in_cols = args.in_cols; + const int in_depth = args.in_depth; + const int filter_rows = + kKnownFilterHeight < 0 ? args.filter_rows : kKnownFilterHeight; + const int filter_cols = + kKnownFilterWidth < 0 ? args.filter_cols : kKnownFilterWidth; + const int pad_rows = args.pad_rows; + const int pad_cols = args.pad_cols; + + // Fixed blockDim.z, tailored for maximum grid size for images of size 16x16. + const int block_rows = blockDim.y; + const int block_slices = 8; + + // These values are the same for all threads and could + // be precomputed on the CPU. + const int block_pixels = in_cols * block_rows; + const int block_size = block_pixels * block_slices; + const int in_pixels = in_cols * in_rows; + const int in_increment = in_cols - 1; + const int filter_pixels = filter_rows * filter_cols; + const int tile_cols = in_cols + filter_cols - 1; + const int even_rows = kKnownEvenRows || (1 & ~in_rows); + const int tile_rows = in_rows + filter_rows - even_rows; + const int tile_pixels = tile_cols * tile_rows; + const int tile_size = tile_pixels * block_slices; + const int tile_offset = block_rows * tile_cols; + const int pad_offset = pad_rows * tile_cols + pad_cols; + const int in_slices = in_depth * batches; + const int in_blocks = (in_slices + block_slices - 1) / block_slices; + + const int thread_col = threadIdx.x; + const int thread_row = threadIdx.y; + const int thread_depth = threadIdx.z; + + // Position in block. + const int thread_pix = thread_row * in_cols + thread_col; + const int thread_idx = thread_depth * block_pixels + thread_pix; + + // Initialize tile, in particular the padding. + for (int i = thread_idx; i < tile_size; i += block_size) { + shared_data[i] = T(0); } + __syncthreads(); - const int block_rows = (args.in_rows + 1) / 2; - if (args.filter_rows * args.filter_cols > args.in_cols * block_rows) { - return false; + // Position in tensors. + const int tensor_idx = thread_depth * in_pixels + thread_pix; + + // Position in (padded) shared memory. + const int data_pix = thread_row * tile_cols + thread_col; + const int data_idx = thread_depth * tile_pixels + data_pix; + + // Position in shared memory, offset by pad_rows / pad_cols. + const int tile_idx = data_idx + pad_offset; + + // Filter is always in HWCK format, irrespective of the input/output format. + const int filter_pix = thread_idx / block_slices; + const int filter_depth = thread_idx % block_slices; + const int filter_idx = filter_pix * in_depth; + + const int max_slice = in_slices - thread_depth; + const int filter_write_offset = + filter_pix < filter_pixels ? tile_size + thread_idx : 0; + const int filter_read_offset = tile_size + thread_depth; + const bool skip_second = + !kKnownEvenRows && thread_row + (in_rows & 1) == block_rows; + + for (int b = blockIdx.x; b < in_blocks; b += gridDim.x) { + const int slice = b * block_slices; + + const int inout_offset = slice * in_pixels + tensor_idx; + const bool slice_in_range = slice < max_slice; + + if (slice_in_range) { + const T* const in_ptr = inout_offset + input; + T* const tile_ptr = tile_idx + shared_data; + tile_ptr[0] = ldg(in_ptr); + if (!skip_second) { + tile_ptr[tile_offset] = ldg(block_pixels + in_ptr); + } + } + + if (filter_write_offset != 0) { + const int filter_offset = filter_idx + (slice + filter_depth) % in_depth; + shared_data[filter_write_offset] = ldg(filter_offset + filter); + } + + // Note: the condition to reach this is uniform across the entire block. + __syncthreads(); + + if (slice_in_range) { + T sum1 = 0; + T sum2 = 0; + int shared_offset = data_idx; + const T* filter_ptr = filter_read_offset + shared_data; + UNROLL for (int r = 0; r < filter_rows; ++r) { + UNROLL for (int c = 0; c < filter_cols; ++c) { + const T filter_value = *filter_ptr; + const T* const tile_ptr = shared_offset + shared_data; + sum1 += filter_value * tile_ptr[0]; + sum2 += filter_value * tile_ptr[tile_offset]; + ++shared_offset; + filter_ptr += block_slices; + } + shared_offset += in_increment; + } + T* const out_ptr = inout_offset + output; + out_ptr[0] = sum1; + if (!skip_second) { + out_ptr[block_pixels] = sum2; + } + } + + // Note: the condition to reach this is uniform across the entire block. + __syncthreads(); } +} +template <typename T, int kKnownFilterWidth, int kKnownFilterHeight, + bool kKnownEvenRows> +void LaunchDepthwiseConv2dGPUSmall(const GpuDevice& d, const DepthwiseArgs args, + const T* input, const T* filter, T* output, + TensorFormat data_format) { + const int block_rows = (args.in_rows + 1) / 2; + const int block_slices = 8; const int tile_cols = args.in_cols + args.filter_cols - 1; const int tile_rows = block_rows * 2 + args.filter_rows - 1; - const int tile_size = tile_rows * tile_cols; - const int filter_size = args.filter_rows * args.filter_cols; - dim3 block_dim = dim3(8, args.in_cols, block_rows); - const int shared_memory_size = - block_dim.x * (tile_size + filter_size) * sizeof(T); + const int tile_pixels = tile_rows * tile_cols; + const int filter_pixels = args.filter_rows * args.filter_cols; + const int shared_memory_size = + block_slices * (tile_pixels + filter_pixels) * sizeof(T); const int num_outputs = args.batch * args.out_rows * args.out_cols * args.out_depth; - if (args.in_rows & 1) { + + if (data_format == FORMAT_NHWC) { + dim3 block_dim = dim3(block_slices, args.in_cols, block_rows); CudaLaunchConfig config = GetCudaLaunchConfig( num_outputs, d, DepthwiseConv2dGPUKernelNHWCSmall<T, kKnownFilterWidth, - kKnownFilterHeight, false>, + kKnownFilterHeight, kKnownEvenRows>, shared_memory_size, block_dim.x * block_dim.y * block_dim.z); DepthwiseConv2dGPUKernelNHWCSmall<T, kKnownFilterWidth, kKnownFilterHeight, - false> + kKnownEvenRows> <<<config.block_count, block_dim, shared_memory_size, d.stream()>>>( args, input, filter, output); - } else { + } else if (data_format == FORMAT_NCHW) { + dim3 block_dim = dim3(args.in_cols, block_rows, block_slices); CudaLaunchConfig config = GetCudaLaunchConfig( num_outputs, d, - DepthwiseConv2dGPUKernelNHWCSmall<T, kKnownFilterWidth, - kKnownFilterHeight, true>, + DepthwiseConv2dGPUKernelNCHWSmall<T, kKnownFilterWidth, + kKnownFilterHeight, kKnownEvenRows>, shared_memory_size, block_dim.x * block_dim.y * block_dim.z); - DepthwiseConv2dGPUKernelNHWCSmall<T, kKnownFilterWidth, kKnownFilterHeight, - true> + DepthwiseConv2dGPUKernelNCHWSmall<T, kKnownFilterWidth, kKnownFilterHeight, + kKnownEvenRows> <<<config.block_count, block_dim, shared_memory_size, d.stream()>>>( args, input, filter, output); + } else { + assert(false); + } +} + +template <typename T, int kKnownFilterWidth, int kKnownFilterHeight> +void LaunchDepthwiseConv2dGPUSmall(const GpuDevice& d, const DepthwiseArgs args, + const T* input, const T* filter, T* output, + TensorFormat data_format) { + if (args.in_rows & 1) { + LaunchDepthwiseConv2dGPUSmall<T, kKnownFilterWidth, kKnownFilterHeight, + /*kKnownEvenRows=*/false>( + d, args, input, filter, output, data_format); + } else { + LaunchDepthwiseConv2dGPUSmall<T, kKnownFilterWidth, kKnownFilterHeight, + /*kKnownEvenRows=*/true>( + d, args, input, filter, output, data_format); } - return true; } template <typename T, int kKnownFilterWidth, int kKnownFilterHeight, @@ -434,9 +590,9 @@ template <typename T, int kKnownFilterWidth, int kKnownFilterHeight, void LaunchDepthwiseConv2dGPU(const GpuDevice& d, const DepthwiseArgs args, const T* input, const T* filter, T* output, TensorFormat data_format) { - if (TryLaunchDepthwiseConv2dGPUSmall<T, kKnownFilterWidth, - kKnownFilterHeight>( - d, args, input, filter, output, data_format)) { + if (CanLaunchDepthwiseConv2dGPUSmall(args)) { + LaunchDepthwiseConv2dGPUSmall<T, kKnownFilterWidth, kKnownFilterHeight>( + d, args, input, filter, output, data_format); return; } const int num_outputs = @@ -588,16 +744,15 @@ __launch_bounds__(1024, 2) void DepthwiseConv2dBackpropInputGPUKernelNHWCSmall( // Fixed blockDim.x, corresponding to Pascal's global load granularity of 32B. const int block_slices = 8; - const int block_cols = blockDim.y; const int block_rows = blockDim.z; // These values are the same for all threads and could // be precomputed on the CPU. - const int block_size = block_rows * block_cols * block_slices; + const int block_size = block_rows * in_cols * block_slices; const int in_row_size = in_cols * in_depth; const int in_size = in_rows * in_row_size; const int in_increment = (in_cols - 1) * block_slices; - const int filter_size = filter_rows * filter_cols; + const int filter_pixels = filter_rows * filter_cols; const int tile_cols = in_cols + filter_cols - 1; const int even_rows = kKnownEvenRows || (1 & ~in_rows); const int tile_rows = in_rows + filter_rows - even_rows; @@ -615,7 +770,7 @@ __launch_bounds__(1024, 2) void DepthwiseConv2dBackpropInputGPUKernelNHWCSmall( const int thread_row = threadIdx.z; // Position in block. - const int thread_pix = thread_row * block_cols + thread_col; + const int thread_pix = thread_row * in_cols + thread_col; const int thread_idx = thread_pix * block_slices + thread_depth; // Initialize tile, in particular the padding. @@ -637,9 +792,9 @@ __launch_bounds__(1024, 2) void DepthwiseConv2dBackpropInputGPUKernelNHWCSmall( const int max_depth = in_depth - thread_depth; const int filter_write_offset = - thread_pix < filter_size ? tile_size + thread_idx : 0; + thread_pix < filter_pixels ? tile_size + thread_idx : 0; const int filter_read_offset = - tile_size + filter_size * block_slices + thread_depth; + tile_size + filter_pixels * block_slices + thread_depth; const bool skip_second = !kKnownEvenRows && thread_row + (in_rows & 1) == block_rows; @@ -786,11 +941,11 @@ bool TryLaunchDepthwiseConv2dBackpropInputGPUSmall( const int tile_cols = args.in_cols + args.filter_cols - 1; const int tile_rows = block_rows * 2 + args.filter_rows - 1; - const int tile_size = tile_rows * tile_cols; - const int filter_size = args.filter_rows * args.filter_cols; + const int tile_pixels = tile_rows * tile_cols; + const int filter_pixels = args.filter_rows * args.filter_cols; dim3 block_dim = dim3(8, args.in_cols, block_rows); const int shared_memory_size = - block_dim.x * (tile_size + filter_size) * sizeof(T); + block_dim.x * (tile_pixels + filter_pixels) * sizeof(T); const int num_in_backprop = args.batch * args.in_rows * args.in_cols * args.in_depth; @@ -1007,16 +1162,15 @@ __launch_bounds__(1024, 2) void DepthwiseConv2dBackpropFilterGPUKernelNHWCSmall( // Fixed blockDim.x, corresponding to Pascal's global load granularity of 32B. const int block_slices = 8; - const int block_cols = blockDim.y; const int block_rows = blockDim.z; // These values are the same for all threads and could // be precomputed on the CPU. - const int block_size = block_rows * block_cols * block_slices; + const int block_size = block_rows * in_cols * block_slices; const int in_row_size = in_cols * in_depth; const int in_size = in_rows * in_row_size; const int in_increment = (in_cols - 1) * block_slices; - const int filter_size = filter_rows * filter_cols; + const int filter_pixels = filter_rows * filter_cols; const int tile_cols = in_cols + filter_cols - 1; const int tile_rows = 2 * block_rows + filter_rows - 1; const int tile_row_size = tile_cols * block_slices; @@ -1028,14 +1182,14 @@ __launch_bounds__(1024, 2) void DepthwiseConv2dBackpropFilterGPUKernelNHWCSmall( const int tensor_offset = block_rows * in_row_size; const int accum_pixels = 32; const int accum_increment = accum_pixels * block_slices; - const int accum_size = filter_size * accum_increment; + const int accum_size = filter_pixels * accum_increment; const int thread_depth = threadIdx.x; const int thread_col = threadIdx.y; const int thread_row = threadIdx.z; // Position in block. - const int thread_pix = thread_row * block_cols + thread_col; + const int thread_pix = thread_row * in_cols + thread_col; const int thread_idx = thread_pix * block_slices + thread_depth; // Initialize tile, in particular the padding and accumulator. @@ -1247,11 +1401,11 @@ bool TryLaunchDepthwiseConv2dBackpropFilterGPUSmall( const int block_rows = (args.in_rows + 1) / 2 + rows_mask & ~rows_mask; const int tile_cols = args.in_cols + args.filter_cols - 1; const int tile_rows = block_rows * 2 + args.filter_rows - 1; - const int tile_size = tile_rows * tile_cols; + const int tile_pixels = tile_rows * tile_cols; const int accum_size = args.filter_rows * args.filter_cols * 32; dim3 block_dim = dim3(8, args.in_cols, block_rows); const int shared_memory_size = - block_dim.x * (tile_size + accum_size) * sizeof(T); + block_dim.x * (tile_pixels + accum_size) * sizeof(T); if (block_rows > args.in_rows || args.filter_rows * args.filter_cols > args.in_cols * block_rows || diff --git a/tensorflow/core/kernels/hexagon/i_graph_transfer_ops_definitions.h b/tensorflow/core/kernels/hexagon/i_graph_transfer_ops_definitions.h index 3d6f493a9c..d5b4cf7451 100644 --- a/tensorflow/core/kernels/hexagon/i_graph_transfer_ops_definitions.h +++ b/tensorflow/core/kernels/hexagon/i_graph_transfer_ops_definitions.h @@ -48,4 +48,4 @@ class IGraphTransferOpsDefinitions { } // namespace tensorflow -#endif // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_HEXAGON_I_GRAPH_TRANSFER_OPS_DEFINITIONS_H +#endif // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_HEXAGON_I_GRAPH_TRANSFER_OPS_DEFINITIONS_H_ diff --git a/tensorflow/core/kernels/iterator_ops.cc b/tensorflow/core/kernels/iterator_ops.cc index fa3f3a4db6..51f11f9d2e 100644 --- a/tensorflow/core/kernels/iterator_ops.cc +++ b/tensorflow/core/kernels/iterator_ops.cc @@ -246,7 +246,7 @@ class OneShotIteratorOp : public OpKernel { n.Notify(); }); n.WaitForNotification(); - OP_REQUIRES_OK(ctx, std::move(factory_status)); + OP_REQUIRES_OK(ctx, factory_status); OP_REQUIRES( ctx, return_values.size() == 1 && diff --git a/tensorflow/core/kernels/lmdb_reader_op.cc b/tensorflow/core/kernels/lmdb_reader_op.cc new file mode 100755 index 0000000000..23cabe7b54 --- /dev/null +++ b/tensorflow/core/kernels/lmdb_reader_op.cc @@ -0,0 +1,134 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "lmdb.h" +#include "tensorflow/core/framework/reader_op_kernel.h" +#include "tensorflow/core/framework/reader_base.h" +#include "tensorflow/core/lib/core/errors.h" + +#include <sys/stat.h> + +namespace tensorflow { + +inline void MDB_CHECK(int mdb_status) { + CHECK_EQ(mdb_status, MDB_SUCCESS) << mdb_strerror(mdb_status); +} + +class LMDBReader : public ReaderBase { + public: + LMDBReader(const string& node_name, Env* env) + : ReaderBase(strings::StrCat("LMDBReader '", node_name, "'")), + env_(env), + mdb_env_(nullptr), + mdb_dbi_(0), + mdb_txn_(nullptr), + mdb_cursor_(nullptr) {} + + Status OnWorkStartedLocked() override { + MDB_CHECK(mdb_env_create(&mdb_env_)); + int flags = MDB_RDONLY | MDB_NOTLS; + + // Check if the LMDB filename is actually a file instead of a directory. + // If so, set appropriate flags so we can open it. + struct stat source_stat; + if (stat(current_work().c_str(), &source_stat) == 0 && + (source_stat.st_mode & S_IFREG)) { + flags |= MDB_NOSUBDIR; + } + + MDB_CHECK(mdb_env_open(mdb_env_, current_work().c_str(), flags, 0664)); + MDB_CHECK(mdb_txn_begin(mdb_env_, nullptr, MDB_RDONLY, &mdb_txn_)); + MDB_CHECK(mdb_dbi_open(mdb_txn_, nullptr, 0, &mdb_dbi_)); + + return Status::OK(); + } + + Status OnWorkFinishedLocked() override { + if (mdb_env_ != nullptr) { + if (mdb_cursor_) { + mdb_cursor_close(mdb_cursor_); + } + mdb_txn_abort(mdb_txn_); + mdb_dbi_close(mdb_env_, mdb_dbi_); + mdb_env_close(mdb_env_); + mdb_env_ = nullptr; + } + return Status::OK(); + } + + Status ReadLocked(string* key, string* value, bool* produced, + bool* at_end) override { + if (mdb_cursor_ == nullptr) { + MDB_CHECK(mdb_cursor_open(mdb_txn_, mdb_dbi_, &mdb_cursor_)); + if (Seek(MDB_FIRST) == false) { + *at_end = true; + return Status::OK(); + } + } + else { + if (Seek(MDB_NEXT) == false) { + *at_end = true; + return Status::OK(); + } + } + *key = string(static_cast<const char*>(mdb_key_.mv_data), + mdb_key_.mv_size); + *value = string(static_cast<const char*>(mdb_value_.mv_data), + mdb_value_.mv_size); + *produced = true; + return Status::OK(); + } + + Status ResetLocked() override { + CHECK_EQ(Seek(MDB_FIRST), true); + return ReaderBase::ResetLocked(); + } + + private: + bool Seek(MDB_cursor_op op) { + CHECK_NOTNULL(mdb_cursor_); + int mdb_status = mdb_cursor_get(mdb_cursor_, &mdb_key_, &mdb_value_, op); + if (mdb_status == MDB_NOTFOUND) { + return false; + } else { + MDB_CHECK(mdb_status); + return true; + } + } + + Env* const env_; + MDB_env* mdb_env_; + MDB_dbi mdb_dbi_; + + MDB_txn* mdb_txn_; + MDB_cursor* mdb_cursor_; + MDB_val mdb_key_, mdb_value_; +}; + +class LMDBReaderOp : public ReaderOpKernel { + public: + explicit LMDBReaderOp(OpKernelConstruction* context) + : ReaderOpKernel(context) { + Env* env = context->env(); + SetReaderFactory([this, env]() { + return new LMDBReader(name(), env); + }); + } +}; + +REGISTER_KERNEL_BUILDER(Name("LMDBReader").Device(DEVICE_CPU), + LMDBReaderOp); + +} diff --git a/tensorflow/core/kernels/matrix_band_part_op.cc b/tensorflow/core/kernels/matrix_band_part_op.cc index f119cb1902..894b0113c2 100644 --- a/tensorflow/core/kernels/matrix_band_part_op.cc +++ b/tensorflow/core/kernels/matrix_band_part_op.cc @@ -83,7 +83,7 @@ class MatrixBandPartOp : public OpKernel { REGISTER_KERNEL_BUILDER( \ Name("MatrixBandPart").Device(DEVICE_CPU).TypeConstraint<type>("T"), \ MatrixBandPartOp<CPUDevice, type>); -TF_CALL_NUMBER_TYPES(REGISTER_MATRIX_BAND_PART); +TF_CALL_POD_TYPES(REGISTER_MATRIX_BAND_PART); #undef REGISTER_MATRIX_BAND_PART // Registration of the deprecated kernel. @@ -143,6 +143,7 @@ namespace functor { extern template struct MatrixBandPart<GPUDevice, T>; TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPEC); +TF_CALL_bool(DECLARE_GPU_SPEC); TF_CALL_complex64(DECLARE_GPU_SPEC); TF_CALL_complex128(DECLARE_GPU_SPEC); } // namespace functor @@ -156,6 +157,7 @@ TF_CALL_complex128(DECLARE_GPU_SPEC); .HostMemory("num_upper"), \ MatrixBandPartOp<GPUDevice, type>); TF_CALL_GPU_NUMBER_TYPES(REGISTER_MATRIX_BAND_PART_GPU); +TF_CALL_bool(REGISTER_MATRIX_BAND_PART_GPU); TF_CALL_complex64(REGISTER_MATRIX_BAND_PART_GPU); TF_CALL_complex128(REGISTER_MATRIX_BAND_PART_GPU); #undef REGISTER_MATRIX_BAND_PART_GPU diff --git a/tensorflow/core/kernels/matrix_band_part_op_gpu.cu.cc b/tensorflow/core/kernels/matrix_band_part_op_gpu.cu.cc index fab3fb4a1f..ccc10ebada 100644 --- a/tensorflow/core/kernels/matrix_band_part_op_gpu.cu.cc +++ b/tensorflow/core/kernels/matrix_band_part_op_gpu.cu.cc @@ -29,6 +29,7 @@ typedef Eigen::GpuDevice GPUDevice; template struct functor::MatrixBandPart<GPUDevice, T>; TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPEC); +TF_CALL_bool(DEFINE_GPU_SPEC); TF_CALL_complex64(DEFINE_GPU_SPEC); TF_CALL_complex128(DEFINE_GPU_SPEC); diff --git a/tensorflow/core/kernels/matrix_diag_op.cc b/tensorflow/core/kernels/matrix_diag_op.cc index bc193357ad..75c49baaa8 100644 --- a/tensorflow/core/kernels/matrix_diag_op.cc +++ b/tensorflow/core/kernels/matrix_diag_op.cc @@ -123,7 +123,7 @@ class MatrixDiagOp : public OpKernel { REGISTER_KERNEL_BUILDER( \ Name("MatrixDiagPart").Device(DEVICE_CPU).TypeConstraint<type>("T"), \ MatrixDiagPartOp<CPUDevice, type>); -TF_CALL_NUMBER_TYPES(REGISTER_MATRIX_DIAG); +TF_CALL_POD_TYPES(REGISTER_MATRIX_DIAG); #undef REGISTER_MATRIX_DIAG // Registration of the deprecated kernel. @@ -136,7 +136,7 @@ TF_CALL_NUMBER_TYPES(REGISTER_MATRIX_DIAG); .Device(DEVICE_CPU) \ .TypeConstraint<type>("T"), \ MatrixDiagPartOp<CPUDevice, type>); -TF_CALL_NUMBER_TYPES(REGISTER_BATCH_MATRIX_DIAG); +TF_CALL_POD_TYPES(REGISTER_BATCH_MATRIX_DIAG); #undef REGISTER_BATCH_MATRIX_DIAG // Implementation of the functor specialization for CPU. @@ -187,6 +187,7 @@ namespace functor { extern template struct MatrixDiagPart<GPUDevice, T>; TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPEC); +TF_CALL_bool(DECLARE_GPU_SPEC); TF_CALL_complex64(DECLARE_GPU_SPEC); TF_CALL_complex128(DECLARE_GPU_SPEC); @@ -201,6 +202,7 @@ TF_CALL_complex128(DECLARE_GPU_SPEC); Name("MatrixDiagPart").Device(DEVICE_GPU).TypeConstraint<type>("T"), \ MatrixDiagPartOp<GPUDevice, type>); TF_CALL_GPU_NUMBER_TYPES(REGISTER_MATRIX_DIAG_GPU); +TF_CALL_bool(REGISTER_MATRIX_DIAG_GPU); TF_CALL_complex64(REGISTER_MATRIX_DIAG_GPU); TF_CALL_complex128(REGISTER_MATRIX_DIAG_GPU); #undef REGISTER_MATRIX_DIAG_GPU diff --git a/tensorflow/core/kernels/matrix_diag_op_gpu.cu.cc b/tensorflow/core/kernels/matrix_diag_op_gpu.cu.cc index 14774d5404..cfb1fa10fc 100644 --- a/tensorflow/core/kernels/matrix_diag_op_gpu.cu.cc +++ b/tensorflow/core/kernels/matrix_diag_op_gpu.cu.cc @@ -31,6 +31,7 @@ typedef Eigen::GpuDevice GPUDevice; template struct functor::MatrixDiagPart<GPUDevice, T>; TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPEC); +TF_CALL_bool(DEFINE_GPU_SPEC); TF_CALL_complex64(DEFINE_GPU_SPEC); TF_CALL_complex128(DEFINE_GPU_SPEC); diff --git a/tensorflow/core/kernels/matrix_set_diag_op.cc b/tensorflow/core/kernels/matrix_set_diag_op.cc index cbb2b68b7f..3397af56bc 100644 --- a/tensorflow/core/kernels/matrix_set_diag_op.cc +++ b/tensorflow/core/kernels/matrix_set_diag_op.cc @@ -100,7 +100,7 @@ class MatrixSetDiagOp : public OpKernel { REGISTER_KERNEL_BUILDER( \ Name("MatrixSetDiag").Device(DEVICE_CPU).TypeConstraint<type>("T"), \ MatrixSetDiagOp<CPUDevice, type>); -TF_CALL_NUMBER_TYPES(REGISTER_MATRIX_SET_DIAG); +TF_CALL_POD_TYPES(REGISTER_MATRIX_SET_DIAG); #undef REGISTER_MATRIX_SET_DIAG // Registration of the deprecated kernel. @@ -109,7 +109,7 @@ TF_CALL_NUMBER_TYPES(REGISTER_MATRIX_SET_DIAG); REGISTER_KERNEL_BUILDER( \ Name("BatchMatrixSetDiag").Device(DEVICE_CPU).TypeConstraint<type>("T"), \ MatrixSetDiagOp<CPUDevice, type>); -TF_CALL_NUMBER_TYPES(REGISTER_BATCH_MATRIX_SET_DIAG); +TF_CALL_POD_TYPES(REGISTER_BATCH_MATRIX_SET_DIAG); #undef REGISTER_BATCH_MATRIX_SET_DIAG namespace functor { @@ -131,6 +131,21 @@ struct MatrixSetDiag<CPUDevice, T> { } }; +template <> +struct MatrixSetDiag<CPUDevice, bool> { + static void Compute(const CPUDevice& d, TTypes<bool, 3>::ConstTensor input, + TTypes<bool, 2>::ConstTensor diag, + TTypes<bool>::Scalar scratch, + TTypes<bool, 3>::Tensor output) { + output.device(d) = input; + for (int64 r = 0; r < output.dimension(0); ++r) { + for (int64 d = 0; d < diag.dimension(1); ++d) { + output(r, d, d) = diag(r, d); + } + } + } +}; + } // namespace functor #if GOOGLE_CUDA @@ -147,6 +162,7 @@ namespace functor { extern template struct MatrixSetDiag<GPUDevice, T>; TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPEC); +TF_CALL_bool(DECLARE_GPU_SPEC); TF_CALL_complex64(DECLARE_GPU_SPEC); TF_CALL_complex128(DECLARE_GPU_SPEC); @@ -158,6 +174,7 @@ TF_CALL_complex128(DECLARE_GPU_SPEC); Name("MatrixSetDiag").Device(DEVICE_GPU).TypeConstraint<type>("T"), \ MatrixSetDiagOp<GPUDevice, type>); TF_CALL_GPU_NUMBER_TYPES(REGISTER_MATRIX_SET_DIAG_GPU); +TF_CALL_bool(REGISTER_MATRIX_SET_DIAG_GPU); TF_CALL_complex64(REGISTER_MATRIX_SET_DIAG_GPU); TF_CALL_complex128(REGISTER_MATRIX_SET_DIAG_GPU); #undef REGISTER_MATRIX_SET_DIAG_GPU diff --git a/tensorflow/core/kernels/matrix_set_diag_op.h b/tensorflow/core/kernels/matrix_set_diag_op.h index 8ba2f3756a..63e5650bf0 100644 --- a/tensorflow/core/kernels/matrix_set_diag_op.h +++ b/tensorflow/core/kernels/matrix_set_diag_op.h @@ -71,6 +71,23 @@ struct MatrixSetDiag { } }; +template <typename Device> +struct MatrixSetDiag<Device, bool> { + EIGEN_ALWAYS_INLINE static void Compute(const Device& d, + TTypes<bool, 3>::ConstTensor input, + TTypes<bool, 2>::ConstTensor diag, + TTypes<bool>::Scalar scratch, + TTypes<bool, 3>::Tensor output) { + output.device(d) = input; + generator::OverwriteDiagGenerator<bool> generator(diag, output); + // Use all() to force the generation to aggregate to the scalar + // output scratch. This in turn forces each element of the + // generator to execute. The side effect of the execution is to + // update the diagonal components of output with diag. + scratch.device(d) = diag.generate(generator).all(); + } +}; + } // namespace functor } // namespace tensorflow diff --git a/tensorflow/core/kernels/matrix_set_diag_op_gpu.cu.cc b/tensorflow/core/kernels/matrix_set_diag_op_gpu.cu.cc index bd097ff328..8e41ce5860 100644 --- a/tensorflow/core/kernels/matrix_set_diag_op_gpu.cu.cc +++ b/tensorflow/core/kernels/matrix_set_diag_op_gpu.cu.cc @@ -29,6 +29,7 @@ typedef Eigen::GpuDevice GPUDevice; template struct functor::MatrixSetDiag<GPUDevice, T>; TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPEC); +TF_CALL_bool(DEFINE_GPU_SPEC); TF_CALL_complex64(DEFINE_GPU_SPEC); TF_CALL_complex128(DEFINE_GPU_SPEC); diff --git a/tensorflow/core/kernels/sendrecv_ops.cc b/tensorflow/core/kernels/sendrecv_ops.cc index 1c7d50e161..4c656bd74b 100644 --- a/tensorflow/core/kernels/sendrecv_ops.cc +++ b/tensorflow/core/kernels/sendrecv_ops.cc @@ -52,17 +52,16 @@ SendOp::SendOp(OpKernelConstruction* ctx) : OpKernel(ctx) { OP_REQUIRES_OK(ctx, ctx->GetAttr("tensor_name", &tensor_name)); key_prefix_ = GetRendezvousKeyPrefix(send_device, recv_device, send_device_incarnation, tensor_name); + // The vast majority of Send nodes are outside any loop context, so + // proactively cache the rendezvous key for the top-level. + GetRendezvousKey(key_prefix_, {0, 0}, &parsed_key_.buf_); + OP_REQUIRES_OK(ctx, Rendezvous::ParseKey(parsed_key_.buf_, &parsed_key_)); } void SendOp::Compute(OpKernelContext* ctx) { OP_REQUIRES( ctx, ctx->rendezvous() != nullptr, errors::Internal("Op kernel context needs to provide a rendezvous.")); - Rendezvous::ParsedKey parsed; - GetRendezvousKey(key_prefix_, ctx->frame_iter(), &parsed.buf_); - VLOG(2) << "Send " << parsed.buf_; - - OP_REQUIRES_OK(ctx, Rendezvous::ParseKey(parsed.buf_, &parsed)); // The device context may be passed between the Send/Recv // boundary, so that the device context used to produce the Tensor @@ -71,8 +70,24 @@ void SendOp::Compute(OpKernelContext* ctx) { Rendezvous::Args args; args.device_context = ctx->op_device_context(); args.alloc_attrs = ctx->input_alloc_attr(0); - OP_REQUIRES_OK(ctx, ctx->rendezvous()->Send(parsed, args, ctx->input(0), - ctx->is_input_dead())); + + if (ctx->frame_iter() == FrameAndIter(0, 0)) { + // Use the cached rendezvous key. + VLOG(2) << "Send " << parsed_key_.buf_; + OP_REQUIRES_OK(ctx, + ctx->rendezvous()->Send(parsed_key_, args, ctx->input(0), + ctx->is_input_dead())); + } else { + Rendezvous::ParsedKey in_loop_parsed; + GetRendezvousKey(key_prefix_, ctx->frame_iter(), &in_loop_parsed.buf_); + VLOG(2) << "Send " << in_loop_parsed.buf_; + OP_REQUIRES_OK(ctx, + Rendezvous::ParseKey(in_loop_parsed.buf_, &in_loop_parsed)); + + OP_REQUIRES_OK(ctx, + ctx->rendezvous()->Send(in_loop_parsed, args, ctx->input(0), + ctx->is_input_dead())); + } } REGISTER_KERNEL_BUILDER(Name("_Send").Device(DEVICE_CPU), SendOp); @@ -101,17 +116,16 @@ RecvOp::RecvOp(OpKernelConstruction* ctx) : AsyncOpKernel(ctx) { OP_REQUIRES_OK(ctx, ctx->GetAttr("tensor_name", &tensor_name)); key_prefix_ = GetRendezvousKeyPrefix(send_device, recv_device, send_device_incarnation, tensor_name); + // The vast majority of Recv nodes are outside any loop context, so + // proactively cache the rendezvous key for the top-level. + GetRendezvousKey(key_prefix_, {0, 0}, &parsed_key_.buf_); + OP_REQUIRES_OK(ctx, Rendezvous::ParseKey(parsed_key_.buf_, &parsed_key_)); } void RecvOp::ComputeAsync(OpKernelContext* ctx, DoneCallback done) { OP_REQUIRES( ctx, ctx->rendezvous() != nullptr, errors::Internal("Op kernel context needs to provide a rendezvous.")); - Rendezvous::ParsedKey parsed; - GetRendezvousKey(key_prefix_, ctx->frame_iter(), &parsed.buf_); - VLOG(2) << "Recv " << parsed.buf_; - - OP_REQUIRES_OK_ASYNC(ctx, Rendezvous::ParseKey(parsed.buf_, &parsed), done); Rendezvous::Args args; args.device_context = ctx->op_device_context(); @@ -136,7 +150,19 @@ void RecvOp::ComputeAsync(OpKernelContext* ctx, DoneCallback done) { done(); }, std::move(done), _1, _2, _3, _4, _5); - ctx->rendezvous()->RecvAsync(parsed, args, std::move(done_cb)); + + if (ctx->frame_iter() == FrameAndIter(0, 0)) { + VLOG(2) << "Recv " << parsed_key_.buf_; + ctx->rendezvous()->RecvAsync(parsed_key_, args, std::move(done_cb)); + } else { + Rendezvous::ParsedKey in_loop_parsed; + GetRendezvousKey(key_prefix_, ctx->frame_iter(), &in_loop_parsed.buf_); + VLOG(2) << "Recv " << in_loop_parsed.buf_; + OP_REQUIRES_OK_ASYNC( + ctx, Rendezvous::ParseKey(in_loop_parsed.buf_, &in_loop_parsed), done); + + ctx->rendezvous()->RecvAsync(in_loop_parsed, args, std::move(done_cb)); + } } REGISTER_KERNEL_BUILDER(Name("_Recv").Device(DEVICE_CPU), RecvOp); diff --git a/tensorflow/core/kernels/sendrecv_ops.h b/tensorflow/core/kernels/sendrecv_ops.h index 6e91422682..67867e3308 100644 --- a/tensorflow/core/kernels/sendrecv_ops.h +++ b/tensorflow/core/kernels/sendrecv_ops.h @@ -28,6 +28,7 @@ class SendOp : public OpKernel { private: string key_prefix_; + Rendezvous::ParsedKey parsed_key_; TF_DISALLOW_COPY_AND_ASSIGN(SendOp); }; @@ -39,6 +40,7 @@ class RecvOp : public AsyncOpKernel { private: string key_prefix_; + Rendezvous::ParsedKey parsed_key_; TF_DISALLOW_COPY_AND_ASSIGN(RecvOp); }; diff --git a/tensorflow/core/kernels/sendrecv_ops_test.cc b/tensorflow/core/kernels/sendrecv_ops_test.cc new file mode 100644 index 0000000000..092a29f2f3 --- /dev/null +++ b/tensorflow/core/kernels/sendrecv_ops_test.cc @@ -0,0 +1,74 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/platform/test_benchmark.h" + +namespace tensorflow { + +namespace { + +// Implement a trivial version of the Rendezvous interface, to avoid +// clouding the benchmark results with the time spent in the various +// implementations, and to avoid the duplicate-send or duplicate-recv +// errors that would arise from running either benchmark in a loop. +class DummyRendezvous : public Rendezvous { + Status Send(const ParsedKey& key, const Args& args, const Tensor& val, + const bool is_dead) override { + return Status::OK(); + } + void RecvAsync(const ParsedKey& key, const Args& args, + DoneCallback done) override { + static Tensor* t = new Tensor(DT_FLOAT, TensorShape({0})); + done(Status::OK(), args, args, *t, false); + } + void StartAbort(const Status& status) override {} +}; + +static Graph* Send() { + Graph* g = new Graph(OpRegistry::Global()); + Tensor in0(DT_FLOAT, TensorShape({0})); + test::graph::Send(g, test::graph::Constant(g, in0), "T", "/cpu:0", 1, + "/cpu:0"); + test::graph::Recv(g, "T", "float", "/cpu:0", 1, "/cpu:0"); + return g; +} + +static Graph* Recv() { + Graph* g = new Graph(OpRegistry::Global()); + test::graph::Recv(g, "T", "float", "/cpu:0", 1, "/cpu:0"); + return g; +} + +static void BM_Send(int iters) { + testing::UseRealTime(); + testing::ItemsProcessed(static_cast<int64>(iters)); + test::Benchmark("cpu", Send(), nullptr, nullptr, new DummyRendezvous) + .Run(iters); +} +BENCHMARK(BM_Send); + +static void BM_Recv(int iters) { + testing::UseRealTime(); + testing::ItemsProcessed(static_cast<int64>(iters)); + test::Benchmark("cpu", Recv(), nullptr, nullptr, new DummyRendezvous) + .Run(iters); +} +BENCHMARK(BM_Recv); + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/core/lib/io/inputbuffer_test.cc b/tensorflow/core/lib/io/inputbuffer_test.cc index 4571df9a98..6771697a16 100644 --- a/tensorflow/core/lib/io/inputbuffer_test.cc +++ b/tensorflow/core/lib/io/inputbuffer_test.cc @@ -298,9 +298,9 @@ TEST(InputBuffer, ReadVarint32) { // Generates data. std::vector<uint32> data; uint32 i = 0; - for (; i < (1 << 10); i += 1) data.push_back(i); - for (; i < (1 << 15); i += 5) data.push_back(i); - for (; i < (1 << 31); i += 132817) data.push_back(i); + for (; i < (1U << 10); i += 1) data.push_back(i); + for (; i < (1U << 15); i += 5) data.push_back(i); + for (; i < (1U << 31); i += 132817) data.push_back(i); data.push_back(std::numeric_limits<uint32>::max()); // Writes the varints. diff --git a/tensorflow/core/lib/io/zlib_inputstream.cc b/tensorflow/core/lib/io/zlib_inputstream.cc index 3de157a1fc..4999d5cc90 100644 --- a/tensorflow/core/lib/io/zlib_inputstream.cc +++ b/tensorflow/core/lib/io/zlib_inputstream.cc @@ -58,16 +58,14 @@ void ZlibInputStream::InitZlibBuffer() { z_stream_->avail_in = 0; int status = inflateInit2(z_stream_.get(), zlib_options_.window_bits); - if (status != Z_OK) { - LOG(FATAL) << "inflateInit failed with status " << status; - z_stream_.reset(nullptr); - } else { - z_stream_->next_in = z_stream_input_.get(); - z_stream_->next_out = z_stream_output_.get(); - next_unread_byte_ = reinterpret_cast<char*>(z_stream_output_.get()); - z_stream_->avail_in = 0; - z_stream_->avail_out = output_buffer_capacity_; - } + + CHECK_EQ(status, Z_OK) << "inflateInit failed with status " << status; + + z_stream_->next_in = z_stream_input_.get(); + z_stream_->next_out = z_stream_output_.get(); + next_unread_byte_ = reinterpret_cast<char*>(z_stream_output_.get()); + z_stream_->avail_in = 0; + z_stream_->avail_out = output_buffer_capacity_; } Status ZlibInputStream::ReadFromStream() { diff --git a/tensorflow/core/lib/lmdb/testdata/data.mdb b/tensorflow/core/lib/lmdb/testdata/data.mdb Binary files differnew file mode 100644 index 0000000000..3ea75699cb --- /dev/null +++ b/tensorflow/core/lib/lmdb/testdata/data.mdb diff --git a/tensorflow/core/ops/array_ops_test.cc b/tensorflow/core/ops/array_ops_test.cc index a7b4422bab..39a3b42a87 100644 --- a/tensorflow/core/ops/array_ops_test.cc +++ b/tensorflow/core/ops/array_ops_test.cc @@ -175,7 +175,11 @@ TEST(ArrayOpsTest, Identity_ShapeFnHandles) { TF_ASSERT_OK(c.construction_status()); ASSERT_TRUE(op_reg_data->shape_inference_fn != nullptr); TF_ASSERT_OK(c.Run(op_reg_data->shape_inference_fn)); - EXPECT_TRUE(c.output_handle_dtype(0) == DT_BOOL); + + const auto* shapes_and_types = c.output_handle_shapes_and_types(0); + ASSERT_TRUE(shapes_and_types != nullptr); + ASSERT_EQ(1, shapes_and_types->size()); + EXPECT_EQ((*shapes_and_types)[0].dtype, DT_BOOL); } TEST(ArrayOpsTest, Diag_ShapeFn) { diff --git a/tensorflow/core/ops/io_ops.cc b/tensorflow/core/ops/io_ops.cc index 0bce6fc0ea..b7afab84a1 100644 --- a/tensorflow/core/ops/io_ops.cc +++ b/tensorflow/core/ops/io_ops.cc @@ -520,6 +520,21 @@ shared_name: If non-empty, this reader is named in the given bucket with this shared_name. Otherwise, the node name is used instead. )doc"); +REGISTER_OP("LMDBReader") + .Output("reader_handle: Ref(string)") + .Attr("container: string = ''") + .Attr("shared_name: string = ''") + .SetIsStateful() + .SetShapeFn(TwoElementOutput) + .Doc(R"doc( +A Reader that outputs the records from a LMDB file. +reader_handle: The handle to reference the Reader. +container: If non-empty, this reader is placed in the given container. + Otherwise, a default container is used. +shared_name: If non-empty, this reader is named in the given bucket + with this shared_name. Otherwise, the node name is used instead. +)doc"); + // TODO(cwhipkey): mark this deprecated in favor of V2. REGISTER_OP("IdentityReader") .Output("reader_handle: Ref(string)") diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt index d82e98beb9..997b7de3d9 100644 --- a/tensorflow/core/ops/ops.pbtxt +++ b/tensorflow/core/ops/ops.pbtxt @@ -26050,6 +26050,33 @@ op { is_stateful: true } op { + name: "LMDBReader" + output_arg { + name: "reader_handle" + description: "The handle to reference the Reader." + type: DT_STRING + is_ref: true + } + attr { + name: "container" + type: "string" + default_value { + s: "" + } + description: "If non-empty, this reader is placed in the given container.\nOtherwise, a default container is used." + } + attr { + name: "shared_name" + type: "string" + default_value { + s: "" + } + description: "If non-empty, this reader is named in the given bucket\nwith this shared_name. Otherwise, the node name is used instead." + } + summary: "A Reader that outputs the records from a LMDB database." + is_stateful: true +} +op { name: "TakeDataset" input_arg { name: "input_dataset" diff --git a/tensorflow/python/debug/BUILD b/tensorflow/python/debug/BUILD index 615ca85ab7..39446b6ca2 100644 --- a/tensorflow/python/debug/BUILD +++ b/tensorflow/python/debug/BUILD @@ -546,6 +546,22 @@ cuda_py_test( tags = ["notsan"], ) +cuda_py_test( + name = "session_debug_multi_gpu_test", + size = "small", + srcs = ["lib/session_debug_multi_gpu_test.py"], + additional_deps = [ + ":debug_data", + ":debug_utils", + "//tensorflow/python:client", + "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:framework_test_lib", + "//tensorflow/python:math_ops", + "//tensorflow/python:platform_test", + "//tensorflow/python:variables", + ], +) + py_test( name = "debugger_cli_common_test", size = "small", diff --git a/tensorflow/python/debug/lib/debug_data.py b/tensorflow/python/debug/lib/debug_data.py index 35e9fef29f..748e5d78ba 100644 --- a/tensorflow/python/debug/lib/debug_data.py +++ b/tensorflow/python/debug/lib/debug_data.py @@ -19,10 +19,12 @@ from __future__ import division from __future__ import print_function import collections +import glob import json import os import numpy as np +import six from six.moves import xrange # pylint: disable=redefined-builtin from tensorflow.core.framework import graph_pb2 @@ -32,9 +34,12 @@ from tensorflow.python.framework import tensor_util from tensorflow.python.platform import gfile +# TODO(cais): Tie these string constants in with C++? METADATA_FILE_PREFIX = "_tfdbg_" CORE_METADATA_TAG = "core_metadata_" GRAPH_FILE_TAG = "graph_" +DEVICE_TAG = "device_" + FETCHES_INFO_FILE_TAG = "fetches_info_" FEED_KEYS_INFO_FILE_TAG = "feed_keys_info_" @@ -158,10 +163,6 @@ def parse_node_or_tensor_name(name): return name, None -def _is_core_metadata_file(file_name): - return file_name.startswith(METADATA_FILE_PREFIX + CORE_METADATA_TAG) - - def _is_graph_file(file_name): return file_name.startswith(METADATA_FILE_PREFIX + GRAPH_FILE_TAG) @@ -344,6 +345,28 @@ def extract_core_metadata_from_event_proto(event): json_metadata["target_nodes"]) +def device_name_to_device_path(device_name): + """Convert device name to device path.""" + device_name_items = device_name.split("/") + device_name_items = [item.replace(":", "_") for item in device_name_items] + return METADATA_FILE_PREFIX + DEVICE_TAG + ",".join(device_name_items) + + +def device_path_to_device_name(device_dir): + """Parse device name from device path. + + Args: + device_dir: (str) a directory name for the device. + + Returns: + (str) parsed device name. + """ + path_items = os.path.basename(device_dir)[ + len(METADATA_FILE_PREFIX) + len(DEVICE_TAG):].split(",") + return "/".join([ + path_item.replace("_", ":", 1) for path_item in path_items]) + + class DebugTensorDatum(object): """A single tensor dumped by TensorFlow Debugger (tfdbg). @@ -360,13 +383,17 @@ class DebugTensorDatum(object): """`DebugTensorDatum` constructor. Args: - dump_root: (`str`) Debug dump root directory. + dump_root: (`str`) Debug dump root directory. This path should not include + the path component that represents the device name (see also below). debug_dump_rel_path: (`str`) Path to a debug dump file, relative to the - `dump_root`. For example, suppose the debug dump root - directory is `/tmp/tfdbg_1` and the dump file is at - `/tmp/tfdbg_1/ns_1/node_a_0_DebugIdentity_123456789`, then - the value of the debug_dump_rel_path should be - `ns_1/node_a_0_DebugIdenity_1234456789`. + `dump_root`. The first item of this relative path is assumed to be + a path representing the name of the device that the Tensor belongs to. + See `device_path_to_device_name` for more details on the device path. + For example, suppose the debug dump root + directory is `/tmp/tfdbg_1` and the dump file is at + `/tmp/tfdbg_1/<device_path>/>ns_1/node_a_0_DebugIdentity_123456789`, + then the value of the debug_dump_rel_path should be + `<device_path>/ns_1/node_a_0_DebugIdenity_1234456789`. Raises: ValueError: If the base file name of the dump file does not conform to @@ -374,15 +401,13 @@ class DebugTensorDatum(object): `node_name`_`output_slot`_`debug_op`_`timestamp` """ - base = os.path.basename(debug_dump_rel_path) - + path_components = os.path.normpath(debug_dump_rel_path).split(os.sep) + self._device_name = device_path_to_device_name(path_components[0]) + base = path_components[-1] if base.count("_") < 3: raise ValueError( "Dump file path does not conform to the naming pattern: %s" % base) - # TODO(cais): Add hostname and pid to support dumps from distributed - # sessions. - self._extended_timestamp = base.split("_")[-1] # It may include an index suffix at the end if file path collision happened # due to identical timestamps. @@ -395,31 +420,23 @@ class DebugTensorDatum(object): self._debug_op = base.split("_")[-2] self._output_slot = int(base.split("_")[-3]) - namespace = os.path.dirname(debug_dump_rel_path).replace("\\", "/") node_base_name = "_".join(base.split("_")[:-3]) - if not namespace or namespace == ".": - self._node_name = node_base_name - else: - self._node_name = namespace + "/" + node_base_name + self._node_name = "/".join(path_components[1:-1] + [node_base_name]) self._file_path = os.path.join(dump_root, debug_dump_rel_path) self._dump_size_bytes = (gfile.Stat(self._file_path).length if gfile.Exists(self._file_path) else None) - self._run_fetches_info = None - self._run_feed_keys_info = None - def __str__(self): - return "{DebugTensorDatum: %s:%d @ %s @ %d}" % (self.node_name, - self.output_slot, - self.debug_op, - self.timestamp) + return "{DebugTensorDatum (%s) %s:%d @ %s @ %d}" % (self.device_name, + self.node_name, + self.output_slot, + self.debug_op, + self.timestamp) def __repr__(self): return self.__str__() - # TODO(cais): (b/38325442) Add device name information to this class. - def get_tensor(self): """Get tensor from the dump (`Event`) file. @@ -465,6 +482,16 @@ class DebugTensorDatum(object): return self._debug_op @property + def device_name(self): + """Name of the device that the tensor belongs to. + + Returns: + (`str`) device name. + """ + + return self._device_name + + @property def node_name(self): """Name of the node from which the tensor value was dumped. @@ -529,6 +556,8 @@ class WatchKeyDoesNotExistInDebugDumpDirError(ValueError): pass +# TODO(cais): This class is getting too large in line count. Refactor to make it +# smaller and easier to maintain. class DebugDumpDir(object): """Data set from a debug-dump directory on filesystem. @@ -548,23 +577,54 @@ class DebugDumpDir(object): Raises: IOError: If dump_root does not exist as a directory. + ValueError: If more than one core metadata file is found under the dump + root directory. """ if not gfile.IsDirectory(dump_root): raise IOError("Dump root directory %s does not exist" % dump_root) - self._core_metadata = None - self._load_dumps(dump_root) - self._create_tensor_watch_maps() - self._load_partition_graphs(partition_graphs, validate) + self._core_metadata = [] + + # Find the list of devices. + self._dump_root = dump_root + + self._load_core_metadata() + self._load_fetches_info() + self._load_feeds_info() + self._load_all_device_dumps(partition_graphs, validate) self._python_graph = None - def _load_dumps(self, dump_root): - """Load `DebugTensorDatum` instances from the dump root. + def _load_all_device_dumps(self, partition_graphs, validate): + """Load the dump data for all devices.""" + device_dirs = glob.glob(os.path.join( + self._dump_root, METADATA_FILE_PREFIX + DEVICE_TAG + "*")) + + self._device_names = [] + self._t0s = {} + self._dump_tensor_data = {} + self._dump_graph_file_paths = {} + self._debug_watches = {} + self._watch_key_to_devices = {} + self._watch_key_to_datum = {} + self._watch_key_to_rel_time = {} + self._watch_key_to_dump_size_bytes = {} + for device_dir in device_dirs: + device_name = device_path_to_device_name(device_dir) + self._device_names.append(device_name) + self._load_device_dumps(device_name, device_dir) + self._load_partition_graphs(partition_graphs, validate) + self._calculate_t0() + + for device_name in self._device_names: + self._create_tensor_watch_maps(device_name) - Populates a list of `DebugTensorDatum` instance and sorts the list by - ascending timestamp. + def _load_device_dumps(self, device_name, device_root): + """Load `DebugTensorDatum` instances from the dump root of a given device. + + Populates a map {device_name: a list of `DebugTensorDatum`}, where the list + is sorted by ascending timestamp. This sorting order reflects the order in which the TensorFlow executor processed the nodes of the graph. It is (one of many possible) topological @@ -584,55 +644,67 @@ class DebugDumpDir(object): graphs may not be available, e.g., when the run errors out. Args: - dump_root: (`str`) Dump root directory. - """ + device_name: (`str`) name of the device. + device_root: (`str`) dump root directory of the given device. - self._dump_root = dump_root - self._dump_tensor_data = [] - self._dump_graph_file_paths = [] + Raises: + ValueError: If GraphDef for the device is not available. + """ - self._debug_watches = collections.defaultdict( + self._dump_tensor_data[device_name] = [] + self._debug_watches[device_name] = collections.defaultdict( lambda: collections.defaultdict(set)) - for root, _, files in gfile.Walk(self._dump_root): + for root, _, files in gfile.Walk(device_root): for f in files: - if f.startswith(METADATA_FILE_PREFIX): - if _is_core_metadata_file(f): - self._load_core_metadata(os.path.join(self._dump_root, root, f)) - - if _is_graph_file(f): - self._dump_graph_file_paths.append( - os.path.join(self._dump_root, root, f)) - - if _is_run_fetches_info_file(f): - self._run_fetches_info = _load_log_message_from_event_file( - os.path.join(root, f)) - - if _is_run_feed_keys_info_file(f): - self._run_feed_keys_info = _load_log_message_from_event_file( - os.path.join(root, f)) - - continue - - datum = self._dump_file_name_to_datum(root, f) - self._dump_tensor_data.append(datum) - - self._debug_watches[datum.node_name][datum.output_slot].add( - datum.debug_op) - - self._dump_tensor_data = sorted( - self._dump_tensor_data, key=lambda x: x.extended_timestamp) - - if self._dump_tensor_data: - self._t0 = self._dump_tensor_data[0].timestamp + if _is_graph_file(f): + self._dump_graph_file_paths[device_name] = os.path.join( + device_root, root, f) + else: + datum = self._dump_file_name_to_datum(root, f) + self._dump_tensor_data[device_name].append(datum) + self._debug_watches[device_name][datum.node_name][ + datum.output_slot].add(datum.debug_op) + + self._dump_tensor_data[device_name] = sorted( + self._dump_tensor_data[device_name], + key=lambda x: x.extended_timestamp) + + if self._dump_tensor_data[device_name]: + self._t0s[device_name] = self._dump_tensor_data[device_name][0].timestamp else: - self._t0 = None - - def _load_core_metadata(self, event_file_path): - event = event_pb2.Event() - with gfile.Open(event_file_path, "rb") as f: - event.ParseFromString(f.read()) - self._core_metadata = extract_core_metadata_from_event_proto(event) + self._t0s[device_name] = None + + def _calculate_t0(self): + """Calculate the first timestamp across all devices.""" + t0s = [t0 for t0 in six.itervalues(self._t0s) if t0 is not None] + self._t0 = min(t0s) if t0s else None + + def _load_core_metadata(self): + core_metadata_files = glob.glob(os.path.join( + self._dump_root, METADATA_FILE_PREFIX + CORE_METADATA_TAG + "*")) + for core_metadata_file in core_metadata_files: + with gfile.Open(core_metadata_file, "rb") as f: + event = event_pb2.Event() + event.ParseFromString(f.read()) + self._core_metadata.append( + extract_core_metadata_from_event_proto(event)) + + def _load_fetches_info(self): + fetches_info_files = glob.glob(os.path.join( + self._dump_root, METADATA_FILE_PREFIX + FETCHES_INFO_FILE_TAG + "*")) + self._run_fetches_info = [] + for fetches_info_file in fetches_info_files: + self._run_fetches_info.append( + _load_log_message_from_event_file(fetches_info_file)) + + def _load_feeds_info(self): + feeds_info_files = glob.glob(os.path.join( + self._dump_root, METADATA_FILE_PREFIX + FEED_KEYS_INFO_FILE_TAG + "*")) + self._run_feed_keys_info = [] + for feeds_info_file in feeds_info_files: + self._run_feed_keys_info.append( + _load_log_message_from_event_file(feeds_info_file)) def _dump_file_name_to_datum(self, dir_name, file_name): """Obtain a DebugTensorDatum from the directory and file name. @@ -648,34 +720,39 @@ class DebugDumpDir(object): # Calculate the relative path of the dump file with respect to the root. debug_dump_rel_path = os.path.join( os.path.relpath(dir_name, self._dump_root), file_name) - return DebugTensorDatum(self._dump_root, debug_dump_rel_path) - def _create_tensor_watch_maps(self): + def _create_tensor_watch_maps(self, device_name): """Create maps from tensor watch keys to datum and to timestamps. Create a map from watch key (tensor name + debug op) to `DebugTensorDatum` item. Also make a map from watch key to relative timestamp. "relative" means (absolute timestamp - t0). + + Args: + device_name: (str) name of the device. """ - self._watch_key_to_datum = {} - self._watch_key_to_rel_time = {} - self._watch_key_to_dump_size_bytes = {} - for datum in self._dump_tensor_data: - if datum.watch_key not in self._watch_key_to_datum: - self._watch_key_to_datum[datum.watch_key] = [datum] - self._watch_key_to_rel_time[datum.watch_key] = [ - datum.timestamp - self._t0 - ] - self._watch_key_to_dump_size_bytes[datum.watch_key] = [ - datum.dump_size_bytes - ] + self._watch_key_to_datum[device_name] = {} + self._watch_key_to_rel_time[device_name] = {} + self._watch_key_to_dump_size_bytes[device_name] = {} + for datum in self._dump_tensor_data[device_name]: + if datum.watch_key not in self._watch_key_to_devices: + self._watch_key_to_devices[datum.watch_key] = {device_name} + else: + self._watch_key_to_devices[datum.watch_key].add(device_name) + + if datum.watch_key not in self._watch_key_to_datum[device_name]: + self._watch_key_to_datum[device_name][datum.watch_key] = [datum] + self._watch_key_to_rel_time[device_name][datum.watch_key] = [ + datum.timestamp - self._t0] + self._watch_key_to_dump_size_bytes[device_name][datum.watch_key] = [ + datum.dump_size_bytes] else: - self._watch_key_to_datum[datum.watch_key].append(datum) - self._watch_key_to_rel_time[datum.watch_key].append(datum.timestamp - - self._t0) - self._watch_key_to_dump_size_bytes[datum.watch_key].append( + self._watch_key_to_datum[device_name][datum.watch_key].append(datum) + self._watch_key_to_rel_time[device_name][datum.watch_key].append( + datum.timestamp - self._t0) + self._watch_key_to_dump_size_bytes[device_name][datum.watch_key].append( datum.dump_size_bytes) def set_python_graph(self, python_graph): @@ -733,22 +810,32 @@ class DebugDumpDir(object): `output_names`: Names of the output (fetched) Tensors. `target_nodes`: Names of the target nodes. If the core metadata have not been loaded, `None`. + If more than one core metadata files exist, return a list of the + `nametuple` described above. """ - return self._core_metadata + output = self._core_metadata + return output[0] if len(output) == 1 else output @property def dumped_tensor_data(self): - return self._dump_tensor_data + """Retrieve dumped tensor data.""" + if len(self.devices()) == 1: + return self._dump_tensor_data[self.devices()[0]] + else: + all_devices_data = six.itervalues(self._dump_tensor_data) + data = [] + for device_data in all_devices_data: + data.extend(device_data) + return sorted(data, key=lambda x: x.extended_timestamp) @property def t0(self): - """Absolute timestamp of the first dumped tensor. + """Absolute timestamp of the first dumped tensor across all devices. Returns: (`int`) absolute timestamp of the first dumped tensor, in microseconds. """ - return self._t0 @property @@ -756,10 +843,10 @@ class DebugDumpDir(object): """Total number of dumped tensors in the dump root directory. Returns: - (`int`) total number of dumped tensors in the dump root directory. + (`int`) The total number of dumped tensors in the dump root directory. """ - - return len(self._dump_tensor_data) + return sum(len(self._dump_tensor_data[device_name]) + for device_name in self._dump_tensor_data) def _load_partition_graphs(self, partition_graphs, validate): """Load and process partition graphs. @@ -770,56 +857,73 @@ class DebugDumpDir(object): tensor dumps. Args: - partition_graphs: Partition graphs executed by the TensorFlow runtime, - represented as repeated fields of GraphDef. - If no partition_graph is available, use None. + partition_graphs: A repeated field of GraphDefs representing the + partition graphs executed by the TensorFlow runtime. validate: (`bool`) Whether the dump files are to be validated against the partition graphs. - """ - if partition_graphs: - self._partition_graphs = partition_graphs - elif self._dump_graph_file_paths: - # In case partition graphs are not available from arguments, load them - # from the dump directory. - self._partition_graphs = [ - _load_graph_def_from_event_file(dump_file_path) - for dump_file_path in self._dump_graph_file_paths - ] - else: - self._partition_graphs = None - return + Raises: + ValueError: If the partition GraphDef of one or more devices fail to be + loaded. + """ self._node_attributes = {} - self._node_inputs = {} self._node_ctrl_inputs = {} - self._node_recipients = {} self._node_ctrl_recipients = {} - - self._devices = [] self._node_devices = {} self._node_op_types = {} + self._copy_send_nodes = {} + + self._partition_graphs = {} + for device_name in self._device_names: + partition_graph = None + if device_name in self._dump_graph_file_paths: + partition_graph = _load_graph_def_from_event_file( + self._dump_graph_file_paths[device_name]) + else: + partition_graph = self._find_partition_graph(partition_graphs, + device_name) + + if partition_graph: + self._partition_graphs[device_name] = partition_graph - self._copy_send_nodes = [] + self._node_attributes[device_name] = {} + self._node_inputs[device_name] = {} + self._node_ctrl_inputs[device_name] = {} + self._node_recipients[device_name] = {} + self._node_ctrl_recipients[device_name] = {} + self._node_op_types[device_name] = {} + self._copy_send_nodes[device_name] = [] - for pg in self._partition_graphs: - for node in pg.node: - self._process_partition_graph_node(node) + if partition_graph: + for node in partition_graph.node: + self._process_partition_graph_node(device_name, node) - self._prune_non_control_edges_of_debug_ops() - self._prune_control_edges_of_debug_ops() + self._prune_non_control_edges_of_debug_ops(device_name) + self._prune_control_edges_of_debug_ops(device_name) - self._populate_recipient_maps() + self._populate_recipient_maps(device_name) - if validate: - self._validate_dump_with_graphs() + if device_name in self._partition_graphs and validate: + self._validate_dump_with_graphs(device_name) + + def _find_partition_graph(self, partition_graphs, device_name): + if partition_graphs is None: + return None + else: + for graph_def in partition_graphs: + for node_def in graph_def.node: + if node_def.device == device_name: + return graph_def + return None - def _process_partition_graph_node(self, node): + def _process_partition_graph_node(self, device_name, node): """Process a node from the partition graphs. Args: + device_name: (str) device name. node: (NodeDef) A partition-graph node to be processed. Raises: @@ -833,84 +937,91 @@ class DebugDumpDir(object): (watched_node_name, watched_output_slot, _, debug_op) = parse_debug_node_name(node.name) - self._debug_watches[watched_node_name][watched_output_slot].add( - debug_op) + self._debug_watches[device_name][watched_node_name][ + watched_output_slot].add(debug_op) return - if node.name in self._node_inputs: - raise ValueError("Duplicate node name: '%s'" % node.name) + if node.name in self._node_inputs[device_name]: + raise ValueError("Duplicate node name on device %s: '%s'" % + (device_name, node.name)) - self._node_attributes[node.name] = node.attr + self._node_attributes[device_name][node.name] = node.attr - if node.device not in self._devices and node.device: - self._devices.append(node.device) + self._node_inputs[device_name][node.name] = [] + self._node_ctrl_inputs[device_name][node.name] = [] + self._node_recipients[device_name][node.name] = [] + self._node_ctrl_recipients[device_name][node.name] = [] - self._node_inputs[node.name] = [] - self._node_ctrl_inputs[node.name] = [] - self._node_recipients[node.name] = [] - self._node_ctrl_recipients[node.name] = [] - - self._node_devices[node.name] = node.device - self._node_op_types[node.name] = node.op + if node.name not in self._node_devices: + self._node_devices[node.name] = set() + self._node_devices[node.name].add(node.device) + self._node_op_types[device_name][node.name] = node.op for inp in node.input: if is_copy_node(inp) and (node.op == "_Send" or node.op == "_Retval"): - self._copy_send_nodes.append(node.name) + self._copy_send_nodes[device_name].append(node.name) if inp.startswith("^"): cinp = inp[1:] - self._node_ctrl_inputs[node.name].append(cinp) + self._node_ctrl_inputs[device_name][node.name].append(cinp) else: - self._node_inputs[node.name].append(inp) + self._node_inputs[device_name][node.name].append(inp) - def _prune_nodes_from_input_and_recipient_maps(self, nodes_to_prune): + def _prune_nodes_from_input_and_recipient_maps(self, + device_name, + nodes_to_prune): """Prune nodes out of input and recipient maps. Args: + device_name: (`str`) device name. nodes_to_prune: (`list` of `str`) Names of the nodes to be pruned. """ for node in nodes_to_prune: - del self._node_inputs[node] - del self._node_ctrl_inputs[node] - del self._node_recipients[node] - del self._node_ctrl_recipients[node] + del self._node_inputs[device_name][node] + del self._node_ctrl_inputs[device_name][node] + del self._node_recipients[device_name][node] + del self._node_ctrl_recipients[device_name][node] - def _prune_non_control_edges_of_debug_ops(self): + def _prune_non_control_edges_of_debug_ops(self, device_name): """Prune (non-control) edges related to debug ops. Prune the Copy ops and associated _Send ops inserted by the debugger out from the non-control inputs and output recipients map. Replace the inputs and recipients with original ones. + + Args: + device_name: (`str`) device name. """ copy_nodes = [] - for node in self._node_inputs: - if node in self._copy_send_nodes: + for node in self._node_inputs[device_name]: + if node in self._copy_send_nodes[device_name]: continue if is_copy_node(node): copy_nodes.append(node) - inputs = self._node_inputs[node] + inputs = self._node_inputs[device_name][node] for i in xrange(len(inputs)): inp = inputs[i] if is_copy_node(inp): # Find the input to the Copy node, which should be the original # input to the node. - orig_inp = self._node_inputs[inp][0] + orig_inp = self._node_inputs[device_name][inp][0] inputs[i] = orig_inp - self._prune_nodes_from_input_and_recipient_maps(copy_nodes) - self._prune_nodes_from_input_and_recipient_maps(self._copy_send_nodes) + self._prune_nodes_from_input_and_recipient_maps(device_name, copy_nodes) + self._prune_nodes_from_input_and_recipient_maps( + device_name, self._copy_send_nodes[device_name]) - def _prune_control_edges_of_debug_ops(self): + def _prune_control_edges_of_debug_ops(self, device_name): """Prune control edges related to the debug ops.""" - for node in self._node_ctrl_inputs: - ctrl_inputs = self._node_ctrl_inputs[node] + for node in self._node_ctrl_inputs[device_name]: + ctrl_inputs = self._node_ctrl_inputs[device_name][node] debug_op_inputs = [] for ctrl_inp in ctrl_inputs: if is_debug_node(ctrl_inp): @@ -918,33 +1029,36 @@ class DebugDumpDir(object): for debug_op_inp in debug_op_inputs: ctrl_inputs.remove(debug_op_inp) - def _populate_recipient_maps(self): + def _populate_recipient_maps(self, device_name): """Populate the map from node name to recipient(s) of its output(s).""" - for node in self._node_inputs: - inputs = self._node_inputs[node] + for node in self._node_inputs[device_name]: + inputs = self._node_inputs[device_name][node] for inp in inputs: inp = get_node_name(inp) - if inp not in self._node_recipients: - self._node_recipients[inp] = [] - self._node_recipients[inp].append(node) + if inp not in self._node_recipients[device_name]: + self._node_recipients[device_name][inp] = [] + self._node_recipients[device_name][inp].append(node) - for node in self._node_ctrl_inputs: - ctrl_inputs = self._node_ctrl_inputs[node] + for node in self._node_ctrl_inputs[device_name]: + ctrl_inputs = self._node_ctrl_inputs[device_name][node] for ctrl_inp in ctrl_inputs: - if ctrl_inp in self._copy_send_nodes: + if ctrl_inp in self._copy_send_nodes[device_name]: continue - if ctrl_inp not in self._node_ctrl_recipients: - self._node_ctrl_recipients[ctrl_inp] = [] - self._node_ctrl_recipients[ctrl_inp].append(node) + if ctrl_inp not in self._node_ctrl_recipients[device_name]: + self._node_ctrl_recipients[device_name][ctrl_inp] = [] + self._node_ctrl_recipients[device_name][ctrl_inp].append(node) - def _validate_dump_with_graphs(self): + def _validate_dump_with_graphs(self, device_name): """Validate the dumped tensor data against the partition graphs. Only the watched nodes are validated by this method, because tfdbg allows clients to watch only a subset of the nodes. + Args: + device_name: (`str`) device name. + Raises: LookupError: If the partition graphs have not been loaded yet. ValueError: If dumps contain node names not found in partition graph. @@ -952,33 +1066,35 @@ class DebugDumpDir(object): input relations on the partition graphs. """ - if not self._partition_graphs: - raise LookupError("No partition graphs loaded.") + if not self._partition_graphs[device_name]: + raise LookupError( + "No partition graphs loaded for device %s" % device_name) # Verify that the node names in the dump data are all present in the # partition graphs. - for datum in self._dump_tensor_data: - if datum.node_name not in self._node_inputs: - raise ValueError("Node name '%s' is not found in partition graphs." % - datum.node_name) + for datum in self._dump_tensor_data[device_name]: + if datum.node_name not in self._node_inputs[device_name]: + raise ValueError("Node name '%s' is not found in partition graphs of " + "device %s." % (datum.node_name, device_name)) pending_inputs = {} - for node in self._node_inputs: + for node in self._node_inputs[device_name]: pending_inputs[node] = [] - inputs = self._node_inputs[node] + inputs = self._node_inputs[device_name][node] for inp in inputs: inp_node = get_node_name(inp) inp_output_slot = get_output_slot(inp) # Inputs from Enter and NextIteration nodes are not validated because # DebugNodeInserter::InsertNodes() in the debugger core skips creating # control edges from debug ops watching these types of nodes. - if (inp_node in self._debug_watches and - inp_output_slot in self._debug_watches[inp_node] and - self._node_op_types.get(inp) not in ("Enter", "NextIteration") and + if (inp_node in self._debug_watches[device_name] and + inp_output_slot in self._debug_watches[device_name][inp_node] and + self._node_op_types[device_name].get(inp) not in ( + "Enter", "NextIteration") and (inp_node, inp_output_slot) not in pending_inputs[node]): pending_inputs[node].append((inp_node, inp_output_slot)) - for i, datum in enumerate(self._dump_tensor_data): + for i, datum in enumerate(self._dump_tensor_data[device_name]): node = datum.node_name slot = datum.output_slot # In some cases (e.g., system clocks with insufficient precision), @@ -986,13 +1102,13 @@ class DebugDumpDir(object): # following check examines this possibility and avoids raising an error if # that is the case. if not self._satisfied_at_timestamp( - pending_inputs[node], datum.timestamp, start_i=i + 1): + device_name, pending_inputs[node], datum.timestamp, start_i=i + 1): raise ValueError("Causality violated in timing relations of debug " "dumps: %s (%d): " "these input(s) are not satisfied: %s" % (node, datum.timestamp, repr(pending_inputs[node]))) - recipients = self._node_recipients[node] + recipients = self._node_recipients[device_name][node] for recipient in recipients: recipient_pending_inputs = pending_inputs[recipient] if (node, slot) in recipient_pending_inputs: @@ -1004,12 +1120,13 @@ class DebugDumpDir(object): del recipient_pending_inputs[ recipient_pending_inputs.index((node, slot))] - def _satisfied_at_timestamp(self, pending, timestamp, start_i=0): + def _satisfied_at_timestamp(self, device_name, pending, timestamp, start_i=0): """Determine whether pending inputs are satisfied at given timestamp. Note: This method mutates the input argument "pending". Args: + device_name: (str) device name. pending: A list of 2-tuple (node_name, output_slot): the dependencies to check. timestamp: (int) the timestamp in question. @@ -1023,7 +1140,7 @@ class DebugDumpDir(object): if not pending: return True - for datum in self._dump_tensor_data[start_i:]: + for datum in self._dump_tensor_data[device_name][start_i:]: if datum.timestamp > timestamp: break if (datum.timestamp == timestamp and @@ -1042,7 +1159,7 @@ class DebugDumpDir(object): """Get the partition graphs. Returns: - Partition graphs as repeated fields of GraphDef. + Partition graphs as a list of GraphDef. Raises: LookupError: If no partition graphs have been loaded. @@ -1051,50 +1168,90 @@ class DebugDumpDir(object): if self._partition_graphs is None: raise LookupError("No partition graphs have been loaded.") - return self._partition_graphs + return self._partition_graphs.values() @property def run_fetches_info(self): """Get a str representation of the fetches used in the Session.run() call. Returns: - If the information is available, a `str` obtained from `repr(fetches)`. + If the information is available from one `Session.run` call, a `str` + obtained from `repr(fetches)`. + If the information is available from multiple `Session.run` calls, a + `list` of `str` from `repr(fetches)`. If the information is not available, `None`. """ - return self._run_fetches_info + output = self._run_fetches_info + return output[0] if len(output) == 1 else output @property def run_feed_keys_info(self): """Get a str representation of the feed_dict used in the Session.run() call. Returns: - If the information is available, a `str` obtained from `repr(feed_dict)`. + If the information is available from one `Session.run` call, a `str` + obtained from `repr(feed_dict)`. + If the information is available from multiple `Session.run` calls, a + `list` of `str` obtained from `repr(feed_dict)`. If the information is not available, `None`. """ - return self._run_feed_keys_info + output = self._run_feed_keys_info + return output[0] if len(output) == 1 else output + + def _infer_device_name(self, device_name, node_name): + if device_name is None: + if len(self.devices()) == 1: + return self.devices()[0] + else: + if node_name in self._node_devices: + if len(self._node_devices[node_name]) == 1: + return list(self._node_devices[node_name])[0] + else: + raise ValueError( + "There are multiple (%d) devices with nodes named '%s' but " + "device_name is not specified." % + (len(self._node_devices[node_name]), node_name)) + else: + raise ValueError("None of the %d devices has a node named '%s'." % + (len(self._device_names), node_name)) + else: + return device_name - def nodes(self): + def nodes(self, device_name=None): """Get a list of all nodes from the partition graphs. + Args: + device_name: (`str`) name of device. If there is only one device, this + argumnet is optional. + Returns: All nodes' names, as a list of str. Raises: LookupError: If no partition graphs have been loaded. + ValueError: If there are multiple devices, but device_name is not + specified. """ - if self._partition_graphs is None: raise LookupError("No partition graphs have been loaded.") + if device_name is None: + if len(self.devices()) == 1: + device_name = self.devices()[0] + else: + raise ValueError( + "There are multiple (%d) devices, but " + "device_name is not specified." % len(self.devices())) + return [node_name for node_name in self._node_inputs[device_name]] - return [node_name for node_name in self._node_inputs] - - def node_attributes(self, node_name): + def node_attributes(self, node_name, device_name=None): """Get the attributes of a node. Args: node_name: Name of the node in question. + device_name: (`str`) name of the device. If there is only one device or if + node_name exists on only one device, this argumnet is optional. Returns: Attributes of the node. @@ -1103,22 +1260,25 @@ class DebugDumpDir(object): LookupError: If no partition graphs have been loaded. ValueError: If no node named node_name exists. """ - if self._partition_graphs is None: raise LookupError("No partition graphs have been loaded.") - if node_name in self._node_attributes: - return self._node_attributes[node_name] + device_name = self._infer_device_name(device_name, node_name) + if node_name in self._node_attributes[device_name]: + return self._node_attributes[device_name][node_name] else: - raise ValueError("No node named \"%s\" exists." % node_name) + raise ValueError("No node named \"%s\" exists on device %s." % ( + node_name, device_name)) - def node_inputs(self, node_name, is_control=False): + def node_inputs(self, node_name, is_control=False, device_name=None): """Get the inputs of given node according to partition graphs. Args: node_name: Name of the node. is_control: (`bool`) Whether control inputs, rather than non-control inputs, are to be returned. + device_name: (`str`) name of the device. If there is only one device or if + node_name exists on only one device, this argumnet is optional. Returns: (`list` of `str`) inputs to the node, as a list of node names. @@ -1133,21 +1293,27 @@ class DebugDumpDir(object): raise LookupError( "Node inputs are not loaded from partition graphs yet.") - if node_name not in self._node_inputs: - raise ValueError("Node '%s' does not exist in partition graphs." % - node_name) + device_name = self._infer_device_name(device_name, node_name) + if node_name not in self._node_inputs[device_name]: + raise ValueError("Node '%s' does not exist in the partition graph of " + "device %s." % (node_name, device_name)) if is_control: - return self._node_ctrl_inputs[node_name] + return self._node_ctrl_inputs[device_name][node_name] else: - return self._node_inputs[node_name] + return self._node_inputs[device_name][node_name] - def transitive_inputs(self, node_name, include_control=True): + def transitive_inputs(self, + node_name, + include_control=True, + device_name=None): """Get the transitive inputs of given node according to partition graphs. Args: - node_name: Name of the node + node_name: Name of the node. include_control: Include control inputs (True by default). + device_name: (`str`) name of the device. If there is only one device or if + node_name exists on only one device, this argumnet is optional. Returns: (`list` of `str`) all transitive inputs to the node, as a list of node @@ -1163,9 +1329,11 @@ class DebugDumpDir(object): raise LookupError( "Node inputs are not loaded from partition graphs yet.") - if node_name not in self._node_inputs: - raise ValueError("Node '%s' does not exist in partition graphs." % - node_name) + device_name = self._infer_device_name(device_name, node_name) + if node_name not in self._node_inputs[device_name]: + raise ValueError( + "Node '%s' does not exist in the partition graph of device %s." % + (node_name, device_name)) inputs = [] @@ -1186,21 +1354,21 @@ class DebugDumpDir(object): # Stop the tracing at a Merge op, as it is generally impossible to infer # outside the runtime which input to the Merge op is alive. - if self._node_op_types[node] == "Merge": + if self._node_op_types[device_name][node] == "Merge": return if node in visited_nodes: return visited_nodes.append(node) - for inp in self._node_inputs[node]: + for inp in self._node_inputs[device_name][node]: if inp == node_name: continue inputs.append(inp) trace_inputs(inp) if include_control: - for ctrl_inp in self._node_ctrl_inputs[node]: + for ctrl_inp in self._node_ctrl_inputs[device_name][node]: if ctrl_inp == node_name: continue inputs.append(ctrl_inp) @@ -1210,13 +1378,15 @@ class DebugDumpDir(object): return inputs - def node_recipients(self, node_name, is_control=False): + def node_recipients(self, node_name, is_control=False, device_name=None): """Get recipient of the given node's output according to partition graphs. Args: node_name: (`str`) name of the node. is_control: (`bool`) whether control outputs, rather than non-control outputs, are to be returned. + device_name: (`str`) name of the device. If there is only one device or if + node_name exists on only one device, this argumnet is optional. Returns: (`list` of `str`) all inputs to the node, as a list of node names. @@ -1231,58 +1401,67 @@ class DebugDumpDir(object): raise LookupError( "Node recipients are not loaded from partition graphs yet.") - if node_name not in self._node_recipients: - raise ValueError("Node '%s' does not exist in partition graphs." % - node_name) + device_name = self._infer_device_name(device_name, node_name) + if node_name not in self._node_recipients[device_name]: + raise ValueError( + "Node '%s' does not exist in the partition graph of device %s." % + (node_name, device_name)) if is_control: - return self._node_ctrl_recipients[node_name] + return self._node_ctrl_recipients[device_name][node_name] else: - return self._node_recipients[node_name] + return self._node_recipients[device_name][node_name] def devices(self): - """Get the list of devices. + """Get the list of device names. Returns: (`list` of `str`) names of the devices. - - Raises: - LookupError: If node inputs and control inputs have not been loaded - from partition graphs yet. """ - if self._partition_graphs is None: - raise LookupError("Devices are not loaded from partition graphs yet.") - - return self._devices + return self._device_names - def node_exists(self, node_name): + def node_exists(self, node_name, device_name=None): """Test if a node exists in the partition graphs. Args: node_name: (`str`) name of the node to be checked. + device_name: optional device name. If None, will search for the node + on all available devices. Otherwise, search for the node only on + the given device. Returns: A boolean indicating whether the node exists. Raises: LookupError: If no partition graphs have been loaded yet. + ValueError: If device_name is specified but cannot be found. """ if self._node_inputs is None: raise LookupError( "Nodes have not been loaded from partition graphs yet.") - return node_name in self._node_inputs + if (device_name is not None) and device_name not in self._node_inputs: + raise ValueError( + "The specified device_name '%s' cannot be found." % device_name) + + node_inputs_all_devices = (self._node_inputs if device_name is None + else (self._node_inputs[device_name],)) + + return any(node_name in node_inputs_all_devices[dev_name] + for dev_name in node_inputs_all_devices) def node_device(self, node_name): - """Get the device of a node. + """Get the names of the devices that has nodes of the specified name. Args: node_name: (`str`) name of the node. Returns: - (`str`) name of the device on which the node is placed. + (`str` or `list` of `str`) name of the device(s) on which the node of the + given name is found. Returns a `str` if there is only one such device, + otherwise return a `list` of `str`. Raises: LookupError: If node inputs and control inputs have not been loaded @@ -1298,13 +1477,16 @@ class DebugDumpDir(object): raise ValueError("Node '%s' does not exist in partition graphs." % node_name) - return self._node_devices[node_name] + output = list(self._node_devices[node_name]) + return output[0] if len(output) == 1 else output - def node_op_type(self, node_name): + def node_op_type(self, node_name, device_name=None): """Get the op type of given node. Args: node_name: (`str`) name of the node. + device_name: (`str`) name of the device. If there is only one device or if + node_name exists on only one device, this argumnet is optional. Returns: (`str`) op type of the node. @@ -1319,17 +1501,21 @@ class DebugDumpDir(object): raise LookupError( "Node op types are not loaded from partition graphs yet.") - if node_name not in self._node_op_types: - raise ValueError("Node '%s' does not exist in partition graphs." % - node_name) + device_name = self._infer_device_name(device_name, node_name) + if node_name not in self._node_op_types[device_name]: + raise ValueError( + "Node '%s' does not exist in the partition graph of device '%s'. " % + (node_name, device_name)) - return self._node_op_types[node_name] + return self._node_op_types[device_name][node_name] - def debug_watch_keys(self, node_name): + def debug_watch_keys(self, node_name, device_name=None): """Get all tensor watch keys of given node according to partition graphs. Args: node_name: (`str`) name of the node. + device_name: (`str`) name of the device. If there is only one device or if + node_name exists on only one device, this argumnet is optional. Returns: (`list` of `str`) all debug tensor watch keys. Returns an empty list if @@ -1340,35 +1526,61 @@ class DebugDumpDir(object): partition graphs yet. """ - if node_name not in self._debug_watches: + try: + device_name = self._infer_device_name(device_name, node_name) + except ValueError: + return [] + + if node_name not in self._debug_watches[device_name]: return [] watch_keys = [] - for watched_slot in self._debug_watches[node_name]: - debug_ops = self._debug_watches[node_name][watched_slot] + for watched_slot in self._debug_watches[device_name][node_name]: + debug_ops = self._debug_watches[device_name][node_name][watched_slot] for debug_op in debug_ops: watch_keys.append( _get_tensor_watch_key(node_name, watched_slot, debug_op)) return watch_keys - def watch_key_to_data(self, debug_watch_key): + def watch_key_to_data(self, debug_watch_key, device_name=None): """Get all `DebugTensorDatum` instances corresponding to a debug watch key. Args: debug_watch_key: (`str`) debug watch key. + device_name: (`str`) name of the device. If there is only one device or if + the specified debug_watch_key exists on only one device, this argumnet + is optional. Returns: A list of `DebugTensorDatum` instances that correspond to the debug watch key. If the watch key does not exist, returns an empty list. Raises: - ValueError: If the debug watch key does not exist. + ValueError: If there are multiple devices that have the debug_watch_key, + but device_name is not specified. """ + if device_name is None: + matching_device_names = [ + name for name in self._watch_key_to_datum + if debug_watch_key in self._watch_key_to_datum[name]] + if not matching_device_names: + return [] + elif len(matching_device_names) == 1: + device_name = matching_device_names[0] + else: + raise ValueError( + "The debug watch key '%s' exists on multiple (%d) devices, but " + "device name is not specified." % + (debug_watch_key, len(matching_device_names))) + elif device_name not in self._debug_key_to_datum: + raise ValueError( + "There is no device named '%s' consisting of debug watch keys." % + device_name) - return self._watch_key_to_datum.get(debug_watch_key, []) + return self._watch_key_to_datum[device_name].get(debug_watch_key, []) - def find(self, predicate, first_n=0): + def find(self, predicate, first_n=0, device_name=None): """Find dumped tensor data by a certain predicate. Args: @@ -1386,6 +1598,7 @@ class DebugDumpDir(object): first_n: (`int`) return only the first n `DebugTensotDatum` instances (in time order) for which the predicate returns True. To return all the `DebugTensotDatum` instances, let first_n be <= 0. + device_name: optional device name. Returns: A list of all `DebugTensorDatum` objects in this `DebugDumpDir` object @@ -1394,22 +1607,31 @@ class DebugDumpDir(object): """ matched_data = [] - for datum in self._dump_tensor_data: - if predicate(datum, datum.get_tensor()): - matched_data.append(datum) + for device in (self._dump_tensor_data if device_name is None + else (self._dump_tensor_data[device_name],)): + for datum in self._dump_tensor_data[device]: + if predicate(datum, datum.get_tensor()): + matched_data.append(datum) - if first_n > 0 and len(matched_data) >= first_n: - break + if first_n > 0 and len(matched_data) >= first_n: + return matched_data return matched_data - def get_tensor_file_paths(self, node_name, output_slot, debug_op): + def get_tensor_file_paths(self, + node_name, + output_slot, + debug_op, + device_name=None): """Get the file paths from a debug-dumped tensor. Args: node_name: (`str`) name of the node that the tensor is produced by. output_slot: (`int`) output slot index of tensor. debug_op: (`str`) name of the debug op. + device_name: (`str`) name of the device. If there is only one device or if + the specified debug_watch_key exists on only one device, this argumnet + is optional. Returns: List of file path(s) loaded. This is a list because each debugged tensor @@ -1420,14 +1642,17 @@ class DebugDumpDir(object): the debug-dump data. """ + device_name = self._infer_device_name(device_name, node_name) watch_key = _get_tensor_watch_key(node_name, output_slot, debug_op) - if watch_key not in self._watch_key_to_datum: + if watch_key not in self._watch_key_to_datum[device_name]: raise WatchKeyDoesNotExistInDebugDumpDirError( - "Watch key \"%s\" does not exist in the debug dump" % watch_key) + "Watch key \"%s\" does not exist in the debug dump of device %s" % + (watch_key, device_name)) - return [datum.file_path for datum in self._watch_key_to_datum[watch_key]] + return [datum.file_path for datum in + self._watch_key_to_datum[device_name][watch_key]] - def get_tensors(self, node_name, output_slot, debug_op): + def get_tensors(self, node_name, output_slot, debug_op, device_name=None): """Get the tensor value from for a debug-dumped tensor. The tensor may be dumped multiple times in the dump root directory, so a @@ -1437,6 +1662,9 @@ class DebugDumpDir(object): node_name: (`str`) name of the node that the tensor is produced by. output_slot: (`int`) output slot index of tensor. debug_op: (`str`) name of the debug op. + device_name: (`str`) name of the device. If there is only one device or if + the specified debug_watch_key exists on only one device, this argumnet + is optional. Returns: List of tensors (`numpy.ndarray`) loaded from the debug-dump file(s). @@ -1447,13 +1675,20 @@ class DebugDumpDir(object): """ watch_key = _get_tensor_watch_key(node_name, output_slot, debug_op) - if watch_key not in self._watch_key_to_datum: + try: + device_name = self._infer_device_name(device_name, node_name) + return [datum.get_tensor() for datum in + self._watch_key_to_datum[device_name][watch_key]] + except (ValueError, KeyError): raise WatchKeyDoesNotExistInDebugDumpDirError( - "Watch key \"%s\" does not exist in the debug dump" % watch_key) - - return [datum.get_tensor() for datum in self._watch_key_to_datum[watch_key]] - - def get_rel_timestamps(self, node_name, output_slot, debug_op): + "Watch key \"%s\" does not exist in the debug dump of device %s" % + (watch_key, device_name)) + + def get_rel_timestamps(self, + node_name, + output_slot, + debug_op, + device_name=None): """Get the relative timestamp from for a debug-dumped tensor. Relative timestamp means (absolute timestamp - `t0`), where `t0` is the @@ -1465,6 +1700,9 @@ class DebugDumpDir(object): node_name: (`str`) name of the node that the tensor is produced by. output_slot: (`int`) output slot index of tensor. debug_op: (`str`) name of the debug op. + device_name: (`str`) name of the device. If there is only one device or if + the specified debug_watch_key exists on only one device, this argumnet + is optional. Returns: (`list` of `int`) list of relative timestamps. @@ -1474,14 +1712,20 @@ class DebugDumpDir(object): exist in the debug dump data. """ + device_name = self._infer_device_name(device_name, node_name) watch_key = _get_tensor_watch_key(node_name, output_slot, debug_op) - if watch_key not in self._watch_key_to_datum: + if watch_key not in self._watch_key_to_datum[device_name]: raise WatchKeyDoesNotExistInDebugDumpDirError( "Watch key \"%s\" does not exist in the debug dump" % watch_key) - return self._watch_key_to_rel_time[watch_key] + # TODO(cais): Figure out whether this should be relative to the global t0. + return self._watch_key_to_rel_time[device_name][watch_key] - def get_dump_sizes_bytes(self, node_name, output_slot, debug_op): + def get_dump_sizes_bytes(self, + node_name, + output_slot, + debug_op, + device_name=None): """Get the sizes of the dump files for a debug-dumped tensor. Unit of the file size: byte. @@ -1490,6 +1734,9 @@ class DebugDumpDir(object): node_name: (`str`) name of the node that the tensor is produced by. output_slot: (`int`) output slot index of tensor. debug_op: (`str`) name of the debug op. + device_name: (`str`) name of the device. If there is only one device or if + the specified debug_watch_key exists on only one device, this argumnet + is optional. Returns: (`list` of `int`): list of dump file sizes in bytes. @@ -1499,12 +1746,14 @@ class DebugDumpDir(object): exist in the debug dump data. """ + device_name = self._infer_device_name(device_name, node_name) watch_key = _get_tensor_watch_key(node_name, output_slot, debug_op) - if watch_key not in self._watch_key_to_datum: + if watch_key not in self._watch_key_to_datum[device_name]: raise WatchKeyDoesNotExistInDebugDumpDirError( - "Watch key \"%s\" does not exist in the debug dump" % watch_key) + "Watch key \"%s\" does not exist in the debug dump of device %s" % + (watch_key, device_name)) - return self._watch_key_to_dump_size_bytes[watch_key] + return self._watch_key_to_dump_size_bytes[device_name][watch_key] def node_traceback(self, element_name): """Try to retrieve the Python traceback of node's construction. diff --git a/tensorflow/python/debug/lib/debug_data_test.py b/tensorflow/python/debug/lib/debug_data_test.py index dc45e8df6c..dd621a5afc 100644 --- a/tensorflow/python/debug/lib/debug_data_test.py +++ b/tensorflow/python/debug/lib/debug_data_test.py @@ -23,12 +23,29 @@ import tempfile import numpy as np +from tensorflow.core.framework import graph_pb2 from tensorflow.core.framework import tensor_pb2 from tensorflow.python.debug.lib import debug_data from tensorflow.python.framework import test_util from tensorflow.python.platform import googletest +class DeviceNamePathConversionTest(test_util.TensorFlowTestCase): + + def testDeviceNameToDevicePath(self): + self.assertEqual( + debug_data.METADATA_FILE_PREFIX + debug_data.DEVICE_TAG + + ",job_ps,replica_1,task_2,cpu_0", + debug_data.device_name_to_device_path("/job:ps/replica:1/task:2/cpu:0")) + + def testDevicePathToDeviceName(self): + self.assertEqual( + "/job:ps/replica:1/task:2/cpu:0", + debug_data.device_path_to_device_name( + debug_data.METADATA_FILE_PREFIX + debug_data.DEVICE_TAG + + ",job_ps,replica_1,task_2,cpu_0")) + + class ParseNodeOrTensorNameTest(test_util.TensorFlowTestCase): def testParseNodeName(self): @@ -163,7 +180,10 @@ class DebugTensorDatumTest(test_util.TensorFlowTestCase): def testDebugDatum(self): dump_root = "/tmp/tfdbg_1" - debug_dump_rel_path = "ns1/ns2/node_a_1_2_DebugIdentity_1472563253536385" + debug_dump_rel_path = ( + debug_data.METADATA_FILE_PREFIX + debug_data.DEVICE_TAG + + ",job_localhost,replica_0,task_0,cpu_0" + + "/ns1/ns2/node_a_1_2_DebugIdentity_1472563253536385") datum = debug_data.DebugTensorDatum(dump_root, debug_dump_rel_path) @@ -175,16 +195,18 @@ class DebugTensorDatumTest(test_util.TensorFlowTestCase): self.assertEqual("ns1/ns2/node_a_1:2:DebugIdentity", datum.watch_key) self.assertEqual( os.path.join(dump_root, debug_dump_rel_path), datum.file_path) - self.assertEqual("{DebugTensorDatum: %s:%d @ %s @ %d}" % (datum.node_name, - datum.output_slot, - datum.debug_op, - datum.timestamp), - str(datum)) - self.assertEqual("{DebugTensorDatum: %s:%d @ %s @ %d}" % (datum.node_name, - datum.output_slot, - datum.debug_op, - datum.timestamp), - repr(datum)) + self.assertEqual( + "{DebugTensorDatum (/job:localhost/replica:0/task:0/cpu:0) " + "%s:%d @ %s @ %d}" % (datum.node_name, + datum.output_slot, + datum.debug_op, + datum.timestamp), str(datum)) + self.assertEqual( + "{DebugTensorDatum (/job:localhost/replica:0/task:0/cpu:0) " + "%s:%d @ %s @ %d}" % (datum.node_name, + datum.output_slot, + datum.debug_op, + datum.timestamp), repr(datum)) def testDumpSizeBytesIsNoneForNonexistentFilePath(self): dump_root = "/tmp/tfdbg_1" @@ -204,18 +226,112 @@ class DebugDumpDirTest(test_util.TensorFlowTestCase): # Tear down temporary dump directory. shutil.rmtree(self._dump_root) + def _makeDataDirWithMultipleDevicesAndDuplicateNodeNames(self): + cpu_0_dir = os.path.join( + self._dump_root, + debug_data.METADATA_FILE_PREFIX + debug_data.DEVICE_TAG + + ",job_localhost,replica_0,task_0,cpu_0") + gpu_0_dir = os.path.join( + self._dump_root, + debug_data.METADATA_FILE_PREFIX + debug_data.DEVICE_TAG + + ",job_localhost,replica_0,task_0,gpu_0") + gpu_1_dir = os.path.join( + self._dump_root, + debug_data.METADATA_FILE_PREFIX + debug_data.DEVICE_TAG + + ",job_localhost,replica_0,task_0,gpu_1") + os.makedirs(cpu_0_dir) + os.makedirs(gpu_0_dir) + os.makedirs(gpu_1_dir) + open(os.path.join( + cpu_0_dir, "node_foo_1_2_DebugIdentity_1472563253536386"), "wb") + open(os.path.join( + gpu_0_dir, "node_foo_1_2_DebugIdentity_1472563253536385"), "wb") + open(os.path.join( + gpu_1_dir, "node_foo_1_2_DebugIdentity_1472563253536387"), "wb") + def testDebugDumpDir_nonexistentDumpRoot(self): with self.assertRaisesRegexp(IOError, "does not exist"): debug_data.DebugDumpDir(tempfile.mktemp() + "_foo") def testDebugDumpDir_invalidFileNamingPattern(self): # File name with too few underscores should lead to an exception. - open(os.path.join(self._dump_root, "node1_DebugIdentity_1234"), "wb") + device_dir = os.path.join( + self._dump_root, + debug_data.METADATA_FILE_PREFIX + debug_data.DEVICE_TAG + + ",job_localhost,replica_0,task_0,cpu_0") + os.makedirs(device_dir) + open(os.path.join(device_dir, "node1_DebugIdentity_1234"), "wb") with self.assertRaisesRegexp(ValueError, "does not conform to the naming pattern"): debug_data.DebugDumpDir(self._dump_root) + def testDebugDumpDir_validDuplicateNodeNamesWithMultipleDevices(self): + self._makeDataDirWithMultipleDevicesAndDuplicateNodeNames() + + graph_cpu_0 = graph_pb2.GraphDef() + node = graph_cpu_0.node.add() + node.name = "node_foo_1" + node.op = "FooOp" + node.device = "/job:localhost/replica:0/task:0/cpu:0" + graph_gpu_0 = graph_pb2.GraphDef() + node = graph_gpu_0.node.add() + node.name = "node_foo_1" + node.op = "FooOp" + node.device = "/job:localhost/replica:0/task:0/gpu:0" + graph_gpu_1 = graph_pb2.GraphDef() + node = graph_gpu_1.node.add() + node.name = "node_foo_1" + node.op = "FooOp" + node.device = "/job:localhost/replica:0/task:0/gpu:1" + + dump_dir = debug_data.DebugDumpDir( + self._dump_root, + partition_graphs=[graph_cpu_0, graph_gpu_0, graph_gpu_1]) + + self.assertItemsEqual( + ["/job:localhost/replica:0/task:0/cpu:0", + "/job:localhost/replica:0/task:0/gpu:0", + "/job:localhost/replica:0/task:0/gpu:1"], dump_dir.devices()) + self.assertEqual(1472563253536385, dump_dir.t0) + self.assertEqual(3, dump_dir.size) + + with self.assertRaisesRegexp( + ValueError, + r"There are multiple \(3\) devices, but device_name is not specified"): + dump_dir.nodes() + self.assertItemsEqual( + ["node_foo_1"], + dump_dir.nodes(device_name="/job:localhost/replica:0/task:0/cpu:0")) + + def testDuplicateNodeNamesInGraphDefOfSingleDeviceRaisesException(self): + self._makeDataDirWithMultipleDevicesAndDuplicateNodeNames() + graph_cpu_0 = graph_pb2.GraphDef() + node = graph_cpu_0.node.add() + node.name = "node_foo_1" + node.op = "FooOp" + node.device = "/job:localhost/replica:0/task:0/cpu:0" + graph_gpu_0 = graph_pb2.GraphDef() + node = graph_gpu_0.node.add() + node.name = "node_foo_1" + node.op = "FooOp" + node.device = "/job:localhost/replica:0/task:0/gpu:0" + graph_gpu_1 = graph_pb2.GraphDef() + node = graph_gpu_1.node.add() + node.name = "node_foo_1" + node.op = "FooOp" + node.device = "/job:localhost/replica:0/task:0/gpu:1" + node = graph_gpu_1.node.add() # Here is the duplicate. + node.name = "node_foo_1" + node.op = "FooOp" + node.device = "/job:localhost/replica:0/task:0/gpu:1" + + with self.assertRaisesRegexp( + ValueError, r"Duplicate node name on device "): + debug_data.DebugDumpDir( + self._dump_root, + partition_graphs=[graph_cpu_0, graph_gpu_0, graph_gpu_1]) + def testDebugDumpDir_emptyDumpDir(self): dump_dir = debug_data.DebugDumpDir(self._dump_root) diff --git a/tensorflow/python/debug/lib/session_debug_multi_gpu_test.py b/tensorflow/python/debug/lib/session_debug_multi_gpu_test.py new file mode 100644 index 0000000000..b0dc25851c --- /dev/null +++ b/tensorflow/python/debug/lib/session_debug_multi_gpu_test.py @@ -0,0 +1,93 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for debugger functionalities under multiple (i.e., >1) GPUs.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import shutil +import tempfile + +from tensorflow.core.protobuf import config_pb2 +from tensorflow.python.client import device_lib +from tensorflow.python.client import session +from tensorflow.python.debug.lib import debug_data +from tensorflow.python.debug.lib import debug_utils +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops +from tensorflow.python.framework import test_util +from tensorflow.python.ops import math_ops +from tensorflow.python.ops import variables +from tensorflow.python.platform import googletest + + +class SessionDebugMultiGPUTest(test_util.TensorFlowTestCase): + + def setUp(self): + self._dump_root = tempfile.mkdtemp() + + def tearDown(self): + ops.reset_default_graph() + + # Tear down temporary dump directory. + if os.path.isdir(self._dump_root): + shutil.rmtree(self._dump_root) + + def testMultiGPUSessionRun(self): + local_devices = device_lib.list_local_devices() + gpu_device_names = [] + for device in local_devices: + if device.device_type == "GPU": + gpu_device_names.append(device.name) + gpu_device_names = sorted(gpu_device_names) + + if len(gpu_device_names) < 2: + self.skipTest( + "This test requires at least 2 GPUs, but only %d is available." % + len(gpu_device_names)) + + with session.Session() as sess: + v = variables.Variable([10.0, 15.0], dtype=dtypes.float32, name="v") + with ops.device(gpu_device_names[0]): + u0 = math_ops.add(v, v, name="u0") + with ops.device(gpu_device_names[1]): + u1 = math_ops.multiply(v, v, name="u1") + w = math_ops.subtract(u1, u0, name="w") + + sess.run(v.initializer) + + run_options = config_pb2.RunOptions(output_partition_graphs=True) + debug_utils.watch_graph(run_options, sess.graph, + debug_urls="file://" + self._dump_root) + run_metadata = config_pb2.RunMetadata() + self.assertAllClose( + [80.0, 195.0], + sess.run(w, options=run_options, run_metadata=run_metadata)) + + debug_dump_dir = debug_data.DebugDumpDir( + self._dump_root, partition_graphs=run_metadata.partition_graphs) + self.assertEqual(3, len(debug_dump_dir.devices())) + self.assertAllClose( + [10.0, 15.0], debug_dump_dir.get_tensors("v", 0, "DebugIdentity")[0]) + self.assertAllClose( + [20.0, 30.0], debug_dump_dir.get_tensors("u0", 0, "DebugIdentity")[0]) + self.assertAllClose( + [100.0, 225.0], + debug_dump_dir.get_tensors("u1", 0, "DebugIdentity")[0]) + + +if __name__ == "__main__": + googletest.main() diff --git a/tensorflow/python/debug/lib/session_debug_testlib.py b/tensorflow/python/debug/lib/session_debug_testlib.py index 1b4444ef5a..a14ff1af3c 100644 --- a/tensorflow/python/debug/lib/session_debug_testlib.py +++ b/tensorflow/python/debug/lib/session_debug_testlib.py @@ -250,8 +250,12 @@ class SessionDebugTestBase(test_util.TensorFlowTestCase): self.assertIn(results.v.op.type, results.dump.node_op_type(results.v_name)) self.assertIn(results.w.op.type, results.dump.node_op_type(results.w_name)) - with self.assertRaisesRegexp( - ValueError, "Node 'foo_bar' does not exist in partition graphs."): + if test_util.gpu_device_name(): + expected_error_regexp = r"None of the .* devices has a node named " + else: + expected_error_regexp = ( + r"Node \'foo_bar\' does not exist in the partition graph of device") + with self.assertRaisesRegexp(ValueError, expected_error_regexp): results.dump.node_op_type("foo_bar") def testDumpStringTensorsWorks(self): @@ -437,9 +441,11 @@ class SessionDebugTestBase(test_util.TensorFlowTestCase): # Verify dump files self.assertTrue(os.path.isdir(self._dump_root)) - self.assertTrue(os.path.isdir(os.path.join(self._dump_root, u_namespace))) - self.assertTrue( - os.path.isdir(os.path.join(self._dump_root, v_namespace, "v"))) + u_glob_out = glob.glob(os.path.join(self._dump_root, "*", u_namespace)) + v_glob_out = glob.glob(os.path.join( + self._dump_root, "*", v_namespace, "v")) + self.assertTrue(os.path.isdir(u_glob_out[0])) + self.assertTrue(os.path.isdir(v_glob_out[0])) dump = debug_data.DebugDumpDir( self._dump_root, partition_graphs=run_metadata.partition_graphs) @@ -689,7 +695,11 @@ class SessionDebugTestBase(test_util.TensorFlowTestCase): u_read_name = u_name + "/read" # Test node name list lookup of the DebugDumpDir object. - node_names = dump.nodes() + if test_util.gpu_device_name(): + node_names = dump.nodes( + device_name="/job:localhost/replica:0/task:0/gpu:0") + else: + node_names = dump.nodes() self.assertTrue(u_name in node_names) self.assertTrue(u_read_name in node_names) @@ -699,7 +709,11 @@ class SessionDebugTestBase(test_util.TensorFlowTestCase): self.assertEqual(1, len(u_attr["shape"].shape.dim)) self.assertEqual(2, u_attr["shape"].shape.dim[0].size) - with self.assertRaisesRegexp(ValueError, "No node named \"foo\" exists"): + if test_util.gpu_device_name(): + expected_error_regexp = r"None of the .* devices has a node named " + else: + expected_error_regexp = r"No node named \"foo\" exists" + with self.assertRaisesRegexp(ValueError, expected_error_regexp): dump.node_attributes("foo") def testGraphStructureLookupGivesDebugWatchKeys(self): @@ -722,7 +736,6 @@ class SessionDebugTestBase(test_util.TensorFlowTestCase): self.assertEqual(0, u_data[0].output_slot) self.assertEqual("DebugIdentity", u_data[0].debug_op) self.assertGreaterEqual(u_data[0].timestamp, 0) - self.assertEqual([], dump.watch_key_to_data("foo")) def testGraphStructureLookupGivesNodeInputsAndRecipients(self): @@ -753,12 +766,13 @@ class SessionDebugTestBase(test_util.TensorFlowTestCase): self.assertEqual([], dump.node_recipients(w_name, is_control=True)) # Test errors raised on invalid node names. - with self.assertRaisesRegexp(ValueError, - "does not exist in partition graphs"): + if test_util.gpu_device_name(): + expected_error_regexp = r"None of the .* devices has a node named " + else: + expected_error_regexp = "does not exist in the partition graph of device " + with self.assertRaisesRegexp(ValueError, expected_error_regexp): dump.node_inputs(u_name + "foo") - - with self.assertRaisesRegexp(ValueError, - "does not exist in partition graphs"): + with self.assertRaisesRegexp(ValueError, expected_error_regexp): dump.node_recipients(u_name + "foo") # Test transitive_inputs(). @@ -769,8 +783,7 @@ class SessionDebugTestBase(test_util.TensorFlowTestCase): self.assertEqual( set([u_name, u_read_name, v_name]), set(dump.transitive_inputs(w_name))) - with self.assertRaisesRegexp(ValueError, - "does not exist in partition graphs"): + with self.assertRaisesRegexp(ValueError, expected_error_regexp): dump.transitive_inputs(u_name + "foo") def testGraphStructureLookupWithoutPartitionGraphsDoesNotErrorOut(self): @@ -1067,10 +1080,12 @@ class SessionDebugTestBase(test_util.TensorFlowTestCase): y = array_ops.squeeze(ph, name="mismatch/y") run_options = config_pb2.RunOptions(output_partition_graphs=True) + run_metadata = config_pb2.RunMetadata() debug_utils.watch_graph( run_options, sess.graph, debug_urls=self._debug_urls(), global_step=1) - sess.run(x, feed_dict={ph: np.array([[7.0, 8.0]])}, options=run_options) + sess.run(x, feed_dict={ph: np.array([[7.0, 8.0]])}, options=run_options, + run_metadata=run_metadata) dump1 = debug_data.DebugDumpDir(self._dump_root) self.assertEqual(1, dump1.core_metadata.global_step) self.assertGreaterEqual(dump1.core_metadata.session_run_index, 0) diff --git a/tensorflow/python/framework/importer.py b/tensorflow/python/framework/importer.py index ffce6ce4c8..c2fc7e3af9 100644 --- a/tensorflow/python/framework/importer.py +++ b/tensorflow/python/framework/importer.py @@ -437,6 +437,7 @@ def import_graph_def(graph_def, input_map=None, return_elements=None, 'WholeFileReader', 'TextLineReader', 'FixedLengthRecordReader', 'TFRecordReader', 'IdentityReader', + 'LMDBReader', 'RefSwitch', 'RefEnter', 'RefNextIteration', 'RefMerge', 'RefIdentity']: pass diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD index 79c7905427..242c396acb 100644 --- a/tensorflow/python/kernel_tests/BUILD +++ b/tensorflow/python/kernel_tests/BUILD @@ -947,7 +947,7 @@ cuda_py_test( tags = ["notsan"], ) -tf_py_test( +cuda_py_test( name = "diag_op_test", size = "medium", srcs = ["diag_op_test.py"], @@ -979,6 +979,7 @@ tf_py_test( "//tensorflow/python:util", "//tensorflow/python:variables", ], + data = ["//tensorflow/core:lmdb_testdata"], ) cuda_py_test( diff --git a/tensorflow/python/kernel_tests/basic_gpu_test.py b/tensorflow/python/kernel_tests/basic_gpu_test.py index dbbc2de811..013aa1ba8a 100644 --- a/tensorflow/python/kernel_tests/basic_gpu_test.py +++ b/tensorflow/python/kernel_tests/basic_gpu_test.py @@ -129,7 +129,7 @@ class MathBuiltinUnaryTest(test.TestCase): for dtype in [np.float32]: self._testDtype(dtype, use_gpu=True) - def testFloorDevide(self): + def testFloorDivide(self): x = (1 + np.linspace(0, 5, np.prod([1, 3, 2]))).astype(np.float32).reshape( [1, 3, 2]) y = (1 + np.linspace(0, 5, np.prod([1, 3, 2]))).astype(np.float32).reshape( diff --git a/tensorflow/python/kernel_tests/diag_op_test.py b/tensorflow/python/kernel_tests/diag_op_test.py index 4744e68051..f0b7885732 100644 --- a/tensorflow/python/kernel_tests/diag_op_test.py +++ b/tensorflow/python/kernel_tests/diag_op_test.py @@ -39,21 +39,23 @@ class MatrixDiagTest(test.TestCase): self.assertEqual((3, 3), v_diag.get_shape()) self.assertAllEqual(v_diag.eval(), mat) - def testBatchVector(self): + def _testBatchVector(self, dtype): with self.test_session(use_gpu=True): - v_batch = np.array([[1.0, 2.0, 3.0], - [4.0, 5.0, 6.0]]) - mat_batch = np.array( - [[[1.0, 0.0, 0.0], - [0.0, 2.0, 0.0], - [0.0, 0.0, 3.0]], - [[4.0, 0.0, 0.0], - [0.0, 5.0, 0.0], - [0.0, 0.0, 6.0]]]) + v_batch = np.array([[1.0, 0.0, 3.0], [4.0, 5.0, 6.0]]).astype(dtype) + mat_batch = np.array([[[1.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 3.0]], + [[4.0, 0.0, 0.0], [0.0, 5.0, 0.0], + [0.0, 0.0, 6.0]]]).astype(dtype) v_batch_diag = array_ops.matrix_diag(v_batch) self.assertEqual((2, 3, 3), v_batch_diag.get_shape()) self.assertAllEqual(v_batch_diag.eval(), mat_batch) + def testBatchVector(self): + self._testBatchVector(np.float32) + self._testBatchVector(np.float64) + self._testBatchVector(np.int32) + self._testBatchVector(np.int64) + self._testBatchVector(np.bool) + def testInvalidShape(self): with self.assertRaisesRegexp(ValueError, "must be at least rank 1"): array_ops.matrix_diag(0) @@ -108,29 +110,29 @@ class MatrixSetDiagTest(test.TestCase): self.assertEqual((3, 2), output.get_shape()) self.assertAllEqual(expected, output.eval()) - def testSquareBatch(self): + def _testSquareBatch(self, dtype): with self.test_session(use_gpu=True): - v_batch = np.array([[-1.0, -2.0, -3.0], - [-4.0, -5.0, -6.0]]) - mat_batch = np.array( - [[[1.0, 0.0, 3.0], - [0.0, 2.0, 0.0], - [1.0, 0.0, 3.0]], - [[4.0, 0.0, 4.0], - [0.0, 5.0, 0.0], - [2.0, 0.0, 6.0]]]) + v_batch = np.array([[-1.0, 0.0, -3.0], [-4.0, -5.0, -6.0]]).astype(dtype) + mat_batch = np.array([[[1.0, 0.0, 3.0], [0.0, 2.0, 0.0], [1.0, 0.0, 3.0]], + [[4.0, 0.0, 4.0], [0.0, 5.0, 0.0], + [2.0, 0.0, 6.0]]]).astype(dtype) + + mat_set_diag_batch = np.array([[[-1.0, 0.0, 3.0], [0.0, 0.0, 0.0], + [1.0, 0.0, -3.0]], + [[-4.0, 0.0, 4.0], [0.0, -5.0, 0.0], + [2.0, 0.0, -6.0]]]).astype(dtype) - mat_set_diag_batch = np.array( - [[[-1.0, 0.0, 3.0], - [0.0, -2.0, 0.0], - [1.0, 0.0, -3.0]], - [[-4.0, 0.0, 4.0], - [0.0, -5.0, 0.0], - [2.0, 0.0, -6.0]]]) output = array_ops.matrix_set_diag(mat_batch, v_batch) self.assertEqual((2, 3, 3), output.get_shape()) self.assertAllEqual(mat_set_diag_batch, output.eval()) + def testSquareBatch(self): + self._testSquareBatch(np.float32) + self._testSquareBatch(np.float64) + self._testSquareBatch(np.int32) + self._testSquareBatch(np.int64) + self._testSquareBatch(np.bool) + def testRectangularBatch(self): with self.test_session(use_gpu=True): v_batch = np.array([[-1.0, -2.0], @@ -220,22 +222,24 @@ class MatrixDiagPartTest(test.TestCase): mat_diag = array_ops.matrix_diag_part(mat) self.assertAllEqual(mat_diag.eval(), np.array([1.0, 4.0])) - def testSquareBatch(self): + def _testSquareBatch(self, dtype): with self.test_session(use_gpu=True): - v_batch = np.array([[1.0, 2.0, 3.0], - [4.0, 5.0, 6.0]]) - mat_batch = np.array( - [[[1.0, 0.0, 0.0], - [0.0, 2.0, 0.0], - [0.0, 0.0, 3.0]], - [[4.0, 0.0, 0.0], - [0.0, 5.0, 0.0], - [0.0, 0.0, 6.0]]]) + v_batch = np.array([[1.0, 0.0, 3.0], [4.0, 5.0, 6.0]]).astype(dtype) + mat_batch = np.array([[[1.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 3.0]], + [[4.0, 0.0, 0.0], [0.0, 5.0, 0.0], + [0.0, 0.0, 6.0]]]).astype(dtype) self.assertEqual(mat_batch.shape, (2, 3, 3)) mat_batch_diag = array_ops.matrix_diag_part(mat_batch) self.assertEqual((2, 3), mat_batch_diag.get_shape()) self.assertAllEqual(mat_batch_diag.eval(), v_batch) + def testSquareBatch(self): + self._testSquareBatch(np.float32) + self._testSquareBatch(np.float64) + self._testSquareBatch(np.int32) + self._testSquareBatch(np.int64) + self._testSquareBatch(np.bool) + def testRectangularBatch(self): with self.test_session(use_gpu=True): v_batch = np.array([[1.0, 2.0], diff --git a/tensorflow/python/kernel_tests/reader_ops_test.py b/tensorflow/python/kernel_tests/reader_ops_test.py index 10f34751d0..0038f4bbf3 100644 --- a/tensorflow/python/kernel_tests/reader_ops_test.py +++ b/tensorflow/python/kernel_tests/reader_ops_test.py @@ -858,5 +858,48 @@ class AsyncReaderTest(test.TestCase): output.append(sess.run(args)) +class LMDBReaderTest(test.TestCase): + + def setUp(self): + super(LMDBReaderTest, self).setUp() + + def testReadFromFile(self): + with self.test_session() as sess: + reader = io_ops.LMDBReader(name="test_read_from_file") + path = os.path.join("tensorflow", "core", "lib", "lmdb", "testdata", + "data.mdb") + queue = data_flow_ops.FIFOQueue(99, [dtypes.string], shapes=()) + key, value = reader.read(queue) + + queue.enqueue([path]).run() + queue.close().run() + for i in range(10): + k, v = sess.run([key, value]) + self.assertAllEqual(compat.as_bytes(k), compat.as_bytes(str(i))) + self.assertAllEqual(compat.as_bytes(v), compat.as_bytes(str(chr(ord('a') + i)))) + + with self.assertRaisesOpError("is closed and has insufficient elements " + "\\(requested 1, current size 0\\)"): + k, v = sess.run([key, value]) + + def testReadFromFolder(self): + with self.test_session() as sess: + reader = io_ops.LMDBReader(name="test_read_from_folder") + path = os.path.join("tensorflow", "core", "lib", "lmdb", "testdata") + queue = data_flow_ops.FIFOQueue(99, [dtypes.string], shapes=()) + key, value = reader.read(queue) + + queue.enqueue([path]).run() + queue.close().run() + for i in range(10): + k, v = sess.run([key, value]) + self.assertAllEqual(compat.as_bytes(k), compat.as_bytes(str(i))) + self.assertAllEqual(compat.as_bytes(v), compat.as_bytes(str(chr(ord('a') + i)))) + + with self.assertRaisesOpError("is closed and has insufficient elements " + "\\(requested 1, current size 0\\)"): + k, v = sess.run([key, value]) + + if __name__ == "__main__": test.main() diff --git a/tensorflow/python/layers/convolutional.py b/tensorflow/python/layers/convolutional.py index b61168695a..49dcd2370c 100644 --- a/tensorflow/python/layers/convolutional.py +++ b/tensorflow/python/layers/convolutional.py @@ -149,39 +149,13 @@ class _Conv(base.Layer): self.built = True def call(self, inputs): - if (self.data_format == 'channels_first' and - not framework.test_util.gpu_device_name()): - # `nn.convolution` is not implemented on CPU for `channels_first` format. - # In cases where we are most likely running on CPU using `channels_first`, - # we reshape the inputs to use `channels_last` (and reshape them back - # afterwards). This is a temporary fix; a better solution would be a fix - # at the op level. - # TODO(chollet): remove this when `nn.convolution` is feature-complete. - data_format = 'channels_last' - if self.rank == 1: - inputs = array_ops.transpose(inputs, (0, 2, 1)) - elif self.rank == 2: - inputs = array_ops.transpose(inputs, (0, 2, 3, 1)) - elif self.rank == 3: - inputs = array_ops.transpose(inputs, (0, 2, 3, 4, 1)) - else: - data_format = self.data_format outputs = nn.convolution( input=inputs, filter=self.kernel, dilation_rate=self.dilation_rate, strides=self.strides, padding=self.padding.upper(), - data_format=utils.convert_data_format(data_format, - self.rank + 2)) - if (self.data_format == 'channels_first' and - not framework.test_util.gpu_device_name()): - if self.rank == 1: - outputs = array_ops.transpose(outputs, (0, 2, 1)) - elif self.rank == 2: - outputs = array_ops.transpose(outputs, (0, 3, 1, 2)) - elif self.rank == 3: - outputs = array_ops.transpose(outputs, (0, 4, 1, 2, 3)) + data_format=utils.convert_data_format(self.data_format, self.rank + 2)) if self.bias is not None: if self.data_format == 'channels_first': @@ -202,18 +176,10 @@ class _Conv(base.Layer): [outputs_shape[0], outputs_shape[1], outputs_shape[2] * outputs_shape[3], outputs_shape[4]]) - outputs_4d = nn.bias_add( - outputs_4d, - self.bias, - data_format=utils.convert_data_format(self.data_format, 4)) + outputs_4d = nn.bias_add(outputs_4d, self.bias, data_format='NCHW') outputs = array_ops.reshape(outputs_4d, outputs_shape) else: - outputs = nn.bias_add( - outputs, - self.bias, - data_format=utils.convert_data_format(self.data_format, 4)) - # Note that we passed rank=4 because bias_add will only accept - # NHWC and NCWH even if the rank of the inputs is 3 or 5. + outputs = nn.bias_add(outputs, self.bias, data_format='NHWC') if self.activation is not None: return self.activation(outputs) diff --git a/tensorflow/python/layers/pooling.py b/tensorflow/python/layers/pooling.py index 6cd644b642..e903afa0a8 100644 --- a/tensorflow/python/layers/pooling.py +++ b/tensorflow/python/layers/pooling.py @@ -262,16 +262,7 @@ class _Pooling2D(base.Layer): self.input_spec = base.InputSpec(ndim=4) def call(self, inputs): - if (self.data_format == 'channels_first' and - not framework.test_util.gpu_device_name()): - # `nn.convolution` is not implemented on CPU for `channels_first` format. - # TODO(chollet): remove this when `nn.convolution` is feature-complete. - data_format = 'channels_last' - inputs = array_ops.transpose(inputs, (0, 2, 3, 1)) - else: - data_format = self.data_format - - if data_format == 'channels_last': + if self.data_format == 'channels_last': pool_shape = (1,) + self.pool_size + (1,) strides = (1,) + self.strides + (1,) else: @@ -282,11 +273,7 @@ class _Pooling2D(base.Layer): ksize=pool_shape, strides=strides, padding=self.padding.upper(), - data_format=utils.convert_data_format(data_format, 4)) - - if (self.data_format == 'channels_first' and - not framework.test_util.gpu_device_name()): - outputs = array_ops.transpose(outputs, (0, 3, 1, 2)) + data_format=utils.convert_data_format(self.data_format, 4)) return outputs def _compute_output_shape(self, input_shape): diff --git a/tensorflow/python/ops/hidden_ops.txt b/tensorflow/python/ops/hidden_ops.txt index 06adfc5066..553e0dc135 100644 --- a/tensorflow/python/ops/hidden_ops.txt +++ b/tensorflow/python/ops/hidden_ops.txt @@ -191,6 +191,7 @@ WholeFileReader TextLineReaderV2 TFRecordReaderV2 WholeFileReaderV2 +LMDBReader # linalg_ops BatchCholesky diff --git a/tensorflow/python/ops/io_ops.py b/tensorflow/python/ops/io_ops.py index 68ecc219e4..8f5d0e5cd4 100644 --- a/tensorflow/python/ops/io_ops.py +++ b/tensorflow/python/ops/io_ops.py @@ -26,6 +26,7 @@ See the @{$python/io_ops} guide. @@WholeFileReader @@IdentityReader @@TFRecordReader +@@LMDBReader @@FixedLengthRecordReader @@decode_csv @@decode_raw @@ -443,6 +444,25 @@ class TFRecordReader(ReaderBase): ops.NotDifferentiable("TFRecordReader") +class LMDBReader(ReaderBase): + """A Reader that outputs the records from a LMDB file. + + See ReaderBase for supported methods. + """ + def __init__(self, name=None, options=None): + """Create a LMDBReader. + + Args: + name: A name for the operation (optional). + options: A LMDBRecordOptions object (optional). + """ + rr = gen_io_ops._lmdb_reader(name=name) + super(LMDBReader, self).__init__(rr) + + +ops.NotDifferentiable("LMDBReader") + + class IdentityReader(ReaderBase): """A Reader that outputs the queued work as both the key and value. diff --git a/tensorflow/python/ops/script_ops.py b/tensorflow/python/ops/script_ops.py index ebdb137fb9..fe532fa186 100644 --- a/tensorflow/python/ops/script_ops.py +++ b/tensorflow/python/ops/script_ops.py @@ -26,6 +26,7 @@ from __future__ import print_function import threading import numpy as np +import six from tensorflow.python import pywrap_tensorflow from tensorflow.python.framework import function @@ -81,6 +82,10 @@ class FuncRegistry(object): if func is None: raise ValueError("callback %s is not found" % token) ret = func(*args) + # Strings seem to lead to a memory leak here if they're not wrapped in a + # list. + if isinstance(ret, six.binary_type): + ret = [ret] # Ensures that we return either a single numpy array or a list of numpy # arrays. if isinstance(ret, (tuple, list)): diff --git a/tensorflow/python/training/adam.py b/tensorflow/python/training/adam.py index 5ae82524a3..459c735ea3 100644 --- a/tensorflow/python/training/adam.py +++ b/tensorflow/python/training/adam.py @@ -113,10 +113,14 @@ class AdamOptimizer(optimizer.Optimizer): def _create_slots(self, var_list): # Create the beta1 and beta2 accumulators on the same device as the first - # variable. + # variable. Sort the var_list to make sure this device is consistent across + # workers (these need to go on the same PS, otherwise some updates are + # silently ignored). + first_var = min(var_list, key=lambda x: x.name) + if (self._beta1_power is None or - self._beta1_power.graph is not var_list[0].graph): - with ops.colocate_with(var_list[0]): + self._beta1_power.graph is not first_var.graph): + with ops.colocate_with(first_var): self._beta1_power = variable_scope.variable(self._beta1, name="beta1_power", trainable=False) diff --git a/tensorflow/python/util/tf_should_use.py b/tensorflow/python/util/tf_should_use.py index 88df3351e6..05c99856d2 100644 --- a/tensorflow/python/util/tf_should_use.py +++ b/tensorflow/python/util/tf_should_use.py @@ -17,14 +17,52 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import collections import functools +import itertools import traceback import types +import six # pylint: disable=unused-import + +from backports import weakref # pylint: disable=g-bad-import-order + from tensorflow.python.platform import tf_logging from tensorflow.python.util import tf_decorator +class _RefInfoField( + collections.namedtuple( + '_RefInfoField', ('type_', 'repr_', 'creation_stack', 'object_used'))): + pass + + +# Thread-safe up to int32max/2 thanks to python's GIL; and may be safe even for +# higher values in Python 3.4+. We don't expect to ever count higher than this. +# https://mail.python.org/pipermail/python-list/2005-April/342279.html +_REF_ITER = itertools.count() + +# Dictionary mapping id(obj) => _RefInfoField. +_REF_INFO = {} + + +def _deleted(obj_id, fatal_error): + obj = _REF_INFO[obj_id] + del _REF_INFO[obj_id] + if not obj.object_used: + if fatal_error: + logger = tf_logging.fatal + else: + logger = tf_logging.error + logger( + '==================================\n' + 'Object was never used (type %s):\n%s\nIf you want to mark it as ' + 'used call its "mark_used()" method.\nIt was originally created ' + 'here:\n%s\n' + '==================================' % + (obj.type_, obj.repr_, obj.creation_stack)) + + def _add_should_use_warning(x, fatal_error=False): """Wraps object x so that if it is never used, a warning is logged. @@ -39,14 +77,14 @@ def _add_should_use_warning(x, fatal_error=False): """ if x is None: # special corner case where x is None return x - has_been_used = getattr(x, '_tf_object_has_been_used', None) - if has_been_used is not None: - x._tf_object_has_been_used = has_been_used # pylint: disable=protected-access + if hasattr(x, '_tf_ref_id'): # this is already a TFShouldUseWarningWrapper return x def override_method(method): def fn(self, *args, **kwargs): - self._tf_object_has_been_used = True # pylint: disable=protected-access + # pylint: disable=protected-access + _REF_INFO[self._tf_ref_id] = _REF_INFO[self._tf_ref_id]._replace( + object_used=True) return method(self, *args, **kwargs) return fn @@ -55,38 +93,36 @@ def _add_should_use_warning(x, fatal_error=False): def __init__(self, true_self): self.__dict__ = true_self.__dict__ - stack = [x.strip() for x in traceback.format_stack()] + stack = [s.strip() for s in traceback.format_stack()] # Remove top three stack entries from adding the wrapper - self._tf_object_creation_stack = '\n'.join(stack[:-3]) - self._tf_object_has_been_used = False + self.creation_stack = '\n'.join(stack[:-3]) + self._tf_ref_id = next(_REF_ITER) + _REF_INFO[self._tf_ref_id] = _RefInfoField( + type_=type(x), + repr_=repr(x), + creation_stack=stack, + object_used=False) + + # Create a finalizer for self, which will be called when self is + # garbage collected. Can't add self as the args because the + # loop will break garbage collection. We keep track of + # ourselves via python ids. + weakref.finalize(self, _deleted, self._tf_ref_id, fatal_error) # Not sure why this pylint warning is being used; this is not an # old class form. # pylint: disable=super-on-old-class def __getattribute__(self, name): - if name != '_tf_object_has_been_used': - self._tf_object_has_been_used = True + if name == '_tf_ref_id': + return super(TFShouldUseWarningWrapper, self).__getattribute__(name) + if self._tf_ref_id in _REF_INFO: + _REF_INFO[self._tf_ref_id] = _REF_INFO[self._tf_ref_id]._replace( + object_used=True) return super(TFShouldUseWarningWrapper, self).__getattribute__(name) - def __del__(self): - if not self._tf_object_has_been_used: - if fatal_error: - logger = tf_logging.fatal - else: - logger = tf_logging.error - logger( - '==================================\n' - 'Object was never used (type %s):\n%s\nIf you want to mark it as ' - 'used call its "mark_used()" method.\nIt was originally created ' - 'here:\n%s\n' - '==================================' % - (type(x), x, self._tf_object_creation_stack)) - - if hasattr(super(TFShouldUseWarningWrapper, self), '__del__'): - return super(TFShouldUseWarningWrapper, self).__del__() - def mark_used(self, *args, **kwargs): - self._tf_object_has_been_used = True + _REF_INFO[self._tf_ref_id] = _REF_INFO[self._tf_ref_id]._replace( + object_used=True) if hasattr(super(TFShouldUseWarningWrapper, self), 'mark_used'): return super(TFShouldUseWarningWrapper, self).mark_used(*args, **kwargs) # pylint: enable=super-on-old-class @@ -102,7 +138,8 @@ def _add_should_use_warning(x, fatal_error=False): wrapped = TFShouldUseWarningWrapper(x) wrapped.__doc__ = x.__doc__ # functools.wraps fails on some objects. - wrapped._tf_object_has_been_used = False # pylint: disable=protected-access + ref_id = wrapped._tf_ref_id # pylint: disable=protected-access + _REF_INFO[ref_id] = _REF_INFO[ref_id]._replace(object_used=False) return wrapped diff --git a/tensorflow/python/util/tf_should_use_test.py b/tensorflow/python/util/tf_should_use_test.py index 71d48e3dde..c826874400 100644 --- a/tensorflow/python/util/tf_should_use_test.py +++ b/tensorflow/python/util/tf_should_use_test.py @@ -20,6 +20,7 @@ from __future__ import division from __future__ import print_function import contextlib +import gc import sys from tensorflow.python.framework import constant_op @@ -45,7 +46,7 @@ def reroute_error(captured): class TfShouldUseTest(test.TestCase): def testAddShouldUseWarningWhenNotUsed(self): - c = constant_op.constant(0, name='blah') + c = constant_op.constant(0, name='blah0') captured = [] with reroute_error(captured): def in_this_function(): @@ -53,44 +54,52 @@ class TfShouldUseTest(test.TestCase): del h in_this_function() self.assertIn('Object was never used', '\n'.join(captured)) - self.assertIn('blah:0', '\n'.join(captured)) + self.assertIn('blah0:0', '\n'.join(captured)) self.assertIn('in_this_function', '\n'.join(captured)) + gc.collect() + self.assertFalse(gc.garbage) - def _testAddShouldUseWarningWhenUsed(self, fn): - c = constant_op.constant(0, name='blah') + def _testAddShouldUseWarningWhenUsed(self, fn, name): + c = constant_op.constant(0, name=name) captured = [] with reroute_error(captured): h = tf_should_use._add_should_use_warning(c) fn(h) del h self.assertNotIn('Object was never used', '\n'.join(captured)) - self.assertNotIn('blah:0', '\n'.join(captured)) + self.assertNotIn('%s:0' % name, '\n'.join(captured)) def testAddShouldUseWarningWhenUsedWithAdd(self): def add(h): _ = h + 1 - self._testAddShouldUseWarningWhenUsed(add) + self._testAddShouldUseWarningWhenUsed(add, name='blah_add') + gc.collect() + self.assertFalse(gc.garbage) def testAddShouldUseWarningWhenUsedWithGetName(self): def get_name(h): _ = h.name - self._testAddShouldUseWarningWhenUsed(get_name) + self._testAddShouldUseWarningWhenUsed(get_name, name='blah_get_name') + gc.collect() + self.assertFalse(gc.garbage) def testShouldUseResult(self): @tf_should_use.should_use_result def return_const(value): - return constant_op.constant(value, name='blah') + return constant_op.constant(value, name='blah2') captured = [] with reroute_error(captured): return_const(0.0) self.assertIn('Object was never used', '\n'.join(captured)) - self.assertIn('blah:0', '\n'.join(captured)) + self.assertIn('blah2:0', '\n'.join(captured)) self.assertIn('return_const', '\n'.join(captured)) + gc.collect() + self.assertFalse(gc.garbage) def testShouldUseResultWhenNotReallyUsed(self): @tf_should_use.should_use_result def return_const(value): - return constant_op.constant(value, name='blah') + return constant_op.constant(value, name='blah3') captured = [] with reroute_error(captured): with self.test_session(): @@ -100,8 +109,10 @@ class TfShouldUseTest(test.TestCase): v = constant_op.constant(1.0, name='meh') v.eval() self.assertIn('Object was never used', '\n'.join(captured)) - self.assertIn('blah:0', '\n'.join(captured)) + self.assertIn('blah3:0', '\n'.join(captured)) self.assertIn('return_const', '\n'.join(captured)) + gc.collect() + self.assertFalse(gc.garbage) if __name__ == '__main__': diff --git a/tensorflow/stream_executor/cuda/cuda_diagnostics.cc b/tensorflow/stream_executor/cuda/cuda_diagnostics.cc index 9ea6474934..bf81b9c0ad 100644 --- a/tensorflow/stream_executor/cuda/cuda_diagnostics.cc +++ b/tensorflow/stream_executor/cuda/cuda_diagnostics.cc @@ -170,7 +170,7 @@ void Diagnostician::LogDiagnosticInformation() { VLOG(1) << "LD_LIBRARY_PATH is: \"" << library_path << "\""; std::vector<string> pieces = port::Split(library_path, ':'); - for (auto piece : pieces) { + for (const auto &piece : pieces) { if (piece.empty()) { continue; } diff --git a/tensorflow/stream_executor/stream_executor_internal.h b/tensorflow/stream_executor/stream_executor_internal.h index 9d3ac4ed9e..802ef755eb 100644 --- a/tensorflow/stream_executor/stream_executor_internal.h +++ b/tensorflow/stream_executor/stream_executor_internal.h @@ -48,27 +48,9 @@ limitations under the License. namespace perftools { namespace gputools { -class KernelBase; class Stream; class Timer; -namespace blas { -class BlasSupport; -} // namespace blas - -namespace fft { -class Support; -} // namespace fft - -namespace rng { -class RngSupport; -} // namespace rng - -} // namespace gputools -} // namespace perftools - -namespace perftools { -namespace gputools { namespace internal { // Platform-dependent interface class for the generic Events interface, in diff --git a/tensorflow/tensorboard/backend/application_test.py b/tensorflow/tensorboard/backend/application_test.py index 87cfdbc1d8..63eb47e9d4 100644 --- a/tensorflow/tensorboard/backend/application_test.py +++ b/tensorflow/tensorboard/backend/application_test.py @@ -36,7 +36,6 @@ import tensorflow as tf from werkzeug import serving -from tensorflow.core.protobuf import meta_graph_pb2 from tensorflow.tensorboard import tensorboard from tensorflow.tensorboard.backend import application from tensorflow.tensorboard.backend.event_processing import event_multiplexer @@ -221,7 +220,7 @@ class TensorboardServerTest(tf.test.TestCase): node2.name = 'b' node2.attr['very_large_attr'].s = b'a' * 2048 # 2 KB attribute - meta_graph_def = meta_graph_pb2.MetaGraphDef(graph_def=graph_def) + meta_graph_def = tf.MetaGraphDef(graph_def=graph_def) if self._only_use_meta_graph: writer.add_meta_graph(meta_graph_def) diff --git a/tensorflow/tensorboard/backend/event_processing/BUILD b/tensorflow/tensorboard/backend/event_processing/BUILD index 1c3e094e76..dff2dbc90d 100644 --- a/tensorflow/tensorboard/backend/event_processing/BUILD +++ b/tensorflow/tensorboard/backend/event_processing/BUILD @@ -134,18 +134,6 @@ py_library( ], ) -py_test( - name = "plugin_asset_util_test", - size = "small", - srcs = ["plugin_asset_util_test.py"], - srcs_version = "PY2AND3", - deps = [ - ":event_multiplexer", - ":plugin_asset_util", - "//tensorflow:tensorflow_py", - ], -) - py_library( name = "event_file_inspector", srcs = ["event_file_inspector.py"], diff --git a/tensorflow/tensorboard/backend/event_processing/event_accumulator_test.py b/tensorflow/tensorboard/backend/event_processing/event_accumulator_test.py index 9efd64bd2e..4ce766f420 100644 --- a/tensorflow/tensorboard/backend/event_processing/event_accumulator_test.py +++ b/tensorflow/tensorboard/backend/event_processing/event_accumulator_test.py @@ -24,7 +24,6 @@ import six from six.moves import xrange # pylint: disable=redefined-builtin import tensorflow as tf -from tensorflow.python.summary.writer.writer import SummaryToEventTransformer from tensorflow.tensorboard.backend.event_processing import event_accumulator as ea @@ -760,7 +759,8 @@ class MockingEventAccumulatorTest(EventAccumulatorTest): def testTFSummaryScalar(self): """Verify processing of tf.summary.scalar.""" event_sink = _EventGenerator(self, zero_out_timestamps=True) - writer = SummaryToEventTransformer(event_sink) + writer = tf.summary.FileWriter(self.get_temp_dir()) + writer.event_writer = event_sink with self.test_session() as sess: ipt = tf.placeholder(tf.float32) tf.summary.scalar('scalar1', ipt) @@ -794,7 +794,8 @@ class MockingEventAccumulatorTest(EventAccumulatorTest): def testTFSummaryImage(self): """Verify processing of tf.summary.image.""" event_sink = _EventGenerator(self, zero_out_timestamps=True) - writer = SummaryToEventTransformer(event_sink) + writer = tf.summary.FileWriter(self.get_temp_dir()) + writer.event_writer = event_sink with self.test_session() as sess: ipt = tf.ones([10, 4, 4, 3], tf.uint8) # This is an interesting example, because the old tf.image_summary op @@ -830,7 +831,8 @@ class MockingEventAccumulatorTest(EventAccumulatorTest): def testTFSummaryTensor(self): """Verify processing of tf.summary.tensor.""" event_sink = _EventGenerator(self, zero_out_timestamps=True) - writer = SummaryToEventTransformer(event_sink) + writer = tf.summary.FileWriter(self.get_temp_dir()) + writer.event_writer = event_sink with self.test_session() as sess: tf.summary.tensor_summary('scalar', tf.constant(1.0)) tf.summary.tensor_summary('vector', tf.constant([1.0, 2.0, 3.0])) diff --git a/tensorflow/tensorboard/backend/event_processing/plugin_asset_util_test.py b/tensorflow/tensorboard/backend/event_processing/plugin_asset_util_test.py deleted file mode 100644 index 8fe5b31621..0000000000 --- a/tensorflow/tensorboard/backend/event_processing/plugin_asset_util_test.py +++ /dev/null @@ -1,119 +0,0 @@ -# Copyright 2015 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import os.path - -import tensorflow as tf - -from tensorflow.python.summary import plugin_asset - -from tensorflow.tensorboard.backend.event_processing import event_multiplexer -from tensorflow.tensorboard.backend.event_processing import plugin_asset_util - - -class GenericContentPlugin(plugin_asset.PluginAsset): - - def __init__(self): - self.contents = "hello world" - - def assets(self): - return {"contents.txt": self.contents} - - -class PluginAlpha(GenericContentPlugin): - plugin_name = "Alpha" - - -class PluginBeta(GenericContentPlugin): - plugin_name = "Beta" - - -class PluginGamma(GenericContentPlugin): - plugin_name = "Gamma" - - -class PluginAssetUtilitiesTest(tf.test.TestCase): - - def testGetPluginDirectory(self): - self.assertEqual( - os.path.join("logdir", "plugins", "x"), - plugin_asset_util.PluginDirectory("logdir", "x")) - - def testNonExistentDirectory(self): - tempdir = self.get_temp_dir() - fake_dir = os.path.join(tempdir, "nonexistent_dir") - self.assertEqual([], plugin_asset_util.ListPlugins(fake_dir)) - self.assertEqual([], plugin_asset_util.ListAssets(fake_dir, "fake_plugin")) - with self.assertRaises(KeyError): - plugin_asset_util.RetrieveAsset(fake_dir, "fake_plugin", "fake_asset") - - def testSimplePluginCase(self): - tempdir = self.get_temp_dir() - with tf.Graph().as_default() as g: - plugin_asset.get_plugin_asset(PluginAlpha) - fw = tf.summary.FileWriter(tempdir) - fw.add_graph(g) - self.assertEqual(["Alpha"], plugin_asset_util.ListPlugins(tempdir)) - assets = plugin_asset_util.ListAssets(tempdir, "Alpha") - self.assertEqual(["contents.txt"], assets) - contents = plugin_asset_util.RetrieveAsset(tempdir, "Alpha", "contents.txt") - self.assertEqual("hello world", contents) - - def testEventMultiplexerIntegration(self): - tempdir = self.get_temp_dir() - with tf.Graph().as_default() as g: - plugin_instance = plugin_asset.get_plugin_asset(PluginAlpha) - plugin_instance.contents = "graph one" - plugin_asset.get_plugin_asset(PluginBeta) - - fw = tf.summary.FileWriter(os.path.join(tempdir, "one")) - fw.add_graph(g) - fw.close() - - with tf.Graph().as_default() as g: - plugin_instance = plugin_asset.get_plugin_asset(PluginAlpha) - plugin_instance.contents = "graph two" - fw = tf.summary.FileWriter(os.path.join(tempdir, "two")) - fw.add_graph(g) - fw.close() - - multiplexer = event_multiplexer.EventMultiplexer() - multiplexer.AddRunsFromDirectory(tempdir) - - self.assertEqual( - multiplexer.PluginAssets("Alpha"), - {"one": ["contents.txt"], "two": ["contents.txt"]}) - self.assertEqual( - multiplexer.RetrievePluginAsset("one", "Alpha", "contents.txt"), - "graph one") - self.assertEqual( - multiplexer.RetrievePluginAsset("one", "Beta", "contents.txt"), - "hello world") - self.assertEqual( - multiplexer.RetrievePluginAsset("two", "Alpha", "contents.txt"), - "graph two") - - self.assertEqual( - multiplexer.PluginAssets("Beta"), - {"one": ["contents.txt"], "two": []}) - self.assertEqual(multiplexer.PluginAssets("Gamma"), {"one": [], "two": []}) - - -if __name__ == "__main__": - tf.test.main() diff --git a/tensorflow/tensorboard/plugins/scalars/scalars_demo.py b/tensorflow/tensorboard/plugins/scalars/scalars_demo.py index e9b2a6129d..f3195fd849 100644 --- a/tensorflow/tensorboard/plugins/scalars/scalars_demo.py +++ b/tensorflow/tensorboard/plugins/scalars/scalars_demo.py @@ -23,8 +23,6 @@ import os.path from six.moves import xrange # pylint: disable=redefined-builtin import tensorflow as tf -from tensorflow.python.platform import app - # Directory into which to write tensorboard data. LOGDIR = '/tmp/scalars_demo' @@ -129,4 +127,4 @@ def main(unused_argv): if __name__ == '__main__': - app.run() + tf.app.run() diff --git a/tensorflow/tensorboard/plugins/text/text_plugin.py b/tensorflow/tensorboard/plugins/text/text_plugin.py index 2c11b80029..d0040e20be 100644 --- a/tensorflow/tensorboard/plugins/text/text_plugin.py +++ b/tensorflow/tensorboard/plugins/text/text_plugin.py @@ -35,7 +35,6 @@ import six import tensorflow as tf from werkzeug import wrappers -from tensorflow.python.summary import text_summary from tensorflow.tensorboard.backend import http_util from tensorflow.tensorboard.plugins import base_plugin @@ -256,7 +255,7 @@ class TextPlugin(base_plugin.TBPlugin): def index_impl(self): run_to_series = {} - name = text_summary.TextSummaryPluginAsset.plugin_name + name = 'tensorboard_text' run_to_assets = self.multiplexer.PluginAssets(name) for run, assets in run_to_assets.items(): diff --git a/tensorflow/tools/ci_build/Dockerfile.gpu b/tensorflow/tools/ci_build/Dockerfile.gpu index 68493965fa..5d18295f68 100644 --- a/tensorflow/tools/ci_build/Dockerfile.gpu +++ b/tensorflow/tools/ci_build/Dockerfile.gpu @@ -1,14 +1,15 @@ -FROM nvidia/cuda:8.0-cudnn5-devel-ubuntu14.04 +FROM nvidia/cuda:8.0-cudnn6-devel-ubuntu14.04 MAINTAINER Jan Prach <jendap@google.com> # In the Ubuntu 14.04 images, cudnn is placed in system paths. Move them to # /usr/local/cuda -RUN cp /usr/include/cudnn.h /usr/local/cuda/include -RUN cp /usr/lib/x86_64-linux-gnu/libcudnn* /usr/local/cuda/lib64 +RUN cp -P /usr/include/cudnn.h /usr/local/cuda/include +RUN cp -P /usr/lib/x86_64-linux-gnu/libcudnn* /usr/local/cuda/lib64 # Copy and run the install scripts. COPY install/*.sh /install/ +ARG DEBIAN_FRONTEND=noninteractive RUN /install/install_bootstrap_deb_packages.sh RUN add-apt-repository -y ppa:openjdk-r/ppa && \ add-apt-repository -y ppa:george-edison55/cmake-3.x diff --git a/tensorflow/tools/ci_build/Dockerfile.gpu_clang b/tensorflow/tools/ci_build/Dockerfile.gpu_clang index 00aaa9f760..c4342d17f5 100644 --- a/tensorflow/tools/ci_build/Dockerfile.gpu_clang +++ b/tensorflow/tools/ci_build/Dockerfile.gpu_clang @@ -1,4 +1,4 @@ -FROM nvidia/cuda:8.0-cudnn5-devel-ubuntu14.04 +FROM nvidia/cuda:8.0-cudnn6-devel-ubuntu14.04 MAINTAINER Ilya Biryukov <ibiryukov@google.com> diff --git a/tensorflow/tools/ci_build/install/install_pip_packages.sh b/tensorflow/tools/ci_build/install/install_pip_packages.sh index b8f9fc8453..8768852dc7 100755 --- a/tensorflow/tools/ci_build/install/install_pip_packages.sh +++ b/tensorflow/tools/ci_build/install/install_pip_packages.sh @@ -85,3 +85,6 @@ pip2 install mock pip2 install portpicker pip3 install portpicker + +pip2 install backports.weakref==1.0rc1 +pip3 install backports.weakref==1.0rc1 diff --git a/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh b/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh index e7e2d256cd..edfc4e3a98 100755 --- a/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh +++ b/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh @@ -89,3 +89,6 @@ pip3.5 install wheel==0.29.0 pip3.5 install portpicker pip3.5 install werkzeug + +pip3.5 install backports.weakref==1.0rc1 + diff --git a/tensorflow/tools/docker/Dockerfile.devel-gpu b/tensorflow/tools/docker/Dockerfile.devel-gpu index d1de9eace5..d0a038a9db 100644 --- a/tensorflow/tools/docker/Dockerfile.devel-gpu +++ b/tensorflow/tools/docker/Dockerfile.devel-gpu @@ -1,4 +1,4 @@ -FROM nvidia/cuda:8.0-cudnn5-devel-ubuntu16.04 +FROM nvidia/cuda:8.0-cudnn6-devel-ubuntu16.04 MAINTAINER Craig Citro <craigcitro@google.com> diff --git a/tensorflow/tools/docker/Dockerfile.gpu b/tensorflow/tools/docker/Dockerfile.gpu index 88876421f5..3ba1e963f9 100644 --- a/tensorflow/tools/docker/Dockerfile.gpu +++ b/tensorflow/tools/docker/Dockerfile.gpu @@ -1,4 +1,4 @@ -FROM nvidia/cuda:8.0-cudnn5-devel-ubuntu16.04 +FROM nvidia/cuda:8.0-cudnn6-devel-ubuntu16.04 MAINTAINER Craig Citro <craigcitro@google.com> diff --git a/tensorflow/tools/docs/parser.py b/tensorflow/tools/docs/parser.py index 52a65e1c9d..7ae1d2abd9 100644 --- a/tensorflow/tools/docs/parser.py +++ b/tensorflow/tools/docs/parser.py @@ -897,7 +897,7 @@ class _ClassPageInfo(object): @property def guides(self): - """Returns a markdown string containing backlinks to relevent api_guides.""" + """Returns a markdown string containing backlinks to relevant api_guides.""" return self._guides def set_guides(self, guides): diff --git a/tensorflow/tools/lib_package/BUILD b/tensorflow/tools/lib_package/BUILD index 1e36af93ea..51ba3b7a0b 100644 --- a/tensorflow/tools/lib_package/BUILD +++ b/tensorflow/tools/lib_package/BUILD @@ -97,6 +97,7 @@ genrule( "@jemalloc//:COPYING", "@jpeg//:LICENSE.md", "@libxsmm_archive//:LICENSE", + "@lmdb//:LICENSE", "@local_config_sycl//sycl:LICENSE.text", "@png_archive//:LICENSE", "@protobuf//:LICENSE", @@ -126,6 +127,7 @@ genrule( "@jemalloc//:COPYING", "@jpeg//:LICENSE.md", "@libxsmm_archive//:LICENSE", + "@lmdb//:LICENSE", "@local_config_sycl//sycl:LICENSE.text", "@png_archive//:LICENSE", "@protobuf//:LICENSE", diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD index 0c2660ff37..09fe6c53cd 100644 --- a/tensorflow/tools/pip_package/BUILD +++ b/tensorflow/tools/pip_package/BUILD @@ -110,6 +110,7 @@ filegroup( "@jemalloc//:COPYING", "@jpeg//:LICENSE.md", "@libxsmm_archive//:LICENSE", + "@lmdb//:LICENSE", "@local_config_sycl//sycl:LICENSE.text", "@nanopb_git//:LICENSE.txt", "@org_html5lib//:LICENSE", diff --git a/tensorflow/tools/pip_package/pip_smoke_test.py b/tensorflow/tools/pip_package/pip_smoke_test.py index 0524d2f1aa..58a80fd98a 100644 --- a/tensorflow/tools/pip_package/pip_smoke_test.py +++ b/tensorflow/tools/pip_package/pip_smoke_test.py @@ -46,6 +46,7 @@ BLACKLIST = [ "//tensorflow/python:tf_optimizer", "//tensorflow/python:compare_test_proto_py", "//tensorflow/core:image_testdata", + "//tensorflow/core:lmdb_testdata", "//tensorflow/core/kernels/cloud:bigquery_reader_ops", "//tensorflow/python/feature_column:vocabulary_testdata", "//tensorflow/python:framework/test_file_system.so", diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py index 79b8e5bfa3..eb76fb3ce3 100644 --- a/tensorflow/tools/pip_package/setup.py +++ b/tensorflow/tools/pip_package/setup.py @@ -39,6 +39,7 @@ REQUIRED_PACKAGES = [ 'html5lib == 0.9999999', # identical to 1.0b8 'markdown == 2.2.0', 'bleach == 1.5.0', + 'backports.weakref == 1.0rc1', ] project_name = 'tensorflow' diff --git a/tensorflow/tools/test/system_info_lib.py b/tensorflow/tools/test/system_info_lib.py index c352abaa54..0cc261591b 100644 --- a/tensorflow/tools/test/system_info_lib.py +++ b/tensorflow/tools/test/system_info_lib.py @@ -96,7 +96,7 @@ def gather_cpu_info(): cpu_info.cpu_info = info['brand'] cpu_info.num_cores = info['count'] cpu_info.mhz_per_cpu = info['hz_advertised_raw'][0] / 1.0e6 - l2_cache_size = re.match(r'(\d+)', str(info['l2_cache_size'])) + l2_cache_size = re.match(r'(\d+)', str(info.get('l2_cache_size', ''))) if l2_cache_size: # If a value is returned, it's in KB cpu_info.cache_size['L2'] = int(l2_cache_size.group(0)) * 1024 diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index 18bc9a8277..d7dd16919a 100644 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -508,6 +508,17 @@ def tf_workspace(path_prefix="", tf_repo_name=""): ) native.new_http_archive( + name = "lmdb", + urls = [ + "http://mirror.bazel.build/github.com/LMDB/lmdb/archive/LMDB_0.9.19.tar.gz", + "https://github.com/LMDB/lmdb/archive/LMDB_0.9.19.tar.gz", + ], + sha256 = "108532fb94c6f227558d45be3f3347b52539f0f58290a7bb31ec06c462d05326", + strip_prefix = "lmdb-LMDB_0.9.19/libraries/liblmdb", + build_file = str(Label("//third_party:lmdb.BUILD")), + ) + + native.new_http_archive( name = "jsoncpp_git", urls = [ "http://mirror.bazel.build/github.com/open-source-parsers/jsoncpp/archive/11086dd6a7eba04289944367ca82cea71299ed70.tar.gz", diff --git a/third_party/lmdb.BUILD b/third_party/lmdb.BUILD new file mode 100644 index 0000000000..357e4a9335 --- /dev/null +++ b/third_party/lmdb.BUILD @@ -0,0 +1,25 @@ +# Description: +# LMDB is the Lightning Memory-mapped Database. + +licenses(["notice"]) # OpenLDAP Public License + +exports_files(["LICENSE"]) + +cc_library( + name = "lmdb", + srcs = [ + "mdb.c", + "midl.c", + ], + hdrs = [ + "lmdb.h", + "midl.h", + ], + copts = [ + "-w", + ], + linkopts = [ + "-lpthread", + ], + visibility = ["//visibility:public"], +) |