diff options
author | Mingxing Tan <tanmingxing@google.com> | 2018-06-28 19:13:20 -0700 |
---|---|---|
committer | TensorFlower Gardener <gardener@tensorflow.org> | 2018-06-28 19:16:41 -0700 |
commit | 1e7b0e4ad6d0f57f3241fe0b80a65f2c2a7f11b0 (patch) | |
tree | af92d172cedfc41e544c01a349c1d3b30bc3ff85 | |
parent | 3cee10e61c1c90734317c62ea3388ec44acc8d08 (diff) |
Merge changes from github.
PiperOrigin-RevId: 202585094
145 files changed, 6294 insertions, 1701 deletions
diff --git a/.gitignore b/.gitignore index 828bbe9bd3..b5306b8b79 100644 --- a/.gitignore +++ b/.gitignore @@ -16,6 +16,7 @@ __pycache__ cmake_build/ .idea/** /build/ +[Bb]uild/ /tensorflow/core/util/version_info.cc /tensorflow/python/framework/fast_tensor_util.cpp Pods diff --git a/configure.py b/configure.py index ada342a50a..ad585fa52e 100644 --- a/configure.py +++ b/configure.py @@ -943,6 +943,35 @@ def set_tf_cudnn_version(environ_cp): write_action_env_to_bazelrc('TF_CUDNN_VERSION', tf_cudnn_version) +def is_cuda_compatible(lib, cuda_ver, cudnn_ver): + """Check compatibility between given library and cudnn/cudart libraries.""" + ldd_bin = which('ldd') or '/usr/bin/ldd' + ldd_out = run_shell([ldd_bin, lib], True) + ldd_out = ldd_out.split(os.linesep) + cudnn_pattern = re.compile('.*libcudnn.so\\.?(.*) =>.*$') + cuda_pattern = re.compile('.*libcudart.so\\.?(.*) =>.*$') + cudnn = None + cudart = None + cudnn_ok = True # assume no cudnn dependency by default + cuda_ok = True # assume no cuda dependency by default + for line in ldd_out: + if 'libcudnn.so' in line: + cudnn = cudnn_pattern.search(line) + cudnn_ok = False + elif 'libcudart.so' in line: + cudart = cuda_pattern.search(line) + cuda_ok = False + if cudnn and len(cudnn.group(1)): + cudnn = convert_version_to_int(cudnn.group(1)) + if cudart and len(cudart.group(1)): + cudart = convert_version_to_int(cudart.group(1)) + if cudnn is not None: + cudnn_ok = (cudnn == cudnn_ver) + if cudart is not None: + cuda_ok = (cudart == cuda_ver) + return cudnn_ok and cuda_ok + + def set_tf_tensorrt_install_path(environ_cp): """Set TENSORRT_INSTALL_PATH and TF_TENSORRT_VERSION. @@ -959,8 +988,8 @@ def set_tf_tensorrt_install_path(environ_cp): raise ValueError('Currently TensorRT is only supported on Linux platform.') # Ask user whether to add TensorRT support. 
- if str(int(get_var( - environ_cp, 'TF_NEED_TENSORRT', 'TensorRT', False))) != '1': + if str(int(get_var(environ_cp, 'TF_NEED_TENSORRT', 'TensorRT', + False))) != '1': return for _ in range(_DEFAULT_PROMPT_ASK_ATTEMPTS): @@ -973,47 +1002,29 @@ def set_tf_tensorrt_install_path(environ_cp): # Result returned from "read" will be used unexpanded. That make "~" # unusable. Going through one more level of expansion to handle that. - trt_install_path = os.path.realpath( - os.path.expanduser(trt_install_path)) + trt_install_path = os.path.realpath(os.path.expanduser(trt_install_path)) def find_libs(search_path): """Search for libnvinfer.so in "search_path".""" fl = set() if os.path.exists(search_path) and os.path.isdir(search_path): - fl.update([os.path.realpath(os.path.join(search_path, x)) - for x in os.listdir(search_path) if 'libnvinfer.so' in x]) + fl.update([ + os.path.realpath(os.path.join(search_path, x)) + for x in os.listdir(search_path) + if 'libnvinfer.so' in x + ]) return fl possible_files = find_libs(trt_install_path) possible_files.update(find_libs(os.path.join(trt_install_path, 'lib'))) possible_files.update(find_libs(os.path.join(trt_install_path, 'lib64'))) - - def is_compatible(tensorrt_lib, cuda_ver, cudnn_ver): - """Check the compatibility between tensorrt and cudnn/cudart libraries.""" - ldd_bin = which('ldd') or '/usr/bin/ldd' - ldd_out = run_shell([ldd_bin, tensorrt_lib]).split(os.linesep) - cudnn_pattern = re.compile('.*libcudnn.so\\.?(.*) =>.*$') - cuda_pattern = re.compile('.*libcudart.so\\.?(.*) =>.*$') - cudnn = None - cudart = None - for line in ldd_out: - if 'libcudnn.so' in line: - cudnn = cudnn_pattern.search(line) - elif 'libcudart.so' in line: - cudart = cuda_pattern.search(line) - if cudnn and len(cudnn.group(1)): - cudnn = convert_version_to_int(cudnn.group(1)) - if cudart and len(cudart.group(1)): - cudart = convert_version_to_int(cudart.group(1)) - return (cudnn == cudnn_ver) and (cudart == cuda_ver) - cuda_ver = 
convert_version_to_int(environ_cp['TF_CUDA_VERSION']) cudnn_ver = convert_version_to_int(environ_cp['TF_CUDNN_VERSION']) nvinfer_pattern = re.compile('.*libnvinfer.so.?(.*)$') highest_ver = [0, None, None] for lib_file in possible_files: - if is_compatible(lib_file, cuda_ver, cudnn_ver): + if is_cuda_compatible(lib_file, cuda_ver, cudnn_ver): matches = nvinfer_pattern.search(lib_file) if len(matches.groups()) == 0: continue @@ -1029,12 +1040,13 @@ def set_tf_tensorrt_install_path(environ_cp): # Try another alternative from ldconfig. ldconfig_bin = which('ldconfig') or '/sbin/ldconfig' ldconfig_output = run_shell([ldconfig_bin, '-p']) - search_result = re.search( - '.*libnvinfer.so\\.?([0-9.]*).* => (.*)', ldconfig_output) + search_result = re.search('.*libnvinfer.so\\.?([0-9.]*).* => (.*)', + ldconfig_output) if search_result: libnvinfer_path_from_ldconfig = search_result.group(2) if os.path.exists(libnvinfer_path_from_ldconfig): - if is_compatible(libnvinfer_path_from_ldconfig, cuda_ver, cudnn_ver): + if is_cuda_compatible(libnvinfer_path_from_ldconfig, cuda_ver, + cudnn_ver): trt_install_path = os.path.dirname(libnvinfer_path_from_ldconfig) tf_tensorrt_version = search_result.group(1) break diff --git a/tensorflow/BUILD b/tensorflow/BUILD index e4530a5962..233fe21fbf 100644 --- a/tensorflow/BUILD +++ b/tensorflow/BUILD @@ -155,6 +155,12 @@ config_setting( ) config_setting( + name = "linux_s390x", + values = {"cpu": "s390x"}, + visibility = ["//visibility:public"], +) + +config_setting( name = "debug", values = { "compilation_mode": "dbg", @@ -459,6 +465,15 @@ filegroup( tf_cc_shared_object( name = "libtensorflow_framework.so", framework_so = [], + linkopts = select({ + "//tensorflow:darwin": [], + "//tensorflow:windows": [], + "//tensorflow:windows_msvc": [], + "//conditions:default": [ + "-Wl,--version-script", # This line must be directly followed by the version_script.lds file + "$(location //tensorflow:tf_framework_version_script.lds)", + ], + }), linkstatic 
= 1, visibility = ["//visibility:public"], deps = [ @@ -468,6 +483,7 @@ tf_cc_shared_object( "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry_impl", "//tensorflow/core:lib_internal_impl", "//tensorflow/stream_executor:stream_executor_impl", + "//tensorflow:tf_framework_version_script.lds", ] + tf_additional_binary_deps(), ) @@ -571,3 +587,13 @@ py_library( visibility = ["//visibility:public"], deps = ["//tensorflow/python:no_contrib"], ) + +cc_library( + name = "grpc", + deps = ["@grpc"], +) + +cc_library( + name = "grpc++", + deps = ["@grpc//:grpc++"], +) diff --git a/tensorflow/c/c_api.cc b/tensorflow/c/c_api.cc index 37c8302e08..5c218d3f25 100644 --- a/tensorflow/c/c_api.cc +++ b/tensorflow/c/c_api.cc @@ -2068,7 +2068,8 @@ TF_ImportGraphDefResults* TF_GraphImportGraphDefWithResults( TF_Graph* graph, const TF_Buffer* graph_def, const TF_ImportGraphDefOptions* options, TF_Status* status) { GraphDef def; - if (!def.ParseFromArray(graph_def->data, graph_def->length)) { + if (!tensorflow::ParseProtoUnlimited(&def, graph_def->data, + graph_def->length)) { status->status = InvalidArgument("Invalid GraphDef"); return nullptr; } @@ -2098,7 +2099,8 @@ void TF_GraphImportGraphDefWithReturnOutputs( return; } GraphDef def; - if (!def.ParseFromArray(graph_def->data, graph_def->length)) { + if (!tensorflow::ParseProtoUnlimited(&def, graph_def->data, + graph_def->length)) { status->status = InvalidArgument("Invalid GraphDef"); return; } diff --git a/tensorflow/compiler/aot/codegen.cc b/tensorflow/compiler/aot/codegen.cc index 0025842aea..28070d60db 100644 --- a/tensorflow/compiler/aot/codegen.cc +++ b/tensorflow/compiler/aot/codegen.cc @@ -287,7 +287,7 @@ Status GenerateHeader(const CodegenOpts& opts, const tf2xla::Config& config, TF_RETURN_IF_ERROR(ValidateFeedFetchCppNames(config)); const int64 result_index = compile_result.aot->result_buffer_index(); const xla::BufferSizes& temp_sizes = compile_result.aot->buffer_sizes(); - if (result_index < 0 || 
result_index > temp_sizes.size()) { + if (result_index < 0 || result_index >= temp_sizes.size()) { return errors::InvalidArgument("result index: ", result_index, " is outside the range of temp sizes: [0,", temp_sizes.size(), ")"); diff --git a/tensorflow/compiler/xla/rpc/BUILD b/tensorflow/compiler/xla/rpc/BUILD index 1775666652..0b1cec1925 100644 --- a/tensorflow/compiler/xla/rpc/BUILD +++ b/tensorflow/compiler/xla/rpc/BUILD @@ -39,10 +39,10 @@ tf_cc_binary( srcs = ["grpc_service_main.cc"], deps = [ ":grpc_service", + "//tensorflow:grpc++", "//tensorflow/compiler/xla/service:cpu_plugin", "//tensorflow/core:framework_internal", "//tensorflow/core:lib", - "@grpc//:grpc++", ], ) @@ -54,6 +54,7 @@ tf_cc_test( ], deps = [ ":grpc_stub", + "//tensorflow:grpc++", "//tensorflow/compiler/xla/client", "//tensorflow/compiler/xla/client/xla_client:xla_builder", "//tensorflow/compiler/xla/tests:literal_test_util", @@ -61,7 +62,6 @@ tf_cc_test( "//tensorflow/core:lib", "//tensorflow/core:test", "//tensorflow/core:test_main", - "@grpc//:grpc++", ], ) @@ -71,9 +71,9 @@ cc_library( hdrs = ["grpc_service.h"], deps = [ ":xla_service_proto", + "//tensorflow:grpc++", "//tensorflow/compiler/xla/service", "//tensorflow/compiler/xla/service:platform_util", "//tensorflow/core/distributed_runtime/rpc:grpc_util", - "@grpc//:grpc++", ], ) diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD index ae0749edb9..fe99f700d2 100644 --- a/tensorflow/compiler/xla/service/BUILD +++ b/tensorflow/compiler/xla/service/BUILD @@ -2550,7 +2550,6 @@ cc_library( name = "hlo_tfgraph_builder", srcs = ["hlo_tfgraph_builder.cc"], hdrs = ["hlo_tfgraph_builder.h"], - visibility = ["//tensorflow/compiler/xla/tools:__pkg__"], deps = [ ":hlo", "//tensorflow/compiler/xla:literal_util", diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc index 088c97fbe3..5aaeec802f 100644 --- 
a/tensorflow/compiler/xla/service/hlo_instruction.cc +++ b/tensorflow/compiler/xla/service/hlo_instruction.cc @@ -1515,6 +1515,7 @@ bool HloInstruction::IdenticalSlowPath( // Remaining instructions with special values. case HloOpcode::kCall: + return eq_computations(to_apply(), other.to_apply()); case HloOpcode::kConditional: return eq_computations(true_computation(), other.true_computation()) && eq_computations(false_computation(), other.false_computation()); diff --git a/tensorflow/compiler/xla/service/hlo_instruction_test.cc b/tensorflow/compiler/xla/service/hlo_instruction_test.cc index e1c5123774..d8ca99dfd1 100644 --- a/tensorflow/compiler/xla/service/hlo_instruction_test.cc +++ b/tensorflow/compiler/xla/service/hlo_instruction_test.cc @@ -924,6 +924,40 @@ TEST_F(HloInstructionTest, IdenticalInstructions) { *HloInstruction::CreateBinary(shape, HloOpcode::kDivide, op1, op2))); } +TEST_F(HloInstructionTest, IdenticalCallInstructions) { + const char* const hlo_string = R"( +HloModule Module + +subcomp1 (x: f32[]) -> f32[] { + x = f32[] parameter(0) + ROOT n = f32[] sine(x) +} + +subcomp2 (x: f32[]) -> f32[] { + x = f32[] parameter(0) + ROOT n = f32[] cosine(x) +} + +ENTRY entry (param: f32[]) -> (f32[], f32[], f32[]) { + p = f32[] parameter(0) + t1 = f32[] call(p), to_apply=subcomp1 + t2 = f32[] call(p), to_apply=subcomp1 + t3 = f32[] call(p), to_apply=subcomp2 + ROOT t = (f32[], f32[], f32[]) tuple(t1, t2, t3) + } +)"; + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module, + ParseHloString(hlo_string)); + + auto* root = module->entry_computation()->root_instruction(); + auto* t1 = root->operand(0); + auto* t2 = root->operand(1); + auto* t3 = root->operand(2); + + EXPECT_TRUE(StructuralEqual(*t1, *t2)); + EXPECT_FALSE(StructuralEqual(*t1, *t3)); +} + TEST_F(HloInstructionTest, FunctionVisitor) { // Verify the function visitor HloInstruction::Accept visits all instructions // from a root properly given the following graph: diff --git 
a/tensorflow/contrib/autograph/converters/BUILD b/tensorflow/contrib/autograph/converters/BUILD index 931ff62064..b2e2e27673 100644 --- a/tensorflow/contrib/autograph/converters/BUILD +++ b/tensorflow/contrib/autograph/converters/BUILD @@ -120,7 +120,10 @@ py_test( name = "decorators_test", srcs = ["decorators_test.py"], srcs_version = "PY2AND3", - tags = ["no_windows"], + tags = [ + "no_pip", + "no_windows", + ], deps = [ ":converters", "//tensorflow/contrib/autograph/core:test_lib", diff --git a/tensorflow/contrib/autograph/operators/control_flow.py b/tensorflow/contrib/autograph/operators/control_flow.py index 671c9ccc13..988df70157 100644 --- a/tensorflow/contrib/autograph/operators/control_flow.py +++ b/tensorflow/contrib/autograph/operators/control_flow.py @@ -51,7 +51,7 @@ def for_stmt(iter_, extra_test, body, init_state): Args: iter_: The entity being iterated over. extra_test: Callable with the state as arguments, and boolean return type. - An additionnal loop condition. + An additional loop condition. body: Callable with the iterate and the state as arguments, and state as return type. The actual loop body. init_state: Tuple containing the initial state. diff --git a/tensorflow/contrib/autograph/pyct/static_analysis/cfg.py b/tensorflow/contrib/autograph/pyct/static_analysis/cfg.py index 358d56ce20..4acc4ed66a 100644 --- a/tensorflow/contrib/autograph/pyct/static_analysis/cfg.py +++ b/tensorflow/contrib/autograph/pyct/static_analysis/cfg.py @@ -286,7 +286,7 @@ class Forward(object): # TODO(alexbw): see if we can simplify by visiting breadth-first def visit(self, node): - """Depth-first walking the CFG, applying dataflow information propagtion.""" + """Depth-first walking the CFG, applying dataflow info propagation.""" # node.value is None only for the exit CfgNode. 
if not node.value: return diff --git a/tensorflow/contrib/autograph/pyct/transformer.py b/tensorflow/contrib/autograph/pyct/transformer.py index 3328dde7aa..7655811830 100644 --- a/tensorflow/contrib/autograph/pyct/transformer.py +++ b/tensorflow/contrib/autograph/pyct/transformer.py @@ -218,7 +218,7 @@ class Base(gast.NodeTransformer): # TODO(mdan): Once we have error tracing, we may be able to just go to SSA. def apply_to_single_assignments(self, targets, values, apply_fn): - """Applies a fuction to each individual assignment. + """Applies a function to each individual assignment. This function can process a possibly-unpacked (e.g. a, b = c, d) assignment. It tries to break down the unpacking if possible. In effect, it has the same @@ -246,7 +246,7 @@ class Base(gast.NodeTransformer): targets field of an ast.Assign node. values: an AST node. apply_fn: a function of a single argument, which will be called with the - respective nodes of each single assignment. The signaure is + respective nodes of each single assignment. The signature is apply_fn(target, value), no return value. 
""" if not isinstance(targets, (list, tuple)): diff --git a/tensorflow/contrib/cmake/CMakeLists.txt b/tensorflow/contrib/cmake/CMakeLists.txt index e524e9e743..4ca7a1b28c 100644 --- a/tensorflow/contrib/cmake/CMakeLists.txt +++ b/tensorflow/contrib/cmake/CMakeLists.txt @@ -336,40 +336,14 @@ endif() # MKL Support if (tensorflow_ENABLE_MKL_SUPPORT) add_definitions(-DINTEL_MKL -DEIGEN_USE_VML) - if (WIN32) - find_path(MKL_HOME_PLATFORM mkl - PATHS ${MKL_HOME} ${MKL_HOME}/../ ${MKL_HOME}/../../ - $ENV{MKLROOT} $ENV{MKLROOT}/../ $ENV{MKLROOT}/../../ - PATH_SUFFIXES windows) - set(MKL_INCLUDE_DIRS ${MKL_HOME_PLATFORM}/mkl/include) - set(MKL_LINK_DIRS - ${MKL_HOME_PLATFORM}/mkl/lib/intel64 - ${MKL_HOME_PLATFORM}/tbb/lib/intel64/vc_mt - ${MKL_HOME_PLATFORM}/compiler/lib/intel64 - ${MKL_HOME_PLATFORM}/mkl/tools/builder/lib) - set(MKL_REDIST_DLL_DIRS - ${MKL_HOME_PLATFORM}/redist/intel64/mkl - ${MKL_HOME_PLATFORM}/redist/intel64/tbb/vc_mt - ${MKL_HOME_PLATFORM}/redist/intel64/compiler) - list(APPEND tensorflow_EXTERNAL_LIBRARIES - mkl_intel_lp64_dll mkl_sequential_dll mkl_core_dll mkl_rt mkl_cdll_intel64) - endif() - if (UNIX) - # Fix me: complete the path on linux - find_path(MKL_HOME_PLATFORM mkl - HINTS ${MKL_HOME} ${MKL_HOME}/../ ${MKL_HOME}/../../ - $ENV{MKLROOT} $ENV{MKLROOT}/../ $ENV{MKLROOT}/../../ - PATH_SUFFIXES linux) - set(MKL_INCLUDE_DIRS ${MKL_HOME_PLATFORM}/mkl/include) - set(MKL_LINK_DIRS) # incompleted - set(MKL_REDIST_SO_DIRS) # incompleted - endif() - include_directories(${MKL_INCLUDE_DIRS}) - link_directories(${MKL_LINK_DIRS}) + include(mkl) + list(APPEND tensorflow_EXTERNAL_LIBRARIES ${mkl_STATIC_LIBRARIES}) + list(APPEND tensorflow_EXTERNAL_DEPENDENCIES mkl_copy_shared_to_destination) + include_directories(${mkl_INCLUDE_DIRS}) if (tensorflow_ENABLE_MKLDNN_SUPPORT) include(mkldnn) list(APPEND tensorflow_EXTERNAL_LIBRARIES ${mkldnn_STATIC_LIBRARIES}) - list(APPEND tensorflow_EXTERNAL_DEPENDENCIES mkldnn) + list(APPEND tensorflow_EXTERNAL_DEPENDENCIES 
mkldnn_copy_shared_to_destination) include_directories(${mkldnn_INCLUDE_DIRS}) else (tensorflow_ENABLE_MKLDNN_SUPPORT) add_definitions(-DINTEL_MKL_ML) diff --git a/tensorflow/contrib/cmake/external/double_conversion.cmake b/tensorflow/contrib/cmake/external/double_conversion.cmake index 527ccdc8d8..5c5adaf579 100644 --- a/tensorflow/contrib/cmake/external/double_conversion.cmake +++ b/tensorflow/contrib/cmake/external/double_conversion.cmake @@ -16,15 +16,15 @@ include (ExternalProject) set(double_conversion_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/double_conversion/src/double_conversion) set(double_conversion_URL https://github.com/google/double-conversion.git) -set(double_conversion_TAG 5664746) +set(double_conversion_TAG 3992066a95b823efc8ccc1baf82a1cfc73f6e9b8) set(double_conversion_BUILD ${double_conversion_INCLUDE_DIR}) set(double_conversion_LIBRARIES ${double_conversion_BUILD}/double-conversion/libdouble-conversion.so) set(double_conversion_INCLUDES ${double_conversion_BUILD}) if(WIN32) - set(double_conversion_STATIC_LIBRARIES ${double_conversion_BUILD}/double-conversion/$(Configuration)/double-conversion.lib) + set(double_conversion_STATIC_LIBRARIES ${double_conversion_BUILD}/$(Configuration)/double-conversion.lib) else() - set(double_conversion_STATIC_LIBRARIES ${double_conversion_BUILD}/double-conversion/libdouble-conversion.a) + set(double_conversion_STATIC_LIBRARIES ${double_conversion_BUILD}/libdouble-conversion.a) endif() set(double_conversion_HEADERS diff --git a/tensorflow/contrib/cmake/external/mkl.cmake b/tensorflow/contrib/cmake/external/mkl.cmake new file mode 100644 index 0000000000..a172e3a41a --- /dev/null +++ b/tensorflow/contrib/cmake/external/mkl.cmake @@ -0,0 +1,68 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +include (ExternalProject) + +# NOTE: Different from mkldnn.cmake, this file is meant to download mkl libraries +set(mkl_INCLUDE_DIRS ${CMAKE_CURRENT_BINARY_DIR}/mkl/src/mkl/include) +set(mkl_BIN_DIRS ${CMAKE_CURRENT_BINARY_DIR}/mkl/bin) +set(mkl_WIN mklml_win_2018.0.3.20180406.zip) # match for v0.14 +set(mkl_MAC mklml_mac_2018.0.3.20180406.tgz) +set(mkl_LNX mklml_lnx_2018.0.3.20180406.tgz) +set(mkl_TAG v0.14) +set(mkl_URL https://github.com/intel/mkl-dnn/releases) + +if (WIN32) + set(mkl_DOWNLOAD_URL ${mkl_URL}/download/${mkl_TAG}/${mkl_WIN}) + list(APPEND mkl_STATIC_LIBRARIES + ${CMAKE_CURRENT_BINARY_DIR}/mkl/src/mkl/lib/mklml.lib) + list(APPEND mkl_STATIC_LIBRARIES + ${CMAKE_CURRENT_BINARY_DIR}/mkl/src/mkl/lib/libiomp5md.lib) + list(APPEND mkl_SHARED_LIBRARIES + ${CMAKE_CURRENT_BINARY_DIR}/mkl/src/mkl/lib/mklml.dll) + list(APPEND mkl_SHARED_LIBRARIES + ${CMAKE_CURRENT_BINARY_DIR}/mkl/src/mkl/lib/libiomp5md.dll) +elseif (UNIX) + set(mkl_DOWNLOAD_URL ${mkl_URL}/download/${mkl_TAG}/${mkl_LNX}) + list(APPEND mkl_SHARED_LIBRARIES + ${CMAKE_CURRENT_BINARY_DIR}/mkl/src/mkl/lib/libiomp5.so) + list(APPEND mkl_SHARED_LIBRARIES + ${CMAKE_CURRENT_BINARY_DIR}/mkl/src/mkl/lib/libmklml_gnu.so) + list(APPEND mkl_SHARED_LIBRARIES + ${CMAKE_CURRENT_BINARY_DIR}/mkl/src/mkl/lib/libmklml_intel.so) +elseif (APPLE) + set(mkl_DOWNLOAD_URL ${mkl_URL}/download/${mkl_TAG}/${mkl_MAC}) + #TODO need more information +endif () + +ExternalProject_Add(mkl + PREFIX mkl + URL ${mkl_DOWNLOAD_URL} + 
DOWNLOAD_DIR "${DOWNLOAD_LOCATION}" + UPDATE_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND "") + +# put mkl dynamic libraries in one bin directory +add_custom_target(mkl_create_destination_dir + COMMAND ${CMAKE_COMMAND} -E make_directory ${mkl_BIN_DIRS} + DEPENDS mkl) + +add_custom_target(mkl_copy_shared_to_destination DEPENDS mkl_create_destination_dir) + +foreach(dll_file ${mkl_SHARED_LIBRARIES}) + add_custom_command(TARGET mkl_copy_shared_to_destination PRE_BUILD + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${dll_file} ${mkl_BIN_DIRS}) +endforeach() diff --git a/tensorflow/contrib/cmake/external/mkldnn.cmake b/tensorflow/contrib/cmake/external/mkldnn.cmake index a639fdee36..8123ee1f39 100644 --- a/tensorflow/contrib/cmake/external/mkldnn.cmake +++ b/tensorflow/contrib/cmake/external/mkldnn.cmake @@ -22,8 +22,11 @@ set(mkldnn_TAG 3063b2e4c943983f6bf5f2fb9a490d4a998cd291) if(WIN32) if(${CMAKE_GENERATOR} MATCHES "Visual Studio.*") set(mkldnn_STATIC_LIBRARIES ${CMAKE_CURRENT_BINARY_DIR}/mkldnn/src/mkldnn/src/Release/mkldnn.lib) + set(mkldnn_SHARED_LIBRARIES ${CMAKE_CURRENT_BINARY_DIR}/mkldnn/src/mkldnn/src/Release/mkldnn.dll) + set(mkldnn_BUILD ${CMAKE_CURRENT_BINARY_DIR}/mkldnn/src/mkldnn/src/Release) else() set(mkldnn_STATIC_LIBRARIES ${CMAKE_CURRENT_BINARY_DIR}/mkldnn/src/mkldnn/src/mkldnn.lib) + set(mkldnn_SHARED_LIBRARIES ${CMAKE_CURRENT_BINARY_DIR}/mkldnn/src/mkldnn/src/mkldnn.dll) endif() else() set(mkldnn_STATIC_LIBRARIES ${CMAKE_CURRENT_BINARY_DIR}/mkldnn/src/mkldnn/src/libmkldnn.a) @@ -31,6 +34,7 @@ endif() ExternalProject_Add(mkldnn PREFIX mkldnn + DEPENDS mkl GIT_REPOSITORY ${mkldnn_URL} GIT_TAG ${mkldnn_TAG} DOWNLOAD_DIR "${DOWNLOAD_LOCATION}" @@ -40,5 +44,11 @@ ExternalProject_Add(mkldnn CMAKE_CACHE_ARGS -DCMAKE_BUILD_TYPE:STRING=Release -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF - -DMKLINC:STRING=${MKL_INCLUDE_DIRS} + -DMKLINC:STRING=${mkl_INCLUDE_DIRS} ) + +# since mkldnn depends on mkl, copy the mkldnn.dll together with mklml.dll 
to mkl_bin_dirs +add_custom_target(mkldnn_copy_shared_to_destination DEPENDS mkldnn) + +add_custom_command(TARGET mkldnn_copy_shared_to_destination PRE_BUILD + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${mkldnn_SHARED_LIBRARIES} ${mkl_BIN_DIRS}) diff --git a/tensorflow/contrib/cmake/tf_python.cmake b/tensorflow/contrib/cmake/tf_python.cmake index df6702a42c..e3b59001bc 100755 --- a/tensorflow/contrib/cmake/tf_python.cmake +++ b/tensorflow/contrib/cmake/tf_python.cmake @@ -755,26 +755,65 @@ set(api_init_list_file "${tensorflow_source_dir}/api_init_files_list.txt") file(WRITE "${api_init_list_file}" "${api_init_files}") # Run create_python_api.py to generate __init__.py files. -add_custom_command( - OUTPUT ${api_init_files} - DEPENDS tf_python_ops tf_python_copy_scripts_to_destination pywrap_tensorflow_internal tf_python_touchup_modules tf_extension_ops - - # tensorflow/__init__.py depends on files generated in this step. So, remove it while - # this step is running since the files aren't there yet. - COMMAND ${CMAKE_COMMAND} -E remove -f ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/__init__.py - - # Run create_python_api.py to generate API init files. - COMMAND ${CMAKE_COMMAND} -E env PYTHONPATH=${CMAKE_CURRENT_BINARY_DIR}/tf_python ${PYTHON_EXECUTABLE} - "${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/tools/api/generator/create_python_api.py" - "--root_init_template=${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/api_template.__init__.py" - "--apidir=${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow" - "--package=tensorflow.python" - "--apiname=tensorflow" - "${api_init_list_file}" - COMMENT "Generating __init__.py files for Python API." - WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/tf_python" -) +### TODO +# In order to download and compile MKL/MKL-DNN automatically in cmake script, mkl-built libraries should be added to system path +# to be loaded by python executor. 
However `add_custom_command` has an issue with `COMMAND ${CMAKE_COMMAND} -E env PATH=`, where +# arguments of multiple paths (such as D:/;D:/mkl) will be parsed in to seperate string without semicolon and that command fail to +# recongnize paths. As CUDA isn't built with MKL, the MKL built directory is the only path to this command to work around that issue. +# To not override the CUDA and system path in other circumstances, `if-else` branch used here to handle this problem, +# and should be removed if the path issue can be resolved. +### + +if (tensorflow_ENABLE_MKL_SUPPORT) + # add mkl dist dlls to system path for python + # TODO: In current cmake version, PY_RUNTIME_ENV behaves strange with multiple paths, + # so we have to specify only one path in it to work around the issue. We need this if/else + # to protect overwriting CUDA environments + set(PY_RUNTIME_ENV ${mkl_BIN_DIRS}) + add_custom_command( + OUTPUT ${api_init_files} + DEPENDS tf_python_ops tf_python_copy_scripts_to_destination pywrap_tensorflow_internal tf_python_touchup_modules tf_extension_ops + + # tensorflow/__init__.py depends on files generated in this step. So, remove it while + # this step is running since the files aren't there yet. + COMMAND ${CMAKE_COMMAND} -E remove -f ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/__init__.py + + # Run create_python_api.py to generate API init files. + COMMAND ${CMAKE_COMMAND} -E env PYTHONPATH=${CMAKE_CURRENT_BINARY_DIR}/tf_python PATH=${PY_RUNTIME_ENV} ${PYTHON_EXECUTABLE} + "${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/tools/api/generator/create_python_api.py" + "--root_init_template=${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/api_template.__init__.py" + "--apidir=${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow" + "--package=tensorflow.python" + "--apiname=tensorflow" + "${api_init_list_file}" + + COMMENT "Generating __init__.py files for Python API." 
+ WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/tf_python" + VERBATIM + ) +else (tensorflow_ENABLE_MKL_SUPPORT) + add_custom_command( + OUTPUT ${api_init_files} + DEPENDS tf_python_ops tf_python_copy_scripts_to_destination pywrap_tensorflow_internal tf_python_touchup_modules tf_extension_ops + + # tensorflow/__init__.py depends on files generated in this step. So, remove it while + # this step is running since the files aren't there yet. + COMMAND ${CMAKE_COMMAND} -E remove -f ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/__init__.py + + # Run create_python_api.py to generate API init files. + COMMAND ${CMAKE_COMMAND} -E env PYTHONPATH=${CMAKE_CURRENT_BINARY_DIR}/tf_python ${PYTHON_EXECUTABLE} + "${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/tools/api/generator/create_python_api.py" + "--root_init_template=${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/api_template.__init__.py" + "--apidir=${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow" + "--package=tensorflow.python" + "--apiname=tensorflow" + "${api_init_list_file}" + + COMMENT "Generating __init__.py files for Python API." 
+ WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/tf_python" + ) +endif (tensorflow_ENABLE_MKL_SUPPORT) add_custom_target(tf_python_api SOURCES ${api_init_files}) add_dependencies(tf_python_api tf_python_ops) diff --git a/tensorflow/contrib/cmake/tf_shared_lib.cmake b/tensorflow/contrib/cmake/tf_shared_lib.cmake index 38f40452b5..fdf522f1fd 100644 --- a/tensorflow/contrib/cmake/tf_shared_lib.cmake +++ b/tensorflow/contrib/cmake/tf_shared_lib.cmake @@ -145,3 +145,8 @@ install(DIRECTORY ${tensorflow_source_dir}/third_party/eigen3/ # unsupported Eigen directory install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/eigen/src/eigen/unsupported/Eigen/ DESTINATION include/unsupported/Eigen) +# mkl +if (tensorflow_ENABLE_MKL_SUPPORT) + install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/mkl/src/mkl/include/ + DESTINATION include/mkl) +endif (tensorflow_ENABLE_MKL_SUPPORT) diff --git a/tensorflow/contrib/constrained_optimization/README.md b/tensorflow/contrib/constrained_optimization/README.md index c65a150464..cb1dd7d836 100644 --- a/tensorflow/contrib/constrained_optimization/README.md +++ b/tensorflow/contrib/constrained_optimization/README.md @@ -46,7 +46,7 @@ document. Imagine that we want to constrain the recall of a binary classifier to be at least 90%. Since the recall is proportional to the number of true positive classifications, which itself is a sum of indicator functions, this constraint -is non-differentible, and therefore cannot be used in a problem that will be +is non-differentiable, and therefore cannot be used in a problem that will be optimized using a (stochastic) gradient-based algorithm. 
For this and similar problems, TFCO supports so-called *proxy constraints*, diff --git a/tensorflow/contrib/constrained_optimization/python/swap_regret_optimizer.py b/tensorflow/contrib/constrained_optimization/python/swap_regret_optimizer.py index 04014ab4ae..3791dae8d7 100644 --- a/tensorflow/contrib/constrained_optimization/python/swap_regret_optimizer.py +++ b/tensorflow/contrib/constrained_optimization/python/swap_regret_optimizer.py @@ -169,8 +169,8 @@ def _project_stochastic_matrix_wrt_euclidean_norm(matrix): del old_inactive # Needed by the condition, but not the body. iteration += 1 scale = (1.0 - standard_ops.reduce_sum( - matrix, axis=0, keep_dims=True)) / standard_ops.maximum( - 1.0, standard_ops.reduce_sum(inactive, axis=0, keep_dims=True)) + matrix, axis=0, keepdims=True)) / standard_ops.maximum( + 1.0, standard_ops.reduce_sum(inactive, axis=0, keepdims=True)) matrix += scale * inactive new_inactive = standard_ops.to_float(matrix > 0) matrix *= new_inactive @@ -206,10 +206,10 @@ def _project_log_stochastic_matrix_wrt_kl_divergence(log_matrix): # For numerical reasons, make sure that the largest matrix element is zero # before exponentiating. 
- log_matrix -= standard_ops.reduce_max(log_matrix, axis=0, keep_dims=True) + log_matrix -= standard_ops.reduce_max(log_matrix, axis=0, keepdims=True) log_matrix -= standard_ops.log( standard_ops.reduce_sum( - standard_ops.exp(log_matrix), axis=0, keep_dims=True)) + standard_ops.exp(log_matrix), axis=0, keepdims=True)) return log_matrix diff --git a/tensorflow/contrib/data/python/kernel_tests/slide_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/slide_dataset_op_test.py index 33c48e20be..5590a4bf78 100644 --- a/tensorflow/contrib/data/python/kernel_tests/slide_dataset_op_test.py +++ b/tensorflow/contrib/data/python/kernel_tests/slide_dataset_op_test.py @@ -58,6 +58,7 @@ class SlideDatasetTest(test.TestCase): [t.shape.as_list() for t in get_next]) with self.test_session() as sess: + # stride < window_size. # Slide over a finite input, where the window_size divides the # total number of elements. sess.run(init_op, feed_dict={count: 20, window_size: 14, stride: 7}) @@ -71,11 +72,9 @@ class SlideDatasetTest(test.TestCase): result_component[j]) with self.assertRaises(errors.OutOfRangeError): sess.run(get_next) - # Slide over a finite input, where the window_size does not # divide the total number of elements. sess.run(init_op, feed_dict={count: 20, window_size: 17, stride: 9}) - num_batches = (20 * 7 - 17) // 9 + 1 for i in range(num_batches): result = sess.run(get_next) @@ -86,6 +85,41 @@ class SlideDatasetTest(test.TestCase): with self.assertRaises(errors.OutOfRangeError): sess.run(get_next) + # stride == window_size. + sess.run(init_op, feed_dict={count: 20, window_size: 14, stride: 14}) + num_batches = 20 * 7 // 14 + for i in range(num_batches): + result = sess.run(get_next) + for component, result_component in zip(components, result): + for j in range(14): + self.assertAllEqual(component[(i*14 + j) % 7]**2, + result_component[j]) + with self.assertRaises(errors.OutOfRangeError): + sess.run(get_next) + + # stride > window_size. 
+ sess.run(init_op, feed_dict={count: 20, window_size: 10, stride: 14}) + num_batches = 20 * 7 // 14 + for i in range(num_batches): + result = sess.run(get_next) + for component, result_component in zip(components, result): + for j in range(10): + self.assertAllEqual(component[(i*14 + j) % 7]**2, + result_component[j]) + with self.assertRaises(errors.OutOfRangeError): + sess.run(get_next) + # Drop the last batch which is smaller than window_size. + sess.run(init_op, feed_dict={count: 20, window_size: 14, stride: 19}) + num_batches = (20 * 7 - 7) // 19 # = 19 * 7 // 19 + for i in range(num_batches): + result = sess.run(get_next) + for component, result_component in zip(components, result): + for j in range(14): + self.assertAllEqual(component[(i*19 + j) % 7]**2, + result_component[j]) + with self.assertRaises(errors.OutOfRangeError): + sess.run(get_next) + # Slide over a finite input, which is less than window_size, # should fail straight away. sess.run(init_op, feed_dict={count: 1, window_size: 10, stride: 4}) @@ -108,10 +142,6 @@ class SlideDatasetTest(test.TestCase): # Invalid stride should be an initialization time error. with self.assertRaises(errors.InvalidArgumentError): sess.run(init_op, feed_dict={count: 14, window_size: 3, stride: 0}) - with self.assertRaises(errors.InvalidArgumentError): - sess.run(init_op, feed_dict={count: 14, window_size: 3, stride: 3}) - with self.assertRaises(errors.InvalidArgumentError): - sess.run(init_op, feed_dict={count: 14, window_size: 3, stride: 5}) def assertSparseValuesEqual(self, a, b): self.assertAllEqual(a.indices, b.indices) diff --git a/tensorflow/contrib/data/python/ops/sliding.py b/tensorflow/contrib/data/python/ops/sliding.py index f935beb1a9..3f3c5ca17c 100644 --- a/tensorflow/contrib/data/python/ops/sliding.py +++ b/tensorflow/contrib/data/python/ops/sliding.py @@ -86,7 +86,7 @@ def sliding_window_batch(window_size, stride=1): elements in the sliding window. stride: (Optional.) 
A `tf.int64` scalar `tf.Tensor`, representing the steps moving the sliding window forward for one iteration. The default - is `1`. It must be in `[1, window_size)`. + is `1`. It must be positive. Returns: A `Dataset` transformation function, which can be passed to diff --git a/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb b/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb new file mode 100644 index 0000000000..54ebcad8e9 --- /dev/null +++ b/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb @@ -0,0 +1,909 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "nmt_with_attention.ipynb", + "version": "0.3.2", + "views": {}, + "default_view": {}, + "provenance": [ + { + "file_id": "1C4fpM7_7IL8ZzF7Gc5abywqQjeQNS2-U", + "timestamp": 1527858391290 + }, + { + "file_id": "1pExo6aUuw0S6MISFWoinfJv0Ftm9V4qv", + "timestamp": 1527776041613 + } + ], + "private_outputs": true, + "collapsed_sections": [], + "toc_visible": true + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "accelerator": "GPU" + }, + "cells": [ + { + "metadata": { + "id": "AOpGoE2T-YXS", + "colab_type": "text" + }, + "cell_type": "markdown", + "source": [ + "##### Copyright 2018 The TensorFlow Authors.\n", + "\n", + "Licensed under the Apache License, Version 2.0 (the \"License\").\n", + "\n", + "# Neural Machine Translation with Attention\n", + "\n", + "<table align=\"left\"><td>\n", + "<a target=\"_blank\" href=\"https://colab.sandbox.google.com/github/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb\">\n", + " <img src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" />Run in Google Colab</a> \n", + "</td><td>\n", + "<a target=\"_blank\" 
href=\"https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb\"><img width=32px src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" />View source on Github</a></td></table>" + ] + }, + { + "metadata": { + "id": "CiwtNgENbx2g", + "colab_type": "text" + }, + "cell_type": "markdown", + "source": [ + "This notebook trains a sequence to sequence (seq2seq) model for Spanish to English translation using [tf.keras](https://www.tensorflow.org/programmers_guide/keras) and [eager execution](https://www.tensorflow.org/programmers_guide/eager). This is an advanced example that assumes some knowledge of sequence to sequence models.\n", + "\n", + "After training the model in this notebook, you will be able to input a Spanish sentence, such as *\"¿todavia estan en casa?\"*, and return the English translation: *\"are you still at home?\"*\n", + "\n", + "The translation quality is reasonable for a toy example, but the generated attention plot is perhaps more interesting. This shows which parts of the input sentence have the model's attention while translating:\n", + "\n", + "<img src=\"https://tensorflow.org/images/spanish-english.png\" alt=\"spanish-english attention plot\">\n", + "\n", + "Note: This example takes approximately 10 minutes to run on a single P100 GPU." 
+ ] + }, + { + "metadata": { + "id": "tnxXKDjq3jEL", + "colab_type": "code", + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } + } + }, + "cell_type": "code", + "source": [ + "from __future__ import absolute_import, division, print_function\n", + "\n", + "# Import TensorFlow >= 1.9 and enable eager execution\n", + "import tensorflow as tf\n", + "\n", + "tf.enable_eager_execution()\n", + "\n", + "import matplotlib.pyplot as plt\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "import unicodedata\n", + "import re\n", + "import numpy as np\n", + "import os\n", + "import time\n", + "\n", + "print(tf.__version__)" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "wfodePkj3jEa", + "colab_type": "text" + }, + "cell_type": "markdown", + "source": [ + "## Download and prepare the dataset\n", + "\n", + "We'll use a language dataset provided by http://www.manythings.org/anki/. This dataset contains language translation pairs in the format:\n", + "\n", + "```\n", + "May I borrow this book?\t¿Puedo tomar prestado este libro?\n", + "```\n", + "\n", + "There are a variety of languages available, but we'll use the English-Spanish dataset. For convenience, we've hosted a copy of this dataset on Google Cloud, but you can also download your own copy. After downloading the dataset, here are the steps we'll take to prepare the data:\n", + "\n", + "1. Add a *start* and *end* token to each sentence.\n", + "2. Clean the sentences by removing special characters.\n", + "3. Create a word index and reverse word index (dictionaries mapping from word → id and id → word).\n", + "4. Pad each sentence to a maximum length." 
+ ] + }, + { + "metadata": { + "id": "kRVATYOgJs1b", + "colab_type": "code", + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } + } + }, + "cell_type": "code", + "source": [ + "# Download the file\n", + "path_to_zip = tf.keras.utils.get_file(\n", + " 'spa-eng.zip', origin='http://download.tensorflow.org/data/spa-eng.zip', \n", + " extract=True)\n", + "\n", + "path_to_file = os.path.dirname(path_to_zip)+\"/spa-eng/spa.txt\"" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "rd0jw-eC3jEh", + "colab_type": "code", + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } + } + }, + "cell_type": "code", + "source": [ + "# Converts the unicode file to ascii\n", + "def unicode_to_ascii(s):\n", + " return ''.join(c for c in unicodedata.normalize('NFD', s)\n", + " if unicodedata.category(c) != 'Mn')\n", + "\n", + "\n", + "def preprocess_sentence(w):\n", + " w = unicode_to_ascii(w.lower().strip())\n", + " \n", + " # creating a space between a word and the punctuation following it\n", + " # eg: \"he is a boy.\" => \"he is a boy .\" \n", + " # Reference:- https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation\n", + " w = re.sub(r\"([?.!,¿])\", r\" \\1 \", w)\n", + " w = re.sub(r'[\" \"]+', \" \", w)\n", + " \n", + " # replacing everything with space except (a-z, A-Z, \".\", \"?\", \"!\", \",\")\n", + " w = re.sub(r\"[^a-zA-Z?.!,¿]+\", \" \", w)\n", + " \n", + " w = w.rstrip().strip()\n", + " \n", + " # adding a start and an end token to the sentence\n", + " # so that the model know when to start and stop predicting.\n", + " w = '<start> ' + w + ' <end>'\n", + " return w" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "OHn4Dct23jEm", + "colab_type": "code", + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } + } + }, + "cell_type": "code", + "source": [ + "# 1. Remove the accents\n", + "# 2. 
Clean the sentences\n", + "# 3. Return word pairs in the format: [ENGLISH, SPANISH]\n", + "def create_dataset(path, num_examples):\n", + " lines = open(path, encoding='UTF-8').read().strip().split('\\n')\n", + " \n", + " word_pairs = [[preprocess_sentence(w) for w in l.split('\\t')] for l in lines[:num_examples]]\n", + " \n", + " return word_pairs" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "9xbqO7Iie9bb", + "colab_type": "code", + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } + } + }, + "cell_type": "code", + "source": [ + "# This class creates a word -> index mapping (e.g,. \"dad\" -> 5) and vice-versa \n", + "# (e.g., 5 -> \"dad\") for each language,\n", + "class LanguageIndex():\n", + " def __init__(self, lang):\n", + " self.lang = lang\n", + " self.word2idx = {}\n", + " self.idx2word = {}\n", + " self.vocab = set()\n", + " \n", + " self.create_index()\n", + " \n", + " def create_index(self):\n", + " for phrase in self.lang:\n", + " self.vocab.update(phrase.split(' '))\n", + " \n", + " self.vocab = sorted(self.vocab)\n", + " \n", + " self.word2idx['<pad>'] = 0\n", + " for index, word in enumerate(self.vocab):\n", + " self.word2idx[word] = index + 1\n", + " \n", + " for word, index in self.word2idx.items():\n", + " self.idx2word[index] = word" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "eAY9k49G3jE_", + "colab_type": "code", + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } + } + }, + "cell_type": "code", + "source": [ + "def max_length(tensor):\n", + " return max(len(t) for t in tensor)\n", + "\n", + "\n", + "def load_dataset(path, num_examples):\n", + " # creating cleaned input, output pairs\n", + " pairs = create_dataset(path, num_examples)\n", + "\n", + " # index language using the class defined above \n", + " inp_lang = LanguageIndex(sp for en, sp in pairs)\n", + " targ_lang = LanguageIndex(en for en, sp in pairs)\n", + " \n", + " # 
Vectorize the input and target languages\n", + " \n", + " # Spanish sentences\n", + " input_tensor = [[inp_lang.word2idx[s] for s in sp.split(' ')] for en, sp in pairs]\n", + " \n", + " # English sentences\n", + " target_tensor = [[targ_lang.word2idx[s] for s in en.split(' ')] for en, sp in pairs]\n", + " \n", + " # Calculate max_length of input and output tensor\n", + " # Here, we'll set those to the longest sentence in the dataset\n", + " max_length_inp, max_length_tar = max_length(input_tensor), max_length(target_tensor)\n", + " \n", + " # Padding the input and output tensor to the maximum length\n", + " input_tensor = tf.keras.preprocessing.sequence.pad_sequences(input_tensor, \n", + " maxlen=max_length_inp,\n", + " padding='post')\n", + " \n", + " target_tensor = tf.keras.preprocessing.sequence.pad_sequences(target_tensor, \n", + " maxlen=max_length_tar, \n", + " padding='post')\n", + " \n", + " return input_tensor, target_tensor, inp_lang, targ_lang, max_length_inp, max_length_tar" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "GOi42V79Ydlr", + "colab_type": "text" + }, + "cell_type": "markdown", + "source": [ + "### Limit the size of the dataset to experiment faster (optional)\n", + "\n", + "Training on the complete dataset of >100,000 sentences will take a long time. 
To train faster, we can limit the size of the dataset to 30,000 sentences (of course, translation quality degrades with less data):" + ] + }, + { + "metadata": { + "id": "cnxC7q-j3jFD", + "colab_type": "code", + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } + } + }, + "cell_type": "code", + "source": [ + "# Try experimenting with the size of that dataset\n", + "num_examples = 30000\n", + "input_tensor, target_tensor, inp_lang, targ_lang, max_length_inp, max_length_targ = load_dataset(path_to_file, num_examples)" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "4QILQkOs3jFG", + "colab_type": "code", + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } + } + }, + "cell_type": "code", + "source": [ + "# Creating training and validation sets using an 80-20 split\n", + "input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor, target_tensor, test_size=0.2)\n", + "\n", + "# Show length\n", + "len(input_tensor_train), len(target_tensor_train), len(input_tensor_val), len(target_tensor_val)" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "rgCLkfv5uO3d", + "colab_type": "text" + }, + "cell_type": "markdown", + "source": [ + "### Create a tf.data dataset" + ] + }, + { + "metadata": { + "id": "TqHsArVZ3jFS", + "colab_type": "code", + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } + } + }, + "cell_type": "code", + "source": [ + "BUFFER_SIZE = len(input_tensor_train)\n", + "BATCH_SIZE = 64\n", + "embedding_dim = 256\n", + "units = 1024\n", + "vocab_inp_size = len(inp_lang.word2idx)\n", + "vocab_tar_size = len(targ_lang.word2idx)\n", + "\n", + "dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)\n", + "dataset = dataset.apply(tf.contrib.data.batch_and_drop_remainder(BATCH_SIZE))" + ], + "execution_count": 0, + "outputs": [] + }, + { + 
"metadata": { + "id": "TNfHIF71ulLu", + "colab_type": "text" + }, + "cell_type": "markdown", + "source": [ + "## Write the encoder and decoder model\n", + "\n", + "Here, we'll implement an encoder-decoder model with attention which you can read about in the TensorFlow [Neural Machine Translation (seq2seq) tutorial](https://www.tensorflow.org/tutorials/seq2seq). This example uses a more recent set of APIs. This notebook implements the [attention equations](https://www.tensorflow.org/tutorials/seq2seq#background_on_the_attention_mechanism) from the seq2seq tutorial. The following diagram shows that each input word is assigned a weight by the attention mechanism which is then used by the decoder to predict the next word in the sentence.\n", + "\n", + "<img src=\"https://www.tensorflow.org/images/seq2seq/attention_mechanism.jpg\" width=\"500\" alt=\"attention mechanism\">\n", + "\n", + "The input is put through an encoder model which gives us the encoder output of shape *(batch_size, max_length, hidden_size)* and the encoder hidden state of shape *(batch_size, hidden_size)*. \n", + "\n", + "Here are the equations that are implemented:\n", + "\n", + "<img src=\"https://www.tensorflow.org/images/seq2seq/attention_equation_0.jpg\" alt=\"attention equation 0\" width=\"800\">\n", + "<img src=\"https://www.tensorflow.org/images/seq2seq/attention_equation_1.jpg\" alt=\"attention equation 1\" width=\"800\">\n", + "\n", + "We're using *Bahdanau attention*. Let's decide on notation before writing the simplified form:\n", + "\n", + "* FC = Fully connected (dense) layer\n", + "* EO = Encoder output\n", + "* H = hidden state\n", + "* X = input to the decoder\n", + "\n", + "And the pseudo-code:\n", + "\n", + "* `score = FC(tanh(FC(EO) + FC(H)))`\n", + "* `attention weights = softmax(score, axis = 1)`. Softmax by default is applied on the last axis but here we want to apply it on the *1st axis*, since the shape of score is *(batch_size, max_length, hidden_size)*. 
`Max_length` is the length of our input. Since we are trying to assign a weight to each input, softmax should be applied on that axis.\n", + "* `context vector = sum(attention weights * EO, axis = 1)`. Same reason as above for choosing axis as 1.\n", + "* `embedding output` = The input to the decoder X is passed through an embedding layer.\n", + "* `merged vector = concat(embedding output, context vector)`\n", + "* This merged vector is then given to the GRU\n", + " \n", + "The shapes of all the vectors at each step have been specified in the comments in the code:" + ] + }, + { + "metadata": { + "id": "avyJ_4VIUoHb", + "colab_type": "code", + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } + } + }, + "cell_type": "code", + "source": [ + "def gru(units):\n", + " # If you have a GPU, we recommend using CuDNNGRU(provides a 3x speedup than GRU)\n", + " # the code automatically does that.\n", + " if tf.test.is_gpu_available():\n", + " return tf.keras.layers.CuDNNGRU(units, \n", + " return_sequences=True, \n", + " return_state=True, \n", + " recurrent_initializer='glorot_uniform')\n", + " else:\n", + " return tf.keras.layers.GRU(units, \n", + " return_sequences=True, \n", + " return_state=True, \n", + " recurrent_activation='sigmoid', \n", + " recurrent_initializer='glorot_uniform')" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "nZ2rI24i3jFg", + "colab_type": "code", + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } + } + }, + "cell_type": "code", + "source": [ + "class Encoder(tf.keras.Model):\n", + " def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):\n", + " super(Encoder, self).__init__()\n", + " self.batch_sz = batch_sz\n", + " self.enc_units = enc_units\n", + " self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)\n", + " self.gru = gru(self.enc_units)\n", + " \n", + " def call(self, x, hidden):\n", + " x = self.embedding(x)\n", + " output, state = 
self.gru(x, initial_state = hidden) \n", + " return output, state\n", + " \n", + " def initialize_hidden_state(self):\n", + " return tf.zeros((self.batch_sz, self.enc_units))" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "yJ_B3mhW3jFk", + "colab_type": "code", + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } + } + }, + "cell_type": "code", + "source": [ + "class Decoder(tf.keras.Model):\n", + " def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):\n", + " super(Decoder, self).__init__()\n", + " self.batch_sz = batch_sz\n", + " self.dec_units = dec_units\n", + " self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)\n", + " self.gru = gru(self.dec_units)\n", + " self.fc = tf.keras.layers.Dense(vocab_size)\n", + " \n", + " # used for attention\n", + " self.W1 = tf.keras.layers.Dense(self.dec_units)\n", + " self.W2 = tf.keras.layers.Dense(self.dec_units)\n", + " self.V = tf.keras.layers.Dense(1)\n", + " \n", + " def call(self, x, hidden, enc_output):\n", + " # enc_output shape == (batch_size, max_length, hidden_size)\n", + " \n", + " # hidden shape == (batch_size, hidden size)\n", + " # hidden_with_time_axis shape == (batch_size, 1, hidden size)\n", + " # we are doing this to perform addition to calculate the score\n", + " hidden_with_time_axis = tf.expand_dims(hidden, 1)\n", + " \n", + " # score shape == (batch_size, max_length, hidden_size)\n", + " score = tf.nn.tanh(self.W1(enc_output) + self.W2(hidden_with_time_axis))\n", + " \n", + " # attention_weights shape == (batch_size, max_length, 1)\n", + " # we get 1 at the last axis because we are applying score to self.V\n", + " attention_weights = tf.nn.softmax(self.V(score), axis=1)\n", + " \n", + " # context_vector shape after sum == (batch_size, hidden_size)\n", + " context_vector = attention_weights * enc_output\n", + " context_vector = tf.reduce_sum(context_vector, axis=1)\n", + " \n", + " # x shape after passing through 
embedding == (batch_size, 1, embedding_dim)\n", + " x = self.embedding(x)\n", + " \n", + " # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)\n", + " x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)\n", + " \n", + " # passing the concatenated vector to the GRU\n", + " output, state = self.gru(x)\n", + " \n", + " # output shape == (batch_size * max_length, hidden_size)\n", + " output = tf.reshape(output, (-1, output.shape[2]))\n", + " \n", + " # output shape == (batch_size * max_length, vocab)\n", + " x = self.fc(output)\n", + " \n", + " return x, state, attention_weights\n", + " \n", + " def initialize_hidden_state(self):\n", + " return tf.zeros((self.batch_sz, self.dec_units))" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "P5UY8wko3jFp", + "colab_type": "code", + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } + } + }, + "cell_type": "code", + "source": [ + "encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)\n", + "decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "_ch_71VbIRfK", + "colab_type": "text" + }, + "cell_type": "markdown", + "source": [ + "## Define the optimizer and the loss function" + ] + }, + { + "metadata": { + "id": "WmTHr5iV3jFr", + "colab_type": "code", + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } + } + }, + "cell_type": "code", + "source": [ + "optimizer = tf.train.AdamOptimizer()\n", + "\n", + "\n", + "def loss_function(real, pred):\n", + " mask = 1 - np.equal(real, 0)\n", + " loss_ = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=real, logits=pred) * mask\n", + " return tf.reduce_mean(loss_)" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "hpObfY22IddU", + "colab_type": "text" + }, + "cell_type": "markdown", + "source": [ + "## Training\n", + "\n", + "1. 
Pass the *input* through the *encoder* which returns *encoder output* and the *encoder hidden state*.\n", + "2. The encoder output, encoder hidden state and the decoder input (which is the *start token*) are passed to the decoder.\n", + "3. The decoder returns the *predictions* and the *decoder hidden state*.\n", + "4. The decoder hidden state is then passed back into the model and the predictions are used to calculate the loss.\n", + "5. Use *teacher forcing* to decide the next input to the decoder.\n", + "6. *Teacher forcing* is the technique where the *target word* is passed as the *next input* to the decoder.\n", + "7. The final step is to calculate the gradients and apply it to the optimizer and backpropagate." + ] + }, + { + "metadata": { + "id": "ddefjBMa3jF0", + "colab_type": "code", + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } + } + }, + "cell_type": "code", + "source": [ + "EPOCHS = 10\n", + "\n", + "for epoch in range(EPOCHS):\n", + " start = time.time()\n", + " \n", + " hidden = encoder.initialize_hidden_state()\n", + " total_loss = 0\n", + " \n", + " for (batch, (inp, targ)) in enumerate(dataset):\n", + " loss = 0\n", + " \n", + " with tf.GradientTape() as tape:\n", + " enc_output, enc_hidden = encoder(inp, hidden)\n", + " \n", + " dec_hidden = enc_hidden\n", + " \n", + " dec_input = tf.expand_dims([targ_lang.word2idx['<start>']] * BATCH_SIZE, 1) \n", + " \n", + " # Teacher forcing - feeding the target as the next input\n", + " for t in range(1, targ.shape[1]):\n", + " # passing enc_output to the decoder\n", + " predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)\n", + " \n", + " loss += loss_function(targ[:, t], predictions)\n", + " \n", + " # using teacher forcing\n", + " dec_input = tf.expand_dims(targ[:, t], 1)\n", + " \n", + " total_loss += (loss / int(targ.shape[1]))\n", + " \n", + " variables = encoder.variables + decoder.variables\n", + " \n", + " gradients = tape.gradient(loss, variables)\n", + " 
\n", + " optimizer.apply_gradients(zip(gradients, variables), tf.train.get_or_create_global_step())\n", + "\n", + " if batch % 100 == 0:\n", + " print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,\n", + " batch,\n", + " loss.numpy() / int(targ.shape[1])))\n", + " \n", + " print('Epoch {} Loss {:.4f}'.format(epoch + 1,\n", + " total_loss/len(input_tensor)))\n", + " print('Time taken for 1 epoch {} sec\\n'.format(time.time() - start))" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "mU3Ce8M6I3rz", + "colab_type": "text" + }, + "cell_type": "markdown", + "source": [ + "## Translate\n", + "\n", + "* The evaluate function is similar to the training loop, except we don't use *teacher forcing* here. The input to the decoder at each time step is its previous predictions along with the hidden state and the encoder output.\n", + "* Stop predicting when the model predicts the *end token*.\n", + "* And store the *attention weights for every time step*.\n", + "\n", + "Note: The encoder output is calculated only once for one input." 
+ ] + }, + { + "metadata": { + "id": "EbQpyYs13jF_", + "colab_type": "code", + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } + } + }, + "cell_type": "code", + "source": [ + "def evaluate(sentence, encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ):\n", + " attention_plot = np.zeros((max_length_targ, max_length_inp))\n", + " \n", + " sentence = preprocess_sentence(sentence)\n", + "\n", + " inputs = [inp_lang.word2idx[i] for i in sentence.split(' ')]\n", + " inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs], maxlen=max_length_inp, padding='post')\n", + " inputs = tf.convert_to_tensor(inputs)\n", + " \n", + " result = ''\n", + "\n", + " hidden = [tf.zeros((1, units))]\n", + " enc_out, enc_hidden = encoder(inputs, hidden)\n", + "\n", + " dec_hidden = enc_hidden\n", + " dec_input = tf.expand_dims([targ_lang.word2idx['<start>']], 0)\n", + "\n", + " for t in range(max_length_targ):\n", + " predictions, dec_hidden, attention_weights = decoder(dec_input, dec_hidden, enc_out)\n", + " \n", + " # storing the attention weigths to plot later on\n", + " attention_weights = tf.reshape(attention_weights, (-1, ))\n", + " attention_plot[t] = attention_weights.numpy()\n", + "\n", + " predicted_id = tf.multinomial(tf.exp(predictions), num_samples=1)[0][0].numpy()\n", + "\n", + " result += targ_lang.idx2word[predicted_id] + ' '\n", + "\n", + " if targ_lang.idx2word[predicted_id] == '<end>':\n", + " return result, sentence, attention_plot\n", + " \n", + " # the predicted ID is fed back into the model\n", + " dec_input = tf.expand_dims([predicted_id], 0)\n", + "\n", + " return result, sentence, attention_plot" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "s5hQWlbN3jGF", + "colab_type": "code", + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } + } + }, + "cell_type": "code", + "source": [ + "# function for plotting the attention weights\n", + "def plot_attention(attention, 
sentence, predicted_sentence):\n", + " fig = plt.figure(figsize=(10,10))\n", + " ax = fig.add_subplot(1, 1, 1)\n", + " ax.matshow(attention, cmap='viridis')\n", + " \n", + " fontdict = {'fontsize': 14}\n", + " \n", + " ax.set_xticklabels([''] + sentence, fontdict=fontdict, rotation=90)\n", + " ax.set_yticklabels([''] + predicted_sentence, fontdict=fontdict)\n", + "\n", + " plt.show()" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "sl9zUHzg3jGI", + "colab_type": "code", + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } + } + }, + "cell_type": "code", + "source": [ + "def translate(sentence, encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ):\n", + " result, sentence, attention_plot = evaluate(sentence, encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ)\n", + " \n", + " print('Input: {}'.format(sentence))\n", + " print('Predicted translation: {}'.format(result))\n", + " \n", + " attention_plot = attention_plot[:len(result.split(' ')), :len(sentence.split(' '))]\n", + " plot_attention(attention_plot, sentence.split(' '), result.split(' '))" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "WrAM0FDomq3E", + "colab_type": "code", + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } + } + }, + "cell_type": "code", + "source": [ + "translate('hace mucho frio aqui.', encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ)" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "zSx2iM36EZQZ", + "colab_type": "code", + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } + } + }, + "cell_type": "code", + "source": [ + "translate('esta es mi vida.', encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ)" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "A3LLCx3ZE0Ls", + "colab_type": "code", + "colab": { + "autoexec": { + 
"startup": false, + "wait_interval": 0 + } + } + }, + "cell_type": "code", + "source": [ + "translate('¿todavia estan en casa?', encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ)" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "DUQVLVqUE1YW", + "colab_type": "code", + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } + } + }, + "cell_type": "code", + "source": [ + "# wrong translation\n", + "translate('trata de averiguarlo.', encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ)" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "RTe5P5ioMJwN", + "colab_type": "text" + }, + "cell_type": "markdown", + "source": [ + "## Next steps\n", + "\n", + "* [Download a different dataset](http://www.manythings.org/anki/) to experiment with translations, for example, English to German, or English to French.\n", + "* Experiment with training on a larger dataset, or using more epochs\n" + ] + } + ] +}
\ No newline at end of file diff --git a/tensorflow/contrib/gan/python/estimator/python/head_impl.py b/tensorflow/contrib/gan/python/estimator/python/head_impl.py index ff903a78cc..5b5557bd8f 100644 --- a/tensorflow/contrib/gan/python/estimator/python/head_impl.py +++ b/tensorflow/contrib/gan/python/estimator/python/head_impl.py @@ -24,6 +24,7 @@ from tensorflow.contrib.gan.python import namedtuples as tfgan_tuples from tensorflow.contrib.gan.python import train as tfgan_train from tensorflow.python.estimator import model_fn as model_fn_lib from tensorflow.python.estimator.canned import head +from tensorflow.python.estimator.export import export_output from tensorflow.python.framework import ops from tensorflow.python.ops import metrics as metrics_lib @@ -182,7 +183,10 @@ class GANHead(head._Head): # pylint: disable=protected-access if mode == model_fn_lib.ModeKeys.PREDICT: return model_fn_lib.EstimatorSpec( mode=model_fn_lib.ModeKeys.PREDICT, - predictions=gan_model.generated_data) + predictions=gan_model.generated_data, + export_outputs={ + 'predict': export_output.PredictOutput(gan_model.generated_data) + }) elif mode == model_fn_lib.ModeKeys.EVAL: gan_loss = self.create_loss( features=None, mode=mode, logits=gan_model, labels=None) diff --git a/tensorflow/contrib/gan/python/estimator/python/head_test.py b/tensorflow/contrib/gan/python/estimator/python/head_test.py index 6587f1fc60..5309d87765 100644 --- a/tensorflow/contrib/gan/python/estimator/python/head_test.py +++ b/tensorflow/contrib/gan/python/estimator/python/head_test.py @@ -26,8 +26,11 @@ from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import variable_scope from tensorflow.python.platform import test +from tensorflow.python.saved_model import signature_constants from tensorflow.python.training import training +_DEFAULT_SERVING_KEY = signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY + def dummy_loss(gan_model, add_summaries=True): # 
pylint:disable=unused-argument return math_ops.reduce_sum(gan_model.discriminator_real_outputs - @@ -71,13 +74,15 @@ class GANHeadTest(test.TestCase): return {} def _test_modes_helper(self, mode): - self.gan_head.create_estimator_spec( + return self.gan_head.create_estimator_spec( features=None, mode=mode, logits=get_gan_model()) def test_modes_predict(self): - self._test_modes_helper(model_fn_lib.ModeKeys.PREDICT) + spec = self._test_modes_helper(model_fn_lib.ModeKeys.PREDICT) + self.assertItemsEqual((_DEFAULT_SERVING_KEY, 'predict'), + spec.export_outputs.keys()) def test_modes_eval(self): self._test_modes_helper(model_fn_lib.ModeKeys.EVAL) diff --git a/tensorflow/contrib/gdr/gdr_server_lib.cc b/tensorflow/contrib/gdr/gdr_server_lib.cc index 1f9dd0decb..9025c992a4 100644 --- a/tensorflow/contrib/gdr/gdr_server_lib.cc +++ b/tensorflow/contrib/gdr/gdr_server_lib.cc @@ -57,7 +57,7 @@ Status GdrServer::Init() { new GdrWorker(env, remote_memory_manager_.get())); }; TF_RETURN_IF_ERROR( - GrpcServer::Init(nullptr, rendezvous_mgr_func, worker_func)); + GrpcServer::Init(nullptr, rendezvous_mgr_func, nullptr, worker_func)); return remote_memory_manager_->Init(); } diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h b/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h index a7b0d805a3..4cfaa0f36d 100644 --- a/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h +++ b/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h @@ -26,7 +26,7 @@ namespace optimized_ops { // Enable for arm64 except for the Nvidia Linux 4 Tegra (L4T) running on // Jetson TX-2. This compiler does not support the offsetof() macro. #if defined(__aarch64__) && !defined(GOOGLE_L4T) - +#include <stddef.h> // clang-format gets confused with this file and ends up formatting lines to // be larger than 80 characters. Turn off here and back on at the end of the // file. 
diff --git a/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.h b/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.h index cbeb53bee7..681448be20 100644 --- a/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.h +++ b/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.h @@ -19,7 +19,9 @@ limitations under the License. #include <string> #include <vector> +// Place `<locale>` before <Python.h> to avoid build failures in macOS. #include <Python.h> +#include <locale> // We forward declare TFLite classes here to avoid exposing them to SWIG. namespace tflite { diff --git a/tensorflow/contrib/opt/BUILD b/tensorflow/contrib/opt/BUILD index 4f35de4e5d..bbdf962d04 100644 --- a/tensorflow/contrib/opt/BUILD +++ b/tensorflow/contrib/opt/BUILD @@ -29,6 +29,7 @@ py_library( "python/training/reg_adagrad_optimizer.py", "python/training/sign_decay.py", "python/training/variable_clipping_optimizer.py", + "python/training/weight_decay_optimizers.py", ], srcs_version = "PY2AND3", deps = [ @@ -198,6 +199,25 @@ py_test( ], ) +py_test( + name = "weight_decay_optimizers_test", + srcs = ["python/training/weight_decay_optimizers_test.py"], + srcs_version = "PY2AND3", + deps = [ + ":opt_py", + "//tensorflow/python:array_ops", + "//tensorflow/python:client_testlib", + "//tensorflow/python:constant_op", + "//tensorflow/python:dtypes", + "//tensorflow/python:framework_ops", + "//tensorflow/python:math_ops", + "//tensorflow/python:resource_variable_ops", + "//tensorflow/python:session", + "//tensorflow/python:variables", + "//third_party/py/numpy", + ], +) + tf_py_test( name = "drop_stale_gradient_optimizer_test", srcs = ["python/training/drop_stale_gradient_optimizer_test.py"], diff --git a/tensorflow/contrib/opt/__init__.py b/tensorflow/contrib/opt/__init__.py index b41148329d..65777b1323 100644 --- a/tensorflow/contrib/opt/__init__.py +++ b/tensorflow/contrib/opt/__init__.py @@ -22,16 +22,17 @@ from __future__ 
import print_function from tensorflow.contrib.opt.python.training.adamax import * from tensorflow.contrib.opt.python.training.addsign import * from tensorflow.contrib.opt.python.training.drop_stale_gradient_optimizer import * +from tensorflow.contrib.opt.python.training.elastic_average_optimizer import * from tensorflow.contrib.opt.python.training.external_optimizer import * +from tensorflow.contrib.opt.python.training.ggt import * from tensorflow.contrib.opt.python.training.lazy_adam_optimizer import * +from tensorflow.contrib.opt.python.training.model_average_optimizer import * from tensorflow.contrib.opt.python.training.moving_average_optimizer import * from tensorflow.contrib.opt.python.training.multitask_optimizer_wrapper import * from tensorflow.contrib.opt.python.training.nadam_optimizer import * from tensorflow.contrib.opt.python.training.powersign import * from tensorflow.contrib.opt.python.training.variable_clipping_optimizer import * -from tensorflow.contrib.opt.python.training.elastic_average_optimizer import * -from tensorflow.contrib.opt.python.training.model_average_optimizer import * -from tensorflow.contrib.opt.python.training.ggt import * +from tensorflow.contrib.opt.python.training.weight_decay_optimizers import * # pylint: enable=wildcard-import from tensorflow.python.util.all_util import remove_undocumented @@ -47,6 +48,10 @@ _allowed_symbols = [ 'LazyAdamOptimizer', 'NadamOptimizer', 'MovingAverageOptimizer', + 'MomentumWOptimizer', + 'AdamWOptimizer', + 'DecoupledWeightDecayExtension', + 'extend_with_decoupled_weight_decay', 'ScipyOptimizerInterface', 'VariableClippingOptimizer', 'MultitaskOptimizerWrapper', diff --git a/tensorflow/contrib/opt/python/training/weight_decay_optimizers.py b/tensorflow/contrib/opt/python/training/weight_decay_optimizers.py new file mode 100644 index 0000000000..b9cf40eb7b --- /dev/null +++ b/tensorflow/contrib/opt/python/training/weight_decay_optimizers.py @@ -0,0 +1,362 @@ +# Copyright 2018 The TensorFlow 
Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Base class to make optimizers weight decay ready.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.python.framework import ops +from tensorflow.python.ops import control_flow_ops +from tensorflow.python.ops import resource_variable_ops +from tensorflow.python.ops import state_ops +from tensorflow.python.training import adam +from tensorflow.python.training import momentum as momentum_opt +from tensorflow.python.training import optimizer +from tensorflow.python.util.tf_export import tf_export + + +class DecoupledWeightDecayExtension(object): + """This class allows to extend optimizers with decoupled weight decay. + + It implements the decoupled weight decay described by Loshchilov & Hutter + (https://arxiv.org/pdf/1711.05101.pdf), in which the weight decay is + decoupled from the optimization steps w.r.t. to the loss function. + For SGD variants, this simplifies hyperparameter search since it decouples + the settings of weight decay and learning rate. + For adaptive gradient algorithms, it regularizes variables with large + gradients more than L2 regularization would, which was shown to yield better + training loss and generalization error in the paper above. 
+ + This class alone is not an optimizer but rather extends existing + optimizers with decoupled weight decay. We explicitly define the two examples + used in the above paper (SGDW and AdamW), but in general this can extend + any OptimizerX by using + `extend_with_decoupled_weight_decay(OptimizerX)`. + In order for it to work, it must be the first class the Optimizer with + weight decay inherits from, e.g. + + ```python + class AdamWOptimizer(DecoupledWeightDecayExtension, adam.AdamOptimizer): + def __init__(self, weight_decay, *args, **kwargs): + super(AdamWOptimizer, self).__init__(weight_decay, *args, **kwargs) + ``` + + Note that this extension decays weights BEFORE applying the update based + on the gradient, i.e. this extension only has the desired behaviour for + optimizers which do not depend on the value of 'var' in the update step! + """ + + def __init__(self, weight_decay, **kwargs): + """Construct the extension class that adds weight decay to an optimizer. + + Args: + weight_decay: A `Tensor` or a floating point value, the factor by which + a variable is decayed in the update step. + **kwargs: Keyword arguments forwarded to the constructor of the + optimizer that is being extended. + """ + self._decay_var_list = None # is set in minimize or apply_gradients + self._weight_decay = weight_decay + # The tensors are initialized in call to _prepare + self._weight_decay_tensor = None + super(DecoupledWeightDecayExtension, self).__init__(**kwargs) + + def minimize(self, loss, global_step=None, var_list=None, + gate_gradients=optimizer.Optimizer.GATE_OP, + aggregation_method=None, colocate_gradients_with_ops=False, + name=None, grad_loss=None, decay_var_list=None): + """Add operations to minimize `loss` by updating `var_list` with decay. + + This function is the same as Optimizer.minimize except that it allows to + specify the variables that should be decayed using decay_var_list. + If decay_var_list is None, all variables in var_list are decayed. 
+ + For more information see the documentation of Optimizer.minimize. + + Args: + loss: A `Tensor` containing the value to minimize. + global_step: Optional `Variable` to increment by one after the + variables have been updated. + var_list: Optional list or tuple of `Variable` objects to update to + minimize `loss`. Defaults to the list of variables collected in + the graph under the key `GraphKeys.TRAINABLE_VARIABLES`. + gate_gradients: How to gate the computation of gradients. Can be + `GATE_NONE`, `GATE_OP`, or `GATE_GRAPH`. + aggregation_method: Specifies the method used to combine gradient terms. + Valid values are defined in the class `AggregationMethod`. + colocate_gradients_with_ops: If True, try colocating gradients with + the corresponding op. + name: Optional name for the returned operation. + grad_loss: Optional. A `Tensor` holding the gradient computed for `loss`. + decay_var_list: Optional list of decay variables. + + Returns: + An Operation that updates the variables in `var_list`. If `global_step` + was not `None`, that operation also increments `global_step`. + + """ + self._decay_var_list = set(decay_var_list) if decay_var_list else False + return super(DecoupledWeightDecayExtension, self).minimize( + loss, global_step=global_step, var_list=var_list, + gate_gradients=gate_gradients, aggregation_method=aggregation_method, + colocate_gradients_with_ops=colocate_gradients_with_ops, name=name, + grad_loss=grad_loss) + + def apply_gradients(self, grads_and_vars, global_step=None, name=None, + decay_var_list=None): + """Apply gradients to variables and decay the variables. + + This function is the same as Optimizer.apply_gradients except that it + allows to specify the variables that should be decayed using + decay_var_list. If decay_var_list is None, all variables in var_list + are decayed. + + For more information see the documentation of Optimizer.apply_gradients. 
+ + Args: + grads_and_vars: List of (gradient, variable) pairs as returned by + `compute_gradients()`. + global_step: Optional `Variable` to increment by one after the + variables have been updated. + name: Optional name for the returned operation. Default to the + name passed to the `Optimizer` constructor. + decay_var_list: Optional list of decay variables. + + Returns: + An `Operation` that applies the specified gradients. If `global_step` + was not None, that operation also increments `global_step`. + """ + self._decay_var_list = set(decay_var_list) if decay_var_list else False + return super(DecoupledWeightDecayExtension, self).apply_gradients( + grads_and_vars, global_step=global_step, name=name) + + def _prepare(self): + weight_decay = self._weight_decay + if callable(weight_decay): + weight_decay = weight_decay() + self._weight_decay_tensor = ops.convert_to_tensor( + weight_decay, name="weight_decay") + # Call the optimizers _prepare function. + super(DecoupledWeightDecayExtension, self)._prepare() + + def _decay_weights_op(self, var): + if not self._decay_var_list or var in self._decay_var_list: + return var.assign_sub(self._weight_decay * var, self._use_locking) + return control_flow_ops.no_op() + + def _decay_weights_sparse_op(self, var, indices, scatter_add): + if not self._decay_var_list or var in self._decay_var_list: + return scatter_add(var, indices, -self._weight_decay * var, + self._use_locking) + return control_flow_ops.no_op() + + # Here, we overwrite the apply functions that the base optimizer calls. + # super().apply_x resolves to the apply_x function of the BaseOptimizer. 
+ def _apply_dense(self, grad, var): + with ops.control_dependencies([self._decay_weights_op(var)]): + return super(DecoupledWeightDecayExtension, self)._apply_dense(grad, var) + + def _resource_apply_dense(self, grad, var): + with ops.control_dependencies([self._decay_weights_op(var)]): + return super(DecoupledWeightDecayExtension, self)._resource_apply_dense( + grad, var) + + def _apply_sparse(self, grad, var): + scatter_add = state_ops.scatter_add + decay_op = self._decay_weights_sparse_op(var, grad.indices, scatter_add) + with ops.control_dependencies([decay_op]): + return super(DecoupledWeightDecayExtension, self)._apply_sparse( + grad, var) + + def _resource_scatter_add(self, x, i, v, _=None): + # last argument allows for one overflow argument, to have the same function + # signature as state_ops.scatter_add + with ops.control_dependencies( + [resource_variable_ops.resource_scatter_add(x.handle, i, v)]): + return x.value() + + def _resource_apply_sparse(self, grad, var, indices): + scatter_add = self._resource_scatter_add + decay_op = self._decay_weights_sparse_op(var, indices, scatter_add) + with ops.control_dependencies([decay_op]): + return super(DecoupledWeightDecayExtension, self)._resource_apply_sparse( + grad, var, indices) + + +def extend_with_decoupled_weight_decay(base_optimizer): + """Factory function returning an optimizer class with decoupled weight decay. + + Returns an optimizer class. An instance of the returned class computes the + update step of `base_optimizer` and additionally decays the weights. + E.g., the class returned by + `extend_with_decoupled_weight_decay(tf.train.AdamOptimizer)` is equivalent to + `tf.contrib.opt.AdamWOptimizer`. + + The API of the new optimizer class slightly differs from the API of the + base optimizer: + - The first argument to the constructor is the weight decay rate. 
+ - `minimize` and `apply_gradients` accept the optional keyword argument + `decay_var_list`, which specifies the variables that should be decayed. + If `None`, all variables that are optimized are decayed. + + Usage example: + ```python + # MyAdamW is a new class + MyAdamW = extend_with_decoupled_weight_decay(tf.train.AdamOptimizer) + # Create a MyAdamW object + optimizer = MyAdamW(weight_decay=0.001, learning_rate=0.001) + sess.run(optimizer.minimize(loss, decay_var_list=[var1, var2])) + ``` + + Note that this extension decays weights BEFORE applying the update based + on the gradient, i.e. this extension only has the desired behaviour for + optimizers which do not depend on the value of 'var' in the update step! + + Args: + base_optimizer: An optimizer class that inherits from tf.train.Optimizer. + + Returns: + A new optimizer class that inherits from DecoupledWeightDecayExtension + and base_optimizer. + """ + + class OptimizerWithDecoupledWeightDecay(DecoupledWeightDecayExtension, + base_optimizer): + """Base_optimizer with decoupled weight decay. + + This class computes the update step of `base_optimizer` and + additionally decays the variable with the weight decay being decoupled from + the optimization steps w.r.t. the loss function, as described by + Loshchilov & Hutter (https://arxiv.org/pdf/1711.05101.pdf). + For SGD variants, this simplifies hyperparameter search since + it decouples the settings of weight decay and learning rate. + For adaptive gradient algorithms, it regularizes variables with large + gradients more than L2 regularization would, which was shown to yield + better training loss and generalization error in the paper above. 
+ """ + + def __init__(self, weight_decay, *args, **kwargs): + # super delegation is necessary here + # pylint: disable=useless-super-delegation + super(OptimizerWithDecoupledWeightDecay, self).__init__( + weight_decay, *args, **kwargs) + # pylint: enable=useless-super-delegation + + return OptimizerWithDecoupledWeightDecay + + +@tf_export("contrib.opt.MomentumWOptimizer") +class MomentumWOptimizer(DecoupledWeightDecayExtension, + momentum_opt.MomentumOptimizer): + """Optimizer that implements the Momentum algorithm with weight_decay. + + This is an implementation of the SGDW optimizer described in "Fixing + Weight Decay Regularization in Adam" by Loshchilov & Hutter + (https://arxiv.org/abs/1711.05101) + ([pdf](https://arxiv.org/pdf/1711.05101.pdf)). + It computes the update step of `train.MomentumOptimizer` and additionally + decays the variable. Note that this is different from adding + L2 regularization on the variables to the loss. Decoupling the weight decay + from other hyperparameters (in particular the learning rate) simplifies + hyperparameter search. + + For further information see the documentation of the Momentum Optimizer. + + Note that this optimizer can also be instantiated as + ```python + extend_with_decoupled_weight_decay(tf.train.MomentumOptimizer)( + weight_decay, learning_rate=learning_rate, momentum=momentum) + ``` + """ + + def __init__(self, weight_decay, learning_rate, momentum, + use_locking=False, name="MomentumW", use_nesterov=False): + """Construct a new MomentumW optimizer. + + For further information see the documentation of the Momentum Optimizer. + + Args: + weight_decay: A `Tensor` or a floating point value. The weight decay. + learning_rate: A `Tensor` or a floating point value. The learning rate. + momentum: A `Tensor` or a floating point value. The momentum. + use_locking: If `True` use locks for update operations. + name: Optional name prefix for the operations created when applying + gradients. Defaults to "MomentumW". + use_nesterov: If `True` use Nesterov Momentum. 
+ See [Sutskever et al., 2013]( + http://jmlr.org/proceedings/papers/v28/sutskever13.pdf). + This implementation always computes gradients at the value of the + variable(s) passed to the optimizer. Using Nesterov Momentum makes the + variable(s) track the values called `theta_t + mu*v_t` in the paper. + + @compatibility(eager) + When eager execution is enabled, learning_rate, weight_decay and momentum + can each be a callable that takes no arguments and returns the actual value + to use. This can be useful for changing these values across different + invocations of optimizer functions. + @end_compatibility + """ + super(MomentumWOptimizer, self).__init__( + weight_decay, learning_rate=learning_rate, momentum=momentum, + use_locking=use_locking, name=name, use_nesterov=use_nesterov) + + +@tf_export("contrib.opt.AdamWOptimizer") +class AdamWOptimizer(DecoupledWeightDecayExtension, adam.AdamOptimizer): + """Optimizer that implements the Adam algorithm with weight decay. + + This is an implementation of the AdamW optimizer described in "Fixing + Weight Decay Regularization in Adam" by Loshchilov & Hutter + (https://arxiv.org/abs/1711.05101) + ([pdf](https://arxiv.org/pdf/1711.05101.pdf)). + + It computes the update step of `train.AdamOptimizer` and additionally decays + the variable. Note that this is different from adding L2 regularization on + the variables to the loss: it regularizes variables with large + gradients more than L2 regularization would, which was shown to yield better + training loss and generalization error in the paper above. + + For further information see the documentation of the Adam Optimizer. + + Note that this optimizer can also be instantiated as + ```python + extend_with_decoupled_weight_decay(tf.train.AdamOptimizer)(weight_decay) + ``` + """ + + def __init__(self, weight_decay, learning_rate=0.001, beta1=0.9, beta2=0.999, + epsilon=1e-8, use_locking=False, name="AdamW"): + """Construct a new AdamW optimizer. 
+ + For further information see the documentation of the Adam Optimizer. + + Args: + weight_decay: A `Tensor` or a floating point value. The weight decay. + learning_rate: A Tensor or a floating point value. The learning rate. + beta1: A float value or a constant float tensor. + The exponential decay rate for the 1st moment estimates. + beta2: A float value or a constant float tensor. + The exponential decay rate for the 2nd moment estimates. + epsilon: A small constant for numerical stability. This epsilon is + "epsilon hat" in the Kingma and Ba paper (in the formula just before + Section 2.1), not the epsilon in Algorithm 1 of the paper. + use_locking: If True use locks for update operations. + name: Optional name for the operations created when applying gradients. + Defaults to "Adam". + """ + super(AdamWOptimizer, self).__init__( + weight_decay, learning_rate=learning_rate, beta1=beta1, beta2=beta2, + epsilon=epsilon, use_locking=use_locking, name=name) diff --git a/tensorflow/contrib/opt/python/training/weight_decay_optimizers_test.py b/tensorflow/contrib/opt/python/training/weight_decay_optimizers_test.py new file mode 100644 index 0000000000..76d8a5697a --- /dev/null +++ b/tensorflow/contrib/opt/python/training/weight_decay_optimizers_test.py @@ -0,0 +1,188 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Tests for optimizers with weight decay.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +from tensorflow.contrib.opt.python.training import weight_decay_optimizers +from tensorflow.python.eager import context +from tensorflow.python.framework import constant_op +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops +from tensorflow.python.framework import test_util +from tensorflow.python.ops import resource_variable_ops +from tensorflow.python.ops import variables +from tensorflow.python.platform import test +from tensorflow.python.training import adam + +WEIGHT_DECAY = 0.01 + + +def adamw_update_numpy(param, g_t, t, m, v, lr=0.001, beta1=0.9, + beta2=0.999, epsilon=1e-8): + lr_t = lr * np.sqrt(1 - beta2**t) / (1 - beta1**t) + + m_t = beta1 * m + (1 - beta1) * g_t + v_t = beta2 * v + (1 - beta2) * g_t * g_t + + param_t = (param - lr_t * m_t / (np.sqrt(v_t) + epsilon) - + (param * WEIGHT_DECAY)) + return param_t, m_t, v_t + + +def momentumw_update_numpy(param, g_t, m, lr=0.001, momentum=0.9, **_): + # v, t are not needed for momentum optimizer + m = momentum * m + g_t + param_t = param - lr * m - param * WEIGHT_DECAY + return param_t, m, None + + +class WeightDecayOptimizerTest(test.TestCase): + + def doTest(self, optimizer, update_fn, optimizer_name, slot_name, + use_resource=False, do_sparse=False): + for i, dtype in enumerate([dtypes.half, dtypes.float32, dtypes.float64]): + with self.test_session(graph=ops.Graph()): + # Initialize variables for numpy implementation. 
+ m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0 + var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype) + grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype) + var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype) + grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype) + + if use_resource: + var0 = resource_variable_ops.ResourceVariable( + var0_np, name="var0_%d" % i) + var1 = resource_variable_ops.ResourceVariable( + var1_np, name="var1_%d" % i) + else: + var0 = variables.Variable(var0_np) + var1 = variables.Variable(var1_np) + + if do_sparse: + grads0_np_indices = np.array([0, 1], dtype=np.int32) + grads0 = ops.IndexedSlices(constant_op.constant(grads0_np), + constant_op.constant(grads0_np_indices), + constant_op.constant([2])) + grads1_np_indices = np.array([0, 1], dtype=np.int32) + grads1 = ops.IndexedSlices(constant_op.constant(grads1_np), + constant_op.constant(grads1_np_indices), + constant_op.constant([2])) + else: + grads0 = constant_op.constant(grads0_np) + grads1 = constant_op.constant(grads1_np) + + opt = optimizer() + update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) + + if not context.executing_eagerly(): + with ops.Graph().as_default(): + # Shouldn't return non-slot variables from other graphs. 
+ self.assertEqual(0, len(opt.variables())) + self.evaluate(variables.global_variables_initializer()) + # Fetch params to validate initial values + self.assertAllClose([1.0, 2.0], self.evaluate(var0)) + self.assertAllClose([3.0, 4.0], self.evaluate(var1)) + + # Run 3 steps of the optimizer + for t in range(1, 4): + if not context.executing_eagerly(): + self.evaluate(update) + elif t > 1: + opt.apply_gradients(zip([grads0, grads1], [var0, var1])) + + var0_np, m0, v0 = update_fn(var0_np, grads0_np, t=t, m=m0, v=v0) + var1_np, m1, v1 = update_fn(var1_np, grads1_np, t=t, m=m1, v=v1) + + # Validate updated params + self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0)) + self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1)) + if use_resource: + self.assertEqual("var0_%d/%s:0" % (i, optimizer_name), + opt.get_slot(var=var0, name=slot_name).name) + + +class AdamWOptimizerTest(WeightDecayOptimizerTest): + + @staticmethod + def get_optimizer(): + return weight_decay_optimizers.AdamWOptimizer(WEIGHT_DECAY) + + def testSparse(self): + self.doTest(self.get_optimizer, adamw_update_numpy, "AdamW", "m", + use_resource=False, do_sparse=True) + + def testResourceSparse(self): + self.doTest(self.get_optimizer, adamw_update_numpy, "AdamW", "m", + use_resource=True, do_sparse=True) + + def testBasic(self): + self.doTest(self.get_optimizer, adamw_update_numpy, "AdamW", "m", + use_resource=False) + + @test_util.run_in_graph_and_eager_modes(reset_test=True) + def testResourceBasic(self): + self.doTest(self.get_optimizer, adamw_update_numpy, "AdamW", "m", + use_resource=True) + + +class MomentumWOptimizerTest(WeightDecayOptimizerTest): + + @staticmethod + def get_optimizer(): + return weight_decay_optimizers.MomentumWOptimizer(WEIGHT_DECAY, 0.001, 0.9) + + def testSparse(self): + self.doTest(self.get_optimizer, momentumw_update_numpy, "MomentumW", + "momentum", use_resource=False, do_sparse=True) + + def testResourceSparse(self): + self.doTest(self.get_optimizer, 
momentumw_update_numpy, "MomentumW", + "momentum", use_resource=True, do_sparse=True) + + def testBasic(self): + self.doTest(self.get_optimizer, momentumw_update_numpy, "MomentumW", + "momentum", use_resource=False) + + @test_util.run_in_graph_and_eager_modes(reset_test=True) + def testResourceBasic(self): + self.doTest(self.get_optimizer, momentumw_update_numpy, "MomentumW", + "momentum", use_resource=True) + + +class ExtendWithWeightDecayTest(WeightDecayOptimizerTest): + + @staticmethod + def get_optimizer(): + adamw = weight_decay_optimizers.extend_with_decoupled_weight_decay( + adam.AdamOptimizer) + return adamw(WEIGHT_DECAY) + + def testBasic(self): + self.doTest(self.get_optimizer, adamw_update_numpy, "Adam", "m", + use_resource=False) + + @test_util.run_in_graph_and_eager_modes(reset_test=True) + def testResourceBasic(self): + self.doTest(self.get_optimizer, adamw_update_numpy, "Adam", "m", + use_resource=True) + + +if __name__ == "__main__": + test.main() diff --git a/tensorflow/contrib/solvers/python/ops/linear_equations.py b/tensorflow/contrib/solvers/python/ops/linear_equations.py index 9305c6a11c..85918bf850 100644 --- a/tensorflow/contrib/solvers/python/ops/linear_equations.py +++ b/tensorflow/contrib/solvers/python/ops/linear_equations.py @@ -28,7 +28,6 @@ from tensorflow.python.ops import array_ops from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import linalg_ops from tensorflow.python.ops import math_ops -from tensorflow.python.ops import linalg_ops def conjugate_gradient(operator, diff --git a/tensorflow/contrib/tensorrt/BUILD b/tensorflow/contrib/tensorrt/BUILD index a5d8b061b6..adda0b758b 100644 --- a/tensorflow/contrib/tensorrt/BUILD +++ b/tensorflow/contrib/tensorrt/BUILD @@ -49,7 +49,6 @@ tf_cuda_cc_test( tf_custom_op_library( name = "python/ops/_trt_engine_op.so", srcs = [ - "ops/trt_calib_op.cc", "ops/trt_engine_op.cc", ], deps = [ @@ -76,11 +75,9 @@ tf_cuda_library( cc_library( name = "trt_engine_op_kernel", 
srcs = [ - "kernels/trt_calib_op.cc", "kernels/trt_engine_op.cc", ], hdrs = [ - "kernels/trt_calib_op.h", "kernels/trt_engine_op.h", ], copts = tf_copts(), @@ -89,20 +86,22 @@ cc_library( ":trt_logging", ":trt_plugins", ":trt_resources", + ":trt_conversion", + ":utils", "//tensorflow/core:gpu_headers_lib", "//tensorflow/core:lib_proto_parsing", "//tensorflow/core:stream_executor_headers_lib", + "//tensorflow/core/grappler/costs:graph_properties", ] + if_tensorrt([ "@local_config_tensorrt//:nv_infer", ]) + tf_custom_op_library_additional_deps(), - # TODO(laigd) + # TODO(laigd): fix this by merging header file in cc file. alwayslink = 1, # buildozer: disable=alwayslink-with-hdrs ) tf_gen_op_libs( op_lib_names = [ "trt_engine_op", - "trt_calib_op", ], ) @@ -122,7 +121,6 @@ tf_gen_op_wrapper_py( name = "trt_engine_op", gen_locally = True, deps = [ - ":trt_calib_op_op_lib", ":trt_engine_op_op_lib", ":trt_logging", ":trt_shape_function", @@ -140,7 +138,6 @@ tf_custom_op_py_library( kernels = [ ":trt_engine_op_kernel", ":trt_engine_op_op_lib", - ":trt_calib_op_op_lib", ":trt_shape_function", ], srcs_version = "PY2AND3", @@ -191,7 +188,6 @@ tf_py_wrap_cc( deps = [ ":trt_conversion", ":trt_engine_op_kernel", - "//tensorflow/core:framework_lite", "//third_party/python_runtime:headers", ], ) @@ -211,6 +207,7 @@ tf_cuda_library( ], deps = [ ":trt_logging", + ":utils", "//tensorflow/core:framework_headers_lib", "//tensorflow/core:framework_lite", "//tensorflow/core:lib_proto_parsing", @@ -237,12 +234,12 @@ tf_cuda_library( ":trt_plugins", ":trt_logging", ":trt_resources", + ":utils", "//tensorflow/core/grappler/clusters:cluster", "//tensorflow/core/grappler/optimizers:custom_graph_optimizer", "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry", "//tensorflow/core/grappler:grappler_item", "//tensorflow/core/grappler:utils", - "//tensorflow/core:framework", "//tensorflow/core:gpu_runtime", "//tensorflow/core:framework_lite", "//tensorflow/core:graph", @@ 
-343,3 +340,8 @@ py_test( "//tensorflow/python:framework_test_lib", ], ) + +cc_library( + name = "utils", + hdrs = ["convert/utils.h"], +) diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph.cc b/tensorflow/contrib/tensorrt/convert/convert_graph.cc index da4dd5a14c..4dc1c551cc 100644 --- a/tensorflow/contrib/tensorrt/convert/convert_graph.cc +++ b/tensorflow/contrib/tensorrt/convert/convert_graph.cc @@ -14,8 +14,8 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/contrib/tensorrt/convert/convert_graph.h" -#include "tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h" +#include <fstream> #include <list> #include <map> #include <set> @@ -24,10 +24,17 @@ limitations under the License. #include <vector> #include "tensorflow/contrib/tensorrt/convert/convert_nodes.h" +#include "tensorflow/contrib/tensorrt/convert/utils.h" +#include "tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h" +#include "tensorflow/contrib/tensorrt/resources/trt_resource_manager.h" +#include "tensorflow/contrib/tensorrt/resources/trt_resources.h" #include "tensorflow/contrib/tensorrt/segment/segment.h" #include "tensorflow/core/common_runtime/gpu/gpu_id.h" #include "tensorflow/core/common_runtime/gpu/gpu_id_manager.h" #include "tensorflow/core/common_runtime/gpu/process_state.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/graph_to_functiondef.h" +#include "tensorflow/core/framework/node_def_builder.h" #include "tensorflow/core/graph/algorithm.h" #include "tensorflow/core/graph/graph.h" #include "tensorflow/core/graph/graph_constructor.h" @@ -39,17 +46,39 @@ limitations under the License. 
#include "tensorflow/core/grappler/utils.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/strings/numbers.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/types.h" +#include "tensorflow/core/protobuf/config.pb.h" // NOLINT #include "tensorflow/core/protobuf/device_properties.pb.h" // NOLINT +#include "tensorflow/core/protobuf/rewriter_config.pb.h" // NOLINT +#include "tensorflow/core/util/device_name_utils.h" #if GOOGLE_CUDA #if GOOGLE_TENSORRT +#include "cuda/include/cuda_runtime_api.h" #include "tensorrt/include/NvInfer.h" - namespace tensorflow { namespace tensorrt { namespace convert { +using ::tensorflow::strings::StrAppend; +using ::tensorflow::strings::StrCat; + +// Returns compiled TRT version information {Maj, Min, Patch} +std::vector<int> GetLinkedTensorRTVersion() { + return {NV_TENSORRT_MAJOR, NV_TENSORRT_MINOR, NV_TENSORRT_PATCH}; +} + +// Returns loaded TRT library version {Maj, Min, Patch} +std::vector<int> GetLoadedTensorRTVersion() { + int ver = getInferLibVersion(); + int ver_major = ver / 1000; + ver = ver - ver_major * 1000; + int ver_minor = ver / 100; + int ver_patch = ver - ver_minor * 100; + return {ver_major, ver_minor, ver_patch}; +} + namespace { bool IsTensorRTCandidate(const tensorflow::Node* node) { @@ -82,229 +111,6 @@ bool IsTensorRTCandidate(const tensorflow::Node* node) { PluginFactoryTensorRT::GetInstance()->IsPlugin(node->type_string())); } -void GetSubGraphIncomingEdges(const tensorflow::Graph& graph, - const std::set<int>& subgraph_node_ids, - tensorflow::EdgeSet* incoming_edges) { - for (int node_id : subgraph_node_ids) { - const tensorflow::Node* node = graph.FindNodeId(node_id); - for (const tensorflow::Edge* edge : node->in_edges()) { - if (!subgraph_node_ids.count(edge->src()->id()) && - !edge->src()->IsSource() && !edge->IsControlEdge()) { - incoming_edges->insert(edge); - VLOG(2) << "INCOMING " << 
edge->src()->name() << " -> " << node->name() - << " Y, "; - } else { - VLOG(2) << "INCOMING " << edge->src()->name() << " -> " << node->name() - << " N, "; - } - } - } -} - -void GetSubGraphOutgoingEdges(const tensorflow::Graph& graph, - const std::set<int>& subgraph_node_ids, - tensorflow::EdgeSet* outgoing_edges) { - for (int node_id : subgraph_node_ids) { - const tensorflow::Node* node = graph.FindNodeId(node_id); - for (const tensorflow::Edge* edge : node->out_edges()) { - if (!subgraph_node_ids.count(edge->dst()->id()) && - !edge->dst()->IsSink() && !edge->IsControlEdge()) { - VLOG(2) << "OUTGOING " << node->name() << " -> " << edge->dst()->name() - << " Y, "; - outgoing_edges->insert(edge); - } else { - VLOG(2) << "OUTGOING " << node->name() << " -> " << edge->dst()->name() - << " N, "; - } - } - } -} - -std::pair<string, int> ParseTensorName(const string& name, - int default_idx = 0) { - string name_no_idx = name; - int idx = default_idx; - const size_t sep = name_no_idx.find_last_of(':'); - if (sep != string::npos) { - name_no_idx = name_no_idx.substr(0, sep); - idx = std::stoi(name.substr(sep + 1)); - } - return std::make_pair(name_no_idx, idx); -} - -std::unordered_map<string, std::vector<int>> BuildTensorNameMap( - const std::vector<string>& tensor_names) { - std::unordered_map<string, std::vector<int>> result; - for (const string& tensor_name : tensor_names) { - string node_name; - int index; - std::tie(node_name, index) = ParseTensorName(tensor_name); - result[node_name].push_back(index); - } - return result; -} - -// TODO(sami): convert references to pointers -struct ConvertGraphParams { - ConvertGraphParams( - tensorflow::Graph& inp_graph, - const std::vector<string>& output_node_names, - const std::set<int>& subgraph_node_id_numbers, - size_t max_supported_batch_size, size_t max_consumed_workspace_size_bytes, - const tensorflow::grappler::GraphProperties& current_graph_properties, - std::unordered_map<string, std::pair<int, string>>* output_edges, 
- int engine_precision_mode, const string& device_name, - std::shared_ptr<nvinfer1::IGpuAllocator> allocator, int cuda_gpu_id) - : graph(inp_graph), - output_names(output_node_names), - subgraph_node_ids(subgraph_node_id_numbers), - max_batch_size(max_supported_batch_size), - max_workspace_size_bytes(max_consumed_workspace_size_bytes), - graph_properties(current_graph_properties), - output_edge_map(output_edges), - precision_mode(engine_precision_mode), - device_name_(device_name), - allocator_(allocator), - cuda_gpu_id_(cuda_gpu_id) {} - tensorflow::Graph& graph; - const std::vector<string>& output_names; - const std::set<int>& subgraph_node_ids; - size_t max_batch_size; - size_t max_workspace_size_bytes; - const tensorflow::grappler::GraphProperties& graph_properties; - std::unordered_map<string, std::pair<int, string>>* output_edge_map; - int precision_mode; - string device_name_; - std::shared_ptr<nvinfer1::IGpuAllocator> allocator_; - int cuda_gpu_id_; - std::vector<std::pair<int, int>> subgraph_inputs; - std::vector<std::pair<int, int>> subgraph_outputs; - tensorflow::EdgeSet subgraph_incoming_edges; - tensorflow::EdgeSet subgraph_outgoing_edges; -}; - -static tensorflow::Status FillSubGraphEdgeSets(ConvertGraphParams* p) { - GetSubGraphIncomingEdges(p->graph, p->subgraph_node_ids, - &p->subgraph_incoming_edges); - - std::set<std::pair<int, int>> unique_tensors; - // Add only unique input source nodes. 
If output of an outside node is shared - // between multiple nodes inside the engine, only one edge should be created - for (const tensorflow::Edge* edge : p->subgraph_incoming_edges) { - unique_tensors.insert({edge->src()->id(), edge->src_output()}); - } - p->subgraph_inputs.insert(p->subgraph_inputs.begin(), unique_tensors.begin(), - unique_tensors.end()); - GetSubGraphOutgoingEdges(p->graph, p->subgraph_node_ids, - &p->subgraph_outgoing_edges); - unique_tensors.clear(); - // Similar to above, if multiple ouside nodes are sharing the output of an - // internal node only one output port should be created and shared between - // outputs - for (const tensorflow::Edge* edge : p->subgraph_outgoing_edges) { - unique_tensors.insert({edge->src()->id(), edge->src_output()}); - } - p->subgraph_outputs.reserve(unique_tensors.size()); - p->subgraph_outputs.insert(p->subgraph_outputs.begin(), - unique_tensors.begin(), unique_tensors.end()); - return tensorflow::Status::OK(); -} - -tensorflow::Status GetCalibNode(ConvertGraphParams* params) { - TF_RETURN_IF_ERROR(FillSubGraphEdgeSets(params)); - tensorflow::NodeDef trt_node_def; - SubGraphParams s(params->graph, params->subgraph_node_ids, - params->subgraph_inputs, params->subgraph_outputs, - params->max_batch_size, params->max_workspace_size_bytes, - params->graph_properties, params->output_edge_map, - &trt_node_def, params->precision_mode, params->device_name_, - params->allocator_, params->cuda_gpu_id_); - TF_RETURN_IF_ERROR(InjectCalibrationNode(s)); - tensorflow::Status status; - tensorflow::Node* trt_node = params->graph.AddNode(trt_node_def, &status); - - TF_RETURN_IF_ERROR(status); - - for (auto in_edge : - params->subgraph_incoming_edges) { // loop over incoming edges and - // attach them to calib node - auto src_output = in_edge->src_output(); - auto dst_node = in_edge->dst(); - auto dst_input = in_edge->dst_input(); - VLOG(1) << " update edge " << trt_node->name() << ":" << src_output - << " -> " << dst_node->name() 
<< ":" << dst_input; - TF_RETURN_IF_ERROR( - params->graph.UpdateEdge(trt_node, src_output, dst_node, dst_input)); - } - return tensorflow::Status::OK(); -} - -tensorflow::Status ConvertSubGraphToTensorRT(ConvertGraphParams* params) { - TF_RETURN_IF_ERROR(FillSubGraphEdgeSets(params)); - tensorflow::NodeDef trt_node_def; - - SubGraphParams s(params->graph, params->subgraph_node_ids, - params->subgraph_inputs, params->subgraph_outputs, - params->max_batch_size, params->max_workspace_size_bytes, - params->graph_properties, params->output_edge_map, - &trt_node_def, params->precision_mode, params->device_name_, - params->allocator_, params->cuda_gpu_id_); - TF_RETURN_IF_ERROR(ConvertSubGraphToTensorRTNodeDef(s)); - tensorflow::Status status; - tensorflow::Node* trt_node = params->graph.AddNode(trt_node_def, &status); - - // AddNode does not wire edges. - // Re-map incoming edges to use the new TRT node instead of the orig subgraph - std::map<std::pair<int, int>, int> subgraph_edge_to_input_map; - for (size_t i = 0; i < params->subgraph_inputs.size(); ++i) { - subgraph_edge_to_input_map.insert({params->subgraph_inputs.at(i), i}); - } - std::set<std::pair<int, int>> unique_tensors; - for (const tensorflow::Edge* edge : params->subgraph_incoming_edges) { - std::pair<int, int> old_src = {edge->src()->id(), edge->src_output()}; - if (unique_tensors.count(old_src)) continue; - unique_tensors.insert(old_src); - int new_src_output = subgraph_edge_to_input_map.at(old_src); - params->graph.AddEdge(edge->src(), edge->src_output(), trt_node, - new_src_output); - VLOG(1) << "Wire " << edge->src()->name() << ":" << edge->src_output() - << " -> " << trt_node->name() << ":" << new_src_output; - params->graph.RemoveEdge(edge); - } - if (VLOG_IS_ON(2)) { - VLOG(2) << "new edge count: " << trt_node->in_edges().size(); - for (const tensorflow::Edge* edge : trt_node->in_edges()) { - VLOG(2) << edge->src()->name() << " port: " << edge->src_output(); - } - } - TF_RETURN_IF_ERROR(status); - - 
// Re-map outgoing edges to use the new TRT node instead of the orig subgraph - std::map<std::pair<int, int>, int> subgraph_edge_to_output_map; - for (size_t i = 0; i < params->subgraph_outputs.size(); ++i) { - subgraph_edge_to_output_map.insert({params->subgraph_outputs.at(i), i}); - } - TF_RETURN_IF_ERROR(status); - for (const tensorflow::Edge* edge : params->subgraph_outgoing_edges) { - std::pair<int, int> old_src = {edge->src()->id(), edge->src_output()}; - int new_src_output = subgraph_edge_to_output_map.at(old_src); - TF_RETURN_IF_ERROR(params->graph.UpdateEdge( - trt_node, new_src_output, edge->dst(), edge->dst_input())); - VLOG(1) << "Wire " << trt_node->name() << ":" << new_src_output << " -> " - << edge->dst()->name() << ":" << edge->dst_input(); - } - // Remove the original subgraph - for (int node_id : params->subgraph_node_ids) { - tensorflow::Node* node = params->graph.FindNodeId(node_id); - // Don't remove the input placeholders - if (node->type_string() == "Placeholder") { - continue; - } - params->graph.RemoveNode(node); - } - return tensorflow::Status::OK(); -} - tensorflow::Status BuildNodeMap( const tensorflow::Graph& graph, std::unordered_map<string, tensorflow::Node*>* node_map) { @@ -318,51 +124,77 @@ tensorflow::Status BuildNodeMap( } } // namespace + +// Function to get calibration from ResourceMgr and put them into nodedef. 
tensorflow::Status ConvertCalibGraphToInferGraph( - const tensorflow::GraphDef& graph_def, tensorflow::GraphDef* infer_graph) { + const tensorflow::GraphDef& graph_def, tensorflow::GraphDef* infer_graph, + bool is_dyn_op) { VLOG(0) << "Starting Calib Conversion"; - tensorflow::Graph graph(tensorflow::OpRegistry::Global()); - TF_RETURN_IF_ERROR(tensorflow::ConvertGraphDefToGraph( - tensorflow::GraphConstructorOptions(), graph_def, &graph)); - // get calib nodes - std::vector<tensorflow::Node*> calib_nodes; - std::vector<tensorflow::Node*> topo_order; - tensorflow::GetPostOrder(graph, &topo_order); - for (auto rit = topo_order.rbegin(); rit != topo_order.rend(); ++rit) { - auto node = *rit; - if (node->type_string() == "TRTCalibOp") { - VLOG(1) << "Found Calib Node " << node->name(); - calib_nodes.push_back(node); - } + infer_graph->CopyFrom(graph_def); + auto trt_rm = TRTResourceManager::instance(); + auto calib_rm = trt_rm->getManager("TRTCalibration"); + int num_nodes = infer_graph->node_size(); + if (!is_dyn_op) { + LOG(WARNING) << "Construction of static int8 engine is not implemented " + "yet!. Dynamic engine will be constructed"; } - VLOG(0) << "Num Calib nodes in graph= " << calib_nodes.size(); - if (calib_nodes.size() == 0) - return tensorflow::errors::FailedPrecondition( - "Graph doesn't contain any calibration nodes!." - " Please generate calibration graph and run calibration first"); - for (auto n : calib_nodes) { - TF_RETURN_IF_ERROR( - tensorrt::convert::ConvertCalibrationNodeToEngineNode(graph, n)); + for (int i = 0; i < num_nodes; ++i) { + auto n = infer_graph->mutable_node(i); + if (n->op() == "TRTEngineOp") { + VLOG(1) << "Processing " << n->name(); + string container_name = n->attr().at("segment_funcdef_name").s(); + TRTCalibrationResource* cres = nullptr; + auto status = calib_rm->Lookup(container_name, "Calibrator", &cres); + if (!status.ok()) { + LOG(ERROR) << "Could not get Calibration information. 
Did you run with " + "calibration data?"; + return tensorflow::errors::FailedPrecondition( + "Need to run graph with calibration data first!"); + } + if (cres->calibrator_) { + cres->calibrator_->setDone(); + cres->thr_->join(); + const auto& calibration_table = + cres->calibrator_->getCalibrationTableAsString(); + if (!calibration_table.size()) { + LOG(ERROR) << "Calibration table is empty"; + return tensorflow::errors::Unknown( + "Calibration table is missing. This shouldn't have happened!"); + } + n->mutable_attr()->at("calibration_data").set_s(calibration_table); + } else { + LOG(ERROR) << "Can't get TRTCalibrator from resource manager!"; + return tensorflow::errors::Unknown( + "Can't get TRTCalibrator from resource manager!"); + } + cres->Unref(); + } } - graph.ToGraphDef(infer_graph); return tensorflow::Status::OK(); } +// Entry function from Python. tensorflow::Status ConvertGraphDefToTensorRT( const tensorflow::GraphDef& graph_def, const std::vector<string>& output_names, size_t max_batch_size, size_t max_workspace_size_bytes, tensorflow::GraphDef* new_graph_def, - int precision_mode = FP32MODE, int minimum_segment_size = 3) { + int precision_mode, int minimum_segment_size, bool is_dyn_op, + int max_cached_engines, std::vector<int> cached_engine_batches) { // optimization pass tensorflow::grappler::GrapplerItem item; item.fetch = output_names; item.graph = graph_def; - + // grappler requires a virtual cluster with a proper GPU device + // in order to calculate flops>0 or fails with FATAL + // We add numbers from a Pascal card here to have flops>0 tensorflow::DeviceProperties device_properties; device_properties.set_type("GPU"); device_properties.mutable_environment()->insert({"architecture", "6"}); - tensorflow::grappler::Cluster* cluster = - new tensorflow::grappler::VirtualCluster({{"/GPU:0", device_properties}}); + device_properties.set_num_cores(3584); + device_properties.set_frequency(1531); + std::unique_ptr<tensorflow::grappler::Cluster> cluster( + 
new tensorflow::grappler::VirtualCluster( + {{"/GPU:0", device_properties}})); // single machine int num_cpu_cores = tensorflow::grappler::GetNumAvailableLogicalCPUCores(); @@ -370,134 +202,633 @@ tensorflow::Status ConvertGraphDefToTensorRT( VLOG(2) << "cpu_cores: " << num_cpu_cores; VLOG(2) << "gpus: " << num_gpus; tensorflow::RewriterConfig rw_cfg; + // use only const folding and layout for the time being since new optimizers + // break the graph for us + rw_cfg.add_optimizers("constfold"); + rw_cfg.add_optimizers("layout"); + rw_cfg.set_meta_optimizer_iterations(tensorflow::RewriterConfig::ONE); tensorflow::grappler::MetaOptimizer meta_opt(nullptr, rw_cfg); tensorflow::GraphDef gdef; - TF_RETURN_IF_ERROR(meta_opt.Optimize(cluster, item, &gdef)); + TF_RETURN_IF_ERROR(meta_opt.Optimize(cluster.get(), item, &gdef)); item.graph = gdef; // AJ refactoring shape inference through grappler/GraphProperties. tensorflow::grappler::GraphProperties static_graph_properties(item); TF_RETURN_IF_ERROR(static_graph_properties.InferStatically(true)); // Build full graph - - return ConvertAfterShapes(gdef, output_names, max_batch_size, - max_workspace_size_bytes, new_graph_def, - precision_mode, minimum_segment_size, - static_graph_properties, nullptr); + ConversionParams cp; + cp.input_graph_def = &gdef; + cp.output_names = &output_names; + cp.max_batch_size = max_batch_size; + cp.output_graph_def = new_graph_def; + cp.precision_mode = precision_mode; + cp.is_dyn_op = is_dyn_op; + cp.max_cached_engines = max_cached_engines; + cp.cached_engine_batches = cached_engine_batches; + cp.minimum_segment_size = minimum_segment_size; + cp.graph_properties = &static_graph_properties; + cp.max_workspace_size_bytes = max_workspace_size_bytes; + if (VLOG_IS_ON(5)) { + std::fstream f; + f.open("TRTConversionInput.pb", + std::fstream::out | std::fstream::binary | std::fstream::trunc); + f << gdef.SerializeAsString(); + f.close(); + } + return ConvertAfterShapes(cp); } -tensorflow::Status 
ConvertAfterShapes( - const tensorflow::GraphDef& gdef, const std::vector<string>& output_names, - size_t max_batch_size, size_t max_workspace_size_bytes, - tensorflow::GraphDef* new_graph_def, int precision_mode, - int minimum_segment_size, +// Function to get subsegment information structure. +tensorflow::Status GetEngineInfo( + const tensorflow::Graph* g, const tensorflow::grappler::GraphProperties& graph_properties, - const tensorflow::grappler::Cluster* cluster) { - // Segment the graph into subgraphs that can be converted to TensorRT - tensorflow::tensorrt::segment::SegmentOptions segment_options; + const std::set<string>& segment_nodes, + const std::unordered_map<string, tensorflow::Node*>& node_map, + const std::vector<tensorflow::Node*>& reverse_topo_order, + EngineInfo* info) { + std::vector<int> subgraph_node_ids; + std::set<string> segment_devices; + int input_port = 0; + int output_port = 0; + + // Map from src_node_name+port to the unique port numbers of the TRT op, where + // the src_node_name is the name of the source node of the input/output + // edge, thus there must not be any duplicates since source nodes of + // input/output edges must be in different split of the graph. + // TODO(aaroey): consider using node id and port instead. 
+ std::unordered_map<string, int> created_edges; + for (auto it = reverse_topo_order.rbegin(); it != reverse_topo_order.rend(); + ++it) { + const auto& node_name = (*it)->name(); + + if (segment_nodes.count(node_name) == 0) continue; + auto node = node_map.at(node_name); + auto node_device = node->requested_device(); + if (!node_device.empty()) { + segment_devices.insert(node_device); + } else { + if (node->has_assigned_device_name()) { + segment_devices.insert(node->assigned_device_name()); + } else { + VLOG(2) << "Node " << node->name() + << " neither have requested device nor assigned device"; + } + } + int node_id = node->id(); + subgraph_node_ids.push_back(node_id); + for (const auto edge : node->in_edges()) { + auto input_node = edge->src(); + if (segment_nodes.count(input_node->name()) == 0) { + // Add constant input node into the segment. We don't care if it has + // other output edges going into other engines or TF nodes. Since we add + // it only to the subsegment node list, not the subsegment itself, it + // won't be removed from the graph. If it doesn't have any edges, TF + // will prune it out. 
+ if (input_node->type_string() == "Const") { + subgraph_node_ids.push_back(input_node->id()); + } else if (!edge->IsControlEdge() && !input_node->IsSource()) { + string s(input_node->name()); + StrAppend(&s, ":", edge->src_output()); + VLOG(1) << "Input edge = " << s; + int port = input_port; + if (created_edges.count(s)) { + port = created_edges.at(s); + } else { + created_edges.insert({s, port}); + input_port++; + } + info->connections.emplace_back(input_node->name(), input_node->id(), + edge->src_output(), node_name, node_id, + edge->dst_input(), true, port); + } + } + } + for (const auto edge : node->out_edges()) { + auto output_node = edge->dst(); + if (segment_nodes.count(output_node->name()) == 0 && + !edge->IsControlEdge() && !output_node->IsSink()) { + string s(node_name); + StrAppend(&s, ":", edge->src_output()); + VLOG(1) << "Output edge = " << s; + int port = output_port; + if (created_edges.count(s)) { + port = created_edges.at(s); + } else { + created_edges.insert({s, port}); + output_port++; + } + info->connections.emplace_back(output_node->name(), output_node->id(), + edge->dst_input(), node_name, node_id, + edge->src_output(), false, port); + } + } + } + + TF_RETURN_IF_ERROR(ConvertSegmentToGraphDef( + g, graph_properties, subgraph_node_ids, &info->connections, + &info->segment_graph_def, &info->engine_name)); + // TODO(sami): This should not happen once segmenter is updated. + if (segment_devices.size() == 1) { + info->device = *segment_devices.begin(); + } else if (segment_devices.size() > 1) { + LOG(WARNING) << "Detected multiple(" << segment_devices.size() + << ") devices for the segment. Picking first one to continue " + << "but this shouldn't have happened"; + info->device = *segment_devices.begin(); + } else { + VLOG(1) << "Segment devices size is 0"; + } + return Status::OK(); +} + +// Function to insert a TRT node into the graph. The graph is not modified if +// the returned status is not ok. 
+// 'alloc' is only used for creating static engine. +tensorflow::Status CreateTRTNode(tensorflow::Graph* graph, + const std::vector<EngineInfo>& infos, int pos, + nvinfer1::IGpuAllocator* alloc, + int max_batch_size) { + const auto& info = infos.at(pos); + std::vector<tensorflow::TensorShapeProto> out_shapes; + std::vector<tensorflow::TensorShapeProto> input_shapes; + std::vector<tensorflow::PartialTensorShape> shapes; + std::vector<tensorflow::NodeDefBuilder::NodeOut> inputs; + std::vector<tensorflow::DataType> out_types; + VLOG(1) << "Processing " << info.engine_name; + + // Update the shape and data types of input/output nodes, and find all unique + // inputs. + for (const auto& conn : info.connections) { + if (!conn.is_input_edge) { + // Set the shapes and data types of output edge. + tensorflow::TensorShapeProto out_shape; + // shape of the output node inside segment + conn.inside_shape.AsProto(&out_shape); + if (out_shapes.size() <= conn.port_number) { + out_shapes.resize(conn.port_number + 1); + out_types.resize(conn.port_number + 1); + } + out_shapes.at(conn.port_number) = out_shape; + out_types.at(conn.port_number) = conn.connection_type; + continue; + } + + // Set the shapes and data types of input edge. + tensorflow::TensorShapeProto in_shape; + conn.outside_shape.AsProto(&in_shape); + if (input_shapes.size() <= conn.port_number) { + input_shapes.resize(conn.port_number + 1); + shapes.resize(conn.port_number + 1); + } + input_shapes.at(conn.port_number) = in_shape; + shapes.at(conn.port_number) = conn.outside_shape; + + string input_node = conn.outside_node_name; + int input_port = conn.outside_port; + bool found_engine = false; + // Rewire the inputs to other engines if they contain original input node. + // Note that we use the information of the engine here, not the information + // of the created TRT nodes, so we're able to find all the connections to + // any other engines beforehand. 
+ for (size_t t = 0; t < infos.size(); ++t) { + if (t == pos) continue; + auto& engine_info = infos.at(t); + for (const auto& eng_conn : engine_info.connections) { + if (eng_conn.is_input_edge) continue; + if (eng_conn.inside_node_name == input_node) { + input_node = engine_info.engine_name; + if (eng_conn.inside_port == input_port) { + input_port = eng_conn.port_number; + found_engine = true; + break; + } + } + } + if (found_engine) break; + } + VLOG(1) << "Engine Input " << input_node << ":" << input_port << " -> " + << info.engine_name << ":" << inputs.size(); + // Skip duplicate inputs. + bool new_input = true; + for (const auto& inp : inputs) { + if (inp.node == input_node && inp.index == input_port) { + new_input = false; + break; + } + } + if (new_input) { + inputs.emplace_back(input_node, input_port, conn.connection_type); + } + } + + // Build the engine and get its serialized representation. + string segment_string; + if (info.engine_type == EngineInfo::EngineType::TRTStatic || + info.precision_mode == INT8MODE) { + // Create static engine for fp32/fp16 mode, and test validity of the engine + // for int8 mode. We don't want engine to fail at the calibration time. + // So we are constructing a FP32 engine here to check its validity, and if + // it is a valid engine then we put the serialized graphdef to the op. + // Otherwise we skip node creation for this engine. + Logger trt_logger; + TrtUniquePtrType<nvinfer1::ICudaEngine> engine; + // TODO(sami): What happens if 1st dim is not batch? + TF_RETURN_IF_ERROR(ConvertGraphDefToEngine( + info.segment_graph_def, + info.precision_mode == INT8MODE ? 
FP32MODE : info.precision_mode, + max_batch_size, info.max_workspace_size_bytes, shapes, &trt_logger, + alloc, /*calibrator=*/nullptr, &engine, + /*convert_successfully=*/nullptr)); + TrtUniquePtrType<nvinfer1::IHostMemory> engine_data(engine->serialize()); + segment_string = + string((const char*)engine_data->data(), engine_data->size()); + if (info.precision_mode == INT8MODE) { + // See above comment about why not putting this inside the 'else' branch. + segment_string = info.segment_graph_def.SerializeAsString(); + } + } else { + segment_string = info.segment_graph_def.SerializeAsString(); + } + + // TODO(aaroey): use enum instead, and add a helper method to do the + // conversion. + string prec_string; + switch (info.precision_mode) { + case FP32MODE: + prec_string = "FP32"; + break; + case FP16MODE: + prec_string = "FP16"; + break; + case INT8MODE: + prec_string = "INT8"; + if (!TRTResourceManager::instance()->getManager("TRTCalibration")) { + LOG(ERROR) << "Failed to construct calibration storage"; + } + break; + default: + return tensorflow::errors::OutOfRange("Unknown precision mode"); + } + tensorflow::NodeDefBuilder node_builder(info.engine_name, "TRTEngineOp"); + if (!info.device.empty()) node_builder.Device(info.device); + if (VLOG_IS_ON(1)) { + string ins = StrCat(info.engine_name, " inputs= "); + for (const auto& ii : inputs) { + StrAppend(&ins, ii.node, ":", ii.index, " "); + } + VLOG(1) << ins; + } + node_builder.Input(inputs); + if (info.engine_type == EngineInfo::EngineType::TRTStatic && + info.cached_engine_batches.size()) { + LOG(WARNING) << "Cached engine batches are ignored for static engines"; + } + tensorflow::NodeDef trt_node; + tensorflow::Status status = + node_builder.Attr("input_shapes", input_shapes) + .Attr("output_shapes", out_shapes) + .Attr("static_engine", + info.engine_type == EngineInfo::EngineType::TRTStatic) + .Attr("segment_funcdef_name", + StrCat(info.engine_name, "_native_segment")) + .Attr("serialized_segment", 
segment_string) + .Attr("calibration_data", "") + .Attr("max_cached_engines_count", info.maximum_cached_engines) + .Attr("cached_engine_batches", {max_batch_size}) + .Attr("workspace_size_bytes", info.max_workspace_size_bytes) + .Attr("precision_mode", prec_string) + .Attr("OutT", out_types) + .Finalize(&trt_node); + if (!status.ok()) { + LOG(ERROR) << "Node construction failed with" << status; + return status; + } + VLOG(1) << "Adding TRTEngine " << info.engine_name << " to graph"; + + // Up until this point, graph is not modified. If we return !status.ok() from + // here, this segment will be skipped + tensorflow::Node* engine_node = graph->AddNode(trt_node, &status); + if (!status.ok()) { + LOG(ERROR) << "Adding node failed " << status; + return status; + } + // Updates the inputs of output edges destination nodes, and point them to the + // engine node. + for (auto& conn : info.connections) { + if (conn.is_input_edge) continue; + VLOG(1) << " Updating DBG " << engine_node->name() << " out_port " + << conn.port_number << " out_id " << conn.outside_id + << " name=" << conn.outside_node_name; + auto dst_node = graph->FindNodeId(conn.outside_id); + // dst_node can only be removed if it is an input node of another engine. + // In this case, other engines input edge is updated in nodedef to point to + // this engine. Even though edge doesn't exists in the graph, when it is + // deserialized again, correct edges will be constructed. This is a problem + // of graph->AddNode(). 
+ if (!dst_node) continue; + VLOG(1) << "Updating " << engine_node->name() << ":" << conn.port_number + << " to " << dst_node->name() << ":" << conn.outside_port; + auto new_edge = graph->AddEdge(engine_node, conn.port_number, dst_node, + conn.outside_port); + CHECK(new_edge) << "Adding a new edge failed " << engine_node->name() << ":" + << conn.port_number << " -> " << dst_node->name() << ":" + << conn.outside_port; + } + return status; +} + +// Function to construct a funcdef from the segment and add it to the graph. +tensorflow::Status RegisterSegmentFunctionToFunctionLibrary( + tensorflow::Graph* graph, const tensorflow::GraphDef& segment, + const string& name) { + tensorflow::Graph sgraph(graph->flib_def()); + tensorflow::GraphConstructorOptions gcopts; + TF_RETURN_IF_ERROR( + tensorflow::ConvertGraphDefToGraph(gcopts, segment, &sgraph)); + std::map<string, tensorflow::Node*> io_nodes; + int num_inputs = 0; + for (auto n : sgraph.op_nodes()) { + if (tensorflow::str_util::StartsWith(n->name(), kInputPHName)) { + num_inputs++; + io_nodes.insert({n->name(), n}); + } else if (tensorflow::str_util::StartsWith(n->name(), kOutputPHName)) { + io_nodes.insert({n->name(), n}); + } + } + + for (int i = 0; i < num_inputs; ++i) { + auto name = StrCat(kInputPHName, i); + auto node = io_nodes[name]; + tensorflow::NodeDef nd; + tensorflow::NodeDefBuilder node_builder( + StrCat(name, "_Arg"), tensorflow::FunctionLibraryDefinition::kArgOp); + VLOG(1) << "Adding " << StrCat(name, "_Arg"); + TF_RETURN_IF_ERROR(node_builder.Attr("T", node->output_type(0)) + .Attr("index", i) + .Finalize(&nd)); + tensorflow::Status s; + auto node_arg = sgraph.AddNode(nd, &s); + if (!s.ok()) { + LOG(ERROR) << "Couldn't add _Arg node for " << name; + } + for (auto edge : node->out_edges()) { + sgraph.AddEdge(node_arg, 0, edge->dst(), edge->dst_input()); + VLOG(1) << "Updating funcdef input " << node_arg->name() << ":" << 0 + << " - > " << edge->dst()->name() << ":" << edge->dst_input(); + if 
(!s.ok()) { + LOG(ERROR) << "Failed to update edge from " << node_arg->name() + << " to " << edge->dst()->name() << ":" << edge->dst_input(); + } + } + sgraph.RemoveNode(node); + } + + for (int i = 0; i < io_nodes.size() - num_inputs; ++i) { + auto name = StrCat(kOutputPHName, i); + auto node = io_nodes[name]; + tensorflow::NodeDef nd; + tensorflow::NodeDefBuilder node_builder( + StrCat(name, "_Ret"), tensorflow::FunctionLibraryDefinition::kRetOp); + auto edge = *(node->in_edges().begin()); + tensorflow::NodeDefBuilder::NodeOut nout( + edge->src()->name(), edge->src_output(), + edge->src()->output_type(edge->src_output())); + VLOG(1) << " input " << nout.node << ":" << nout.index + << " dtype=" << tensorflow::DataTypeString(nout.data_type); + node_builder.Input({nout}); + TF_RETURN_IF_ERROR(node_builder.Attr("T", node->output_type(0)) + .Attr("index", i) + .Finalize(&nd)); + if (VLOG_IS_ON(3)) { + VLOG(3) << nd.DebugString(); + } + tensorflow::Status s; + auto node_ret = sgraph.AddNode(nd, &s); + if (!s.ok()) { + LOG(ERROR) << "Couldn't add _Ret node for " << name; + } + VLOG(1) << "Update edge from " << edge->src()->name() << ":" + << edge->src_output() << " - > " << node_ret->name() << ":" << 0; + sgraph.AddEdge(edge->src(), edge->src_output(), node_ret, 0); + s = sgraph.UpdateEdge(edge->src(), edge->src_output(), node_ret, 0); + if (!s.ok()) { + LOG(ERROR) << "Failed to update edge from " << edge->src()->name() << ":" + << edge->src_output() << " - > " << node_ret->name() << ":" + << 0; + } + sgraph.RemoveNode(node); + } + tensorflow::FunctionDefLibrary fdeflib; + auto native_segment = fdeflib.add_function(); + TF_RETURN_IF_ERROR(tensorflow::GraphToFunctionDef( + sgraph, StrCat(name, "_native_segment"), native_segment)); + if (VLOG_IS_ON(7)) { + VLOG(7) << name << " Function_Def "; + VLOG(7) << native_segment->DebugString(); + } + VLOG(1) << "Adding funcdef to graphlib"; + TF_RETURN_IF_ERROR(graph->AddFunctionLibrary(fdeflib)); + return tensorflow::Status::OK(); 
+} + +std::pair<int, tensorflow::Allocator*> GetDeviceAndAllocator( + ConversionParams& params, EngineInfo& engine) { + int cuda_device_id = -1; + auto check_device_id = [](int tfid) -> int { + tensorflow::TfGpuId tf_gpu_id(tfid); + CudaGpuId cuda_gpu_id; + Status s = GpuIdManager::TfToCudaGpuId(tf_gpu_id, &cuda_gpu_id); + if (s.ok()) { + VLOG(1) << "Found TF GPU " << tf_gpu_id.value() << " at cuda device " + << cuda_gpu_id.value(); + return cuda_gpu_id.value(); + } + VLOG(2) << "TF GPU with id " << tfid << " do not exist " << s; + return -1; + }; + tensorflow::Allocator* dev_allocator = nullptr; + // we need to us PM here since in python path there is no way to get + // to allocators. + // TODO(sami): when grappler devices become available else path will not be + // necessary + auto pm = tensorflow::ProcessState::singleton(); + if (params.cluster) { // get allocator + tensorflow::Device* device = nullptr; + if (params.cluster->GetDeviceSet()) { + device = params.cluster->GetDeviceSet()->FindDeviceByName(engine.device); + } + if (device) { + tensorflow::AllocatorAttributes alloc_attr; + dev_allocator = device->GetAllocator(alloc_attr); + VLOG(1) << "Using allocator " << dev_allocator->Name(); + } else { + LOG(WARNING) << "Cluster is set but device '" << engine.device + << "' is not found in the cluster"; + } + } else { // cluster not found, possibly a python call + VLOG(1) << "Cluster is not set, probably called from python"; + int found_device = 0; + bool try_gpu_ids = true; + // if device is set, try to find the device. Might be a problem for multi + // host case but TensorRT do not support multi host setups yet. + if (!engine.device.empty()) { + DeviceNameUtils::ParsedName parsed_name; + if (DeviceNameUtils::ParseFullName(engine.device, &parsed_name)) { + cuda_device_id = parsed_name.has_id ? 
parsed_name.id : -1; + } + try_gpu_ids = !parsed_name.has_id; + } + if (try_gpu_ids) { + while (found_device < 100) { + cuda_device_id = check_device_id(found_device); + if (cuda_device_id >= 0) break; + found_device++; + } + } + if (found_device == 100) { + LOG(ERROR) << " Can't find a GPU device to work with. Please " + "instantiate a session to initialize devices"; + return std::make_pair(cuda_device_id, dev_allocator); + } + LOG(WARNING) + << "Can't determine the device, constructing an allocator at device " + << found_device; + tensorflow::GPUOptions gpuoptions; + // this will be a noop if device is already initialized + gpuoptions.set_allow_growth(true); + tensorflow::TfGpuId tf_gpu_id(found_device); + dev_allocator = pm->GetGPUAllocator(gpuoptions, tf_gpu_id, 1); + } + return std::make_pair(cuda_device_id, dev_allocator); +} + +// Entry function from optimization pass. +tensorflow::Status ConvertAfterShapes(ConversionParams& params) { + // Convert graphdef to graph. tensorflow::FunctionLibraryDefinition flib(tensorflow::OpRegistry::Global(), - gdef.library()); + params.input_graph_def->library()); tensorflow::Graph graph(flib); TF_RETURN_IF_ERROR(tensorflow::ConvertGraphDefToGraph( - tensorflow::GraphConstructorOptions(), gdef, &graph)); + tensorflow::GraphConstructorOptions(), *params.input_graph_def, &graph)); + // Segment the graph into subgraphs that can be converted to TensorRT + tensorflow::tensorrt::segment::SegmentOptions segment_options; // TODO(ben,jie,sami): exclude output nodes (DISCUSS IT) - for (auto node : output_names) { + for (auto node : *(params.output_names)) { segment_options.exclude_node_list.insert(node); } - - // TODO(sami): this should be passed as a knob!!!! 
- segment_options.minimum_segment_size = minimum_segment_size; - tensorflow::tensorrt::segment::SegmentNodesVector segments; + segment_options.minimum_segment_size = params.minimum_segment_size; + tensorflow::tensorrt::segment::SegmentNodesVector initial_segments; TF_RETURN_IF_ERROR(tensorrt::segment::SegmentGraph( - &graph, IsTensorRTCandidate, segment_options, &segments)); - if (segments.size() > 1) { - VLOG(0) << "MULTIPLE tensorrt candidate conversion: " << segments.size(); + &graph, IsTensorRTCandidate, segment_options, &initial_segments)); + if (initial_segments.size() > 1) { + VLOG(0) << "MULTIPLE tensorrt candidate conversion: " + << initial_segments.size(); } + + // Get the EngineInfo for each segment. std::unordered_map<string, tensorflow::Node*> node_map; TF_RETURN_IF_ERROR(BuildNodeMap(graph, &node_map)); - std::unordered_map<string, std::pair<int, string>> output_edge_map; - int count = 0; float total_num_nodes_in_segments = 0.; - for (auto s : segments) { - total_num_nodes_in_segments += s.first.size(); - } - // We create the map here since cluster may not be available in all cases. - std::map<string, tensorflow::Device*> name_to_device_map; - if (cluster) { - // TODO(aaroey): consider using DeviceSet::FindDeviceByName(), as in a - // distributed environment, devices from different workers can have same - // short name. 
- for (const auto dm : cluster->GetDeviceSet()->devices()) { - name_to_device_map[dm->name()] = dm; + std::vector<EngineInfo> engine_segments; + engine_segments.reserve(initial_segments.size()); + std::vector<tensorflow::Node*> reverse_topo_order; + tensorflow::GetPostOrder(graph, &reverse_topo_order); + size_t total_engine_bytes_size = 0; + std::vector<size_t> engine_bytes_size; + tensorflow::tensorrt::segment::SegmentNodesVector converted_segments; + converted_segments.reserve(initial_segments.size()); + for (size_t t = 0; t < initial_segments.size(); t++) { + auto& curr_segment = initial_segments.at(t); + EngineInfo curr_engine; + Status status = + GetEngineInfo(&graph, *params.graph_properties, curr_segment.first, + node_map, reverse_topo_order, &curr_engine); + if (!status.ok()) { + LOG(WARNING) << "Failed to get engine info for segment " << t << ": " + << status; + continue; } - } - for (const auto& segment_nodes_and_device : segments) { - const std::set<string>& subgraph_node_names = - segment_nodes_and_device.first; - std::set<int> subgraph_node_ids; - size_t max_mem_per_engine = - max_workspace_size_bytes * - ((float)subgraph_node_names.size() / total_num_nodes_in_segments); - std::stringstream oss; - for (const string& node_name : subgraph_node_names) { - oss << " " << node_name; - subgraph_node_ids.insert(node_map.at(node_name)->id()); + curr_engine.precision_mode = params.precision_mode; + curr_engine.engine_type = + (params.is_dyn_op || params.precision_mode == INT8MODE + ? 
EngineInfo::EngineType::TRTDynamic + : EngineInfo::EngineType::TRTStatic); + curr_engine.cached_engine_batches = params.cached_engine_batches; + curr_engine.maximum_cached_engines = params.max_cached_engines; + StrAppend(&curr_engine.engine_name, "my_trt_op_", t); + status = RegisterSegmentFunctionToFunctionLibrary( + &graph, curr_engine.segment_graph_def, curr_engine.engine_name); + if (!status.ok()) { + LOG(WARNING) << "Failed to register segment graphdef as a function " << t + << ": " << status; + continue; } - VLOG(1) << "Subgraph nodes at device " << segment_nodes_and_device.second - << " : " << oss.str(); - auto target_device = - name_to_device_map.find(segment_nodes_and_device.second); - std::shared_ptr<nvinfer1::IGpuAllocator> allocator(0); + engine_bytes_size.push_back(curr_engine.segment_graph_def.ByteSizeLong()); + total_engine_bytes_size += engine_bytes_size.back(); + total_num_nodes_in_segments += curr_segment.first.size(); + engine_segments.push_back(std::move(curr_engine)); + converted_segments.push_back(std::move(curr_segment)); + + if (VLOG_IS_ON(8)) { + string fname = curr_engine.engine_name; + StrAppend(&fname, ".pb"); + std::fstream f; + f.open(fname.c_str(), std::fstream::out | std::fstream::binary); + f << engine_segments.at(t).segment_graph_def.SerializeAsString(); + f.close(); + } + } + + // Create a TRT node for each segment using its EngineInfo. 
+ int old_cuda_device = 0; + auto err = cudaGetDevice(&old_cuda_device); + if (err != cudaSuccess) { + LOG(ERROR) << "Couldn't get current device: " << cudaGetErrorString(err); + } + VLOG(1) << "Current cuda device is " << old_cuda_device; + for (int i = 0; i < engine_segments.size(); ++i) { + auto& engine = engine_segments.at(i); + // Partition the workspace size by the average of node ratio and segment + // graphdef size + engine.max_workspace_size_bytes = + params.max_workspace_size_bytes * + (engine_bytes_size.at(i) / total_engine_bytes_size + + converted_segments.at(i).first.size() / total_num_nodes_in_segments) / + 2.0; + // The allocator is used to build the engine. The build and the built engine + // will be destroyed after we get the serialized engine string, so it's fine + // to use unique_ptr here. + std::unique_ptr<nvinfer1::IGpuAllocator> alloc; + auto device_alloc = GetDeviceAndAllocator(params, engine); int cuda_device_id = 0; - if (target_device != name_to_device_map.end()) { - tensorflow::TfGpuId tf_gpu_id(target_device->second->parsed_name().id); - CudaGpuId cuda_gpu_id; - Status s = GpuIdManager::TfToCudaGpuId(tf_gpu_id, &cuda_gpu_id); - if (!s.ok()) { - LOG(ERROR) - << "Cuda device identification failed, using device 0. 
Error= " - << s; - } else { - cuda_device_id = cuda_gpu_id.value(); - } - tensorflow::GPUOptions gpuoptions; - // we need to us PM here since in python path there is no way to get to - // allocators - auto pm = tensorflow::ProcessState::singleton(); - // this should be instantiated by now - auto dev_allocator = pm->GetGPUAllocator(gpuoptions, tf_gpu_id, 1); - VLOG(1) << "Got an allocator for device tf_device=" << tf_gpu_id.value() - << " cuda device= " << cuda_device_id << " at " << dev_allocator; - allocator = std::make_shared<TRTDeviceAllocator>(dev_allocator); - } else { // device unknown or not available - allocator = std::make_shared<TRTCudaAllocator>(); + if (device_alloc.first >= 0) { + cuda_device_id = device_alloc.first; + alloc.reset(new TRTDeviceAllocator(device_alloc.second)); + } else { + // Setting allocator as nullptr should get revert to the cudamalloc + LOG(WARNING) << "Can't identify the cuda device. Running on device 0 "; } - ConvertGraphParams p(graph, output_names, subgraph_node_ids, max_batch_size, - max_mem_per_engine, graph_properties, &output_edge_map, - precision_mode, segment_nodes_and_device.second, - allocator, cuda_device_id); - if (precision_mode == INT8MODE) { - tensorflow::Status status = GetCalibNode(&p); - if (status != tensorflow::Status::OK()) { - LOG(WARNING) << "subgraph conversion error for subgraph_index:" << count - << " due to: \"" << status.ToString() - << "\" SKIPPING......( " << subgraph_node_names.size() - << " nodes)"; + cudaSetDevice(cuda_device_id); + auto status = CreateTRTNode(&graph, engine_segments, i, alloc.get(), + params.max_batch_size); + // If status is ok, we successfully added the node to the graph and can + // remove segment ops. Otherwise graph is not modified. 
+ if (status.ok()) { + for (auto node_name : converted_segments.at(i).first) { + graph.RemoveNode(node_map.at(node_name)); } } else { - tensorflow::Status status = ConvertSubGraphToTensorRT(&p); - if (status != tensorflow::Status::OK()) { - LOG(WARNING) << "subgraph conversion error for subgraph_index:" << count - << " due to: \"" << status.ToString() - << "\" SKIPPING......( " << subgraph_node_names.size() - << " nodes)"; - } + // Graph is not modified. + LOG(WARNING) << "Engine creation for segment " << i << ", composed of " + << converted_segments.at(i).first.size() + << " nodes failed: " << status << ". Skipping..."; } - count++; } - graph.ToGraphDef(new_graph_def); + cudaSetDevice(old_cuda_device); + graph.ToGraphDef(params.output_graph_def); + VLOG(1) << "Returning from conversion"; return tensorflow::Status::OK(); } diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph.h b/tensorflow/contrib/tensorrt/convert/convert_graph.h index 65a67d7e73..9d986e4890 100644 --- a/tensorflow/contrib/tensorrt/convert/convert_graph.h +++ b/tensorflow/contrib/tensorrt/convert/convert_graph.h @@ -30,29 +30,60 @@ namespace tensorflow { namespace tensorrt { namespace convert { -// This method converts an already generated calibration graph which was used in -// calibration runs to an inference graph +struct ConversionParams { + ConversionParams() + : input_graph_def(nullptr), + max_batch_size(1), + max_workspace_size_bytes(1 << 30), + output_graph_def(nullptr), + precision_mode(1), + minimum_segment_size(3), + graph_properties(nullptr), + cluster(nullptr), + is_dyn_op(false), + fixed_input_size(true), + max_cached_engines(1) {} + const tensorflow::GraphDef* input_graph_def; + const std::vector<string>* output_names; + size_t max_batch_size; + size_t max_workspace_size_bytes; + tensorflow::GraphDef* output_graph_def; + int precision_mode; + int minimum_segment_size; + const tensorflow::grappler::GraphProperties* graph_properties; + const tensorflow::grappler::Cluster* 
cluster; + bool is_dyn_op; // Whether to create engine on conversion or execution time + bool fixed_input_size; // Assume non-batch ranks of input tensors are fixed + int max_cached_engines; // maximum number of cached engines + std::vector<int> cached_engine_batches; // list of cached engines +}; + +// This method extracts calibration information from the resource managers +// and puts them in to engine nodedefs. tensorflow::Status ConvertCalibGraphToInferGraph( - const tensorflow::GraphDef& graph_def, tensorflow::GraphDef* new_graph_def); + const tensorflow::GraphDef& graph_def, tensorflow::GraphDef* new_graph_def, + bool is_dyn_op); -// max_batch_size: maximum batch size which can be used for inference for -// optimization targets inference run with max batch size. -// max_workspace_size_bytes: The upper bound of memory allowance for -// engine building. +// - max_batch_size: maximum batch size which can be used for inference for +// optimization targets inference run with max batch size. +// - max_workspace_size_bytes: The upper bound of memory allowance for engine +// building. 
tensorflow::Status ConvertGraphDefToTensorRT( const tensorflow::GraphDef& graph_def, const std::vector<string>& output_names, size_t max_batch_size, size_t max_workspace_size_bytes, tensorflow::GraphDef* new_graph_def, - int precision_mode, int minimum_segment_size); + int precision_mode = 1, int minimum_segment_size = 3, + bool is_dyn_op = false, int max_cached_engines = 1, + std::vector<int> cached_engine_batches = {}); // Method to call from optimization pass -tensorflow::Status ConvertAfterShapes( - const tensorflow::GraphDef& graph, const std::vector<string>& output_names, - size_t max_batch_size, size_t max_workspace_size_bytes, - tensorflow::GraphDef* new_graph_def, int precision_mode, - int minimum_segment_size, - const tensorflow::grappler::GraphProperties& graph_properties, - const tensorflow::grappler::Cluster* cluster); +tensorflow::Status ConvertAfterShapes(ConversionParams& params); + +// Return compile time TensorRT library version information. +std::vector<int> GetLinkedTensorRTVersion(); + +// Return runtime time TensorRT library version information. +std::vector<int> GetLoadedTensorRTVersion(); } // namespace convert } // namespace tensorrt } // namespace tensorflow diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc index 4e4d295538..146b9c7344 100644 --- a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc +++ b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc @@ -14,7 +14,6 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/contrib/tensorrt/convert/convert_nodes.h" -#include "tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h" #include <algorithm> #include <list> @@ -25,7 +24,9 @@ limitations under the License. 
#include <utility> #include <vector> +#include "tensorflow/contrib/tensorrt/convert/utils.h" #include "tensorflow/contrib/tensorrt/log/trt_logger.h" +#include "tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h" #include "tensorflow/contrib/tensorrt/resources/trt_resource_manager.h" #include "tensorflow/contrib/tensorrt/resources/trt_resources.h" #include "tensorflow/core/framework/node_def.pb.h" // NOLINT @@ -37,6 +38,7 @@ limitations under the License. #include "tensorflow/core/graph/graph_constructor.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/strings/numbers.h" #include "tensorflow/core/lib/strings/str_util.h" #include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/platform/logging.h" @@ -54,8 +56,11 @@ limitations under the License. namespace tensorflow { namespace tensorrt { namespace convert { +using ::tensorflow::str_util::Split; + using ::tensorflow::strings::StrAppend; using ::tensorflow::strings::StrCat; + namespace { inline tensorflow::Status ConvertDType(tensorflow::DataType tf_dtype, @@ -121,12 +126,10 @@ static std::vector<std::pair<int, int>> CreateSamePadding( string GetCommonNameScope(const string& op_name_a, const string& op_name_b) { size_t last_scope_separator = 0; - for (size_t i = 0; i < std::min(op_name_a.size(), op_name_b.size()); ++i) { - if (op_name_a[i] != op_name_b[i]) { - break; - } else if (op_name_a[i] == '/') { - last_scope_separator = i + 1; - } + const size_t min_size = std::min(op_name_a.size(), op_name_b.size()); + for (size_t i = 0; i < min_size; ++i) { + if (op_name_a[i] != op_name_b[i]) break; + if (op_name_a[i] == '/') last_scope_separator = i + 1; } return op_name_a.substr(0, last_scope_separator); } @@ -417,20 +420,6 @@ void ReorderRSCKToKCRS(const TRT_ShapedWeights& iweights, } } -struct InferDeleter { - template <typename T> - void operator()(T* obj) const { - if (obj) { - obj->destroy(); - } - } -}; - -template 
<typename T> -inline std::shared_ptr<T> infer_object(T* obj) { - return std::shared_ptr<T>(obj, InferDeleter()); -} - class Converter; using OpConverter = @@ -444,7 +433,7 @@ class Converter { OpConverter plugin_converter_; nvinfer1::INetworkDefinition* trt_network_; std::list<std::vector<uint8_t>> temp_bufs_; - tensorflow::tensorrt::TRTWeightStore* weight_store_; + TRTWeightStore* weight_store_; bool fp16_; void register_op_converters(); tensorflow::Status get_inputs(const tensorflow::NodeDef& node_def, @@ -486,11 +475,11 @@ class Converter { public: explicit Converter(nvinfer1::INetworkDefinition* trt_network, - tensorflow::tensorrt::TRTWeightStore* ws, bool fp16) + TRTWeightStore* ws, bool fp16) : trt_network_(trt_network), weight_store_(ws), fp16_(fp16) { this->register_op_converters(); } - tensorflow::tensorrt::TRTWeightStore* weight_store() { return weight_store_; } + TRTWeightStore* weight_store() { return weight_store_; } TRT_ShapedWeights get_temp_weights(tensorflow::DataType type, nvinfer1::Dims shape) { TRT_ShapedWeights weights(type, nullptr, shape); @@ -2140,559 +2129,265 @@ void Converter::register_op_converters() { } // namespace -tensorflow::Status ConvertCalibrationNodeToEngineNode( - tensorflow::Graph& graph, tensorflow::Node* c_node) { - const auto ndef = c_node->def(); - - TFAttrs attrs(ndef); - std::vector<string> segment_nodes( - attrs.get<std::vector<string>>("segment_nodes")); - std::vector<string> output_nodes( - attrs.get<std::vector<string>>("segment_output_names")); - std::vector<string> input_names( - attrs.get<std::vector<string>>("input_names")); - string res_name = attrs.get<string>("resource_name"); - VLOG(1) << "Node name " << c_node->name() << " res_name " << res_name; - string engine_name = "my_trt_op"; - { - const auto node_id = tensorflow::str_util::Split(res_name, "_"); - engine_name += node_id.back(); - } - std::map<string, tensorflow::Node*> node_maps; - - for (auto n : graph.op_nodes()) { - node_maps.insert({n->name(), n}); 
- } - std::set<int> subgraph_ids; - for (const auto internal_node : segment_nodes) { - subgraph_ids.insert(node_maps.at(internal_node)->id()); - } - if (VLOG_IS_ON(2)) { - string node_names = StrCat(c_node->name(), " segment nodes= "); - - for (const auto& node_name : segment_nodes) { - StrAppend(&node_names, node_name, ", "); - } - VLOG(2) << node_names; +tensorflow::Status ConvertGraphDefToEngine( + const tensorflow::GraphDef& gdef, int precision_mode, int max_batch_size, + size_t max_workspace_size_bytes, + const std::vector<tensorflow::PartialTensorShape>& input_shapes, + Logger* logger, nvinfer1::IGpuAllocator* allocator, + TRTInt8Calibrator* calibrator, + TrtUniquePtrType<nvinfer1::ICudaEngine>* engine, + bool* convert_successfully) { + engine->reset(); + if (convert_successfully) *convert_successfully = false; + + // Create the builder. + TrtUniquePtrType<nvinfer1::IBuilder> builder( + nvinfer1::createInferBuilder(*logger)); + builder->setMaxBatchSize(max_batch_size); + // TODO(aaroey): use the allocator to allocate the TRT workspace. + builder->setMaxWorkspaceSize(max_workspace_size_bytes); +#if NV_TENSORRT_MAJOR > 3 + builder->setGpuAllocator(allocator); +#endif + if (precision_mode == FP16MODE) { + builder->setHalf2Mode(true); + } else if (precision_mode == INT8MODE) { + builder->setInt8Mode(true); + builder->setInt8Calibrator(calibrator); } - VLOG(1) << "Output Nodes:"; - std::vector<tensorflow::DataType> out_types; - std::vector<const tensorflow::Edge*> out_edges; + // Create the network. 
+ auto trt_network = + TrtUniquePtrType<nvinfer1::INetworkDefinition>(builder->createNetwork()); + if (!trt_network) { + return tensorflow::errors::Internal( + "Failed to create TensorRT network object"); + } + auto ws = std::unique_ptr<TRTWeightStore>(new TRTWeightStore()); - for (auto& i : output_nodes) { - auto node_port = tensorflow::str_util::Split(i, ":"); - VLOG(1) << " " << i << " in graph " << node_maps.count(i); - auto out_node_name = node_port.at(0); - if (node_port.size() > 1) { - VLOG(1) << "Multi port output" << node_port.at(0) << " " - << node_port.at(1) << " size=" << node_port.size(); - } - auto node_it = node_maps.find(out_node_name); - if (node_it != node_maps.end()) { - tensorflow::Node* out_node = node_it->second; - int port = 0; - if (node_port.size() == 2) { - port = std::strtoul(node_port.at(1).c_str(), nullptr, 10); - out_types.push_back(out_node->output_type(port)); - } else { - out_types.push_back(out_node->output_type(0)); + // Build the network + VLOG(1) << "Starting engine conversion "; + Converter converter(trt_network.get(), ws.get(), precision_mode == FP16MODE); + std::vector<std::pair<string, string>> output_tensors; + // Graph nodes are already topologically sorted during construction + for (const auto& node_def : gdef.node()) { + string node_name = node_def.name(); + VLOG(1) << "Converting op name=" << node_name << ", op=" << node_def.op(); + if (tensorflow::str_util::StartsWith(node_name, kInputPHName) && + (node_def.op() == "Placeholder")) { + nvinfer1::DimsCHW input_dim_pseudo_chw; + for (int i = 0; i < 8; i++) input_dim_pseudo_chw.d[i] = 0; + nvinfer1::DataType dtype(nvinfer1::DataType::kFLOAT); + auto type_status = + ConvertDType(node_def.attr().at("dtype").type(), &dtype); + if (type_status != tensorflow::Status::OK()) { + LOG(WARNING) << "Type conversion failed for " << node_name; + return type_status; } - for (auto out_edge : out_node->out_edges()) { - if (subgraph_ids.count(out_edge->dst()->id())) - continue; // skip 
internal edges; - if (out_edge->src_output() == port) { - out_edges.push_back(out_edge); - VLOG(1) << "OUTPUT EDGE " << out_edge->src()->name() << ":" - << out_edge->src_output() << " -> " << out_edge->dst()->name() - << ":" << out_edge->dst_input(); + int32 slot_number = -1; + if (!tensorflow::strings::safe_strto32(node_name.c_str() + 8, + &slot_number)) { + LOG(ERROR) << "Failed to parse slot number from " << node_name + << " +8= " << node_name.c_str() + 8; + } + auto shape = input_shapes.at(slot_number); + if (shape.dims() > 8) { + LOG(ERROR) << "Tensor rank is greater than 8 for " << node_name + << " at input slot " << slot_number; + return tensorflow::errors::OutOfRange( + "Input tensor rank is greater than 8"); + } + if (VLOG_IS_ON(1)) { + string dim_str("dims="); + StrAppend(&dim_str, "[ ", shape.dim_size(0)); + for (int i = 1; i < shape.dims(); i++) { + StrAppend(&dim_str, ", ", shape.dim_size(i)); } + StrAppend(&dim_str, " ]"); + VLOG(1) << dim_str; + } + for (int i = 1; i < shape.dims(); i++) { + input_dim_pseudo_chw.d[i - 1] = shape.dim_size(i); } - } else { - LOG(WARNING) << " couldn't find output node " << out_node_name; - } - } - if (VLOG_IS_ON(1)) { - VLOG(1) << c_node->name() << " Input Nodes:"; - for (auto& i : input_names) { - VLOG(1) << " Input " << i << " in graph " << node_maps.count(i); - } - } - auto trt_rm = tensorflow::tensorrt::TRTResourceManager::instance(); - auto resmgr = trt_rm->getManager("TRTCalibOps"); - tensorflow::tensorrt::TRTCalibrationResource* calib_res = nullptr; - auto status = resmgr->Lookup(res_name, res_name, &calib_res); - if (!status.ok() || !calib_res->calibrator_) { - return tensorflow::errors::FailedPrecondition( - "You must run calibration" - " and inference conversion in the same process"); - } - - calib_res->calibrator_->setDone(); - calib_res->thr_->join(); - delete calib_res->thr_; - if (!calib_res->engine_) { - LOG(ERROR) << "Calibration failed!, engine does not exist. 
Did you run " - "calibration graph?"; - return tensorflow::errors::FailedPrecondition( - "Calibration graph needs to be executed on" - " calibration data before convertsion to inference graph"); - } - auto weight_rmgr = trt_rm->getManager("WeightStore"); - TF_CHECK_OK(weight_rmgr->Delete<tensorflow::tensorrt::TRTWeightStore>( - res_name, res_name)); - auto engine_plan = calib_res->engine_->serialize(); - calib_res->engine_->destroy(); - calib_res->network_->destroy(); - calib_res->builder_->destroy(); - calib_res->thr_ = nullptr; - calib_res->engine_ = nullptr; - calib_res->builder_ = nullptr; - tensorflow::NodeDefBuilder op_builder(engine_name, "TRTEngineOp"); - std::vector<tensorflow::NodeDefBuilder::NodeOut> income_edges; - income_edges.resize(c_node->num_inputs()); - for (const auto in_edge : c_node->in_edges()) { - auto src = in_edge->src(); - int dest_port = in_edge->dst_input(); - VLOG(1) << "Incoming connection " << src->name() << ":" - << in_edge->src_output() << " -> " << c_node->name() << ":" - << dest_port; - income_edges.at(dest_port) = {src->name(), in_edge->src_output(), - c_node->input_type(dest_port)}; - } - tensorflow::gtl::ArraySlice<tensorflow::NodeDefBuilder::NodeOut> input_list( - income_edges); - if (VLOG_IS_ON(2)) { - for (const auto& inp : input_list) { - VLOG(2) << " Input from inputlist " << inp.node << ":" << inp.index << " " - << tensorflow::DataTypeString(inp.data_type); - } - } - op_builder.Input(input_list); - tensorflow::NodeDef engine_node; - const char* engine_plan_data = static_cast<const char*>(engine_plan->data()); - string engine_plan_string(engine_plan_data, - engine_plan_data + engine_plan->size()); - status = op_builder.Attr("serialized_engine", engine_plan_string) - .Attr("input_nodes", input_names) - .Attr("output_nodes", output_nodes) - .Attr("OutT", out_types) - .Finalize(&engine_node); - if (!status.ok()) { - LOG(ERROR) << "Engine Node creation failed"; - return status; - } - auto trt_engine_node = 
graph.AddNode(engine_node, &status); - TF_RETURN_IF_ERROR(status); - std::map<string, int> port_map; - for (size_t t = 0; t < output_nodes.size(); t++) { - port_map.insert({output_nodes.at(t), t}); - } - for (auto& i : out_edges) { - string s(i->src()->name()); - if (i->src_output()) StrAppend(&s, ":", i->src_output()); - int out_port = port_map.at(s); - VLOG(1) << "Connecting " << trt_engine_node->name() << ":" << out_port - << " -> " << i->dst()->name() << ":" << i->dst_input(); - TF_RETURN_IF_ERROR( - graph.UpdateEdge(trt_engine_node, out_port, i->dst(), i->dst_input())); - } - for (const auto ed : trt_engine_node->in_edges()) { - VLOG(1) << "In Edge " << ed->src()->name() << ":" << ed->src_output() - << " -> " << ed->dst()->name() << ":" << ed->dst_input(); - } - for (const auto ed : trt_engine_node->out_edges()) { - VLOG(1) << "Out Edge " << ed->src()->name() << ":" << ed->src_output() - << " -> " << ed->dst()->name() << ":" << ed->dst_input(); - } - VLOG(1) << "Segment nodes:"; - for (auto& i : segment_nodes) { - VLOG(1) << " " << i << " in graph " << node_maps.count(i); - auto it = node_maps.find(i); - if (it != node_maps.end()) { - graph.RemoveNode(it->second); - } - } - graph.RemoveNode(c_node); - return tensorflow::Status::OK(); -} -tensorflow::Status ReverseTopologicalSort( - const tensorrt::convert::SubGraphParams& s, - std::list<tensorflow::Node*>* order) { - std::vector<tensorflow::Node*> order_vec; - tensorflow::GetPostOrder(s.graph, &order_vec); - // Select just the subgraph - for (tensorflow::Node* node : order_vec) { - if (s.subgraph_node_ids.count(node->id())) { - // We want topological order to contstruct the - // network layer by layer - order->push_front(node); + input_dim_pseudo_chw.nbDims = shape.dims() - 1; + nvinfer1::ITensor* input_tensor = converter.network()->addInput( + node_name.c_str(), dtype, input_dim_pseudo_chw); + if (!input_tensor) { + return tensorflow::errors::InvalidArgument( + "Failed to create Input layer tensor ", 
node_name, + " rank=", shape.dims() - 1); + } + VLOG(1) << "Input tensor name :" << node_name; + if (!converter.insert_input_tensor(node_name, input_tensor)) { + return tensorflow::errors::AlreadyExists( + "Output tensor already exists for op: " + node_name); + } + } else if (tensorflow::str_util::StartsWith(node_name, kOutputPHName) && + (node_def.op() == "Identity")) { + int32 slot_number = -1; + if (!tensorflow::strings::safe_strto32(node_name.c_str() + 9, + &slot_number)) { + LOG(ERROR) << "Failed to parse slot number from " << node_name + << " +9=" << node_name.c_str() + 9; + } + if (output_tensors.size() <= slot_number) { + output_tensors.resize(slot_number + 1); + } + output_tensors.at(slot_number) = {node_def.input(0), node_name}; + } else { + VLOG(2) << "Converting node: " << node_def.name() << " , " + << node_def.op(); + TF_RETURN_IF_ERROR(converter.convert_node(node_def)); } } - return tensorflow::Status::OK(); -} - -tensorflow::Status SetInputList( - const tensorrt::convert::SubGraphParams& s, - tensorflow::NodeDefBuilder* op_builder, - const std::vector<string>* input_names, - std::vector<tensorflow::DataType>* input_dtypes) { - std::vector<tensorflow::NodeDefBuilder::NodeOut> income_edges; - VLOG(2) << "input edge size: " << input_names->size(); - for (size_t i = 0; i < input_names->size(); ++i) { - VLOG(2) << "input edges: " << i << " " << input_names->at(i); - int output_idx = s.input_inds.at(i).second; - // we wired up the input here already, it is redundant to do it again in - // ConvertSubGraphToTensorRT(convert_graph.cc) - auto incoming_edge = tensorflow::NodeDefBuilder::NodeOut( - input_names->at(i), output_idx, input_dtypes->at(i)); - income_edges.push_back(incoming_edge); - } - tensorflow::gtl::ArraySlice<tensorflow::NodeDefBuilder::NodeOut> input_list( - income_edges); - op_builder->Input(input_list); - return tensorflow::Status::OK(); -} - -string SubgraphNameScopeGenerator(const std::list<tensorflow::Node*>* order) { - string 
subgraph_name_scope; - if (!order->empty()) { - subgraph_name_scope = order->front()->name(); - } - for (const tensorflow::Node* node : *order) { - subgraph_name_scope = GetCommonNameScope(subgraph_name_scope, node->name()); - } - // TODO(sami,ben,jie): proper naming! - return subgraph_name_scope; -} - -tensorflow::Status ConvertSubgraph( - Converter& converter, tensorrt::convert::SubGraphParams& s, - std::list<tensorflow::Node*>* order, std::vector<string>* input_names, - std::vector<tensorflow::DataType>* input_dtypes, - std::vector<string>* output_names, - std::vector<tensorflow::DataType>* output_dtypes, - const string& engine_name) { - std::set<string> added_tensors; - for (const std::pair<int, int>& input : s.input_inds) { - VLOG(2) << "parsing input. Node id= " << input.first; - int node_id = input.first; - int output_idx = input.second; - tensorflow::Node* node = s.graph.FindNodeId(node_id); - auto node_name = node->name(); - // input_names should use the node name in the graph - // here it should be the input tensor name -> matching the binding - // insert original node name without port - auto tensor_name = node_name; - if (output_idx != 0) { - tensor_name = StrCat(tensor_name, ":", output_idx); - } - - VLOG(2) << "input name: " << node_name << " tensor_name: " << tensor_name - << " idx: " << output_idx; - - auto shape_inference_node_name = node_name; - auto shape_inference_output_idx = output_idx; - // rewire the shape inference to original node in the graph - if (s.output_edge_map->count(tensor_name)) { - shape_inference_node_name = s.output_edge_map->at(tensor_name).second; - shape_inference_output_idx = s.output_edge_map->at(tensor_name).first; - } - if (shape_inference_output_idx < 0) continue; - VLOG(2) << "shapeinference name: " << shape_inference_node_name - << " idx: " << shape_inference_output_idx; - - if (!s.graph_properties.HasOutputProperties(shape_inference_node_name)) - return tensorflow::errors::Internal("failed to find input node: " + - 
shape_inference_node_name); - - auto op_info_vec = - s.graph_properties.GetOutputProperties(shape_inference_node_name); - if (static_cast<int>(op_info_vec.size()) <= shape_inference_output_idx) - return tensorflow::errors::Internal( - "accessing output index of: ", shape_inference_output_idx, - ", at node: ", shape_inference_node_name, - " with output entry from shape_map: ", op_info_vec.size()); - - auto op_info = op_info_vec.at(shape_inference_output_idx); - tensorflow::DataType tf_dtype = op_info.dtype(); - - nvinfer1::DataType dtype(nvinfer1::DataType::kFLOAT); - auto type_status = ConvertDType(tf_dtype, &dtype); - if (type_status != tensorflow::Status::OK()) { - LOG(WARNING) << "Type conversion failed for " << node_name; - return type_status; - } - - VLOG(2) << "Accessing output index of: " << output_idx - << ", at node: " << node_name - << " with output entry from shape_map: " << op_info_vec.size(); - // TODO(ben,jie): update TRT input format/dimension - nvinfer1::DimsCHW input_dim_pseudo_chw; - for (int i = 0; i < 3; i++) input_dim_pseudo_chw.d[i] = 1; - - // TODO(jie): TRT 3.x only support 4 dimensional input tensor. - // update the code once TRT 4.0 comes out. - if (op_info.shape().dim_size() != 4) { - string err_str = "Require 4 dimensional input."; - StrAppend(&err_str, " Got ", op_info.shape().dim_size(), " ", - shape_inference_node_name); - return tensorflow::errors::Unimplemented(err_str); - } - - for (int i = 1; i < op_info.shape().dim_size(); i++) { - VLOG(2) << "dimension: " << i - << " , size: " << op_info.shape().dim(i).size(); - input_dim_pseudo_chw.d[i - 1] = op_info.shape().dim(i).size(); - } - - // TODO(ben,jie): proper way to restore input tensor name? 
- auto input_tensor_name = node_name; - if (output_idx != 0) { - input_tensor_name = StrCat(node_name, ":", output_idx); - } - if (added_tensors.count(input_tensor_name)) continue; - added_tensors.insert(input_tensor_name); - input_names->push_back(input_tensor_name); - input_dtypes->push_back(tf_dtype); - nvinfer1::ITensor* input_tensor = converter.network()->addInput( - input_tensor_name.c_str(), dtype, input_dim_pseudo_chw); - - if (!input_tensor) - return tensorflow::errors::InvalidArgument( - "Failed to create Input layer"); - VLOG(2) << "Input tensor name :" << input_tensor_name; - - if (!converter.insert_input_tensor(input_tensor_name, input_tensor)) - return tensorflow::errors::AlreadyExists( - "Output tensor already exists for op: " + input_tensor_name); - } - - for (const tensorflow::Node* node : *order) { - const tensorflow::NodeDef& node_def = node->def(); - VLOG(2) << "Converting node: " << node_def.name() << " , " << node_def.op(); - TF_RETURN_IF_ERROR(converter.convert_node(node_def)); - } - - VLOG(2) << "Finished conversion"; - - // Gather output metadata - int trt_engine_op_output_idx = 0; - added_tensors.clear(); - for (const std::pair<int, int>& output : s.output_inds) { - int node_id = output.first; - int output_idx = output.second; - tensorflow::Node* node = s.graph.FindNodeId(node_id); - string op_name = node->name(); - string tensor_name = op_name; - - s.output_edge_map->insert( - {trt_engine_op_output_idx == 0 - ? 
engine_name - : StrCat(engine_name, ":", trt_engine_op_output_idx), - {output_idx, tensor_name}}); - trt_engine_op_output_idx++; - if (output_idx != 0) - tensorflow::strings::StrAppend(&tensor_name, ":", output_idx); - VLOG(2) << "Output tensor name: " << tensor_name; - if (added_tensors.count(tensor_name)) continue; - added_tensors.insert(tensor_name); - output_names->push_back(tensor_name); - auto tensor_or_weights = converter.get_tensor(tensor_name); + for (const auto& output : output_tensors) { + auto tensor_or_weights = converter.get_tensor(output.first); if (!tensor_or_weights.is_tensor()) { - return tensorflow::errors::InvalidArgument("Output node '" + tensor_name + - "' is weights not tensor"); + return tensorflow::errors::InvalidArgument( + "Output node '" + output.first + "' is weights not tensor"); } nvinfer1::ITensor* tensor = tensor_or_weights.tensor(); + tensor->setName(output.second.c_str()); if (!tensor) { return tensorflow::errors::NotFound("Output tensor not found: " + - tensor_name); + output.first); } + VLOG(1) << "Marking output tensor " << output.first << ", as output tensor " + << output.second; + converter.network()->markOutput(*tensor); - tensorflow::DataType tf_dtype = node->output_type(output_idx); - output_dtypes->push_back(tf_dtype); - nvinfer1::DataType trt_dtype = nvinfer1::DataType::kFLOAT; - TF_RETURN_IF_ERROR(ConvertDType(tf_dtype, &trt_dtype)); - tensor->setType(trt_dtype); } + if (convert_successfully) *convert_successfully = true; - return tensorflow::Status::OK(); -} - -tensorflow::Status InjectCalibrationNode(tensorrt::convert::SubGraphParams& s) { - // Visit nodes in reverse topological order and construct the TRT network. - // Toposort - std::list<tensorflow::Node*> order; - TF_RETURN_IF_ERROR(ReverseTopologicalSort(s, &order)); - - static int static_id = 0; - string subgraph_name_scope = SubgraphNameScopeGenerator(&order); - // TODO(sami,ben,jie): proper naming! 
- string calib_op_name = - StrCat(subgraph_name_scope, "my_trt_calib_op_", static_id); - string engine_name = StrCat(subgraph_name_scope, "my_trt_op", static_id); - static_id++; - - auto trt_rmgr = tensorflow::tensorrt::TRTResourceManager::instance(); - auto op_rmgr = trt_rmgr->getManager("TRTCalibOps"); - auto op_res = new tensorflow::tensorrt::TRTCalibrationResource(); - TF_CHECK_OK(op_rmgr->Create(calib_op_name, calib_op_name, op_res)); - op_res->logger_ = new tensorflow::tensorrt::Logger(); - cudaSetDevice(s.cuda_gpu_id_); - op_res->builder_ = nvinfer1::createInferBuilder(*(op_res->logger_)); - op_res->allocator_ = s.allocator_; -#if NV_TENSORRT_MAJOR > 3 - op_res->builder_->setGpuAllocator(s.allocator_.get()); -#endif - if (!op_res->builder_) { - return tensorflow::errors::Internal( - "failed to create TensorRT builder object"); + // Build the engine. + VLOG(1) << "Starting engine creation"; + engine->reset(builder->buildCudaEngine(*converter.network())); + if (engine->get() == nullptr) { + return tensorflow::errors::Internal("Failed to build TensorRT engine"); } - - op_res->network_ = op_res->builder_->createNetwork(); - if (!op_res->network_) { - return tensorflow::errors::Internal( - "failed to create TensorRT network object"); - } - - // Build the network - auto weight_rmgr = trt_rmgr->getManager("WeightStore"); - auto ws = new tensorflow::tensorrt::TRTWeightStore(); - TF_CHECK_OK(weight_rmgr->Create(calib_op_name, calib_op_name, ws)); - Converter converter(op_res->network_, ws, s.precision_mode == FP16MODE); - - std::vector<string> input_names; - std::vector<tensorflow::DataType> input_dtypes; - std::vector<string> output_names; - std::vector<tensorflow::DataType> output_dtypes; - TF_RETURN_IF_ERROR(ConvertSubgraph(converter, s, &order, &input_names, - &input_dtypes, &output_names, - &output_dtypes, engine_name)); - - VLOG(2) << "Finished processing outputs"; - - // Build the engine - op_res->builder_->setMaxBatchSize(s.max_batch_size); - 
op_res->builder_->setMaxWorkspaceSize(s.max_workspace_size_bytes); - VLOG(0) << "Max batch size= " << s.max_batch_size - << " max workspace size= " << s.max_workspace_size_bytes; - - // Build the TRT op - // TODO(sami,ben,jie): proper naming! - tensorflow::NodeDefBuilder op_builder(calib_op_name, "TRTCalibOp"); - TF_RETURN_IF_ERROR(SetInputList(s, &op_builder, &input_names, &input_dtypes)); - - std::vector<string> segment_names; - segment_names.reserve(s.subgraph_node_ids.size()); - for (int i : s.subgraph_node_ids) { - auto node = s.graph.FindNodeId(i); - segment_names.push_back(node->name()); - } - LOG(INFO) << "finished op preparation"; - - auto status = op_builder.Attr("segment_nodes", segment_names) - .Attr("input_names", input_names) - .Attr("segment_output_names", output_names) - .Attr("resource_name", calib_op_name) - .Finalize(s.trt_node); - - LOG(INFO) << status.ToString(); - LOG(INFO) << "finished op building"; - + VLOG(1) << "Finished conversion"; return tensorflow::Status::OK(); } -tensorflow::Status ConvertSubGraphToTensorRTNodeDef( - tensorrt::convert::SubGraphParams& s) { - // Visit nodes in reverse topological order and construct the TRT network. 
- std::list<tensorflow::Node*> order; - TF_RETURN_IF_ERROR(ReverseTopologicalSort(s, &order)); - - static int static_id = 0; - string subgraph_name_scope = SubgraphNameScopeGenerator(&order); - string engine_name = StrCat(subgraph_name_scope, "my_trt_op", static_id++); - - tensorflow::tensorrt::Logger trt_logger; - cudaSetDevice(s.cuda_gpu_id_); - auto trt_builder = infer_object(nvinfer1::createInferBuilder(trt_logger)); - if (!trt_builder) { - return tensorflow::errors::Internal( - "Failed to create TensorRT builder object"); - } -#if NV_TENSORRT_MAJOR > 3 - trt_builder->setGpuAllocator(s.allocator_.get()); -#endif - auto trt_network = infer_object(trt_builder->createNetwork()); - if (!trt_network) { - return tensorflow::errors::Internal( - "Failed to create TensorRT network object"); - } - - auto trt_rmgr = tensorflow::tensorrt::TRTResourceManager::instance(); - auto weight_rmgr = trt_rmgr->getManager("WeightStore"); - auto ws = new tensorflow::tensorrt::TRTWeightStore(); - TF_CHECK_OK(weight_rmgr->Create(engine_name, engine_name, ws)); - - // Build the network - Converter converter(trt_network.get(), ws, s.precision_mode == FP16MODE); - - std::vector<string> input_names; - std::vector<tensorflow::DataType> input_dtypes; - std::vector<string> output_names; - std::vector<tensorflow::DataType> output_dtypes; - TF_RETURN_IF_ERROR(ConvertSubgraph(converter, s, &order, &input_names, - &input_dtypes, &output_names, - &output_dtypes, engine_name)); - - VLOG(2) << "Finished output"; - - // Build the engine - trt_builder->setMaxBatchSize(s.max_batch_size); - trt_builder->setMaxWorkspaceSize(s.max_workspace_size_bytes); - VLOG(0) << "Max batch size= " << s.max_batch_size - << " max workspace size= " << s.max_workspace_size_bytes; - if (s.precision_mode == FP16MODE) { - trt_builder->setHalf2Mode(true); - VLOG(0) << "Using FP16 precision mode"; - } - LOG(INFO) << "starting build engine"; - string engine_plan_string; - { - auto trt_engine = - 
infer_object(trt_builder->buildCudaEngine(*converter.network())); - VLOG(0) << "Built network"; - if (trt_engine.get() == nullptr) { - return tensorflow::errors::Internal("Engine building failure"); +tensorflow::Status ConvertSegmentToGraphDef( + const tensorflow::Graph* graph, + const tensorflow::grappler::GraphProperties& graph_properties, + const std::vector<int>& subgraph_node_ids, // In topological order + std::vector<EngineConnection>* connections, + tensorflow::GraphDef* segment_def, string* common_scope) { + std::set<string> marker_nodes; + // Update connection shapes/data types and add corresponding input/output + // nodes in the segment graphdef. + for (size_t i = 0; i < connections->size(); ++i) { + auto& connection = connections->at(i); + auto outside_node = graph->FindNodeId(connection.outside_id); + if (!outside_node) { + // This should never happen, unless the original graph is problematic. + return tensorflow::errors::NotFound( + "Cannot find node with id ", connection.outside_id, " in the graph."); + } + // Updates the shape and data types of input/output connections. 
+ tensorflow::DataType input_type = tensorflow::DT_FLOAT; + tensorflow::PartialTensorShape partial_shape; + if (connection.is_input_edge) { + if (graph_properties.HasOutputProperties(connection.outside_node_name)) { + auto output_params = + graph_properties.GetOutputProperties(connection.outside_node_name); + auto out_shape = output_params.at(connection.outside_port); + input_type = out_shape.dtype(); + std::vector<tensorflow::int64> dims; + partial_shape = out_shape.shape(); + connection.outside_shape = partial_shape; + } else { + VLOG(0) << "Unknown output shape" << outside_node->name(); + input_type = graph->FindNodeId(connection.outside_id) + ->output_type(connection.outside_port); + } + connection.connection_type = input_type; + + } else { // output edge + if (graph_properties.HasInputProperties(connection.outside_node_name)) { + auto input_params = + graph_properties.GetInputProperties(connection.outside_node_name); + auto in_shape = input_params.at(connection.outside_port); + input_type = in_shape.dtype(); + partial_shape = in_shape.shape(); + connection.inside_shape = partial_shape; + } else { + input_type = graph->FindNodeId(connection.inside_id) + ->output_type(connection.outside_port); + } + connection.connection_type = input_type; } - auto engine_plan = infer_object(trt_engine->serialize()); - VLOG(0) << "Serialized engine"; - const char* engine_plan_data = - static_cast<const char*>(engine_plan->data()); - engine_plan_string = - string(engine_plan_data, engine_plan_data + engine_plan->size()); - } - TF_RETURN_IF_ERROR(weight_rmgr->Delete<tensorflow::tensorrt::TRTWeightStore>( - engine_name, engine_name)); - LOG(INFO) << "finished engine " << engine_name << " containing " - << s.subgraph_node_ids.size() << " nodes"; - - // Build the TRT op - tensorflow::NodeDefBuilder op_builder(engine_name, "TRTEngineOp"); - TF_RETURN_IF_ERROR(SetInputList(s, &op_builder, &input_names, &input_dtypes)); - - VLOG(0) << "Finished op preparation"; - - auto status = 
op_builder.Attr("serialized_engine", engine_plan_string) - .Attr("input_nodes", input_names) - .Attr("output_nodes", output_names) - .Attr("OutT", output_dtypes) - .Device(s.device_name_) - .Finalize(s.trt_node); - - VLOG(0) << status.ToString() << " finished op building for " << engine_name - << " on device " << s.device_name_; + // Add dummy input/output nodes to the segment graphdef. + if (connection.is_input_edge) { + const string node_name = StrCat(kInputPHName, connection.port_number); + if (marker_nodes.count(node_name)) { + VLOG(1) << "Reusing input " << node_name << " for the edge " + << connection.outside_node_name << ":" + << connection.outside_port << " -> " + << connection.inside_node_name << ":" << connection.inside_port; + continue; + } + marker_nodes.insert(node_name); + auto seg_node = segment_def->add_node(); + tensorflow::NodeDefBuilder builder(node_name, "Placeholder"); + auto status = builder.Attr("shape", partial_shape) + .Attr("dtype", input_type) + .Finalize(seg_node); + VLOG(1) << "Constructing input " << node_name << " for the edge " + << connection.outside_node_name << ":" << connection.outside_port + << " -> " << connection.inside_node_name << ":" + << connection.inside_port; + } else { + const string node_name = StrCat(kOutputPHName, connection.port_number); + if (marker_nodes.count(node_name)) { + VLOG(1) << "Reusing output " << node_name << " for the edge " + << connection.inside_node_name << ":" << connection.inside_port + << " -> " << connection.outside_node_name << ":" + << connection.outside_port; + continue; + } + marker_nodes.insert(node_name); + auto seg_node = segment_def->add_node(); + tensorflow::NodeDefBuilder builder(node_name, "Identity"); + auto status = builder.Input(connection.inside_node_name, 0, input_type) + .Finalize(seg_node); + VLOG(1) << "Constructing output " << node_name << " for the edge " + << connection.inside_node_name << ":" << connection.inside_port + << " -> " << connection.outside_node_name << ":" + << 
connection.outside_port; + } + } // for each connection. + + std::unordered_map<int, int> old_to_new_id_map; + // Copy internal nodes to new graphdef + string local_scope = graph->FindNodeId(*subgraph_node_ids.begin())->name(); + for (const auto node_id : subgraph_node_ids) { + const auto node = graph->FindNodeId(node_id); + local_scope = GetCommonNameScope(local_scope, node->name()); + old_to_new_id_map[node_id] = segment_def->node_size(); + auto snode = segment_def->add_node(); + snode->CopyFrom(node->def()); + VLOG(1) << "Copying " << snode->name() << " to subgraph"; + } + // Update the inputs of the new input nodes to point to placeholder nodes. + for (int i = 0; i < connections->size(); ++i) { + auto& connection = connections->at(i); + if (!connection.is_input_edge) continue; + auto snode = + segment_def->mutable_node(old_to_new_id_map[connection.inside_id]); + const string placeholder_name = + StrCat(kInputPHName, connection.port_number); + VLOG(1) << "Updating " << snode->name() << ":" << connection.inside_port + << " from " << snode->input(connection.inside_port) << " to " + << placeholder_name; + snode->set_input(connection.inside_port, placeholder_name); + } + *common_scope = local_scope; + VLOG(0) << "Segment @scope '" << local_scope << "', converted to graph"; return tensorflow::Status::OK(); } diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.h b/tensorflow/contrib/tensorrt/convert/convert_nodes.h index 3f6592cd25..1a4c0e755d 100644 --- a/tensorflow/contrib/tensorrt/convert/convert_nodes.h +++ b/tensorflow/contrib/tensorrt/convert/convert_nodes.h @@ -22,69 +22,112 @@ limitations under the License. 
#include <utility> #include <vector> +#include "tensorflow/contrib/tensorrt/convert/utils.h" #include "tensorflow/contrib/tensorrt/resources/trt_allocator.h" +#include "tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.h" #include "tensorflow/core/framework/graph.pb.h" #include "tensorflow/core/graph/graph.h" #include "tensorflow/core/grappler/costs/graph_properties.h" #include "tensorflow/core/lib/core/status.h" + #if GOOGLE_CUDA #if GOOGLE_TENSORRT namespace tensorflow { namespace tensorrt { +static const char* kInputPHName = "InputPH_"; +static const char* kOutputPHName = "OutputPH_"; namespace convert { +// TODO(aaroey): use an enum instead. const int FP32MODE = 0; const int FP16MODE = 1; const int INT8MODE = 2; -struct SubGraphParams { - SubGraphParams( - tensorflow::Graph& inp_graph, - const std::set<int>& subgraph_node_id_numbers, - const std::vector<std::pair<int, int>>& input_indices, - const std::vector<std::pair<int, int>>& output_indices, - size_t max_supported_batch_size, size_t max_consumed_workspace_size_bytes, - const tensorflow::grappler::GraphProperties& current_graph_properties, - std::unordered_map<string, std::pair<int, string>>* output_edges, - tensorflow::NodeDef* constructed_trt_node, - int engine_precision_mode = FP32MODE, const string& device_name = "", - std::shared_ptr<nvinfer1::IGpuAllocator> allocator = nullptr, - int cuda_gpu_id = 0) - : graph(inp_graph), - subgraph_node_ids(subgraph_node_id_numbers), - input_inds(input_indices), - output_inds(output_indices), - max_batch_size(max_supported_batch_size), - max_workspace_size_bytes(max_consumed_workspace_size_bytes), - graph_properties(current_graph_properties), - output_edge_map(output_edges), - trt_node(constructed_trt_node), - precision_mode(engine_precision_mode), - device_name_(device_name), - allocator_(allocator), - cuda_gpu_id_(cuda_gpu_id) {} - - tensorflow::Graph& graph; - const std::set<int>& subgraph_node_ids; - const std::vector<std::pair<int, int>>& input_inds; // 
{node_id, output_idx} - const std::vector<std::pair<int, int>>& output_inds; // {node_id, output_idx} - size_t max_batch_size; - size_t max_workspace_size_bytes; - const tensorflow::grappler::GraphProperties& graph_properties; - std::unordered_map<string, std::pair<int, string>>* output_edge_map; - tensorflow::NodeDef* trt_node; - const int precision_mode; - const string device_name_; - std::shared_ptr<nvinfer1::IGpuAllocator> allocator_; - const int cuda_gpu_id_; +struct EngineConnection { + EngineConnection(const string& outside, int out_id, int out_port, + const string& inside, int in_id, int in_port, + bool input_edge, int port) + : outside_node_name(outside), + outside_id(out_id), + outside_port(out_port), + inside_node_name(inside), + inside_id(in_id), + inside_port(in_port), + is_input_edge(input_edge), + port_number(port) {} + + const string outside_node_name; + const int outside_id; + const int outside_port; + tensorflow::PartialTensorShape outside_shape; + + const string inside_node_name; + const int inside_id; + const int inside_port; + tensorflow::PartialTensorShape inside_shape; + + tensorflow::DataType connection_type; + bool is_input_edge; + + // The port number of the TRT node connecting to this edge. + int port_number; +}; + +struct EngineInfo { + EngineInfo() + : engine_type(EngineType::TRTStatic), + max_workspace_size_bytes(0), + precision_mode(FP32MODE) {} + + string engine_name; + string device; + tensorflow::GraphDef segment_graph_def; + + // The segment nodes that are on one side of the edges are topologically sorted. 
+ std::vector<EngineConnection> connections; + + enum class EngineType { TRTStatic = 0, TRTDynamic = 1 }; + EngineType engine_type; + int64 max_workspace_size_bytes; + int maximum_cached_engines; + std::vector<int> cached_engine_batches; + int precision_mode; }; -// TODO(sami): Replace references with const reference or pointers -tensorflow::Status ConvertSubGraphToTensorRTNodeDef(SubGraphParams& params); -tensorflow::Status InjectCalibrationNode(SubGraphParams& params); -tensorflow::Status ConvertCalibrationNodeToEngineNode(tensorflow::Graph& graph, - tensorflow::Node* c_node); +// Constructs a graphdef from the segment in the given graph. Adds placeholder +// nodes for input edges (InputPH_*) and identity nodes for output edges +// (OutputPH_*). This function needs to be called before TensorRT nodes +// are inserted in order to correctly get sizes from the original graph. +// +// - subgraph_node_ids: the node ids of the subgraph, must be sorted in +// topological order. +// - segment_def: the output GraphDef, whose non-input/output nodedefs will be +// sorted in topological order. +tensorflow::Status ConvertSegmentToGraphDef( + const tensorflow::Graph* graph, + const tensorflow::grappler::GraphProperties& graph_properties, + const std::vector<int>& subgraph_node_ids, + std::vector<EngineConnection>* connections, + tensorflow::GraphDef* segment_def, string* common_scope); + +// Converts given subgraph to a TRT engine saved in 'engine'. Returns ok iff +// 'builder' successfully builds the engine. If the result is not ok, 'engine' +// will be set to nullptr. +// Once returned, 'builder' is not needed any more and can be safely destroyed. +// +// - convert_successfully: indicates whether the conversion to TensorRT network +// is successful. This is different than successfully building the engine: +// building can still fail afterwards. 
+tensorflow::Status ConvertGraphDefToEngine( + const tensorflow::GraphDef& gdef, int precision_mode, int max_batch_size, + size_t max_workspace_size_bytes, + const std::vector<tensorflow::PartialTensorShape>& input_shapes, + Logger* logger, nvinfer1::IGpuAllocator* allocator, + TRTInt8Calibrator* calibrator, + TrtUniquePtrType<nvinfer1::ICudaEngine>* engine, + bool* convert_successfully); + } // namespace convert } // namespace tensorrt } // namespace tensorflow diff --git a/tensorflow/contrib/tensorrt/convert/trt_optimization_pass.cc b/tensorflow/contrib/tensorrt/convert/trt_optimization_pass.cc index 8f634b1f74..ec9dbfa13b 100644 --- a/tensorflow/contrib/tensorrt/convert/trt_optimization_pass.cc +++ b/tensorflow/contrib/tensorrt/convert/trt_optimization_pass.cc @@ -45,8 +45,24 @@ tensorflow::Status TRTOptimizationPass::Init( if (params.count("max_batch_size")) { maximum_batch_size_ = params.at("max_batch_size").i(); } - if (params.count("max_workspace_size_bytes")) + is_dynamic_op_ = false; + if (params.count("is_dynamic_op")) { + is_dynamic_op_ = params.at("is_dynamic_op").b(); + } + if (params.count("cached_engine_batches")) { + auto batch_vec = params.at("cached_engine_batches").list(); + batches_.reserve(batch_vec.i_size()); + for (const auto i : batch_vec.i()) { + batches_.push_back(i); + } + } + max_cached_batches_ = 1; + if (params.count("maximum_cached_engines")) { + max_cached_batches_ = params.at("maximum_cached_engines").i(); + } + if (params.count("max_workspace_size_bytes")) { maximum_workspace_size_ = params.at("max_workspace_size_bytes").i(); + } if (params.count("precision_mode")) { string pm = Uppercase(params.at("precision_mode").s()); if (pm == "FP32") { @@ -175,6 +191,17 @@ tensorflow::Status TRTOptimizationPass::Optimize( if (VLOG_IS_ON(1)) { PrintDebugInfo(cluster, item); } + // This is a hack to workaround optimizer issue. MetaOptimizer calls + // optimization passes on function objects as well, we should not modify + // generated funcdefs! 
This is fragile but we don't have any other option + // until framework fixes it. + if (item.id != "tf_graph") { + LOG(WARNING) << name_ + << " is probably called on funcdef! This optimizer must *NOT* " + "be called on function objects."; + *optimized_graph = item.graph; + return tensorflow::Status::OK(); + } int max_dim = -1; if (item.feed.size()) { for (const auto& f : item.feed) { @@ -204,11 +231,22 @@ tensorflow::Status TRTOptimizationPass::Optimize( } tensorflow::grappler::GraphProperties static_graph_properties(item); TF_RETURN_IF_ERROR(static_graph_properties.InferStatically(true)); - auto status = tensorflow::tensorrt::convert::ConvertAfterShapes( - item.graph, item.fetch, maximum_batch_size_, maximum_workspace_size_, - optimized_graph, precision_mode_, minimum_segment_size_, - static_graph_properties, cluster); + tensorflow::tensorrt::convert::ConversionParams cp; + cp.input_graph_def = &item.graph; + cp.output_names = &item.fetch; + cp.max_batch_size = maximum_batch_size_; + cp.max_workspace_size_bytes = maximum_workspace_size_; + cp.output_graph_def = optimized_graph; + cp.precision_mode = precision_mode_; + cp.minimum_segment_size = minimum_segment_size_; + cp.graph_properties = &static_graph_properties; + cp.cluster = cluster; + cp.is_dyn_op = is_dynamic_op_; + cp.cached_engine_batches = batches_; + cp.max_cached_engines = max_cached_batches_; + auto status = tensorflow::tensorrt::convert::ConvertAfterShapes(cp); VLOG(2) << optimized_graph->DebugString(); + VLOG(1) << "Returning from " << name_; return status; } diff --git a/tensorflow/contrib/tensorrt/convert/trt_optimization_pass.h b/tensorflow/contrib/tensorrt/convert/trt_optimization_pass.h index d8ecead23e..463ed3883e 100644 --- a/tensorflow/contrib/tensorrt/convert/trt_optimization_pass.h +++ b/tensorflow/contrib/tensorrt/convert/trt_optimization_pass.h @@ -61,6 +61,9 @@ class TRTOptimizationPass : public tensorflow::grappler::CustomGraphOptimizer { int minimum_segment_size_; int precision_mode_; 
int maximum_batch_size_; + bool is_dynamic_op_; + std::vector<int> batches_; + int max_cached_batches_; int64_t maximum_workspace_size_; }; diff --git a/tensorflow/contrib/tensorrt/convert/utils.h b/tensorflow/contrib/tensorrt/convert/utils.h new file mode 100644 index 0000000000..f601c06701 --- /dev/null +++ b/tensorflow/contrib/tensorrt/convert/utils.h @@ -0,0 +1,37 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CONTRIB_TENSORRT_CONVERT_UTILS_H_ +#define TENSORFLOW_CONTRIB_TENSORRT_CONVERT_UTILS_H_ + +#include <memory> + +namespace tensorflow { +namespace tensorrt { + +template <typename T> +struct TrtDestroyer { + void operator()(T* t) { + if (t) t->destroy(); + } +}; + +template <typename T> +using TrtUniquePtrType = std::unique_ptr<T, TrtDestroyer<T>>; + +} // namespace tensorrt +} // namespace tensorflow + +#endif // TENSORFLOW_CONTRIB_TENSORRT_CONVERT_UTILS_H_ diff --git a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc index 9ac8047944..8a17eb02f1 100644 --- a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc +++ b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc @@ -14,8 +14,16 @@ limitations under the License. 
==============================================================================*/ #include "tensorflow/contrib/tensorrt/kernels/trt_engine_op.h" +#include <algorithm> +#include "tensorflow/contrib/tensorrt/convert/convert_nodes.h" +#include "tensorflow/contrib/tensorrt/convert/utils.h" #include "tensorflow/contrib/tensorrt/log/trt_logger.h" -#include "tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h" +#include "tensorflow/contrib/tensorrt/resources/trt_resource_manager.h" +#include "tensorflow/contrib/tensorrt/resources/trt_resources.h" +#include "tensorflow/core/framework/graph_to_functiondef.h" +#include "tensorflow/core/lib/core/refcount.h" +#include "tensorflow/core/lib/strings/str_util.h" +#include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/stream_executor.h" #include "tensorflow/core/platform/types.h" @@ -25,144 +33,556 @@ limitations under the License. #include "cuda/include/cuda_runtime_api.h" namespace tensorflow { -static ::tensorflow::tensorrt::Logger logger; -using IRuntime = nvinfer1::IRuntime; -using Dims = nvinfer1::Dims; - namespace tensorrt { +static Logger logger; +using ::nvinfer1::IRuntime; +using ::tensorflow::strings::StrAppend; +using ::tensorflow::strings::StrCat; + +// A helper class to call done() when destructed for asynchronous execution. +// Helps simultaneous execution of native and TRT engines. 
+class AsyncHelper : public tensorflow::core::RefCounted { + public: + AsyncHelper(tensorflow::AsyncOpKernel::DoneCallback done) { done_ = done; } + ~AsyncHelper() override { done_(); } -TRTEngineOp::TRTEngineOp(OpKernelConstruction* context) : OpKernel(context) { + private: + tensorflow::AsyncOpKernel::DoneCallback done_; +}; + +#define TYPECASE(dt, X, Y) \ + case dt: { \ + return (void*)X->flat<tensorflow::EnumToDataType<dt>::Type>().data(); \ + } + +void* GetTensorAddress(const Tensor* tensor_ptr) { + auto tensor_type = tensor_ptr->dtype(); + switch (tensor_type) { + TYPECASE(tensorflow::DT_FLOAT, tensor_ptr, dest_ptr); + TYPECASE(tensorflow::DT_HALF, tensor_ptr, dest_ptr); + TYPECASE(tensorflow::DT_INT8, tensor_ptr, dest_ptr); + default: { + LOG(ERROR) << "Unsupported Data type " + << tensorflow::DataTypeString(tensor_type); + return nullptr; + } + } +} + +tensorflow::Status TRTEngineOp::ConstructFunctionHandle(OpKernelContext* ctx) { + VLOG(1) << "Constructing function handle"; + auto lib = ctx->function_library(); + if (lib == nullptr) { + return tensorflow::errors::Internal("Context function library is null"); + } + auto fdef = lib->GetFunctionLibraryDefinition()->Find(funcdef_name_); + if (fdef == nullptr) { + return tensorflow::errors::Internal("Native FunctionDef ", funcdef_name_, + " can't be found in function library"); + } + tensorflow::FunctionLibraryRuntime::InstantiateOptions inst_ops; + inst_ops.overlay_lib = nullptr; + inst_ops.state_handle = ""; + inst_ops.target = ctx->device()->name(); + native_func_ = 0; + auto status = lib->Instantiate(funcdef_name_, AttrSlice(&fdef->attr()), + inst_ops, &native_func_); + if (!status.ok()) { + LOG(ERROR) << " Instantiating native function " << funcdef_name_ + << " failed!"; + } + return status; +} + +TRTEngineOp::TRTEngineOp(OpKernelConstruction* context) + : AsyncOpKernel(context) { // read serialized_engine OP_REQUIRES_OK(context, - context->GetAttr("serialized_engine", &serialized_engine_)); + 
context->GetAttr("serialized_segment", &serialized_segment_)); + OP_REQUIRES_OK(context, + context->GetAttr("workspace_size_bytes", &workspace_size_)); + OP_REQUIRES_OK(context, context->GetAttr("static_engine", &static_engine_)); + if (!static_engine_) { + if (!segment_graph_.ParseFromString(serialized_segment_)) { + LOG(ERROR) << "Parsing segment graph failed!"; + context->SetStatus(tensorflow::errors::InvalidArgument( + "Failed to parse segment graphdef!")); + return; + } + serialized_segment_.resize(0); + } + VLOG(1) << "Constructing " << name(); + string precision_string; + OP_REQUIRES_OK(context, + context->GetAttr("precision_mode", &precision_string)); + string calibration_data; + OP_REQUIRES_OK(context, + context->GetAttr("calibration_data", &calibration_data)); + OP_REQUIRES_OK(context, + context->GetAttr("segment_funcdef_name", &funcdef_name_)); + if (precision_string == "FP32") { + precision_mode_ = convert::FP32MODE; + } else if (precision_string == "FP16") { + precision_mode_ = convert::FP16MODE; + } else if (precision_string == "INT8") { + precision_mode_ = convert::INT8MODE; + } + calibration_mode_ = + (precision_mode_ == convert::INT8MODE && calibration_data.size() == 0); + if (calibration_data.size()) { + calibrator_.reset(new TRTInt8Calibrator(calibration_data)); + calibration_data.resize(0); + } + native_func_ = tensorflow::kInvalidHandle; + OP_REQUIRES_OK(context, context->GetAttr("max_cached_engines_count", + &max_cached_engines_)); + OP_REQUIRES_OK(context, + context->GetAttr("fixed_input_size", &fixed_input_size_)); + OP_REQUIRES_OK(context, context->GetAttr("cached_engine_batches", + &cached_engine_batches_)); + std::sort(cached_engine_batches_.begin(), cached_engine_batches_.end()); + if (VLOG_IS_ON(1)) { + string s("Engine Batches= "); + for (auto i : cached_engine_batches_) { + StrAppend(&s, i, " "); + } + VLOG(1) << s; + } +} - // register input output node name in trt_sub_graph - OP_REQUIRES_OK(context, context->GetAttr("input_nodes", 
&input_nodes_)); - OP_REQUIRES_OK(context, context->GetAttr("output_nodes", &output_nodes_)); +void TRTEngineOp::ExecuteNativeSegment(tensorflow::OpKernelContext* ctx, + AsyncHelper* helper) { + if (!calibration_mode_) { + VLOG(1) << "Executing native engine"; + } + std::vector<Tensor> inputs; + std::vector<Tensor>* outputs = new std::vector<Tensor>(); + if (native_func_ == tensorflow::kInvalidHandle) { + auto status = ConstructFunctionHandle(ctx); + if (!status.ok()) { + LOG(ERROR) << "Couldn't construct function handle " << funcdef_name_; + ctx->SetStatus(status); + return; + } + } + auto lib = ctx->function_library(); + tensorflow::FunctionLibraryRuntime::Options opts; + opts.step_id = ctx->step_id(); + opts.rendezvous = ctx->rendezvous(); + opts.cancellation_manager = ctx->cancellation_manager(); + opts.runner = ctx->runner(); + for (int i = 0; i < ctx->num_inputs(); i++) { + inputs.push_back(ctx->input(i)); + } + helper->Ref(); // Increment count for calculating native graph + VLOG(1) << "Executing native segment " << name(); + lib->Run(opts, native_func_, inputs, outputs, + [ctx, outputs, helper](const tensorflow::Status& s) { + tensorflow::core::ScopedUnref sc(helper); + VLOG(1) << "Native Segment completed"; + if (!s.ok()) { + ctx->SetStatus(s); + return; + } + for (size_t t = 0; t < outputs->size(); ++t) { + ctx->set_output(t, outputs->at(t)); + } + delete outputs; + }); } -void TRTEngineOp::Compute(OpKernelContext* context) { - // TODO(samikama) runtime should be taken from a resourcemanager as well. - // Only engine should be in the op and context and runtime should be taken - // from resourcemanager +void TRTEngineOp::ExecuteCalibration(tensorflow::OpKernelContext* ctx, + AsyncHelper* helper) { + helper->Ref(); + tensorflow::core::ScopedUnref sc(helper); + // TODO(aaroey): remove the ResourceMgr singleton. 
+ auto trt_rm = TRTResourceManager::instance(); + auto res_mgr = trt_rm->getManager("TRTCalibration"); + TRTCalibrationResource* calib_res = nullptr; + auto status = res_mgr->LookupOrCreate( + funcdef_name_, "Calibrator", &calib_res, + {[ctx, this](TRTCalibrationResource** cr) -> tensorflow::Status { + return this->AllocateCalibrationResources(ctx, cr); + }}); + if (!status.ok()) { + ctx->SetStatus(status); + return; + } + int num_inputs = ctx->num_inputs(); + // Pass input data to calibrator + std::unordered_map<string, void*> input_data; + for (int i = 0; i < num_inputs; i++) { + const Tensor& t = ctx->input(i); + void* data_address = GetTensorAddress(&t); + if (data_address == nullptr) { + ctx->SetStatus(tensorflow::errors::InvalidArgument( + "Unsupported data type encountered in input ", i)); + return; + } + // Check the allocated buffer is sufficient for input + const auto device_tensor = dev_tensors_.at(i).AccessTensor(ctx); + CHECK_EQ(t.TotalBytes(), device_tensor->TotalBytes()); + input_data.emplace(StrCat(kInputPHName, i), data_address); + } + VLOG(2) << "Filled map for sending"; + // copied from cuda_kernel_helper since it seems only valid in *.cu.cc files + const cudaStream_t* stream = CHECK_NOTNULL( + reinterpret_cast<const cudaStream_t*>(ctx->op_device_context() + ->stream() + ->implementation() + ->CudaStreamMemberHack())); + calib_res->calibrator_->setBatch(input_data, *stream); + VLOG(2) << "Passed calibration data"; + ExecuteNativeSegment(ctx, helper); +} - if (!trt_execution_context_ptr_) { - IRuntime* infer = nvinfer1::createInferRuntime(logger); -#if NV_TENSORRT_MAJOR > 3 - auto device = context->device(); - auto dev_allocator = - device->GetAllocator(tensorflow::AllocatorAttributes()); - if (!dev_allocator) { - LOG(FATAL) << "Can't find device allocator for gpu device " - << device->name(); - } - allocator_ = std::make_shared<TRTDeviceAllocator>(dev_allocator); - infer->setGpuAllocator(allocator_.get()); -#endif - 
trt_engine_ptr_.reset(infer->deserializeCudaEngine( - serialized_engine_.c_str(), serialized_engine_.size(), - PluginFactoryTensorRT::GetInstance())); - trt_execution_context_ptr_.reset(trt_engine_ptr_->createExecutionContext()); - // Runtime is safe to delete after engine creation - infer->destroy(); - serialized_engine_.clear(); +int TRTEngineOp::GetEngineBatch(tensorflow::OpKernelContext* ctx) { + int num_batch = ctx->input(0).shape().dim_size(0); + int smallest_engine = 0; + for (const auto i : cached_engine_batches_) { + if (i >= num_batch) { + smallest_engine = i; + break; + } } - int num_binding = context->num_inputs() + context->num_outputs(); - std::vector<void*> buffers(num_binding); + // TODO(sami): Need an LRU here + if (smallest_engine == 0) { + if (max_cached_engines_ > cached_engine_batches_.size()) { + smallest_engine = num_batch; + cached_engine_batches_.push_back(num_batch); + VLOG(1) << "Running with batch size " << num_batch; + } else { + string s("Engine buffer is full. buffer limit= "); + StrAppend(&s, max_cached_engines_, ", current entries= "); + for (auto i : cached_engine_batches_) StrAppend(&s, i, ", "); + StrAppend(&s, "Requested batch= ", num_batch); + LOG(ERROR) << s; + ctx->SetStatus(tensorflow::errors::ResourceExhausted( + "Requested batch size is not available and engine cache is full")); + return -1; + } + } + return smallest_engine; +} - size_t binding_index; - int num_batch = 0; - for (int i = 0; i < context->num_inputs(); i++) { - // Grab the input tensor - binding_index = trt_engine_ptr_->getBindingIndex(input_nodes_[i].c_str()); +void TRTEngineOp::ComputeAsync(tensorflow::OpKernelContext* ctx, + tensorflow::AsyncOpKernel::DoneCallback done) { + auto helper = new AsyncHelper(done); + tensorflow::core::ScopedUnref sc(helper); + if (calibration_mode_) { + ExecuteCalibration(ctx, helper); + return; + } + const int smallest_engine = GetEngineBatch(ctx); + if (smallest_engine < 0) return; // GetEngineBatch already set the status. 
+ + const int num_batch = ctx->input(0).shape().dim_size(0); + auto& engine_ctx_pair = GetEngine(smallest_engine, ctx); + auto& trt_engine_ptr = engine_ctx_pair.first; + if (!trt_engine_ptr) { + LOG(WARNING) << "Engine retrieval for batch size " << num_batch + << " failed Running native segment"; + ExecuteNativeSegment(ctx, helper); + return; + } - const Tensor& input_tensor = context->input(i); + const int num_binding = ctx->num_inputs() + ctx->num_outputs(); + std::vector<void*> buffers(num_binding); + for (int i = 0; i < ctx->num_inputs(); i++) { + const string inp_name = StrCat(kInputPHName, i); + const size_t binding_index = + trt_engine_ptr->getBindingIndex(inp_name.c_str()); + + const Tensor& input_tensor = ctx->input(i); const TensorShape& input_shape = input_tensor.shape(); - if (i == 0) { - num_batch = input_shape.dim_size(0); - if (num_batch > trt_engine_ptr_->getMaxBatchSize()) { - LOG(FATAL) << "input tensor batch larger than max_batch_size: " - << trt_engine_ptr_->getMaxBatchSize(); - } - } else if (num_batch != input_shape.dim_size(0)) { - LOG(FATAL) << "input data inconsistent batch size"; - break; + if (num_batch != input_shape.dim_size(0)) { + LOG(ERROR) << "input data inconsistent batch size"; + ctx->SetStatus(tensorflow::errors::FailedPrecondition( + "Different batch sizes between input tensors")); + return; } - auto dtype = trt_engine_ptr_->getBindingDataType(binding_index); + auto dtype = trt_engine_ptr->getBindingDataType(binding_index); switch (dtype) { case nvinfer1::DataType::kFLOAT: buffers[binding_index] = (void*)(input_tensor.flat<float>().data()); break; case nvinfer1::DataType::kHALF: - LOG(FATAL) << "half size is not supported yet!"; - break; + LOG(ERROR) << "FP16 inputs are not supported yet!"; + ctx->SetStatus(tensorflow::errors::InvalidArgument( + "FP16 inputs are not supported!")); + return; case nvinfer1::DataType::kINT8: - LOG(FATAL) << "int8 is not supported yet!"; - break; + LOG(ERROR) << "INT8 inputs are not supported yet!"; 
+ ctx->SetStatus(tensorflow::errors::InvalidArgument( + "INT8 inputs are not supported!")); + return; default: - LOG(FATAL) << "Unknown data type: " << int(dtype); - break; + LOG(ERROR) << "Unknown TRT data type: " << int(dtype); + ctx->SetStatus(tensorflow::errors::InvalidArgument( + "Unknown output TRT data type! ", static_cast<int>(dtype))); + return; } } - for (int i = 0; i < static_cast<int>(output_nodes_.size()); i++) { - // This is bad that we have to reallocate output buffer every run. + for (int i = 0; i < ctx->num_outputs(); i++) { // Create an output tensor - binding_index = trt_engine_ptr_->getBindingIndex(output_nodes_[i].c_str()); + const string output_name = StrCat(kOutputPHName, i); + const size_t binding_index = + trt_engine_ptr->getBindingIndex(output_name.c_str()); Tensor* output_tensor = nullptr; TensorShape output_shape; if (binding_index != -1) { - auto dims = trt_engine_ptr_->getBindingDimensions(binding_index); + auto dims = trt_engine_ptr->getBindingDimensions(binding_index); std::vector<int> trt_shape(dims.nbDims + 1); trt_shape[0] = num_batch; for (int j = 0; j < dims.nbDims; j++) trt_shape[j + 1] = dims.d[j]; - OP_REQUIRES_OK(context, - TensorShapeUtils::MakeShape( - trt_shape.data(), trt_shape.size(), &output_shape)); + OP_REQUIRES_OK( + ctx, TensorShapeUtils::MakeShape(trt_shape.data(), trt_shape.size(), + &output_shape)); } else { - LOG(FATAL) << "output node not found, at " << output_nodes_[i]; - break; + LOG(ERROR) << "output node not found, at " << output_name; + ctx->SetStatus(tensorflow::errors::Internal("output ", output_name, + " couldn't be found!")); + return; } - - OP_REQUIRES_OK(context, - context->allocate_output(i, output_shape, &output_tensor)); - auto dtype = trt_engine_ptr_->getBindingDataType(binding_index); + auto status = ctx->allocate_output(i, output_shape, &output_tensor); + if (!status.ok()) { + LOG(ERROR) << "Allocating output failed with " << status; + ctx->SetStatus(status); + return; + } + auto dtype = 
trt_engine_ptr->getBindingDataType(binding_index); switch (dtype) { case nvinfer1::DataType::kFLOAT: buffers[binding_index] = reinterpret_cast<void*>(output_tensor->flat<float>().data()); break; case nvinfer1::DataType::kHALF: - LOG(FATAL) << "half size is not supported yet!"; - break; + LOG(ERROR) << "half size is not supported yet!"; + ctx->SetStatus(tensorflow::errors::InvalidArgument( + "Half outputs are not supported!")); + return; case nvinfer1::DataType::kINT8: - LOG(FATAL) << "int8 is not supported yet!"; - break; + LOG(ERROR) << "int8 is not supported yet!"; + ctx->SetStatus(tensorflow::errors::InvalidArgument( + "INT8 outputs are not supported!")); + return; default: - LOG(FATAL) << "Unknown data type: " << int(dtype); - break; + LOG(ERROR) << "Unknown TRT data type: " << static_cast<int>(dtype); + ctx->SetStatus(tensorflow::errors::InvalidArgument( + "Unsupported output data type! ", static_cast<int>(dtype))); + return; } } // copied from cuda_kernel_helper since it seems only valid in *.cu.cc files const cudaStream_t* stream = CHECK_NOTNULL( - reinterpret_cast<const cudaStream_t*>(context->op_device_context() + reinterpret_cast<const cudaStream_t*>(ctx->op_device_context() ->stream() ->implementation() ->CudaStreamMemberHack())); // TODO(jie): trt enqueue does not return error - auto ret = trt_execution_context_ptr_->enqueue(num_batch, &buffers[0], - *stream, nullptr); - VLOG(2) << "enqueue returns: " << ret; + auto& trt_execution_context_ptr = engine_ctx_pair.second; + auto ret = trt_execution_context_ptr->enqueue(num_batch, &buffers[0], *stream, + nullptr); + if (!ret) { + LOG(ERROR) << "Failed to enqueue batch for TRT engine: " << name(); + ctx->SetStatus(tensorflow::errors::Internal( + "Failed to enqueue batch for TRT engine: ", name())); + } // sync should be done by TF. } + TRTEngineOp::~TRTEngineOp() { - // Order matters! 
- trt_execution_context_ptr_.reset(); - trt_engine_ptr_.reset(); + // We need to manually destroy the engine and execution context before + // the allocator is destructed. + for (auto& eng : engine_map_) { + eng.second.first.reset(); + eng.second.second.reset(); + } allocator_.reset(); } + +nvinfer1::IGpuAllocator* TRTEngineOp::GetAllocator(OpKernelContext* ctx) { + if (allocator_) return allocator_.get(); + auto device = ctx->device(); + auto alloc = device->GetAllocator(tensorflow::AllocatorAttributes()); + if (!alloc) { + LOG(ERROR) << "Can't find device allocator for gpu device " + << device->name(); + ctx->SetStatus(tensorflow::errors::Internal( + "Can't get device allocator for device ", device->name())); + return nullptr; + } + allocator_.reset(new TRTDeviceAllocator(alloc)); + return allocator_.get(); +} + +TRTEngineOp::EngineCtxPair& TRTEngineOp::GetEngine(int batch_size, + OpKernelContext* ctx) { + static EngineCtxPair null_pair = { + TrtUniquePtrType<nvinfer1::ICudaEngine>(nullptr), + TrtUniquePtrType<nvinfer1::IExecutionContext>(nullptr)}; + // TODO(sami): This method needs to be re-written to use resource manager and + // with LRU mechanism option. + tensorflow::mutex_lock lock(engine_mutex_); + + if (static_engine_) { + if (engine_map_.size()) { + if (engine_map_.begin()->first >= batch_size) { + return engine_map_.begin()->second; + } + return null_pair; + } + TrtUniquePtrType<IRuntime> infer(nvinfer1::createInferRuntime(logger)); +#if NV_TENSORRT_MAJOR > 3 + auto allocator = GetAllocator(ctx); + if (allocator == nullptr) { + // GetAllocator already set the Status. 
+ return null_pair; + } + infer->setGpuAllocator(allocator); +#endif + TrtUniquePtrType<nvinfer1::ICudaEngine> static_engine( + infer->deserializeCudaEngine(serialized_segment_.c_str(), + serialized_segment_.size(), nullptr)); + auto raw_static_engine = static_engine.get(); + const auto max_batch_size = raw_static_engine->getMaxBatchSize(); + engine_map_[max_batch_size] = { + std::move(static_engine), + TrtUniquePtrType<nvinfer1::IExecutionContext>( + raw_static_engine->createExecutionContext())}; + // Runtime is safe to delete after engine creation + serialized_segment_.clear(); + if (max_batch_size < batch_size) return null_pair; + return engine_map_.at(max_batch_size); + } // static_engine_ + + // Handle the dynamic engine case. + auto engine_it = engine_map_.find(batch_size); + if (engine_it == engine_map_.end() && + engine_map_.size() < (size_t)max_cached_engines_) { + nvinfer1::IGpuAllocator* allocator = nullptr; +#if NV_TENSORRT_MAJOR > 3 + allocator = GetAllocator(ctx); + if (allocator == nullptr) { + // GetAllocator already set the Status. + return null_pair; + } +#endif + std::vector<tensorflow::PartialTensorShape> shapes; + for (int i = 0; i < ctx->num_inputs(); ++i) { + shapes.emplace_back(ctx->input(i).shape()); + } + TrtUniquePtrType<nvinfer1::ICudaEngine> engine; + bool convert_successfully = false; + VLOG(0) << name() << " Constructing a new engine with batch size " + << batch_size; + // Up to this point, calibrator_ can never be empty, since otherwise it + // means calibration_mode_ is true and this path won't get executed. + auto status = convert::ConvertGraphDefToEngine( + segment_graph_, precision_mode_, batch_size, workspace_size_, shapes, + &logger, allocator, calibrator_.get(), &engine, &convert_successfully); + if (!status.ok()) { + if (convert_successfully) { + // This means it fail to build the engine even when the network is built + // successfully, probably due to internal issues. In this case we don't + // retry in the future. 
+ engine_map_[batch_size] = {nullptr, nullptr}; + } + LOG(ERROR) << "Engine creation for batch size " << batch_size + << " failed " << status; + ctx->SetStatus(tensorflow::errors::Internal("Engine creation failed!")); + return null_pair; + } + VLOG(1) << "Conversion is done"; + TrtUniquePtrType<nvinfer1::IExecutionContext> exec_context( + engine->createExecutionContext()); + engine_map_[batch_size] = {std::move(engine), std::move(exec_context)}; + } + return engine_map_.at(batch_size); +} + +tensorflow::Status TRTEngineOp::AllocateCalibrationResources( + tensorflow::OpKernelContext* ctx, TRTCalibrationResource** cr) { + auto cres = new TRTCalibrationResource(); + *cr = cres; + // Get the allocator. + auto alloc = ctx->device()->GetAllocator(tensorflow::AllocatorAttributes()); + if (!alloc) { + LOG(WARNING) << "Can't get device allocator will not be able to " + "allocate memory from TensorFlow memory pool"; + cres->allocator_.reset(new TRTCudaAllocator); + } else { + cres->allocator_.reset(new TRTDeviceAllocator(alloc)); + } + // Get the input shapes. 
+ const int batch_size = ctx->input(0).dim_size(0); + const int num_inputs = ctx->num_inputs(); + std::vector<tensorflow::PartialTensorShape> shapes; + dev_tensors_.resize(num_inputs); + VLOG(1) << " Constructing calibrator"; + for (int i = 0; i < num_inputs; i++) { + // allocate workspace on device for inputs + const tensorflow::Tensor& t = ctx->input(i); + shapes.emplace_back(t.shape()); + Tensor* device_tensor; + TF_RETURN_IF_ERROR(ctx->allocate_persistent( + t.dtype(), t.shape(), &dev_tensors_.at(i), &device_tensor)); + CHECK_EQ(t.TotalBytes(), device_tensor->TotalBytes()); + void* device_address = GetTensorAddress(device_tensor); + if (device_address == nullptr) { + return tensorflow::errors::InvalidArgument( + "Unsupported data type encountered in input ", i); + } + device_buffers_.emplace( + StrCat(kInputPHName, i), + std::pair<void*, size_t>(device_address, device_tensor->TotalBytes())); + } + cres->calibrator_.reset( + new TRTInt8Calibrator(device_buffers_, batch_size, name())); + const string label(name()); + auto segment_graph = &segment_graph_; + const int cuda_gpu_id = ctx->device()->tensorflow_gpu_device_info()->gpu_id; + if (cuda_gpu_id < 0) { + LOG(ERROR) << "Can't get gpu_device_info from context->device()"; + return tensorflow::errors::InvalidArgument( + "Context->device doesn't contain device info!"); + } + const int64 workspace_size_bytes = workspace_size_; + cres->thr_.reset(new std::thread([cres, label, segment_graph, shapes, + cuda_gpu_id, workspace_size_bytes]() { + VLOG(0) << "Starting calibration thread on device " << cuda_gpu_id + << ", Calibration Resource @ " << cres; + auto err = cudaSetDevice(cuda_gpu_id); + if (err != cudaSuccess) { + // TODO(aaroey): should return error here. + LOG(ERROR) << "Couldn't set cuda device to " << cuda_gpu_id + << " in calibration thread"; + } + // ConvertGraphDefToEngine() will try to build the engine. 
This thread + // will loop inside buildCudaEngine() consuming the calibration data + // that is set by the TF op, and drive the builder until calibrator returns + // false. Engine is discarded after calibration table is generated + // + // TODO(aaroey): maybe setting the max batch size using the python + // calibration wrapper class. + auto s = convert::ConvertGraphDefToEngine( + *segment_graph, convert::INT8MODE, cres->calibrator_->getBatchSize(), + workspace_size_bytes, shapes, &cres->logger_, cres->allocator_.get(), + cres->calibrator_.get(), &cres->engine_, + /*convert_successfully=*/nullptr); + if (!s.ok()) { + LOG(ERROR) << "Calibration failed: " << s; + cres->calibrator_->setDone(); // Ignore further pushes + } + VLOG(1) << "Calibration loop terminated " << label; + })); + VLOG(1) << "initialized calibrator resource"; + return tensorflow::Status::OK(); +} + REGISTER_KERNEL_BUILDER(Name("TRTEngineOp").Device(DEVICE_GPU), TRTEngineOp); } // namespace tensorrt diff --git a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.h b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.h index e613a71422..6fe318be6a 100644 --- a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.h +++ b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.h @@ -19,9 +19,14 @@ limitations under the License. #include <memory> #include <vector> +#include "tensorflow/contrib/tensorrt/convert/utils.h" +#include "tensorflow/contrib/tensorrt/log/trt_logger.h" #include "tensorflow/contrib/tensorrt/resources/trt_allocator.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/graph.pb.h" #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/platform/mutex.h" #if GOOGLE_CUDA #if GOOGLE_TENSORRT @@ -30,32 +35,95 @@ limitations under the License. 
namespace tensorflow { namespace tensorrt { -class Logger; - +class TRTInt8Calibrator; +class TRTCalibrationResource; +class AsyncHelper; // TODO(Sami): Remove this file? -class TRTEngineOp : public OpKernel { + +// This OP can construct TRTEngine on the fly and if construction of engine +// fails, executes equivalent subgraph as a TensorFlow function. +class TRTEngineOp : public AsyncOpKernel { public: explicit TRTEngineOp(OpKernelConstruction* context); - void Compute(OpKernelContext* context) override; + void ComputeAsync(OpKernelContext* context, + AsyncOpKernel::DoneCallback done) override; ~TRTEngineOp(); private: - template <typename T> - struct Destroyer { - void operator()(T* d) { d->destroy(); } - }; - - template <typename T> - using destroyed_ptr = std::unique_ptr<T, Destroyer<T>>; - destroyed_ptr<nvinfer1::ICudaEngine> trt_engine_ptr_; + // Execute calibration + void ExecuteCalibration(OpKernelContext* ctx, AsyncHelper* helper); + + // Construct a function handle for executing native funcdef graph + Status ConstructFunctionHandle(OpKernelContext* ctx); + + // Execute replaced native segment as function Op. + void ExecuteNativeSegment(OpKernelContext* ctx, AsyncHelper* helper); + + // Allocate necessary resources for calibration + Status AllocateCalibrationResources(OpKernelContext* ctx, + TRTCalibrationResource** cr); + // TODO(samikama): context should go to a resource manager! - destroyed_ptr<nvinfer1::IExecutionContext> trt_execution_context_ptr_; + typedef std::pair<TrtUniquePtrType<nvinfer1::ICudaEngine>, + TrtUniquePtrType<nvinfer1::IExecutionContext>> + EngineCtxPair; + EngineCtxPair& GetEngine(int batch_size, OpKernelContext* ctx); + // Return engine batch closest to input batch. + int GetEngineBatch(OpKernelContext* ctx); + + nvinfer1::IGpuAllocator* GetAllocator(OpKernelContext* ctx); + + // map to keep engines and their execution context for given batch size. 
+ std::unordered_map<int, EngineCtxPair> engine_map_; std::vector<string> input_nodes_; std::vector<string> output_nodes_; - std::shared_ptr<nvinfer1::IGpuAllocator> allocator_; - string serialized_engine_; + + // keep device allocator for TRT. + std::unique_ptr<TRTDeviceAllocator> allocator_; + + // serialized protobuf segment or trt engine depending on static_engine_ flag. + string serialized_segment_; + + // Name of the function for TF native execution of the segment. + string funcdef_name_; + + // GraphDef representation of the segment. + GraphDef segment_graph_; + + // Lookup table for temporary staging areas of input tensors for calibration. + std::unordered_map<string, std::pair<void*, size_t>> device_buffers_; + + // Temporary staging areas for calibration inputs. + std::vector<PersistentTensor> dev_tensors_; + + // Engine Precision mode. + int precision_mode_; + + // Whether engine is constructed during the conversion or needs to be + // constructed from protobuf segment. + bool static_engine_; + + // Whether to calibrate INT8 engine. + bool calibration_mode_; + + // Whether non-batch ranks of the inputs are assumed to be fixed or not for + // engine construction. + bool fixed_input_size_; + + // Batches of the cached engines + std::vector<int> cached_engine_batches_; + + // Maximum number of cached engines + int max_cached_engines_; + + int64 workspace_size_; + mutex engine_mutex_; + FunctionLibraryRuntime::Handle native_func_; + + // The finalized calibrator for inference. 
+ std::unique_ptr<TRTInt8Calibrator> calibrator_; }; } // namespace tensorrt diff --git a/tensorflow/contrib/tensorrt/ops/trt_engine_op.cc b/tensorflow/contrib/tensorrt/ops/trt_engine_op.cc index 079d73f7be..383635f428 100644 --- a/tensorflow/contrib/tensorrt/ops/trt_engine_op.cc +++ b/tensorflow/contrib/tensorrt/ops/trt_engine_op.cc @@ -28,11 +28,19 @@ extern Status TRTEngineOpShapeInference(InferenceContext* c); } REGISTER_OP("TRTEngineOp") - .Attr("serialized_engine: string") - .Attr("input_nodes: list(string)") - .Attr("output_nodes: list(string)") - .Attr("InT: list({float32})") - .Attr("OutT: list({float32})") + .Attr("serialized_segment: string") + .Attr("input_shapes: list(shape)") + .Attr("output_shapes: list(shape)") + .Attr("segment_funcdef_name: string") + .Attr("InT: list({int8,float16,float32})") + .Attr("OutT: list({int8,float16,float32})") + .Attr("static_engine: bool = true") + .Attr("fixed_input_size: bool = true") + .Attr("cached_engine_batches: list(int) = []") + .Attr("max_cached_engines_count: int = 1") + .Attr("workspace_size_bytes: int") + .Attr("precision_mode: {'FP32', 'FP16', 'INT8', 'INT8CALIB'}") + .Attr("calibration_data: string = ''") .Input("in_tensor: InT") .Output("out_tensor: OutT") .SetShapeFn(shape_inference::TRTEngineOpShapeInference); diff --git a/tensorflow/contrib/tensorrt/python/trt_convert.py b/tensorflow/contrib/tensorrt/python/trt_convert.py index 338475d90e..79f512dbcf 100644 --- a/tensorflow/contrib/tensorrt/python/trt_convert.py +++ b/tensorflow/contrib/tensorrt/python/trt_convert.py @@ -21,6 +21,8 @@ from __future__ import print_function # pylint: disable=unused-import,line-too-long import six as _six from tensorflow.contrib.tensorrt.wrap_conversion import calib_convert +from tensorflow.contrib.tensorrt.wrap_conversion import get_linked_tensorrt_version +from tensorflow.contrib.tensorrt.wrap_conversion import get_loaded_tensorrt_version from tensorflow.contrib.tensorrt.wrap_conversion import trt_convert from 
tensorflow.core.framework import graph_pb2 from tensorflow.core.protobuf import rewriter_config_pb2 @@ -29,7 +31,9 @@ from tensorflow.python.framework import errors_impl as _impl from tensorflow.python.framework import meta_graph from tensorflow.python.framework import ops from tensorflow.python.grappler import tf_optimizer +from tensorflow.python.platform import tf_logging from tensorflow.python.util import compat + # pylint: enable=unused-import,line-too-long @@ -40,7 +44,10 @@ def create_inference_graph(input_graph_def, max_batch_size=1, max_workspace_size_bytes=2 << 20, precision_mode="FP32", - minimum_segment_size=3): + minimum_segment_size=3, + is_dynamic_op=False, + maximum_cached_engines=1, + cached_engine_batches=[]): """Python wrapper for the TRT transformation. Args: @@ -51,6 +58,10 @@ def create_inference_graph(input_graph_def, precision_mode: one of 'FP32', 'FP16' and 'INT8' minimum_segment_size: the minimum number of nodes required for a subgraph to be replaced by TRTEngineOp. + is_dynamic_op: whether to generate dynamic TRT ops which will build the TRT + network and engine at run time. + maximum_cached_engines: max number of cached TRT engines in dynamic TRT ops. + cached_engine_batches: batch sizes used to pre-create cached engines. Returns: New GraphDef with TRTEngineOps placed in graph replacing subgraphs. @@ -65,6 +76,30 @@ def create_inference_graph(input_graph_def, "It should be one of {}").format( precision_mode, "{'FP32', 'FP16', 'INT8'}")) mode = supported_precision_modes[precision_mode.upper()] + compiled_version = get_linked_tensorrt_version() + loaded_version = get_loaded_tensorrt_version() + version_mismatch = False + if loaded_version[0] < compiled_version[0]: + tf_logging.error( + "TensorRT version mismatch. Tensorflow was compiled against " + + "TensorRT %s but library loaded from environment is TensorRT %s" % + (".".join([str(x) for x in compiled_version]), + ".".join([str(x) for x in loaded_version])) + + ". 
Please make sure that correct version of TensorRT " + + "is available in the system and added to ldconfig or LD_LIBRARY_PATH" + ) + raise RuntimeError("Incompatible TensorRT library version") + for i in zip(loaded_version, compiled_version): + if i[0] != i[1]: + tf_logging.warn("TensorRT mismatch. Compiled against version " + + "%s, but loaded %s. Things may not work" % + (".".join([str(x) for x in compiled_version]), + ".".join([str(x) for x in loaded_version]))) + version_mismatch = True + break + if not version_mismatch: + tf_logging.info("Running against TensorRT version %s" % ".".join( + [str(x) for x in loaded_version])) def py2bytes(inp): return inp @@ -100,7 +135,9 @@ def create_inference_graph(input_graph_def, # pair or strings where first one is encoded status and the second # one is the transformed graphs protobuf string. out = trt_convert(input_graph_def_str, out_names, max_batch_size, - max_workspace_size_bytes, mode, minimum_segment_size) + max_workspace_size_bytes, mode, minimum_segment_size, + is_dynamic_op, maximum_cached_engines, + cached_engine_batches) status = to_string(out[0]) output_graph_def_string = out[1] del input_graph_def_str # Save some memory @@ -120,11 +157,12 @@ def create_inference_graph(input_graph_def, return output_graph_def -def calib_graph_to_infer_graph(calibration_graph_def): +def calib_graph_to_infer_graph(calibration_graph_def, is_dynamic_op=False): """Convert an existing calibration graph to inference graph. Args: calibration_graph_def: the calibration GraphDef object with calibration data + is_dynamic_op: whether to create dynamic static engines from calibration Returns: New GraphDef with TRTEngineOps placed in graph replacing calibration nodes. 
Raises: @@ -141,9 +179,16 @@ def calib_graph_to_infer_graph(calibration_graph_def): to_string = py2string else: to_string = py3string - + is_calib_graph = False + for n in calibration_graph_def.node: + if n.op == "TRTEngineOp": + is_calib_graph = is_calib_graph or not n.attr["calibration_data"].s + if not is_calib_graph: + tf_logging.error( + "Not a calib graph. Doesn't seem to contain any calibration nodes.") + return None graph_str = calibration_graph_def.SerializeToString() - out = calib_convert(graph_str) + out = calib_convert(graph_str, is_dynamic_op) status = to_string(out[0]) output_graph_def_string = out[1] del graph_str # Save some memory diff --git a/tensorflow/contrib/tensorrt/resources/trt_allocator.cc b/tensorflow/contrib/tensorrt/resources/trt_allocator.cc index 0f0508331c..9f115990c3 100644 --- a/tensorflow/contrib/tensorrt/resources/trt_allocator.cc +++ b/tensorflow/contrib/tensorrt/resources/trt_allocator.cc @@ -50,7 +50,7 @@ TRTDeviceAllocator::TRTDeviceAllocator(tensorflow::Allocator* allocator) } void TRTDeviceAllocator::free(void* memory) { - VLOG(2) << "Deallocating " << memory; + VLOG(2) << "Deallocating @ " << memory; allocator_->DeallocateRaw(memory); } diff --git a/tensorflow/contrib/tensorrt/resources/trt_allocator.h b/tensorflow/contrib/tensorrt/resources/trt_allocator.h index a0c2540a76..c5d2cec730 100644 --- a/tensorflow/contrib/tensorrt/resources/trt_allocator.h +++ b/tensorflow/contrib/tensorrt/resources/trt_allocator.h @@ -16,7 +16,6 @@ limitations under the License. #ifndef TENSORFLOW_CONTRIB_TENSORRT_RESOURCES_TRT_ALLOCATOR_H_ #define TENSORFLOW_CONTRIB_TENSORRT_RESOURCES_TRT_ALLOCATOR_H_ - #include "tensorflow/contrib/tensorrt/log/trt_logger.h" #include "tensorflow/core/framework/allocator.h" @@ -52,7 +51,9 @@ class TRTDeviceAllocator : public nvinfer1::IGpuAllocator { // Allocator implementation wrapping TF device allocators. 
public: TRTDeviceAllocator(tensorflow::Allocator* allocator); - virtual ~TRTDeviceAllocator() {} + virtual ~TRTDeviceAllocator() { + VLOG(1) << "Destroying allocator attached to " << allocator_->Name(); + } void* allocate(uint64_t size, uint64_t alignment, uint32_t flags) override; void free(void* memory) override; diff --git a/tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.cc b/tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.cc index dc7c93f869..32e81858b9 100644 --- a/tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.cc +++ b/tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.cc @@ -16,7 +16,6 @@ limitations under the License. #include "tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.h" #include <atomic> -#include <chrono> #include <unordered_map> #include "tensorflow/core/platform/logging.h" @@ -37,15 +36,22 @@ TRTInt8Calibrator::TRTInt8Calibrator( : batch_size_(batch_size), done_(false), dev_buffers_(dev_buffers), - calib_running_(false), + calib_running_(true), batch_is_set_(false), engine_name_(engine_name) {} +TRTInt8Calibrator::TRTInt8Calibrator(const string& calib_data) + : batch_size_(0), + done_(false), + calib_running_(false), + batch_is_set_(false), + calibration_table_(calib_data) {} + bool TRTInt8Calibrator::setBatch(const std::unordered_map<string, void*>& data, const cudaStream_t stream) { tensorflow::mutex_lock lock(cond_mtx_); - while ((calib_running_ || batch_is_set_) && - !done_) { // wait while calibration is running + // wait while calibration is running. + while ((calib_running_ || batch_is_set_) && !done_) { cond_.wait(lock); } if (done_) return false; @@ -59,8 +65,6 @@ bool TRTInt8Calibrator::setBatch(const std::unordered_map<string, void*>& data, } const auto& d = devptr->second; - // TODO(aaroey): we should not use sync copy on default stream. Make sure - // stream->ThenMemcpy() is used in future PRs. 
// TODO(sami,aaroey): Need to figure out a way to ensure synchronization // between stream, perhaps using a tensor? auto status = cudaMemcpyAsync(d.first, it.second, d.second, @@ -84,13 +88,11 @@ bool TRTInt8Calibrator::getBatch(void** bindings, const char** names, tensorflow::mutex_lock lock(cond_mtx_); calib_running_ = false; cond_.notify_all(); - while ((!batch_is_set_ && !done_)) { // wait until new batch arrives + // wait until new batch arrives + while ((!batch_is_set_ && !done_)) { cond_.wait(lock); - - } - if (done_) { - return false; } + if (done_) return false; for (int i = 0; i < num_bindings; i++) { auto it = dev_buffers_.find(names[i]); @@ -107,7 +109,9 @@ bool TRTInt8Calibrator::getBatch(void** bindings, const char** names, } const void* TRTInt8Calibrator::readCalibrationCache(std::size_t& length) { - return nullptr; + if (calibration_table_.empty()) return nullptr; + length = calibration_table_.size(); + return calibration_table_.data(); } void TRTInt8Calibrator::setDone() { @@ -117,7 +121,11 @@ void TRTInt8Calibrator::setDone() { } void TRTInt8Calibrator::writeCalibrationCache(const void* ptr, - std::size_t length) {} + std::size_t length) { + calibration_table_ = string((const char*)ptr, length); + VLOG(1) << "Got calibration data for " << engine_name_ << " @" << ptr + << " length=" << length; +} TRTInt8Calibrator::~TRTInt8Calibrator() { VLOG(1) << "Destroying calibrator for " << engine_name_; } diff --git a/tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.h b/tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.h index d77aa2c5ab..994312d7c3 100644 --- a/tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.h +++ b/tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.h @@ -39,29 +39,48 @@ struct TRTInt8Calibrator : public nvinfer1::IInt8EntropyCalibrator { TRTInt8Calibrator( const std::unordered_map<string, std::pair<void*, size_t>>& dev_buffers, int batch_size, string engine_name); + + TRTInt8Calibrator(const string& 
calibration_data); + + ~TRTInt8Calibrator(); + int getBatchSize() const override; + bool getBatch(void* bindings[], const char* names[], int num_bindings) override; + bool setBatch(const std::unordered_map<string, void*>& data, const cudaStream_t stream); + void setDone(); + + // If not null, calibration is skipped. const void* readCalibrationCache(std::size_t& length) override; + void writeCalibrationCache(const void* ptr, std::size_t length) override; - ~TRTInt8Calibrator(); + + const string& getCalibrationTableAsString() { return calibration_table_; } private: const int batch_size_; - tensorflow::mutex cond_mtx_; // mutex for condition_variable - tensorflow::condition_variable cond_; // condition variable to implement - // producer-consumer queue for - // calibration + + // mutex for condition_variable + tensorflow::mutex cond_mtx_; + + // condition variable to implement producer-consumer queue for calibration + tensorflow::condition_variable cond_; + + // Is calibration finished? bool done_; - const std::unordered_map<string, std::pair<void*, size_t>> - dev_buffers_; // map to keep tensorrt input buffers and sizes keyed with - // buffer names + + // Map to keep tensorrt input buffers and sizes keyed with buffer names + const std::unordered_map<string, std::pair<void*, size_t>> dev_buffers_; + bool calib_running_; bool batch_is_set_; + string engine_name_; + string calibration_table_; }; } // namespace tensorrt diff --git a/tensorflow/contrib/tensorrt/resources/trt_resources.h b/tensorflow/contrib/tensorrt/resources/trt_resources.h index e3469124ac..b7d5ffd674 100644 --- a/tensorflow/contrib/tensorrt/resources/trt_resources.h +++ b/tensorflow/contrib/tensorrt/resources/trt_resources.h @@ -22,6 +22,7 @@ limitations under the License. 
#include <thread> #include <vector> +#include "tensorflow/contrib/tensorrt/convert/utils.h" #include "tensorflow/contrib/tensorrt/log/trt_logger.h" #include "tensorflow/contrib/tensorrt/resources/trt_allocator.h" #include "tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.h" @@ -34,50 +35,48 @@ limitations under the License. namespace tensorflow { namespace tensorrt { + class TRTCalibrationResource : public tensorflow::ResourceBase { public: - TRTCalibrationResource() - : calibrator_(nullptr), - builder_(nullptr), - network_(nullptr), - engine_(nullptr), - logger_(nullptr), - thr_(nullptr) {} - ~TRTCalibrationResource() { VLOG(0) << "Destroying Calibration Resource " << std::endl << DebugString(); + builder_.reset(); + engine_.reset(); + // We need to manually destroy the builder and engine before the allocator + // is destroyed. + allocator_.reset(); } string DebugString() override { std::stringstream oss; - oss << " Calibrator = " << std::hex << calibrator_ << std::dec << std::endl - << " Builder = " << std::hex << builder_ << std::dec << std::endl - << " Network = " << std::hex << network_ << std::dec << std::endl - << " Engine = " << std::hex << engine_ << std::dec << std::endl - << " Logger = " << std::hex << logger_ << std::dec << std::endl - << " Allocator = " << std::hex << allocator_.get() << std::dec - << std::endl - << " Thread = " << std::hex << thr_ << std::dec << std::endl; + using std::dec; + using std::endl; + using std::hex; + oss << " Calibrator = " << hex << calibrator_.get() << dec << endl + << " Builder = " << hex << builder_.get() << dec << endl + << " Engine = " << hex << engine_.get() << dec << endl + << " Logger = " << hex << &logger_ << dec << endl + << " Allocator = " << hex << allocator_.get() << dec << endl + << " Thread = " << hex << thr_.get() << dec << endl; return oss.str(); } - TRTInt8Calibrator* calibrator_; - nvinfer1::IBuilder* builder_; - nvinfer1::INetworkDefinition* network_; - nvinfer1::ICudaEngine* engine_; - 
std::shared_ptr<nvinfer1::IGpuAllocator> allocator_; - tensorflow::tensorrt::Logger* logger_; + std::unique_ptr<TRTInt8Calibrator> calibrator_; + TrtUniquePtrType<nvinfer1::IBuilder> builder_; + TrtUniquePtrType<nvinfer1::ICudaEngine> engine_; + std::unique_ptr<nvinfer1::IGpuAllocator> allocator_; + tensorflow::tensorrt::Logger logger_; // TODO(sami): Use threadpool threads! - std::thread* thr_; + std::unique_ptr<std::thread> thr_; }; -class TRTWeightStore : public tensorflow::ResourceBase { +class TRTWeightStore { public: TRTWeightStore() {} virtual ~TRTWeightStore() { VLOG(1) << "Destroying store" << DebugString(); } - string DebugString() override { + string DebugString() { std::stringstream oss; size_t len_bytes = 0; for (const auto& v : store_) { diff --git a/tensorflow/contrib/tensorrt/segment/segment.h b/tensorflow/contrib/tensorrt/segment/segment.h index 1568dd9153..81b4bfe49f 100644 --- a/tensorflow/contrib/tensorrt/segment/segment.h +++ b/tensorflow/contrib/tensorrt/segment/segment.h @@ -29,8 +29,9 @@ namespace tensorflow { namespace tensorrt { namespace segment { -// vector of segments, each entry contains a device name and a set of nodes in -// segment +// Vector of segments, each entry contains a set of node names and a device name +// in the segment. +// TODO(aaroey): use node pointer instead of node name. using SegmentNodesVector = std::vector<std::pair<std::set<string>, string>>; struct SegmentOptions { @@ -48,6 +49,8 @@ struct SegmentOptions { // in the vector describes a subgraph by giving a set of the names of // all the NodeDefs in that subgraph. // @return the status. +// +// TODO(aaroey): remove this method. 
tensorflow::Status SegmentGraph( const tensorflow::GraphDef& gdef, const std::function<bool(const tensorflow::Node*)>& candidate_fn, diff --git a/tensorflow/contrib/tensorrt/shape_fn/trt_shfn.cc b/tensorflow/contrib/tensorrt/shape_fn/trt_shfn.cc index f36495f6b6..227ac120dd 100644 --- a/tensorflow/contrib/tensorrt/shape_fn/trt_shfn.cc +++ b/tensorflow/contrib/tensorrt/shape_fn/trt_shfn.cc @@ -29,61 +29,35 @@ namespace tensorflow { namespace shape_inference { tensorflow::Status TRTEngineOpShapeInference(InferenceContext* context) { - tensorflow::tensorrt::Logger logger; - string serialized_engine; - TF_RETURN_IF_ERROR(context->GetAttr("serialized_engine", &serialized_engine)); - nvinfer1::IRuntime* infer = nvinfer1::createInferRuntime(logger); - nvinfer1::ICudaEngine* trt_engine = infer->deserializeCudaEngine( - serialized_engine.c_str(), serialized_engine.size(), - tensorrt::PluginFactoryTensorRT::GetInstance()); - - int num_batch = -1; - std::vector<::tensorflow::DataType> input_type; - TF_RETURN_IF_ERROR(context->GetAttr("InT", &input_type)); - for (size_t i = 0; i < context->num_inputs(); i++) { - // Check if input shape is legit - auto input_shape = context->input(i); - for (int j = 0; j < context->Rank(input_shape); j++) { - auto dim_handler = context->Dim(input_shape, j); - if (j == 0) { - if (i == 0) { - num_batch = context->Value(dim_handler); - } else if (num_batch != context->Value(dim_handler)) { - // TODO(jie): TensorRT engine requires consistent batch between inputs - // tensors. Segmenter should be aware of this. 
- LOG(FATAL) << "TensorRT engine requires consistent batch size"; - } - } - } + std::vector<tensorflow::TensorShape> shapes; + for (int i = 0; i < context->num_outputs(); ++i) { + context->set_output(i, context->UnknownShape()); } - - // Arrange input here - std::vector<string> input_nodes; - TF_RETURN_IF_ERROR(context->GetAttr("input_nodes", &input_nodes)); - - // Arrange output here - std::vector<string> output_nodes; - TF_RETURN_IF_ERROR(context->GetAttr("output_nodes", &output_nodes)); - for (size_t i = 0; i < output_nodes.size(); i++) { - int binding_index = trt_engine->getBindingIndex(output_nodes[i].c_str()); - ShapeHandle output_shape; - std::vector<DimensionHandle> dim_vec; - dim_vec.emplace_back(context->MakeDim(num_batch)); - if (binding_index != -1) { - auto dims = trt_engine->getBindingDimensions(binding_index); - for (int j = 0; j < dims.nbDims; j++) { - dim_vec.emplace_back(context->MakeDim(dims.d[j])); - } - } else { - LOG(FATAL) << "TensorRT engine cannot find binding: " << output_nodes[i]; - } - output_shape = context->MakeShape(dim_vec); - context->set_output(i, output_shape); + auto status = context->GetAttr("input_shapes", &shapes); + // it is ok to not to have shapes + if (!status.ok()) return Status::OK(); + if ((int)shapes.size() != context->num_inputs()) return Status::OK(); + bool different_input = false; + for (int i = 0; i < context->num_inputs(); ++i) { + if (shapes.at(i) != context->input_tensor(i)->shape()) + different_input = true; + } + if (different_input) return Status::OK(); + shapes.resize(0); + status = context->GetAttr("output_shapes", &shapes); + if (!status.ok()) return Status::OK(); + if ((int)shapes.size() != context->num_outputs()) return Status::OK(); + std::vector<ShapeHandle> shape_handles(shapes.size()); + for (size_t i = 0; i < shapes.size(); ++i) { + status = + context->MakeShapeFromTensorShape(shapes.at(i), &shape_handles.at(i)); + if (!status.ok()) return Status::OK(); + } + for (int i = 0; i < 
context->num_outputs(); ++i) { + context->set_output(i, shape_handles.at(i)); } - return Status::OK(); } - } // namespace shape_inference } // namespace tensorflow diff --git a/tensorflow/contrib/tensorrt/test/test_tftrt.py b/tensorflow/contrib/tensorrt/test/test_tftrt.py index 175ccd8006..090aa8bdb0 100644 --- a/tensorflow/contrib/tensorrt/test/test_tftrt.py +++ b/tensorflow/contrib/tensorrt/test/test_tftrt.py @@ -20,6 +20,7 @@ from __future__ import print_function import argparse import numpy as np +import six as _six # normally we should do import tensorflow as tf and then # tf.placeholder, tf.constant, tf.nn.conv2d etc but @@ -35,10 +36,75 @@ from tensorflow.python.framework import dtypes as dtypes from tensorflow.python.framework import importer as importer from tensorflow.python.framework import ops as ops from tensorflow.python.ops import array_ops as aops +from tensorflow.python.ops import math_ops as mops from tensorflow.python.ops import nn as nn from tensorflow.python.ops import nn_ops as nn_ops +def py2bytes(inp): + return inp + + +def py3bytes(inp): + return inp.encode("utf-8", errors="surrogateescape") + + +def py2string(inp): + return inp + + +def py3string(inp): + return inp.decode("utf-8") + + +if _six.PY2: + to_bytes = py2bytes + to_string = py2string +else: + to_bytes = py3bytes + to_string = py3string + + +def get_multi_engine_graph_def(mode="FP32"): + """Create a simple graph and return its graph_def.""" + dtype = dtypes.float32 + if mode.upper() == "FP16": + dtype = dtypes.float16 + else: + pass + + g = ops.Graph() + with g.as_default(): + x = aops.placeholder(shape=[None, 3, 7, 5], name="input", dtype=dtype) + with g.name_scope("Global_scope"): + with g.name_scope("first_scope"): + e = cop.constant( + np.random.randn(3, 2, 3, 4), name="weights", dtype=dtype) + conv = nn.conv2d( + input=x, + filter=e, + data_format="NCHW", + strides=[1, 1, 1, 1], + padding="VALID", + name="conv") + b = cop.constant(np.random.randn(1, 4, 1, 1), name="bias1", 
dtype=dtype) + t = conv * b + + b = cop.constant(np.random.randn(1, 4, 1, 1), name="bias2", dtype=dtype) + q = conv / b + edge = mops.sin(q) + edge1 = mops.cos(conv) + with g.name_scope("test_scope"): + de = edge + edge1 + t -= edge1 + q *= edge + t += q + t -= de + k = aops.squeeze(t, name="output") + print(k.dtype) + return g.as_graph_def() + + def get_simple_graph_def(): """Create a simple graph and return its graph_def.""" g = ops.Graph() @@ -65,7 +131,9 @@ def get_simple_graph_def(): def execute_graph(gdef, dumm_inp): """Run given graphdef once.""" print("executing") - gpu_options = cpb2.GPUOptions(per_process_gpu_memory_fraction=0.50) + gpu_options = None + if trt.trt_convert.get_linked_tensorrt_version()[0] == 3: + gpu_options = cpb2.GPUOptions(per_process_gpu_memory_fraction=0.50) sessconfig = cpb2.ConfigProto(gpu_options=gpu_options) ops.reset_default_graph() g = ops.Graph() @@ -83,7 +151,9 @@ def execute_graph(gdef, dumm_inp): # for calibration. For this test script it is random data. 
def execute_calibration(gdef, dumm_inp): """Run given calibration graph multiple times.""" - gpu_options = cpb2.GPUOptions(per_process_gpu_memory_fraction=0.50) + gpu_options = None + if trt.trt_convert.get_linked_tensorrt_version()[0] == 3: + gpu_options = cpb2.GPUOptions(per_process_gpu_memory_fraction=0.50) ops.reset_default_graph() g = ops.Graph() with g.as_default(): @@ -100,12 +170,17 @@ def execute_calibration(gdef, dumm_inp): return val -def user(run_graph=execute_graph, run_calibration=execute_calibration): +def user(multi_engine, + run_graph=execute_graph, + run_calibration=execute_calibration): """Example function that converts a graph to TFTRT graph.""" - - inp_dims = (100, 24, 24, 2) + if multi_engine: + inp_dims = (2, 3, 7, 5) + orig_graph = get_multi_engine_graph_def() + else: + inp_dims = (100, 24, 24, 2) + orig_graph = get_simple_graph_def() # use a frozen graph for inference dummy_input = np.random.random_sample(inp_dims) - orig_graph = get_simple_graph_def() # use a frozen graph for inference # Get optimized graph trt_graph = trt.create_inference_graph( input_graph_def=orig_graph, @@ -113,8 +188,10 @@ def user(run_graph=execute_graph, run_calibration=execute_calibration): max_batch_size=inp_dims[0], max_workspace_size_bytes=1 << 25, precision_mode="FP32", # TRT Engine precision "FP32","FP16" or "INT8" - minimum_segment_size=2 # minimum number of nodes in an engine - ) + minimum_segment_size=2, # minimum number of nodes in an engine + is_dynamic_op=False, + maximum_cached_engines=1, + cached_engine_batches=[]) o1 = run_graph(orig_graph, dummy_input) o2 = run_graph(trt_graph, dummy_input) o3 = run_graph(trt_graph, dummy_input) @@ -126,40 +203,51 @@ def user(run_graph=execute_graph, run_calibration=execute_calibration): max_batch_size=inp_dims[0], max_workspace_size_bytes=1 << 25, precision_mode="FP16", # TRT Engine precision "FP32","FP16" or "INT8" - minimum_segment_size=2 # minimum number of nodes in an engine - ) + minimum_segment_size=2, # 
minimum number of nodes in an engine + is_dynamic_op=False, + maximum_cached_engines=1, + cached_engine_batches=[]) int8_calib_gdef = trt.create_inference_graph( input_graph_def=orig_graph, outputs=["output"], max_batch_size=inp_dims[0], max_workspace_size_bytes=1 << 25, precision_mode="INT8", # TRT Engine precision "FP32","FP16" or "INT8" - minimum_segment_size=2 # minimum number of nodes in an engine - ) + minimum_segment_size=2, # minimum number of nodes in an engine + is_dynamic_op=False, + maximum_cached_engines=1, + cached_engine_batches=[]) o4 = run_graph(fp16_graph, dummy_input) _ = run_calibration(int8_calib_gdef, dummy_input) int8_graph = trt.calib_graph_to_infer_graph(int8_calib_gdef) o5 = run_graph(int8_graph, dummy_input) - assert np.allclose(o1, o4) - assert np.allclose(o1, o5) + print("Is FP32 == FP16? %s (False is possible)" % np.allclose(o1, o4)) + print("Is FP32 == INT8? %s (False is possible)" % np.allclose(o1, o5)) print("Pass") -def auto(): +def auto(multi_engine): """Run the conversion as an optimization pass.""" - inp_dims = (100, 24, 24, 2) + if multi_engine: + inp_dims = (2, 3, 7, 5) + orig_graph = get_multi_engine_graph_def() + else: + inp_dims = (100, 24, 24, 2) + orig_graph = get_simple_graph_def() # use a frozen graph for inference dummy_input = np.random.random_sample(inp_dims) - orig_graph = get_simple_graph_def() opt_config = rwpb2.RewriterConfig() + opt_config.meta_optimizer_iterations = opt_config.ONE opt_config.optimizers.extend(["constfold", "layout"]) custom_op = opt_config.custom_optimizers.add() custom_op.name = "TensorRTOptimizer" custom_op.parameter_map["minimum_segment_size"].i = 3 - custom_op.parameter_map["precision_mode"].s = "FP32" + custom_op.parameter_map["precision_mode"].s = to_bytes("FP32") custom_op.parameter_map["max_batch_size"].i = inp_dims[0] custom_op.parameter_map["max_workspace_size_bytes"].i = 1 << 25 print(custom_op) - gpu_options = cpb2.GPUOptions(per_process_gpu_memory_fraction=0.50) + gpu_options = 
None + if trt.trt_convert.get_linked_tensorrt_version()[0] == 3: + gpu_options = cpb2.GPUOptions(per_process_gpu_memory_fraction=0.50) graph_options = cpb2.GraphOptions(rewrite_options=opt_config) sessconfig = cpb2.ConfigProto( gpu_options=gpu_options, graph_options=graph_options) @@ -168,7 +256,7 @@ def auto(): ops.reset_default_graph() with g.as_default(): inp, out = importer.import_graph_def( - graph_def=orig_graph, return_elements=["input", "output"]) + graph_def=orig_graph, return_elements=["input", "output"], name="") inp = inp.outputs[0] out = out.outputs[0] with csess.Session(config=sessconfig, graph=g) as sess: @@ -186,8 +274,14 @@ if "__main__" in __name__: action="store_true", help="Do TRT conversion automatically", default=False) + P.add_argument( + "--multi-engine", + "-m", + action="store_true", + help="Use a graph that will result in 2 engines", + default=False) flags, unparsed = P.parse_known_args() if flags.automatic: - auto() + auto(flags.multi_engine) else: - user() + user(flags.multi_engine) diff --git a/tensorflow/contrib/tensorrt/trt_conversion.i b/tensorflow/contrib/tensorrt/trt_conversion.i index 46480e99a1..d51a0b59e2 100644 --- a/tensorflow/contrib/tensorrt/trt_conversion.i +++ b/tensorflow/contrib/tensorrt/trt_conversion.i @@ -48,12 +48,53 @@ PyObject* pair_helper(std::pair<string, string>* in) { } return tuple; } + +struct version_struct{ + int vmajor; + int vminor; + int vpatch; +}; + +PyObject* version_helper(version_struct* in) { + PyObject *tuple(nullptr); + tuple = Py_BuildValue("(iii)", in->vmajor, in->vminor, in->vpatch); + if (!tuple) { + if (!PyErr_Occurred()) { + PyErr_SetString(PyExc_TypeError, + "Tuple creation from version structure failed!"); + } + return NULL; + } + return tuple; +} +/* Define converters for vector<int> */ +template<> +bool _PyObjAs(PyObject *pyobj, int* dest) { + *dest = PyLong_AsLong(pyobj); + return true; +} + +template<> +PyObject *_PyObjFrom(const int& src) { + return PyLong_FromLong(src); +} + %} + 
+_LIST_OUTPUT_TYPEMAP(int, PyLong_FromLong); + %typemap(out) std::pair<string, string> { PyObject *tuple = pair_helper(&$1); if (!tuple) SWIG_fail; $result = tuple; } + +%typemap(out) version_struct { + PyObject *tuple = version_helper(&$1); + if (!tuple) SWIG_fail; + $result = tuple; +} + %{ #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/core/status.h" @@ -65,6 +106,8 @@ PyObject* pair_helper(std::pair<string, string>* in) { %unignore tensorflow; %unignore trt_convert; %unignore calib_convert; +%unignore get_linked_tensorrt_version; +%unignore get_loaded_tensorrt_version; %{ @@ -74,7 +117,10 @@ std::pair<string, string> trt_convert( size_t max_batch_size, size_t max_workspace_size_bytes, int precision_mode, - int minimum_segment_size + int minimum_segment_size, + bool is_dyn_op, + int max_cached_engines, + std::vector<int> cached_engine_batches // Unfortunately we can't use TF_Status here since it // is in c/c_api and brings in a lot of other libraries // which in turn declare ops. 
These ops are included @@ -102,11 +148,12 @@ std::pair<string, string> trt_convert( out_status = "InvalidArgument;Size of the output_names vector is 0"; return std::pair<string, string>{out_status, ""}; } - tensorflow::GraphDef outGraph; + tensorflow::GraphDef out_graph; tensorflow::Status conversion_status = tensorflow::tensorrt::convert::ConvertGraphDefToTensorRT( graph_def, output_names, max_batch_size, max_workspace_size_bytes, - &outGraph, precision_mode, minimum_segment_size); + &out_graph, precision_mode, minimum_segment_size, + is_dyn_op, max_cached_engines, cached_engine_batches); if (!conversion_status.ok()) { auto retCode = (int)conversion_status.code(); char buff[2000]; @@ -116,7 +163,7 @@ std::pair<string, string> trt_convert( return std::pair<string, string>{out_status, ""}; } string result; - if (!outGraph.SerializeToString(&result)) { + if (!out_graph.SerializeToString(&result)) { out_status = "InvalidArgument;Couldn't serialize output as a GraphDef"; return std::pair<string, string>{out_status, ""}; } @@ -128,7 +175,8 @@ std::pair<string, string> trt_convert( #endif // GOOGLE_CUDA && GOOGLE_TENSORRT } -std::pair<string, string> calib_convert(string graph_def_string // const tensorflow::GraphDef& +std::pair<string, string> calib_convert( + string graph_def_string, bool is_dyn_op // unfortunately we can't use TF_Status here since it // is in c/c_api and brings in a lot of other libraries // which in turn declare ops. 
These ops are included @@ -147,11 +195,11 @@ std::pair<string, string> calib_convert(string graph_def_string // const tenso out_status = "InvalidArgument;Couldn't interpret input as a GraphDef"; return std::pair<string, string>{out_status, ""}; } - - tensorflow::GraphDef outGraph; + graph_def_string.resize(0); + tensorflow::GraphDef out_graph; tensorflow::Status conversion_status = - tensorflow::tensorrt::convert::ConvertCalibGraphToInferGraph(graph_def, - &outGraph); + tensorflow::tensorrt::convert::ConvertCalibGraphToInferGraph( + graph_def, &out_graph, is_dyn_op); if (!conversion_status.ok()) { auto retCode = (int)conversion_status.code(); char buff[2000]; @@ -161,7 +209,7 @@ std::pair<string, string> calib_convert(string graph_def_string // const tenso return std::pair<string, string>{out_status, ""}; } string result; - if (!outGraph.SerializeToString(&result)) { + if (!out_graph.SerializeToString(&result)) { out_status = "InvalidArgument;Couldn't serialize output as a GraphDef"; return std::pair<string, string>{out_status, ""}; } @@ -172,15 +220,39 @@ std::pair<string, string> calib_convert(string graph_def_string // const tenso return std::pair<string, string>{"9;TensorRT is not enabled!", ""}; #endif // GOOGLE_CUDA && GOOGLE_TENSORRT } + +version_struct get_linked_tensorrt_version(){ + // Return the version at the link time. + const auto &lv = tensorflow::tensorrt::convert::GetLinkedTensorRTVersion(); + version_struct s; + s.vmajor = lv[0]; + s.vminor = lv[1]; + s.vpatch = lv[2]; + return s; +} +version_struct get_loaded_tensorrt_version(){ + // Return the version from the loaded library. 
+ const auto &lv = tensorflow::tensorrt::convert::GetLoadedTensorRTVersion(); + version_struct s; + s.vmajor = lv[0]; + s.vminor = lv[1]; + s.vpatch = lv[2]; + return s; +} + %} -std::pair<string, string> calib_convert(string graph_def_string); +std::pair<string, string> calib_convert(string graph_def_string, bool is_dyn_op); std::pair<string, string> trt_convert(string graph_def_string, std::vector<string> output_names, size_t max_batch_size, size_t max_workspace_size_bytes, - int precision_mode, int minimum_segment_size); - + int precision_mode, int minimum_segment_size, + bool is_dyn_op, + int max_cached_engines, + std::vector<int> cached_engine_batches); +version_struct get_linked_tensorrt_version(); +version_struct get_loaded_tensorrt_version(); %unignoreall diff --git a/tensorflow/contrib/tpu/profiler/BUILD b/tensorflow/contrib/tpu/profiler/BUILD index 3b2d7adfff..38d1c3049e 100644 --- a/tensorflow/contrib/tpu/profiler/BUILD +++ b/tensorflow/contrib/tpu/profiler/BUILD @@ -49,11 +49,11 @@ tf_cc_binary( ":tpu_profiler_analysis_proto_cc", ":tpu_profiler_proto_cc", ":version", + "//tensorflow:grpc++", "//tensorflow/core:framework_internal", "//tensorflow/core:lib", "//tensorflow/core/distributed_runtime/rpc:grpc_util", "//tensorflow/core/platform/cloud:gcs_file_system", - "@grpc//:grpc++", ], ) diff --git a/tensorflow/contrib/verbs/BUILD b/tensorflow/contrib/verbs/BUILD index 1b45584dcb..19cb8983b6 100644 --- a/tensorflow/contrib/verbs/BUILD +++ b/tensorflow/contrib/verbs/BUILD @@ -53,12 +53,12 @@ cc_library( ":grpc_verbs_service_impl", ":rdma_mgr", ":verbs_service_proto_cc", + "//tensorflow:grpc++", "//tensorflow/core:lib_internal", "//tensorflow/core/distributed_runtime:session_mgr", "//tensorflow/core/distributed_runtime/rpc:async_service_interface", "//tensorflow/core/distributed_runtime/rpc:grpc_call", "//tensorflow/core/distributed_runtime/rpc:grpc_util", - "@grpc//:grpc++", ], alwayslink = 1, ) @@ -69,7 +69,7 @@ cc_library( hdrs = 
["grpc_verbs_service_impl.h"], deps = [ ":verbs_service_proto_cc", - "@grpc//:grpc++", + "//tensorflow:grpc++", ], ) diff --git a/tensorflow/core/api_def/BUILD b/tensorflow/core/api_def/BUILD index 19d6438809..06b797e32e 100644 --- a/tensorflow/core/api_def/BUILD +++ b/tensorflow/core/api_def/BUILD @@ -4,6 +4,7 @@ # The following targets can be used to access ApiDefs: # :base_api_def # :python_api_def +# :java_api_def package( default_visibility = ["//visibility:private"], @@ -29,6 +30,12 @@ filegroup( visibility = ["//tensorflow:internal"], ) +filegroup( + name = "java_api_def", + srcs = glob(["java_api/*"]), + visibility = ["//tensorflow:internal"], +) + cc_library( name = "excluded_ops_lib", srcs = ["excluded_ops.cc"], diff --git a/tensorflow/core/api_def/base_api/api_def_SampleDistortedBoundingBox.pbtxt b/tensorflow/core/api_def/base_api/api_def_SampleDistortedBoundingBox.pbtxt index 6f1121dd37..5ab5917bd3 100644 --- a/tensorflow/core/api_def/base_api/api_def_SampleDistortedBoundingBox.pbtxt +++ b/tensorflow/core/api_def/base_api/api_def_SampleDistortedBoundingBox.pbtxt @@ -68,7 +68,7 @@ END name: "area_range" description: <<END The cropped area of the image must contain a fraction of the -supplied image within in this range. +supplied image within this range. END } attr { diff --git a/tensorflow/core/api_def/base_api/api_def_SampleDistortedBoundingBoxV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_SampleDistortedBoundingBoxV2.pbtxt index 473aec50aa..663fc582d4 100644 --- a/tensorflow/core/api_def/base_api/api_def_SampleDistortedBoundingBoxV2.pbtxt +++ b/tensorflow/core/api_def/base_api/api_def_SampleDistortedBoundingBoxV2.pbtxt @@ -68,7 +68,7 @@ END name: "area_range" description: <<END The cropped area of the image must contain a fraction of the -supplied image within in this range. +supplied image within this range. 
END } attr { diff --git a/tensorflow/core/api_def/base_api/api_def_SlideDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_SlideDataset.pbtxt index 9fabe7863e..c80ee77f73 100644 --- a/tensorflow/core/api_def/base_api/api_def_SlideDataset.pbtxt +++ b/tensorflow/core/api_def/base_api/api_def_SlideDataset.pbtxt @@ -11,7 +11,7 @@ END name: "stride" description: <<END A scalar representing the steps moving the sliding window -forward in one iteration. It must be in `[1, window_size)`. +forward in one iteration. It must be positive. END } summary: "Creates a dataset that passes a sliding window over `input_dataset`." diff --git a/tensorflow/core/api_def/java_api/api_def_Assert.pbtxt b/tensorflow/core/api_def/java_api/api_def_Assert.pbtxt new file mode 100644 index 0000000000..b1f868897d --- /dev/null +++ b/tensorflow/core/api_def/java_api/api_def_Assert.pbtxt @@ -0,0 +1,4 @@ +op { + graph_op_name: "Assert" #TODO(karllessard) escape that reserved name + visibility: HIDDEN +} diff --git a/tensorflow/core/api_def/java_api/api_def_Const.pbtxt b/tensorflow/core/api_def/java_api/api_def_Const.pbtxt new file mode 100644 index 0000000000..2dbdca34e0 --- /dev/null +++ b/tensorflow/core/api_def/java_api/api_def_Const.pbtxt @@ -0,0 +1,4 @@ +op { + graph_op_name: "Const" #TODO(karllessard) escape that reserved name + visibility: HIDDEN +} diff --git a/tensorflow/core/api_def/java_api/api_def_Switch.pbtxt b/tensorflow/core/api_def/java_api/api_def_Switch.pbtxt new file mode 100644 index 0000000000..0d3362a91e --- /dev/null +++ b/tensorflow/core/api_def/java_api/api_def_Switch.pbtxt @@ -0,0 +1,4 @@ +op { + graph_op_name: "Switch" #TODO(karllessard) escape that reserved name + visibility: HIDDEN +} diff --git a/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc b/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc index 486f0be698..0b096a14a3 100644 --- a/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc +++ 
b/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc @@ -106,24 +106,24 @@ TEST(DirectSessionWithTrackingAllocTest, CostModelTest) { EXPECT_EQ(1, shape.dim(1).size()); if (node->name() == y->name()) { #ifdef INTEL_MKL - // if MKL is used, it goes through various additional - // graph rewrite pass. In TF, everytime a graph pass + // if MKL is used, it goes through various additional + // graph rewrite pass. In TF, everytime a graph pass // happens, "constant" nodes are allocated // and deallocated. Each allocation calls the // (FindChunkPtr of BFCAllocator), - // which increments the value of AllocationId. - // Thus AllocationId becomes more than 3 and 4 if - // MKL is used. Now they are 9 and 10 for MKL. - EXPECT_EQ(19, cm->AllocationId(node, 0)); + // which increments the value of AllocationId. + // Thus AllocationId becomes more than TF if MKL + // is used. Now IDs for MKL are 8 more than TF. + EXPECT_EQ(29, cm->AllocationId(node, 0)); #else EXPECT_EQ(21, cm->AllocationId(node, 0)); -#endif +#endif } else { #ifdef INTEL_MKL - EXPECT_EQ(20, cm->AllocationId(node, 0)); + EXPECT_EQ(30, cm->AllocationId(node, 0)); #else EXPECT_EQ(22, cm->AllocationId(node, 0)); -#endif +#endif } } EXPECT_LE(0, cm->MaxExecutionTime(node)); diff --git a/tensorflow/core/common_runtime/mkl_cpu_allocator.cc b/tensorflow/core/common_runtime/mkl_cpu_allocator.cc index 43a909466e..4ec85457ad 100644 --- a/tensorflow/core/common_runtime/mkl_cpu_allocator.cc +++ b/tensorflow/core/common_runtime/mkl_cpu_allocator.cc @@ -17,6 +17,13 @@ limitations under the License. 
#include "tensorflow/core/common_runtime/mkl_cpu_allocator.h" +#ifdef _WIN32 +// Declare function to avoid unresolved symbol in VS +i_malloc_t i_malloc; +i_calloc_t i_calloc; +i_realloc_t i_realloc; +i_free_t i_free; +#endif namespace tensorflow { constexpr const char* MklCPUAllocator::kMaxLimitStr; diff --git a/tensorflow/core/debug/BUILD b/tensorflow/core/debug/BUILD index 50f8a307d8..36e9b3455a 100644 --- a/tensorflow/core/debug/BUILD +++ b/tensorflow/core/debug/BUILD @@ -143,6 +143,7 @@ tf_cuda_library( ":debug_node_key", ":debug_service_proto_cc", ":debugger_event_metadata_proto_cc", + "//tensorflow:grpc++", "//tensorflow/core:core_cpu_internal", "//tensorflow/core:framework", "//tensorflow/core:graph", @@ -150,7 +151,6 @@ tf_cuda_library( "//tensorflow/core:lib_internal", "//tensorflow/core:proto_text", "//tensorflow/core:protos_all_cc", - "@grpc//:grpc++", ], alwayslink = 1, ) @@ -166,11 +166,11 @@ tf_cuda_library( ":debug_io_utils", ":debug_service_proto_cc", ":debugger_event_metadata_proto_cc", + "//tensorflow:grpc++", "//tensorflow/core:framework", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", "//tensorflow/core:protos_all_cc", - "@grpc//:grpc++", ], alwayslink = 1, ) diff --git a/tensorflow/core/distributed_runtime/BUILD b/tensorflow/core/distributed_runtime/BUILD index 8247651c24..75f8a19e9c 100644 --- a/tensorflow/core/distributed_runtime/BUILD +++ b/tensorflow/core/distributed_runtime/BUILD @@ -628,6 +628,7 @@ tf_cuda_cc_test( ":master", ":remote_device", ":worker_interface", + "//tensorflow:grpc++", "//tensorflow/core:core_cpu", "//tensorflow/core:core_cpu_internal", "//tensorflow/core:framework", @@ -649,7 +650,6 @@ tf_cuda_cc_test( "//tensorflow/core/kernels:dense_update_ops", "//tensorflow/core/kernels:identity_op", "//tensorflow/core/kernels:variable_ops", - "@grpc//:grpc++", ], ) @@ -667,6 +667,7 @@ tf_cuda_cc_test( ":master", ":remote_device", ":worker_interface", + "//tensorflow:grpc++", "//tensorflow/core:core_cpu", 
"//tensorflow/core:core_cpu_internal", "//tensorflow/core:framework", @@ -682,7 +683,6 @@ tf_cuda_cc_test( "//tensorflow/core/distributed_runtime/rpc:grpc_testlib", "//tensorflow/core/distributed_runtime/rpc:grpc_util", "//tensorflow/core/distributed_runtime/rpc:grpc_worker_cache", - "@grpc//:grpc++", ], ) diff --git a/tensorflow/core/distributed_runtime/eager/BUILD b/tensorflow/core/distributed_runtime/eager/BUILD index 22d0902af2..055e5dfced 100644 --- a/tensorflow/core/distributed_runtime/eager/BUILD +++ b/tensorflow/core/distributed_runtime/eager/BUILD @@ -48,6 +48,8 @@ cc_library( "eager_service_impl.h", ], deps = [ + "//tensorflow:grpc", + "//tensorflow:grpc++", "//tensorflow/c:c_api_internal", "//tensorflow/c:tf_status_helper", "//tensorflow/core:core_cpu_internal", @@ -67,8 +69,6 @@ cc_library( "//tensorflow/core/distributed_runtime:worker_env", "//tensorflow/core/distributed_runtime/eager:remote_tensor_handle", "//tensorflow/core/distributed_runtime/rpc:rpc_rendezvous_mgr", - "@grpc", - "@grpc//:grpc++", ], ) diff --git a/tensorflow/core/distributed_runtime/rpc/BUILD b/tensorflow/core/distributed_runtime/rpc/BUILD index 382ea336ca..d6c493c022 100644 --- a/tensorflow/core/distributed_runtime/rpc/BUILD +++ b/tensorflow/core/distributed_runtime/rpc/BUILD @@ -41,8 +41,8 @@ cc_library( srcs = ["grpc_util.cc"], hdrs = ["grpc_util.h"], deps = [ - "@grpc", - "@grpc//:grpc++", + "//tensorflow:grpc", + "//tensorflow:grpc++", "//tensorflow/core:lib", # Required to be able to overload TensorResponse parsing. 
"//tensorflow/core/distributed_runtime:tensor_coding", @@ -55,8 +55,8 @@ cc_library( hdrs = ["grpc_client_cq_tag.h"], deps = [ ":grpc_util", + "//tensorflow:grpc++", "//tensorflow/core:lib", - "@grpc//:grpc++", ], ) @@ -67,10 +67,10 @@ cc_library( deps = [ ":grpc_client_cq_tag", ":grpc_util", + "//tensorflow:grpc++", "//tensorflow/core:lib", "//tensorflow/core/distributed_runtime:call_options", "//tensorflow/core/distributed_runtime:tensor_coding", - "@grpc//:grpc++", ], ) @@ -83,6 +83,7 @@ cc_library( ":grpc_state", ":grpc_util", ":grpc_worker_service_impl", + "//tensorflow:grpc++", "//tensorflow/core:core_cpu_internal", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", @@ -90,7 +91,6 @@ cc_library( "//tensorflow/core/distributed_runtime:tensor_coding", "//tensorflow/core/distributed_runtime:worker_cache_logger", "//tensorflow/core/distributed_runtime:worker_interface", - "@grpc//:grpc++", ], ) @@ -100,10 +100,10 @@ cc_library( hdrs = ["grpc_channel.h"], deps = [ ":grpc_util", + "//tensorflow:grpc++", "//tensorflow/core:framework", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", - "@grpc//:grpc++", ], ) @@ -112,13 +112,13 @@ cc_library( srcs = ["grpc_tensor_coding.cc"], hdrs = ["grpc_tensor_coding.h"], deps = [ + "//tensorflow:grpc++", "//tensorflow/core:core_cpu_internal", "//tensorflow/core:framework", "//tensorflow/core:framework_internal", "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", "//tensorflow/core:worker_proto_cc", - "@grpc//:grpc++", ], ) @@ -127,9 +127,9 @@ cc_library( srcs = [], hdrs = ["grpc_call.h"], deps = [ + "//tensorflow:grpc++", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", - "@grpc//:grpc++", ], ) @@ -167,6 +167,7 @@ tf_cuda_library( ":grpc_tensor_coding", ":grpc_util", ":grpc_worker_service_impl", + "//tensorflow:grpc++", "//tensorflow/core:core_cpu_internal", "//tensorflow/core:framework", "//tensorflow/core:lib", @@ -180,7 +181,6 @@ tf_cuda_library( 
"//tensorflow/core/distributed_runtime:worker_cache", "//tensorflow/core/distributed_runtime:worker_env", "//tensorflow/core/distributed_runtime:worker_session", - "@grpc//:grpc++", ], ) @@ -190,9 +190,9 @@ cc_library( hdrs = ["grpc_worker_service_impl.h"], deps = [ ":grpc_util", + "//tensorflow:grpc++", "//tensorflow/core:worker_proto_cc", "//tensorflow/core/distributed_runtime:tensor_coding", - "@grpc//:grpc++", ], ) @@ -220,12 +220,12 @@ cc_library( ":async_service_interface", ":grpc_call", ":grpc_util", + "//tensorflow:grpc++", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", "//tensorflow/core:master_proto_cc", "//tensorflow/core:master_service_proto_cc", "//tensorflow/core/distributed_runtime:master", - "@grpc//:grpc++", ], alwayslink = 1, ) @@ -259,6 +259,8 @@ cc_library( ":grpc_worker_cache", ":grpc_worker_service", ":rpc_rendezvous_mgr", + "//tensorflow:grpc", + "//tensorflow:grpc++", "//tensorflow/core:core_cpu", "//tensorflow/core:core_cpu_internal", "//tensorflow/core:framework", @@ -277,8 +279,6 @@ cc_library( "//tensorflow/core/distributed_runtime:worker_cache_wrapper", "//tensorflow/core/distributed_runtime:worker_env", "//tensorflow/core/distributed_runtime/rpc/eager:grpc_eager_service_impl", - "@grpc", - "@grpc//:grpc++", ], alwayslink = 1, ) @@ -299,13 +299,13 @@ tf_cc_binary( ], deps = [ ":grpc_server_lib", + "//tensorflow:grpc++", "//tensorflow/core:core_cpu", "//tensorflow/core:framework_internal", "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", "//tensorflow/core/distributed_runtime:server_lib", "//tensorflow/core/kernels:data_flow", - "@grpc//:grpc++", ], ) @@ -317,6 +317,7 @@ tf_cc_binary( ], deps = [ ":grpc_server_lib", + "//tensorflow:grpc++", "//tensorflow/core:core_cpu", "//tensorflow/core:framework_internal", "//tensorflow/core:lib", @@ -330,7 +331,6 @@ tf_cc_binary( "//tensorflow/core/kernels:matmul_op", "//tensorflow/core/kernels:reduction_ops", "//tensorflow/core/kernels:variable_ops", - "@grpc//:grpc++", ], ) 
@@ -415,6 +415,7 @@ tf_cc_test( deps = [ ":grpc_tensor_coding", ":grpc_testlib", + "//tensorflow:grpc++", "//tensorflow/core:core_cpu", "//tensorflow/core:core_cpu_internal", "//tensorflow/core:framework", @@ -424,7 +425,6 @@ tf_cc_test( "//tensorflow/core:test_main", "//tensorflow/core:testlib", "//tensorflow/core:worker_proto_cc", - "@grpc//:grpc++", ], ) @@ -434,11 +434,11 @@ tf_cc_test( srcs = ["grpc_util_test.cc"], deps = [ ":grpc_util", + "//tensorflow:grpc", + "//tensorflow:grpc++", "//tensorflow/core:test", "//tensorflow/core:test_main", "//tensorflow/core:worker_proto_cc", - "@grpc", - "@grpc//:grpc++", ], ) diff --git a/tensorflow/core/distributed_runtime/rpc/eager/BUILD b/tensorflow/core/distributed_runtime/rpc/eager/BUILD index 8cec497361..d09a85c6a5 100644 --- a/tensorflow/core/distributed_runtime/rpc/eager/BUILD +++ b/tensorflow/core/distributed_runtime/rpc/eager/BUILD @@ -11,8 +11,8 @@ cc_library( srcs = ["grpc_eager_service.cc"], hdrs = ["grpc_eager_service.h"], deps = [ + "//tensorflow:grpc++", "//tensorflow/core:eager_service_proto_cc", - "@grpc//:grpc++", ], ) @@ -21,6 +21,7 @@ cc_library( srcs = ["grpc_eager_client.cc"], hdrs = ["grpc_eager_client.h"], deps = [ + "//tensorflow:grpc++", "//tensorflow/core:eager_service_proto_cc", "//tensorflow/core:lib", "//tensorflow/core/distributed_runtime/eager:eager_client", @@ -29,7 +30,6 @@ cc_library( "//tensorflow/core/distributed_runtime/rpc:grpc_state", "//tensorflow/core/distributed_runtime/rpc:grpc_util", "//tensorflow/core/distributed_runtime/rpc/eager:grpc_eager_service", - "@grpc//:grpc++", ], ) @@ -39,6 +39,7 @@ cc_library( hdrs = ["grpc_eager_service_impl.h"], deps = [ ":grpc_eager_service", + "//tensorflow:grpc++", "//tensorflow/core:framework", "//tensorflow/core:ptr_util", "//tensorflow/core/distributed_runtime/eager:eager_service_impl", @@ -47,6 +48,6 @@ cc_library( "//tensorflow/core/distributed_runtime/rpc:grpc_channel", "//tensorflow/core/distributed_runtime/rpc:grpc_util", 
"//tensorflow/core/distributed_runtime/rpc:grpc_worker_cache", - "@grpc//:grpc++", + "//tensorflow/core/distributed_runtime/rpc:grpc_worker_service", ], ) diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc index 7a9f3c5198..2c833d11a9 100644 --- a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc +++ b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc @@ -289,6 +289,12 @@ Status GrpcServer::Init( nullptr); } +Status GrpcServer::Init( + ServiceInitFunction service_func, + const RendezvousMgrCreationFunction& rendezvous_mgr_func) { + return Init(std::move(service_func), rendezvous_mgr_func, nullptr, nullptr); +} + Status GrpcServer::Init() { return Init(nullptr, nullptr, nullptr, nullptr); } Status GrpcServer::ParseChannelSpec(const WorkerCacheFactoryOptions& options, diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h index c674da9490..3366246afb 100644 --- a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h +++ b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h @@ -97,6 +97,9 @@ class GrpcServer : public ServerInterface { const RendezvousMgrCreationFunction& rendezvous_mgr_func, const CollectiveMgrCreationFunction& collective_mgr_func); + Status Init(ServiceInitFunction service_func, + const RendezvousMgrCreationFunction& rendezvous_mgr_func); + Status Init(); // A subclass can override this method to support secure credentials. diff --git a/tensorflow/core/graph/mkl_layout_pass_test.cc b/tensorflow/core/graph/mkl_layout_pass_test.cc index 7645b4a7f0..fc474c0dc8 100644 --- a/tensorflow/core/graph/mkl_layout_pass_test.cc +++ b/tensorflow/core/graph/mkl_layout_pass_test.cc @@ -1901,6 +1901,11 @@ BENCHMARK(BM_MklLayoutRewritePass)->Arg(1000)->Arg(10000); #else // INTEL_MKL_ML +// NOTE: Unit tests in this file rely on a topological sorted graph for +// printing. 
But since sibling nodes of a node in the topologically sorted graph +// can be printed in different orders, tests may fail if the order in which +// sibling nodes are visited is changed. + namespace { const char kCPUDevice[] = "/job:a/replica:0/task:0/device:CPU:0"; @@ -2572,9 +2577,9 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Concat_Input_Mkl) { "A(Input);B(Input);C(Input);D(Input);DMT/_0(Const);DMT/_1(Const);" "DMT/_2(Const);DMT/_3(Const);DMT/_4(Const);E(_MklConv2D);" "F(_MklConv2D);G(Const);H(_MklConcat);I(Zeta)|A->E;A->I;" - "A:control->DMT/_2:control;A:control->DMT/_3:control;" - "B->E:1;C->F;C:control->DMT/_0:control;C:control->DMT/_1:control;" - "D->F:1;DMT/_0->F:2;DMT/_1->F:3;DMT/_2->E:2;DMT/_3->E:3;" + "A:control->DMT/_0:control;A:control->DMT/_1:control;" + "B->E:1;C->F;C:control->DMT/_2:control;C:control->DMT/_3:control;" + "D->F:1;DMT/_0->E:2;DMT/_1->E:3;DMT/_2->F:2;DMT/_3->F:3;" "DMT/_4->H:3;E->H:1;E:2->H:4;F->H:2;F:2->H:5;G->H;" "G:control->DMT/_4:control;H->I:1"); } @@ -2681,9 +2686,9 @@ TEST_F(MklLayoutPassTest, NodeRewrite_ConcatV2_Input_Mkl) { "A(Input);B(Input);C(Input);D(Input);DMT/_0(Const);DMT/_1(Const);" "DMT/_2(Const);DMT/_3(Const);DMT/_4(Const);E(_MklConv2D);" "F(_MklConv2D);G(Const);H(_MklConcatV2);I(Zeta)|A->E;A->I;" - "A:control->DMT/_2:control;A:control->DMT/_3:control;B->E:1;C->F;" - "C:control->DMT/_0:control;C:control->DMT/_1:control;" - "D->F:1;DMT/_0->F:2;DMT/_1->F:3;DMT/_2->E:2;DMT/_3->E:3;" + "A:control->DMT/_0:control;A:control->DMT/_1:control;B->E:1;C->F;" + "C:control->DMT/_2:control;C:control->DMT/_3:control;" + "D->F:1;DMT/_0->E:2;DMT/_1->E:3;DMT/_2->F:2;DMT/_3->F:3;" "DMT/_4->H:5;E->H;E:2->H:3;E:control->DMT/_4:control;F->H:1;" "F:2->H:4;G->H:2;H->I:1"); } @@ -3060,8 +3065,8 @@ TEST_F(MklLayoutPassTest, LRN_Negative3) { "C:control->DMT/_1:control;C:control->DMT/_2:control;" "C:control->DMT/_3:control;C:control->DMT/_4:control;" "C:control->DMT/_5:control;C:control->DMT/_6:control;" - 
"D->E:1;D->F:2;DMT/_0->B:1;DMT/_1->F:3;DMT/_2->F:7;DMT/_3->F:4;" - "DMT/_4->F:6;DMT/_5->E:4;DMT/_6->E:5;E->G;F->G:1"); + "D->E:1;D->F:2;DMT/_0->B:1;DMT/_1->E:4;DMT/_2->E:5;DMT/_3->F:3;" + "DMT/_4->F:7;DMT/_5->F:4;DMT/_6->F:6;E->G;F->G:1"); } /* Test MaxPool->MaxPoolGrad replacement by workspace+rewrite nodes. */ diff --git a/tensorflow/core/kernels/data/slide_dataset_op.cc b/tensorflow/core/kernels/data/slide_dataset_op.cc index 48776cbf61..07cc91f9d5 100644 --- a/tensorflow/core/kernels/data/slide_dataset_op.cc +++ b/tensorflow/core/kernels/data/slide_dataset_op.cc @@ -15,6 +15,7 @@ limitations under the License. #include "tensorflow/core/framework/partial_tensor_shape.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/kernels/data/dataset.h" +#include "tensorflow/core/platform/logging.h" #include "tensorflow/core/util/batch_util.h" namespace tensorflow { @@ -32,16 +33,24 @@ class SlideDatasetOp : public UnaryDatasetOpKernel { void MakeDataset(OpKernelContext* ctx, DatasetBase* input, DatasetBase** output) override { int64 window_size = 0; - int64 stride = 1; + int64 stride = 0; OP_REQUIRES_OK( ctx, ParseScalarArgument<int64>(ctx, "window_size", &window_size)); OP_REQUIRES_OK(ctx, ParseScalarArgument<int64>(ctx, "stride", &stride)); OP_REQUIRES( ctx, window_size > 0, errors::InvalidArgument("Window size must be greater than zero.")); - OP_REQUIRES( - ctx, stride > 0 && stride < window_size, - errors::InvalidArgument("Stride must be in [1, window_size).")); + OP_REQUIRES(ctx, stride > 0, + errors::InvalidArgument("Stride must be greater than zero.")); + if (stride == window_size) { + LOG(WARNING) << "stride: " << stride + << " is equal to window_size: " << window_size + << ", to use `batch` instead."; + } else if (stride > window_size) { + LOG(WARNING) << "stride: " << stride + << " is greater than window_size: " << window_size + << ", you will lose some data."; + } *output = new Dataset(ctx, window_size, stride, input); } @@ -124,12 +133,15 
@@ class SlideDatasetOp : public UnaryDatasetOpKernel { return Status::OK(); } batch_elements.reserve(window_size); - const bool first_call = cache_.empty(); - if (first_call) { - cache_.reserve(window_size); - } else { - // Reuse cache in the previous iteration. - cache_.swap(batch_elements); + // Use cache if stride < window_size. + if (stride < window_size) { + const bool first_call = cache_.empty(); + if (first_call) { + cache_.reserve(window_size); + } else { + // Reuse cache in the previous iteration. + cache_.swap(batch_elements); + } } // Fill up with new elements. *end_of_sequence = false; @@ -149,9 +161,22 @@ class SlideDatasetOp : public UnaryDatasetOpKernel { DCHECK(*end_of_sequence); return Status::OK(); } - // Cache the data used for the next iteration. - for (size_t i = stride; i < window_size; ++i) { - cache_.emplace_back(batch_elements[i]); + + if (stride < window_size) { + // Cache the data used for the next iteration. + for (size_t i = stride; i < window_size; ++i) { + cache_.emplace_back(batch_elements[i]); + } + } else if (stride > window_size) { + // Drop the data before the next iteration. + std::vector<Tensor> batch_element_tuple; + for (size_t i = window_size; i < stride && !*end_of_sequence; ++i) { + TF_RETURN_IF_ERROR(input_impl_->GetNext(ctx, &batch_element_tuple, + end_of_sequence)); + if (*end_of_sequence) { + input_impl_.reset(); + } + } } } diff --git a/tensorflow/core/kernels/mkl_conv_ops.cc b/tensorflow/core/kernels/mkl_conv_ops.cc index f2b14f1278..1d0edb10b3 100644 --- a/tensorflow/core/kernels/mkl_conv_ops.cc +++ b/tensorflow/core/kernels/mkl_conv_ops.cc @@ -59,7 +59,8 @@ namespace tensorflow { #ifndef INTEL_MKL_ML -struct ConvFwdDimensions { +// This structure aggregates multiple inputs to Conv2DFwd* methods. 
+struct MklConvFwdParams { memory::dims src_dims; memory::dims filter_dims; memory::dims bias_dims; @@ -69,48 +70,56 @@ struct ConvFwdDimensions { memory::dims padding_left; memory::dims padding_right; - ConvFwdDimensions(memory::dims src_dims, - memory::dims filter_dims, memory::dims bias_dims, - memory::dims dst_dims, memory::dims strides, - memory::dims dilations, memory::dims padding_left, - memory::dims padding_right) : - src_dims(src_dims), filter_dims(filter_dims), - bias_dims(bias_dims), dst_dims(dst_dims), - strides(strides), dilations(dilations), - padding_left(padding_left), padding_right(padding_right) { - } + MklConvFwdParams(memory::dims src_dims, memory::dims filter_dims, + memory::dims bias_dims, memory::dims dst_dims, + memory::dims strides, memory::dims dilations, + memory::dims padding_left, memory::dims padding_right) + : src_dims(src_dims), + filter_dims(filter_dims), + bias_dims(bias_dims), + dst_dims(dst_dims), + strides(strides), + dilations(dilations), + padding_left(padding_left), + padding_right(padding_right) {} }; template <typename T> -class Conv2DFwd : public DnnOp { +class MklConv2DFwdPrimitive : public MklPrimitive { public: - explicit Conv2DFwd(const ConvFwdDimensions& convFwdDims) { - fwd_stream_.reset(new stream(stream::kind::eager)); + explicit MklConv2DFwdPrimitive(const MklConvFwdParams& convFwdDims) + : cpu_engine_(engine::cpu, 0) { + context_.fwd_stream.reset(new stream(stream::kind::eager)); // create conv primitive - if (conv_fwd_ == nullptr) { + if (context_.conv_fwd == nullptr) { Setup(convFwdDims); } } - ~Conv2DFwd() {} + ~MklConv2DFwdPrimitive() {} // Convolution forward execute with bias // src_data: input data buffer of src // filter_data: input data buffer of filter (weights) // bias_data: input data buffer of bias // dst_data: output data buffer of dst - void Execute(T* src_data, T* filter_data, T* bias_data, T* dst_data) { - src_mem_->set_data_handle(static_cast<void*>(src_data)); - 
filter_mem_->set_data_handle(static_cast<void*>(filter_data)); - bias_mem_->set_data_handle(static_cast<void*>(bias_data)); - dst_mem_->set_data_handle(static_cast<void*>(dst_data)); - fwd_stream_->submit(fwd_primitives_); + void Execute(const T* src_data, const T* filter_data, const T* bias_data, + const T* dst_data) { + context_.src_mem->set_data_handle( + static_cast<void*>(const_cast<T*>(src_data))); + context_.filter_mem->set_data_handle( + static_cast<void*>(const_cast<T*>(filter_data))); + context_.bias_mem->set_data_handle( + static_cast<void*>(const_cast<T*>(bias_data))); + context_.dst_mem->set_data_handle( + static_cast<void*>(const_cast<T*>(dst_data))); + context_.fwd_stream->submit(context_.fwd_primitives); // after exec, set data handle back - src_mem_->set_data_handle(DummyData); - filter_mem_->set_data_handle(DummyData); - bias_mem_->set_data_handle(DummyData); - dst_mem_->set_data_handle(DummyData); + context_.src_mem->set_data_handle(DummyData); + context_.filter_mem->set_data_handle(DummyData); + context_.bias_mem->set_data_handle(DummyData); + context_.dst_mem->set_data_handle(DummyData); return; } @@ -119,139 +128,177 @@ class Conv2DFwd : public DnnOp { // src_data: input data buffer of src // filter_data: input data buffer of filter (weights) // dst_data: output data buffer of dst - void Execute(T* src_data, T* filter_data, T* dst_data) { - src_mem_->set_data_handle(static_cast<void*>(src_data)); - filter_mem_->set_data_handle(static_cast<void*>(filter_data)); - dst_mem_->set_data_handle(static_cast<void*>(dst_data)); - fwd_stream_->submit(fwd_primitives_); - - // after exec, set data handle back - src_mem_->set_data_handle(DummyData); - filter_mem_->set_data_handle(DummyData); - dst_mem_->set_data_handle(DummyData); - - return; + void Execute(const T* src_data, const T* filter_data, const T* dst_data) { + context_.src_mem->set_data_handle( + static_cast<void*>(const_cast<T*>(src_data))); + context_.filter_mem->set_data_handle( + 
static_cast<void*>(const_cast<T*>(filter_data))); + context_.dst_mem->set_data_handle( + static_cast<void*>(const_cast<T*>(dst_data))); + context_.fwd_stream->submit(context_.fwd_primitives); + + // after execution, set data handle back + context_.src_mem->set_data_handle(DummyData); + context_.filter_mem->set_data_handle(DummyData); + context_.dst_mem->set_data_handle(DummyData); } - // expected memory format for this primitive instance - memory::format src_fmt_; - memory::format filter_fmt_; + memory::format GetSrcMemoryFormat() const { return context_.src_fmt; } - // convolution primitive - std::shared_ptr<mkldnn::convolution_forward::primitive_desc> fwd_pd_; - std::shared_ptr<mkldnn::primitive> conv_fwd_; + memory::format GetFilterMemoryFormat() const { return context_.filter_fmt; } + + std::shared_ptr<mkldnn::convolution_forward::primitive_desc> + GetPrimitiveDesc() const { + return context_.fwd_pd; + } private: - void Setup(const ConvFwdDimensions& convFwdDims) { + // Primitive reuse context for Conv2D Fwd op + struct ConvFwdContext { + // expected memory format for this primitive instance + memory::format src_fmt; + memory::format filter_fmt; + + // MKLDNN memory + std::shared_ptr<mkldnn::memory> src_mem; + std::shared_ptr<mkldnn::memory> filter_mem; + std::shared_ptr<mkldnn::memory> bias_mem; + std::shared_ptr<mkldnn::memory> dst_mem; + + // desc & prmitive desc + std::shared_ptr<mkldnn::convolution_forward::desc> fwd_desc; + + // memory desc + std::shared_ptr<mkldnn::memory::desc> src_md; + std::shared_ptr<mkldnn::memory::desc> filter_md; + std::shared_ptr<mkldnn::memory::desc> bias_md; + std::shared_ptr<mkldnn::memory::desc> dst_md; + + // convolution primitive + std::shared_ptr<mkldnn::convolution_forward::primitive_desc> fwd_pd; + std::shared_ptr<mkldnn::primitive> conv_fwd; + + std::shared_ptr<mkldnn::stream> fwd_stream; + std::vector<mkldnn::primitive> fwd_primitives; + + ConvFwdContext() + : src_fmt(memory::format::any), + 
filter_fmt(memory::format::any), + src_mem(nullptr), + filter_mem(nullptr), + bias_mem(nullptr), + dst_mem(nullptr), + fwd_desc(nullptr), + src_md(nullptr), + filter_md(nullptr), + bias_md(nullptr), + fwd_pd(nullptr), + conv_fwd(nullptr), + fwd_stream(nullptr) {} + }; + + void Setup(const MklConvFwdParams& convFwdDims) { // create memory descriptors for convolution data w/ no specified format - src_md_.reset(new memory::desc({convFwdDims.src_dims}, - MklDnnType<T>(), memory::format::any)); + context_.src_md.reset(new memory::desc( + {convFwdDims.src_dims}, MklDnnType<T>(), memory::format::any)); - filter_md_.reset(new memory::desc({convFwdDims.filter_dims}, - MklDnnType<T>(), memory::format::any)); + context_.filter_md.reset(new memory::desc( + {convFwdDims.filter_dims}, MklDnnType<T>(), memory::format::any)); - dst_md_.reset(new memory::desc({convFwdDims.dst_dims}, - MklDnnType<T>(), memory::format::any)); + context_.dst_md.reset(new memory::desc( + {convFwdDims.dst_dims}, MklDnnType<T>(), memory::format::any)); if (!convFwdDims.bias_dims.empty()) - bias_md_.reset(new memory::desc({convFwdDims.bias_dims}, - MklDnnType<T>(), memory::format::any)); + context_.bias_md.reset(new memory::desc( + {convFwdDims.bias_dims}, MklDnnType<T>(), memory::format::any)); // create a convolution if (!convFwdDims.bias_dims.empty()) { - fwd_desc_.reset(new convolution_forward::desc(prop_kind::forward, - convolution_direct, *src_md_, *filter_md_, *bias_md_, *dst_md_, + context_.fwd_desc.reset(new convolution_forward::desc( + prop_kind::forward, convolution_direct, *context_.src_md, + *context_.filter_md, *context_.bias_md, *context_.dst_md, convFwdDims.strides, convFwdDims.dilations, convFwdDims.padding_left, convFwdDims.padding_right, padding_kind::zero)); } else { - fwd_desc_.reset(new convolution_forward::desc(prop_kind::forward, - convolution_direct, *src_md_, *filter_md_, *dst_md_, - convFwdDims.strides, convFwdDims.dilations, convFwdDims.padding_left, + 
context_.fwd_desc.reset(new convolution_forward::desc( + prop_kind::forward, convolution_direct, *context_.src_md, + *context_.filter_md, *context_.dst_md, convFwdDims.strides, + convFwdDims.dilations, convFwdDims.padding_left, convFwdDims.padding_right, padding_kind::zero)); } - fwd_pd_.reset(new convolution_forward::primitive_desc( - *fwd_desc_, cpu_engine_)); + context_.fwd_pd.reset(new convolution_forward::primitive_desc( + *context_.fwd_desc, cpu_engine_)); // store the expected memory format - src_fmt_ = static_cast<mkldnn::memory::format>( - fwd_pd_.get()->src_primitive_desc().desc().data.format); + context_.src_fmt = static_cast<mkldnn::memory::format>( + context_.fwd_pd.get()->src_primitive_desc().desc().data.format); - filter_fmt_ = static_cast<mkldnn::memory::format>( - fwd_pd_.get()->weights_primitive_desc().desc().data.format); + context_.filter_fmt = static_cast<mkldnn::memory::format>( + context_.fwd_pd.get()->weights_primitive_desc().desc().data.format); // create memory primitive based on dummy data - src_mem_.reset(new memory(fwd_pd_.get()->src_primitive_desc(), DummyData)); - filter_mem_.reset(new memory(fwd_pd_.get()->weights_primitive_desc(), - DummyData)); - dst_mem_.reset(new memory(fwd_pd_.get()->dst_primitive_desc(), DummyData)); + context_.src_mem.reset( + new memory(context_.fwd_pd.get()->src_primitive_desc(), DummyData)); + context_.filter_mem.reset( + new memory(context_.fwd_pd.get()->weights_primitive_desc(), DummyData)); + context_.dst_mem.reset( + new memory(context_.fwd_pd.get()->dst_primitive_desc(), DummyData)); // create convolution primitive and add it to net if (!convFwdDims.bias_dims.empty()) { - bias_mem_.reset(new memory({{{convFwdDims.bias_dims}, MklDnnType<T>(), - memory::format::x}, cpu_engine_}, DummyData)); - conv_fwd_.reset(new convolution_forward(*fwd_pd_, *src_mem_, - *filter_mem_, *bias_mem_, *dst_mem_)); + context_.bias_mem.reset(new memory( + {{{convFwdDims.bias_dims}, MklDnnType<T>(), memory::format::x}, + 
cpu_engine_}, + DummyData)); + context_.conv_fwd.reset(new convolution_forward( + *context_.fwd_pd, *context_.src_mem, *context_.filter_mem, + *context_.bias_mem, *context_.dst_mem)); } else { - conv_fwd_.reset(new convolution_forward(*fwd_pd_, *src_mem_, - *filter_mem_, *dst_mem_)); + context_.conv_fwd.reset( + new convolution_forward(*context_.fwd_pd, *context_.src_mem, + *context_.filter_mem, *context_.dst_mem)); } - fwd_primitives_.push_back(*conv_fwd_); + context_.fwd_primitives.push_back(*context_.conv_fwd); return; } - // MKLDNN memory - std::shared_ptr<mkldnn::memory> src_mem_; - std::shared_ptr<mkldnn::memory> filter_mem_; - std::shared_ptr<mkldnn::memory> bias_mem_; - std::shared_ptr<mkldnn::memory> dst_mem_; - - std::shared_ptr<mkldnn::stream> fwd_stream_; - std::vector<mkldnn::primitive> fwd_primitives_; - - // desc & prmitive desc - std::shared_ptr<mkldnn::convolution_forward::desc> fwd_desc_; - - // memory desc - std::shared_ptr<mkldnn::memory::desc> src_md_; - std::shared_ptr<mkldnn::memory::desc> filter_md_; - std::shared_ptr<mkldnn::memory::desc> bias_md_; - std::shared_ptr<mkldnn::memory::desc> dst_md_; - - engine cpu_engine_ = engine(engine::cpu, 0); + struct ConvFwdContext context_; + engine cpu_engine_; }; template <typename T> -class Conv2DFwdFactory : public DnnOpFactory<T> { +class MklConv2DFwdPrimitiveFactory : public MklPrimitiveFactory<T> { public: - static Conv2DFwd<T>* Get(const ConvFwdDimensions& convFwdDims) { - Conv2DFwd<T>* conv2d_fwd = nullptr; - - // try to find a suitable one in pool - conv2d_fwd = dynamic_cast<Conv2DFwd<T>*> ( - Conv2DFwdFactory<T>::GetInstance().GetConv2DFwd(convFwdDims)); - - if (conv2d_fwd == nullptr) { - conv2d_fwd = new Conv2DFwd<T>(convFwdDims); - Conv2DFwdFactory<T>::GetInstance().SetConv2DFwd( - convFwdDims, conv2d_fwd); - } - return conv2d_fwd; + static MklConv2DFwdPrimitive<T>* Get(const MklConvFwdParams& convFwdDims) { + MklConv2DFwdPrimitive<T>* conv2d_fwd = nullptr; + + // try to find a suitable one 
in pool + conv2d_fwd = dynamic_cast<MklConv2DFwdPrimitive<T>*>( + MklConv2DFwdPrimitiveFactory<T>::GetInstance().GetConv2DFwd( + convFwdDims)); + + if (conv2d_fwd == nullptr) { + conv2d_fwd = new MklConv2DFwdPrimitive<T>(convFwdDims); + MklConv2DFwdPrimitiveFactory<T>::GetInstance().SetConv2DFwd(convFwdDims, + conv2d_fwd); + } + return conv2d_fwd; } private: - Conv2DFwdFactory() {} - ~Conv2DFwdFactory() {} + MklConv2DFwdPrimitiveFactory() {} + ~MklConv2DFwdPrimitiveFactory() {} static const int kDilationH = 0, kDilationW = 1; - static Conv2DFwdFactory& GetInstance() { - static Conv2DFwdFactory instance_; + static MklConv2DFwdPrimitiveFactory& GetInstance() { + static MklConv2DFwdPrimitiveFactory instance_; return instance_; } - static std::string CreateKey(const ConvFwdDimensions& convFwdDims) { + static std::string CreateKey(const MklConvFwdParams& convFwdDims) { std::string prefix = "conv2d_fwd_"; FactoryKeyCreator key_creator; key_creator.AddAsKey(prefix); @@ -266,12 +313,12 @@ class Conv2DFwdFactory : public DnnOpFactory<T> { return key_creator.GetKey(); } - DnnOp* GetConv2DFwd(const ConvFwdDimensions& convFwdDims) { + MklPrimitive* GetConv2DFwd(const MklConvFwdParams& convFwdDims) { std::string key = CreateKey(convFwdDims); return this->GetOp(key); } - void SetConv2DFwd(const ConvFwdDimensions& convFwdDims, DnnOp *op) { + void SetConv2DFwd(const MklConvFwdParams& convFwdDims, MklPrimitive* op) { std::string key = CreateKey(convFwdDims); this->SetOp(key, op); } @@ -762,7 +809,6 @@ class MklConv2DOp : public OpKernel { MklDnnData<T> src(&cpu_engine); MklDnnData<T> filter(&cpu_engine); - MklDnnData<T> dst(&cpu_engine); // output memory::dims src_dims, filter_dims, padding_left, padding_right, dilations, strides; @@ -812,7 +858,6 @@ class MklConv2DOp : public OpKernel { auto src_md = src_mkl_shape.IsMklTensor() ? 
src_mkl_shape.GetMklLayout() : memory::desc(src_dims, MklDnnType<T>(), tf_fmt); - src.SetUsrMem(src_md, &src_tensor); // Although filter shape (filter_dims) required is in MKL-DNN order, // the layout is Tensorflow's layout (HWIO). @@ -820,29 +865,30 @@ class MklConv2DOp : public OpKernel { ? filter_mkl_shape.GetMklLayout() : memory::desc(filter_dims, MklDnnType<T>(), memory::format::hwio); - filter.SetUsrMem(filter_md, &filter_tensor); // MKLDNN dilation starts from 0. dilations[kDilationH] -= 1; dilations[kDilationW] -= 1; // get a conv2d fwd from primitive pool - Conv2DFwd<T> *conv2d_fwd = nullptr; + MklConv2DFwdPrimitive<T>* conv2d_fwd = nullptr; if (biasEnabled) { memory::dims bias_dims = {}; conv_utl.GetBiasSizeInMklOrder(kInputIndex_Bias, &bias_dims); - ConvFwdDimensions convFwdDims(src_dims, filter_dims, bias_dims, - dst_dims_mkl_order, strides, dilations, padding_left, padding_right); - conv2d_fwd = Conv2DFwdFactory<T>::Get(convFwdDims); + MklConvFwdParams convFwdDims(src_dims, filter_dims, bias_dims, + dst_dims_mkl_order, strides, dilations, + padding_left, padding_right); + conv2d_fwd = MklConv2DFwdPrimitiveFactory<T>::Get(convFwdDims); } else { - ConvFwdDimensions convFwdDims(src_dims, filter_dims, NONE_DIMS, - dst_dims_mkl_order, strides, dilations, padding_left, padding_right); - conv2d_fwd = Conv2DFwdFactory<T>::Get(convFwdDims); + MklConvFwdParams convFwdDims(src_dims, filter_dims, NONE_DIMS, + dst_dims_mkl_order, strides, dilations, + padding_left, padding_right); + conv2d_fwd = MklConv2DFwdPrimitiveFactory<T>::Get(convFwdDims); } // allocate output tensors output_tensor and filter_out_tensor - std::shared_ptr<mkldnn::convolution_forward::primitive_desc> - conv_fwd_pd = conv2d_fwd->fwd_pd_; + std::shared_ptr<mkldnn::convolution_forward::primitive_desc> conv_fwd_pd = + conv2d_fwd->GetPrimitiveDesc(); AllocateOutputTensor(context, *conv_fwd_pd, dst_dims_mkl_order, tf_fmt, &dst_tensor); Tensor* filter_out_tensor = nullptr; @@ -854,20 +900,28 @@ class 
MklConv2DOp : public OpKernel { // check whether src/filter need reorder std::vector<primitive> net; - if (src_md.data.format != conv2d_fwd->src_fmt_) - src.CheckReorderToOpMem( - conv_fwd_pd.get()->src_primitive_desc(), &net); - - if (filter_md.data.format != conv2d_fwd->filter_fmt_) - filter.CheckReorderToOpMem( - conv_fwd_pd.get()->weights_primitive_desc(), - filter.GetTensorBuffer(filter_out_tensor), &net); + T* src_data = nullptr; + if (src_md.data.format != conv2d_fwd->GetSrcMemoryFormat()) { + src.SetUsrMem(src_md, &src_tensor); + src.CheckReorderToOpMem(conv_fwd_pd.get()->src_primitive_desc(), &net); + src_data = static_cast<T*>(src.GetOpMem().get_data_handle()); + } else { + src_data = static_cast<T*>(const_cast<T*>(src_tensor.flat<T>().data())); + } + T* filter_data = nullptr; + if (filter_md.data.format != conv2d_fwd->GetFilterMemoryFormat()) { + filter.SetUsrMem(filter_md, &filter_tensor); + filter.CheckReorderToOpMem(conv_fwd_pd.get()->weights_primitive_desc(), + filter.GetTensorBuffer(filter_out_tensor), + &net); + filter_data = static_cast<T*>(filter.GetOpMem().get_data_handle()); + } else { + filter_data = + static_cast<T*>(const_cast<T*>(filter_tensor.flat<T>().data())); + } + stream(stream::kind::eager).submit(net).wait(); - T* src_data = static_cast<T*>( - src.GetOpMem().get_data_handle()); - T* filter_data = static_cast<T*>( - filter.GetOpMem().get_data_handle()); // execute convolution if (biasEnabled) { diff --git a/tensorflow/core/kernels/reduction_gpu_kernels.cu.h b/tensorflow/core/kernels/reduction_gpu_kernels.cu.h index 6655084045..9af4cc23b6 100644 --- a/tensorflow/core/kernels/reduction_gpu_kernels.cu.h +++ b/tensorflow/core/kernels/reduction_gpu_kernels.cu.h @@ -295,7 +295,7 @@ __global__ void ColumnReduceMax16ColumnsKernel( // 1D array necessary due to bug in CUDA 9 compiler. // TODO(nluehr) revert to 2D array when compiler is ready. 
- // This is the mimic the following, but without any constructors: + // This is to mimic the following, but without any constructors: // __shared__ storage_type<value_type> partial_sums[32 * 33]; __shared__ __align__( alignof(value_type)) char partial_sums_raw[32 * 33 * sizeof(value_type)]; diff --git a/tensorflow/core/kernels/segment_reduction_ops.h b/tensorflow/core/kernels/segment_reduction_ops.h index d65692a552..d28e35157b 100644 --- a/tensorflow/core/kernels/segment_reduction_ops.h +++ b/tensorflow/core/kernels/segment_reduction_ops.h @@ -16,6 +16,12 @@ limitations under the License. #ifndef TENSORFLOW_CORE_KERNELS_SEGMENT_REDUCTION_OPS_H_ #define TENSORFLOW_CORE_KERNELS_SEGMENT_REDUCTION_OPS_H_ +// This file requires the following include because it uses CudaAtomicMax: +// #include "tensorflow/core/util/cuda_kernel_helper.h" + +// Unfortunately we can't add the #include, since it breaks compilation for +// non-GPU targets. This only breaks in clang, because it's more strict for +// template code and CudaAtomicMax is used in template context. // This file requires the following include because it uses CudaAtomicMax: // #include "tensorflow/core/util/cuda_kernel_helper.h" diff --git a/tensorflow/core/ops/math_ops.cc b/tensorflow/core/ops/math_ops.cc index 262526846d..c229bd5a41 100644 --- a/tensorflow/core/ops/math_ops.cc +++ b/tensorflow/core/ops/math_ops.cc @@ -614,7 +614,13 @@ REGISTER_OP("ApproximateEqual") .SetIsCommutative() .Attr("T: numbertype") .Attr("tolerance: float = 0.00001") - .SetShapeFn(shape_inference::UnchangedShape); + .SetShapeFn([](InferenceContext* c) { + // The inputs 'x' and 'y' must have the same shape. 
+ ShapeHandle data_x = c->input(0); + ShapeHandle data_y = c->input(1); + TF_RETURN_IF_ERROR(c->Merge(data_x, data_y, &data_x)); + return shape_inference::UnchangedShape(c); + }); // -------------------------------------------------------------------------- diff --git a/tensorflow/core/platform/cloud/oauth_client.cc b/tensorflow/core/platform/cloud/oauth_client.cc index e64653a67a..ee6ba7b041 100644 --- a/tensorflow/core/platform/cloud/oauth_client.cc +++ b/tensorflow/core/platform/cloud/oauth_client.cc @@ -137,8 +137,8 @@ Status EncodeJwtClaim(StringPiece client_email, StringPiece scope, const auto expiration_timestamp_sec = request_timestamp_sec + kRequestedTokenLifetimeSec; - root["iat"] = request_timestamp_sec; - root["exp"] = expiration_timestamp_sec; + root["iat"] = Json::Value::UInt64(request_timestamp_sec); + root["exp"] = Json::Value::UInt64(expiration_timestamp_sec); // Step 2: represent the JSON as a string. string claim = root.toStyledString(); diff --git a/tensorflow/core/platform/default/build_config.bzl b/tensorflow/core/platform/default/build_config.bzl index a319ccbdbe..66ccd81e41 100644 --- a/tensorflow/core/platform/default/build_config.bzl +++ b/tensorflow/core/platform/default/build_config.bzl @@ -202,7 +202,10 @@ def cc_proto_library( ) if use_grpc_plugin: - cc_libs += ["//external:grpc_lib"] + cc_libs += select({ + "//tensorflow:linux_s390x": ["//external:grpc_lib_unsecure"], + "//conditions:default": ["//external:grpc_lib"], + }) if default_header: header_only_name = name diff --git a/tensorflow/core/platform/windows/port.cc b/tensorflow/core/platform/windows/port.cc index 174f41a993..f2aaf13bec 100644 --- a/tensorflow/core/platform/windows/port.cc +++ b/tensorflow/core/platform/windows/port.cc @@ -171,5 +171,10 @@ int64 AvailableRam() { return INT64_MAX; } +int NumHyperthreadsPerCore() { + static const int ht_per_core = tensorflow::port::CPUIDNumSMT(); + return (ht_per_core > 0) ? 
ht_per_core : 1; +} + } // namespace port } // namespace tensorflow diff --git a/tensorflow/core/profiler/internal/tfprof_timeline.cc b/tensorflow/core/profiler/internal/tfprof_timeline.cc index b0dd8ce5e0..979b437914 100644 --- a/tensorflow/core/profiler/internal/tfprof_timeline.cc +++ b/tensorflow/core/profiler/internal/tfprof_timeline.cc @@ -47,9 +47,9 @@ Json::Value ChromeTraceFormatter::CreateEvent(const string& ph, event["ph"] = Json::Value(ph); event["cat"] = Json::Value(category); event["name"] = Json::Value(name); - event["pid"] = Json::Value(pid); - event["tid"] = Json::Value(tid); - event["ts"] = Json::Value(ts); + event["pid"] = Json::Int64(pid); + event["tid"] = Json::Int64(tid); + event["ts"] = Json::Int64(ts); return event; } @@ -57,7 +57,7 @@ void ChromeTraceFormatter::EmitPID(const string& name, int64 pid) { Json::Value event(Json::objectValue); event["name"] = Json::Value("process_name"); event["ph"] = Json::Value("M"); - event["pid"] = Json::Value(pid); + event["pid"] = Json::Int64(pid); Json::Value args(Json::objectValue); args["name"] = Json::Value(name); event["args"] = args; @@ -68,7 +68,7 @@ void ChromeTraceFormatter::EmitRegion(int64 ts, int64 duration, int64 pid, int64 tid, const string& category, const string& name, Json::Value args) { Json::Value event = CreateEvent("X", category, name, pid, tid, ts); - event["dur"] = Json::Value(duration); + event["dur"] = Json::Int64(duration); event["args"] = std::move(args); metadata_.push_back(event); } @@ -76,14 +76,14 @@ void ChromeTraceFormatter::EmitRegion(int64 ts, int64 duration, int64 pid, void ChromeTraceFormatter::EmitFlowStart(const string& name, int64 ts, int64 pid, int64 tid, int64 flow_id) { Json::Value event = CreateEvent("s", "DataFlow", name, pid, tid, ts); - event["id"] = flow_id; + event["id"] = Json::Int64(flow_id); events_.push_back(event); } void ChromeTraceFormatter::EmitFlowEnd(const string& name, int64 ts, int64 pid, int64 tid, int64 flow_id) { Json::Value event = 
CreateEvent("t", "DataFlow", name, pid, tid, ts); - event["id"] = flow_id; + event["id"] = Json::Int64(flow_id); events_.push_back(event); } @@ -93,7 +93,7 @@ void ChromeTraceFormatter::EmitCounter( const std::map<int64, std::vector<string>>& tensor_mem) { Json::Value event = CreateEvent("C", category, "Allocated Bytes", pid, 0, ts); Json::Value args(Json::objectValue); - args["Allocator Bytes in Use"] = Json::Value(bytes); + args["Allocator Bytes in Use"] = Json::Int64(bytes); event["args"] = args; events_.push_back(event); diff --git a/tensorflow/core/util/mkl_util.h b/tensorflow/core/util/mkl_util.h index 90b6533690..b5e42f5384 100644 --- a/tensorflow/core/util/mkl_util.h +++ b/tensorflow/core/util/mkl_util.h @@ -1814,11 +1814,11 @@ class MklDnnData { } }; -/// Base class for operations with reuse of DNN primitives +/// Base class for operations with reuse of primitives /// -class DnnOp { +class MklPrimitive { public: - virtual ~DnnOp() {} + virtual ~MklPrimitive() {} // Dummy data. Its size, hard-coded as 256 here, does // not matter since MKL should never operate on this buffer. 
@@ -1826,33 +1826,33 @@ class DnnOp { }; const mkldnn::memory::dims NONE_DIMS = {}; -// This constant is used to declare dummy buffer (size), for MKL primitives + template <typename T> -class DnnOpFactory { +class MklPrimitiveFactory { public: - DnnOpFactory() {} - ~DnnOpFactory() {} + MklPrimitiveFactory() {} + ~MklPrimitiveFactory() {} - DnnOp* GetOp(const std::string& key) { - auto stream_iter = DnnOpFactory<T>::GetHashMap().find(key); - if (stream_iter == DnnOpFactory<T>::GetHashMap().end()) { + MklPrimitive* GetOp(const std::string& key) { + auto stream_iter = MklPrimitiveFactory<T>::GetHashMap().find(key); + if (stream_iter == MklPrimitiveFactory<T>::GetHashMap().end()) { return nullptr; } else { return stream_iter->second; } } - void SetOp(const std::string& key, DnnOp* op) { - auto stream_iter = DnnOpFactory<T>::GetHashMap().find(key); + void SetOp(const std::string& key, MklPrimitive* op) { + auto stream_iter = MklPrimitiveFactory<T>::GetHashMap().find(key); - CHECK(stream_iter == DnnOpFactory<T>::GetHashMap().end()); + CHECK(stream_iter == MklPrimitiveFactory<T>::GetHashMap().end()); - DnnOpFactory<T>::GetHashMap()[key] = op; + MklPrimitiveFactory<T>::GetHashMap()[key] = op; } private: - static inline std::unordered_map<std::string, DnnOp*> &GetHashMap() { - static thread_local std::unordered_map<std::string, DnnOp*> map_; + static inline std::unordered_map<std::string, MklPrimitive*>& GetHashMap() { + static thread_local std::unordered_map<std::string, MklPrimitive*> map_; return map_; } }; diff --git a/tensorflow/docs_src/get_started/index.md b/tensorflow/docs_src/get_started/index.md new file mode 100644 index 0000000000..bd2a80d9ef --- /dev/null +++ b/tensorflow/docs_src/get_started/index.md @@ -0,0 +1,29 @@ +# Get Started + +If you are new to machine learning, we recommend taking the following online +course prior to diving into TensorFlow documentation: + + * [Machine Learning Crash 
Course](https://developers.google.com/machine-learning/crash-course/), + which introduces machine learning concepts and encourages experimentation + with existing TensorFlow code. + +TensorFlow is a tool for machine learning. While it contains a wide range of +functionality, TensorFlow is mainly designed for deep neural network models. + +The easiest way to get started with TensorFlow is by using Eager Execution. + + * @{$get_started/eager}, is for anyone new to machine learning or TensorFlow. + +TensorFlow provides many APIs. The remainder of this section focuses on the +Estimator API which provide scalable, high-performance models. See the +@{$estimators} guide. + +For more advanced users: + + * The @{$low_level_intro$Low Level Introduction} demonstrates how to use + TensorFlow outside of the Estimator framework, for debugging and + experimentation. + * The @{$guide$Programmer's Guide} details major + TensorFlow components. + * The @{$tutorials$Tutorials} provide walkthroughs of a variety of + TensorFlow models. diff --git a/tensorflow/docs_src/guide/debugger.md b/tensorflow/docs_src/guide/debugger.md index 5cf9af904a..dc4db58857 100644 --- a/tensorflow/docs_src/guide/debugger.md +++ b/tensorflow/docs_src/guide/debugger.md @@ -17,7 +17,7 @@ how to use the graphical user interface (GUI) of tfdbg, i.e., the Note: The TensorFlow debugger uses a [curses](https://en.wikipedia.org/wiki/Curses_\(programming_library\))-based text user interface. On Mac OS X, the `ncurses` library is required and can be -installed with `brew install homebrew/dupes/ncurses`. On Windows, curses isn't as +installed with `brew install ncurses`. On Windows, curses isn't as well supported, so a [readline](https://en.wikipedia.org/wiki/GNU_Readline)-based interface can be used with tfdbg by installing `pyreadline` with `pip`. 
If you use Anaconda3, you can install it with a command such as diff --git a/tensorflow/go/attrs.go b/tensorflow/go/attrs.go new file mode 100644 index 0000000000..f86c5737bc --- /dev/null +++ b/tensorflow/go/attrs.go @@ -0,0 +1,245 @@ +/* +Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package tensorflow + +// #include <stdlib.h> +// #include "tensorflow/c/c_api.h" +import "C" +import ( + "fmt" + "unsafe" +) + +// makeCShape converts a shape specified in C.int64_t into a Shape. +func makeCShape(shape []C.int64_t) Shape { + s := Shape{dims: make([]int64, len(shape))} + for i, n := range shape { + s.dims[i] = int64(n) + } + return s +} + +// Attr returns the value of an attribute on op. It returns an error if the +// attribute does not exist. 
+func (op *Operation) Attr(name string) (interface{}, error) { + cname := C.CString(name) + defer C.free(unsafe.Pointer(cname)) + + status := newStatus() + meta := C.TF_OperationGetAttrMetadata(op.c, cname, status.c) + if err := status.Err(); err != nil { + return nil, err + } + + if meta.is_list == 1 { + return listAttribute(op, cname, meta) + } + return scalarAttribute(op, cname, meta) +} + +func listAttribute(op *Operation, cname *C.char, meta C.TF_AttrMetadata) (interface{}, error) { + status := newStatus() + + switch meta._type { + case C.TF_ATTR_STRING: + if meta.list_size == 0 { + return []string(nil), nil + } + values := make([]unsafe.Pointer, meta.list_size) + lengths := make([]C.size_t, meta.list_size) + // Add one element in case total_size is zero. + storage := make([]C.char, meta.total_size+1) + C.TF_OperationGetAttrStringList(op.c, cname, &values[0], &lengths[0], C.int(meta.list_size), unsafe.Pointer(&storage[0]), C.size_t(meta.total_size), status.c) + if err := status.Err(); err != nil { + return nil, err + } + list := make([]string, meta.list_size) + for i, val := range values { + length := lengths[i] + list[i] = C.GoStringN((*C.char)(val), C.int(length)) + } + return list, nil + + case C.TF_ATTR_INT: + if meta.list_size == 0 { + return []int64(nil), nil + } + list := make([]C.int64_t, meta.list_size) + C.TF_OperationGetAttrIntList(op.c, cname, &list[0], C.int(meta.list_size), status.c) + if err := status.Err(); err != nil { + return nil, err + } + vals := make([]int64, meta.list_size) + for i, val := range list { + vals[i] = int64(val) + } + return vals, nil + + case C.TF_ATTR_FLOAT: + if meta.list_size == 0 { + return []float32(nil), nil + } + list := make([]C.float, meta.list_size) + C.TF_OperationGetAttrFloatList(op.c, cname, &list[0], C.int(meta.list_size), status.c) + if err := status.Err(); err != nil { + return nil, err + } + vals := make([]float32, meta.list_size) + for i, val := range list { + vals[i] = float32(val) + } + return vals, nil 
+ + case C.TF_ATTR_BOOL: + if meta.list_size == 0 { + return []bool(nil), nil + } + list := make([]C.uchar, meta.list_size) + C.TF_OperationGetAttrBoolList(op.c, cname, &list[0], C.int(meta.list_size), status.c) + if err := status.Err(); err != nil { + return nil, err + } + vals := make([]bool, meta.list_size) + for i, val := range list { + vals[i] = val == 1 + } + return vals, nil + + case C.TF_ATTR_TYPE: + if meta.list_size == 0 { + return []DataType(nil), nil + } + list := make([]C.TF_DataType, meta.list_size) + C.TF_OperationGetAttrTypeList(op.c, cname, &list[0], C.int(meta.list_size), status.c) + if err := status.Err(); err != nil { + return nil, err + } + vals := make([]DataType, meta.list_size) + for i, val := range list { + vals[i] = DataType(val) + } + return vals, nil + + case C.TF_ATTR_TENSOR: + if meta.list_size == 0 { + return []*Tensor(nil), nil + } + list := make([]*C.TF_Tensor, meta.list_size) + C.TF_OperationGetAttrTensorList(op.c, cname, &list[0], C.int(meta.list_size), status.c) + if err := status.Err(); err != nil { + return nil, err + } + vals := make([]*Tensor, meta.list_size) + for i, t := range list { + vals[i] = newTensorFromC(t) + } + return vals, nil + + case C.TF_ATTR_SHAPE: + if meta.list_size == 0 { + return []Shape(nil), nil + } + dims := make([]*C.int64_t, meta.list_size) + numDims := make([]C.int, meta.list_size) + // Add one element in case total_size is zero. + storage := make([]C.int64_t, meta.total_size+1) + C.TF_OperationGetAttrShapeList(op.c, cname, &dims[0], &numDims[0], C.int(meta.list_size), &storage[0], C.int(meta.total_size), status.c) + if err := status.Err(); err != nil { + return nil, err + } + list := make([]Shape, meta.list_size) + for i, dim := range dims { + numDim := numDims[i] + // If the number of dimensions is unknown, default to empty shape. + if numDim < 0 { + continue + } + // A []C.int64_t slice backed by C memory. 
+ // See: https://github.com/golang/go/wiki/cgo#turning-c-arrays-into-go-slices + slice := (*[1 << 30]C.int64_t)(unsafe.Pointer(dim))[:numDim:numDim] + list[i] = makeCShape(slice) + } + return list, nil + + default: + return nil, fmt.Errorf("list type %v not supported", meta._type) + } +} + +func scalarAttribute(op *Operation, cname *C.char, meta C.TF_AttrMetadata) (interface{}, error) { + status := newStatus() + + switch meta._type { + case C.TF_ATTR_STRING: + if meta.total_size == 0 { + return "", nil + } + v := make([]C.char, meta.total_size) + C.TF_OperationGetAttrString(op.c, cname, unsafe.Pointer(&v[0]), C.size_t(meta.total_size), status.c) + if err := status.Err(); err != nil { + return nil, err + } + return C.GoStringN(&v[0], C.int(meta.total_size)), nil + + case C.TF_ATTR_INT: + var v C.int64_t + C.TF_OperationGetAttrInt(op.c, cname, &v, status.c) + return int64(v), status.Err() + + case C.TF_ATTR_FLOAT: + var v C.float + C.TF_OperationGetAttrFloat(op.c, cname, &v, status.c) + return float32(v), status.Err() + + case C.TF_ATTR_BOOL: + var v C.uchar + C.TF_OperationGetAttrBool(op.c, cname, &v, status.c) + return v == 1, status.Err() + + case C.TF_ATTR_TYPE: + var v C.TF_DataType + C.TF_OperationGetAttrType(op.c, cname, &v, status.c) + return DataType(v), status.Err() + + case C.TF_ATTR_TENSOR: + var v *C.TF_Tensor + C.TF_OperationGetAttrTensor(op.c, cname, &v, status.c) + if err := status.Err(); err != nil { + return nil, err + } + return newTensorFromC(v), nil + + case C.TF_ATTR_SHAPE: + numDims := meta.total_size + // If number of dims is unknown return empty shape to indicate that. 
+ if numDims < 0 { + return Shape{}, nil + } + if numDims == 0 { + return ScalarShape(), nil + } + dims := make([]C.int64_t, numDims) + C.TF_OperationGetAttrShape(op.c, cname, (*C.int64_t)(unsafe.Pointer(&dims[0])), C.int(numDims), status.c) + if err := status.Err(); err != nil { + return nil, err + } + return makeCShape(dims), nil + + default: + return nil, fmt.Errorf("type %v not supported", meta._type) + } +} diff --git a/tensorflow/go/attrs_test.go b/tensorflow/go/attrs_test.go new file mode 100644 index 0000000000..ea8af221ae --- /dev/null +++ b/tensorflow/go/attrs_test.go @@ -0,0 +1,193 @@ +/* +Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package tensorflow + +import ( + "fmt" + "reflect" + "testing" +) + +func TestOperationAttrs(t *testing.T) { + g := NewGraph() + + i := 0 + makeConst := func(v interface{}) Output { + op, err := Const(g, fmt.Sprintf("const/%d/%+v", i, v), v) + i++ + if err != nil { + t.Fatal(err) + } + return op + } + + makeTensor := func(v interface{}) *Tensor { + tensor, err := NewTensor(v) + if err != nil { + t.Fatal(err) + } + return tensor + } + + cases := []OpSpec{ + { + Name: "type", + Type: "Placeholder", + Attrs: map[string]interface{}{ + "dtype": Float, + }, + }, + { + Name: "list(float)", + Type: "Bucketize", + Input: []Input{ + makeConst([]float32{1, 2, 3, 4}), + }, + Attrs: map[string]interface{}{ + "boundaries": []float32{0, 1, 2, 3, 4, 5}, + }, + }, + { + Name: "list(float) empty", + Type: "Bucketize", + Input: []Input{ + makeConst([]float32{}), + }, + Attrs: map[string]interface{}{ + "boundaries": []float32(nil), + }, + }, + /* TODO(ashankar): debug this issue and add it back later. 
+ { + Name: "list(type),list(shape)", + Type: "InfeedEnqueueTuple", + Input: []Input{ + OutputList([]Output{ + makeConst(float32(1)), + makeConst([][]int32{{2}}), + }), + }, + Attrs: map[string]interface{}{ + "dtypes": []DataType{Float, Int32}, + "shapes": []Shape{ScalarShape(), MakeShape(1, 1)}, + }, + }, + { + Name: "list(type),list(shape) empty", + Type: "InfeedEnqueueTuple", + Input: []Input{ + OutputList([]Output{ + makeConst([][]int32{{2}}), + }), + }, + Attrs: map[string]interface{}{ + "dtypes": []DataType{Int32}, + "shapes": []Shape(nil), + }, + }, + { + Name: "list(type) empty,string empty,int", + Type: "_XlaSendFromHost", + Input: []Input{ + OutputList([]Output{}), + makeConst(""), + }, + Attrs: map[string]interface{}{ + "Tinputs": []DataType(nil), + "key": "", + "device_ordinal": int64(0), + }, + }, + */ + { + Name: "list(int),int", + Type: "StringToHashBucketStrong", + Input: []Input{ + makeConst(""), + }, + Attrs: map[string]interface{}{ + "num_buckets": int64(2), + "key": []int64{1, 2}, + }, + }, + { + Name: "list(int) empty,int", + Type: "StringToHashBucketStrong", + Input: []Input{ + makeConst(""), + }, + Attrs: map[string]interface{}{ + "num_buckets": int64(2), + "key": ([]int64)(nil), + }, + }, + { + Name: "list(string),type", + Type: "TensorSummary", + Input: []Input{ + makeConst(""), + }, + Attrs: map[string]interface{}{ + "T": String, + "labels": []string{"foo", "bar"}, + }, + }, + { + Name: "list(string) empty,type", + Type: "TensorSummary", + Input: []Input{ + makeConst(""), + }, + Attrs: map[string]interface{}{ + "T": String, + "labels": ([]string)(nil), + }, + }, + { + Name: "tensor", + Type: "Const", + Attrs: map[string]interface{}{ + "dtype": String, + "value": makeTensor("foo"), + }, + }, + } + + for i, spec := range cases { + op, err := g.AddOperation(spec) + if err != nil { + t.Fatal(err) + } + for key, want := range spec.Attrs { + out, err := op.Attr(key) + if err != nil { + t.Fatal(err) + } + if !reflect.DeepEqual(out, want) { + 
t.Fatalf("%d. %q: Got %#v, wanted %#v", i, key, out, want) + } + wantT, ok := want.(*Tensor) + if ok { + wantVal := wantT.Value() + outVal := out.(*Tensor).Value() + if !reflect.DeepEqual(outVal, wantVal) { + t.Fatalf("%d. %q: Got %#v, wanted %#v", i, key, outVal, wantVal) + } + } + } + } +} diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index b2dbdafc5f..6d9cb7c6ec 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -11210,7 +11210,7 @@ func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistorted // SampleDistortedBoundingBoxAreaRange sets the optional area_range attribute to value. // // value: The cropped area of the image must contain a fraction of the -// supplied image within in this range. +// supplied image within this range. // If not specified, defaults to <f:0.05 f:1 > func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { @@ -17969,9 +17969,10 @@ func SparseFillEmptyRowsGrad(scope *Scope, reverse_index_map tf.Output, grad_val } // Computes scaled exponential linear: `scale * alpha * (exp(features) - 1)` -// // if < 0, `scale * features` otherwise. // +// Assumes weights to have zero mean and variance 1.0 / fan_in. +// // See [Self-Normalizing Neural Networks](https://arxiv.org/abs/1706.02515) func Selu(scope *Scope, features tf.Output) (activations tf.Output) { if scope.Err() != nil { @@ -21655,7 +21656,7 @@ func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { // generated sequentially as '*tag*/image/0', '*tag*/image/1', etc. // // The `bad_color` argument is the color to use in the generated images for -// non-finite input values. It is a `unit8` 1-D tensor of length `channels`. +// non-finite input values. It is a `uint8` 1-D tensor of length `channels`. // Each element must be in the range `[0, 255]` (It represents the value of a // pixel in the output image). 
Non-finite values in the input tensor are // replaced by this tensor in the output image. The default value is the color @@ -24048,7 +24049,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // SampleDistortedBoundingBoxV2AreaRange sets the optional area_range attribute to value. // // value: The cropped area of the image must contain a fraction of the -// supplied image within in this range. +// supplied image within this range. // If not specified, defaults to <f:0.05 f:1 > func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { diff --git a/tensorflow/go/operation.go b/tensorflow/go/operation.go index 8fcad61f4c..25ec718703 100644 --- a/tensorflow/go/operation.go +++ b/tensorflow/go/operation.go @@ -65,6 +65,11 @@ func (op *Operation) Output(i int) Output { return Output{op, i} } +// NumInputs returns the number of inputs of op. +func (op *Operation) NumInputs() int { + return int(C.TF_OperationNumInputs(op.c)) +} + // Output represents one of the outputs of an operation in the graph. Has a // DataType (and eventually a Shape). May be passed as an input argument to a // function for adding operations to a graph, or to a Session's Run() method to @@ -123,6 +128,67 @@ func (p Output) c() C.TF_Output { func (p Output) canBeAnInput() {} +// Consumers returns the inputs that consume this output. 
+func (p Output) Consumers() []Consumer { + max := int(C.TF_OperationOutputNumConsumers(p.c())) + if max == 0 { + return nil + } + inputs := make([]C.TF_Input, max) + n := C.TF_OperationOutputConsumers(p.c(), (*C.TF_Input)(unsafe.Pointer(&inputs[0])), C.int(max)) + inputs = inputs[:int(n)] + + var consumers []Consumer + for _, consumer := range inputs { + consumers = append(consumers, Consumer{ + Index: int(consumer.index), + Op: &Operation{ + c: consumer.oper, + g: p.Op.g, + }, + }) + } + + return consumers +} + +// Consumer identifies a specific input of an operation that consumes the output +// of another operation. +type Consumer struct { + // Op is the Operation that is consuming the output of another operation. + Op *Operation + + // Index is the index of the input within Op that the output of another + // operation is connected to. + Index int +} + +func (p Consumer) c() C.TF_Input { + if p.Op == nil { + // Attempt to provide a more useful panic message than "nil + // pointer dereference". + panic("nil-Operation. Consumer objects should only be created by a call to Output.Consumers") + } + return C.TF_Input{oper: p.Op.c, index: C.int(p.Index)} +} + +// DataType returns the type of the input. +func (p Consumer) DataType() DataType { + return DataType(C.TF_OperationInputType(p.c())) +} + +// Producer returns the Output that is connected to this Consumer. +func (p Consumer) Producer() Output { + output := C.TF_OperationInput(p.c()) + return Output{ + Op: &Operation{ + c: output.oper, + g: p.Op.g, + }, + Index: int(output.index), + } +} + // Input is the interface for specifying inputs to an operation being added to // a Graph. 
// diff --git a/tensorflow/go/operation_test.go b/tensorflow/go/operation_test.go index 40c951ab8c..06b65bdfb7 100644 --- a/tensorflow/go/operation_test.go +++ b/tensorflow/go/operation_test.go @@ -166,6 +166,68 @@ func TestOutputDataTypeAndShape(t *testing.T) { } } +func TestOperationInputs(t *testing.T) { + g := NewGraph() + x, err := Placeholder(g, "x", Float) + if err != nil { + t.Fatal(err) + } + y, err := Placeholder(g, "y", Float) + if err != nil { + t.Fatal(err) + } + add, err := Add(g, "add", x, y) + if err != nil { + t.Fatal(err) + } + addOp := add.Op + + if out := addOp.NumInputs(); out != 2 { + t.Fatalf("Got %d inputs, wanted 2", out) + } +} + +func TestOperationConsumers(t *testing.T) { + g := NewGraph() + x, err := Placeholder(g, "x", Float) + if err != nil { + t.Fatal(err) + } + a, err := Neg(g, "a", x) + if err != nil { + t.Fatal(err) + } + b, err := Neg(g, "b", x) + if err != nil { + t.Fatal(err) + } + + consumers := []*Operation{a.Op, b.Op} + + xConsumers := x.Consumers() + if out := len(xConsumers); out != 2 { + t.Fatalf("Got %d consumers, wanted 2", out) + } + + for i, consumer := range xConsumers { + got := consumer.Op.Name() + want := consumers[i].Name() + if got != want { + t.Fatalf("%d. Got op name %q, wanted %q", i, got, want) + } + + got = consumer.Producer().Op.Name() + want = x.Op.Name() + if got != want { + t.Fatalf("%d. 
Got op name %q, wanted %q", i, got, want) + } + } + + if len(b.Consumers()) != 0 { + t.Fatalf("expected %+v to have no consumers", b) + } +} + func forceGC() { var mem runtime.MemStats runtime.ReadMemStats(&mem) diff --git a/tensorflow/java/BUILD b/tensorflow/java/BUILD index 19d2133a55..73e210fae0 100644 --- a/tensorflow/java/BUILD +++ b/tensorflow/java/BUILD @@ -56,6 +56,10 @@ java_library( srcs = glob(["src/gen/java/org/tensorflow/processor/**/*.java"]), javacopts = JAVACOPTS, resources = glob(["src/gen/resources/META-INF/services/javax.annotation.processing.Processor"]), + deps = [ + "@com_google_guava", + "@com_squareup_javapoet", + ], ) filegroup( @@ -70,6 +74,7 @@ tf_java_op_gen_srcjar( name = "java_op_gen_sources", api_def_srcs = [ "//tensorflow/core/api_def:base_api_def", + "//tensorflow/core/api_def:java_api_def", ], base_package = "org.tensorflow.op", gen_tool = ":java_op_gen_tool", diff --git a/tensorflow/java/maven/.gitignore b/tensorflow/java/maven/.gitignore index ff080515d5..657e2a60bc 100644 --- a/tensorflow/java/maven/.gitignore +++ b/tensorflow/java/maven/.gitignore @@ -11,4 +11,10 @@ tensorflow/src tensorflow/target proto/src proto/target +hadoop/src +hadoop/target +spark-connector/src +spark-connector/target +spark-connector/dependency-reduced-pom.xml +spark-connector/spark-warehouse pom.xml.versionsBackup diff --git a/tensorflow/java/maven/README.md b/tensorflow/java/maven/README.md index c7e8f03806..3e030dcd09 100644 --- a/tensorflow/java/maven/README.md +++ b/tensorflow/java/maven/README.md @@ -53,6 +53,12 @@ There are seven artifacts and thus `pom.xml`s involved in this release: 7. [`parentpom`](https://maven.apache.org/pom/index.html): Common settings shared by all of the above. +8. `hadoop`: The TensorFlow TFRecord InputFormat/OutputFormat for Apache Hadoop. + The source code for this package is available in the [TensorFlow Ecosystem](https://github.com/tensorflow/ecosystem/tree/master/hadoop) + +9. 
`spark-connector`: A Scala library for loading and storing TensorFlow TFRecord + using Apache Spark DataFrames. The source code for this package is available + in the [TensorFlow Ecosystem](https://github.com/tensorflow/ecosystem/tree/master/spark/spark-tensorflow-connector) ## Updating the release diff --git a/tensorflow/java/maven/hadoop/pom.xml b/tensorflow/java/maven/hadoop/pom.xml new file mode 100644 index 0000000000..0642be06fa --- /dev/null +++ b/tensorflow/java/maven/hadoop/pom.xml @@ -0,0 +1,24 @@ +<project + xmlns="http://maven.apache.org/POM/4.0.0" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <!-- Placeholder pom which is replaced by TensorFlow ecosystem Hadoop pom during build --> + <modelVersion>4.0.0</modelVersion> + <description>TensorFlow TFRecord InputFormat/OutputFormat for Apache Hadoop</description> + <artifactId>hadoop</artifactId> + <packaging>jar</packaging> + + <scm> + <url>https://github.com/tensorflow/ecosystem.git</url> + <connection>git@github.com:tensorflow/ecosystem.git</connection> + <developerConnection>scm:git:https://github.com/tensorflow/ecosystem.git</developerConnection> + </scm> + + <url>https://github.com/tensorflow/ecosystem/</url> + <parent> + <groupId>org.tensorflow</groupId> + <artifactId>parentpom</artifactId> + <version>1.9.0-rc0</version> + <relativePath>../</relativePath> + </parent> +</project>
\ No newline at end of file diff --git a/tensorflow/java/maven/pom.xml b/tensorflow/java/maven/pom.xml index 3890f3fcaa..b4746794ea 100644 --- a/tensorflow/java/maven/pom.xml +++ b/tensorflow/java/maven/pom.xml @@ -32,6 +32,8 @@ <module>libtensorflow_jni_gpu</module> <module>tensorflow</module> <module>proto</module> + <module>hadoop</module> + <module>spark-connector</module> </modules> <!-- Two profiles are used: diff --git a/tensorflow/java/maven/run_inside_container.sh b/tensorflow/java/maven/run_inside_container.sh index bf19c09b1d..2e771064e4 100644 --- a/tensorflow/java/maven/run_inside_container.sh +++ b/tensorflow/java/maven/run_inside_container.sh @@ -19,6 +19,7 @@ RELEASE_URL_PREFIX="https://storage.googleapis.com/tensorflow/libtensorflow" +TF_ECOSYSTEM_URL="https://github.com/tensorflow/ecosystem.git" # By default we deploy to both ossrh and bintray. These two # environment variables can be set to skip either repository. @@ -44,7 +45,9 @@ clean() { # (though if run inside a clean docker container, there won't be any dirty # artifacts lying around) mvn -q clean - rm -rf libtensorflow_jni/src libtensorflow_jni/target libtensorflow_jni_gpu/src libtensorflow_jni_gpu/target libtensorflow/src libtensorflow/target tensorflow-android/target + rm -rf libtensorflow_jni/src libtensorflow_jni/target libtensorflow_jni_gpu/src libtensorflow_jni_gpu/target \ + libtensorflow/src libtensorflow/target tensorflow-android/target proto/src proto/target \ + hadoop/src hadoop/target spark-connector/src spark-connector/target } update_version_in_pom() { @@ -183,6 +186,43 @@ generate_java_protos() { rm -rf "${DIR}/proto/tmp" } + +# Download the TensorFlow ecosystem source from git. +# The pom files from this repo do not inherit from the parent pom so the maven version +# is updated for each module. 
+download_tf_ecosystem() { + ECOSYSTEM_DIR="/tmp/tensorflow-ecosystem" + HADOOP_DIR="${DIR}/hadoop" + SPARK_DIR="${DIR}/spark-connector" + + # Clean any previous attempts + rm -rf "${ECOSYSTEM_DIR}" + + # Clone the TensorFlow ecosystem project + mkdir -p "${ECOSYSTEM_DIR}" + cd "${ECOSYSTEM_DIR}" + git clone "${TF_ECOSYSTEM_URL}" + cd ecosystem + git checkout r${TF_VERSION} + + # Copy the TensorFlow Hadoop source + cp -r "${ECOSYSTEM_DIR}/ecosystem/hadoop/src" "${HADOOP_DIR}" + cp "${ECOSYSTEM_DIR}/ecosystem/hadoop/pom.xml" "${HADOOP_DIR}" + cd "${HADOOP_DIR}" + update_version_in_pom + + # Copy the TensorFlow Spark connector source + cp -r "${ECOSYSTEM_DIR}/ecosystem/spark/spark-tensorflow-connector/src" "${SPARK_DIR}" + cp "${ECOSYSTEM_DIR}/ecosystem/spark/spark-tensorflow-connector/pom.xml" "${SPARK_DIR}" + cd "${SPARK_DIR}" + update_version_in_pom + + # Cleanup + rm -rf "${ECOSYSTEM_DIR}" + + cd "${DIR}" +} + # Deploy artifacts using a specific profile. # Arguments: # profile - name of selected profile. @@ -240,7 +280,8 @@ cd "${DIR}" # Comment lines out appropriately if debugging/tinkering with the release # process. 
# gnupg2 is required for signing -apt-get -qq update && apt-get -qqq install -y gnupg2 +apt-get -qq update && apt-get -qqq install -y gnupg2 git + clean update_version_in_pom download_libtensorflow @@ -248,6 +289,8 @@ download_libtensorflow_jni download_libtensorflow_jni_gpu update_tensorflow_android generate_java_protos +download_tf_ecosystem + # Build the release artifacts mvn verify # Push artifacts to repository diff --git a/tensorflow/java/maven/spark-connector/pom.xml b/tensorflow/java/maven/spark-connector/pom.xml new file mode 100644 index 0000000000..19c752d08b --- /dev/null +++ b/tensorflow/java/maven/spark-connector/pom.xml @@ -0,0 +1,24 @@ +<project + xmlns="http://maven.apache.org/POM/4.0.0" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <!-- Placeholder pom which is replaced by TensorFlow ecosystem Spark pom during build --> + <modelVersion>4.0.0</modelVersion> + <description>TensorFlow TFRecord connector for Apache Spark DataFrames</description> + <artifactId>spark-connector</artifactId> + <packaging>jar</packaging> + + <scm> + <url>https://github.com/tensorflow/ecosystem.git</url> + <connection>git@github.com:tensorflow/ecosystem.git</connection> + <developerConnection>scm:git:https://github.com/tensorflow/ecosystem.git</developerConnection> + </scm> + + <url>https://github.com/tensorflow/ecosystem/</url> + <parent> + <groupId>org.tensorflow</groupId> + <artifactId>parentpom</artifactId> + <version>1.9.0-rc0</version> + <relativePath>../</relativePath> + </parent> +</project>
\ No newline at end of file diff --git a/tensorflow/java/src/gen/cc/op_generator.cc b/tensorflow/java/src/gen/cc/op_generator.cc index 9b171f66ec..d5bd99bdd9 100644 --- a/tensorflow/java/src/gen/cc/op_generator.cc +++ b/tensorflow/java/src/gen/cc/op_generator.cc @@ -35,7 +35,7 @@ namespace tensorflow { namespace java { namespace { -const char* kLicense = +constexpr const char kLicense[] = "/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.\n" "\n" "Licensed under the Apache License, Version 2.0 (the \"License\");\n" @@ -391,9 +391,12 @@ void GenerateOp(const OpSpec& op, const EndpointSpec& endpoint, } if (!op.hidden()) { // expose the op in the Ops Graph API only if it is visible - op_class.add_annotation( - Annotation::Create("Operator", "org.tensorflow.op.annotation") - .attributes("group = \"" + endpoint.package() + "\"")); + Annotation oper_annot = + Annotation::Create("Operator", "org.tensorflow.op.annotation"); + if (endpoint.package() != kDefaultEndpointPackage) { + oper_annot.attributes("group = \"" + endpoint.package() + "\""); + } + op_class.add_annotation(oper_annot); } // create op class file const string op_dir_name = io::JoinPath( diff --git a/tensorflow/java/src/gen/cc/op_specs.h b/tensorflow/java/src/gen/cc/op_specs.h index ca0ba16745..30ecb8ce53 100644 --- a/tensorflow/java/src/gen/cc/op_specs.h +++ b/tensorflow/java/src/gen/cc/op_specs.h @@ -27,6 +27,8 @@ limitations under the License. 
namespace tensorflow { namespace java { +constexpr const char kDefaultEndpointPackage[] = "core"; + class EndpointSpec { public: // A specification for an operation endpoint diff --git a/tensorflow/java/src/gen/java/org/tensorflow/processor/OperatorProcessor.java b/tensorflow/java/src/gen/java/org/tensorflow/processor/OperatorProcessor.java index 11fda4fc22..796d6a62dc 100644 --- a/tensorflow/java/src/gen/java/org/tensorflow/processor/OperatorProcessor.java +++ b/tensorflow/java/src/gen/java/org/tensorflow/processor/OperatorProcessor.java @@ -15,19 +15,44 @@ limitations under the License. package org.tensorflow.processor; +import com.google.common.base.CaseFormat; +import com.google.common.base.Strings; +import com.google.common.collect.HashMultimap; +import com.google.common.collect.Multimap; +import com.squareup.javapoet.ClassName; +import com.squareup.javapoet.FieldSpec; +import com.squareup.javapoet.JavaFile; +import com.squareup.javapoet.MethodSpec; +import com.squareup.javapoet.ParameterSpec; +import com.squareup.javapoet.TypeName; +import com.squareup.javapoet.TypeSpec; +import com.squareup.javapoet.TypeVariableName; import java.io.IOException; -import java.io.PrintWriter; +import java.util.Collection; import java.util.Collections; -import java.util.HashSet; +import java.util.HashMap; +import java.util.Map; import java.util.Set; +import java.util.regex.Matcher; +import java.util.regex.Pattern; import javax.annotation.processing.AbstractProcessor; import javax.annotation.processing.Filer; import javax.annotation.processing.Messager; import javax.annotation.processing.ProcessingEnvironment; import javax.annotation.processing.RoundEnvironment; import javax.lang.model.SourceVersion; +import javax.lang.model.element.AnnotationMirror; +import javax.lang.model.element.AnnotationValue; import javax.lang.model.element.Element; +import javax.lang.model.element.ExecutableElement; +import javax.lang.model.element.Modifier; import javax.lang.model.element.TypeElement; 
+import javax.lang.model.element.TypeParameterElement; +import javax.lang.model.element.VariableElement; +import javax.lang.model.type.TypeMirror; +import javax.lang.model.type.TypeVariable; +import javax.lang.model.util.ElementFilter; +import javax.lang.model.util.Elements; import javax.tools.Diagnostic.Kind; /** @@ -55,6 +80,7 @@ public final class OperatorProcessor extends AbstractProcessor { super.init(processingEnv); messager = processingEnv.getMessager(); filer = processingEnv.getFiler(); + elements = processingEnv.getElementUtils(); } @Override @@ -98,42 +124,77 @@ public final class OperatorProcessor extends AbstractProcessor { } // Collect all classes tagged with our annotation. - Set<TypeElement> opClasses = new HashSet<TypeElement>(); - if (!collectOpClasses(roundEnv, opClasses, annotation)) { + Multimap<String, MethodSpec> groupedMethods = HashMultimap.create(); + if (!collectOpsMethods(roundEnv, groupedMethods, annotation)) { return true; } // Nothing to do when there are no tagged classes. - if (opClasses.isEmpty()) { + if (groupedMethods.isEmpty()) { return true; } - // TODO:(kbsriram) validate operator classes and generate Op API. - writeApi(); + // Validate operator classes and generate Op API. 
+ writeApi(groupedMethods); + hasRun = true; return true; } @Override public Set<String> getSupportedAnnotationTypes() { - return Collections.singleton(String.format("%s.annotation.Operator", OP_PACKAGE)); + return Collections.singleton("org.tensorflow.op.annotation.Operator"); + } + + private static final Pattern JAVADOC_TAG_PATTERN = + Pattern.compile("@(?:param|return|throws|exception|see)\\s+.*"); + private static final TypeName T_OPS = ClassName.get("org.tensorflow.op", "Ops"); + private static final TypeName T_OPERATOR = + ClassName.get("org.tensorflow.op.annotation", "Operator"); + private static final TypeName T_SCOPE = ClassName.get("org.tensorflow.op", "Scope"); + private static final TypeName T_GRAPH = ClassName.get("org.tensorflow", "Graph"); + private static final TypeName T_STRING = ClassName.get(String.class); + + private Filer filer; + private Messager messager; + private Elements elements; + private boolean hasRun = false; + + private void error(Element e, String message, Object... args) { + if (args != null && args.length > 0) { + message = String.format(message, args); + } + messager.printMessage(Kind.ERROR, message, e); } - private void writeApi() { - // Generate an empty class for now and get the build working correctly. This will be changed to - // generate the actual API once we've done with build-related changes. 
- // TODO:(kbsriram) - try (PrintWriter writer = - new PrintWriter(filer.createSourceFile(String.format("%s.Ops", OP_PACKAGE)).openWriter())) { - writer.println(String.format("package %s;", OP_PACKAGE)); - writer.println("public class Ops{}"); + private void write(TypeSpec spec) { + try { + JavaFile.builder("org.tensorflow.op", spec).skipJavaLangImports(true).build().writeTo(filer); } catch (IOException e) { - error(null, "Unexpected failure generating API: %s", e.getMessage()); + throw new AssertionError(e); + } + } + + private void writeApi(Multimap<String, MethodSpec> groupedMethods) { + Map<String, ClassName> groups = new HashMap<>(); + + // Generate an API class for each group collected other than the default one (= empty string) + for (Map.Entry<String, Collection<MethodSpec>> entry : groupedMethods.asMap().entrySet()) { + if (!entry.getKey().isEmpty()) { + TypeSpec groupClass = buildGroupClass(entry.getKey(), entry.getValue()); + write(groupClass); + groups.put(entry.getKey(), ClassName.get("org.tensorflow.op", groupClass.name)); + } } + // Generate the top API class, adding any methods added to the default group + TypeSpec topClass = buildTopClass(groups, groupedMethods.get("")); + write(topClass); } - private boolean collectOpClasses( - RoundEnvironment roundEnv, Set<TypeElement> opClasses, TypeElement annotation) { + private boolean collectOpsMethods( + RoundEnvironment roundEnv, + Multimap<String, MethodSpec> groupedMethods, + TypeElement annotation) { boolean result = true; for (Element e : roundEnv.getElementsAnnotatedWith(annotation)) { // @Operator can only apply to types, so e must be a TypeElement. 
@@ -145,20 +206,251 @@ public final class OperatorProcessor extends AbstractProcessor { result = false; continue; } - opClasses.add((TypeElement) e); + TypeElement opClass = (TypeElement) e; + // Skip deprecated operations for now, as we do not guarantee API stability yet + if (opClass.getAnnotation(Deprecated.class) == null) { + collectOpMethods(groupedMethods, opClass, annotation); + } } return result; } - private void error(Element e, String message, Object... args) { - if (args != null && args.length > 0) { - message = String.format(message, args); + private void collectOpMethods( + Multimap<String, MethodSpec> groupedMethods, TypeElement opClass, TypeElement annotation) { + AnnotationMirror am = getAnnotationMirror(opClass, annotation); + String groupName = getAnnotationElementValueAsString("group", am); + String methodName = getAnnotationElementValueAsString("name", am); + ClassName opClassName = ClassName.get(opClass); + if (Strings.isNullOrEmpty(methodName)) { + methodName = CaseFormat.UPPER_CAMEL.to(CaseFormat.LOWER_CAMEL, opClassName.simpleName()); + } + // Build a method for each @Operator found in the class path. 
There should be one method per + // operation factory called + // "create", which takes in parameter a scope and, optionally, a list of arguments + for (ExecutableElement opMethod : ElementFilter.methodsIn(opClass.getEnclosedElements())) { + if (opMethod.getModifiers().contains(Modifier.STATIC) + && opMethod.getSimpleName().contentEquals("create")) { + MethodSpec method = buildOpMethod(methodName, opClassName, opMethod); + groupedMethods.put(groupName, method); + } } - messager.printMessage(Kind.ERROR, message, e); } - private Filer filer; - private Messager messager; - private boolean hasRun = false; - private static final String OP_PACKAGE = "org.tensorflow.op"; + private MethodSpec buildOpMethod( + String methodName, ClassName opClassName, ExecutableElement factoryMethod) { + MethodSpec.Builder builder = + MethodSpec.methodBuilder(methodName) + .addModifiers(Modifier.PUBLIC) + .returns(TypeName.get(factoryMethod.getReturnType())) + .varargs(factoryMethod.isVarArgs()) + .addJavadoc("$L", buildOpMethodJavadoc(opClassName, factoryMethod)); + + for (TypeParameterElement tp : factoryMethod.getTypeParameters()) { + TypeVariableName tvn = TypeVariableName.get((TypeVariable) tp.asType()); + builder.addTypeVariable(tvn); + } + for (TypeMirror thrownType : factoryMethod.getThrownTypes()) { + builder.addException(TypeName.get(thrownType)); + } + StringBuilder call = new StringBuilder("return $T.create(scope"); + boolean first = true; + for (VariableElement param : factoryMethod.getParameters()) { + ParameterSpec p = ParameterSpec.get(param); + if (first) { + first = false; + continue; + } + call.append(", "); + call.append(p.name); + builder.addParameter(p); + } + call.append(")"); + builder.addStatement(call.toString(), opClassName); + return builder.build(); + } + + private String buildOpMethodJavadoc(ClassName opClassName, ExecutableElement factoryMethod) { + StringBuilder javadoc = new StringBuilder(); + javadoc + .append("Adds an {@link ") + 
.append(opClassName.simpleName()) + .append("} operation to the graph\n\n"); + + // Add all javadoc tags found in the operator factory method but the first one, which should be + // in all cases the + // 'scope' parameter that is implicitly passed by this API + Matcher tagMatcher = JAVADOC_TAG_PATTERN.matcher(elements.getDocComment(factoryMethod)); + boolean firstParam = true; + + while (tagMatcher.find()) { + String tag = tagMatcher.group(); + if (tag.startsWith("@param") && firstParam) { + firstParam = false; + } else { + javadoc.append(tag).append('\n'); + } + } + javadoc.append("@see {@link ").append(opClassName).append("}\n"); + + return javadoc.toString(); + } + + private static TypeSpec buildGroupClass(String group, Collection<MethodSpec> methods) { + MethodSpec.Builder ctorBuilder = + MethodSpec.constructorBuilder() + .addParameter(T_SCOPE, "scope") + .addStatement("this.scope = scope"); + + TypeSpec.Builder builder = + TypeSpec.classBuilder(CaseFormat.LOWER_CAMEL.to(CaseFormat.UPPER_CAMEL, group) + "Ops") + .addModifiers(Modifier.PUBLIC, Modifier.FINAL) + .addJavadoc( + "An API for adding {@code $L} operations to a {@link $T Graph}\n\n" + + "@see {@link $T}\n", + group, + T_GRAPH, + T_OPS) + .addMethods(methods) + .addMethod(ctorBuilder.build()); + + builder.addField( + FieldSpec.builder(T_SCOPE, "scope").addModifiers(Modifier.PRIVATE, Modifier.FINAL).build()); + + return builder.build(); + } + + private static TypeSpec buildTopClass( + Map<String, ClassName> groupToClass, Collection<MethodSpec> methods) { + MethodSpec.Builder ctorBuilder = + MethodSpec.constructorBuilder() + .addModifiers(Modifier.PRIVATE) + .addParameter(T_SCOPE, "scope") + .addStatement("this.scope = scope", T_SCOPE); + + for (Map.Entry<String, ClassName> entry : groupToClass.entrySet()) { + ctorBuilder.addStatement("$L = new $T(scope)", entry.getKey(), entry.getValue()); + } + + TypeSpec.Builder opsBuilder = + TypeSpec.classBuilder("Ops") + .addModifiers(Modifier.PUBLIC, 
Modifier.FINAL) + .addJavadoc( + "An API for building a {@link $T} with operation wrappers\n<p>\n" + + "Any operation wrapper found in the classpath properly annotated as an" + + "{@link $T @Operator} is exposed\n" + + "by this API or one of its subgroup.\n<p>Example usage:\n<pre>{@code\n" + + "try (Graph g = new Graph()) {\n" + + " Ops ops = new Ops(g);\n" + + " // Operations are typed classes with convenience\n" + + " // builders in Ops.\n" + + " Constant three = ops.constant(3);\n" + + " // Single-result operations implement the Operand\n" + + " // interface, so this works too.\n" + + " Operand four = ops.constant(4);\n" + + " // Most builders are found within a group, and accept\n" + + " // Operand types as operands\n" + + " Operand nine = ops.math().add(four, ops.constant(5));\n" + + " // Multi-result operations however offer methods to\n" + + " // select a particular result for use.\n" + + " Operand result = \n" + + " ops.math().add(ops.array().unique(s, a).y(), b);\n" + + " // Optional attributes\n" + + " ops.math().matMul(a, b, MatMul.transposeA(true));\n" + + " // Naming operators\n" + + " ops.withName(“foo”).constant(5); // name “foo”\n" + + " // Names can exist in a hierarchy\n" + + " Ops sub = ops.withSubScope(“sub”);\n" + + " sub.withName(“bar”).constant(4); // “sub/bar”\n" + + "}\n" + + "}</pre>\n", + T_GRAPH, + T_OPERATOR) + .addMethods(methods) + .addMethod(ctorBuilder.build()); + + opsBuilder.addMethod( + MethodSpec.methodBuilder("withSubScope") + .addModifiers(Modifier.PUBLIC) + .addParameter(T_STRING, "childScopeName") + .returns(T_OPS) + .addStatement("return new $T(scope.withSubScope(childScopeName))", T_OPS) + .addJavadoc( + "Returns an API that adds operations to the graph with the provided name prefix.\n" + + "\n@see {@link $T#withSubScope(String)}\n", + T_SCOPE) + .build()); + + opsBuilder.addMethod( + MethodSpec.methodBuilder("withName") + .addModifiers(Modifier.PUBLIC) + .addParameter(T_STRING, "opName") + .returns(T_OPS) + 
.addStatement("return new Ops(scope.withName(opName))") + .addJavadoc( + "Returns an API that uses the provided name for an op.\n\n" + + "@see {@link $T#withName(String)}\n", + T_SCOPE) + .build()); + + opsBuilder.addField( + FieldSpec.builder(T_SCOPE, "scope").addModifiers(Modifier.PRIVATE, Modifier.FINAL).build()); + + opsBuilder.addMethod( + MethodSpec.methodBuilder("scope") + .addModifiers(Modifier.PUBLIC, Modifier.FINAL) + .returns(T_SCOPE) + .addStatement("return scope") + .addJavadoc("Returns the current {@link $T scope} of this API\n", T_SCOPE) + .build()); + + for (Map.Entry<String, ClassName> entry : groupToClass.entrySet()) { + opsBuilder.addField( + FieldSpec.builder(entry.getValue(), entry.getKey()) + .addModifiers(Modifier.PUBLIC, Modifier.FINAL) + .build()); + + opsBuilder.addMethod( + MethodSpec.methodBuilder(entry.getKey()) + .addModifiers(Modifier.PUBLIC, Modifier.FINAL) + .returns(entry.getValue()) + .addStatement("return $L", entry.getKey()) + .addJavadoc( + "Returns an API for adding {@code $L} operations to the graph\n", entry.getKey()) + .build()); + } + + opsBuilder.addMethod( + MethodSpec.methodBuilder("create") + .addModifiers(Modifier.PUBLIC, Modifier.STATIC) + .addParameter(T_GRAPH, "graph") + .returns(T_OPS) + .addStatement("return new Ops(new $T(graph))", T_SCOPE) + .addJavadoc("Creates an API for adding operations to the provided {@code graph}\n") + .build()); + + return opsBuilder.build(); + } + + private static AnnotationMirror getAnnotationMirror(Element element, TypeElement annotation) { + for (AnnotationMirror am : element.getAnnotationMirrors()) { + if (am.getAnnotationType().asElement().equals(annotation)) { + return am; + } + } + throw new IllegalArgumentException( + "Annotation " + + annotation.getSimpleName() + + " not present on element " + + element.getSimpleName()); + } + + private static String getAnnotationElementValueAsString(String elementName, AnnotationMirror am) { + for (Map.Entry<? extends ExecutableElement, ? 
extends AnnotationValue> entry : + am.getElementValues().entrySet()) { + if (entry.getKey().getSimpleName().contentEquals(elementName)) { + return entry.getValue().getValue().toString(); + } + } + return ""; + } } diff --git a/tensorflow/python/estimator/canned/baseline.py b/tensorflow/python/estimator/canned/baseline.py index 78d18e41ed..20c7a69b7c 100644 --- a/tensorflow/python/estimator/canned/baseline.py +++ b/tensorflow/python/estimator/canned/baseline.py @@ -24,10 +24,10 @@ Example: classifier = BaselineClassifier(n_classes=3) # Input builders -def input_fn_train: # returns x, y (where y represents label's class index). +def input_fn_train(): # returns x, y (where y represents label's class index). pass -def input_fn_eval: # returns x, y (where y represents label's class index). +def input_fn_eval(): # returns x, y (where y represents label's class index). pass # Fit model. diff --git a/tensorflow/python/estimator/export/export.py b/tensorflow/python/estimator/export/export.py index 010c0f3f59..ca26341445 100644 --- a/tensorflow/python/estimator/export/export.py +++ b/tensorflow/python/estimator/export/export.py @@ -333,11 +333,7 @@ def build_raw_serving_input_receiver_fn(features, default_batch_size=None): """A serving_input_receiver_fn that expects features to be fed directly.""" receiver_tensors = _placeholders_from_receiver_tensors_dict( features, default_batch_size) - - # TODO(b/34885899): remove the unnecessary copy - # The features provided are simply the placeholders, but we defensively copy - # the dict because it may be mutated. 
- return ServingInputReceiver(receiver_tensors, receiver_tensors.copy()) + return ServingInputReceiver(receiver_tensors, receiver_tensors) return serving_input_receiver_fn diff --git a/tensorflow/python/keras/datasets/boston_housing.py b/tensorflow/python/keras/datasets/boston_housing.py index 4c4cab8c08..eeb7cbc44a 100644 --- a/tensorflow/python/keras/datasets/boston_housing.py +++ b/tensorflow/python/keras/datasets/boston_housing.py @@ -45,10 +45,9 @@ def load_data(path='boston_housing.npz', test_split=0.2, seed=113): origin=origin_folder + 'boston_housing.npz', file_hash= 'f553886a1f8d56431e820c5b82552d9d95cfcb96d1e678153f8839538947dff5') - f = np.load(path) - x = f['x'] - y = f['y'] - f.close() + with np.load(path) as f: + x = f['x'] + y = f['y'] np.random.seed(seed) indices = np.arange(len(x)) diff --git a/tensorflow/python/keras/datasets/mnist.py b/tensorflow/python/keras/datasets/mnist.py index 03564accc7..a96b581960 100644 --- a/tensorflow/python/keras/datasets/mnist.py +++ b/tensorflow/python/keras/datasets/mnist.py @@ -47,8 +47,8 @@ def load_data(path='mnist.npz'): path, origin=origin_folder + 'mnist.npz', file_hash='8a61469f7ea1b51cbae51d4f78837e45') - f = np.load(path) - x_train, y_train = f['x_train'], f['y_train'] - x_test, y_test = f['x_test'], f['y_test'] - f.close() - return (x_train, y_train), (x_test, y_test) + with np.load(path) as f: + x_train, y_train = f['x_train'], f['y_train'] + x_test, y_test = f['x_test'], f['y_test'] + + return (x_train, y_train), (x_test, y_test) diff --git a/tensorflow/python/keras/datasets/reuters.py b/tensorflow/python/keras/datasets/reuters.py index 2120b4b242..cb796bb06c 100644 --- a/tensorflow/python/keras/datasets/reuters.py +++ b/tensorflow/python/keras/datasets/reuters.py @@ -130,7 +130,5 @@ def get_word_index(path='reuters_word_index.json'): path, origin=origin_folder + 'reuters_word_index.json', file_hash='4d44cc38712099c9e383dc6e5f11a921') - f = open(path) - data = json.load(f) - f.close() - return data + 
with open(path) as f: + return json.load(f) diff --git a/tensorflow/python/keras/layers/__init__.py b/tensorflow/python/keras/layers/__init__.py index 3234c05be0..e3a686f45d 100644 --- a/tensorflow/python/keras/layers/__init__.py +++ b/tensorflow/python/keras/layers/__init__.py @@ -87,9 +87,11 @@ from tensorflow.python.keras.layers.local import LocallyConnected2D # Merge layers. from tensorflow.python.keras.layers.merge import Add +from tensorflow.python.keras.layers.merge import Subtract from tensorflow.python.keras.layers.merge import Multiply from tensorflow.python.keras.layers.merge import Average from tensorflow.python.keras.layers.merge import Maximum +from tensorflow.python.keras.layers.merge import Minimum from tensorflow.python.keras.layers.merge import Concatenate from tensorflow.python.keras.layers.merge import Dot from tensorflow.python.keras.layers.merge import add diff --git a/tensorflow/python/keras/layers/merge.py b/tensorflow/python/keras/layers/merge.py index 770665c5fb..f295af3fe0 100644 --- a/tensorflow/python/keras/layers/merge.py +++ b/tensorflow/python/keras/layers/merge.py @@ -250,6 +250,7 @@ class Add(_Merge): return output +@tf_export('keras.layers.Subtract') class Subtract(_Merge): """Layer that subtracts two inputs. @@ -336,6 +337,7 @@ class Maximum(_Merge): return output +@tf_export('keras.layers.Minimum') class Minimum(_Merge): """Layer that computes the minimum (element-wise) a list of inputs. @@ -586,6 +588,7 @@ def add(inputs, **kwargs): return Add(**kwargs)(inputs) +@tf_export('keras.layers.subtract') def subtract(inputs, **kwargs): """Functional interface to the `Subtract` layer. @@ -656,6 +659,7 @@ def maximum(inputs, **kwargs): return Maximum(**kwargs)(inputs) +@tf_export('keras.layers.minimum') def minimum(inputs, **kwargs): """Functional interface to the `Minimum` layer. 
diff --git a/tensorflow/python/kernel_tests/dynamic_stitch_op_test.py b/tensorflow/python/kernel_tests/dynamic_stitch_op_test.py index 159cba5fa3..c4d4ce780b 100644 --- a/tensorflow/python/kernel_tests/dynamic_stitch_op_test.py +++ b/tensorflow/python/kernel_tests/dynamic_stitch_op_test.py @@ -27,7 +27,6 @@ from tensorflow.python.ops import data_flow_ops from tensorflow.python.ops import gradients_impl import tensorflow.python.ops.data_flow_grad # pylint: disable=unused-import from tensorflow.python.platform import test -from tensorflow.python.framework import dtypes class DynamicStitchTestBase(object): diff --git a/tensorflow/python/lib/core/numpy.h b/tensorflow/python/lib/core/numpy.h index 25322b458b..d4621d61ee 100644 --- a/tensorflow/python/lib/core/numpy.h +++ b/tensorflow/python/lib/core/numpy.h @@ -29,7 +29,9 @@ limitations under the License. #define NO_IMPORT_ARRAY #endif +// Place `<locale>` before <Python.h> to avoid build failure in macOS. #include <Python.h> +#include <locale> #include "numpy/arrayobject.h" #include "numpy/ufuncobject.h" diff --git a/tensorflow/python/lib/core/py_util.cc b/tensorflow/python/lib/core/py_util.cc index dcda1f4a44..6b6c82015f 100644 --- a/tensorflow/python/lib/core/py_util.cc +++ b/tensorflow/python/lib/core/py_util.cc @@ -15,7 +15,9 @@ limitations under the License. #include "tensorflow/python/lib/core/py_util.h" +// Place `<locale>` before <Python.h> to avoid build failure in macOS. 
#include <Python.h> +#include <locale> #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/strings/strcat.h" diff --git a/tensorflow/python/ops/image_ops_impl.py b/tensorflow/python/ops/image_ops_impl.py index f27d9224c1..a2eae452ae 100644 --- a/tensorflow/python/ops/image_ops_impl.py +++ b/tensorflow/python/ops/image_ops_impl.py @@ -57,6 +57,7 @@ ops.NotDifferentiable('NonMaxSuppression') ops.NotDifferentiable('NonMaxSuppressionV2') +# pylint: disable=invalid-name def _assert(cond, ex_type, msg): """A polymorphic assert, works with tensors and boolean expressions. @@ -945,7 +946,7 @@ def resize_images(images, Resized images will be distorted if their original aspect ratio is not the same as `size`. To avoid distortions see - @{tf.image.resize_image_with_crop_or_pad}. + @{tf.image.resize_image_with_pad}. `method` can be one of: @@ -1069,6 +1070,106 @@ def resize_images(images, return images +@tf_export('image.resize_image_with_pad') +def resize_image_with_pad(image, + target_height, + target_width, + method=ResizeMethod.BILINEAR): + """Resizes and pads an image to a target width and height. + + Resizes an image to a target width and height by keeping + the aspect ratio the same without distortion. If the target + dimensions don't match the image dimensions, the image + is resized and then padded with zeroes to match requested + dimensions. + + Args: + image: 4-D Tensor of shape `[batch, height, width, channels]` or + 3-D Tensor of shape `[height, width, channels]`. + target_height: Target height. + target_width: Target width. + method: Method to use for resizing image. See `resize_images()` + + Raises: + ValueError: if `target_height` or `target_width` are zero or negative. + + Returns: + Resized and padded image. + If `image` was 4-D, a 4-D float Tensor of shape + `[batch, new_height, new_width, channels]`. + If `image` was 3-D, a 3-D float Tensor of shape + `[new_height, new_width, channels]`. 
+ """ + with ops.name_scope(None, 'resize_image_with_pad', [image]): + image = ops.convert_to_tensor(image, name='image') + image_shape = image.get_shape() + is_batch = True + if image_shape.ndims == 3: + is_batch = False + image = array_ops.expand_dims(image, 0) + elif image_shape.ndims is None: + is_batch = False + image = array_ops.expand_dims(image, 0) + image.set_shape([None] * 4) + elif image_shape.ndims != 4: + raise ValueError('\'image\' must have either 3 or 4 dimensions.') + + assert_ops = _CheckAtLeast3DImage(image, require_static=False) + assert_ops += _assert(target_width > 0, ValueError, + 'target_width must be > 0.') + assert_ops += _assert(target_height > 0, ValueError, + 'target_height must be > 0.') + + image = control_flow_ops.with_dependencies(assert_ops, image) + + def max_(x, y): + if _is_tensor(x) or _is_tensor(y): + return math_ops.maximum(x, y) + else: + return max(x, y) + + _, height, width, _ = _ImageDimensions(image, rank=4) + + # convert values to float, to ease divisions + f_height = math_ops.cast(height, dtype=dtypes.float64) + f_width = math_ops.cast(width, dtype=dtypes.float64) + f_target_height = math_ops.cast(target_height, dtype=dtypes.float64) + f_target_width = math_ops.cast(target_width, dtype=dtypes.float64) + + # Find the ratio by which the image must be adjusted + # to fit within the target + ratio = max_(f_width / f_target_width, f_height / f_target_height) + resized_height_float = f_height / ratio + resized_width_float = f_width / ratio + resized_height = math_ops.cast( + math_ops.floor(resized_height_float), dtype=dtypes.int32) + resized_width = math_ops.cast( + math_ops.floor(resized_width_float), dtype=dtypes.int32) + + padding_height = (f_target_height - resized_height_float) / 2 + padding_width = (f_target_width - resized_width_float) / 2 + f_padding_height = math_ops.floor(padding_height) + f_padding_width = math_ops.floor(padding_width) + p_height = max_(0, math_ops.cast(f_padding_height, dtype=dtypes.int32)) + 
p_width = max_(0, math_ops.cast(f_padding_width, dtype=dtypes.int32)) + + # Resize first, then pad to meet requested dimensions + resized = resize_images(image, [resized_height, resized_width], method) + + padded = pad_to_bounding_box(resized, p_height, p_width, target_height, + target_width) + + if padded.get_shape().ndims is None: + raise ValueError('padded contains no shape.') + + _ImageDimensions(padded, rank=4) + + if not is_batch: + padded = array_ops.squeeze(padded, squeeze_dims=[0]) + + return padded + + @tf_export('image.per_image_standardization') def per_image_standardization(image): """Linearly scales `image` to have zero mean and unit norm. diff --git a/tensorflow/python/ops/image_ops_test.py b/tensorflow/python/ops/image_ops_test.py index 2a6ab26e96..cf9761803b 100644 --- a/tensorflow/python/ops/image_ops_test.py +++ b/tensorflow/python/ops/image_ops_test.py @@ -2680,6 +2680,102 @@ class ResizeImagesTest(test_util.TensorFlowTestCase): self._assertResizeCheckShape(x, x_shape, [3840, 2160], [3840, 2160, 3]) +class ResizeImageWithPadTest(test_util.TensorFlowTestCase): + + def _ResizeImageWithPad(self, x, target_height, target_width, + use_tensor_inputs): + if use_tensor_inputs: + target_height = ops.convert_to_tensor(target_height) + target_width = ops.convert_to_tensor(target_width) + x_tensor = array_ops.placeholder(x.dtype, shape=[None] * x.ndim) + feed_dict = {x_tensor: x} + else: + x_tensor = x + feed_dict = {} + + y = image_ops.resize_image_with_pad(x_tensor, target_height, + target_width) + if not use_tensor_inputs: + self.assertTrue(y.get_shape().is_fully_defined()) + + with self.test_session(use_gpu=True): + return y.eval(feed_dict=feed_dict) + + def _assertReturns(self, + x, + x_shape, + y, + y_shape, + use_tensor_inputs_options=None): + use_tensor_inputs_options = use_tensor_inputs_options or [False, True] + target_height, target_width, _ = y_shape + x = np.array(x).reshape(x_shape) + y = np.array(y).reshape(y_shape) + + for use_tensor_inputs 
in use_tensor_inputs_options: + y_tf = self._ResizeImageWithPad(x, target_height, target_width, + use_tensor_inputs) + self.assertAllClose(y, y_tf) + + def _assertRaises(self, + x, + x_shape, + target_height, + target_width, + err_msg, + use_tensor_inputs_options=None): + use_tensor_inputs_options = use_tensor_inputs_options or [False, True] + x = np.array(x).reshape(x_shape) + + for use_tensor_inputs in use_tensor_inputs_options: + try: + self._ResizeImageWithPad(x, target_height, target_width, + use_tensor_inputs) + except Exception as e: # pylint: disable=broad-except + if err_msg not in str(e): + raise + else: + raise AssertionError("Exception not raised: %s" % err_msg) + + def _assertShapeInference(self, pre_shape, height, width, post_shape): + image = array_ops.placeholder(dtypes.float32, shape=pre_shape) + y = image_ops.resize_image_with_pad(image, height, width) + self.assertEqual(y.get_shape().as_list(), post_shape) + + def testNoOp(self): + x_shape = [10, 10, 10] + x = np.random.uniform(size=x_shape) + + self._assertReturns(x, x_shape, x, x_shape) + + def testPad(self): + # Reduce vertical dimension + x = [1, 2, 3, 4, 5, 6, 7, 8] + x_shape = [2, 4, 1] + + y = [0, 1, 3, 0] + y_shape = [1, 4, 1] + + self._assertReturns(x, x_shape, y, y_shape) + + # Reduce horizontal dimension + x = [1, 2, 3, 4, 5, 6, 7, 8] + x_shape = [2, 4, 1] + + y = [1, 3, 0, 0] + y_shape = [2, 2, 1] + + self._assertReturns(x, x_shape, y, y_shape) + + x = [1, 2, 3, 4, 5, 6, 7, 8] + x_shape = [2, 4, 1] + + y = [1, 3] + y_shape = [1, 2, 1] + + self._assertReturns(x, x_shape, y, y_shape) + + class ResizeImageWithCropOrPadTest(test_util.TensorFlowTestCase): def _ResizeImageWithCropOrPad(self, x, target_height, target_width, diff --git a/tensorflow/python/ops/math_ops_test.py b/tensorflow/python/ops/math_ops_test.py index 8417d8a7b1..6b709e5e7f 100644 --- a/tensorflow/python/ops/math_ops_test.py +++ b/tensorflow/python/ops/math_ops_test.py @@ -235,6 +235,15 @@ class 
ApproximateEqualTest(test_util.TensorFlowTestCase): z_tf = self.evaluate(math_ops.approximate_equal(x, y, tolerance=0.0001)) self.assertAllEqual(z, z_tf) + def testApproximateEqualShape(self): + for dtype in [np.float32, np.double]: + x = np.array([1, 2], dtype=dtype) + y = np.array([[1, 2]], dtype=dtype) + # The inputs 'x' and 'y' must have the same shape. + with self.assertRaisesRegexp( + ValueError, "Shapes must be equal rank, but are 1 and 2"): + math_ops.approximate_equal(x, y) + class ScalarMulTest(test_util.TensorFlowTestCase): diff --git a/tensorflow/python/ops/special_math_ops.py b/tensorflow/python/ops/special_math_ops.py index d06b0c318d..9a10abfcf7 100644 --- a/tensorflow/python/ops/special_math_ops.py +++ b/tensorflow/python/ops/special_math_ops.py @@ -201,6 +201,8 @@ def einsum(equation, *inputs, **kwargs): indices in its subscript, or - the input shapes are inconsistent along a particular axis. """ + equation = equation.replace(' ', '') + name = kwargs.pop('name', None) if kwargs: raise TypeError('invalid keyword arguments for this function: ' + ', '.join( diff --git a/tensorflow/python/ops/special_math_ops_test.py b/tensorflow/python/ops/special_math_ops_test.py index 8646e48571..9bc4098d5b 100644 --- a/tensorflow/python/ops/special_math_ops_test.py +++ b/tensorflow/python/ops/special_math_ops_test.py @@ -241,6 +241,12 @@ class EinsumTest(test.TestCase): 'iJ,Jk->ik', 'iJ,Ki->JK', 'iJk,Jklm->Jk' + 'ij, jk, kl -> il', + 'a, ab, abc -> abc', + 'ab, ab, cd, cd, ef, ef -> ', + 'abc, bac', + 'iJ, Ki -> JK', + 'iJk, Jklm -> Jk' ] long_cases = [ @@ -249,6 +255,8 @@ class EinsumTest(test.TestCase): 'ea,fb,gc,hd,abcd->efgh', 'ea,fb,abcd,gc,hd->efgh', 'abhe,hidj,jgba,hiab,gab', + 'efc, dbc, acf, fd -> abe', + 'abhe, hidj, jgba, hiab, gab', ] invalid_cases = [ @@ -319,7 +327,7 @@ class EinsumTest(test.TestCase): input_axes, _, _ = axes.partition('->') for idx in input_axes.split(','): - shape = [all_axes[ax] for ax in idx] + shape = [all_axes[ax] for ax in idx 
if ax.isalpha()] input_vals.append(np.random.random(shape)) input_tensors = [constant_op.constant(val) for val in input_vals] diff --git a/tensorflow/python/ops/state_ops.py b/tensorflow/python/ops/state_ops.py index 08b7cda73b..8cb6a0537e 100644 --- a/tensorflow/python/ops/state_ops.py +++ b/tensorflow/python/ops/state_ops.py @@ -394,7 +394,7 @@ def scatter_add(ref, indices, updates, use_locking=False, name=None): A tensor of indices into the first dimension of `ref`. updates: A `Tensor`. Must have the same type as `ref`. A tensor of updated values to store in `ref`. - use_locking: An optional `bool`. Defaults to `True`. + use_locking: An optional `bool`. Defaults to `False`. If True, the assignment will be protected by a lock; otherwise the behavior is undefined, but may exhibit less contention. name: A name for the operation (optional). @@ -458,7 +458,7 @@ def scatter_nd_add(ref, indices, updates, use_locking=False, name=None): A tensor of indices into ref. updates: A `Tensor`. Must have the same type as `ref`. A tensor of updated values to add to ref. - use_locking: An optional `bool`. Defaults to `True`. + use_locking: An optional `bool`. Defaults to `False`. An optional bool. Defaults to True. If True, the assignment will be protected by a lock; otherwise the behavior is undefined, but may exhibit less contention. diff --git a/tensorflow/python/training/checkpoint_utils.py b/tensorflow/python/training/checkpoint_utils.py index c2f0e9d3e6..5b372e82b3 100644 --- a/tensorflow/python/training/checkpoint_utils.py +++ b/tensorflow/python/training/checkpoint_utils.py @@ -147,7 +147,7 @@ def init_from_checkpoint(ckpt_dir_or_file, assignment_map): partitioner=lambda shape, dtype: [5, 1]) # Initialize all variables in `new_scope_1` from `old_scope_1`. - init_from_checkpoint('/tmp/model.ckpt', {'old_scope_1/', 'new_scope_1'}) + init_from_checkpoint('/tmp/model.ckpt', {'old_scope_1/': 'new_scope_1'}) # Use names to specify which variables to initialize from checkpoint. 
init_from_checkpoint('/tmp/model.ckpt', diff --git a/tensorflow/tf_framework_version_script.lds b/tensorflow/tf_framework_version_script.lds new file mode 100644 index 0000000000..d4977f88c0 --- /dev/null +++ b/tensorflow/tf_framework_version_script.lds @@ -0,0 +1,11 @@ +VERS_1.0 { + # Hide libjpeg symbols to avoid symbol conflict with OpenCV + local: + jpeg_*; + jinit_*; + jdiv_round_up; + jround_up; + jzero_far; + jcopy_*; + jsimd_*; +}; diff --git a/tensorflow/tools/api/golden/tensorflow.image.pbtxt b/tensorflow/tools/api/golden/tensorflow.image.pbtxt index 5398d3cf28..e89b4dbffd 100644 --- a/tensorflow/tools/api/golden/tensorflow.image.pbtxt +++ b/tensorflow/tools/api/golden/tensorflow.image.pbtxt @@ -177,6 +177,10 @@ tf_module { argspec: "args=[\'image\', \'target_height\', \'target_width\'], varargs=None, keywords=None, defaults=None" } member_method { + name: "resize_image_with_pad" + argspec: "args=[\'image\', \'target_height\', \'target_width\', \'method\'], varargs=None, keywords=None, defaults=[\'0\'], " + } + member_method { name: "resize_images" argspec: "args=[\'images\', \'size\', \'method\', \'align_corners\', \'preserve_aspect_ratio\'], varargs=None, keywords=None, defaults=[\'0\', \'False\', \'False\'], " } diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-minimum.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-minimum.pbtxt new file mode 100644 index 0000000000..56e32e9d36 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-minimum.pbtxt @@ -0,0 +1,176 @@ +path: "tensorflow.keras.layers.Minimum" +tf_class { + is_instance: "<class \'tensorflow.python.keras.layers.merge.Minimum\'>" + is_instance: "<class \'tensorflow.python.keras.layers.merge._Merge\'>" + is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>" + is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>" + is_instance: "<type \'object\'>" + member { + name: "activity_regularizer" + 
mtype: "<type \'property\'>" + } + member { + name: "dtype" + mtype: "<type \'property\'>" + } + member { + name: "inbound_nodes" + mtype: "<type \'property\'>" + } + member { + name: "input" + mtype: "<type \'property\'>" + } + member { + name: "input_mask" + mtype: "<type \'property\'>" + } + member { + name: "input_shape" + mtype: "<type \'property\'>" + } + member { + name: "losses" + mtype: "<type \'property\'>" + } + member { + name: "name" + mtype: "<type \'property\'>" + } + member { + name: "non_trainable_variables" + mtype: "<type \'property\'>" + } + member { + name: "non_trainable_weights" + mtype: "<type \'property\'>" + } + member { + name: "outbound_nodes" + mtype: "<type \'property\'>" + } + member { + name: "output" + mtype: "<type \'property\'>" + } + member { + name: "output_mask" + mtype: "<type \'property\'>" + } + member { + name: "output_shape" + mtype: "<type \'property\'>" + } + member { + name: "trainable_variables" + mtype: "<type \'property\'>" + } + member { + name: "trainable_weights" + mtype: "<type \'property\'>" + } + member { + name: "updates" + mtype: "<type \'property\'>" + } + member { + name: "variables" + mtype: "<type \'property\'>" + } + member { + name: "weights" + mtype: "<type \'property\'>" + } + member_method { + name: "__init__" + argspec: "args=[\'self\'], varargs=None, keywords=kwargs, defaults=None" + } + member_method { + name: "add_loss" + argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "add_update" + argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "add_variable" + argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None" + } + member_method { + name: "add_weight" + argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', 
\'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], " + } + member_method { + name: "apply" + argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" + } + member_method { + name: "build" + argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "call" + argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "compute_mask" + argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "compute_output_shape" + argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "count_params" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "from_config" + argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_input_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_input_mask_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_input_shape_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_losses_for" + argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_output_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_output_mask_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + 
member_method { + name: "get_output_shape_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_updates_for" + argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_weights" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "set_weights" + argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-subtract.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-subtract.pbtxt new file mode 100644 index 0000000000..35ad87ad5d --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-subtract.pbtxt @@ -0,0 +1,176 @@ +path: "tensorflow.keras.layers.Subtract" +tf_class { + is_instance: "<class \'tensorflow.python.keras.layers.merge.Subtract\'>" + is_instance: "<class \'tensorflow.python.keras.layers.merge._Merge\'>" + is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>" + is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>" + is_instance: "<type \'object\'>" + member { + name: "activity_regularizer" + mtype: "<type \'property\'>" + } + member { + name: "dtype" + mtype: "<type \'property\'>" + } + member { + name: "inbound_nodes" + mtype: "<type \'property\'>" + } + member { + name: "input" + mtype: "<type \'property\'>" + } + member { + name: "input_mask" + mtype: "<type \'property\'>" + } + member { + name: "input_shape" + mtype: "<type \'property\'>" + } + member { + name: "losses" + mtype: "<type \'property\'>" + } + member { + name: "name" + mtype: "<type \'property\'>" + } + member { + name: "non_trainable_variables" + mtype: "<type \'property\'>" + } + member { + name: "non_trainable_weights" + mtype: "<type \'property\'>" + } + member { + name: "outbound_nodes" + mtype: "<type \'property\'>" + } + 
member { + name: "output" + mtype: "<type \'property\'>" + } + member { + name: "output_mask" + mtype: "<type \'property\'>" + } + member { + name: "output_shape" + mtype: "<type \'property\'>" + } + member { + name: "trainable_variables" + mtype: "<type \'property\'>" + } + member { + name: "trainable_weights" + mtype: "<type \'property\'>" + } + member { + name: "updates" + mtype: "<type \'property\'>" + } + member { + name: "variables" + mtype: "<type \'property\'>" + } + member { + name: "weights" + mtype: "<type \'property\'>" + } + member_method { + name: "__init__" + argspec: "args=[\'self\'], varargs=None, keywords=kwargs, defaults=None" + } + member_method { + name: "add_loss" + argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "add_update" + argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "add_variable" + argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None" + } + member_method { + name: "add_weight" + argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], " + } + member_method { + name: "apply" + argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" + } + member_method { + name: "build" + argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "call" + argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "compute_mask" + argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "compute_output_shape" + argspec: 
"args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "count_params" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "from_config" + argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_input_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_input_mask_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_input_shape_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_losses_for" + argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_output_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_output_mask_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_output_shape_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_updates_for" + argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_weights" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "set_weights" + argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.pbtxt index 0df5a1b91e..9d7e5bb8c7 100644 --- 
a/tensorflow/tools/api/golden/tensorflow.keras.layers.pbtxt +++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.pbtxt @@ -281,6 +281,10 @@ tf_module { mtype: "<type \'type\'>" } member { + name: "Minimum" + mtype: "<type \'type\'>" + } + member { name: "Multiply" mtype: "<type \'type\'>" } @@ -353,6 +357,10 @@ tf_module { mtype: "<type \'type\'>" } member { + name: "Subtract" + mtype: "<type \'type\'>" + } + member { name: "ThresholdedReLU" mtype: "<type \'type\'>" } @@ -413,7 +421,15 @@ tf_module { argspec: "args=[\'inputs\'], varargs=None, keywords=kwargs, defaults=None" } member_method { + name: "minimum" + argspec: "args=[\'inputs\'], varargs=None, keywords=kwargs, defaults=None" + } + member_method { name: "multiply" argspec: "args=[\'inputs\'], varargs=None, keywords=kwargs, defaults=None" } + member_method { + name: "subtract" + argspec: "args=[\'inputs\'], varargs=None, keywords=kwargs, defaults=None" + } } diff --git a/tensorflow/tools/ci_build/Dockerfile.cpu.ppc64le b/tensorflow/tools/ci_build/Dockerfile.cpu.ppc64le new file mode 100644 index 0000000000..f496ac59b6 --- /dev/null +++ b/tensorflow/tools/ci_build/Dockerfile.cpu.ppc64le @@ -0,0 +1,19 @@ +FROM ubuntu:16.04 + +LABEL maintainer="William Irons <wdirons@us.ibm.com>" + +# Copy and run the install scripts. +COPY install/*.sh /install/ +RUN /install/install_bootstrap_deb_packages.sh +RUN add-apt-repository -y ppa:openjdk-r/ppa +RUN /install/install_deb_packages.sh +RUN apt-get update && apt-get install -y libopenblas-dev +RUN /install/install_pip_packages.sh +RUN /install/install_bazel_from_source.sh +RUN /install/install_proto3.sh +RUN /install/install_buildifier_from_source.sh +RUN /install/install_auditwheel.sh +RUN /install/install_golang_ppc64le.sh + +# Set up the master bazelrc configuration file. 
+COPY install/.bazelrc /etc/bazel.bazelrc diff --git a/tensorflow/tools/ci_build/Dockerfile.gpu.ppc64le b/tensorflow/tools/ci_build/Dockerfile.gpu.ppc64le new file mode 100644 index 0000000000..3eddc56550 --- /dev/null +++ b/tensorflow/tools/ci_build/Dockerfile.gpu.ppc64le @@ -0,0 +1,27 @@ +FROM nvidia/cuda-ppc64le:9.0-cudnn7-devel-ubuntu16.04 + +LABEL maintainer="William Irons <wdirons@us.ibm.com>" + +# In the Ubuntu 16.04 images, cudnn is placed in system paths. Move them to +# /usr/local/cuda +RUN cp -P /usr/include/cudnn.h /usr/local/cuda/include +RUN cp -P /usr/lib/powerpc64le-linux-gnu/libcudnn* /usr/local/cuda/lib64 + +# Copy and run the install scripts. +COPY install/*.sh /install/ +ARG DEBIAN_FRONTEND=noninteractive +RUN /install/install_bootstrap_deb_packages.sh +RUN add-apt-repository -y ppa:openjdk-r/ppa +RUN /install/install_deb_packages.sh +RUN apt-get update && apt-get install -y libopenblas-dev +RUN /install/install_pip_packages.sh +RUN /install/install_bazel_from_source.sh +RUN /install/install_golang_ppc64le.sh + +# Set up the master bazelrc configuration file. +COPY install/.bazelrc /etc/bazel.bazelrc +ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH + +# Configure the build for our CUDA configuration. +ENV TF_NEED_CUDA 1 +ENV TF_CUDA_COMPUTE_CAPABILITIES 3.0 diff --git a/tensorflow/tools/ci_build/ci_build.sh b/tensorflow/tools/ci_build/ci_build.sh index 1f0fd0387a..f6a50d3d4c 100755 --- a/tensorflow/tools/ci_build/ci_build.sh +++ b/tensorflow/tools/ci_build/ci_build.sh @@ -79,7 +79,7 @@ if [[ "${CONTAINER_TYPE}" == "cmake" ]]; then fi # Use nvidia-docker if the container is GPU. -if [[ "${CONTAINER_TYPE}" == "gpu" ]]; then +if [[ "${CONTAINER_TYPE}" == gpu* ]]; then DOCKER_BINARY="nvidia-docker" else DOCKER_BINARY="docker" @@ -99,7 +99,7 @@ BUILD_TAG="${BUILD_TAG:-tf_ci}" # Add extra params for cuda devices and libraries for GPU container. # And clear them if we are not building for GPU. 
-if [[ "${CONTAINER_TYPE}" != "gpu" ]]; then +if [[ "${CONTAINER_TYPE}" != gpu* ]]; then GPU_EXTRA_PARAMS="" fi diff --git a/tensorflow/tools/ci_build/ci_parameterized_build.sh b/tensorflow/tools/ci_build/ci_parameterized_build.sh index 90bd8bc3d0..300ba8ea0b 100755 --- a/tensorflow/tools/ci_build/ci_parameterized_build.sh +++ b/tensorflow/tools/ci_build/ci_parameterized_build.sh @@ -258,9 +258,9 @@ function set_script_variable() { # Process container type -if [[ ${CTYPE} == "cpu" ]] || [[ ${CTYPE} == "debian.jessie.cpu" ]]; then +if [[ ${CTYPE} == cpu* ]] || [[ ${CTYPE} == "debian.jessie.cpu" ]]; then : -elif [[ ${CTYPE} == "gpu" ]]; then +elif [[ ${CTYPE} == gpu* ]]; then set_script_variable TF_NEED_CUDA 1 if [[ $TF_CUDA_CLANG == "1" ]]; then @@ -418,12 +418,12 @@ if [[ ${TF_BUILD_IS_PIP} == "no_pip" ]] || BAZEL_TARGET=${TF_BUILD_BAZEL_TARGET} fi - if [[ ${CTYPE} == "cpu" ]] || \ + if [[ ${CTYPE} == cpu* ]] || \ [[ ${CTYPE} == "debian.jessie.cpu" ]]; then # CPU only command, fully parallel. NO_PIP_MAIN_CMD="${MAIN_CMD} ${BAZEL_CMD} ${OPT_FLAG} ${EXTRA_ARGS} -- "\ "${BAZEL_TARGET}" - elif [[ ${CTYPE} == "gpu" ]]; then + elif [[ ${CTYPE} == gpu* ]]; then # GPU only command, run as many jobs as the GPU count only. NO_PIP_MAIN_CMD="${BAZEL_CMD} ${OPT_FLAG} "\ "--local_test_jobs=${TF_GPU_COUNT} "\ diff --git a/tensorflow/tools/ci_build/install/install_bazel_from_source.sh b/tensorflow/tools/ci_build/install/install_bazel_from_source.sh new file mode 100755 index 0000000000..ddad00c5f0 --- /dev/null +++ b/tensorflow/tools/ci_build/install/install_bazel_from_source.sh @@ -0,0 +1,40 @@ +#!/usr/bin/env bash +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +# This script is to be used to install bazel on non x86_64 systems +# It will compile bazel from source and install it in /usr/local/bin + +# Select bazel version. +BAZEL_VERSION="0.11.0" + +set +e +local_bazel_ver=$(bazel version 2>&1 | grep -i label | awk '{print $3}') + +if [[ "$local_bazel_ver" == "$BAZEL_VERSION" ]]; then + exit 0 +fi + +set -e + +# Compile bazel from source +mkdir -p /bazel +cd /bazel + +curl -fSsL -O https://github.com/bazelbuild/bazel/releases/download/$BAZEL_VERSION/bazel-$BAZEL_VERSION-dist.zip +unzip bazel-$BAZEL_VERSION-dist.zip +bash ./compile.sh +cp output/bazel /usr/local/bin/ +rm -rf /bazel diff --git a/tensorflow/tools/ci_build/install/install_buildifier_from_source.sh b/tensorflow/tools/ci_build/install/install_buildifier_from_source.sh new file mode 100755 index 0000000000..a93c258fad --- /dev/null +++ b/tensorflow/tools/ci_build/install/install_buildifier_from_source.sh @@ -0,0 +1,30 @@ +#!/usr/bin/env bash +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +set -e +BUILDTOOLS_VERSION="0.11.1" + +# Clone buildtools +git clone -b $BUILDTOOLS_VERSION https://github.com/bazelbuild/buildtools +cd buildtools + +# Build buildifier +bazel build //buildifier +sudo mv bazel-bin/buildifier/linux*stripped/buildifier /usr/local/bin + +# Build buildozer +bazel build //buildozer +sudo mv bazel-bin/buildozer/linux*stripped/buildozer /usr/local/bin diff --git a/tensorflow/tools/ci_build/install/install_golang_ppc64le.sh b/tensorflow/tools/ci_build/install/install_golang_ppc64le.sh new file mode 100755 index 0000000000..47d23a59b3 --- /dev/null +++ b/tensorflow/tools/ci_build/install/install_golang_ppc64le.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env bash +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +set -ex + +GOLANG_URL="https://storage.googleapis.com/golang/go1.10.linux-ppc64le.tar.gz" + +sudo mkdir -p /usr/local +wget -q -O - "${GOLANG_URL}" | sudo tar -C /usr/local -xz diff --git a/tensorflow/tools/ci_build/install/install_pip_packages.sh b/tensorflow/tools/ci_build/install/install_pip_packages.sh index fbed4574e0..221b5b80fb 100755 --- a/tensorflow/tools/ci_build/install/install_pip_packages.sh +++ b/tensorflow/tools/ci_build/install/install_pip_packages.sh @@ -110,6 +110,10 @@ pip3 install --upgrade gast pip2 install --upgrade termcolor pip3 install --upgrade termcolor +# Install last working version of setuptools. +pip2 install --upgrade setuptools==39.1.0 +pip3 install --upgrade setuptools==39.1.0 + # Keras pip2 install keras_applications==1.0.2 pip3 install keras_applications==1.0.2 diff --git a/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh b/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh index 037fc0e2e1..45a30c6e82 100755 --- a/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh +++ b/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh @@ -81,6 +81,9 @@ pip3.5 install --upgrade astor pip3.5 install --upgrade gast pip3.5 install --upgrade termcolor +# Install last working version of setuptools. 
+pip3.5 install --upgrade setuptools==39.1.0 + # Keras pip3.5 install keras_applications==1.0.2 pip3.5 install keras_preprocessing==1.0.1 diff --git a/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh b/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh index 8fd65a3ee2..d66b2aa18a 100755 --- a/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh +++ b/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh @@ -97,11 +97,11 @@ pip3 install --upgrade astor pip3 install --upgrade gast pip3 install --upgrade termcolor +# Install last working version of setuptools. +pip3 install --upgrade setuptools==39.1.0 + # Keras pip3.5 install keras_applications==1.0.2 pip3.5 install keras_preprocessing==1.0.1 -# Install last working version of setuptools. -pip3 install --upgrade setuptools==39.1.0 - # LINT.ThenChange(//tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh) diff --git a/tensorflow/tools/ci_build/linux/gpu/run_mkl.sh b/tensorflow/tools/ci_build/linux/gpu/run_mkl.sh new file mode 100755 index 0000000000..50ee07e727 --- /dev/null +++ b/tensorflow/tools/ci_build/linux/gpu/run_mkl.sh @@ -0,0 +1,47 @@ +#!/usr/bin/env bash +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# ============================================================================== + +set -e +set -x + +N_JOBS=$(grep -c ^processor /proc/cpuinfo) + +echo "" +echo "Bazel will use ${N_JOBS} concurrent job(s)." +echo "" + +# Run configure. +export PYTHON_BIN_PATH=`which python2` + +export TF_NEED_CUDA=1 +export TF_CUDA_VERSION=9.0 +export TF_CUDNN_VERSION=7 +export TF_CUDA_COMPUTE_CAPABILITIES=3.7 + +yes "" | $PYTHON_BIN_PATH configure.py + +# Run bazel test command. Double test timeouts to avoid flakes. +# Setting KMP_BLOCKTIME to 0 lets OpenMP threads sleep right after parallel execution +# in an MKL primitive. This reduces the effects of an oversubscription of OpenMP threads +# caused by executing multiple tests concurrently. +bazel test --config=cuda --test_tag_filters=-no_oss,-oss_serial,-no_gpu,-benchmark-test \ + --test_lang_filters=cc,py -k --jobs="${N_JOBS}" \ + --test_timeout 300,450,1200,3600 --build_tests_only --test_env=KMP_BLOCKTIME=0\ + --config=mkl --config=opt --test_output=errors --local_test_jobs=8 \ + --run_under=//tensorflow/tools/ci_build/gpu_build:parallel_gpu_execute -- \ + //tensorflow/... -//tensorflow/compiler/... -//tensorflow/contrib/... + diff --git a/tensorflow/tools/ci_build/linux/mkl/basic-mkl-gpu-test.sh b/tensorflow/tools/ci_build/linux/mkl/basic-mkl-gpu-test.sh new file mode 100755 index 0000000000..68354bf7c1 --- /dev/null +++ b/tensorflow/tools/ci_build/linux/mkl/basic-mkl-gpu-test.sh @@ -0,0 +1,29 @@ +#!/usr/bin/env bash +# Copyright 2015 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +# +# Usage: basic-mkl-gpu-test.sh + +# Helper function to traverse directories up until given file is found. +function upsearch () { + test / == "$PWD" && return || \ + test -e "$1" && echo "$PWD" && return || \ + cd .. && upsearch "$1" +} + +# Set up WORKSPACE. +WORKSPACE="${WORKSPACE:-$(upsearch WORKSPACE)}" + +BUILD_TAG=mkl-gpu-ci-test CI_BUILD_USER_FORCE_BADNAME=yes ${WORKSPACE}/tensorflow/tools/ci_build/ci_build.sh gpu tensorflow/tools/ci_build/linux/gpu/run_mkl.sh diff --git a/tensorflow/tools/git/gen_git_source.py b/tensorflow/tools/git/gen_git_source.py index 73dee98bae..cc2288a7fa 100755 --- a/tensorflow/tools/git/gen_git_source.py +++ b/tensorflow/tools/git/gen_git_source.py @@ -164,14 +164,17 @@ def get_git_version(git_base_path, git_tag_override): "git", str("--git-dir=%s/.git" % git_base_path), str("--work-tree=" + git_base_path), "describe", "--long", "--tags" ]).strip()) - if git_tag_override: + if git_tag_override and val: split_val = val.split("-") - if len(split_val) != 3: + if len(split_val) < 3: raise Exception( ("Expected git version in format 'TAG-COMMITS AFTER TAG-HASH' " "but got '%s'") % val) - split_val[0] = git_tag_override - val = bytes("-".join(split_val)) + # There might be "-" in the tag name. But we can be sure that the final + # two "-" are those inserted by the git describe command. 
+ abbrev_commit = split_val[-1] + val = bytes( + "-".join([git_tag_override, "0", abbrev_commit])) return val if val else unknown_label except (subprocess.CalledProcessError, OSError): return unknown_label diff --git a/tensorflow/tools/lib_package/BUILD b/tensorflow/tools/lib_package/BUILD index 77f83b77a0..05c23cd3ee 100644 --- a/tensorflow/tools/lib_package/BUILD +++ b/tensorflow/tools/lib_package/BUILD @@ -130,7 +130,7 @@ genrule( "@highwayhash//:LICENSE", "@jemalloc//:COPYING", "@jpeg//:LICENSE.md", - "@libxsmm_archive//:LICENSE", + "@libxsmm_archive//:LICENSE.md", "@llvm//:LICENSE.TXT", "@lmdb//:LICENSE", "@local_config_sycl//sycl:LICENSE.text", @@ -168,7 +168,7 @@ genrule( "@highwayhash//:LICENSE", "@jemalloc//:COPYING", "@jpeg//:LICENSE.md", - "@libxsmm_archive//:LICENSE", + "@libxsmm_archive//:LICENSE.md", "@llvm//:LICENSE.TXT", "@lmdb//:LICENSE", "@local_config_sycl//sycl:LICENSE.text", diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD index 6cfd271968..a0caf42331 100644 --- a/tensorflow/tools/pip_package/BUILD +++ b/tensorflow/tools/pip_package/BUILD @@ -147,7 +147,7 @@ filegroup( "@jemalloc//:COPYING", "@jpeg//:LICENSE.md", "@kafka//:LICENSE", - "@libxsmm_archive//:LICENSE", + "@libxsmm_archive//:LICENSE.md", "@lmdb//:LICENSE", "@local_config_nccl//:LICENSE", "@local_config_sycl//sycl:LICENSE.text", diff --git a/tensorflow/tools/pip_package/build_pip_package.sh b/tensorflow/tools/pip_package/build_pip_package.sh index f7e42ce536..9e41514cfa 100755 --- a/tensorflow/tools/pip_package/build_pip_package.sh +++ b/tensorflow/tools/pip_package/build_pip_package.sh @@ -24,9 +24,15 @@ function real_path() { function cp_external() { local src_dir=$1 local dest_dir=$2 - for f in `find "$src_dir" -maxdepth 1 -mindepth 1 ! -name '*local_config_cuda*' ! -name '*local_config_tensorrt*' ! -name '*org_tensorflow*'`; do - cp -R "$f" "$dest_dir" + + pushd . + cd "$src_dir" + for f in `find . ! -type d ! -name '*.py' ! 
-name '*local_config_cuda*' ! -name '*local_config_tensorrt*' ! -name '*org_tensorflow*'`; do + mkdir -p "${dest_dir}/$(dirname ${f})" + cp "${f}" "${dest_dir}/$(dirname ${f})/" done + popd + mkdir -p "${dest_dir}/local_config_cuda/cuda/cuda/" cp "${src_dir}/local_config_cuda/cuda/cuda/cuda_config.h" "${dest_dir}/local_config_cuda/cuda/cuda/" } @@ -49,6 +55,8 @@ function prepare_src() { TMPDIR="$1" mkdir -p "$TMPDIR" + EXTERNAL_INCLUDES="${TMPDIR}/tensorflow/include/external" + echo $(date) : "=== Preparing sources in dir: ${TMPDIR}" if [ ! -d bazel-bin/tensorflow ]; then @@ -66,10 +74,9 @@ function prepare_src() { cp -R \ bazel-bin/tensorflow/tools/pip_package/simple_console_for_window_unzip/runfiles/org_tensorflow/tensorflow \ "${TMPDIR}" - mkdir "${TMPDIR}/external" cp_external \ bazel-bin/tensorflow/tools/pip_package/simple_console_for_window_unzip/runfiles \ - "${TMPDIR}/external" + "${EXTERNAL_INCLUDES}/" RUNFILES=bazel-bin/tensorflow/tools/pip_package/simple_console_for_window_unzip/runfiles/org_tensorflow else RUNFILES=bazel-bin/tensorflow/tools/pip_package/build_pip_package.runfiles/org_tensorflow @@ -78,10 +85,9 @@ function prepare_src() { cp -R \ bazel-bin/tensorflow/tools/pip_package/build_pip_package.runfiles/org_tensorflow/tensorflow \ "${TMPDIR}" - mkdir "${TMPDIR}/external" cp_external \ bazel-bin/tensorflow/tools/pip_package/build_pip_package.runfiles/org_tensorflow/external \ - "${TMPDIR}/external" + "${EXTERNAL_INCLUDES}" # Copy MKL libs over so they can be loaded at runtime so_lib_dir=$(ls $RUNFILES | grep solib) || true if [ -n "${so_lib_dir}" ]; then @@ -96,10 +102,9 @@ function prepare_src() { cp -R \ bazel-bin/tensorflow/tools/pip_package/build_pip_package.runfiles/org_tensorflow/tensorflow \ "${TMPDIR}" - mkdir "${TMPDIR}/external" cp_external \ bazel-bin/tensorflow/tools/pip_package/build_pip_package.runfiles \ - "${TMPDIR}/external" + "${EXTERNAL_INCLUDES}" # Copy MKL libs over so they can be loaded at runtime so_lib_dir=$(ls $RUNFILES | 
grep solib) || true if [ -n "${so_lib_dir}" ]; then diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py index 253802b959..55cd4f37c6 100644 --- a/tensorflow/tools/pip_package/setup.py +++ b/tensorflow/tools/pip_package/setup.py @@ -53,7 +53,7 @@ REQUIRED_PACKAGES = [ 'gast >= 0.2.0', 'numpy >= 1.13.3', 'six >= 1.10.0', - 'protobuf >= 3.6.0', + 'protobuf >= 3.4.0', 'setuptools <= 39.1.0', 'tensorboard >= 1.8.0, < 1.9.0', 'termcolor >= 1.1.0', @@ -84,7 +84,7 @@ else: if 'tf_nightly' in project_name: for i, pkg in enumerate(REQUIRED_PACKAGES): if 'tensorboard' in pkg: - REQUIRED_PACKAGES[i] = 'tb-nightly >= 1.9.0a0, < 1.10.0a0' + REQUIRED_PACKAGES[i] = 'tb-nightly >= 1.10.0a0, < 1.11.0a0' break # weakref.finalize and enum were introduced in Python 3.4 diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index 3a5e0d1163..ba679e0055 100644 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -131,11 +131,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""): tf_http_archive( name = "libxsmm_archive", urls = [ - "https://mirror.bazel.build/github.com/hfp/libxsmm/archive/1.8.1.tar.gz", - "https://github.com/hfp/libxsmm/archive/1.8.1.tar.gz", + "https://mirror.bazel.build/github.com/hfp/libxsmm/archive/1.9.tar.gz", + "https://github.com/hfp/libxsmm/archive/1.9.tar.gz", ], - sha256 = "2ade869c3f42f23b5263c7d594aa3c7e5e61ac6a3afcaf5d6e42899d2a7986ce", - strip_prefix = "libxsmm-1.8.1", + sha256 = "cd8532021352b4a0290d209f7f9bfd7c2411e08286a893af3577a43457287bfa", + strip_prefix = "libxsmm-1.9", build_file = clean_dep("//third_party:libxsmm.BUILD"), ) @@ -155,12 +155,12 @@ def tf_workspace(path_prefix="", tf_repo_name=""): tf_http_archive( name = "com_googlesource_code_re2", urls = [ - "https://mirror.bazel.build/github.com/google/re2/archive/26cd968b735e227361c9703683266f01e5df7857.tar.gz", - "https://github.com/google/re2/archive/26cd968b735e227361c9703683266f01e5df7857.tar.gz", + 
"https://mirror.bazel.build/github.com/google/re2/archive/2018-04-01.tar.gz", + "https://github.com/google/re2/archive/2018-04-01.tar.gz", ], - sha256 = "e57eeb837ac40b5be37b2c6197438766e73343ffb32368efea793dfd8b28653b", - strip_prefix = "re2-26cd968b735e227361c9703683266f01e5df7857", + sha256 = "2f945446b71336e7f5a2bcace1abcf0b23fbba368266c6a1be33de3de3b3c912", + strip_prefix = "re2-2018-04-01", ) tf_http_archive( @@ -200,6 +200,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""): urls = [ "https://mirror.bazel.build/www.nasm.us/pub/nasm/releasebuilds/2.12.02/nasm-2.12.02.tar.bz2", "http://pkgs.fedoraproject.org/repo/pkgs/nasm/nasm-2.12.02.tar.bz2/d15843c3fb7db39af80571ee27ec6fad/nasm-2.12.02.tar.bz2", + "http://www.nasm.us/pub/nasm/releasebuilds/2.12.02/nasm-2.12.02.tar.bz2", ], sha256 = "00b0891c678c065446ca59bcee64719d0096d54d6886e6e472aeee2e170ae324", strip_prefix = "nasm-2.12.02", @@ -298,11 +299,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""): tf_http_archive( name = "absl_py", urls = [ - "https://mirror.bazel.build/github.com/abseil/abseil-py/archive/ea8c4d2ddbf3fba610c4d613260561699b776db8.tar.gz", - "https://github.com/abseil/abseil-py/archive/ea8c4d2ddbf3fba610c4d613260561699b776db8.tar.gz", + "https://mirror.bazel.build/github.com/abseil/abseil-py/archive/pypi-v0.2.2.tar.gz", + "https://github.com/abseil/abseil-py/archive/pypi-v0.2.2.tar.gz", ], - sha256 = "c30b48e0d2580ef1412e55c5c0e1dab8db2ee4ab56e2075eccff29c90c7c7059", - strip_prefix = "abseil-py-ea8c4d2ddbf3fba610c4d613260561699b776db8", + sha256 = "95160f778a62c7a60ddeadc7bf2d83f85a23a27359814aca12cf949e896fa82c", + strip_prefix = "abseil-py-pypi-v0.2.2", ) tf_http_archive( @@ -392,12 +393,12 @@ def tf_workspace(path_prefix="", tf_repo_name=""): tf_http_archive( name = "pcre", - sha256 = "ccdf7e788769838f8285b3ee672ed573358202305ee361cfec7a4a4fb005bbc7", + sha256 = "69acbc2fbdefb955d42a4c606dfde800c2885711d2979e356c0636efde9ec3b5", urls = [ - 
"https://mirror.bazel.build/ftp.exim.org/pub/pcre/pcre-8.39.tar.gz", - "http://ftp.exim.org/pub/pcre/pcre-8.39.tar.gz", + "https://mirror.bazel.build/ftp.exim.org/pub/pcre/pcre-8.42.tar.gz", + "http://ftp.exim.org/pub/pcre/pcre-8.42.tar.gz", ], - strip_prefix = "pcre-8.39", + strip_prefix = "pcre-8.42", build_file = clean_dep("//third_party:pcre.BUILD"), ) @@ -415,12 +416,12 @@ def tf_workspace(path_prefix="", tf_repo_name=""): tf_http_archive( name = "curl", - sha256 = "ff3e80c1ca6a068428726cd7dd19037a47cc538ce58ef61c59587191039b2ca6", + sha256 = "e9c37986337743f37fd14fe8737f246e97aec94b39d1b71e8a5973f72a9fc4f5", urls = [ - "https://mirror.bazel.build/curl.haxx.se/download/curl-7.49.1.tar.gz", - "https://curl.haxx.se/download/curl-7.49.1.tar.gz", + "https://mirror.bazel.build/curl.haxx.se/download/curl-7.60.0.tar.gz", + "https://curl.haxx.se/download/curl-7.60.0.tar.gz", ], - strip_prefix = "curl-7.49.1", + strip_prefix = "curl-7.60.0", build_file = clean_dep("//third_party:curl.BUILD"), ) @@ -462,22 +463,22 @@ def tf_workspace(path_prefix="", tf_repo_name=""): tf_http_archive( name = "lmdb", urls = [ - "https://mirror.bazel.build/github.com/LMDB/lmdb/archive/LMDB_0.9.19.tar.gz", - "https://github.com/LMDB/lmdb/archive/LMDB_0.9.19.tar.gz", + "https://mirror.bazel.build/github.com/LMDB/lmdb/archive/LMDB_0.9.22.tar.gz", + "https://github.com/LMDB/lmdb/archive/LMDB_0.9.22.tar.gz", ], - sha256 = "108532fb94c6f227558d45be3f3347b52539f0f58290a7bb31ec06c462d05326", - strip_prefix = "lmdb-LMDB_0.9.19/libraries/liblmdb", + sha256 = "f3927859882eb608868c8c31586bb7eb84562a40a6bf5cc3e13b6b564641ea28", + strip_prefix = "lmdb-LMDB_0.9.22/libraries/liblmdb", build_file = clean_dep("//third_party:lmdb.BUILD"), ) tf_http_archive( name = "jsoncpp_git", urls = [ - "https://mirror.bazel.build/github.com/open-source-parsers/jsoncpp/archive/11086dd6a7eba04289944367ca82cea71299ed70.tar.gz", - 
"https://github.com/open-source-parsers/jsoncpp/archive/11086dd6a7eba04289944367ca82cea71299ed70.tar.gz", + "https://mirror.bazel.build/github.com/open-source-parsers/jsoncpp/archive/1.8.4.tar.gz", + "https://github.com/open-source-parsers/jsoncpp/archive/1.8.4.tar.gz", ], - sha256 = "07d34db40593d257324ec5fb9debc4dc33f29f8fb44e33a2eeb35503e61d0fe2", - strip_prefix = "jsoncpp-11086dd6a7eba04289944367ca82cea71299ed70", + sha256 = "c49deac9e0933bcb7044f08516861a2d560988540b23de2ac1ad443b219afdb6", + strip_prefix = "jsoncpp-1.8.4", build_file = clean_dep("//third_party:jsoncpp.BUILD"), ) @@ -627,6 +628,16 @@ def tf_workspace(path_prefix="", tf_repo_name=""): licenses = ["notice"], # Apache 2.0 ) + java_import_external( + name = "com_squareup_javapoet", + jar_sha256 = "5bb5abdfe4366c15c0da3332c57d484e238bd48260d6f9d6acf2b08fdde1efea", + jar_urls = [ + "http://mirror.bazel.build/repo1.maven.org/maven2/com/squareup/javapoet/1.9.0/javapoet-1.9.0.jar", + "http://repo1.maven.org/maven2/com/squareup/javapoet/1.9.0/javapoet-1.9.0.jar", + ], + licenses = ["notice"], # Apache 2.0 + ) + tf_http_archive( name = "com_google_pprof", urls = [ @@ -684,11 +695,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""): tf_http_archive( name = "flatbuffers", - strip_prefix = "flatbuffers-971a68110e4fc1bace10fcb6deeb189e7e1a34ce", - sha256 = "874088d2ee0d9f8524191f77209556415f03dd44e156276edf19e5b90ceb5f55", + strip_prefix = "flatbuffers-1.9.0", + sha256 = "5ca5491e4260cacae30f1a5786d109230db3f3a6e5a0eb45d0d0608293d247e3", urls = [ - "https://mirror.bazel.build/github.com/google/flatbuffers/archive/971a68110e4fc1bace10fcb6deeb189e7e1a34ce.tar.gz", - "https://github.com/google/flatbuffers/archive/971a68110e4fc1bace10fcb6deeb189e7e1a34ce.tar.gz", + "https://mirror.bazel.build/github.com/google/flatbuffers/archive/v1.9.0.tar.gz", + "https://github.com/google/flatbuffers/archive/v1.9.0.tar.gz", ], build_file = clean_dep("//third_party/flatbuffers:flatbuffers.BUILD"), ) @@ -793,6 +804,11 @@ def 
tf_workspace(path_prefix="", tf_repo_name=""): actual = "@grpc//:grpc++", ) + native.bind( + name = "grpc_lib_unsecure", + actual = "@grpc//:grpc++_unsecure", + ) + # Needed by gRPC native.bind( name = "libssl", diff --git a/third_party/curl.BUILD b/third_party/curl.BUILD index 4def6f9489..1638b72161 100644 --- a/third_party/curl.BUILD +++ b/third_party/curl.BUILD @@ -7,6 +7,7 @@ exports_files(["COPYING"]) CURL_WIN_COPTS = [ "/Iexternal/curl/lib", + "/DBUILDING_LIBCURL", "/DHAVE_CONFIG_H", "/DCURL_DISABLE_FTP", "/DCURL_DISABLE_NTLM", @@ -49,6 +50,8 @@ cc_library( "lib/curl_addrinfo.c", "lib/curl_addrinfo.h", "lib/curl_base64.h", + "lib/curl_ctype.c", + "lib/curl_ctype.h", "lib/curl_des.h", "lib/curl_endian.h", "lib/curl_fnmatch.c", @@ -75,6 +78,7 @@ cc_library( "lib/curl_sec.h", "lib/curl_setup.h", "lib/curl_setup_once.h", + "lib/curl_sha256.h", "lib/curl_sspi.c", "lib/curl_sspi.h", "lib/curl_threads.c", @@ -134,6 +138,8 @@ cc_library( "lib/md5.c", "lib/memdebug.c", "lib/memdebug.h", + "lib/mime.c", + "lib/mime.h", "lib/mprintf.c", "lib/multi.c", "lib/multihandle.h", @@ -153,8 +159,8 @@ cc_library( "lib/pop3.h", "lib/progress.c", "lib/progress.h", - "lib/rawstr.c", - "lib/rawstr.h", + "lib/rand.c", + "lib/rand.h", "lib/rtsp.c", "lib/rtsp.h", "lib/security.c", @@ -162,8 +168,11 @@ cc_library( "lib/select.h", "lib/sendf.c", "lib/sendf.h", + "lib/setopt.c", + "lib/setopt.h", "lib/setup-os400.h", "lib/setup-vms.h", + "lib/sha256.c", "lib/share.c", "lib/share.h", "lib/sigpipe.h", @@ -179,10 +188,10 @@ cc_library( "lib/splay.c", "lib/splay.h", "lib/ssh.h", + "lib/strcase.c", + "lib/strcase.h", "lib/strdup.c", "lib/strdup.h", - "lib/strequal.c", - "lib/strequal.h", "lib/strerror.c", "lib/strerror.h", "lib/strtok.c", @@ -241,13 +250,12 @@ cc_library( }), hdrs = [ "include/curl/curl.h", - "include/curl/curlbuild.h", - "include/curl/curlrules.h", "include/curl/curlver.h", "include/curl/easy.h", "include/curl/mprintf.h", "include/curl/multi.h", "include/curl/stdcheaders.h", + 
"include/curl/system.h", "include/curl/typecheck-gcc.h", ], copts = select({ @@ -256,6 +264,7 @@ cc_library( "//conditions:default": [ "-Iexternal/curl/lib", "-D_GNU_SOURCE", + "-DBUILDING_LIBCURL", "-DHAVE_CONFIG_H", "-DCURL_DISABLE_FTP", "-DCURL_DISABLE_NTLM", # turning it off in configure is not enough @@ -676,6 +685,7 @@ genrule( "# define SIZEOF_INT 4", "# define SIZEOF_LONG 8", "# define SIZEOF_OFF_T 8", + "# define SIZEOF_CURL_OFF_T 8", "# define SIZEOF_SHORT 2", "# define SIZEOF_SIZE_T 8", "# define SIZEOF_TIME_T 8", diff --git a/third_party/flatbuffers/flatbuffers.BUILD b/third_party/flatbuffers/flatbuffers.BUILD index 824c97be60..639dff2cd0 100644 --- a/third_party/flatbuffers/flatbuffers.BUILD +++ b/third_party/flatbuffers/flatbuffers.BUILD @@ -98,6 +98,8 @@ cc_binary( "grpc/src/compiler/cpp_generator.h", "grpc/src/compiler/go_generator.cc", "grpc/src/compiler/go_generator.h", + "grpc/src/compiler/java_generator.cc", + "grpc/src/compiler/java_generator.h", "grpc/src/compiler/schema_interface.h", "src/flatc_main.cpp", "src/idl_gen_cpp.cpp", diff --git a/third_party/jsoncpp.BUILD b/third_party/jsoncpp.BUILD index 65f98410b2..cf3cba0555 100644 --- a/third_party/jsoncpp.BUILD +++ b/third_party/jsoncpp.BUILD @@ -6,7 +6,6 @@ cc_library( name = "jsoncpp", srcs = [ "include/json/assertions.h", - "src/lib_json/json_batchallocator.h", "src/lib_json/json_reader.cpp", "src/lib_json/json_tool.h", "src/lib_json/json_value.cpp", @@ -20,9 +19,13 @@ cc_library( "include/json/json.h", "include/json/reader.h", "include/json/value.h", + "include/json/version.h", "include/json/writer.h", ], - copts = ["-DJSON_USE_EXCEPTION=0"], + copts = [ + "-DJSON_USE_EXCEPTION=0", + "-DJSON_HAS_INT64", + ], includes = ["include"], visibility = ["//visibility:public"], deps = [":private"], diff --git a/third_party/libxsmm.BUILD b/third_party/libxsmm.BUILD index 78ed1f4e16..ee49d281ab 100644 --- a/third_party/libxsmm.BUILD +++ b/third_party/libxsmm.BUILD @@ -3,7 +3,7 @@ licenses(["notice"]) 
# BSD 3-clause -exports_files(["LICENSE"]) +exports_files(["LICENSE.md"]) # Arguments to ./scripts/libxsmm_interface.py, see that file for detailed description. # precision: SP & DP |