author     Mingxing Tan <tanmingxing@google.com>              2018-06-28 19:13:20 -0700
committer  TensorFlower Gardener <gardener@tensorflow.org>    2018-06-28 19:16:41 -0700
commit     1e7b0e4ad6d0f57f3241fe0b80a65f2c2a7f11b0 (patch)
tree       af92d172cedfc41e544c01a349c1d3b30bc3ff85
parent     3cee10e61c1c90734317c62ea3388ec44acc8d08 (diff)
Merge changes from github.
PiperOrigin-RevId: 202585094
-rw-r--r--  .gitignore  1
-rw-r--r--  configure.py  72
-rw-r--r--  tensorflow/BUILD  26
-rw-r--r--  tensorflow/c/c_api.cc  6
-rw-r--r--  tensorflow/compiler/aot/codegen.cc  2
-rw-r--r--  tensorflow/compiler/xla/rpc/BUILD  6
-rw-r--r--  tensorflow/compiler/xla/service/BUILD  1
-rw-r--r--  tensorflow/compiler/xla/service/hlo_instruction.cc  1
-rw-r--r--  tensorflow/compiler/xla/service/hlo_instruction_test.cc  34
-rw-r--r--  tensorflow/contrib/autograph/converters/BUILD  5
-rw-r--r--  tensorflow/contrib/autograph/operators/control_flow.py  2
-rw-r--r--  tensorflow/contrib/autograph/pyct/static_analysis/cfg.py  2
-rw-r--r--  tensorflow/contrib/autograph/pyct/transformer.py  4
-rw-r--r--  tensorflow/contrib/cmake/CMakeLists.txt  36
-rw-r--r--  tensorflow/contrib/cmake/external/double_conversion.cmake  6
-rw-r--r--  tensorflow/contrib/cmake/external/mkl.cmake  68
-rw-r--r--  tensorflow/contrib/cmake/external/mkldnn.cmake  12
-rwxr-xr-x  tensorflow/contrib/cmake/tf_python.cmake  77
-rw-r--r--  tensorflow/contrib/cmake/tf_shared_lib.cmake  5
-rw-r--r--  tensorflow/contrib/constrained_optimization/README.md  2
-rw-r--r--  tensorflow/contrib/constrained_optimization/python/swap_regret_optimizer.py  8
-rw-r--r--  tensorflow/contrib/data/python/kernel_tests/slide_dataset_op_test.py  42
-rw-r--r--  tensorflow/contrib/data/python/ops/sliding.py  2
-rw-r--r--  tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb  909
-rw-r--r--  tensorflow/contrib/gan/python/estimator/python/head_impl.py  6
-rw-r--r--  tensorflow/contrib/gan/python/estimator/python/head_test.py  9
-rw-r--r--  tensorflow/contrib/gdr/gdr_server_lib.cc  2
-rw-r--r--  tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h  2
-rw-r--r--  tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.h  2
-rw-r--r--  tensorflow/contrib/opt/BUILD  20
-rw-r--r--  tensorflow/contrib/opt/__init__.py  11
-rw-r--r--  tensorflow/contrib/opt/python/training/weight_decay_optimizers.py  362
-rw-r--r--  tensorflow/contrib/opt/python/training/weight_decay_optimizers_test.py  188
-rw-r--r--  tensorflow/contrib/solvers/python/ops/linear_equations.py  1
-rw-r--r--  tensorflow/contrib/tensorrt/BUILD  20
-rw-r--r--  tensorflow/contrib/tensorrt/convert/convert_graph.cc  1027
-rw-r--r--  tensorflow/contrib/tensorrt/convert/convert_graph.h  61
-rw-r--r--  tensorflow/contrib/tensorrt/convert/convert_nodes.cc  801
-rw-r--r--  tensorflow/contrib/tensorrt/convert/convert_nodes.h  133
-rw-r--r--  tensorflow/contrib/tensorrt/convert/trt_optimization_pass.cc  48
-rw-r--r--  tensorflow/contrib/tensorrt/convert/trt_optimization_pass.h  3
-rw-r--r--  tensorflow/contrib/tensorrt/convert/utils.h  37
-rw-r--r--  tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc  588
-rw-r--r--  tensorflow/contrib/tensorrt/kernels/trt_engine_op.h  98
-rw-r--r--  tensorflow/contrib/tensorrt/ops/trt_engine_op.cc  18
-rw-r--r--  tensorflow/contrib/tensorrt/python/trt_convert.py  55
-rw-r--r--  tensorflow/contrib/tensorrt/resources/trt_allocator.cc  2
-rw-r--r--  tensorflow/contrib/tensorrt/resources/trt_allocator.h  5
-rw-r--r--  tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.cc  34
-rw-r--r--  tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.h  35
-rw-r--r--  tensorflow/contrib/tensorrt/resources/trt_resources.h  49
-rw-r--r--  tensorflow/contrib/tensorrt/segment/segment.h  7
-rw-r--r--  tensorflow/contrib/tensorrt/shape_fn/trt_shfn.cc  76
-rw-r--r--  tensorflow/contrib/tensorrt/test/test_tftrt.py  138
-rw-r--r--  tensorflow/contrib/tensorrt/trt_conversion.i  98
-rw-r--r--  tensorflow/contrib/tpu/profiler/BUILD  2
-rw-r--r--  tensorflow/contrib/verbs/BUILD  4
-rw-r--r--  tensorflow/core/api_def/BUILD  7
-rw-r--r--  tensorflow/core/api_def/base_api/api_def_SampleDistortedBoundingBox.pbtxt  2
-rw-r--r--  tensorflow/core/api_def/base_api/api_def_SampleDistortedBoundingBoxV2.pbtxt  2
-rw-r--r--  tensorflow/core/api_def/base_api/api_def_SlideDataset.pbtxt  2
-rw-r--r--  tensorflow/core/api_def/java_api/api_def_Assert.pbtxt  4
-rw-r--r--  tensorflow/core/api_def/java_api/api_def_Const.pbtxt  4
-rw-r--r--  tensorflow/core/api_def/java_api/api_def_Switch.pbtxt  4
-rw-r--r--  tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc  18
-rw-r--r--  tensorflow/core/common_runtime/mkl_cpu_allocator.cc  7
-rw-r--r--  tensorflow/core/debug/BUILD  4
-rw-r--r--  tensorflow/core/distributed_runtime/BUILD  4
-rw-r--r--  tensorflow/core/distributed_runtime/eager/BUILD  4
-rw-r--r--  tensorflow/core/distributed_runtime/rpc/BUILD  36
-rw-r--r--  tensorflow/core/distributed_runtime/rpc/eager/BUILD  7
-rw-r--r--  tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc  6
-rw-r--r--  tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h  3
-rw-r--r--  tensorflow/core/graph/mkl_layout_pass_test.cc  21
-rw-r--r--  tensorflow/core/kernels/data/slide_dataset_op.cc  51
-rw-r--r--  tensorflow/core/kernels/mkl_conv_ops.cc  332
-rw-r--r--  tensorflow/core/kernels/reduction_gpu_kernels.cu.h  2
-rw-r--r--  tensorflow/core/kernels/segment_reduction_ops.h  6
-rw-r--r--  tensorflow/core/ops/math_ops.cc  8
-rw-r--r--  tensorflow/core/platform/cloud/oauth_client.cc  4
-rw-r--r--  tensorflow/core/platform/default/build_config.bzl  5
-rw-r--r--  tensorflow/core/platform/windows/port.cc  5
-rw-r--r--  tensorflow/core/profiler/internal/tfprof_timeline.cc  16
-rw-r--r--  tensorflow/core/util/mkl_util.h  32
-rw-r--r--  tensorflow/docs_src/get_started/index.md  29
-rw-r--r--  tensorflow/docs_src/guide/debugger.md  2
-rw-r--r--  tensorflow/go/attrs.go  245
-rw-r--r--  tensorflow/go/attrs_test.go  193
-rw-r--r--  tensorflow/go/op/wrappers.go  9
-rw-r--r--  tensorflow/go/operation.go  66
-rw-r--r--  tensorflow/go/operation_test.go  62
-rw-r--r--  tensorflow/java/BUILD  5
-rw-r--r--  tensorflow/java/maven/.gitignore  6
-rw-r--r--  tensorflow/java/maven/README.md  6
-rw-r--r--  tensorflow/java/maven/hadoop/pom.xml  24
-rw-r--r--  tensorflow/java/maven/pom.xml  2
-rw-r--r--  tensorflow/java/maven/run_inside_container.sh  47
-rw-r--r--  tensorflow/java/maven/spark-connector/pom.xml  24
-rw-r--r--  tensorflow/java/src/gen/cc/op_generator.cc  11
-rw-r--r--  tensorflow/java/src/gen/cc/op_specs.h  2
-rw-r--r--  tensorflow/java/src/gen/java/org/tensorflow/processor/OperatorProcessor.java  348
-rw-r--r--  tensorflow/python/estimator/canned/baseline.py  4
-rw-r--r--  tensorflow/python/estimator/export/export.py  6
-rw-r--r--  tensorflow/python/keras/datasets/boston_housing.py  7
-rw-r--r--  tensorflow/python/keras/datasets/mnist.py  10
-rw-r--r--  tensorflow/python/keras/datasets/reuters.py  6
-rw-r--r--  tensorflow/python/keras/layers/__init__.py  2
-rw-r--r--  tensorflow/python/keras/layers/merge.py  4
-rw-r--r--  tensorflow/python/kernel_tests/dynamic_stitch_op_test.py  1
-rw-r--r--  tensorflow/python/lib/core/numpy.h  2
-rw-r--r--  tensorflow/python/lib/core/py_util.cc  2
-rw-r--r--  tensorflow/python/ops/image_ops_impl.py  103
-rw-r--r--  tensorflow/python/ops/image_ops_test.py  96
-rw-r--r--  tensorflow/python/ops/math_ops_test.py  9
-rw-r--r--  tensorflow/python/ops/special_math_ops.py  2
-rw-r--r--  tensorflow/python/ops/special_math_ops_test.py  10
-rw-r--r--  tensorflow/python/ops/state_ops.py  4
-rw-r--r--  tensorflow/python/training/checkpoint_utils.py  2
-rw-r--r--  tensorflow/tf_framework_version_script.lds  11
-rw-r--r--  tensorflow/tools/api/golden/tensorflow.image.pbtxt  4
-rw-r--r--  tensorflow/tools/api/golden/tensorflow.keras.layers.-minimum.pbtxt  176
-rw-r--r--  tensorflow/tools/api/golden/tensorflow.keras.layers.-subtract.pbtxt  176
-rw-r--r--  tensorflow/tools/api/golden/tensorflow.keras.layers.pbtxt  16
-rw-r--r--  tensorflow/tools/ci_build/Dockerfile.cpu.ppc64le  19
-rw-r--r--  tensorflow/tools/ci_build/Dockerfile.gpu.ppc64le  27
-rwxr-xr-x  tensorflow/tools/ci_build/ci_build.sh  4
-rwxr-xr-x  tensorflow/tools/ci_build/ci_parameterized_build.sh  8
-rwxr-xr-x  tensorflow/tools/ci_build/install/install_bazel_from_source.sh  40
-rwxr-xr-x  tensorflow/tools/ci_build/install/install_buildifier_from_source.sh  30
-rwxr-xr-x  tensorflow/tools/ci_build/install/install_golang_ppc64le.sh  22
-rwxr-xr-x  tensorflow/tools/ci_build/install/install_pip_packages.sh  4
-rwxr-xr-x  tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh  3
-rwxr-xr-x  tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh  6
-rwxr-xr-x  tensorflow/tools/ci_build/linux/gpu/run_mkl.sh  47
-rwxr-xr-x  tensorflow/tools/ci_build/linux/mkl/basic-mkl-gpu-test.sh  29
-rwxr-xr-x  tensorflow/tools/git/gen_git_source.py  11
-rw-r--r--  tensorflow/tools/lib_package/BUILD  4
-rw-r--r--  tensorflow/tools/pip_package/BUILD  2
-rwxr-xr-x  tensorflow/tools/pip_package/build_pip_package.sh  21
-rw-r--r--  tensorflow/tools/pip_package/setup.py  4
-rw-r--r--  tensorflow/workspace.bzl  80
-rw-r--r--  third_party/curl.BUILD  22
-rw-r--r--  third_party/flatbuffers/flatbuffers.BUILD  2
-rw-r--r--  third_party/jsoncpp.BUILD  7
-rw-r--r--  third_party/libxsmm.BUILD  2
145 files changed, 6294 insertions, 1701 deletions
diff --git a/.gitignore b/.gitignore
index 828bbe9bd3..b5306b8b79 100644
--- a/.gitignore
+++ b/.gitignore
@@ -16,6 +16,7 @@ __pycache__
cmake_build/
.idea/**
/build/
+[Bb]uild/
/tensorflow/core/util/version_info.cc
/tensorflow/python/framework/fast_tensor_util.cpp
Pods
diff --git a/configure.py b/configure.py
index ada342a50a..ad585fa52e 100644
--- a/configure.py
+++ b/configure.py
@@ -943,6 +943,35 @@ def set_tf_cudnn_version(environ_cp):
write_action_env_to_bazelrc('TF_CUDNN_VERSION', tf_cudnn_version)
+def is_cuda_compatible(lib, cuda_ver, cudnn_ver):
+ """Check compatibility between given library and cudnn/cudart libraries."""
+ ldd_bin = which('ldd') or '/usr/bin/ldd'
+ ldd_out = run_shell([ldd_bin, lib], True)
+ ldd_out = ldd_out.split(os.linesep)
+ cudnn_pattern = re.compile('.*libcudnn.so\\.?(.*) =>.*$')
+ cuda_pattern = re.compile('.*libcudart.so\\.?(.*) =>.*$')
+ cudnn = None
+ cudart = None
+ cudnn_ok = True # assume no cudnn dependency by default
+ cuda_ok = True # assume no cuda dependency by default
+ for line in ldd_out:
+ if 'libcudnn.so' in line:
+ cudnn = cudnn_pattern.search(line)
+ cudnn_ok = False
+ elif 'libcudart.so' in line:
+ cudart = cuda_pattern.search(line)
+ cuda_ok = False
+ if cudnn and len(cudnn.group(1)):
+ cudnn = convert_version_to_int(cudnn.group(1))
+ if cudart and len(cudart.group(1)):
+ cudart = convert_version_to_int(cudart.group(1))
+ if cudnn is not None:
+ cudnn_ok = (cudnn == cudnn_ver)
+ if cudart is not None:
+ cuda_ok = (cudart == cuda_ver)
+ return cudnn_ok and cuda_ok
+
+
def set_tf_tensorrt_install_path(environ_cp):
"""Set TENSORRT_INSTALL_PATH and TF_TENSORRT_VERSION.
@@ -959,8 +988,8 @@ def set_tf_tensorrt_install_path(environ_cp):
raise ValueError('Currently TensorRT is only supported on Linux platform.')
# Ask user whether to add TensorRT support.
- if str(int(get_var(
- environ_cp, 'TF_NEED_TENSORRT', 'TensorRT', False))) != '1':
+ if str(int(get_var(environ_cp, 'TF_NEED_TENSORRT', 'TensorRT',
+ False))) != '1':
return
for _ in range(_DEFAULT_PROMPT_ASK_ATTEMPTS):
@@ -973,47 +1002,29 @@ def set_tf_tensorrt_install_path(environ_cp):
# Result returned from "read" will be used unexpanded. That make "~"
# unusable. Going through one more level of expansion to handle that.
- trt_install_path = os.path.realpath(
- os.path.expanduser(trt_install_path))
+ trt_install_path = os.path.realpath(os.path.expanduser(trt_install_path))
def find_libs(search_path):
"""Search for libnvinfer.so in "search_path"."""
fl = set()
if os.path.exists(search_path) and os.path.isdir(search_path):
- fl.update([os.path.realpath(os.path.join(search_path, x))
- for x in os.listdir(search_path) if 'libnvinfer.so' in x])
+ fl.update([
+ os.path.realpath(os.path.join(search_path, x))
+ for x in os.listdir(search_path)
+ if 'libnvinfer.so' in x
+ ])
return fl
possible_files = find_libs(trt_install_path)
possible_files.update(find_libs(os.path.join(trt_install_path, 'lib')))
possible_files.update(find_libs(os.path.join(trt_install_path, 'lib64')))
-
- def is_compatible(tensorrt_lib, cuda_ver, cudnn_ver):
- """Check the compatibility between tensorrt and cudnn/cudart libraries."""
- ldd_bin = which('ldd') or '/usr/bin/ldd'
- ldd_out = run_shell([ldd_bin, tensorrt_lib]).split(os.linesep)
- cudnn_pattern = re.compile('.*libcudnn.so\\.?(.*) =>.*$')
- cuda_pattern = re.compile('.*libcudart.so\\.?(.*) =>.*$')
- cudnn = None
- cudart = None
- for line in ldd_out:
- if 'libcudnn.so' in line:
- cudnn = cudnn_pattern.search(line)
- elif 'libcudart.so' in line:
- cudart = cuda_pattern.search(line)
- if cudnn and len(cudnn.group(1)):
- cudnn = convert_version_to_int(cudnn.group(1))
- if cudart and len(cudart.group(1)):
- cudart = convert_version_to_int(cudart.group(1))
- return (cudnn == cudnn_ver) and (cudart == cuda_ver)
-
cuda_ver = convert_version_to_int(environ_cp['TF_CUDA_VERSION'])
cudnn_ver = convert_version_to_int(environ_cp['TF_CUDNN_VERSION'])
nvinfer_pattern = re.compile('.*libnvinfer.so.?(.*)$')
highest_ver = [0, None, None]
for lib_file in possible_files:
- if is_compatible(lib_file, cuda_ver, cudnn_ver):
+ if is_cuda_compatible(lib_file, cuda_ver, cudnn_ver):
matches = nvinfer_pattern.search(lib_file)
if len(matches.groups()) == 0:
continue
@@ -1029,12 +1040,13 @@ def set_tf_tensorrt_install_path(environ_cp):
# Try another alternative from ldconfig.
ldconfig_bin = which('ldconfig') or '/sbin/ldconfig'
ldconfig_output = run_shell([ldconfig_bin, '-p'])
- search_result = re.search(
- '.*libnvinfer.so\\.?([0-9.]*).* => (.*)', ldconfig_output)
+ search_result = re.search('.*libnvinfer.so\\.?([0-9.]*).* => (.*)',
+ ldconfig_output)
if search_result:
libnvinfer_path_from_ldconfig = search_result.group(2)
if os.path.exists(libnvinfer_path_from_ldconfig):
- if is_compatible(libnvinfer_path_from_ldconfig, cuda_ver, cudnn_ver):
+ if is_cuda_compatible(libnvinfer_path_from_ldconfig, cuda_ver,
+ cudnn_ver):
trt_install_path = os.path.dirname(libnvinfer_path_from_ldconfig)
tf_tensorrt_version = search_result.group(1)
break
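Note: the hunk above hoists the library-compatibility check into a top-level `is_cuda_compatible()` helper that runs `ldd` on a candidate library and compares any linked libcudart/libcudnn against the configured CUDA/cuDNN versions, treating a library with no CUDA dependency as compatible. A minimal standalone Python sketch of that idea (hypothetical helper names; not the configure.py code itself):

import re
import subprocess

def linked_cuda_versions(lib_path):
    """Return (cudart, cudnn) version strings found in `ldd` output, or None for each."""
    out = subprocess.check_output(['ldd', lib_path]).decode()
    cudart = re.search(r'libcudart\.so\.?([0-9.]+) =>', out)
    cudnn = re.search(r'libcudnn\.so\.?([0-9.]+) =>', out)
    return (cudart.group(1) if cudart else None,
            cudnn.group(1) if cudnn else None)

def is_cuda_compatible_sketch(lib_path, cuda_ver, cudnn_ver):
    # A library that does not link CUDA/cuDNN at all is considered compatible.
    cudart, cudnn = linked_cuda_versions(lib_path)
    return ((cudart is None or cudart == cuda_ver) and
            (cudnn is None or cudnn == cudnn_ver))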
diff --git a/tensorflow/BUILD b/tensorflow/BUILD
index e4530a5962..233fe21fbf 100644
--- a/tensorflow/BUILD
+++ b/tensorflow/BUILD
@@ -155,6 +155,12 @@ config_setting(
)
config_setting(
+ name = "linux_s390x",
+ values = {"cpu": "s390x"},
+ visibility = ["//visibility:public"],
+)
+
+config_setting(
name = "debug",
values = {
"compilation_mode": "dbg",
@@ -459,6 +465,15 @@ filegroup(
tf_cc_shared_object(
name = "libtensorflow_framework.so",
framework_so = [],
+ linkopts = select({
+ "//tensorflow:darwin": [],
+ "//tensorflow:windows": [],
+ "//tensorflow:windows_msvc": [],
+ "//conditions:default": [
+ "-Wl,--version-script", # This line must be directly followed by the version_script.lds file
+ "$(location //tensorflow:tf_framework_version_script.lds)",
+ ],
+ }),
linkstatic = 1,
visibility = ["//visibility:public"],
deps = [
@@ -468,6 +483,7 @@ tf_cc_shared_object(
"//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry_impl",
"//tensorflow/core:lib_internal_impl",
"//tensorflow/stream_executor:stream_executor_impl",
+ "//tensorflow:tf_framework_version_script.lds",
] + tf_additional_binary_deps(),
)
@@ -571,3 +587,13 @@ py_library(
visibility = ["//visibility:public"],
deps = ["//tensorflow/python:no_contrib"],
)
+
+cc_library(
+ name = "grpc",
+ deps = ["@grpc"],
+)
+
+cc_library(
+ name = "grpc++",
+ deps = ["@grpc//:grpc++"],
+)
diff --git a/tensorflow/c/c_api.cc b/tensorflow/c/c_api.cc
index 37c8302e08..5c218d3f25 100644
--- a/tensorflow/c/c_api.cc
+++ b/tensorflow/c/c_api.cc
@@ -2068,7 +2068,8 @@ TF_ImportGraphDefResults* TF_GraphImportGraphDefWithResults(
TF_Graph* graph, const TF_Buffer* graph_def,
const TF_ImportGraphDefOptions* options, TF_Status* status) {
GraphDef def;
- if (!def.ParseFromArray(graph_def->data, graph_def->length)) {
+ if (!tensorflow::ParseProtoUnlimited(&def, graph_def->data,
+ graph_def->length)) {
status->status = InvalidArgument("Invalid GraphDef");
return nullptr;
}
@@ -2098,7 +2099,8 @@ void TF_GraphImportGraphDefWithReturnOutputs(
return;
}
GraphDef def;
- if (!def.ParseFromArray(graph_def->data, graph_def->length)) {
+ if (!tensorflow::ParseProtoUnlimited(&def, graph_def->data,
+ graph_def->length)) {
status->status = InvalidArgument("Invalid GraphDef");
return;
}
diff --git a/tensorflow/compiler/aot/codegen.cc b/tensorflow/compiler/aot/codegen.cc
index 0025842aea..28070d60db 100644
--- a/tensorflow/compiler/aot/codegen.cc
+++ b/tensorflow/compiler/aot/codegen.cc
@@ -287,7 +287,7 @@ Status GenerateHeader(const CodegenOpts& opts, const tf2xla::Config& config,
TF_RETURN_IF_ERROR(ValidateFeedFetchCppNames(config));
const int64 result_index = compile_result.aot->result_buffer_index();
const xla::BufferSizes& temp_sizes = compile_result.aot->buffer_sizes();
- if (result_index < 0 || result_index > temp_sizes.size()) {
+ if (result_index < 0 || result_index >= temp_sizes.size()) {
return errors::InvalidArgument("result index: ", result_index,
" is outside the range of temp sizes: [0,",
temp_sizes.size(), ")");
diff --git a/tensorflow/compiler/xla/rpc/BUILD b/tensorflow/compiler/xla/rpc/BUILD
index 1775666652..0b1cec1925 100644
--- a/tensorflow/compiler/xla/rpc/BUILD
+++ b/tensorflow/compiler/xla/rpc/BUILD
@@ -39,10 +39,10 @@ tf_cc_binary(
srcs = ["grpc_service_main.cc"],
deps = [
":grpc_service",
+ "//tensorflow:grpc++",
"//tensorflow/compiler/xla/service:cpu_plugin",
"//tensorflow/core:framework_internal",
"//tensorflow/core:lib",
- "@grpc//:grpc++",
],
)
@@ -54,6 +54,7 @@ tf_cc_test(
],
deps = [
":grpc_stub",
+ "//tensorflow:grpc++",
"//tensorflow/compiler/xla/client",
"//tensorflow/compiler/xla/client/xla_client:xla_builder",
"//tensorflow/compiler/xla/tests:literal_test_util",
@@ -61,7 +62,6 @@ tf_cc_test(
"//tensorflow/core:lib",
"//tensorflow/core:test",
"//tensorflow/core:test_main",
- "@grpc//:grpc++",
],
)
@@ -71,9 +71,9 @@ cc_library(
hdrs = ["grpc_service.h"],
deps = [
":xla_service_proto",
+ "//tensorflow:grpc++",
"//tensorflow/compiler/xla/service",
"//tensorflow/compiler/xla/service:platform_util",
"//tensorflow/core/distributed_runtime/rpc:grpc_util",
- "@grpc//:grpc++",
],
)
diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD
index ae0749edb9..fe99f700d2 100644
--- a/tensorflow/compiler/xla/service/BUILD
+++ b/tensorflow/compiler/xla/service/BUILD
@@ -2550,7 +2550,6 @@ cc_library(
name = "hlo_tfgraph_builder",
srcs = ["hlo_tfgraph_builder.cc"],
hdrs = ["hlo_tfgraph_builder.h"],
- visibility = ["//tensorflow/compiler/xla/tools:__pkg__"],
deps = [
":hlo",
"//tensorflow/compiler/xla:literal_util",
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc
index 088c97fbe3..5aaeec802f 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction.cc
@@ -1515,6 +1515,7 @@ bool HloInstruction::IdenticalSlowPath(
// Remaining instructions with special values.
case HloOpcode::kCall:
+ return eq_computations(to_apply(), other.to_apply());
case HloOpcode::kConditional:
return eq_computations(true_computation(), other.true_computation()) &&
eq_computations(false_computation(), other.false_computation());
diff --git a/tensorflow/compiler/xla/service/hlo_instruction_test.cc b/tensorflow/compiler/xla/service/hlo_instruction_test.cc
index e1c5123774..d8ca99dfd1 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction_test.cc
@@ -924,6 +924,40 @@ TEST_F(HloInstructionTest, IdenticalInstructions) {
*HloInstruction::CreateBinary(shape, HloOpcode::kDivide, op1, op2)));
}
+TEST_F(HloInstructionTest, IdenticalCallInstructions) {
+ const char* const hlo_string = R"(
+HloModule Module
+
+subcomp1 (x: f32[]) -> f32[] {
+ x = f32[] parameter(0)
+ ROOT n = f32[] sine(x)
+}
+
+subcomp2 (x: f32[]) -> f32[] {
+ x = f32[] parameter(0)
+ ROOT n = f32[] cosine(x)
+}
+
+ENTRY entry (param: f32[]) -> (f32[], f32[], f32[]) {
+ p = f32[] parameter(0)
+ t1 = f32[] call(p), to_apply=subcomp1
+ t2 = f32[] call(p), to_apply=subcomp1
+ t3 = f32[] call(p), to_apply=subcomp2
+ ROOT t = (f32[], f32[], f32[]) tuple(t1, t2, t3)
+ }
+)";
+ TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+ ParseHloString(hlo_string));
+
+ auto* root = module->entry_computation()->root_instruction();
+ auto* t1 = root->operand(0);
+ auto* t2 = root->operand(1);
+ auto* t3 = root->operand(2);
+
+ EXPECT_TRUE(StructuralEqual(*t1, *t2));
+ EXPECT_FALSE(StructuralEqual(*t1, *t3));
+}
+
TEST_F(HloInstructionTest, FunctionVisitor) {
// Verify the function visitor HloInstruction::Accept visits all instructions
// from a root properly given the following graph:
diff --git a/tensorflow/contrib/autograph/converters/BUILD b/tensorflow/contrib/autograph/converters/BUILD
index 931ff62064..b2e2e27673 100644
--- a/tensorflow/contrib/autograph/converters/BUILD
+++ b/tensorflow/contrib/autograph/converters/BUILD
@@ -120,7 +120,10 @@ py_test(
name = "decorators_test",
srcs = ["decorators_test.py"],
srcs_version = "PY2AND3",
- tags = ["no_windows"],
+ tags = [
+ "no_pip",
+ "no_windows",
+ ],
deps = [
":converters",
"//tensorflow/contrib/autograph/core:test_lib",
diff --git a/tensorflow/contrib/autograph/operators/control_flow.py b/tensorflow/contrib/autograph/operators/control_flow.py
index 671c9ccc13..988df70157 100644
--- a/tensorflow/contrib/autograph/operators/control_flow.py
+++ b/tensorflow/contrib/autograph/operators/control_flow.py
@@ -51,7 +51,7 @@ def for_stmt(iter_, extra_test, body, init_state):
Args:
iter_: The entity being iterated over.
extra_test: Callable with the state as arguments, and boolean return type.
- An additionnal loop condition.
+ An additional loop condition.
body: Callable with the iterate and the state as arguments, and
state as return type. The actual loop body.
init_state: Tuple containing the initial state.
diff --git a/tensorflow/contrib/autograph/pyct/static_analysis/cfg.py b/tensorflow/contrib/autograph/pyct/static_analysis/cfg.py
index 358d56ce20..4acc4ed66a 100644
--- a/tensorflow/contrib/autograph/pyct/static_analysis/cfg.py
+++ b/tensorflow/contrib/autograph/pyct/static_analysis/cfg.py
@@ -286,7 +286,7 @@ class Forward(object):
# TODO(alexbw): see if we can simplify by visiting breadth-first
def visit(self, node):
- """Depth-first walking the CFG, applying dataflow information propagtion."""
+ """Depth-first walking the CFG, applying dataflow info propagation."""
# node.value is None only for the exit CfgNode.
if not node.value:
return
diff --git a/tensorflow/contrib/autograph/pyct/transformer.py b/tensorflow/contrib/autograph/pyct/transformer.py
index 3328dde7aa..7655811830 100644
--- a/tensorflow/contrib/autograph/pyct/transformer.py
+++ b/tensorflow/contrib/autograph/pyct/transformer.py
@@ -218,7 +218,7 @@ class Base(gast.NodeTransformer):
# TODO(mdan): Once we have error tracing, we may be able to just go to SSA.
def apply_to_single_assignments(self, targets, values, apply_fn):
- """Applies a fuction to each individual assignment.
+ """Applies a function to each individual assignment.
This function can process a possibly-unpacked (e.g. a, b = c, d) assignment.
It tries to break down the unpacking if possible. In effect, it has the same
@@ -246,7 +246,7 @@ class Base(gast.NodeTransformer):
targets field of an ast.Assign node.
values: an AST node.
apply_fn: a function of a single argument, which will be called with the
- respective nodes of each single assignment. The signaure is
+ respective nodes of each single assignment. The signature is
apply_fn(target, value), no return value.
"""
if not isinstance(targets, (list, tuple)):
diff --git a/tensorflow/contrib/cmake/CMakeLists.txt b/tensorflow/contrib/cmake/CMakeLists.txt
index e524e9e743..4ca7a1b28c 100644
--- a/tensorflow/contrib/cmake/CMakeLists.txt
+++ b/tensorflow/contrib/cmake/CMakeLists.txt
@@ -336,40 +336,14 @@ endif()
# MKL Support
if (tensorflow_ENABLE_MKL_SUPPORT)
add_definitions(-DINTEL_MKL -DEIGEN_USE_VML)
- if (WIN32)
- find_path(MKL_HOME_PLATFORM mkl
- PATHS ${MKL_HOME} ${MKL_HOME}/../ ${MKL_HOME}/../../
- $ENV{MKLROOT} $ENV{MKLROOT}/../ $ENV{MKLROOT}/../../
- PATH_SUFFIXES windows)
- set(MKL_INCLUDE_DIRS ${MKL_HOME_PLATFORM}/mkl/include)
- set(MKL_LINK_DIRS
- ${MKL_HOME_PLATFORM}/mkl/lib/intel64
- ${MKL_HOME_PLATFORM}/tbb/lib/intel64/vc_mt
- ${MKL_HOME_PLATFORM}/compiler/lib/intel64
- ${MKL_HOME_PLATFORM}/mkl/tools/builder/lib)
- set(MKL_REDIST_DLL_DIRS
- ${MKL_HOME_PLATFORM}/redist/intel64/mkl
- ${MKL_HOME_PLATFORM}/redist/intel64/tbb/vc_mt
- ${MKL_HOME_PLATFORM}/redist/intel64/compiler)
- list(APPEND tensorflow_EXTERNAL_LIBRARIES
- mkl_intel_lp64_dll mkl_sequential_dll mkl_core_dll mkl_rt mkl_cdll_intel64)
- endif()
- if (UNIX)
- # Fix me: complete the path on linux
- find_path(MKL_HOME_PLATFORM mkl
- HINTS ${MKL_HOME} ${MKL_HOME}/../ ${MKL_HOME}/../../
- $ENV{MKLROOT} $ENV{MKLROOT}/../ $ENV{MKLROOT}/../../
- PATH_SUFFIXES linux)
- set(MKL_INCLUDE_DIRS ${MKL_HOME_PLATFORM}/mkl/include)
- set(MKL_LINK_DIRS) # incompleted
- set(MKL_REDIST_SO_DIRS) # incompleted
- endif()
- include_directories(${MKL_INCLUDE_DIRS})
- link_directories(${MKL_LINK_DIRS})
+ include(mkl)
+ list(APPEND tensorflow_EXTERNAL_LIBRARIES ${mkl_STATIC_LIBRARIES})
+ list(APPEND tensorflow_EXTERNAL_DEPENDENCIES mkl_copy_shared_to_destination)
+ include_directories(${mkl_INCLUDE_DIRS})
if (tensorflow_ENABLE_MKLDNN_SUPPORT)
include(mkldnn)
list(APPEND tensorflow_EXTERNAL_LIBRARIES ${mkldnn_STATIC_LIBRARIES})
- list(APPEND tensorflow_EXTERNAL_DEPENDENCIES mkldnn)
+ list(APPEND tensorflow_EXTERNAL_DEPENDENCIES mkldnn_copy_shared_to_destination)
include_directories(${mkldnn_INCLUDE_DIRS})
else (tensorflow_ENABLE_MKLDNN_SUPPORT)
add_definitions(-DINTEL_MKL_ML)
diff --git a/tensorflow/contrib/cmake/external/double_conversion.cmake b/tensorflow/contrib/cmake/external/double_conversion.cmake
index 527ccdc8d8..5c5adaf579 100644
--- a/tensorflow/contrib/cmake/external/double_conversion.cmake
+++ b/tensorflow/contrib/cmake/external/double_conversion.cmake
@@ -16,15 +16,15 @@ include (ExternalProject)
set(double_conversion_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/double_conversion/src/double_conversion)
set(double_conversion_URL https://github.com/google/double-conversion.git)
-set(double_conversion_TAG 5664746)
+set(double_conversion_TAG 3992066a95b823efc8ccc1baf82a1cfc73f6e9b8)
set(double_conversion_BUILD ${double_conversion_INCLUDE_DIR})
set(double_conversion_LIBRARIES ${double_conversion_BUILD}/double-conversion/libdouble-conversion.so)
set(double_conversion_INCLUDES ${double_conversion_BUILD})
if(WIN32)
- set(double_conversion_STATIC_LIBRARIES ${double_conversion_BUILD}/double-conversion/$(Configuration)/double-conversion.lib)
+ set(double_conversion_STATIC_LIBRARIES ${double_conversion_BUILD}/$(Configuration)/double-conversion.lib)
else()
- set(double_conversion_STATIC_LIBRARIES ${double_conversion_BUILD}/double-conversion/libdouble-conversion.a)
+ set(double_conversion_STATIC_LIBRARIES ${double_conversion_BUILD}/libdouble-conversion.a)
endif()
set(double_conversion_HEADERS
diff --git a/tensorflow/contrib/cmake/external/mkl.cmake b/tensorflow/contrib/cmake/external/mkl.cmake
new file mode 100644
index 0000000000..a172e3a41a
--- /dev/null
+++ b/tensorflow/contrib/cmake/external/mkl.cmake
@@ -0,0 +1,68 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+include (ExternalProject)
+
+# NOTE: Different from mkldnn.cmake, this file is meant to download mkl libraries
+set(mkl_INCLUDE_DIRS ${CMAKE_CURRENT_BINARY_DIR}/mkl/src/mkl/include)
+set(mkl_BIN_DIRS ${CMAKE_CURRENT_BINARY_DIR}/mkl/bin)
+set(mkl_WIN mklml_win_2018.0.3.20180406.zip) # match for v0.14
+set(mkl_MAC mklml_mac_2018.0.3.20180406.tgz)
+set(mkl_LNX mklml_lnx_2018.0.3.20180406.tgz)
+set(mkl_TAG v0.14)
+set(mkl_URL https://github.com/intel/mkl-dnn/releases)
+
+if (WIN32)
+ set(mkl_DOWNLOAD_URL ${mkl_URL}/download/${mkl_TAG}/${mkl_WIN})
+ list(APPEND mkl_STATIC_LIBRARIES
+ ${CMAKE_CURRENT_BINARY_DIR}/mkl/src/mkl/lib/mklml.lib)
+ list(APPEND mkl_STATIC_LIBRARIES
+ ${CMAKE_CURRENT_BINARY_DIR}/mkl/src/mkl/lib/libiomp5md.lib)
+ list(APPEND mkl_SHARED_LIBRARIES
+ ${CMAKE_CURRENT_BINARY_DIR}/mkl/src/mkl/lib/mklml.dll)
+ list(APPEND mkl_SHARED_LIBRARIES
+ ${CMAKE_CURRENT_BINARY_DIR}/mkl/src/mkl/lib/libiomp5md.dll)
+elseif (UNIX)
+ set(mkl_DOWNLOAD_URL ${mkl_URL}/download/${mkl_TAG}/${mkl_LNX})
+ list(APPEND mkl_SHARED_LIBRARIES
+ ${CMAKE_CURRENT_BINARY_DIR}/mkl/src/mkl/lib/libiomp5.so)
+ list(APPEND mkl_SHARED_LIBRARIES
+ ${CMAKE_CURRENT_BINARY_DIR}/mkl/src/mkl/lib/libmklml_gnu.so)
+ list(APPEND mkl_SHARED_LIBRARIES
+ ${CMAKE_CURRENT_BINARY_DIR}/mkl/src/mkl/lib/libmklml_intel.so)
+elseif (APPLE)
+ set(mkl_DOWNLOAD_URL ${mkl_URL}/download/${mkl_TAG}/${mkl_MAC})
+ #TODO need more information
+endif ()
+
+ExternalProject_Add(mkl
+ PREFIX mkl
+ URL ${mkl_DOWNLOAD_URL}
+ DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
+ UPDATE_COMMAND ""
+ CONFIGURE_COMMAND ""
+ BUILD_COMMAND ""
+ INSTALL_COMMAND "")
+
+# put mkl dynamic libraries in one bin directory
+add_custom_target(mkl_create_destination_dir
+ COMMAND ${CMAKE_COMMAND} -E make_directory ${mkl_BIN_DIRS}
+ DEPENDS mkl)
+
+add_custom_target(mkl_copy_shared_to_destination DEPENDS mkl_create_destination_dir)
+
+foreach(dll_file ${mkl_SHARED_LIBRARIES})
+ add_custom_command(TARGET mkl_copy_shared_to_destination PRE_BUILD
+ COMMAND ${CMAKE_COMMAND} -E copy_if_different ${dll_file} ${mkl_BIN_DIRS})
+endforeach()
diff --git a/tensorflow/contrib/cmake/external/mkldnn.cmake b/tensorflow/contrib/cmake/external/mkldnn.cmake
index a639fdee36..8123ee1f39 100644
--- a/tensorflow/contrib/cmake/external/mkldnn.cmake
+++ b/tensorflow/contrib/cmake/external/mkldnn.cmake
@@ -22,8 +22,11 @@ set(mkldnn_TAG 3063b2e4c943983f6bf5f2fb9a490d4a998cd291)
if(WIN32)
if(${CMAKE_GENERATOR} MATCHES "Visual Studio.*")
set(mkldnn_STATIC_LIBRARIES ${CMAKE_CURRENT_BINARY_DIR}/mkldnn/src/mkldnn/src/Release/mkldnn.lib)
+ set(mkldnn_SHARED_LIBRARIES ${CMAKE_CURRENT_BINARY_DIR}/mkldnn/src/mkldnn/src/Release/mkldnn.dll)
+ set(mkldnn_BUILD ${CMAKE_CURRENT_BINARY_DIR}/mkldnn/src/mkldnn/src/Release)
else()
set(mkldnn_STATIC_LIBRARIES ${CMAKE_CURRENT_BINARY_DIR}/mkldnn/src/mkldnn/src/mkldnn.lib)
+ set(mkldnn_SHARED_LIBRARIES ${CMAKE_CURRENT_BINARY_DIR}/mkldnn/src/mkldnn/src/mkldnn.dll)
endif()
else()
set(mkldnn_STATIC_LIBRARIES ${CMAKE_CURRENT_BINARY_DIR}/mkldnn/src/mkldnn/src/libmkldnn.a)
@@ -31,6 +34,7 @@ endif()
ExternalProject_Add(mkldnn
PREFIX mkldnn
+ DEPENDS mkl
GIT_REPOSITORY ${mkldnn_URL}
GIT_TAG ${mkldnn_TAG}
DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
@@ -40,5 +44,11 @@ ExternalProject_Add(mkldnn
CMAKE_CACHE_ARGS
-DCMAKE_BUILD_TYPE:STRING=Release
-DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF
- -DMKLINC:STRING=${MKL_INCLUDE_DIRS}
+ -DMKLINC:STRING=${mkl_INCLUDE_DIRS}
)
+
+# since mkldnn depends on mkl, copy the mkldnn.dll together with mklml.dll to mkl_bin_dirs
+add_custom_target(mkldnn_copy_shared_to_destination DEPENDS mkldnn)
+
+add_custom_command(TARGET mkldnn_copy_shared_to_destination PRE_BUILD
+ COMMAND ${CMAKE_COMMAND} -E copy_if_different ${mkldnn_SHARED_LIBRARIES} ${mkl_BIN_DIRS})
diff --git a/tensorflow/contrib/cmake/tf_python.cmake b/tensorflow/contrib/cmake/tf_python.cmake
index df6702a42c..e3b59001bc 100755
--- a/tensorflow/contrib/cmake/tf_python.cmake
+++ b/tensorflow/contrib/cmake/tf_python.cmake
@@ -755,26 +755,65 @@ set(api_init_list_file "${tensorflow_source_dir}/api_init_files_list.txt")
file(WRITE "${api_init_list_file}" "${api_init_files}")
# Run create_python_api.py to generate __init__.py files.
-add_custom_command(
- OUTPUT ${api_init_files}
- DEPENDS tf_python_ops tf_python_copy_scripts_to_destination pywrap_tensorflow_internal tf_python_touchup_modules tf_extension_ops
-
- # tensorflow/__init__.py depends on files generated in this step. So, remove it while
- # this step is running since the files aren't there yet.
- COMMAND ${CMAKE_COMMAND} -E remove -f ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/__init__.py
-
- # Run create_python_api.py to generate API init files.
- COMMAND ${CMAKE_COMMAND} -E env PYTHONPATH=${CMAKE_CURRENT_BINARY_DIR}/tf_python ${PYTHON_EXECUTABLE}
- "${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/tools/api/generator/create_python_api.py"
- "--root_init_template=${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/api_template.__init__.py"
- "--apidir=${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow"
- "--package=tensorflow.python"
- "--apiname=tensorflow"
- "${api_init_list_file}"
- COMMENT "Generating __init__.py files for Python API."
- WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/tf_python"
-)
+### TODO
+# In order to download and compile MKL/MKL-DNN automatically in the cmake script, the mkl-built libraries should be added to the system path
+# to be loaded by the python executor. However `add_custom_command` has an issue with `COMMAND ${CMAKE_COMMAND} -E env PATH=`, where
+# an argument containing multiple paths (such as D:/;D:/mkl) is split at the semicolon into separate strings, so the command fails to
+# recognize the paths. As CUDA isn't built with MKL, the MKL build directory is the only path passed to this command to work around that issue.
+# To avoid overriding the CUDA and system path in other circumstances, an `if-else` branch is used here to handle this problem,
+# and it should be removed once the path issue is resolved.
+###
+
+if (tensorflow_ENABLE_MKL_SUPPORT)
+ # add mkl dist dlls to system path for python
+ # TODO: In current cmake version, PY_RUNTIME_ENV behaves strange with multiple paths,
+ # so we have to specify only one path in it to work around the issue. We need this if/else
+ # to protect overwriting CUDA environments
+ set(PY_RUNTIME_ENV ${mkl_BIN_DIRS})
+ add_custom_command(
+ OUTPUT ${api_init_files}
+ DEPENDS tf_python_ops tf_python_copy_scripts_to_destination pywrap_tensorflow_internal tf_python_touchup_modules tf_extension_ops
+
+ # tensorflow/__init__.py depends on files generated in this step. So, remove it while
+ # this step is running since the files aren't there yet.
+ COMMAND ${CMAKE_COMMAND} -E remove -f ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/__init__.py
+
+ # Run create_python_api.py to generate API init files.
+ COMMAND ${CMAKE_COMMAND} -E env PYTHONPATH=${CMAKE_CURRENT_BINARY_DIR}/tf_python PATH=${PY_RUNTIME_ENV} ${PYTHON_EXECUTABLE}
+ "${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/tools/api/generator/create_python_api.py"
+ "--root_init_template=${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/api_template.__init__.py"
+ "--apidir=${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow"
+ "--package=tensorflow.python"
+ "--apiname=tensorflow"
+ "${api_init_list_file}"
+
+ COMMENT "Generating __init__.py files for Python API."
+ WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/tf_python"
+ VERBATIM
+ )
+else (tensorflow_ENABLE_MKL_SUPPORT)
+ add_custom_command(
+ OUTPUT ${api_init_files}
+ DEPENDS tf_python_ops tf_python_copy_scripts_to_destination pywrap_tensorflow_internal tf_python_touchup_modules tf_extension_ops
+
+ # tensorflow/__init__.py depends on files generated in this step. So, remove it while
+ # this step is running since the files aren't there yet.
+ COMMAND ${CMAKE_COMMAND} -E remove -f ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/__init__.py
+
+ # Run create_python_api.py to generate API init files.
+ COMMAND ${CMAKE_COMMAND} -E env PYTHONPATH=${CMAKE_CURRENT_BINARY_DIR}/tf_python ${PYTHON_EXECUTABLE}
+ "${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/tools/api/generator/create_python_api.py"
+ "--root_init_template=${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/api_template.__init__.py"
+ "--apidir=${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow"
+ "--package=tensorflow.python"
+ "--apiname=tensorflow"
+ "${api_init_list_file}"
+
+ COMMENT "Generating __init__.py files for Python API."
+ WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/tf_python"
+ )
+endif (tensorflow_ENABLE_MKL_SUPPORT)
add_custom_target(tf_python_api SOURCES ${api_init_files})
add_dependencies(tf_python_api tf_python_ops)
diff --git a/tensorflow/contrib/cmake/tf_shared_lib.cmake b/tensorflow/contrib/cmake/tf_shared_lib.cmake
index 38f40452b5..fdf522f1fd 100644
--- a/tensorflow/contrib/cmake/tf_shared_lib.cmake
+++ b/tensorflow/contrib/cmake/tf_shared_lib.cmake
@@ -145,3 +145,8 @@ install(DIRECTORY ${tensorflow_source_dir}/third_party/eigen3/
# unsupported Eigen directory
install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/eigen/src/eigen/unsupported/Eigen/
DESTINATION include/unsupported/Eigen)
+# mkl
+if (tensorflow_ENABLE_MKL_SUPPORT)
+ install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/mkl/src/mkl/include/
+ DESTINATION include/mkl)
+endif (tensorflow_ENABLE_MKL_SUPPORT)
diff --git a/tensorflow/contrib/constrained_optimization/README.md b/tensorflow/contrib/constrained_optimization/README.md
index c65a150464..cb1dd7d836 100644
--- a/tensorflow/contrib/constrained_optimization/README.md
+++ b/tensorflow/contrib/constrained_optimization/README.md
@@ -46,7 +46,7 @@ document.
Imagine that we want to constrain the recall of a binary classifier to be at
least 90%. Since the recall is proportional to the number of true positive
classifications, which itself is a sum of indicator functions, this constraint
-is non-differentible, and therefore cannot be used in a problem that will be
+is non-differentiable, and therefore cannot be used in a problem that will be
optimized using a (stochastic) gradient-based algorithm.
For this and similar problems, TFCO supports so-called *proxy constraints*,
diff --git a/tensorflow/contrib/constrained_optimization/python/swap_regret_optimizer.py b/tensorflow/contrib/constrained_optimization/python/swap_regret_optimizer.py
index 04014ab4ae..3791dae8d7 100644
--- a/tensorflow/contrib/constrained_optimization/python/swap_regret_optimizer.py
+++ b/tensorflow/contrib/constrained_optimization/python/swap_regret_optimizer.py
@@ -169,8 +169,8 @@ def _project_stochastic_matrix_wrt_euclidean_norm(matrix):
del old_inactive # Needed by the condition, but not the body.
iteration += 1
scale = (1.0 - standard_ops.reduce_sum(
- matrix, axis=0, keep_dims=True)) / standard_ops.maximum(
- 1.0, standard_ops.reduce_sum(inactive, axis=0, keep_dims=True))
+ matrix, axis=0, keepdims=True)) / standard_ops.maximum(
+ 1.0, standard_ops.reduce_sum(inactive, axis=0, keepdims=True))
matrix += scale * inactive
new_inactive = standard_ops.to_float(matrix > 0)
matrix *= new_inactive
@@ -206,10 +206,10 @@ def _project_log_stochastic_matrix_wrt_kl_divergence(log_matrix):
# For numerical reasons, make sure that the largest matrix element is zero
# before exponentiating.
- log_matrix -= standard_ops.reduce_max(log_matrix, axis=0, keep_dims=True)
+ log_matrix -= standard_ops.reduce_max(log_matrix, axis=0, keepdims=True)
log_matrix -= standard_ops.log(
standard_ops.reduce_sum(
- standard_ops.exp(log_matrix), axis=0, keep_dims=True))
+ standard_ops.exp(log_matrix), axis=0, keepdims=True))
return log_matrix
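Note: the hunk above only renames the deprecated `keep_dims` keyword to `keepdims`; the behavior is unchanged. A quick illustration of what `keepdims=True` does (NumPy shown for brevity; `tf.reduce_sum`/`tf.reduce_max` accept the same `keepdims` argument):

import numpy as np

matrix = np.array([[1.0, 2.0],
                   [3.0, 4.0],
                   [5.0, 6.0]])

col_sums = np.sum(matrix, axis=0, keepdims=True)  # shape (1, 2) instead of (2,)
print(col_sums)           # [[ 9. 12.]]
print(matrix / col_sums)  # keeping the reduced axis lets this broadcast cleanly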
diff --git a/tensorflow/contrib/data/python/kernel_tests/slide_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/slide_dataset_op_test.py
index 33c48e20be..5590a4bf78 100644
--- a/tensorflow/contrib/data/python/kernel_tests/slide_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/slide_dataset_op_test.py
@@ -58,6 +58,7 @@ class SlideDatasetTest(test.TestCase):
[t.shape.as_list() for t in get_next])
with self.test_session() as sess:
+ # stride < window_size.
# Slide over a finite input, where the window_size divides the
# total number of elements.
sess.run(init_op, feed_dict={count: 20, window_size: 14, stride: 7})
@@ -71,11 +72,9 @@ class SlideDatasetTest(test.TestCase):
result_component[j])
with self.assertRaises(errors.OutOfRangeError):
sess.run(get_next)
-
# Slide over a finite input, where the window_size does not
# divide the total number of elements.
sess.run(init_op, feed_dict={count: 20, window_size: 17, stride: 9})
-
num_batches = (20 * 7 - 17) // 9 + 1
for i in range(num_batches):
result = sess.run(get_next)
@@ -86,6 +85,41 @@ class SlideDatasetTest(test.TestCase):
with self.assertRaises(errors.OutOfRangeError):
sess.run(get_next)
+ # stride == window_size.
+ sess.run(init_op, feed_dict={count: 20, window_size: 14, stride: 14})
+ num_batches = 20 * 7 // 14
+ for i in range(num_batches):
+ result = sess.run(get_next)
+ for component, result_component in zip(components, result):
+ for j in range(14):
+ self.assertAllEqual(component[(i*14 + j) % 7]**2,
+ result_component[j])
+ with self.assertRaises(errors.OutOfRangeError):
+ sess.run(get_next)
+
+ # stride > window_size.
+ sess.run(init_op, feed_dict={count: 20, window_size: 10, stride: 14})
+ num_batches = 20 * 7 // 14
+ for i in range(num_batches):
+ result = sess.run(get_next)
+ for component, result_component in zip(components, result):
+ for j in range(10):
+ self.assertAllEqual(component[(i*14 + j) % 7]**2,
+ result_component[j])
+ with self.assertRaises(errors.OutOfRangeError):
+ sess.run(get_next)
+ # Drop the last batch which is smaller than window_size.
+ sess.run(init_op, feed_dict={count: 20, window_size: 14, stride: 19})
+ num_batches = (20 * 7 - 7) // 19 # = 19 * 7 // 19
+ for i in range(num_batches):
+ result = sess.run(get_next)
+ for component, result_component in zip(components, result):
+ for j in range(14):
+ self.assertAllEqual(component[(i*19 + j) % 7]**2,
+ result_component[j])
+ with self.assertRaises(errors.OutOfRangeError):
+ sess.run(get_next)
+
# Slide over a finite input, which is less than window_size,
# should fail straight away.
sess.run(init_op, feed_dict={count: 1, window_size: 10, stride: 4})
@@ -108,10 +142,6 @@ class SlideDatasetTest(test.TestCase):
# Invalid stride should be an initialization time error.
with self.assertRaises(errors.InvalidArgumentError):
sess.run(init_op, feed_dict={count: 14, window_size: 3, stride: 0})
- with self.assertRaises(errors.InvalidArgumentError):
- sess.run(init_op, feed_dict={count: 14, window_size: 3, stride: 3})
- with self.assertRaises(errors.InvalidArgumentError):
- sess.run(init_op, feed_dict={count: 14, window_size: 3, stride: 5})
def assertSparseValuesEqual(self, a, b):
self.assertAllEqual(a.indices, b.indices)
diff --git a/tensorflow/contrib/data/python/ops/sliding.py b/tensorflow/contrib/data/python/ops/sliding.py
index f935beb1a9..3f3c5ca17c 100644
--- a/tensorflow/contrib/data/python/ops/sliding.py
+++ b/tensorflow/contrib/data/python/ops/sliding.py
@@ -86,7 +86,7 @@ def sliding_window_batch(window_size, stride=1):
elements in the sliding window.
stride: (Optional.) A `tf.int64` scalar `tf.Tensor`, representing the
steps moving the sliding window forward for one iteration. The default
- is `1`. It must be in `[1, window_size)`.
+ is `1`. It must be positive.
Returns:
A `Dataset` transformation function, which can be passed to
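Note: with the relaxed constraint above, `stride` may now be smaller than, equal to, or larger than `window_size`; only `stride >= 1` is enforced, and a trailing window shorter than `window_size` is dropped, as exercised by the updated test. A plain-Python sketch of those semantics (illustration only, not the dataset kernel):

def sliding_windows(elements, window_size, stride):
    if stride < 1:
        raise ValueError("stride must be positive")
    windows = []
    for start in range(0, len(elements), stride):
        window = elements[start:start + window_size]
        if len(window) < window_size:
            break  # drop the incomplete trailing window
        windows.append(window)
    return windows

print(sliding_windows(list(range(10)), window_size=4, stride=2))
# [[0, 1, 2, 3], [2, 3, 4, 5], [4, 5, 6, 7], [6, 7, 8, 9]]
print(sliding_windows(list(range(10)), window_size=3, stride=5))
# [[0, 1, 2], [5, 6, 7]]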
diff --git a/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb b/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb
new file mode 100644
index 0000000000..54ebcad8e9
--- /dev/null
+++ b/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb
@@ -0,0 +1,909 @@
+{
+ "nbformat": 4,
+ "nbformat_minor": 0,
+ "metadata": {
+ "colab": {
+ "name": "nmt_with_attention.ipynb",
+ "version": "0.3.2",
+ "views": {},
+ "default_view": {},
+ "provenance": [
+ {
+ "file_id": "1C4fpM7_7IL8ZzF7Gc5abywqQjeQNS2-U",
+ "timestamp": 1527858391290
+ },
+ {
+ "file_id": "1pExo6aUuw0S6MISFWoinfJv0Ftm9V4qv",
+ "timestamp": 1527776041613
+ }
+ ],
+ "private_outputs": true,
+ "collapsed_sections": [],
+ "toc_visible": true
+ },
+ "kernelspec": {
+ "name": "python3",
+ "display_name": "Python 3"
+ },
+ "accelerator": "GPU"
+ },
+ "cells": [
+ {
+ "metadata": {
+ "id": "AOpGoE2T-YXS",
+ "colab_type": "text"
+ },
+ "cell_type": "markdown",
+ "source": [
+ "##### Copyright 2018 The TensorFlow Authors.\n",
+ "\n",
+ "Licensed under the Apache License, Version 2.0 (the \"License\").\n",
+ "\n",
+ "# Neural Machine Translation with Attention\n",
+ "\n",
+ "<table align=\"left\"><td>\n",
+ "<a target=\"_blank\" href=\"https://colab.sandbox.google.com/github/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb\">\n",
+ " <img src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" />Run in Google Colab</a> \n",
+ "</td><td>\n",
+ "<a target=\"_blank\" href=\"https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb\"><img width=32px src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" />View source on Github</a></td></table>"
+ ]
+ },
+ {
+ "metadata": {
+ "id": "CiwtNgENbx2g",
+ "colab_type": "text"
+ },
+ "cell_type": "markdown",
+ "source": [
+ "This notebook trains a sequence to sequence (seq2seq) model for Spanish to English translation using [tf.keras](https://www.tensorflow.org/programmers_guide/keras) and [eager execution](https://www.tensorflow.org/programmers_guide/eager). This is an advanced example that assumes some knowledge of sequence to sequence models.\n",
+ "\n",
+ "After training the model in this notebook, you will be able to input a Spanish sentence, such as *\"¿todavia estan en casa?\"*, and return the English translation: *\"are you still at home?\"*\n",
+ "\n",
+        "The translation quality is reasonable for a toy example, but the generated attention plot is perhaps more interesting. This shows which parts of the input sentence have the model's attention while translating:\n",
+ "\n",
+ "<img src=\"https://tensorflow.org/images/spanish-english.png\" alt=\"spanish-english attention plot\">\n",
+ "\n",
+        "Note: This example takes approximately 10 minutes to run on a single P100 GPU."
+ ]
+ },
+ {
+ "metadata": {
+ "id": "tnxXKDjq3jEL",
+ "colab_type": "code",
+ "colab": {
+ "autoexec": {
+ "startup": false,
+ "wait_interval": 0
+ }
+ }
+ },
+ "cell_type": "code",
+ "source": [
+ "from __future__ import absolute_import, division, print_function\n",
+ "\n",
+ "# Import TensorFlow >= 1.9 and enable eager execution\n",
+ "import tensorflow as tf\n",
+ "\n",
+ "tf.enable_eager_execution()\n",
+ "\n",
+ "import matplotlib.pyplot as plt\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "\n",
+ "import unicodedata\n",
+ "import re\n",
+ "import numpy as np\n",
+ "import os\n",
+ "import time\n",
+ "\n",
+ "print(tf.__version__)"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "metadata": {
+ "id": "wfodePkj3jEa",
+ "colab_type": "text"
+ },
+ "cell_type": "markdown",
+ "source": [
+ "## Download and prepare the dataset\n",
+ "\n",
+ "We'll use a language dataset provided by http://www.manythings.org/anki/. This dataset contains language translation pairs in the format:\n",
+ "\n",
+ "```\n",
+ "May I borrow this book?\t¿Puedo tomar prestado este libro?\n",
+ "```\n",
+ "\n",
+ "There are a variety of languages available, but we'll use the English-Spanish dataset. For convenience, we've hosted a copy of this dataset on Google Cloud, but you can also download your own copy. After downloading the dataset, here are the steps we'll take to prepare the data:\n",
+ "\n",
+ "1. Add a *start* and *end* token to each sentence.\n",
+ "2. Clean the sentences by removing special characters.\n",
+ "3. Create a word index and reverse word index (dictionaries mapping from word → id and id → word).\n",
+ "4. Pad each sentence to a maximum length."
+ ]
+ },
+ {
+ "metadata": {
+ "id": "kRVATYOgJs1b",
+ "colab_type": "code",
+ "colab": {
+ "autoexec": {
+ "startup": false,
+ "wait_interval": 0
+ }
+ }
+ },
+ "cell_type": "code",
+ "source": [
+ "# Download the file\n",
+ "path_to_zip = tf.keras.utils.get_file(\n",
+ " 'spa-eng.zip', origin='http://download.tensorflow.org/data/spa-eng.zip', \n",
+ " extract=True)\n",
+ "\n",
+ "path_to_file = os.path.dirname(path_to_zip)+\"/spa-eng/spa.txt\""
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "metadata": {
+ "id": "rd0jw-eC3jEh",
+ "colab_type": "code",
+ "colab": {
+ "autoexec": {
+ "startup": false,
+ "wait_interval": 0
+ }
+ }
+ },
+ "cell_type": "code",
+ "source": [
+ "# Converts the unicode file to ascii\n",
+ "def unicode_to_ascii(s):\n",
+ " return ''.join(c for c in unicodedata.normalize('NFD', s)\n",
+ " if unicodedata.category(c) != 'Mn')\n",
+ "\n",
+ "\n",
+ "def preprocess_sentence(w):\n",
+ " w = unicode_to_ascii(w.lower().strip())\n",
+ " \n",
+ " # creating a space between a word and the punctuation following it\n",
+ " # eg: \"he is a boy.\" => \"he is a boy .\" \n",
+ " # Reference:- https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation\n",
+ " w = re.sub(r\"([?.!,¿])\", r\" \\1 \", w)\n",
+ " w = re.sub(r'[\" \"]+', \" \", w)\n",
+ " \n",
+ " # replacing everything with space except (a-z, A-Z, \".\", \"?\", \"!\", \",\")\n",
+ " w = re.sub(r\"[^a-zA-Z?.!,¿]+\", \" \", w)\n",
+ " \n",
+ " w = w.rstrip().strip()\n",
+ " \n",
+ " # adding a start and an end token to the sentence\n",
+ " # so that the model know when to start and stop predicting.\n",
+ " w = '<start> ' + w + ' <end>'\n",
+ " return w"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "metadata": {
+ "id": "OHn4Dct23jEm",
+ "colab_type": "code",
+ "colab": {
+ "autoexec": {
+ "startup": false,
+ "wait_interval": 0
+ }
+ }
+ },
+ "cell_type": "code",
+ "source": [
+ "# 1. Remove the accents\n",
+ "# 2. Clean the sentences\n",
+ "# 3. Return word pairs in the format: [ENGLISH, SPANISH]\n",
+ "def create_dataset(path, num_examples):\n",
+ " lines = open(path, encoding='UTF-8').read().strip().split('\\n')\n",
+ " \n",
+ " word_pairs = [[preprocess_sentence(w) for w in l.split('\\t')] for l in lines[:num_examples]]\n",
+ " \n",
+ " return word_pairs"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "metadata": {
+ "id": "9xbqO7Iie9bb",
+ "colab_type": "code",
+ "colab": {
+ "autoexec": {
+ "startup": false,
+ "wait_interval": 0
+ }
+ }
+ },
+ "cell_type": "code",
+ "source": [
+        "# This class creates a word -> index mapping (e.g., \"dad\" -> 5) and vice-versa \n",
+        "# (e.g., 5 -> \"dad\") for each language.\n",
+ "class LanguageIndex():\n",
+ " def __init__(self, lang):\n",
+ " self.lang = lang\n",
+ " self.word2idx = {}\n",
+ " self.idx2word = {}\n",
+ " self.vocab = set()\n",
+ " \n",
+ " self.create_index()\n",
+ " \n",
+ " def create_index(self):\n",
+ " for phrase in self.lang:\n",
+ " self.vocab.update(phrase.split(' '))\n",
+ " \n",
+ " self.vocab = sorted(self.vocab)\n",
+ " \n",
+ " self.word2idx['<pad>'] = 0\n",
+ " for index, word in enumerate(self.vocab):\n",
+ " self.word2idx[word] = index + 1\n",
+ " \n",
+ " for word, index in self.word2idx.items():\n",
+ " self.idx2word[index] = word"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "metadata": {
+ "id": "eAY9k49G3jE_",
+ "colab_type": "code",
+ "colab": {
+ "autoexec": {
+ "startup": false,
+ "wait_interval": 0
+ }
+ }
+ },
+ "cell_type": "code",
+ "source": [
+ "def max_length(tensor):\n",
+ " return max(len(t) for t in tensor)\n",
+ "\n",
+ "\n",
+ "def load_dataset(path, num_examples):\n",
+ " # creating cleaned input, output pairs\n",
+ " pairs = create_dataset(path, num_examples)\n",
+ "\n",
+ " # index language using the class defined above \n",
+ " inp_lang = LanguageIndex(sp for en, sp in pairs)\n",
+ " targ_lang = LanguageIndex(en for en, sp in pairs)\n",
+ " \n",
+ " # Vectorize the input and target languages\n",
+ " \n",
+ " # Spanish sentences\n",
+ " input_tensor = [[inp_lang.word2idx[s] for s in sp.split(' ')] for en, sp in pairs]\n",
+ " \n",
+ " # English sentences\n",
+ " target_tensor = [[targ_lang.word2idx[s] for s in en.split(' ')] for en, sp in pairs]\n",
+ " \n",
+ " # Calculate max_length of input and output tensor\n",
+ " # Here, we'll set those to the longest sentence in the dataset\n",
+ " max_length_inp, max_length_tar = max_length(input_tensor), max_length(target_tensor)\n",
+ " \n",
+ " # Padding the input and output tensor to the maximum length\n",
+ " input_tensor = tf.keras.preprocessing.sequence.pad_sequences(input_tensor, \n",
+ " maxlen=max_length_inp,\n",
+ " padding='post')\n",
+ " \n",
+ " target_tensor = tf.keras.preprocessing.sequence.pad_sequences(target_tensor, \n",
+ " maxlen=max_length_tar, \n",
+ " padding='post')\n",
+ " \n",
+ " return input_tensor, target_tensor, inp_lang, targ_lang, max_length_inp, max_length_tar"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "metadata": {
+ "id": "GOi42V79Ydlr",
+ "colab_type": "text"
+ },
+ "cell_type": "markdown",
+ "source": [
+ "### Limit the size of the dataset to experiment faster (optional)\n",
+ "\n",
+ "Training on the complete dataset of >100,000 sentences will take a long time. To train faster, we can limit the size of the dataset to 30,000 sentences (of course, translation quality degrades with less data):"
+ ]
+ },
+ {
+ "metadata": {
+ "id": "cnxC7q-j3jFD",
+ "colab_type": "code",
+ "colab": {
+ "autoexec": {
+ "startup": false,
+ "wait_interval": 0
+ }
+ }
+ },
+ "cell_type": "code",
+ "source": [
+ "# Try experimenting with the size of that dataset\n",
+ "num_examples = 30000\n",
+ "input_tensor, target_tensor, inp_lang, targ_lang, max_length_inp, max_length_targ = load_dataset(path_to_file, num_examples)"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "metadata": {
+ "id": "4QILQkOs3jFG",
+ "colab_type": "code",
+ "colab": {
+ "autoexec": {
+ "startup": false,
+ "wait_interval": 0
+ }
+ }
+ },
+ "cell_type": "code",
+ "source": [
+ "# Creating training and validation sets using an 80-20 split\n",
+ "input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor, target_tensor, test_size=0.2)\n",
+ "\n",
+ "# Show length\n",
+ "len(input_tensor_train), len(target_tensor_train), len(input_tensor_val), len(target_tensor_val)"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "metadata": {
+ "id": "rgCLkfv5uO3d",
+ "colab_type": "text"
+ },
+ "cell_type": "markdown",
+ "source": [
+ "### Create a tf.data dataset"
+ ]
+ },
+ {
+ "metadata": {
+ "id": "TqHsArVZ3jFS",
+ "colab_type": "code",
+ "colab": {
+ "autoexec": {
+ "startup": false,
+ "wait_interval": 0
+ }
+ }
+ },
+ "cell_type": "code",
+ "source": [
+ "BUFFER_SIZE = len(input_tensor_train)\n",
+ "BATCH_SIZE = 64\n",
+ "embedding_dim = 256\n",
+ "units = 1024\n",
+ "vocab_inp_size = len(inp_lang.word2idx)\n",
+ "vocab_tar_size = len(targ_lang.word2idx)\n",
+ "\n",
+ "dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)\n",
+ "dataset = dataset.apply(tf.contrib.data.batch_and_drop_remainder(BATCH_SIZE))"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "metadata": {
+ "id": "TNfHIF71ulLu",
+ "colab_type": "text"
+ },
+ "cell_type": "markdown",
+ "source": [
+ "## Write the encoder and decoder model\n",
+ "\n",
+        "Here, we'll implement an encoder-decoder model with attention which you can read about in the TensorFlow [Neural Machine Translation (seq2seq) tutorial](https://www.tensorflow.org/tutorials/seq2seq). This example uses a more recent set of APIs. This notebook implements the [attention equations](https://www.tensorflow.org/tutorials/seq2seq#background_on_the_attention_mechanism) from the seq2seq tutorial. The following diagram shows that each input word is assigned a weight by the attention mechanism which is then used by the decoder to predict the next word in the sentence.\n",
+ "\n",
+ "<img src=\"https://www.tensorflow.org/images/seq2seq/attention_mechanism.jpg\" width=\"500\" alt=\"attention mechanism\">\n",
+ "\n",
+ "The input is put through an encoder model which gives us the encoder output of shape *(batch_size, max_length, hidden_size)* and the encoder hidden state of shape *(batch_size, hidden_size)*. \n",
+ "\n",
+ "Here are the equations that are implemented:\n",
+ "\n",
+ "<img src=\"https://www.tensorflow.org/images/seq2seq/attention_equation_0.jpg\" alt=\"attention equation 0\" width=\"800\">\n",
+ "<img src=\"https://www.tensorflow.org/images/seq2seq/attention_equation_1.jpg\" alt=\"attention equation 1\" width=\"800\">\n",
+ "\n",
+        "We're using *Bahdanau attention*. Let's decide on notation before writing the simplified form:\n",
+ "\n",
+ "* FC = Fully connected (dense) layer\n",
+ "* EO = Encoder output\n",
+ "* H = hidden state\n",
+ "* X = input to the decoder\n",
+ "\n",
+ "And the pseudo-code:\n",
+ "\n",
+ "* `score = FC(tanh(FC(EO) + FC(H)))`\n",
+ "* `attention weights = softmax(score, axis = 1)`. Softmax by default is applied on the last axis but here we want to apply it on the *1st axis*, since the shape of score is *(batch_size, max_length, hidden_size)*. `Max_length` is the length of our input. Since we are trying to assign a weight to each input, softmax should be applied on that axis.\n",
+ "* `context vector = sum(attention weights * EO, axis = 1)`. Same reason as above for choosing axis as 1.\n",
+ "* `embedding output` = The input to the decoder X is passed through an embedding layer.\n",
+ "* `merged vector = concat(embedding output, context vector)`\n",
+ "* This merged vector is then given to the GRU\n",
+ " \n",
+ "The shapes of all the vectors at each step have been specified in the comments in the code:"
+ ]
+ },
+ {
+ "metadata": {
+ "id": "avyJ_4VIUoHb",
+ "colab_type": "code",
+ "colab": {
+ "autoexec": {
+ "startup": false,
+ "wait_interval": 0
+ }
+ }
+ },
+ "cell_type": "code",
+ "source": [
+ "def gru(units):\n",
+ " # If you have a GPU, we recommend using CuDNNGRU(provides a 3x speedup than GRU)\n",
+ " # the code automatically does that.\n",
+ " if tf.test.is_gpu_available():\n",
+ " return tf.keras.layers.CuDNNGRU(units, \n",
+ " return_sequences=True, \n",
+ " return_state=True, \n",
+ " recurrent_initializer='glorot_uniform')\n",
+ " else:\n",
+ " return tf.keras.layers.GRU(units, \n",
+ " return_sequences=True, \n",
+ " return_state=True, \n",
+ " recurrent_activation='sigmoid', \n",
+ " recurrent_initializer='glorot_uniform')"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "metadata": {
+ "id": "nZ2rI24i3jFg",
+ "colab_type": "code",
+ "colab": {
+ "autoexec": {
+ "startup": false,
+ "wait_interval": 0
+ }
+ }
+ },
+ "cell_type": "code",
+ "source": [
+ "class Encoder(tf.keras.Model):\n",
+ " def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):\n",
+ " super(Encoder, self).__init__()\n",
+ " self.batch_sz = batch_sz\n",
+ " self.enc_units = enc_units\n",
+ " self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)\n",
+ " self.gru = gru(self.enc_units)\n",
+ " \n",
+ " def call(self, x, hidden):\n",
+ " x = self.embedding(x)\n",
+ " output, state = self.gru(x, initial_state = hidden) \n",
+ " return output, state\n",
+ " \n",
+ " def initialize_hidden_state(self):\n",
+ " return tf.zeros((self.batch_sz, self.enc_units))"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "metadata": {
+ "id": "yJ_B3mhW3jFk",
+ "colab_type": "code",
+ "colab": {
+ "autoexec": {
+ "startup": false,
+ "wait_interval": 0
+ }
+ }
+ },
+ "cell_type": "code",
+ "source": [
+ "class Decoder(tf.keras.Model):\n",
+ " def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):\n",
+ " super(Decoder, self).__init__()\n",
+ " self.batch_sz = batch_sz\n",
+ " self.dec_units = dec_units\n",
+ " self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)\n",
+ " self.gru = gru(self.dec_units)\n",
+ " self.fc = tf.keras.layers.Dense(vocab_size)\n",
+ " \n",
+ " # used for attention\n",
+ " self.W1 = tf.keras.layers.Dense(self.dec_units)\n",
+ " self.W2 = tf.keras.layers.Dense(self.dec_units)\n",
+ " self.V = tf.keras.layers.Dense(1)\n",
+ " \n",
+ " def call(self, x, hidden, enc_output):\n",
+ " # enc_output shape == (batch_size, max_length, hidden_size)\n",
+ " \n",
+ " # hidden shape == (batch_size, hidden size)\n",
+ " # hidden_with_time_axis shape == (batch_size, 1, hidden size)\n",
+ " # we are doing this to perform addition to calculate the score\n",
+ " hidden_with_time_axis = tf.expand_dims(hidden, 1)\n",
+ " \n",
+ " # score shape == (batch_size, max_length, hidden_size)\n",
+ " score = tf.nn.tanh(self.W1(enc_output) + self.W2(hidden_with_time_axis))\n",
+ " \n",
+ " # attention_weights shape == (batch_size, max_length, 1)\n",
+ " # we get 1 at the last axis because we are applying score to self.V\n",
+ " attention_weights = tf.nn.softmax(self.V(score), axis=1)\n",
+ " \n",
+ " # context_vector shape after sum == (batch_size, hidden_size)\n",
+ " context_vector = attention_weights * enc_output\n",
+ " context_vector = tf.reduce_sum(context_vector, axis=1)\n",
+ " \n",
+ " # x shape after passing through embedding == (batch_size, 1, embedding_dim)\n",
+ " x = self.embedding(x)\n",
+ " \n",
+ " # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)\n",
+ " x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)\n",
+ " \n",
+ " # passing the concatenated vector to the GRU\n",
+ " output, state = self.gru(x)\n",
+ " \n",
+ " # output shape == (batch_size * max_length, hidden_size)\n",
+ " output = tf.reshape(output, (-1, output.shape[2]))\n",
+ " \n",
+ " # output shape == (batch_size * max_length, vocab)\n",
+ " x = self.fc(output)\n",
+ " \n",
+ " return x, state, attention_weights\n",
+ " \n",
+ " def initialize_hidden_state(self):\n",
+ " return tf.zeros((self.batch_sz, self.dec_units))"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "metadata": {
+ "id": "P5UY8wko3jFp",
+ "colab_type": "code",
+ "colab": {
+ "autoexec": {
+ "startup": false,
+ "wait_interval": 0
+ }
+ }
+ },
+ "cell_type": "code",
+ "source": [
+ "encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)\n",
+ "decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "metadata": {
+ "id": "_ch_71VbIRfK",
+ "colab_type": "text"
+ },
+ "cell_type": "markdown",
+ "source": [
+ "## Define the optimizer and the loss function"
+ ]
+ },
+ {
+ "metadata": {
+ "id": "WmTHr5iV3jFr",
+ "colab_type": "code",
+ "colab": {
+ "autoexec": {
+ "startup": false,
+ "wait_interval": 0
+ }
+ }
+ },
+ "cell_type": "code",
+ "source": [
+ "optimizer = tf.train.AdamOptimizer()\n",
+ "\n",
+ "\n",
+ "def loss_function(real, pred):\n",
+ " mask = 1 - np.equal(real, 0)\n",
+ " loss_ = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=real, logits=pred) * mask\n",
+ " return tf.reduce_mean(loss_)"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "metadata": {
+ "id": "hpObfY22IddU",
+ "colab_type": "text"
+ },
+ "cell_type": "markdown",
+ "source": [
+ "## Training\n",
+ "\n",
+ "1. Pass the *input* through the *encoder* which return *encoder output* and the *encoder hidden state*.\n",
+ "2. The encoder output, encoder hidden state and the decoder input (which is the *start token*) is passed to the decoder.\n",
+ "3. The decoder returns the *predictions* and the *decoder hidden state*.\n",
+ "4. The decoder hidden state is then passed back into the model and the predictions are used to calculate the loss.\n",
+ "5. Use *teacher forcing* to decide the next input to the decoder.\n",
+ "6. *Teacher forcing* is the technique where the *target word* is passed as the *next input* to the decoder.\n",
+ "7. The final step is to calculate the gradients and apply it to the optimizer and backpropagate."
+ ]
+ },
+ {
+ "metadata": {
+ "id": "ddefjBMa3jF0",
+ "colab_type": "code",
+ "colab": {
+ "autoexec": {
+ "startup": false,
+ "wait_interval": 0
+ }
+ }
+ },
+ "cell_type": "code",
+ "source": [
+ "EPOCHS = 10\n",
+ "\n",
+ "for epoch in range(EPOCHS):\n",
+ " start = time.time()\n",
+ " \n",
+ " hidden = encoder.initialize_hidden_state()\n",
+ " total_loss = 0\n",
+ " \n",
+ " for (batch, (inp, targ)) in enumerate(dataset):\n",
+ " loss = 0\n",
+ " \n",
+ " with tf.GradientTape() as tape:\n",
+ " enc_output, enc_hidden = encoder(inp, hidden)\n",
+ " \n",
+ " dec_hidden = enc_hidden\n",
+ " \n",
+ " dec_input = tf.expand_dims([targ_lang.word2idx['<start>']] * BATCH_SIZE, 1) \n",
+ " \n",
+ " # Teacher forcing - feeding the target as the next input\n",
+ " for t in range(1, targ.shape[1]):\n",
+ " # passing enc_output to the decoder\n",
+ " predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)\n",
+ " \n",
+ " loss += loss_function(targ[:, t], predictions)\n",
+ " \n",
+ " # using teacher forcing\n",
+ " dec_input = tf.expand_dims(targ[:, t], 1)\n",
+ " \n",
+ " total_loss += (loss / int(targ.shape[1]))\n",
+ " \n",
+ " variables = encoder.variables + decoder.variables\n",
+ " \n",
+ " gradients = tape.gradient(loss, variables)\n",
+ " \n",
+ " optimizer.apply_gradients(zip(gradients, variables), tf.train.get_or_create_global_step())\n",
+ "\n",
+ " if batch % 100 == 0:\n",
+ " print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,\n",
+ " batch,\n",
+ " loss.numpy() / int(targ.shape[1])))\n",
+ " \n",
+ " print('Epoch {} Loss {:.4f}'.format(epoch + 1,\n",
+ " total_loss/len(input_tensor)))\n",
+ " print('Time taken for 1 epoch {} sec\\n'.format(time.time() - start))"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "metadata": {
+ "id": "mU3Ce8M6I3rz",
+ "colab_type": "text"
+ },
+ "cell_type": "markdown",
+ "source": [
+ "## Translate\n",
+ "\n",
+ "* The evaluate function is similar to the training loop, except we don't use *teacher forcing* here. The input to the decoder at each time step is its previous predictions along with the hidden state and the encoder output.\n",
+ "* Stop predicting when the model predicts the *end token*.\n",
+ "* And store the *attention weights for every time step*.\n",
+ "\n",
+ "Note: The encoder output is calculated only once for one input."
+ ]
+ },
+ {
+ "metadata": {
+ "id": "EbQpyYs13jF_",
+ "colab_type": "code",
+ "colab": {
+ "autoexec": {
+ "startup": false,
+ "wait_interval": 0
+ }
+ }
+ },
+ "cell_type": "code",
+ "source": [
+ "def evaluate(sentence, encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ):\n",
+ " attention_plot = np.zeros((max_length_targ, max_length_inp))\n",
+ " \n",
+ " sentence = preprocess_sentence(sentence)\n",
+ "\n",
+ " inputs = [inp_lang.word2idx[i] for i in sentence.split(' ')]\n",
+ " inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs], maxlen=max_length_inp, padding='post')\n",
+ " inputs = tf.convert_to_tensor(inputs)\n",
+ " \n",
+ " result = ''\n",
+ "\n",
+ " hidden = [tf.zeros((1, units))]\n",
+ " enc_out, enc_hidden = encoder(inputs, hidden)\n",
+ "\n",
+ " dec_hidden = enc_hidden\n",
+ " dec_input = tf.expand_dims([targ_lang.word2idx['<start>']], 0)\n",
+ "\n",
+ " for t in range(max_length_targ):\n",
+ " predictions, dec_hidden, attention_weights = decoder(dec_input, dec_hidden, enc_out)\n",
+ " \n",
+ " # storing the attention weigths to plot later on\n",
+ " attention_weights = tf.reshape(attention_weights, (-1, ))\n",
+ " attention_plot[t] = attention_weights.numpy()\n",
+ "\n",
+ " predicted_id = tf.multinomial(tf.exp(predictions), num_samples=1)[0][0].numpy()\n",
+ "\n",
+ " result += targ_lang.idx2word[predicted_id] + ' '\n",
+ "\n",
+ " if targ_lang.idx2word[predicted_id] == '<end>':\n",
+ " return result, sentence, attention_plot\n",
+ " \n",
+ " # the predicted ID is fed back into the model\n",
+ " dec_input = tf.expand_dims([predicted_id], 0)\n",
+ "\n",
+ " return result, sentence, attention_plot"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "metadata": {
+ "id": "s5hQWlbN3jGF",
+ "colab_type": "code",
+ "colab": {
+ "autoexec": {
+ "startup": false,
+ "wait_interval": 0
+ }
+ }
+ },
+ "cell_type": "code",
+ "source": [
+ "# function for plotting the attention weights\n",
+ "def plot_attention(attention, sentence, predicted_sentence):\n",
+ " fig = plt.figure(figsize=(10,10))\n",
+ " ax = fig.add_subplot(1, 1, 1)\n",
+ " ax.matshow(attention, cmap='viridis')\n",
+ " \n",
+ " fontdict = {'fontsize': 14}\n",
+ " \n",
+ " ax.set_xticklabels([''] + sentence, fontdict=fontdict, rotation=90)\n",
+ " ax.set_yticklabels([''] + predicted_sentence, fontdict=fontdict)\n",
+ "\n",
+ " plt.show()"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "metadata": {
+ "id": "sl9zUHzg3jGI",
+ "colab_type": "code",
+ "colab": {
+ "autoexec": {
+ "startup": false,
+ "wait_interval": 0
+ }
+ }
+ },
+ "cell_type": "code",
+ "source": [
+ "def translate(sentence, encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ):\n",
+ " result, sentence, attention_plot = evaluate(sentence, encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ)\n",
+ " \n",
+ " print('Input: {}'.format(sentence))\n",
+ " print('Predicted translation: {}'.format(result))\n",
+ " \n",
+ " attention_plot = attention_plot[:len(result.split(' ')), :len(sentence.split(' '))]\n",
+ " plot_attention(attention_plot, sentence.split(' '), result.split(' '))"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "metadata": {
+ "id": "WrAM0FDomq3E",
+ "colab_type": "code",
+ "colab": {
+ "autoexec": {
+ "startup": false,
+ "wait_interval": 0
+ }
+ }
+ },
+ "cell_type": "code",
+ "source": [
+ "translate('hace mucho frio aqui.', encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ)"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "metadata": {
+ "id": "zSx2iM36EZQZ",
+ "colab_type": "code",
+ "colab": {
+ "autoexec": {
+ "startup": false,
+ "wait_interval": 0
+ }
+ }
+ },
+ "cell_type": "code",
+ "source": [
+ "translate('esta es mi vida.', encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ)"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "metadata": {
+ "id": "A3LLCx3ZE0Ls",
+ "colab_type": "code",
+ "colab": {
+ "autoexec": {
+ "startup": false,
+ "wait_interval": 0
+ }
+ }
+ },
+ "cell_type": "code",
+ "source": [
+ "translate('¿todavia estan en casa?', encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ)"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "metadata": {
+ "id": "DUQVLVqUE1YW",
+ "colab_type": "code",
+ "colab": {
+ "autoexec": {
+ "startup": false,
+ "wait_interval": 0
+ }
+ }
+ },
+ "cell_type": "code",
+ "source": [
+ "# wrong translation\n",
+ "translate('trata de averiguarlo.', encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ)"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ },
+ {
+ "metadata": {
+ "id": "RTe5P5ioMJwN",
+ "colab_type": "text"
+ },
+ "cell_type": "markdown",
+ "source": [
+ "## Next steps\n",
+ "\n",
+ "* [Download a different dataset](http://www.manythings.org/anki/) to experiment with translations, for example, English to German, or English to French.\n",
+ "* Experiment with training on a larger dataset, or using more epochs\n"
+ ]
+ }
+ ]
+} \ No newline at end of file
diff --git a/tensorflow/contrib/gan/python/estimator/python/head_impl.py b/tensorflow/contrib/gan/python/estimator/python/head_impl.py
index ff903a78cc..5b5557bd8f 100644
--- a/tensorflow/contrib/gan/python/estimator/python/head_impl.py
+++ b/tensorflow/contrib/gan/python/estimator/python/head_impl.py
@@ -24,6 +24,7 @@ from tensorflow.contrib.gan.python import namedtuples as tfgan_tuples
from tensorflow.contrib.gan.python import train as tfgan_train
from tensorflow.python.estimator import model_fn as model_fn_lib
from tensorflow.python.estimator.canned import head
+from tensorflow.python.estimator.export import export_output
from tensorflow.python.framework import ops
from tensorflow.python.ops import metrics as metrics_lib
@@ -182,7 +183,10 @@ class GANHead(head._Head): # pylint: disable=protected-access
if mode == model_fn_lib.ModeKeys.PREDICT:
return model_fn_lib.EstimatorSpec(
mode=model_fn_lib.ModeKeys.PREDICT,
- predictions=gan_model.generated_data)
+ predictions=gan_model.generated_data,
+ export_outputs={
+ 'predict': export_output.PredictOutput(gan_model.generated_data)
+ })
elif mode == model_fn_lib.ModeKeys.EVAL:
gan_loss = self.create_loss(
features=None, mode=mode, logits=gan_model, labels=None)
diff --git a/tensorflow/contrib/gan/python/estimator/python/head_test.py b/tensorflow/contrib/gan/python/estimator/python/head_test.py
index 6587f1fc60..5309d87765 100644
--- a/tensorflow/contrib/gan/python/estimator/python/head_test.py
+++ b/tensorflow/contrib/gan/python/estimator/python/head_test.py
@@ -26,8 +26,11 @@ from tensorflow.python.ops import array_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import variable_scope
from tensorflow.python.platform import test
+from tensorflow.python.saved_model import signature_constants
from tensorflow.python.training import training
+_DEFAULT_SERVING_KEY = signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
+
def dummy_loss(gan_model, add_summaries=True): # pylint:disable=unused-argument
return math_ops.reduce_sum(gan_model.discriminator_real_outputs -
@@ -71,13 +74,15 @@ class GANHeadTest(test.TestCase):
return {}
def _test_modes_helper(self, mode):
- self.gan_head.create_estimator_spec(
+ return self.gan_head.create_estimator_spec(
features=None,
mode=mode,
logits=get_gan_model())
def test_modes_predict(self):
- self._test_modes_helper(model_fn_lib.ModeKeys.PREDICT)
+ spec = self._test_modes_helper(model_fn_lib.ModeKeys.PREDICT)
+ self.assertItemsEqual((_DEFAULT_SERVING_KEY, 'predict'),
+ spec.export_outputs.keys())
def test_modes_eval(self):
self._test_modes_helper(model_fn_lib.ModeKeys.EVAL)
diff --git a/tensorflow/contrib/gdr/gdr_server_lib.cc b/tensorflow/contrib/gdr/gdr_server_lib.cc
index 1f9dd0decb..9025c992a4 100644
--- a/tensorflow/contrib/gdr/gdr_server_lib.cc
+++ b/tensorflow/contrib/gdr/gdr_server_lib.cc
@@ -57,7 +57,7 @@ Status GdrServer::Init() {
new GdrWorker(env, remote_memory_manager_.get()));
};
TF_RETURN_IF_ERROR(
- GrpcServer::Init(nullptr, rendezvous_mgr_func, worker_func));
+ GrpcServer::Init(nullptr, rendezvous_mgr_func, nullptr, worker_func));
return remote_memory_manager_->Init();
}
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h b/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h
index a7b0d805a3..4cfaa0f36d 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h
@@ -26,7 +26,7 @@ namespace optimized_ops {
// Enable for arm64 except for the Nvidia Linux 4 Tegra (L4T) running on
// Jetson TX-2. This compiler does not support the offsetof() macro.
#if defined(__aarch64__) && !defined(GOOGLE_L4T)
-
+#include <stddef.h>
// clang-format gets confused with this file and ends up formatting lines to
// be larger than 80 characters. Turn off here and back on at the end of the
// file.
diff --git a/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.h b/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.h
index cbeb53bee7..681448be20 100644
--- a/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.h
+++ b/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.h
@@ -19,7 +19,9 @@ limitations under the License.
#include <string>
#include <vector>
+// Place `<locale>` before <Python.h> to avoid build failures on macOS.
#include <Python.h>
+#include <locale>
// We forward declare TFLite classes here to avoid exposing them to SWIG.
namespace tflite {
diff --git a/tensorflow/contrib/opt/BUILD b/tensorflow/contrib/opt/BUILD
index 4f35de4e5d..bbdf962d04 100644
--- a/tensorflow/contrib/opt/BUILD
+++ b/tensorflow/contrib/opt/BUILD
@@ -29,6 +29,7 @@ py_library(
"python/training/reg_adagrad_optimizer.py",
"python/training/sign_decay.py",
"python/training/variable_clipping_optimizer.py",
+ "python/training/weight_decay_optimizers.py",
],
srcs_version = "PY2AND3",
deps = [
@@ -198,6 +199,25 @@ py_test(
],
)
+py_test(
+ name = "weight_decay_optimizers_test",
+ srcs = ["python/training/weight_decay_optimizers_test.py"],
+ srcs_version = "PY2AND3",
+ deps = [
+ ":opt_py",
+ "//tensorflow/python:array_ops",
+ "//tensorflow/python:client_testlib",
+ "//tensorflow/python:constant_op",
+ "//tensorflow/python:dtypes",
+ "//tensorflow/python:framework_ops",
+ "//tensorflow/python:math_ops",
+ "//tensorflow/python:resource_variable_ops",
+ "//tensorflow/python:session",
+ "//tensorflow/python:variables",
+ "//third_party/py/numpy",
+ ],
+)
+
tf_py_test(
name = "drop_stale_gradient_optimizer_test",
srcs = ["python/training/drop_stale_gradient_optimizer_test.py"],
diff --git a/tensorflow/contrib/opt/__init__.py b/tensorflow/contrib/opt/__init__.py
index b41148329d..65777b1323 100644
--- a/tensorflow/contrib/opt/__init__.py
+++ b/tensorflow/contrib/opt/__init__.py
@@ -22,16 +22,17 @@ from __future__ import print_function
from tensorflow.contrib.opt.python.training.adamax import *
from tensorflow.contrib.opt.python.training.addsign import *
from tensorflow.contrib.opt.python.training.drop_stale_gradient_optimizer import *
+from tensorflow.contrib.opt.python.training.elastic_average_optimizer import *
from tensorflow.contrib.opt.python.training.external_optimizer import *
+from tensorflow.contrib.opt.python.training.ggt import *
from tensorflow.contrib.opt.python.training.lazy_adam_optimizer import *
+from tensorflow.contrib.opt.python.training.model_average_optimizer import *
from tensorflow.contrib.opt.python.training.moving_average_optimizer import *
from tensorflow.contrib.opt.python.training.multitask_optimizer_wrapper import *
from tensorflow.contrib.opt.python.training.nadam_optimizer import *
from tensorflow.contrib.opt.python.training.powersign import *
from tensorflow.contrib.opt.python.training.variable_clipping_optimizer import *
-from tensorflow.contrib.opt.python.training.elastic_average_optimizer import *
-from tensorflow.contrib.opt.python.training.model_average_optimizer import *
-from tensorflow.contrib.opt.python.training.ggt import *
+from tensorflow.contrib.opt.python.training.weight_decay_optimizers import *
# pylint: enable=wildcard-import
from tensorflow.python.util.all_util import remove_undocumented
@@ -47,6 +48,10 @@ _allowed_symbols = [
'LazyAdamOptimizer',
'NadamOptimizer',
'MovingAverageOptimizer',
+ 'MomentumWOptimizer',
+ 'AdamWOptimizer',
+ 'DecoupledWeightDecayExtension',
+ 'extend_with_decoupled_weight_decay',
'ScipyOptimizerInterface',
'VariableClippingOptimizer',
'MultitaskOptimizerWrapper',
diff --git a/tensorflow/contrib/opt/python/training/weight_decay_optimizers.py b/tensorflow/contrib/opt/python/training/weight_decay_optimizers.py
new file mode 100644
index 0000000000..b9cf40eb7b
--- /dev/null
+++ b/tensorflow/contrib/opt/python/training/weight_decay_optimizers.py
@@ -0,0 +1,362 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Base class to make optimizers weight decay ready."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.training import adam
+from tensorflow.python.training import momentum as momentum_opt
+from tensorflow.python.training import optimizer
+from tensorflow.python.util.tf_export import tf_export
+
+
+class DecoupledWeightDecayExtension(object):
+ """This class allows to extend optimizers with decoupled weight decay.
+
+ It implements the decoupled weight decay described by Loshchilov & Hutter
+ (https://arxiv.org/pdf/1711.05101.pdf), in which the weight decay is
+  decoupled from the optimization steps w.r.t. the loss function.
+ For SGD variants, this simplifies hyperparameter search since it decouples
+ the settings of weight decay and learning rate.
+ For adaptive gradient algorithms, it regularizes variables with large
+ gradients more than L2 regularization would, which was shown to yield better
+ training loss and generalization error in the paper above.
+
+ This class alone is not an optimizer but rather extends existing
+ optimizers with decoupled weight decay. We explicitly define the two examples
+ used in the above paper (SGDW and AdamW), but in general this can extend
+  any OptimizerX via `extend_with_decoupled_weight_decay(OptimizerX)` (the
+  weight decay is then passed to the returned class's constructor).
+ In order for it to work, it must be the first class the Optimizer with
+ weight decay inherits from, e.g.
+
+ ```python
+ class AdamWOptimizer(DecoupledWeightDecayExtension, adam.AdamOptimizer):
+ def __init__(self, weight_decay, *args, **kwargs):
+      super(AdamWOptimizer, self).__init__(weight_decay, *args, **kwargs)
+ ```
+
+ Note that this extension decays weights BEFORE applying the update based
+ on the gradient, i.e. this extension only has the desired behaviour for
+  optimizers which do not depend on the value of 'var' in the update step!
+ """
+
+ def __init__(self, weight_decay, **kwargs):
+ """Construct the extension class that adds weight decay to an optimizer.
+
+ Args:
+ weight_decay: A `Tensor` or a floating point value, the factor by which
+ a variable is decayed in the update step.
+      **kwargs: Optional keyword arguments that are forwarded to the
+        constructor of the base optimizer.
+ """
+ self._decay_var_list = None # is set in minimize or apply_gradients
+ self._weight_decay = weight_decay
+ # The tensors are initialized in call to _prepare
+ self._weight_decay_tensor = None
+ super(DecoupledWeightDecayExtension, self).__init__(**kwargs)
+
+ def minimize(self, loss, global_step=None, var_list=None,
+ gate_gradients=optimizer.Optimizer.GATE_OP,
+ aggregation_method=None, colocate_gradients_with_ops=False,
+ name=None, grad_loss=None, decay_var_list=None):
+ """Add operations to minimize `loss` by updating `var_list` with decay.
+
+    This function is the same as Optimizer.minimize except that it allows
+    specifying the variables that should be decayed using decay_var_list.
+ If decay_var_list is None, all variables in var_list are decayed.
+
+ For more information see the documentation of Optimizer.minimize.
+
+ Args:
+ loss: A `Tensor` containing the value to minimize.
+ global_step: Optional `Variable` to increment by one after the
+ variables have been updated.
+ var_list: Optional list or tuple of `Variable` objects to update to
+ minimize `loss`. Defaults to the list of variables collected in
+ the graph under the key `GraphKeys.TRAINABLE_VARIABLES`.
+ gate_gradients: How to gate the computation of gradients. Can be
+ `GATE_NONE`, `GATE_OP`, or `GATE_GRAPH`.
+ aggregation_method: Specifies the method used to combine gradient terms.
+ Valid values are defined in the class `AggregationMethod`.
+ colocate_gradients_with_ops: If True, try colocating gradients with
+ the corresponding op.
+ name: Optional name for the returned operation.
+ grad_loss: Optional. A `Tensor` holding the gradient computed for `loss`.
+ decay_var_list: Optional list of decay variables.
+
+ Returns:
+ An Operation that updates the variables in `var_list`. If `global_step`
+ was not `None`, that operation also increments `global_step`.
+
+ """
+ self._decay_var_list = set(decay_var_list) if decay_var_list else False
+ return super(DecoupledWeightDecayExtension, self).minimize(
+ loss, global_step=global_step, var_list=var_list,
+ gate_gradients=gate_gradients, aggregation_method=aggregation_method,
+ colocate_gradients_with_ops=colocate_gradients_with_ops, name=name,
+ grad_loss=grad_loss)
+
+ def apply_gradients(self, grads_and_vars, global_step=None, name=None,
+ decay_var_list=None):
+ """Apply gradients to variables and decay the variables.
+
+ This function is the same as Optimizer.apply_gradients except that it
+    allows specifying the variables that should be decayed using
+ decay_var_list. If decay_var_list is None, all variables in var_list
+ are decayed.
+
+ For more information see the documentation of Optimizer.apply_gradients.
+
+ Args:
+ grads_and_vars: List of (gradient, variable) pairs as returned by
+ `compute_gradients()`.
+ global_step: Optional `Variable` to increment by one after the
+ variables have been updated.
+      name: Optional name for the returned operation. Defaults to the
+ name passed to the `Optimizer` constructor.
+ decay_var_list: Optional list of decay variables.
+
+ Returns:
+ An `Operation` that applies the specified gradients. If `global_step`
+ was not None, that operation also increments `global_step`.
+ """
+ self._decay_var_list = set(decay_var_list) if decay_var_list else False
+ return super(DecoupledWeightDecayExtension, self).apply_gradients(
+ grads_and_vars, global_step=global_step, name=name)
+
+ def _prepare(self):
+ weight_decay = self._weight_decay
+ if callable(weight_decay):
+ weight_decay = weight_decay()
+ self._weight_decay_tensor = ops.convert_to_tensor(
+ weight_decay, name="weight_decay")
+    # Call the optimizer's _prepare function.
+ super(DecoupledWeightDecayExtension, self)._prepare()
+
+ def _decay_weights_op(self, var):
+ if not self._decay_var_list or var in self._decay_var_list:
+ return var.assign_sub(self._weight_decay * var, self._use_locking)
+ return control_flow_ops.no_op()
+
+ def _decay_weights_sparse_op(self, var, indices, scatter_add):
+ if not self._decay_var_list or var in self._decay_var_list:
+ return scatter_add(var, indices, -self._weight_decay * var,
+ self._use_locking)
+ return control_flow_ops.no_op()
+
+ # Here, we overwrite the apply functions that the base optimizer calls.
+ # super().apply_x resolves to the apply_x function of the BaseOptimizer.
+ def _apply_dense(self, grad, var):
+ with ops.control_dependencies([self._decay_weights_op(var)]):
+ return super(DecoupledWeightDecayExtension, self)._apply_dense(grad, var)
+
+ def _resource_apply_dense(self, grad, var):
+ with ops.control_dependencies([self._decay_weights_op(var)]):
+ return super(DecoupledWeightDecayExtension, self)._resource_apply_dense(
+ grad, var)
+
+ def _apply_sparse(self, grad, var):
+ scatter_add = state_ops.scatter_add
+ decay_op = self._decay_weights_sparse_op(var, grad.indices, scatter_add)
+ with ops.control_dependencies([decay_op]):
+ return super(DecoupledWeightDecayExtension, self)._apply_sparse(
+ grad, var)
+
+ def _resource_scatter_add(self, x, i, v, _=None):
+    # The last argument is accepted and ignored so that this helper has the
+    # same function signature as state_ops.scatter_add.
+ with ops.control_dependencies(
+ [resource_variable_ops.resource_scatter_add(x.handle, i, v)]):
+ return x.value()
+
+ def _resource_apply_sparse(self, grad, var, indices):
+ scatter_add = self._resource_scatter_add
+ decay_op = self._decay_weights_sparse_op(var, indices, scatter_add)
+ with ops.control_dependencies([decay_op]):
+ return super(DecoupledWeightDecayExtension, self)._resource_apply_sparse(
+ grad, var, indices)
+
+
+def extend_with_decoupled_weight_decay(base_optimizer):
+ """Factory function returning an optimizer class with decoupled weight decay.
+
+ Returns an optimizer class. An instance of the returned class computes the
+ update step of `base_optimizer` and additionally decays the weights.
+ E.g., the class returned by
+ `extend_with_decoupled_weight_decay(tf.train.AdamOptimizer)` is equivalent to
+ `tf.contrib.opt.AdamWOptimizer`.
+
+ The API of the new optimizer class slightly differs from the API of the
+ base optimizer:
+ - The first argument to the constructor is the weight decay rate.
+ - `minimize` and `apply_gradients` accept the optional keyword argument
+ `decay_var_list`, which specifies the variables that should be decayed.
+ If `None`, all variables that are optimized are decayed.
+
+ Usage example:
+ ```python
+ # MyAdamW is a new class
+ MyAdamW = extend_with_decoupled_weight_decay(tf.train.AdamOptimizer)
+ # Create a MyAdamW object
+ optimizer = MyAdamW(weight_decay=0.001, learning_rate=0.001)
+  sess.run(optimizer.minimize(loss, decay_var_list=[var1, var2]))
+  ```
+
+  Note that this extension decays weights BEFORE applying the update based
+  on the gradient, i.e. this extension only has the desired behaviour for
+  optimizers which do not depend on the value of 'var' in the update step!
+
+ Args:
+ base_optimizer: An optimizer class that inherits from tf.train.Optimizer.
+
+ Returns:
+ A new optimizer class that inherits from DecoupledWeightDecayExtension
+ and base_optimizer.
+ """
+
+ class OptimizerWithDecoupledWeightDecay(DecoupledWeightDecayExtension,
+ base_optimizer):
+ """Base_optimizer with decoupled weight decay.
+
+ This class computes the update step of `base_optimizer` and
+ additionally decays the variable with the weight decay being decoupled from
+    the optimization steps w.r.t. the loss function, as described by
+ Loshchilov & Hutter (https://arxiv.org/pdf/1711.05101.pdf).
+ For SGD variants, this simplifies hyperparameter search since
+ it decouples the settings of weight decay and learning rate.
+ For adaptive gradient algorithms, it regularizes variables with large
+ gradients more than L2 regularization would, which was shown to yield
+ better training loss and generalization error in the paper above.
+ """
+
+ def __init__(self, weight_decay, *args, **kwargs):
+ # super delegation is necessary here
+ # pylint: disable=useless-super-delegation
+ super(OptimizerWithDecoupledWeightDecay, self).__init__(
+ weight_decay, *args, **kwargs)
+ # pylint: enable=useless-super-delegation
+
+ return OptimizerWithDecoupledWeightDecay
+
+
+@tf_export("contrib.opt.MomentumWOptimizer")
+class MomentumWOptimizer(DecoupledWeightDecayExtension,
+ momentum_opt.MomentumOptimizer):
+ """Optimizer that implements the Momentum algorithm with weight_decay.
+
+ This is an implementation of the SGDW optimizer described in "Fixing
+ Weight Decay Regularization in Adam" by Loshchilov & Hutter
+ (https://arxiv.org/abs/1711.05101)
+  ([pdf](https://arxiv.org/pdf/1711.05101.pdf)).
+ It computes the update step of `train.MomentumOptimizer` and additionally
+ decays the variable. Note that this is different from adding
+ L2 regularization on the variables to the loss. Decoupling the weight decay
+ from other hyperparameters (in particular the learning rate) simplifies
+ hyperparameter search.
+
+ For further information see the documentation of the Momentum Optimizer.
+
+ Note that this optimizer can also be instantiated as
+ ```python
+  extend_with_decoupled_weight_decay(tf.train.MomentumOptimizer)(
+      weight_decay=weight_decay, learning_rate=learning_rate, momentum=momentum)
+ ```
+ """
+
+ def __init__(self, weight_decay, learning_rate, momentum,
+ use_locking=False, name="MomentumW", use_nesterov=False):
+ """Construct a new MomentumW optimizer.
+
+ For further information see the documentation of the Momentum Optimizer.
+
+ Args:
+ weight_decay: A `Tensor` or a floating point value. The weight decay.
+ learning_rate: A `Tensor` or a floating point value. The learning rate.
+ momentum: A `Tensor` or a floating point value. The momentum.
+ use_locking: If `True` use locks for update operations.
+ name: Optional name prefix for the operations created when applying
+ gradients. Defaults to "Momentum".
+ use_nesterov: If `True` use Nesterov Momentum.
+ See [Sutskever et al., 2013](
+ http://jmlr.org/proceedings/papers/v28/sutskever13.pdf).
+ This implementation always computes gradients at the value of the
+ variable(s) passed to the optimizer. Using Nesterov Momentum makes the
+ variable(s) track the values called `theta_t + mu*v_t` in the paper.
+
+ @compatibility(eager)
+ When eager execution is enabled, learning_rate, weight_decay and momentum
+ can each be a callable that takes no arguments and returns the actual value
+ to use. This can be useful for changing these values across different
+ invocations of optimizer functions.
+ @end_compatibility
+ """
+ super(MomentumWOptimizer, self).__init__(
+ weight_decay, learning_rate=learning_rate, momentum=momentum,
+ use_locking=use_locking, name=name, use_nesterov=use_nesterov)
+
+
+@tf_export("contrib.opt.AdamWOptimizer")
+class AdamWOptimizer(DecoupledWeightDecayExtension, adam.AdamOptimizer):
+ """Optimizer that implements the Adam algorithm with weight decay.
+
+ This is an implementation of the AdamW optimizer described in "Fixing
+ Weight Decay Regularization in Adam" by Loshchilov & Hutter
+ (https://arxiv.org/abs/1711.05101)
+  ([pdf](https://arxiv.org/pdf/1711.05101.pdf)).
+
+ It computes the update step of `train.AdamOptimizer` and additionally decays
+ the variable. Note that this is different from adding L2 regularization on
+ the variables to the loss: it regularizes variables with large
+ gradients more than L2 regularization would, which was shown to yield better
+ training loss and generalization error in the paper above.
+
+ For further information see the documentation of the Adam Optimizer.
+
+ Note that this optimizer can also be instantiated as
+ ```python
+  extend_with_decoupled_weight_decay(tf.train.AdamOptimizer)(weight_decay=weight_decay)
+ ```
+ """
+
+ def __init__(self, weight_decay, learning_rate=0.001, beta1=0.9, beta2=0.999,
+ epsilon=1e-8, use_locking=False, name="AdamW"):
+ """Construct a new AdamW optimizer.
+
+ For further information see the documentation of the Adam Optimizer.
+
+ Args:
+ weight_decay: A `Tensor` or a floating point value. The weight decay.
+ learning_rate: A Tensor or a floating point value. The learning rate.
+ beta1: A float value or a constant float tensor.
+ The exponential decay rate for the 1st moment estimates.
+ beta2: A float value or a constant float tensor.
+ The exponential decay rate for the 2nd moment estimates.
+ epsilon: A small constant for numerical stability. This epsilon is
+ "epsilon hat" in the Kingma and Ba paper (in the formula just before
+ Section 2.1), not the epsilon in Algorithm 1 of the paper.
+ use_locking: If True use locks for update operations.
+ name: Optional name for the operations created when applying gradients.
+ Defaults to "Adam".
+ """
+ super(AdamWOptimizer, self).__init__(
+ weight_decay, learning_rate=learning_rate, beta1=beta1, beta2=beta2,
+ epsilon=epsilon, use_locking=use_locking, name=name)
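The two concrete optimizers added above, and the `extend_with_decoupled_weight_decay` factory, are used like their `tf.train` counterparts, except that the weight decay is the first constructor argument and `minimize`/`apply_gradients` accept an optional `decay_var_list`. A minimal graph-mode usage sketch, assuming TF 1.x with this contrib module available; the variables `w`, `b` and the toy loss are made up for illustration:

```python
# Illustrative sketch of the weight-decay optimizers added in this change (not part of the patch).
import tensorflow as tf

w = tf.get_variable("w", shape=[3], initializer=tf.ones_initializer())
b = tf.get_variable("b", shape=[3], initializer=tf.zeros_initializer())
loss = tf.reduce_sum(tf.square(w - 2.0)) + tf.reduce_sum(tf.square(b))

# AdamW: the decoupled decay is applied to the variable before the Adam update.
opt = tf.contrib.opt.AdamWOptimizer(weight_decay=0.01, learning_rate=0.001)
# Only `w` is decayed; `b` still receives the Adam update but no decay.
train_op = opt.minimize(loss, decay_var_list=[w])

# The same extension works for any tf.train optimizer, e.g. plain SGD -> SGDW.
SGDW = tf.contrib.opt.extend_with_decoupled_weight_decay(
    tf.train.GradientDescentOptimizer)
sgdw_train_op = SGDW(weight_decay=0.01, learning_rate=0.1).minimize(loss)

with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  sess.run([train_op, sgdw_train_op])
```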
diff --git a/tensorflow/contrib/opt/python/training/weight_decay_optimizers_test.py b/tensorflow/contrib/opt/python/training/weight_decay_optimizers_test.py
new file mode 100644
index 0000000000..76d8a5697a
--- /dev/null
+++ b/tensorflow/contrib/opt/python/training/weight_decay_optimizers_test.py
@@ -0,0 +1,188 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for optimizers with weight decay."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.opt.python.training import weight_decay_optimizers
+from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+from tensorflow.python.training import adam
+
+WEIGHT_DECAY = 0.01
+
+
+def adamw_update_numpy(param, g_t, t, m, v, lr=0.001, beta1=0.9,
+ beta2=0.999, epsilon=1e-8):
+ lr_t = lr * np.sqrt(1 - beta2**t) / (1 - beta1**t)
+
+ m_t = beta1 * m + (1 - beta1) * g_t
+ v_t = beta2 * v + (1 - beta2) * g_t * g_t
+
+ param_t = (param - lr_t * m_t / (np.sqrt(v_t) + epsilon) -
+ (param * WEIGHT_DECAY))
+ return param_t, m_t, v_t
+
+
+def momentumw_update_numpy(param, g_t, m, lr=0.001, momentum=0.9, **_):
+ # v, t are not needed for momentum optimizer
+ m = momentum * m + g_t
+ param_t = param - lr * m - param * WEIGHT_DECAY
+ return param_t, m, None
+
+
+class WeightDecayOptimizerTest(test.TestCase):
+
+ def doTest(self, optimizer, update_fn, optimizer_name, slot_name,
+ use_resource=False, do_sparse=False):
+ for i, dtype in enumerate([dtypes.half, dtypes.float32, dtypes.float64]):
+ with self.test_session(graph=ops.Graph()):
+ # Initialize variables for numpy implementation.
+ m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
+ var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+ grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+ var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+ grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+
+ if use_resource:
+ var0 = resource_variable_ops.ResourceVariable(
+ var0_np, name="var0_%d" % i)
+ var1 = resource_variable_ops.ResourceVariable(
+ var1_np, name="var1_%d" % i)
+ else:
+ var0 = variables.Variable(var0_np)
+ var1 = variables.Variable(var1_np)
+
+ if do_sparse:
+ grads0_np_indices = np.array([0, 1], dtype=np.int32)
+ grads0 = ops.IndexedSlices(constant_op.constant(grads0_np),
+ constant_op.constant(grads0_np_indices),
+ constant_op.constant([2]))
+ grads1_np_indices = np.array([0, 1], dtype=np.int32)
+ grads1 = ops.IndexedSlices(constant_op.constant(grads1_np),
+ constant_op.constant(grads1_np_indices),
+ constant_op.constant([2]))
+ else:
+ grads0 = constant_op.constant(grads0_np)
+ grads1 = constant_op.constant(grads1_np)
+
+ opt = optimizer()
+ update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+
+ if not context.executing_eagerly():
+ with ops.Graph().as_default():
+ # Shouldn't return non-slot variables from other graphs.
+ self.assertEqual(0, len(opt.variables()))
+ self.evaluate(variables.global_variables_initializer())
+ # Fetch params to validate initial values
+ self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+ self.assertAllClose([3.0, 4.0], self.evaluate(var1))
+
+ # Run 3 steps of the optimizer
+ for t in range(1, 4):
+ if not context.executing_eagerly():
+ self.evaluate(update)
+ elif t > 1:
+ opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+
+ var0_np, m0, v0 = update_fn(var0_np, grads0_np, t=t, m=m0, v=v0)
+ var1_np, m1, v1 = update_fn(var1_np, grads1_np, t=t, m=m1, v=v1)
+
+ # Validate updated params
+ self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+ self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
+ if use_resource:
+ self.assertEqual("var0_%d/%s:0" % (i, optimizer_name),
+ opt.get_slot(var=var0, name=slot_name).name)
+
+
+class AdamWOptimizerTest(WeightDecayOptimizerTest):
+
+ @staticmethod
+ def get_optimizer():
+ return weight_decay_optimizers.AdamWOptimizer(WEIGHT_DECAY)
+
+ def testSparse(self):
+ self.doTest(self.get_optimizer, adamw_update_numpy, "AdamW", "m",
+ use_resource=False, do_sparse=True)
+
+ def testResourceSparse(self):
+ self.doTest(self.get_optimizer, adamw_update_numpy, "AdamW", "m",
+ use_resource=True, do_sparse=True)
+
+ def testBasic(self):
+ self.doTest(self.get_optimizer, adamw_update_numpy, "AdamW", "m",
+ use_resource=False)
+
+ @test_util.run_in_graph_and_eager_modes(reset_test=True)
+ def testResourceBasic(self):
+ self.doTest(self.get_optimizer, adamw_update_numpy, "AdamW", "m",
+ use_resource=True)
+
+
+class MomentumWOptimizerTest(WeightDecayOptimizerTest):
+
+ @staticmethod
+ def get_optimizer():
+ return weight_decay_optimizers.MomentumWOptimizer(WEIGHT_DECAY, 0.001, 0.9)
+
+ def testSparse(self):
+ self.doTest(self.get_optimizer, momentumw_update_numpy, "MomentumW",
+ "momentum", use_resource=False, do_sparse=True)
+
+ def testResourceSparse(self):
+ self.doTest(self.get_optimizer, momentumw_update_numpy, "MomentumW",
+ "momentum", use_resource=True, do_sparse=True)
+
+ def testBasic(self):
+ self.doTest(self.get_optimizer, momentumw_update_numpy, "MomentumW",
+ "momentum", use_resource=False)
+
+ @test_util.run_in_graph_and_eager_modes(reset_test=True)
+ def testResourceBasic(self):
+ self.doTest(self.get_optimizer, momentumw_update_numpy, "MomentumW",
+ "momentum", use_resource=True)
+
+
+class ExtendWithWeightDecayTest(WeightDecayOptimizerTest):
+
+ @staticmethod
+ def get_optimizer():
+ adamw = weight_decay_optimizers.extend_with_decoupled_weight_decay(
+ adam.AdamOptimizer)
+ return adamw(WEIGHT_DECAY)
+
+ def testBasic(self):
+ self.doTest(self.get_optimizer, adamw_update_numpy, "Adam", "m",
+ use_resource=False)
+
+ @test_util.run_in_graph_and_eager_modes(reset_test=True)
+ def testResourceBasic(self):
+ self.doTest(self.get_optimizer, adamw_update_numpy, "Adam", "m",
+ use_resource=True)
+
+
+if __name__ == "__main__":
+ test.main()
diff --git a/tensorflow/contrib/solvers/python/ops/linear_equations.py b/tensorflow/contrib/solvers/python/ops/linear_equations.py
index 9305c6a11c..85918bf850 100644
--- a/tensorflow/contrib/solvers/python/ops/linear_equations.py
+++ b/tensorflow/contrib/solvers/python/ops/linear_equations.py
@@ -28,7 +28,6 @@ from tensorflow.python.ops import array_ops
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.ops import linalg_ops
from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import linalg_ops
def conjugate_gradient(operator,
diff --git a/tensorflow/contrib/tensorrt/BUILD b/tensorflow/contrib/tensorrt/BUILD
index a5d8b061b6..adda0b758b 100644
--- a/tensorflow/contrib/tensorrt/BUILD
+++ b/tensorflow/contrib/tensorrt/BUILD
@@ -49,7 +49,6 @@ tf_cuda_cc_test(
tf_custom_op_library(
name = "python/ops/_trt_engine_op.so",
srcs = [
- "ops/trt_calib_op.cc",
"ops/trt_engine_op.cc",
],
deps = [
@@ -76,11 +75,9 @@ tf_cuda_library(
cc_library(
name = "trt_engine_op_kernel",
srcs = [
- "kernels/trt_calib_op.cc",
"kernels/trt_engine_op.cc",
],
hdrs = [
- "kernels/trt_calib_op.h",
"kernels/trt_engine_op.h",
],
copts = tf_copts(),
@@ -89,20 +86,22 @@ cc_library(
":trt_logging",
":trt_plugins",
":trt_resources",
+ ":trt_conversion",
+ ":utils",
"//tensorflow/core:gpu_headers_lib",
"//tensorflow/core:lib_proto_parsing",
"//tensorflow/core:stream_executor_headers_lib",
+ "//tensorflow/core/grappler/costs:graph_properties",
] + if_tensorrt([
"@local_config_tensorrt//:nv_infer",
]) + tf_custom_op_library_additional_deps(),
- # TODO(laigd)
+ # TODO(laigd): fix this by merging header file in cc file.
alwayslink = 1, # buildozer: disable=alwayslink-with-hdrs
)
tf_gen_op_libs(
op_lib_names = [
"trt_engine_op",
- "trt_calib_op",
],
)
@@ -122,7 +121,6 @@ tf_gen_op_wrapper_py(
name = "trt_engine_op",
gen_locally = True,
deps = [
- ":trt_calib_op_op_lib",
":trt_engine_op_op_lib",
":trt_logging",
":trt_shape_function",
@@ -140,7 +138,6 @@ tf_custom_op_py_library(
kernels = [
":trt_engine_op_kernel",
":trt_engine_op_op_lib",
- ":trt_calib_op_op_lib",
":trt_shape_function",
],
srcs_version = "PY2AND3",
@@ -191,7 +188,6 @@ tf_py_wrap_cc(
deps = [
":trt_conversion",
":trt_engine_op_kernel",
- "//tensorflow/core:framework_lite",
"//third_party/python_runtime:headers",
],
)
@@ -211,6 +207,7 @@ tf_cuda_library(
],
deps = [
":trt_logging",
+ ":utils",
"//tensorflow/core:framework_headers_lib",
"//tensorflow/core:framework_lite",
"//tensorflow/core:lib_proto_parsing",
@@ -237,12 +234,12 @@ tf_cuda_library(
":trt_plugins",
":trt_logging",
":trt_resources",
+ ":utils",
"//tensorflow/core/grappler/clusters:cluster",
"//tensorflow/core/grappler/optimizers:custom_graph_optimizer",
"//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry",
"//tensorflow/core/grappler:grappler_item",
"//tensorflow/core/grappler:utils",
- "//tensorflow/core:framework",
"//tensorflow/core:gpu_runtime",
"//tensorflow/core:framework_lite",
"//tensorflow/core:graph",
@@ -343,3 +340,8 @@ py_test(
"//tensorflow/python:framework_test_lib",
],
)
+
+cc_library(
+ name = "utils",
+ hdrs = ["convert/utils.h"],
+)
diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph.cc b/tensorflow/contrib/tensorrt/convert/convert_graph.cc
index da4dd5a14c..4dc1c551cc 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_graph.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_graph.cc
@@ -14,8 +14,8 @@ limitations under the License.
==============================================================================*/
#include "tensorflow/contrib/tensorrt/convert/convert_graph.h"
-#include "tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h"
+#include <fstream>
#include <list>
#include <map>
#include <set>
@@ -24,10 +24,17 @@ limitations under the License.
#include <vector>
#include "tensorflow/contrib/tensorrt/convert/convert_nodes.h"
+#include "tensorflow/contrib/tensorrt/convert/utils.h"
+#include "tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h"
+#include "tensorflow/contrib/tensorrt/resources/trt_resource_manager.h"
+#include "tensorflow/contrib/tensorrt/resources/trt_resources.h"
#include "tensorflow/contrib/tensorrt/segment/segment.h"
#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
#include "tensorflow/core/common_runtime/gpu/gpu_id_manager.h"
#include "tensorflow/core/common_runtime/gpu/process_state.h"
+#include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/framework/graph_to_functiondef.h"
+#include "tensorflow/core/framework/node_def_builder.h"
#include "tensorflow/core/graph/algorithm.h"
#include "tensorflow/core/graph/graph.h"
#include "tensorflow/core/graph/graph_constructor.h"
@@ -39,17 +46,39 @@ limitations under the License.
#include "tensorflow/core/grappler/utils.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/strings/numbers.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/protobuf/config.pb.h" // NOLINT
#include "tensorflow/core/protobuf/device_properties.pb.h" // NOLINT
+#include "tensorflow/core/protobuf/rewriter_config.pb.h" // NOLINT
+#include "tensorflow/core/util/device_name_utils.h"
#if GOOGLE_CUDA
#if GOOGLE_TENSORRT
+#include "cuda/include/cuda_runtime_api.h"
#include "tensorrt/include/NvInfer.h"
-
namespace tensorflow {
namespace tensorrt {
namespace convert {
+using ::tensorflow::strings::StrAppend;
+using ::tensorflow::strings::StrCat;
+
+// Returns compiled TRT version information {Maj, Min, Patch}
+std::vector<int> GetLinkedTensorRTVersion() {
+ return {NV_TENSORRT_MAJOR, NV_TENSORRT_MINOR, NV_TENSORRT_PATCH};
+}
+
+// Returns loaded TRT library version {Maj, Min, Patch}
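+// For example, getInferLibVersion() == 4012 decodes to {4, 0, 12}.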
+std::vector<int> GetLoadedTensorRTVersion() {
+ int ver = getInferLibVersion();
+ int ver_major = ver / 1000;
+ ver = ver - ver_major * 1000;
+ int ver_minor = ver / 100;
+ int ver_patch = ver - ver_minor * 100;
+ return {ver_major, ver_minor, ver_patch};
+}
+
namespace {
bool IsTensorRTCandidate(const tensorflow::Node* node) {
@@ -82,229 +111,6 @@ bool IsTensorRTCandidate(const tensorflow::Node* node) {
PluginFactoryTensorRT::GetInstance()->IsPlugin(node->type_string()));
}
-void GetSubGraphIncomingEdges(const tensorflow::Graph& graph,
- const std::set<int>& subgraph_node_ids,
- tensorflow::EdgeSet* incoming_edges) {
- for (int node_id : subgraph_node_ids) {
- const tensorflow::Node* node = graph.FindNodeId(node_id);
- for (const tensorflow::Edge* edge : node->in_edges()) {
- if (!subgraph_node_ids.count(edge->src()->id()) &&
- !edge->src()->IsSource() && !edge->IsControlEdge()) {
- incoming_edges->insert(edge);
- VLOG(2) << "INCOMING " << edge->src()->name() << " -> " << node->name()
- << " Y, ";
- } else {
- VLOG(2) << "INCOMING " << edge->src()->name() << " -> " << node->name()
- << " N, ";
- }
- }
- }
-}
-
-void GetSubGraphOutgoingEdges(const tensorflow::Graph& graph,
- const std::set<int>& subgraph_node_ids,
- tensorflow::EdgeSet* outgoing_edges) {
- for (int node_id : subgraph_node_ids) {
- const tensorflow::Node* node = graph.FindNodeId(node_id);
- for (const tensorflow::Edge* edge : node->out_edges()) {
- if (!subgraph_node_ids.count(edge->dst()->id()) &&
- !edge->dst()->IsSink() && !edge->IsControlEdge()) {
- VLOG(2) << "OUTGOING " << node->name() << " -> " << edge->dst()->name()
- << " Y, ";
- outgoing_edges->insert(edge);
- } else {
- VLOG(2) << "OUTGOING " << node->name() << " -> " << edge->dst()->name()
- << " N, ";
- }
- }
- }
-}
-
-std::pair<string, int> ParseTensorName(const string& name,
- int default_idx = 0) {
- string name_no_idx = name;
- int idx = default_idx;
- const size_t sep = name_no_idx.find_last_of(':');
- if (sep != string::npos) {
- name_no_idx = name_no_idx.substr(0, sep);
- idx = std::stoi(name.substr(sep + 1));
- }
- return std::make_pair(name_no_idx, idx);
-}
-
-std::unordered_map<string, std::vector<int>> BuildTensorNameMap(
- const std::vector<string>& tensor_names) {
- std::unordered_map<string, std::vector<int>> result;
- for (const string& tensor_name : tensor_names) {
- string node_name;
- int index;
- std::tie(node_name, index) = ParseTensorName(tensor_name);
- result[node_name].push_back(index);
- }
- return result;
-}
-
-// TODO(sami): convert references to pointers
-struct ConvertGraphParams {
- ConvertGraphParams(
- tensorflow::Graph& inp_graph,
- const std::vector<string>& output_node_names,
- const std::set<int>& subgraph_node_id_numbers,
- size_t max_supported_batch_size, size_t max_consumed_workspace_size_bytes,
- const tensorflow::grappler::GraphProperties& current_graph_properties,
- std::unordered_map<string, std::pair<int, string>>* output_edges,
- int engine_precision_mode, const string& device_name,
- std::shared_ptr<nvinfer1::IGpuAllocator> allocator, int cuda_gpu_id)
- : graph(inp_graph),
- output_names(output_node_names),
- subgraph_node_ids(subgraph_node_id_numbers),
- max_batch_size(max_supported_batch_size),
- max_workspace_size_bytes(max_consumed_workspace_size_bytes),
- graph_properties(current_graph_properties),
- output_edge_map(output_edges),
- precision_mode(engine_precision_mode),
- device_name_(device_name),
- allocator_(allocator),
- cuda_gpu_id_(cuda_gpu_id) {}
- tensorflow::Graph& graph;
- const std::vector<string>& output_names;
- const std::set<int>& subgraph_node_ids;
- size_t max_batch_size;
- size_t max_workspace_size_bytes;
- const tensorflow::grappler::GraphProperties& graph_properties;
- std::unordered_map<string, std::pair<int, string>>* output_edge_map;
- int precision_mode;
- string device_name_;
- std::shared_ptr<nvinfer1::IGpuAllocator> allocator_;
- int cuda_gpu_id_;
- std::vector<std::pair<int, int>> subgraph_inputs;
- std::vector<std::pair<int, int>> subgraph_outputs;
- tensorflow::EdgeSet subgraph_incoming_edges;
- tensorflow::EdgeSet subgraph_outgoing_edges;
-};
-
-static tensorflow::Status FillSubGraphEdgeSets(ConvertGraphParams* p) {
- GetSubGraphIncomingEdges(p->graph, p->subgraph_node_ids,
- &p->subgraph_incoming_edges);
-
- std::set<std::pair<int, int>> unique_tensors;
- // Add only unique input source nodes. If output of an outside node is shared
- // between multiple nodes inside the engine, only one edge should be created
- for (const tensorflow::Edge* edge : p->subgraph_incoming_edges) {
- unique_tensors.insert({edge->src()->id(), edge->src_output()});
- }
- p->subgraph_inputs.insert(p->subgraph_inputs.begin(), unique_tensors.begin(),
- unique_tensors.end());
- GetSubGraphOutgoingEdges(p->graph, p->subgraph_node_ids,
- &p->subgraph_outgoing_edges);
- unique_tensors.clear();
- // Similar to above, if multiple ouside nodes are sharing the output of an
- // internal node only one output port should be created and shared between
- // outputs
- for (const tensorflow::Edge* edge : p->subgraph_outgoing_edges) {
- unique_tensors.insert({edge->src()->id(), edge->src_output()});
- }
- p->subgraph_outputs.reserve(unique_tensors.size());
- p->subgraph_outputs.insert(p->subgraph_outputs.begin(),
- unique_tensors.begin(), unique_tensors.end());
- return tensorflow::Status::OK();
-}
-
-tensorflow::Status GetCalibNode(ConvertGraphParams* params) {
- TF_RETURN_IF_ERROR(FillSubGraphEdgeSets(params));
- tensorflow::NodeDef trt_node_def;
- SubGraphParams s(params->graph, params->subgraph_node_ids,
- params->subgraph_inputs, params->subgraph_outputs,
- params->max_batch_size, params->max_workspace_size_bytes,
- params->graph_properties, params->output_edge_map,
- &trt_node_def, params->precision_mode, params->device_name_,
- params->allocator_, params->cuda_gpu_id_);
- TF_RETURN_IF_ERROR(InjectCalibrationNode(s));
- tensorflow::Status status;
- tensorflow::Node* trt_node = params->graph.AddNode(trt_node_def, &status);
-
- TF_RETURN_IF_ERROR(status);
-
- for (auto in_edge :
- params->subgraph_incoming_edges) { // loop over incoming edges and
- // attach them to calib node
- auto src_output = in_edge->src_output();
- auto dst_node = in_edge->dst();
- auto dst_input = in_edge->dst_input();
- VLOG(1) << " update edge " << trt_node->name() << ":" << src_output
- << " -> " << dst_node->name() << ":" << dst_input;
- TF_RETURN_IF_ERROR(
- params->graph.UpdateEdge(trt_node, src_output, dst_node, dst_input));
- }
- return tensorflow::Status::OK();
-}
-
-tensorflow::Status ConvertSubGraphToTensorRT(ConvertGraphParams* params) {
- TF_RETURN_IF_ERROR(FillSubGraphEdgeSets(params));
- tensorflow::NodeDef trt_node_def;
-
- SubGraphParams s(params->graph, params->subgraph_node_ids,
- params->subgraph_inputs, params->subgraph_outputs,
- params->max_batch_size, params->max_workspace_size_bytes,
- params->graph_properties, params->output_edge_map,
- &trt_node_def, params->precision_mode, params->device_name_,
- params->allocator_, params->cuda_gpu_id_);
- TF_RETURN_IF_ERROR(ConvertSubGraphToTensorRTNodeDef(s));
- tensorflow::Status status;
- tensorflow::Node* trt_node = params->graph.AddNode(trt_node_def, &status);
-
- // AddNode does not wire edges.
- // Re-map incoming edges to use the new TRT node instead of the orig subgraph
- std::map<std::pair<int, int>, int> subgraph_edge_to_input_map;
- for (size_t i = 0; i < params->subgraph_inputs.size(); ++i) {
- subgraph_edge_to_input_map.insert({params->subgraph_inputs.at(i), i});
- }
- std::set<std::pair<int, int>> unique_tensors;
- for (const tensorflow::Edge* edge : params->subgraph_incoming_edges) {
- std::pair<int, int> old_src = {edge->src()->id(), edge->src_output()};
- if (unique_tensors.count(old_src)) continue;
- unique_tensors.insert(old_src);
- int new_src_output = subgraph_edge_to_input_map.at(old_src);
- params->graph.AddEdge(edge->src(), edge->src_output(), trt_node,
- new_src_output);
- VLOG(1) << "Wire " << edge->src()->name() << ":" << edge->src_output()
- << " -> " << trt_node->name() << ":" << new_src_output;
- params->graph.RemoveEdge(edge);
- }
- if (VLOG_IS_ON(2)) {
- VLOG(2) << "new edge count: " << trt_node->in_edges().size();
- for (const tensorflow::Edge* edge : trt_node->in_edges()) {
- VLOG(2) << edge->src()->name() << " port: " << edge->src_output();
- }
- }
- TF_RETURN_IF_ERROR(status);
-
- // Re-map outgoing edges to use the new TRT node instead of the orig subgraph
- std::map<std::pair<int, int>, int> subgraph_edge_to_output_map;
- for (size_t i = 0; i < params->subgraph_outputs.size(); ++i) {
- subgraph_edge_to_output_map.insert({params->subgraph_outputs.at(i), i});
- }
- TF_RETURN_IF_ERROR(status);
- for (const tensorflow::Edge* edge : params->subgraph_outgoing_edges) {
- std::pair<int, int> old_src = {edge->src()->id(), edge->src_output()};
- int new_src_output = subgraph_edge_to_output_map.at(old_src);
- TF_RETURN_IF_ERROR(params->graph.UpdateEdge(
- trt_node, new_src_output, edge->dst(), edge->dst_input()));
- VLOG(1) << "Wire " << trt_node->name() << ":" << new_src_output << " -> "
- << edge->dst()->name() << ":" << edge->dst_input();
- }
- // Remove the original subgraph
- for (int node_id : params->subgraph_node_ids) {
- tensorflow::Node* node = params->graph.FindNodeId(node_id);
- // Don't remove the input placeholders
- if (node->type_string() == "Placeholder") {
- continue;
- }
- params->graph.RemoveNode(node);
- }
- return tensorflow::Status::OK();
-}
-
tensorflow::Status BuildNodeMap(
const tensorflow::Graph& graph,
std::unordered_map<string, tensorflow::Node*>* node_map) {
@@ -318,51 +124,77 @@ tensorflow::Status BuildNodeMap(
}
} // namespace
+
+// Function to get calibration data from the ResourceMgr and put it into the
+// engine NodeDefs.
tensorflow::Status ConvertCalibGraphToInferGraph(
- const tensorflow::GraphDef& graph_def, tensorflow::GraphDef* infer_graph) {
+ const tensorflow::GraphDef& graph_def, tensorflow::GraphDef* infer_graph,
+ bool is_dyn_op) {
VLOG(0) << "Starting Calib Conversion";
- tensorflow::Graph graph(tensorflow::OpRegistry::Global());
- TF_RETURN_IF_ERROR(tensorflow::ConvertGraphDefToGraph(
- tensorflow::GraphConstructorOptions(), graph_def, &graph));
- // get calib nodes
- std::vector<tensorflow::Node*> calib_nodes;
- std::vector<tensorflow::Node*> topo_order;
- tensorflow::GetPostOrder(graph, &topo_order);
- for (auto rit = topo_order.rbegin(); rit != topo_order.rend(); ++rit) {
- auto node = *rit;
- if (node->type_string() == "TRTCalibOp") {
- VLOG(1) << "Found Calib Node " << node->name();
- calib_nodes.push_back(node);
- }
+ infer_graph->CopyFrom(graph_def);
+ auto trt_rm = TRTResourceManager::instance();
+ auto calib_rm = trt_rm->getManager("TRTCalibration");
+ int num_nodes = infer_graph->node_size();
+ if (!is_dyn_op) {
+ LOG(WARNING) << "Construction of static int8 engine is not implemented "
+ "yet!. Dynamic engine will be constructed";
}
- VLOG(0) << "Num Calib nodes in graph= " << calib_nodes.size();
- if (calib_nodes.size() == 0)
- return tensorflow::errors::FailedPrecondition(
- "Graph doesn't contain any calibration nodes!."
- " Please generate calibration graph and run calibration first");
- for (auto n : calib_nodes) {
- TF_RETURN_IF_ERROR(
- tensorrt::convert::ConvertCalibrationNodeToEngineNode(graph, n));
+ for (int i = 0; i < num_nodes; ++i) {
+ auto n = infer_graph->mutable_node(i);
+ if (n->op() == "TRTEngineOp") {
+ VLOG(1) << "Processing " << n->name();
+ string container_name = n->attr().at("segment_funcdef_name").s();
+ TRTCalibrationResource* cres = nullptr;
+ auto status = calib_rm->Lookup(container_name, "Calibrator", &cres);
+ if (!status.ok()) {
+ LOG(ERROR) << "Could not get Calibration information. Did you run with "
+ "calibration data?";
+ return tensorflow::errors::FailedPrecondition(
+ "Need to run graph with calibration data first!");
+ }
+ if (cres->calibrator_) {
+ cres->calibrator_->setDone();
+ cres->thr_->join();
+ const auto& calibration_table =
+ cres->calibrator_->getCalibrationTableAsString();
+ if (!calibration_table.size()) {
+ LOG(ERROR) << "Calibration table is empty";
+ return tensorflow::errors::Unknown(
+ "Calibration table is missing. This shouldn't have happened!");
+ }
+ n->mutable_attr()->at("calibration_data").set_s(calibration_table);
+ } else {
+ LOG(ERROR) << "Can't get TRTCalibrator from resource manager!";
+ return tensorflow::errors::Unknown(
+ "Can't get TRTCalibrator from resource manager!");
+ }
+ cres->Unref();
+ }
}
- graph.ToGraphDef(infer_graph);
return tensorflow::Status::OK();
}
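A minimal sketch of the calibration flow this function expects (the graph variables below are placeholders, not part of this patch): the INT8 calibration graph must first be executed on representative data in the same process, so that the calibrator resources can be looked up, and only then converted:

    tensorflow::GraphDef calib_graph_def;  // INT8 graph that has already been run on calibration data
    tensorflow::GraphDef infer_graph_def;
    TF_CHECK_OK(tensorflow::tensorrt::convert::ConvertCalibGraphToInferGraph(
        calib_graph_def, &infer_graph_def, /*is_dyn_op=*/true));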
+// Entry function from Python.
tensorflow::Status ConvertGraphDefToTensorRT(
const tensorflow::GraphDef& graph_def,
const std::vector<string>& output_names, size_t max_batch_size,
size_t max_workspace_size_bytes, tensorflow::GraphDef* new_graph_def,
- int precision_mode = FP32MODE, int minimum_segment_size = 3) {
+ int precision_mode, int minimum_segment_size, bool is_dyn_op,
+ int max_cached_engines, std::vector<int> cached_engine_batches) {
// optimization pass
tensorflow::grappler::GrapplerItem item;
item.fetch = output_names;
item.graph = graph_def;
-
+ // Grappler requires a virtual cluster with a proper GPU device in order to
+ // calculate flops > 0; otherwise it fails with a FATAL error. We add the
+ // numbers from a Pascal card here so that flops > 0.
tensorflow::DeviceProperties device_properties;
device_properties.set_type("GPU");
device_properties.mutable_environment()->insert({"architecture", "6"});
- tensorflow::grappler::Cluster* cluster =
- new tensorflow::grappler::VirtualCluster({{"/GPU:0", device_properties}});
+ device_properties.set_num_cores(3584);
+ device_properties.set_frequency(1531);
+ std::unique_ptr<tensorflow::grappler::Cluster> cluster(
+ new tensorflow::grappler::VirtualCluster(
+ {{"/GPU:0", device_properties}}));
// single machine
int num_cpu_cores = tensorflow::grappler::GetNumAvailableLogicalCPUCores();
@@ -370,134 +202,633 @@ tensorflow::Status ConvertGraphDefToTensorRT(
VLOG(2) << "cpu_cores: " << num_cpu_cores;
VLOG(2) << "gpus: " << num_gpus;
tensorflow::RewriterConfig rw_cfg;
+ // Use only constant folding and layout optimization for the time being,
+ // since the newer optimizers break the graph for us.
+ rw_cfg.add_optimizers("constfold");
+ rw_cfg.add_optimizers("layout");
+ rw_cfg.set_meta_optimizer_iterations(tensorflow::RewriterConfig::ONE);
tensorflow::grappler::MetaOptimizer meta_opt(nullptr, rw_cfg);
tensorflow::GraphDef gdef;
- TF_RETURN_IF_ERROR(meta_opt.Optimize(cluster, item, &gdef));
+ TF_RETURN_IF_ERROR(meta_opt.Optimize(cluster.get(), item, &gdef));
item.graph = gdef;
// AJ refactoring shape inference through grappler/GraphProperties.
tensorflow::grappler::GraphProperties static_graph_properties(item);
TF_RETURN_IF_ERROR(static_graph_properties.InferStatically(true));
// Build full graph
-
- return ConvertAfterShapes(gdef, output_names, max_batch_size,
- max_workspace_size_bytes, new_graph_def,
- precision_mode, minimum_segment_size,
- static_graph_properties, nullptr);
+ ConversionParams cp;
+ cp.input_graph_def = &gdef;
+ cp.output_names = &output_names;
+ cp.max_batch_size = max_batch_size;
+ cp.output_graph_def = new_graph_def;
+ cp.precision_mode = precision_mode;
+ cp.is_dyn_op = is_dyn_op;
+ cp.max_cached_engines = max_cached_engines;
+ cp.cached_engine_batches = cached_engine_batches;
+ cp.minimum_segment_size = minimum_segment_size;
+ cp.graph_properties = &static_graph_properties;
+ cp.max_workspace_size_bytes = max_workspace_size_bytes;
+ if (VLOG_IS_ON(5)) {
+ std::fstream f;
+ f.open("TRTConversionInput.pb",
+ std::fstream::out | std::fstream::binary | std::fstream::trunc);
+ f << gdef.SerializeAsString();
+ f.close();
+ }
+ return ConvertAfterShapes(cp);
}
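A minimal sketch of calling this entry point from C++; the output name and the size limits are placeholder values, and FP32MODE is the precision constant used elsewhere in this file:

    tensorflow::GraphDef frozen_graph_def;  // assume this holds a frozen inference graph
    tensorflow::GraphDef trt_graph_def;
    TF_CHECK_OK(tensorflow::tensorrt::convert::ConvertGraphDefToTensorRT(
        frozen_graph_def, /*output_names=*/{"logits"}, /*max_batch_size=*/8,
        /*max_workspace_size_bytes=*/1 << 30, &trt_graph_def,
        /*precision_mode=*/FP32MODE, /*minimum_segment_size=*/3,
        /*is_dyn_op=*/false, /*max_cached_engines=*/1,
        /*cached_engine_batches=*/{}));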
-tensorflow::Status ConvertAfterShapes(
- const tensorflow::GraphDef& gdef, const std::vector<string>& output_names,
- size_t max_batch_size, size_t max_workspace_size_bytes,
- tensorflow::GraphDef* new_graph_def, int precision_mode,
- int minimum_segment_size,
+// Function to collect information about a subsegment into an EngineInfo
+// structure.
+tensorflow::Status GetEngineInfo(
+ const tensorflow::Graph* g,
const tensorflow::grappler::GraphProperties& graph_properties,
- const tensorflow::grappler::Cluster* cluster) {
- // Segment the graph into subgraphs that can be converted to TensorRT
- tensorflow::tensorrt::segment::SegmentOptions segment_options;
+ const std::set<string>& segment_nodes,
+ const std::unordered_map<string, tensorflow::Node*>& node_map,
+ const std::vector<tensorflow::Node*>& reverse_topo_order,
+ EngineInfo* info) {
+ std::vector<int> subgraph_node_ids;
+ std::set<string> segment_devices;
+ int input_port = 0;
+ int output_port = 0;
+
+ // Map from "src_node_name:port" to the unique port numbers of the TRT op,
+ // where src_node_name is the name of the source node of the input/output
+ // edge. There must not be any duplicates, since source nodes of input/output
+ // edges are in a different split of the graph.
+ // TODO(aaroey): consider using node id and port instead.
+ std::unordered_map<string, int> created_edges;
+ for (auto it = reverse_topo_order.rbegin(); it != reverse_topo_order.rend();
+ ++it) {
+ const auto& node_name = (*it)->name();
+
+ if (segment_nodes.count(node_name) == 0) continue;
+ auto node = node_map.at(node_name);
+ auto node_device = node->requested_device();
+ if (!node_device.empty()) {
+ segment_devices.insert(node_device);
+ } else {
+ if (node->has_assigned_device_name()) {
+ segment_devices.insert(node->assigned_device_name());
+ } else {
+ VLOG(2) << "Node " << node->name()
+ << " neither have requested device nor assigned device";
+ }
+ }
+ int node_id = node->id();
+ subgraph_node_ids.push_back(node_id);
+ for (const auto edge : node->in_edges()) {
+ auto input_node = edge->src();
+ if (segment_nodes.count(input_node->name()) == 0) {
+ // Add constant input node into the segment. We don't care if it has
+ // other output edges going into other engines or TF nodes. Since we add
+ // it only to the subsegment node list, not the subsegment itself, it
+ // won't be removed from the graph. If it doesn't have any edges, TF
+ // will prune it out.
+ if (input_node->type_string() == "Const") {
+ subgraph_node_ids.push_back(input_node->id());
+ } else if (!edge->IsControlEdge() && !input_node->IsSource()) {
+ string s(input_node->name());
+ StrAppend(&s, ":", edge->src_output());
+ VLOG(1) << "Input edge = " << s;
+ int port = input_port;
+ if (created_edges.count(s)) {
+ port = created_edges.at(s);
+ } else {
+ created_edges.insert({s, port});
+ input_port++;
+ }
+ info->connections.emplace_back(input_node->name(), input_node->id(),
+ edge->src_output(), node_name, node_id,
+ edge->dst_input(), true, port);
+ }
+ }
+ }
+ for (const auto edge : node->out_edges()) {
+ auto output_node = edge->dst();
+ if (segment_nodes.count(output_node->name()) == 0 &&
+ !edge->IsControlEdge() && !output_node->IsSink()) {
+ string s(node_name);
+ StrAppend(&s, ":", edge->src_output());
+ VLOG(1) << "Output edge = " << s;
+ int port = output_port;
+ if (created_edges.count(s)) {
+ port = created_edges.at(s);
+ } else {
+ created_edges.insert({s, port});
+ output_port++;
+ }
+ info->connections.emplace_back(output_node->name(), output_node->id(),
+ edge->dst_input(), node_name, node_id,
+ edge->src_output(), false, port);
+ }
+ }
+ }
+
+ TF_RETURN_IF_ERROR(ConvertSegmentToGraphDef(
+ g, graph_properties, subgraph_node_ids, &info->connections,
+ &info->segment_graph_def, &info->engine_name));
+ // TODO(sami): This should not happen once segmenter is updated.
+ if (segment_devices.size() == 1) {
+ info->device = *segment_devices.begin();
+ } else if (segment_devices.size() > 1) {
+ LOG(WARNING) << "Detected multiple(" << segment_devices.size()
+ << ") devices for the segment. Picking first one to continue "
+ << "but this shouldn't have happened";
+ info->device = *segment_devices.begin();
+ } else {
+ VLOG(1) << "Segment devices size is 0";
+ }
+ return Status::OK();
+}
+
+// Function to insert a TRT node into the graph. The graph is not modified if
+// the returned status is not ok.
+// 'alloc' is only used for creating a static engine.
+tensorflow::Status CreateTRTNode(tensorflow::Graph* graph,
+ const std::vector<EngineInfo>& infos, int pos,
+ nvinfer1::IGpuAllocator* alloc,
+ int max_batch_size) {
+ const auto& info = infos.at(pos);
+ std::vector<tensorflow::TensorShapeProto> out_shapes;
+ std::vector<tensorflow::TensorShapeProto> input_shapes;
+ std::vector<tensorflow::PartialTensorShape> shapes;
+ std::vector<tensorflow::NodeDefBuilder::NodeOut> inputs;
+ std::vector<tensorflow::DataType> out_types;
+ VLOG(1) << "Processing " << info.engine_name;
+
+ // Update the shape and data types of input/output nodes, and find all unique
+ // inputs.
+ for (const auto& conn : info.connections) {
+ if (!conn.is_input_edge) {
+ // Set the shapes and data types of output edge.
+ tensorflow::TensorShapeProto out_shape;
+ // shape of the output node inside segment
+ conn.inside_shape.AsProto(&out_shape);
+ if (out_shapes.size() <= conn.port_number) {
+ out_shapes.resize(conn.port_number + 1);
+ out_types.resize(conn.port_number + 1);
+ }
+ out_shapes.at(conn.port_number) = out_shape;
+ out_types.at(conn.port_number) = conn.connection_type;
+ continue;
+ }
+
+ // Set the shapes and data types of input edge.
+ tensorflow::TensorShapeProto in_shape;
+ conn.outside_shape.AsProto(&in_shape);
+ if (input_shapes.size() <= conn.port_number) {
+ input_shapes.resize(conn.port_number + 1);
+ shapes.resize(conn.port_number + 1);
+ }
+ input_shapes.at(conn.port_number) = in_shape;
+ shapes.at(conn.port_number) = conn.outside_shape;
+
+ string input_node = conn.outside_node_name;
+ int input_port = conn.outside_port;
+ bool found_engine = false;
+ // Rewire the inputs to other engines if they contain the original input node.
+ // Note that we use the information of the engine here, not the information
+ // of the created TRT nodes, so we're able to find all the connections to
+ // any other engines beforehand.
+ for (size_t t = 0; t < infos.size(); ++t) {
+ if (t == pos) continue;
+ auto& engine_info = infos.at(t);
+ for (const auto& eng_conn : engine_info.connections) {
+ if (eng_conn.is_input_edge) continue;
+ if (eng_conn.inside_node_name == input_node) {
+ input_node = engine_info.engine_name;
+ if (eng_conn.inside_port == input_port) {
+ input_port = eng_conn.port_number;
+ found_engine = true;
+ break;
+ }
+ }
+ }
+ if (found_engine) break;
+ }
+ VLOG(1) << "Engine Input " << input_node << ":" << input_port << " -> "
+ << info.engine_name << ":" << inputs.size();
+ // Skip duplicate inputs.
+ bool new_input = true;
+ for (const auto& inp : inputs) {
+ if (inp.node == input_node && inp.index == input_port) {
+ new_input = false;
+ break;
+ }
+ }
+ if (new_input) {
+ inputs.emplace_back(input_node, input_port, conn.connection_type);
+ }
+ }
+
+ // Build the engine and get its serialized representation.
+ string segment_string;
+ if (info.engine_type == EngineInfo::EngineType::TRTStatic ||
+ info.precision_mode == INT8MODE) {
+ // Create a static engine for FP32/FP16 mode, and test the validity of the
+ // engine for INT8 mode. We don't want the engine to fail at calibration
+ // time, so we construct an FP32 engine here to check its validity; if it is
+ // a valid engine we put the serialized GraphDef into the op, otherwise we
+ // skip node creation for this engine.
+ Logger trt_logger;
+ TrtUniquePtrType<nvinfer1::ICudaEngine> engine;
+ // TODO(sami): What happens if 1st dim is not batch?
+ TF_RETURN_IF_ERROR(ConvertGraphDefToEngine(
+ info.segment_graph_def,
+ info.precision_mode == INT8MODE ? FP32MODE : info.precision_mode,
+ max_batch_size, info.max_workspace_size_bytes, shapes, &trt_logger,
+ alloc, /*calibrator=*/nullptr, &engine,
+ /*convert_successfully=*/nullptr));
+ TrtUniquePtrType<nvinfer1::IHostMemory> engine_data(engine->serialize());
+ segment_string =
+ string((const char*)engine_data->data(), engine_data->size());
+ if (info.precision_mode == INT8MODE) {
+ // See above comment about why not putting this inside the 'else' branch.
+ segment_string = info.segment_graph_def.SerializeAsString();
+ }
+ } else {
+ segment_string = info.segment_graph_def.SerializeAsString();
+ }
+
+ // TODO(aaroey): use enum instead, and add a helper method to do the
+ // conversion.
+ string prec_string;
+ switch (info.precision_mode) {
+ case FP32MODE:
+ prec_string = "FP32";
+ break;
+ case FP16MODE:
+ prec_string = "FP16";
+ break;
+ case INT8MODE:
+ prec_string = "INT8";
+ if (!TRTResourceManager::instance()->getManager("TRTCalibration")) {
+ LOG(ERROR) << "Failed to construct calibration storage";
+ }
+ break;
+ default:
+ return tensorflow::errors::OutOfRange("Unknown precision mode");
+ }
+ tensorflow::NodeDefBuilder node_builder(info.engine_name, "TRTEngineOp");
+ if (!info.device.empty()) node_builder.Device(info.device);
+ if (VLOG_IS_ON(1)) {
+ string ins = StrCat(info.engine_name, " inputs= ");
+ for (const auto& ii : inputs) {
+ StrAppend(&ins, ii.node, ":", ii.index, " ");
+ }
+ VLOG(1) << ins;
+ }
+ node_builder.Input(inputs);
+ if (info.engine_type == EngineInfo::EngineType::TRTStatic &&
+ info.cached_engine_batches.size()) {
+ LOG(WARNING) << "Cached engine batches are ignored for static engines";
+ }
+ tensorflow::NodeDef trt_node;
+ tensorflow::Status status =
+ node_builder.Attr("input_shapes", input_shapes)
+ .Attr("output_shapes", out_shapes)
+ .Attr("static_engine",
+ info.engine_type == EngineInfo::EngineType::TRTStatic)
+ .Attr("segment_funcdef_name",
+ StrCat(info.engine_name, "_native_segment"))
+ .Attr("serialized_segment", segment_string)
+ .Attr("calibration_data", "")
+ .Attr("max_cached_engines_count", info.maximum_cached_engines)
+ .Attr("cached_engine_batches", {max_batch_size})
+ .Attr("workspace_size_bytes", info.max_workspace_size_bytes)
+ .Attr("precision_mode", prec_string)
+ .Attr("OutT", out_types)
+ .Finalize(&trt_node);
+ if (!status.ok()) {
+ LOG(ERROR) << "Node construction failed with" << status;
+ return status;
+ }
+ VLOG(1) << "Adding TRTEngine " << info.engine_name << " to graph";
+
+ // Up until this point the graph is not modified. If we return !status.ok()
+ // from here on, this segment will be skipped.
+ tensorflow::Node* engine_node = graph->AddNode(trt_node, &status);
+ if (!status.ok()) {
+ LOG(ERROR) << "Adding node failed " << status;
+ return status;
+ }
+ // Update the inputs of the destination nodes of the output edges to point
+ // to the engine node.
+ for (auto& conn : info.connections) {
+ if (conn.is_input_edge) continue;
+ VLOG(1) << " Updating DBG " << engine_node->name() << " out_port "
+ << conn.port_number << " out_id " << conn.outside_id
+ << " name=" << conn.outside_node_name;
+ auto dst_node = graph->FindNodeId(conn.outside_id);
+ // dst_node can only be removed if it is an input node of another engine.
+ // In that case the other engine's input edge is updated in its NodeDef to
+ // point to this engine. Even though the edge doesn't exist in the graph,
+ // the correct edges will be constructed when it is deserialized again. This
+ // is a limitation of graph->AddNode().
+ if (!dst_node) continue;
+ VLOG(1) << "Updating " << engine_node->name() << ":" << conn.port_number
+ << " to " << dst_node->name() << ":" << conn.outside_port;
+ auto new_edge = graph->AddEdge(engine_node, conn.port_number, dst_node,
+ conn.outside_port);
+ CHECK(new_edge) << "Adding a new edge failed " << engine_node->name() << ":"
+ << conn.port_number << " -> " << dst_node->name() << ":"
+ << conn.outside_port;
+ }
+ return status;
+}
+
+// Function to construct a funcdef from the segment and add it to the graph.
+tensorflow::Status RegisterSegmentFunctionToFunctionLibrary(
+ tensorflow::Graph* graph, const tensorflow::GraphDef& segment,
+ const string& name) {
+ tensorflow::Graph sgraph(graph->flib_def());
+ tensorflow::GraphConstructorOptions gcopts;
+ TF_RETURN_IF_ERROR(
+ tensorflow::ConvertGraphDefToGraph(gcopts, segment, &sgraph));
+ std::map<string, tensorflow::Node*> io_nodes;
+ int num_inputs = 0;
+ for (auto n : sgraph.op_nodes()) {
+ if (tensorflow::str_util::StartsWith(n->name(), kInputPHName)) {
+ num_inputs++;
+ io_nodes.insert({n->name(), n});
+ } else if (tensorflow::str_util::StartsWith(n->name(), kOutputPHName)) {
+ io_nodes.insert({n->name(), n});
+ }
+ }
+
+ for (int i = 0; i < num_inputs; ++i) {
+ auto name = StrCat(kInputPHName, i);
+ auto node = io_nodes[name];
+ tensorflow::NodeDef nd;
+ tensorflow::NodeDefBuilder node_builder(
+ StrCat(name, "_Arg"), tensorflow::FunctionLibraryDefinition::kArgOp);
+ VLOG(1) << "Adding " << StrCat(name, "_Arg");
+ TF_RETURN_IF_ERROR(node_builder.Attr("T", node->output_type(0))
+ .Attr("index", i)
+ .Finalize(&nd));
+ tensorflow::Status s;
+ auto node_arg = sgraph.AddNode(nd, &s);
+ if (!s.ok()) {
+ LOG(ERROR) << "Couldn't add _Arg node for " << name;
+ }
+ for (auto edge : node->out_edges()) {
+ sgraph.AddEdge(node_arg, 0, edge->dst(), edge->dst_input());
+ VLOG(1) << "Updating funcdef input " << node_arg->name() << ":" << 0
+ << " - > " << edge->dst()->name() << ":" << edge->dst_input();
+ if (!s.ok()) {
+ LOG(ERROR) << "Failed to update edge from " << node_arg->name()
+ << " to " << edge->dst()->name() << ":" << edge->dst_input();
+ }
+ }
+ sgraph.RemoveNode(node);
+ }
+
+ for (int i = 0; i < io_nodes.size() - num_inputs; ++i) {
+ auto name = StrCat(kOutputPHName, i);
+ auto node = io_nodes[name];
+ tensorflow::NodeDef nd;
+ tensorflow::NodeDefBuilder node_builder(
+ StrCat(name, "_Ret"), tensorflow::FunctionLibraryDefinition::kRetOp);
+ auto edge = *(node->in_edges().begin());
+ tensorflow::NodeDefBuilder::NodeOut nout(
+ edge->src()->name(), edge->src_output(),
+ edge->src()->output_type(edge->src_output()));
+ VLOG(1) << " input " << nout.node << ":" << nout.index
+ << " dtype=" << tensorflow::DataTypeString(nout.data_type);
+ node_builder.Input({nout});
+ TF_RETURN_IF_ERROR(node_builder.Attr("T", node->output_type(0))
+ .Attr("index", i)
+ .Finalize(&nd));
+ if (VLOG_IS_ON(3)) {
+ VLOG(3) << nd.DebugString();
+ }
+ tensorflow::Status s;
+ auto node_ret = sgraph.AddNode(nd, &s);
+ if (!s.ok()) {
+ LOG(ERROR) << "Couldn't add _Ret node for " << name;
+ }
+ VLOG(1) << "Update edge from " << edge->src()->name() << ":"
+ << edge->src_output() << " - > " << node_ret->name() << ":" << 0;
+ sgraph.AddEdge(edge->src(), edge->src_output(), node_ret, 0);
+ s = sgraph.UpdateEdge(edge->src(), edge->src_output(), node_ret, 0);
+ if (!s.ok()) {
+ LOG(ERROR) << "Failed to update edge from " << edge->src()->name() << ":"
+ << edge->src_output() << " - > " << node_ret->name() << ":"
+ << 0;
+ }
+ sgraph.RemoveNode(node);
+ }
+ tensorflow::FunctionDefLibrary fdeflib;
+ auto native_segment = fdeflib.add_function();
+ TF_RETURN_IF_ERROR(tensorflow::GraphToFunctionDef(
+ sgraph, StrCat(name, "_native_segment"), native_segment));
+ if (VLOG_IS_ON(7)) {
+ VLOG(7) << name << " Function_Def ";
+ VLOG(7) << native_segment->DebugString();
+ }
+ VLOG(1) << "Adding funcdef to graphlib";
+ TF_RETURN_IF_ERROR(graph->AddFunctionLibrary(fdeflib));
+ return tensorflow::Status::OK();
+}
+
+std::pair<int, tensorflow::Allocator*> GetDeviceAndAllocator(
+ ConversionParams& params, EngineInfo& engine) {
+ int cuda_device_id = -1;
+ auto check_device_id = [](int tfid) -> int {
+ tensorflow::TfGpuId tf_gpu_id(tfid);
+ CudaGpuId cuda_gpu_id;
+ Status s = GpuIdManager::TfToCudaGpuId(tf_gpu_id, &cuda_gpu_id);
+ if (s.ok()) {
+ VLOG(1) << "Found TF GPU " << tf_gpu_id.value() << " at cuda device "
+ << cuda_gpu_id.value();
+ return cuda_gpu_id.value();
+ }
+ VLOG(2) << "TF GPU with id " << tfid << " do not exist " << s;
+ return -1;
+ };
+ tensorflow::Allocator* dev_allocator = nullptr;
+ // We need to use the ProcessState singleton here since, in the Python
+ // path, there is no way to get to the allocators.
+ // TODO(sami): when grappler devices become available, the else path will not
+ // be necessary.
+ auto pm = tensorflow::ProcessState::singleton();
+ if (params.cluster) { // get allocator
+ tensorflow::Device* device = nullptr;
+ if (params.cluster->GetDeviceSet()) {
+ device = params.cluster->GetDeviceSet()->FindDeviceByName(engine.device);
+ }
+ if (device) {
+ tensorflow::AllocatorAttributes alloc_attr;
+ dev_allocator = device->GetAllocator(alloc_attr);
+ VLOG(1) << "Using allocator " << dev_allocator->Name();
+ } else {
+ LOG(WARNING) << "Cluster is set but device '" << engine.device
+ << "' is not found in the cluster";
+ }
+ } else { // cluster not found, possibly a python call
+ VLOG(1) << "Cluster is not set, probably called from python";
+ int found_device = 0;
+ bool try_gpu_ids = true;
+ // If the device is set, try to find it. This might be a problem for the
+ // multi-host case, but TensorRT does not support multi-host setups yet.
+ if (!engine.device.empty()) {
+ DeviceNameUtils::ParsedName parsed_name;
+ if (DeviceNameUtils::ParseFullName(engine.device, &parsed_name)) {
+ cuda_device_id = parsed_name.has_id ? parsed_name.id : -1;
+ }
+ try_gpu_ids = !parsed_name.has_id;
+ }
+ if (try_gpu_ids) {
+ while (found_device < 100) {
+ cuda_device_id = check_device_id(found_device);
+ if (cuda_device_id >= 0) break;
+ found_device++;
+ }
+ }
+ if (found_device == 100) {
+ LOG(ERROR) << " Can't find a GPU device to work with. Please "
+ "instantiate a session to initialize devices";
+ return std::make_pair(cuda_device_id, dev_allocator);
+ }
+ LOG(WARNING)
+ << "Can't determine the device, constructing an allocator at device "
+ << found_device;
+ tensorflow::GPUOptions gpuoptions;
+ // this will be a noop if device is already initialized
+ gpuoptions.set_allow_growth(true);
+ tensorflow::TfGpuId tf_gpu_id(found_device);
+ dev_allocator = pm->GetGPUAllocator(gpuoptions, tf_gpu_id, 1);
+ }
+ return std::make_pair(cuda_device_id, dev_allocator);
+}
+
+// Entry function from optimization pass.
+tensorflow::Status ConvertAfterShapes(ConversionParams& params) {
+ // Convert graphdef to graph.
tensorflow::FunctionLibraryDefinition flib(tensorflow::OpRegistry::Global(),
- gdef.library());
+ params.input_graph_def->library());
tensorflow::Graph graph(flib);
TF_RETURN_IF_ERROR(tensorflow::ConvertGraphDefToGraph(
- tensorflow::GraphConstructorOptions(), gdef, &graph));
+ tensorflow::GraphConstructorOptions(), *params.input_graph_def, &graph));
+ // Segment the graph into subgraphs that can be converted to TensorRT
+ tensorflow::tensorrt::segment::SegmentOptions segment_options;
// TODO(ben,jie,sami): exclude output nodes (DISCUSS IT)
- for (auto node : output_names) {
+ for (auto node : *(params.output_names)) {
segment_options.exclude_node_list.insert(node);
}
-
- // TODO(sami): this should be passed as a knob!!!!
- segment_options.minimum_segment_size = minimum_segment_size;
- tensorflow::tensorrt::segment::SegmentNodesVector segments;
+ segment_options.minimum_segment_size = params.minimum_segment_size;
+ tensorflow::tensorrt::segment::SegmentNodesVector initial_segments;
TF_RETURN_IF_ERROR(tensorrt::segment::SegmentGraph(
- &graph, IsTensorRTCandidate, segment_options, &segments));
- if (segments.size() > 1) {
- VLOG(0) << "MULTIPLE tensorrt candidate conversion: " << segments.size();
+ &graph, IsTensorRTCandidate, segment_options, &initial_segments));
+ if (initial_segments.size() > 1) {
+ VLOG(0) << "MULTIPLE tensorrt candidate conversion: "
+ << initial_segments.size();
}
+
+ // Get the EngineInfo for each segment.
std::unordered_map<string, tensorflow::Node*> node_map;
TF_RETURN_IF_ERROR(BuildNodeMap(graph, &node_map));
- std::unordered_map<string, std::pair<int, string>> output_edge_map;
- int count = 0;
float total_num_nodes_in_segments = 0.;
- for (auto s : segments) {
- total_num_nodes_in_segments += s.first.size();
- }
- // We create the map here since cluster may not be available in all cases.
- std::map<string, tensorflow::Device*> name_to_device_map;
- if (cluster) {
- // TODO(aaroey): consider using DeviceSet::FindDeviceByName(), as in a
- // distributed environment, devices from different workers can have same
- // short name.
- for (const auto dm : cluster->GetDeviceSet()->devices()) {
- name_to_device_map[dm->name()] = dm;
+ std::vector<EngineInfo> engine_segments;
+ engine_segments.reserve(initial_segments.size());
+ std::vector<tensorflow::Node*> reverse_topo_order;
+ tensorflow::GetPostOrder(graph, &reverse_topo_order);
+ size_t total_engine_bytes_size = 0;
+ std::vector<size_t> engine_bytes_size;
+ tensorflow::tensorrt::segment::SegmentNodesVector converted_segments;
+ converted_segments.reserve(initial_segments.size());
+ for (size_t t = 0; t < initial_segments.size(); t++) {
+ auto& curr_segment = initial_segments.at(t);
+ EngineInfo curr_engine;
+ Status status =
+ GetEngineInfo(&graph, *params.graph_properties, curr_segment.first,
+ node_map, reverse_topo_order, &curr_engine);
+ if (!status.ok()) {
+ LOG(WARNING) << "Failed to get engine info for segment " << t << ": "
+ << status;
+ continue;
}
- }
- for (const auto& segment_nodes_and_device : segments) {
- const std::set<string>& subgraph_node_names =
- segment_nodes_and_device.first;
- std::set<int> subgraph_node_ids;
- size_t max_mem_per_engine =
- max_workspace_size_bytes *
- ((float)subgraph_node_names.size() / total_num_nodes_in_segments);
- std::stringstream oss;
- for (const string& node_name : subgraph_node_names) {
- oss << " " << node_name;
- subgraph_node_ids.insert(node_map.at(node_name)->id());
+ curr_engine.precision_mode = params.precision_mode;
+ curr_engine.engine_type =
+ (params.is_dyn_op || params.precision_mode == INT8MODE
+ ? EngineInfo::EngineType::TRTDynamic
+ : EngineInfo::EngineType::TRTStatic);
+ curr_engine.cached_engine_batches = params.cached_engine_batches;
+ curr_engine.maximum_cached_engines = params.max_cached_engines;
+ StrAppend(&curr_engine.engine_name, "my_trt_op_", t);
+ status = RegisterSegmentFunctionToFunctionLibrary(
+ &graph, curr_engine.segment_graph_def, curr_engine.engine_name);
+ if (!status.ok()) {
+ LOG(WARNING) << "Failed to register segment graphdef as a function " << t
+ << ": " << status;
+ continue;
}
- VLOG(1) << "Subgraph nodes at device " << segment_nodes_and_device.second
- << " : " << oss.str();
- auto target_device =
- name_to_device_map.find(segment_nodes_and_device.second);
- std::shared_ptr<nvinfer1::IGpuAllocator> allocator(0);
+ engine_bytes_size.push_back(curr_engine.segment_graph_def.ByteSizeLong());
+ total_engine_bytes_size += engine_bytes_size.back();
+ total_num_nodes_in_segments += curr_segment.first.size();
+ engine_segments.push_back(std::move(curr_engine));
+ converted_segments.push_back(std::move(curr_segment));
+
+ if (VLOG_IS_ON(8)) {
+ string fname = curr_engine.engine_name;
+ StrAppend(&fname, ".pb");
+ std::fstream f;
+ f.open(fname.c_str(), std::fstream::out | std::fstream::binary);
+ f << engine_segments.at(t).segment_graph_def.SerializeAsString();
+ f.close();
+ }
+ }
+
+ // Create a TRT node for each segment using its EngineInfo.
+ int old_cuda_device = 0;
+ auto err = cudaGetDevice(&old_cuda_device);
+ if (err != cudaSuccess) {
+ LOG(ERROR) << "Couldn't get current device: " << cudaGetErrorString(err);
+ }
+ VLOG(1) << "Current cuda device is " << old_cuda_device;
+ for (int i = 0; i < engine_segments.size(); ++i) {
+ auto& engine = engine_segments.at(i);
+ // Partition the workspace size by the average of the segment's GraphDef
+ // byte-size ratio and its node-count ratio.
+ engine.max_workspace_size_bytes =
+ params.max_workspace_size_bytes *
+ (engine_bytes_size.at(i) / total_engine_bytes_size +
+ converted_segments.at(i).first.size() / total_num_nodes_in_segments) /
+ 2.0;
+ // The allocator is used to build the engine. The builder and the built
+ // engine will be destroyed after we get the serialized engine string, so
+ // it's fine to use a unique_ptr here.
+ std::unique_ptr<nvinfer1::IGpuAllocator> alloc;
+ auto device_alloc = GetDeviceAndAllocator(params, engine);
int cuda_device_id = 0;
- if (target_device != name_to_device_map.end()) {
- tensorflow::TfGpuId tf_gpu_id(target_device->second->parsed_name().id);
- CudaGpuId cuda_gpu_id;
- Status s = GpuIdManager::TfToCudaGpuId(tf_gpu_id, &cuda_gpu_id);
- if (!s.ok()) {
- LOG(ERROR)
- << "Cuda device identification failed, using device 0. Error= "
- << s;
- } else {
- cuda_device_id = cuda_gpu_id.value();
- }
- tensorflow::GPUOptions gpuoptions;
- // we need to us PM here since in python path there is no way to get to
- // allocators
- auto pm = tensorflow::ProcessState::singleton();
- // this should be instantiated by now
- auto dev_allocator = pm->GetGPUAllocator(gpuoptions, tf_gpu_id, 1);
- VLOG(1) << "Got an allocator for device tf_device=" << tf_gpu_id.value()
- << " cuda device= " << cuda_device_id << " at " << dev_allocator;
- allocator = std::make_shared<TRTDeviceAllocator>(dev_allocator);
- } else { // device unknown or not available
- allocator = std::make_shared<TRTCudaAllocator>();
+ if (device_alloc.first >= 0) {
+ cuda_device_id = device_alloc.first;
+ alloc.reset(new TRTDeviceAllocator(device_alloc.second));
+ } else {
+ // Leaving the allocator as nullptr should revert to cudaMalloc.
+ LOG(WARNING) << "Can't identify the cuda device. Running on device 0";
}
- ConvertGraphParams p(graph, output_names, subgraph_node_ids, max_batch_size,
- max_mem_per_engine, graph_properties, &output_edge_map,
- precision_mode, segment_nodes_and_device.second,
- allocator, cuda_device_id);
- if (precision_mode == INT8MODE) {
- tensorflow::Status status = GetCalibNode(&p);
- if (status != tensorflow::Status::OK()) {
- LOG(WARNING) << "subgraph conversion error for subgraph_index:" << count
- << " due to: \"" << status.ToString()
- << "\" SKIPPING......( " << subgraph_node_names.size()
- << " nodes)";
+ cudaSetDevice(cuda_device_id);
+ auto status = CreateTRTNode(&graph, engine_segments, i, alloc.get(),
+ params.max_batch_size);
+ // If status is ok, we successfully added the node to the graph and can
+ // remove segment ops. Otherwise graph is not modified.
+ if (status.ok()) {
+ for (auto node_name : converted_segments.at(i).first) {
+ graph.RemoveNode(node_map.at(node_name));
}
} else {
- tensorflow::Status status = ConvertSubGraphToTensorRT(&p);
- if (status != tensorflow::Status::OK()) {
- LOG(WARNING) << "subgraph conversion error for subgraph_index:" << count
- << " due to: \"" << status.ToString()
- << "\" SKIPPING......( " << subgraph_node_names.size()
- << " nodes)";
- }
+ // Graph is not modified.
+ LOG(WARNING) << "Engine creation for segment " << i << ", composed of "
+ << converted_segments.at(i).first.size()
+ << " nodes failed: " << status << ". Skipping...";
}
- count++;
}
- graph.ToGraphDef(new_graph_def);
+ cudaSetDevice(old_cuda_device);
+ graph.ToGraphDef(params.output_graph_def);
+ VLOG(1) << "Returning from conversion";
return tensorflow::Status::OK();
}
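To illustrate the per-engine workspace split above: with a max_workspace_size_bytes of 1 GiB and two engines, where the first accounts for 3/4 of the total serialized segment bytes and 1/2 of the segmented nodes, the first engine receives (0.75 + 0.5) / 2 = 0.625 of the workspace, i.e. 640 MiB.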
diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph.h b/tensorflow/contrib/tensorrt/convert/convert_graph.h
index 65a67d7e73..9d986e4890 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_graph.h
+++ b/tensorflow/contrib/tensorrt/convert/convert_graph.h
@@ -30,29 +30,60 @@ namespace tensorflow {
namespace tensorrt {
namespace convert {
-// This method converts an already generated calibration graph which was used in
-// calibration runs to an inference graph
+struct ConversionParams {
+ ConversionParams()
+ : input_graph_def(nullptr),
+ max_batch_size(1),
+ max_workspace_size_bytes(1 << 30),
+ output_graph_def(nullptr),
+ precision_mode(1),
+ minimum_segment_size(3),
+ graph_properties(nullptr),
+ cluster(nullptr),
+ is_dyn_op(false),
+ fixed_input_size(true),
+ max_cached_engines(1) {}
+ const tensorflow::GraphDef* input_graph_def;
+ const std::vector<string>* output_names;
+ size_t max_batch_size;
+ size_t max_workspace_size_bytes;
+ tensorflow::GraphDef* output_graph_def;
+ int precision_mode;
+ int minimum_segment_size;
+ const tensorflow::grappler::GraphProperties* graph_properties;
+ const tensorflow::grappler::Cluster* cluster;
+ bool is_dyn_op; // Whether to create the engine at conversion or execution time
+ bool fixed_input_size; // Assume non-batch ranks of input tensors are fixed
+ int max_cached_engines; // Maximum number of cached engines
+ std::vector<int> cached_engine_batches; // Batch sizes to build cached engines for
+};
+
+// This method extracts calibration information from the resource managers
+// and puts it into the engine NodeDefs.
tensorflow::Status ConvertCalibGraphToInferGraph(
- const tensorflow::GraphDef& graph_def, tensorflow::GraphDef* new_graph_def);
+ const tensorflow::GraphDef& graph_def, tensorflow::GraphDef* new_graph_def,
+ bool is_dyn_op);
-// max_batch_size: maximum batch size which can be used for inference for
-// optimization targets inference run with max batch size.
-// max_workspace_size_bytes: The upper bound of memory allowance for
-// engine building.
+// - max_batch_size: maximum batch size which can be used for inference; the
+//   optimization targets an inference run with this batch size.
+// - max_workspace_size_bytes: the upper bound of memory allowance for engine
+//   building.
tensorflow::Status ConvertGraphDefToTensorRT(
const tensorflow::GraphDef& graph_def,
const std::vector<string>& output_names, size_t max_batch_size,
size_t max_workspace_size_bytes, tensorflow::GraphDef* new_graph_def,
- int precision_mode, int minimum_segment_size);
+ int precision_mode = 1, int minimum_segment_size = 3,
+ bool is_dyn_op = false, int max_cached_engines = 1,
+ std::vector<int> cached_engine_batches = {});
// Method to call from optimization pass
-tensorflow::Status ConvertAfterShapes(
- const tensorflow::GraphDef& graph, const std::vector<string>& output_names,
- size_t max_batch_size, size_t max_workspace_size_bytes,
- tensorflow::GraphDef* new_graph_def, int precision_mode,
- int minimum_segment_size,
- const tensorflow::grappler::GraphProperties& graph_properties,
- const tensorflow::grappler::Cluster* cluster);
+tensorflow::Status ConvertAfterShapes(ConversionParams& params);
+
+// Return compile-time TensorRT library version information.
+std::vector<int> GetLinkedTensorRTVersion();
+
+// Return run-time TensorRT library version information.
+std::vector<int> GetLoadedTensorRTVersion();
} // namespace convert
} // namespace tensorrt
} // namespace tensorflow
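A minimal sketch of driving the conversion through ConversionParams directly, as the optimization pass does; the variable names are placeholders and FP32MODE is assumed to come from convert_nodes.h:

    tensorflow::tensorrt::convert::ConversionParams cp;
    cp.input_graph_def = &optimized_graph_def;      // GraphDef after grappler, with shapes inferred
    cp.output_names = &output_names;                // e.g. {"logits"}
    cp.output_graph_def = &trt_graph_def;
    cp.graph_properties = &static_graph_properties; // grappler::GraphProperties of the graph
    cp.max_batch_size = 8;
    cp.precision_mode = FP32MODE;
    cp.is_dyn_op = true;                            // build engines at execution time
    TF_CHECK_OK(tensorflow::tensorrt::convert::ConvertAfterShapes(cp));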
diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
index 4e4d295538..146b9c7344 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
@@ -14,7 +14,6 @@ limitations under the License.
==============================================================================*/
#include "tensorflow/contrib/tensorrt/convert/convert_nodes.h"
-#include "tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h"
#include <algorithm>
#include <list>
@@ -25,7 +24,9 @@ limitations under the License.
#include <utility>
#include <vector>
+#include "tensorflow/contrib/tensorrt/convert/utils.h"
#include "tensorflow/contrib/tensorrt/log/trt_logger.h"
+#include "tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h"
#include "tensorflow/contrib/tensorrt/resources/trt_resource_manager.h"
#include "tensorflow/contrib/tensorrt/resources/trt_resources.h"
#include "tensorflow/core/framework/node_def.pb.h" // NOLINT
@@ -37,6 +38,7 @@ limitations under the License.
#include "tensorflow/core/graph/graph_constructor.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/strings/numbers.h"
#include "tensorflow/core/lib/strings/str_util.h"
#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/platform/logging.h"
@@ -54,8 +56,11 @@ limitations under the License.
namespace tensorflow {
namespace tensorrt {
namespace convert {
+using ::tensorflow::str_util::Split;
+
using ::tensorflow::strings::StrAppend;
using ::tensorflow::strings::StrCat;
+
namespace {
inline tensorflow::Status ConvertDType(tensorflow::DataType tf_dtype,
@@ -121,12 +126,10 @@ static std::vector<std::pair<int, int>> CreateSamePadding(
string GetCommonNameScope(const string& op_name_a, const string& op_name_b) {
size_t last_scope_separator = 0;
- for (size_t i = 0; i < std::min(op_name_a.size(), op_name_b.size()); ++i) {
- if (op_name_a[i] != op_name_b[i]) {
- break;
- } else if (op_name_a[i] == '/') {
- last_scope_separator = i + 1;
- }
+ const size_t min_size = std::min(op_name_a.size(), op_name_b.size());
+ for (size_t i = 0; i < min_size; ++i) {
+ if (op_name_a[i] != op_name_b[i]) break;
+ if (op_name_a[i] == '/') last_scope_separator = i + 1;
}
return op_name_a.substr(0, last_scope_separator);
}
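For example, under this logic the common scope of two op names is the prefix up to and including the last shared '/':

    // GetCommonNameScope("conv1/weights", "conv1/biases") returns "conv1/"
    // GetCommonNameScope("a/b/c", "x/y/z") returns ""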
@@ -417,20 +420,6 @@ void ReorderRSCKToKCRS(const TRT_ShapedWeights& iweights,
}
}
-struct InferDeleter {
- template <typename T>
- void operator()(T* obj) const {
- if (obj) {
- obj->destroy();
- }
- }
-};
-
-template <typename T>
-inline std::shared_ptr<T> infer_object(T* obj) {
- return std::shared_ptr<T>(obj, InferDeleter());
-}
-
class Converter;
using OpConverter =
@@ -444,7 +433,7 @@ class Converter {
OpConverter plugin_converter_;
nvinfer1::INetworkDefinition* trt_network_;
std::list<std::vector<uint8_t>> temp_bufs_;
- tensorflow::tensorrt::TRTWeightStore* weight_store_;
+ TRTWeightStore* weight_store_;
bool fp16_;
void register_op_converters();
tensorflow::Status get_inputs(const tensorflow::NodeDef& node_def,
@@ -486,11 +475,11 @@ class Converter {
public:
explicit Converter(nvinfer1::INetworkDefinition* trt_network,
- tensorflow::tensorrt::TRTWeightStore* ws, bool fp16)
+ TRTWeightStore* ws, bool fp16)
: trt_network_(trt_network), weight_store_(ws), fp16_(fp16) {
this->register_op_converters();
}
- tensorflow::tensorrt::TRTWeightStore* weight_store() { return weight_store_; }
+ TRTWeightStore* weight_store() { return weight_store_; }
TRT_ShapedWeights get_temp_weights(tensorflow::DataType type,
nvinfer1::Dims shape) {
TRT_ShapedWeights weights(type, nullptr, shape);
@@ -2140,559 +2129,265 @@ void Converter::register_op_converters() {
} // namespace
-tensorflow::Status ConvertCalibrationNodeToEngineNode(
- tensorflow::Graph& graph, tensorflow::Node* c_node) {
- const auto ndef = c_node->def();
-
- TFAttrs attrs(ndef);
- std::vector<string> segment_nodes(
- attrs.get<std::vector<string>>("segment_nodes"));
- std::vector<string> output_nodes(
- attrs.get<std::vector<string>>("segment_output_names"));
- std::vector<string> input_names(
- attrs.get<std::vector<string>>("input_names"));
- string res_name = attrs.get<string>("resource_name");
- VLOG(1) << "Node name " << c_node->name() << " res_name " << res_name;
- string engine_name = "my_trt_op";
- {
- const auto node_id = tensorflow::str_util::Split(res_name, "_");
- engine_name += node_id.back();
- }
- std::map<string, tensorflow::Node*> node_maps;
-
- for (auto n : graph.op_nodes()) {
- node_maps.insert({n->name(), n});
- }
- std::set<int> subgraph_ids;
- for (const auto internal_node : segment_nodes) {
- subgraph_ids.insert(node_maps.at(internal_node)->id());
- }
- if (VLOG_IS_ON(2)) {
- string node_names = StrCat(c_node->name(), " segment nodes= ");
-
- for (const auto& node_name : segment_nodes) {
- StrAppend(&node_names, node_name, ", ");
- }
- VLOG(2) << node_names;
+tensorflow::Status ConvertGraphDefToEngine(
+ const tensorflow::GraphDef& gdef, int precision_mode, int max_batch_size,
+ size_t max_workspace_size_bytes,
+ const std::vector<tensorflow::PartialTensorShape>& input_shapes,
+ Logger* logger, nvinfer1::IGpuAllocator* allocator,
+ TRTInt8Calibrator* calibrator,
+ TrtUniquePtrType<nvinfer1::ICudaEngine>* engine,
+ bool* convert_successfully) {
+ engine->reset();
+ if (convert_successfully) *convert_successfully = false;
+
+ // Create the builder.
+ TrtUniquePtrType<nvinfer1::IBuilder> builder(
+ nvinfer1::createInferBuilder(*logger));
+ builder->setMaxBatchSize(max_batch_size);
+ // TODO(aaroey): use the allocator to allocate the TRT workspace.
+ builder->setMaxWorkspaceSize(max_workspace_size_bytes);
+#if NV_TENSORRT_MAJOR > 3
+ builder->setGpuAllocator(allocator);
+#endif
+ if (precision_mode == FP16MODE) {
+ builder->setHalf2Mode(true);
+ } else if (precision_mode == INT8MODE) {
+ builder->setInt8Mode(true);
+ builder->setInt8Calibrator(calibrator);
}
- VLOG(1) << "Output Nodes:";
- std::vector<tensorflow::DataType> out_types;
- std::vector<const tensorflow::Edge*> out_edges;
+ // Create the network.
+ auto trt_network =
+ TrtUniquePtrType<nvinfer1::INetworkDefinition>(builder->createNetwork());
+ if (!trt_network) {
+ return tensorflow::errors::Internal(
+ "Failed to create TensorRT network object");
+ }
+ auto ws = std::unique_ptr<TRTWeightStore>(new TRTWeightStore());
- for (auto& i : output_nodes) {
- auto node_port = tensorflow::str_util::Split(i, ":");
- VLOG(1) << " " << i << " in graph " << node_maps.count(i);
- auto out_node_name = node_port.at(0);
- if (node_port.size() > 1) {
- VLOG(1) << "Multi port output" << node_port.at(0) << " "
- << node_port.at(1) << " size=" << node_port.size();
- }
- auto node_it = node_maps.find(out_node_name);
- if (node_it != node_maps.end()) {
- tensorflow::Node* out_node = node_it->second;
- int port = 0;
- if (node_port.size() == 2) {
- port = std::strtoul(node_port.at(1).c_str(), nullptr, 10);
- out_types.push_back(out_node->output_type(port));
- } else {
- out_types.push_back(out_node->output_type(0));
+ // Build the network
+ VLOG(1) << "Starting engine conversion ";
+ Converter converter(trt_network.get(), ws.get(), precision_mode == FP16MODE);
+ std::vector<std::pair<string, string>> output_tensors;
+ // Graph nodes are already topologically sorted during construction
+ for (const auto& node_def : gdef.node()) {
+ string node_name = node_def.name();
+ VLOG(1) << "Converting op name=" << node_name << ", op=" << node_def.op();
+ if (tensorflow::str_util::StartsWith(node_name, kInputPHName) &&
+ (node_def.op() == "Placeholder")) {
+ nvinfer1::DimsCHW input_dim_pseudo_chw;
+ for (int i = 0; i < 8; i++) input_dim_pseudo_chw.d[i] = 0;
+ nvinfer1::DataType dtype(nvinfer1::DataType::kFLOAT);
+ auto type_status =
+ ConvertDType(node_def.attr().at("dtype").type(), &dtype);
+ if (type_status != tensorflow::Status::OK()) {
+ LOG(WARNING) << "Type conversion failed for " << node_name;
+ return type_status;
}
- for (auto out_edge : out_node->out_edges()) {
- if (subgraph_ids.count(out_edge->dst()->id()))
- continue; // skip internal edges;
- if (out_edge->src_output() == port) {
- out_edges.push_back(out_edge);
- VLOG(1) << "OUTPUT EDGE " << out_edge->src()->name() << ":"
- << out_edge->src_output() << " -> " << out_edge->dst()->name()
- << ":" << out_edge->dst_input();
+ int32 slot_number = -1;
+ if (!tensorflow::strings::safe_strto32(node_name.c_str() + 8,
+ &slot_number)) {
+ LOG(ERROR) << "Failed to parse slot number from " << node_name
+ << " +8= " << node_name.c_str() + 8;
+ }
+ auto shape = input_shapes.at(slot_number);
+ if (shape.dims() > 8) {
+ LOG(ERROR) << "Tensor rank is greater than 8 for " << node_name
+ << " at input slot " << slot_number;
+ return tensorflow::errors::OutOfRange(
+ "Input tensor rank is greater than 8");
+ }
+ if (VLOG_IS_ON(1)) {
+ string dim_str("dims=");
+ StrAppend(&dim_str, "[ ", shape.dim_size(0));
+ for (int i = 1; i < shape.dims(); i++) {
+ StrAppend(&dim_str, ", ", shape.dim_size(i));
}
+ StrAppend(&dim_str, " ]");
+ VLOG(1) << dim_str;
+ }
+ for (int i = 1; i < shape.dims(); i++) {
+ input_dim_pseudo_chw.d[i - 1] = shape.dim_size(i);
}
- } else {
- LOG(WARNING) << " couldn't find output node " << out_node_name;
- }
- }
- if (VLOG_IS_ON(1)) {
- VLOG(1) << c_node->name() << " Input Nodes:";
- for (auto& i : input_names) {
- VLOG(1) << " Input " << i << " in graph " << node_maps.count(i);
- }
- }
- auto trt_rm = tensorflow::tensorrt::TRTResourceManager::instance();
- auto resmgr = trt_rm->getManager("TRTCalibOps");
- tensorflow::tensorrt::TRTCalibrationResource* calib_res = nullptr;
- auto status = resmgr->Lookup(res_name, res_name, &calib_res);
- if (!status.ok() || !calib_res->calibrator_) {
- return tensorflow::errors::FailedPrecondition(
- "You must run calibration"
- " and inference conversion in the same process");
- }
-
- calib_res->calibrator_->setDone();
- calib_res->thr_->join();
- delete calib_res->thr_;
- if (!calib_res->engine_) {
- LOG(ERROR) << "Calibration failed!, engine does not exist. Did you run "
- "calibration graph?";
- return tensorflow::errors::FailedPrecondition(
- "Calibration graph needs to be executed on"
- " calibration data before convertsion to inference graph");
- }
- auto weight_rmgr = trt_rm->getManager("WeightStore");
- TF_CHECK_OK(weight_rmgr->Delete<tensorflow::tensorrt::TRTWeightStore>(
- res_name, res_name));
- auto engine_plan = calib_res->engine_->serialize();
- calib_res->engine_->destroy();
- calib_res->network_->destroy();
- calib_res->builder_->destroy();
- calib_res->thr_ = nullptr;
- calib_res->engine_ = nullptr;
- calib_res->builder_ = nullptr;
- tensorflow::NodeDefBuilder op_builder(engine_name, "TRTEngineOp");
- std::vector<tensorflow::NodeDefBuilder::NodeOut> income_edges;
- income_edges.resize(c_node->num_inputs());
- for (const auto in_edge : c_node->in_edges()) {
- auto src = in_edge->src();
- int dest_port = in_edge->dst_input();
- VLOG(1) << "Incoming connection " << src->name() << ":"
- << in_edge->src_output() << " -> " << c_node->name() << ":"
- << dest_port;
- income_edges.at(dest_port) = {src->name(), in_edge->src_output(),
- c_node->input_type(dest_port)};
- }
- tensorflow::gtl::ArraySlice<tensorflow::NodeDefBuilder::NodeOut> input_list(
- income_edges);
- if (VLOG_IS_ON(2)) {
- for (const auto& inp : input_list) {
- VLOG(2) << " Input from inputlist " << inp.node << ":" << inp.index << " "
- << tensorflow::DataTypeString(inp.data_type);
- }
- }
- op_builder.Input(input_list);
- tensorflow::NodeDef engine_node;
- const char* engine_plan_data = static_cast<const char*>(engine_plan->data());
- string engine_plan_string(engine_plan_data,
- engine_plan_data + engine_plan->size());
- status = op_builder.Attr("serialized_engine", engine_plan_string)
- .Attr("input_nodes", input_names)
- .Attr("output_nodes", output_nodes)
- .Attr("OutT", out_types)
- .Finalize(&engine_node);
- if (!status.ok()) {
- LOG(ERROR) << "Engine Node creation failed";
- return status;
- }
- auto trt_engine_node = graph.AddNode(engine_node, &status);
- TF_RETURN_IF_ERROR(status);
- std::map<string, int> port_map;
- for (size_t t = 0; t < output_nodes.size(); t++) {
- port_map.insert({output_nodes.at(t), t});
- }
- for (auto& i : out_edges) {
- string s(i->src()->name());
- if (i->src_output()) StrAppend(&s, ":", i->src_output());
- int out_port = port_map.at(s);
- VLOG(1) << "Connecting " << trt_engine_node->name() << ":" << out_port
- << " -> " << i->dst()->name() << ":" << i->dst_input();
- TF_RETURN_IF_ERROR(
- graph.UpdateEdge(trt_engine_node, out_port, i->dst(), i->dst_input()));
- }
- for (const auto ed : trt_engine_node->in_edges()) {
- VLOG(1) << "In Edge " << ed->src()->name() << ":" << ed->src_output()
- << " -> " << ed->dst()->name() << ":" << ed->dst_input();
- }
- for (const auto ed : trt_engine_node->out_edges()) {
- VLOG(1) << "Out Edge " << ed->src()->name() << ":" << ed->src_output()
- << " -> " << ed->dst()->name() << ":" << ed->dst_input();
- }
- VLOG(1) << "Segment nodes:";
- for (auto& i : segment_nodes) {
- VLOG(1) << " " << i << " in graph " << node_maps.count(i);
- auto it = node_maps.find(i);
- if (it != node_maps.end()) {
- graph.RemoveNode(it->second);
- }
- }
- graph.RemoveNode(c_node);
- return tensorflow::Status::OK();
-}
-tensorflow::Status ReverseTopologicalSort(
- const tensorrt::convert::SubGraphParams& s,
- std::list<tensorflow::Node*>* order) {
- std::vector<tensorflow::Node*> order_vec;
- tensorflow::GetPostOrder(s.graph, &order_vec);
- // Select just the subgraph
- for (tensorflow::Node* node : order_vec) {
- if (s.subgraph_node_ids.count(node->id())) {
- // We want topological order to contstruct the
- // network layer by layer
- order->push_front(node);
+ input_dim_pseudo_chw.nbDims = shape.dims() - 1;
+ nvinfer1::ITensor* input_tensor = converter.network()->addInput(
+ node_name.c_str(), dtype, input_dim_pseudo_chw);
+ if (!input_tensor) {
+ return tensorflow::errors::InvalidArgument(
+ "Failed to create Input layer tensor ", node_name,
+ " rank=", shape.dims() - 1);
+ }
+ VLOG(1) << "Input tensor name :" << node_name;
+ if (!converter.insert_input_tensor(node_name, input_tensor)) {
+ return tensorflow::errors::AlreadyExists(
+ "Output tensor already exists for op: " + node_name);
+ }
+ } else if (tensorflow::str_util::StartsWith(node_name, kOutputPHName) &&
+ (node_def.op() == "Identity")) {
+ int32 slot_number = -1;
+ if (!tensorflow::strings::safe_strto32(node_name.c_str() + 9,
+ &slot_number)) {
+ LOG(ERROR) << "Failed to parse slot number from " << node_name
+ << " +9=" << node_name.c_str() + 9;
+ }
+ if (output_tensors.size() <= slot_number) {
+ output_tensors.resize(slot_number + 1);
+ }
+ output_tensors.at(slot_number) = {node_def.input(0), node_name};
+ } else {
+ VLOG(2) << "Converting node: " << node_def.name() << " , "
+ << node_def.op();
+ TF_RETURN_IF_ERROR(converter.convert_node(node_def));
}
}
- return tensorflow::Status::OK();
-}
-
-tensorflow::Status SetInputList(
- const tensorrt::convert::SubGraphParams& s,
- tensorflow::NodeDefBuilder* op_builder,
- const std::vector<string>* input_names,
- std::vector<tensorflow::DataType>* input_dtypes) {
- std::vector<tensorflow::NodeDefBuilder::NodeOut> income_edges;
- VLOG(2) << "input edge size: " << input_names->size();
- for (size_t i = 0; i < input_names->size(); ++i) {
- VLOG(2) << "input edges: " << i << " " << input_names->at(i);
- int output_idx = s.input_inds.at(i).second;
- // we wired up the input here already, it is redundant to do it again in
- // ConvertSubGraphToTensorRT(convert_graph.cc)
- auto incoming_edge = tensorflow::NodeDefBuilder::NodeOut(
- input_names->at(i), output_idx, input_dtypes->at(i));
- income_edges.push_back(incoming_edge);
- }
- tensorflow::gtl::ArraySlice<tensorflow::NodeDefBuilder::NodeOut> input_list(
- income_edges);
- op_builder->Input(input_list);
- return tensorflow::Status::OK();
-}
-
-string SubgraphNameScopeGenerator(const std::list<tensorflow::Node*>* order) {
- string subgraph_name_scope;
- if (!order->empty()) {
- subgraph_name_scope = order->front()->name();
- }
- for (const tensorflow::Node* node : *order) {
- subgraph_name_scope = GetCommonNameScope(subgraph_name_scope, node->name());
- }
- // TODO(sami,ben,jie): proper naming!
- return subgraph_name_scope;
-}
-
-tensorflow::Status ConvertSubgraph(
- Converter& converter, tensorrt::convert::SubGraphParams& s,
- std::list<tensorflow::Node*>* order, std::vector<string>* input_names,
- std::vector<tensorflow::DataType>* input_dtypes,
- std::vector<string>* output_names,
- std::vector<tensorflow::DataType>* output_dtypes,
- const string& engine_name) {
- std::set<string> added_tensors;
- for (const std::pair<int, int>& input : s.input_inds) {
- VLOG(2) << "parsing input. Node id= " << input.first;
- int node_id = input.first;
- int output_idx = input.second;
- tensorflow::Node* node = s.graph.FindNodeId(node_id);
- auto node_name = node->name();
- // input_names should use the node name in the graph
- // here it should be the input tensor name -> matching the binding
- // insert original node name without port
- auto tensor_name = node_name;
- if (output_idx != 0) {
- tensor_name = StrCat(tensor_name, ":", output_idx);
- }
-
- VLOG(2) << "input name: " << node_name << " tensor_name: " << tensor_name
- << " idx: " << output_idx;
-
- auto shape_inference_node_name = node_name;
- auto shape_inference_output_idx = output_idx;
- // rewire the shape inference to original node in the graph
- if (s.output_edge_map->count(tensor_name)) {
- shape_inference_node_name = s.output_edge_map->at(tensor_name).second;
- shape_inference_output_idx = s.output_edge_map->at(tensor_name).first;
- }
- if (shape_inference_output_idx < 0) continue;
- VLOG(2) << "shapeinference name: " << shape_inference_node_name
- << " idx: " << shape_inference_output_idx;
-
- if (!s.graph_properties.HasOutputProperties(shape_inference_node_name))
- return tensorflow::errors::Internal("failed to find input node: " +
- shape_inference_node_name);
-
- auto op_info_vec =
- s.graph_properties.GetOutputProperties(shape_inference_node_name);
- if (static_cast<int>(op_info_vec.size()) <= shape_inference_output_idx)
- return tensorflow::errors::Internal(
- "accessing output index of: ", shape_inference_output_idx,
- ", at node: ", shape_inference_node_name,
- " with output entry from shape_map: ", op_info_vec.size());
-
- auto op_info = op_info_vec.at(shape_inference_output_idx);
- tensorflow::DataType tf_dtype = op_info.dtype();
-
- nvinfer1::DataType dtype(nvinfer1::DataType::kFLOAT);
- auto type_status = ConvertDType(tf_dtype, &dtype);
- if (type_status != tensorflow::Status::OK()) {
- LOG(WARNING) << "Type conversion failed for " << node_name;
- return type_status;
- }
-
- VLOG(2) << "Accessing output index of: " << output_idx
- << ", at node: " << node_name
- << " with output entry from shape_map: " << op_info_vec.size();
- // TODO(ben,jie): update TRT input format/dimension
- nvinfer1::DimsCHW input_dim_pseudo_chw;
- for (int i = 0; i < 3; i++) input_dim_pseudo_chw.d[i] = 1;
-
- // TODO(jie): TRT 3.x only support 4 dimensional input tensor.
- // update the code once TRT 4.0 comes out.
- if (op_info.shape().dim_size() != 4) {
- string err_str = "Require 4 dimensional input.";
- StrAppend(&err_str, " Got ", op_info.shape().dim_size(), " ",
- shape_inference_node_name);
- return tensorflow::errors::Unimplemented(err_str);
- }
-
- for (int i = 1; i < op_info.shape().dim_size(); i++) {
- VLOG(2) << "dimension: " << i
- << " , size: " << op_info.shape().dim(i).size();
- input_dim_pseudo_chw.d[i - 1] = op_info.shape().dim(i).size();
- }
-
- // TODO(ben,jie): proper way to restore input tensor name?
- auto input_tensor_name = node_name;
- if (output_idx != 0) {
- input_tensor_name = StrCat(node_name, ":", output_idx);
- }
- if (added_tensors.count(input_tensor_name)) continue;
- added_tensors.insert(input_tensor_name);
- input_names->push_back(input_tensor_name);
- input_dtypes->push_back(tf_dtype);
- nvinfer1::ITensor* input_tensor = converter.network()->addInput(
- input_tensor_name.c_str(), dtype, input_dim_pseudo_chw);
-
- if (!input_tensor)
- return tensorflow::errors::InvalidArgument(
- "Failed to create Input layer");
- VLOG(2) << "Input tensor name :" << input_tensor_name;
-
- if (!converter.insert_input_tensor(input_tensor_name, input_tensor))
- return tensorflow::errors::AlreadyExists(
- "Output tensor already exists for op: " + input_tensor_name);
- }
-
- for (const tensorflow::Node* node : *order) {
- const tensorflow::NodeDef& node_def = node->def();
- VLOG(2) << "Converting node: " << node_def.name() << " , " << node_def.op();
- TF_RETURN_IF_ERROR(converter.convert_node(node_def));
- }
-
- VLOG(2) << "Finished conversion";
-
- // Gather output metadata
- int trt_engine_op_output_idx = 0;
- added_tensors.clear();
- for (const std::pair<int, int>& output : s.output_inds) {
- int node_id = output.first;
- int output_idx = output.second;
- tensorflow::Node* node = s.graph.FindNodeId(node_id);
- string op_name = node->name();
- string tensor_name = op_name;
-
- s.output_edge_map->insert(
- {trt_engine_op_output_idx == 0
- ? engine_name
- : StrCat(engine_name, ":", trt_engine_op_output_idx),
- {output_idx, tensor_name}});
- trt_engine_op_output_idx++;
- if (output_idx != 0)
- tensorflow::strings::StrAppend(&tensor_name, ":", output_idx);
- VLOG(2) << "Output tensor name: " << tensor_name;
- if (added_tensors.count(tensor_name)) continue;
- added_tensors.insert(tensor_name);
- output_names->push_back(tensor_name);
- auto tensor_or_weights = converter.get_tensor(tensor_name);
+ for (const auto& output : output_tensors) {
+ auto tensor_or_weights = converter.get_tensor(output.first);
if (!tensor_or_weights.is_tensor()) {
- return tensorflow::errors::InvalidArgument("Output node '" + tensor_name +
- "' is weights not tensor");
+ return tensorflow::errors::InvalidArgument(
+ "Output node '" + output.first + "' is weights not tensor");
}
nvinfer1::ITensor* tensor = tensor_or_weights.tensor();
     if (!tensor) {
       return tensorflow::errors::NotFound("Output tensor not found: " +
-                                          tensor_name);
+                                          output.first);
     }
+    tensor->setName(output.second.c_str());
+    VLOG(1) << "Marking output tensor " << output.first << " as output tensor "
+            << output.second;
+
converter.network()->markOutput(*tensor);
- tensorflow::DataType tf_dtype = node->output_type(output_idx);
- output_dtypes->push_back(tf_dtype);
- nvinfer1::DataType trt_dtype = nvinfer1::DataType::kFLOAT;
- TF_RETURN_IF_ERROR(ConvertDType(tf_dtype, &trt_dtype));
- tensor->setType(trt_dtype);
}
+ if (convert_successfully) *convert_successfully = true;
- return tensorflow::Status::OK();
-}
-
-tensorflow::Status InjectCalibrationNode(tensorrt::convert::SubGraphParams& s) {
- // Visit nodes in reverse topological order and construct the TRT network.
- // Toposort
- std::list<tensorflow::Node*> order;
- TF_RETURN_IF_ERROR(ReverseTopologicalSort(s, &order));
-
- static int static_id = 0;
- string subgraph_name_scope = SubgraphNameScopeGenerator(&order);
- // TODO(sami,ben,jie): proper naming!
- string calib_op_name =
- StrCat(subgraph_name_scope, "my_trt_calib_op_", static_id);
- string engine_name = StrCat(subgraph_name_scope, "my_trt_op", static_id);
- static_id++;
-
- auto trt_rmgr = tensorflow::tensorrt::TRTResourceManager::instance();
- auto op_rmgr = trt_rmgr->getManager("TRTCalibOps");
- auto op_res = new tensorflow::tensorrt::TRTCalibrationResource();
- TF_CHECK_OK(op_rmgr->Create(calib_op_name, calib_op_name, op_res));
- op_res->logger_ = new tensorflow::tensorrt::Logger();
- cudaSetDevice(s.cuda_gpu_id_);
- op_res->builder_ = nvinfer1::createInferBuilder(*(op_res->logger_));
- op_res->allocator_ = s.allocator_;
-#if NV_TENSORRT_MAJOR > 3
- op_res->builder_->setGpuAllocator(s.allocator_.get());
-#endif
- if (!op_res->builder_) {
- return tensorflow::errors::Internal(
- "failed to create TensorRT builder object");
+ // Build the engine.
+ VLOG(1) << "Starting engine creation";
+ engine->reset(builder->buildCudaEngine(*converter.network()));
+ if (engine->get() == nullptr) {
+ return tensorflow::errors::Internal("Failed to build TensorRT engine");
}
-
- op_res->network_ = op_res->builder_->createNetwork();
- if (!op_res->network_) {
- return tensorflow::errors::Internal(
- "failed to create TensorRT network object");
- }
-
- // Build the network
- auto weight_rmgr = trt_rmgr->getManager("WeightStore");
- auto ws = new tensorflow::tensorrt::TRTWeightStore();
- TF_CHECK_OK(weight_rmgr->Create(calib_op_name, calib_op_name, ws));
- Converter converter(op_res->network_, ws, s.precision_mode == FP16MODE);
-
- std::vector<string> input_names;
- std::vector<tensorflow::DataType> input_dtypes;
- std::vector<string> output_names;
- std::vector<tensorflow::DataType> output_dtypes;
- TF_RETURN_IF_ERROR(ConvertSubgraph(converter, s, &order, &input_names,
- &input_dtypes, &output_names,
- &output_dtypes, engine_name));
-
- VLOG(2) << "Finished processing outputs";
-
- // Build the engine
- op_res->builder_->setMaxBatchSize(s.max_batch_size);
- op_res->builder_->setMaxWorkspaceSize(s.max_workspace_size_bytes);
- VLOG(0) << "Max batch size= " << s.max_batch_size
- << " max workspace size= " << s.max_workspace_size_bytes;
-
- // Build the TRT op
- // TODO(sami,ben,jie): proper naming!
- tensorflow::NodeDefBuilder op_builder(calib_op_name, "TRTCalibOp");
- TF_RETURN_IF_ERROR(SetInputList(s, &op_builder, &input_names, &input_dtypes));
-
- std::vector<string> segment_names;
- segment_names.reserve(s.subgraph_node_ids.size());
- for (int i : s.subgraph_node_ids) {
- auto node = s.graph.FindNodeId(i);
- segment_names.push_back(node->name());
- }
- LOG(INFO) << "finished op preparation";
-
- auto status = op_builder.Attr("segment_nodes", segment_names)
- .Attr("input_names", input_names)
- .Attr("segment_output_names", output_names)
- .Attr("resource_name", calib_op_name)
- .Finalize(s.trt_node);
-
- LOG(INFO) << status.ToString();
- LOG(INFO) << "finished op building";
-
+ VLOG(1) << "Finished conversion";
return tensorflow::Status::OK();
}
-tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
- tensorrt::convert::SubGraphParams& s) {
- // Visit nodes in reverse topological order and construct the TRT network.
- std::list<tensorflow::Node*> order;
- TF_RETURN_IF_ERROR(ReverseTopologicalSort(s, &order));
-
- static int static_id = 0;
- string subgraph_name_scope = SubgraphNameScopeGenerator(&order);
- string engine_name = StrCat(subgraph_name_scope, "my_trt_op", static_id++);
-
- tensorflow::tensorrt::Logger trt_logger;
- cudaSetDevice(s.cuda_gpu_id_);
- auto trt_builder = infer_object(nvinfer1::createInferBuilder(trt_logger));
- if (!trt_builder) {
- return tensorflow::errors::Internal(
- "Failed to create TensorRT builder object");
- }
-#if NV_TENSORRT_MAJOR > 3
- trt_builder->setGpuAllocator(s.allocator_.get());
-#endif
- auto trt_network = infer_object(trt_builder->createNetwork());
- if (!trt_network) {
- return tensorflow::errors::Internal(
- "Failed to create TensorRT network object");
- }
-
- auto trt_rmgr = tensorflow::tensorrt::TRTResourceManager::instance();
- auto weight_rmgr = trt_rmgr->getManager("WeightStore");
- auto ws = new tensorflow::tensorrt::TRTWeightStore();
- TF_CHECK_OK(weight_rmgr->Create(engine_name, engine_name, ws));
-
- // Build the network
- Converter converter(trt_network.get(), ws, s.precision_mode == FP16MODE);
-
- std::vector<string> input_names;
- std::vector<tensorflow::DataType> input_dtypes;
- std::vector<string> output_names;
- std::vector<tensorflow::DataType> output_dtypes;
- TF_RETURN_IF_ERROR(ConvertSubgraph(converter, s, &order, &input_names,
- &input_dtypes, &output_names,
- &output_dtypes, engine_name));
-
- VLOG(2) << "Finished output";
-
- // Build the engine
- trt_builder->setMaxBatchSize(s.max_batch_size);
- trt_builder->setMaxWorkspaceSize(s.max_workspace_size_bytes);
- VLOG(0) << "Max batch size= " << s.max_batch_size
- << " max workspace size= " << s.max_workspace_size_bytes;
- if (s.precision_mode == FP16MODE) {
- trt_builder->setHalf2Mode(true);
- VLOG(0) << "Using FP16 precision mode";
- }
- LOG(INFO) << "starting build engine";
- string engine_plan_string;
- {
- auto trt_engine =
- infer_object(trt_builder->buildCudaEngine(*converter.network()));
- VLOG(0) << "Built network";
- if (trt_engine.get() == nullptr) {
- return tensorflow::errors::Internal("Engine building failure");
+tensorflow::Status ConvertSegmentToGraphDef(
+ const tensorflow::Graph* graph,
+ const tensorflow::grappler::GraphProperties& graph_properties,
+ const std::vector<int>& subgraph_node_ids, // In topological order
+ std::vector<EngineConnection>* connections,
+ tensorflow::GraphDef* segment_def, string* common_scope) {
+ std::set<string> marker_nodes;
+ // Update connection shapes/data types and add corresponding input/output
+ // nodes in the segment graphdef.
+ for (size_t i = 0; i < connections->size(); ++i) {
+ auto& connection = connections->at(i);
+ auto outside_node = graph->FindNodeId(connection.outside_id);
+ if (!outside_node) {
+ // This should never happen, unless the original graph is problematic.
+ return tensorflow::errors::NotFound(
+ "Cannot find node with id ", connection.outside_id, " in the graph.");
+ }
+ // Updates the shape and data types of input/output connections.
+ tensorflow::DataType input_type = tensorflow::DT_FLOAT;
+ tensorflow::PartialTensorShape partial_shape;
+ if (connection.is_input_edge) {
+ if (graph_properties.HasOutputProperties(connection.outside_node_name)) {
+ auto output_params =
+ graph_properties.GetOutputProperties(connection.outside_node_name);
+ auto out_shape = output_params.at(connection.outside_port);
+ input_type = out_shape.dtype();
+ std::vector<tensorflow::int64> dims;
+ partial_shape = out_shape.shape();
+ connection.outside_shape = partial_shape;
+ } else {
+ VLOG(0) << "Unknown output shape" << outside_node->name();
+ input_type = graph->FindNodeId(connection.outside_id)
+ ->output_type(connection.outside_port);
+ }
+ connection.connection_type = input_type;
+
+ } else { // output edge
+ if (graph_properties.HasInputProperties(connection.outside_node_name)) {
+ auto input_params =
+ graph_properties.GetInputProperties(connection.outside_node_name);
+ auto in_shape = input_params.at(connection.outside_port);
+ input_type = in_shape.dtype();
+ partial_shape = in_shape.shape();
+ connection.inside_shape = partial_shape;
+ } else {
+ input_type = graph->FindNodeId(connection.inside_id)
+ ->output_type(connection.outside_port);
+ }
+ connection.connection_type = input_type;
}
- auto engine_plan = infer_object(trt_engine->serialize());
- VLOG(0) << "Serialized engine";
- const char* engine_plan_data =
- static_cast<const char*>(engine_plan->data());
- engine_plan_string =
- string(engine_plan_data, engine_plan_data + engine_plan->size());
- }
- TF_RETURN_IF_ERROR(weight_rmgr->Delete<tensorflow::tensorrt::TRTWeightStore>(
- engine_name, engine_name));
- LOG(INFO) << "finished engine " << engine_name << " containing "
- << s.subgraph_node_ids.size() << " nodes";
-
- // Build the TRT op
- tensorflow::NodeDefBuilder op_builder(engine_name, "TRTEngineOp");
- TF_RETURN_IF_ERROR(SetInputList(s, &op_builder, &input_names, &input_dtypes));
-
- VLOG(0) << "Finished op preparation";
-
- auto status = op_builder.Attr("serialized_engine", engine_plan_string)
- .Attr("input_nodes", input_names)
- .Attr("output_nodes", output_names)
- .Attr("OutT", output_dtypes)
- .Device(s.device_name_)
- .Finalize(s.trt_node);
-
- VLOG(0) << status.ToString() << " finished op building for " << engine_name
- << " on device " << s.device_name_;
+ // Add dummy input/output nodes to the segment graphdef.
+ if (connection.is_input_edge) {
+ const string node_name = StrCat(kInputPHName, connection.port_number);
+ if (marker_nodes.count(node_name)) {
+ VLOG(1) << "Reusing input " << node_name << " for the edge "
+ << connection.outside_node_name << ":"
+ << connection.outside_port << " -> "
+ << connection.inside_node_name << ":" << connection.inside_port;
+ continue;
+ }
+ marker_nodes.insert(node_name);
+ auto seg_node = segment_def->add_node();
+ tensorflow::NodeDefBuilder builder(node_name, "Placeholder");
+ auto status = builder.Attr("shape", partial_shape)
+ .Attr("dtype", input_type)
+ .Finalize(seg_node);
+ VLOG(1) << "Constructing input " << node_name << " for the edge "
+ << connection.outside_node_name << ":" << connection.outside_port
+ << " -> " << connection.inside_node_name << ":"
+ << connection.inside_port;
+ } else {
+ const string node_name = StrCat(kOutputPHName, connection.port_number);
+ if (marker_nodes.count(node_name)) {
+ VLOG(1) << "Reusing output " << node_name << " for the edge "
+ << connection.inside_node_name << ":" << connection.inside_port
+ << " -> " << connection.outside_node_name << ":"
+ << connection.outside_port;
+ continue;
+ }
+ marker_nodes.insert(node_name);
+ auto seg_node = segment_def->add_node();
+ tensorflow::NodeDefBuilder builder(node_name, "Identity");
+ auto status = builder.Input(connection.inside_node_name, 0, input_type)
+ .Finalize(seg_node);
+ VLOG(1) << "Constructing output " << node_name << " for the edge "
+ << connection.inside_node_name << ":" << connection.inside_port
+ << " -> " << connection.outside_node_name << ":"
+ << connection.outside_port;
+ }
+ } // for each connection.
+
+ std::unordered_map<int, int> old_to_new_id_map;
+ // Copy internal nodes to new graphdef
+ string local_scope = graph->FindNodeId(*subgraph_node_ids.begin())->name();
+ for (const auto node_id : subgraph_node_ids) {
+ const auto node = graph->FindNodeId(node_id);
+ local_scope = GetCommonNameScope(local_scope, node->name());
+ old_to_new_id_map[node_id] = segment_def->node_size();
+ auto snode = segment_def->add_node();
+ snode->CopyFrom(node->def());
+ VLOG(1) << "Copying " << snode->name() << " to subgraph";
+ }
+ // Update the inputs of the new input nodes to point to placeholder nodes.
+ for (int i = 0; i < connections->size(); ++i) {
+ auto& connection = connections->at(i);
+ if (!connection.is_input_edge) continue;
+ auto snode =
+ segment_def->mutable_node(old_to_new_id_map[connection.inside_id]);
+ const string placeholder_name =
+ StrCat(kInputPHName, connection.port_number);
+ VLOG(1) << "Updating " << snode->name() << ":" << connection.inside_port
+ << " from " << snode->input(connection.inside_port) << " to "
+ << placeholder_name;
+ snode->set_input(connection.inside_port, placeholder_name);
+ }
+ *common_scope = local_scope;
+ VLOG(0) << "Segment @scope '" << local_scope << "', converted to graph";
return tensorflow::Status::OK();
}
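The two entry points above replace the old per-subgraph conversion path: ConvertSegmentToGraphDef() first rewrites a segment into a standalone GraphDef whose boundary is marked with InputPH_*/OutputPH_* nodes, and ConvertGraphDefToEngine() then turns that GraphDef into a TensorRT engine. Below is a minimal sketch of how a caller might chain them, assuming the tensorflow::tensorrt::convert namespace and that graph, graph_properties, subgraph_node_ids, connections and input_shapes are already populated; all names and values here are illustrative, not from the patch, and on TRT4+ a real GPU allocator would be passed instead of nullptr.

  tensorflow::GraphDef segment_def;
  string common_scope;
  TF_RETURN_IF_ERROR(ConvertSegmentToGraphDef(
      &graph, graph_properties, subgraph_node_ids, &connections,
      &segment_def, &common_scope));

  tensorflow::tensorrt::Logger trt_logger;
  TrtUniquePtrType<nvinfer1::ICudaEngine> engine;
  bool convert_ok = false;
  TF_RETURN_IF_ERROR(ConvertGraphDefToEngine(
      segment_def, FP32MODE, /*max_batch_size=*/8,
      /*max_workspace_size_bytes=*/1 << 28, input_shapes, &trt_logger,
      /*allocator=*/nullptr, /*calibrator=*/nullptr, &engine,
      &convert_ok));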
diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.h b/tensorflow/contrib/tensorrt/convert/convert_nodes.h
index 3f6592cd25..1a4c0e755d 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_nodes.h
+++ b/tensorflow/contrib/tensorrt/convert/convert_nodes.h
@@ -22,69 +22,112 @@ limitations under the License.
#include <utility>
#include <vector>
+#include "tensorflow/contrib/tensorrt/convert/utils.h"
#include "tensorflow/contrib/tensorrt/resources/trt_allocator.h"
+#include "tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.h"
#include "tensorflow/core/framework/graph.pb.h"
#include "tensorflow/core/graph/graph.h"
#include "tensorflow/core/grappler/costs/graph_properties.h"
#include "tensorflow/core/lib/core/status.h"
+
#if GOOGLE_CUDA
#if GOOGLE_TENSORRT
namespace tensorflow {
namespace tensorrt {
+static const char* kInputPHName = "InputPH_";
+static const char* kOutputPHName = "OutputPH_";
namespace convert {
+// TODO(aaroey): use an enum instead.
const int FP32MODE = 0;
const int FP16MODE = 1;
const int INT8MODE = 2;
-struct SubGraphParams {
- SubGraphParams(
- tensorflow::Graph& inp_graph,
- const std::set<int>& subgraph_node_id_numbers,
- const std::vector<std::pair<int, int>>& input_indices,
- const std::vector<std::pair<int, int>>& output_indices,
- size_t max_supported_batch_size, size_t max_consumed_workspace_size_bytes,
- const tensorflow::grappler::GraphProperties& current_graph_properties,
- std::unordered_map<string, std::pair<int, string>>* output_edges,
- tensorflow::NodeDef* constructed_trt_node,
- int engine_precision_mode = FP32MODE, const string& device_name = "",
- std::shared_ptr<nvinfer1::IGpuAllocator> allocator = nullptr,
- int cuda_gpu_id = 0)
- : graph(inp_graph),
- subgraph_node_ids(subgraph_node_id_numbers),
- input_inds(input_indices),
- output_inds(output_indices),
- max_batch_size(max_supported_batch_size),
- max_workspace_size_bytes(max_consumed_workspace_size_bytes),
- graph_properties(current_graph_properties),
- output_edge_map(output_edges),
- trt_node(constructed_trt_node),
- precision_mode(engine_precision_mode),
- device_name_(device_name),
- allocator_(allocator),
- cuda_gpu_id_(cuda_gpu_id) {}
-
- tensorflow::Graph& graph;
- const std::set<int>& subgraph_node_ids;
- const std::vector<std::pair<int, int>>& input_inds; // {node_id, output_idx}
- const std::vector<std::pair<int, int>>& output_inds; // {node_id, output_idx}
- size_t max_batch_size;
- size_t max_workspace_size_bytes;
- const tensorflow::grappler::GraphProperties& graph_properties;
- std::unordered_map<string, std::pair<int, string>>* output_edge_map;
- tensorflow::NodeDef* trt_node;
- const int precision_mode;
- const string device_name_;
- std::shared_ptr<nvinfer1::IGpuAllocator> allocator_;
- const int cuda_gpu_id_;
+struct EngineConnection {
+ EngineConnection(const string& outside, int out_id, int out_port,
+ const string& inside, int in_id, int in_port,
+ bool input_edge, int port)
+ : outside_node_name(outside),
+ outside_id(out_id),
+ outside_port(out_port),
+ inside_node_name(inside),
+ inside_id(in_id),
+ inside_port(in_port),
+ is_input_edge(input_edge),
+ port_number(port) {}
+
+ const string outside_node_name;
+ const int outside_id;
+ const int outside_port;
+ tensorflow::PartialTensorShape outside_shape;
+
+ const string inside_node_name;
+ const int inside_id;
+ const int inside_port;
+ tensorflow::PartialTensorShape inside_shape;
+
+ tensorflow::DataType connection_type;
+ bool is_input_edge;
+
+ // The port number of the TRT node connecting to this edge.
+ int port_number;
+};
+
+struct EngineInfo {
+ EngineInfo()
+ : engine_type(EngineType::TRTStatic),
+ max_workspace_size_bytes(0),
+ precision_mode(FP32MODE) {}
+
+ string engine_name;
+ string device;
+ tensorflow::GraphDef segment_graph_def;
+
+  // The segment nodes that are on one side of the edges are topologically
+  // sorted.
+ std::vector<EngineConnection> connections;
+
+ enum class EngineType { TRTStatic = 0, TRTDynamic = 1 };
+ EngineType engine_type;
+ int64 max_workspace_size_bytes;
+ int maximum_cached_engines;
+ std::vector<int> cached_engine_batches;
+ int precision_mode;
};
-// TODO(sami): Replace references with const reference or pointers
-tensorflow::Status ConvertSubGraphToTensorRTNodeDef(SubGraphParams& params);
-tensorflow::Status InjectCalibrationNode(SubGraphParams& params);
-tensorflow::Status ConvertCalibrationNodeToEngineNode(tensorflow::Graph& graph,
- tensorflow::Node* c_node);
+// Constructs a graphdef from the segment in the given graph. Adds placeholder
+// nodes for input edges (InputPH_*) and identity nodes for output edges
+// (OutputPH_*). This function needs to be called before TensorRT nodes are
+// inserted, in order to correctly get sizes from the original graph.
+//
+// - subgraph_node_ids: the node ids of the subgraph, must be sorted in
+// topological order.
+// - segment_def: the output GraphDef, whose non-input/output nodedefs will be
+// sorted in topological order.
+tensorflow::Status ConvertSegmentToGraphDef(
+ const tensorflow::Graph* graph,
+ const tensorflow::grappler::GraphProperties& graph_properties,
+ const std::vector<int>& subgraph_node_ids,
+ std::vector<EngineConnection>* connections,
+ tensorflow::GraphDef* segment_def, string* common_scope);
+
+// Converts the given subgraph to a TRT engine saved in 'engine'. Returns ok
+// iff the builder successfully builds the engine. If the result is not ok,
+// 'engine' will be set to nullptr. Once this returns, the builder is no
+// longer needed and can be safely destroyed.
+//
+// - convert_successfully: indicates whether the conversion to the TensorRT
+//   network is successful. This is different from successfully building the
+//   engine: building can still fail afterwards.
+tensorflow::Status ConvertGraphDefToEngine(
+ const tensorflow::GraphDef& gdef, int precision_mode, int max_batch_size,
+ size_t max_workspace_size_bytes,
+ const std::vector<tensorflow::PartialTensorShape>& input_shapes,
+ Logger* logger, nvinfer1::IGpuAllocator* allocator,
+ TRTInt8Calibrator* calibrator,
+ TrtUniquePtrType<nvinfer1::ICudaEngine>* engine,
+ bool* convert_successfully);
+
} // namespace convert
} // namespace tensorrt
} // namespace tensorflow
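EngineConnection records both endpoints of every edge crossing the segment boundary, and port_number ties each connection to the placeholder (input) or identity (output) marker node created by ConvertSegmentToGraphDef(). A hedged illustration of that naming contract, with made-up node names and ids:

  using tensorflow::strings::StrCat;
  // An input edge feeding engine port 2: conv1:0 (outside) -> relu1:0 (inside).
  EngineConnection input_conn("conv1", /*out_id=*/11, /*out_port=*/0,
                              "relu1", /*in_id=*/12, /*in_port=*/0,
                              /*input_edge=*/true, /*port=*/2);
  const string input_ph = StrCat(kInputPHName, input_conn.port_number);  // "InputPH_2"
  // The matching output side uses the identity marker name, e.g. "OutputPH_0".
  const string output_ph = StrCat(kOutputPHName, 0);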
diff --git a/tensorflow/contrib/tensorrt/convert/trt_optimization_pass.cc b/tensorflow/contrib/tensorrt/convert/trt_optimization_pass.cc
index 8f634b1f74..ec9dbfa13b 100644
--- a/tensorflow/contrib/tensorrt/convert/trt_optimization_pass.cc
+++ b/tensorflow/contrib/tensorrt/convert/trt_optimization_pass.cc
@@ -45,8 +45,24 @@ tensorflow::Status TRTOptimizationPass::Init(
if (params.count("max_batch_size")) {
maximum_batch_size_ = params.at("max_batch_size").i();
}
- if (params.count("max_workspace_size_bytes"))
+ is_dynamic_op_ = false;
+ if (params.count("is_dynamic_op")) {
+ is_dynamic_op_ = params.at("is_dynamic_op").b();
+ }
+ if (params.count("cached_engine_batches")) {
+ auto batch_vec = params.at("cached_engine_batches").list();
+ batches_.reserve(batch_vec.i_size());
+ for (const auto i : batch_vec.i()) {
+ batches_.push_back(i);
+ }
+ }
+ max_cached_batches_ = 1;
+ if (params.count("maximum_cached_engines")) {
+ max_cached_batches_ = params.at("maximum_cached_engines").i();
+ }
+ if (params.count("max_workspace_size_bytes")) {
maximum_workspace_size_ = params.at("max_workspace_size_bytes").i();
+ }
if (params.count("precision_mode")) {
string pm = Uppercase(params.at("precision_mode").s());
if (pm == "FP32") {
@@ -175,6 +191,17 @@ tensorflow::Status TRTOptimizationPass::Optimize(
if (VLOG_IS_ON(1)) {
PrintDebugInfo(cluster, item);
}
+  // This is a hack to work around an optimizer issue. MetaOptimizer calls
+  // optimization passes on function objects as well, but we should not modify
+  // generated funcdefs! This is fragile, but we don't have any other option
+  // until the framework fixes it.
+ if (item.id != "tf_graph") {
+ LOG(WARNING) << name_
+ << " is probably called on funcdef! This optimizer must *NOT* "
+ "be called on function objects.";
+ *optimized_graph = item.graph;
+ return tensorflow::Status::OK();
+ }
int max_dim = -1;
if (item.feed.size()) {
for (const auto& f : item.feed) {
@@ -204,11 +231,22 @@ tensorflow::Status TRTOptimizationPass::Optimize(
}
tensorflow::grappler::GraphProperties static_graph_properties(item);
TF_RETURN_IF_ERROR(static_graph_properties.InferStatically(true));
- auto status = tensorflow::tensorrt::convert::ConvertAfterShapes(
- item.graph, item.fetch, maximum_batch_size_, maximum_workspace_size_,
- optimized_graph, precision_mode_, minimum_segment_size_,
- static_graph_properties, cluster);
+ tensorflow::tensorrt::convert::ConversionParams cp;
+ cp.input_graph_def = &item.graph;
+ cp.output_names = &item.fetch;
+ cp.max_batch_size = maximum_batch_size_;
+ cp.max_workspace_size_bytes = maximum_workspace_size_;
+ cp.output_graph_def = optimized_graph;
+ cp.precision_mode = precision_mode_;
+ cp.minimum_segment_size = minimum_segment_size_;
+ cp.graph_properties = &static_graph_properties;
+ cp.cluster = cluster;
+ cp.is_dyn_op = is_dynamic_op_;
+ cp.cached_engine_batches = batches_;
+ cp.max_cached_engines = max_cached_batches_;
+ auto status = tensorflow::tensorrt::convert::ConvertAfterShapes(cp);
VLOG(2) << optimized_graph->DebugString();
+ VLOG(1) << "Returning from " << name_;
return status;
}
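The new Init() options (is_dynamic_op, cached_engine_batches, maximum_cached_engines) are read from the custom graph optimizer's parameter map, alongside the existing max_batch_size / max_workspace_size_bytes / precision_mode keys. A sketch of how such a map could be populated through RewriterConfig; the optimizer registration name and the concrete values are assumptions for illustration only.

  #include "tensorflow/core/protobuf/rewriter_config.pb.h"

  tensorflow::RewriterConfig rewriter_config;
  auto* trt_opt = rewriter_config.add_custom_optimizers();
  trt_opt->set_name("TensorRTOptimizer");  // assumed registration name
  auto& params = *trt_opt->mutable_parameter_map();
  params["max_batch_size"].set_i(8);
  params["max_workspace_size_bytes"].set_i(1 << 28);
  params["precision_mode"].set_s("FP16");
  params["is_dynamic_op"].set_b(true);
  params["maximum_cached_engines"].set_i(3);
  params["cached_engine_batches"].mutable_list()->add_i(1);
  params["cached_engine_batches"].mutable_list()->add_i(8);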
diff --git a/tensorflow/contrib/tensorrt/convert/trt_optimization_pass.h b/tensorflow/contrib/tensorrt/convert/trt_optimization_pass.h
index d8ecead23e..463ed3883e 100644
--- a/tensorflow/contrib/tensorrt/convert/trt_optimization_pass.h
+++ b/tensorflow/contrib/tensorrt/convert/trt_optimization_pass.h
@@ -61,6 +61,9 @@ class TRTOptimizationPass : public tensorflow::grappler::CustomGraphOptimizer {
int minimum_segment_size_;
int precision_mode_;
int maximum_batch_size_;
+ bool is_dynamic_op_;
+ std::vector<int> batches_;
+ int max_cached_batches_;
int64_t maximum_workspace_size_;
};
diff --git a/tensorflow/contrib/tensorrt/convert/utils.h b/tensorflow/contrib/tensorrt/convert/utils.h
new file mode 100644
index 0000000000..f601c06701
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/convert/utils.h
@@ -0,0 +1,37 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_TENSORRT_CONVERT_UTILS_H_
+#define TENSORFLOW_CONTRIB_TENSORRT_CONVERT_UTILS_H_
+
+#include <memory>
+
+namespace tensorflow {
+namespace tensorrt {
+
+template <typename T>
+struct TrtDestroyer {
+ void operator()(T* t) {
+ if (t) t->destroy();
+ }
+};
+
+template <typename T>
+using TrtUniquePtrType = std::unique_ptr<T, TrtDestroyer<T>>;
+
+} // namespace tensorrt
+} // namespace tensorflow
+
+#endif // TENSORFLOW_CONTRIB_TENSORRT_CONVERT_UTILS_H_
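TrtDestroyer/TrtUniquePtrType exist because TensorRT objects are released with destroy() rather than delete. A small usage sketch, mirroring how the rest of this change holds builders, engines and execution contexts (the logger and network variables are assumed, not from the patch):

  using tensorflow::tensorrt::TrtUniquePtrType;

  tensorflow::tensorrt::Logger trt_logger;
  TrtUniquePtrType<nvinfer1::IBuilder> builder(
      nvinfer1::createInferBuilder(trt_logger));
  TrtUniquePtrType<nvinfer1::INetworkDefinition> network(
      builder->createNetwork());
  // ... populate the network ...
  TrtUniquePtrType<nvinfer1::ICudaEngine> engine(
      builder->buildCudaEngine(*network));
  // destroy() runs automatically on each object when it goes out of scope.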
diff --git a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc
index 9ac8047944..8a17eb02f1 100644
--- a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc
+++ b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc
@@ -14,8 +14,16 @@ limitations under the License.
==============================================================================*/
#include "tensorflow/contrib/tensorrt/kernels/trt_engine_op.h"
+#include <algorithm>
+#include "tensorflow/contrib/tensorrt/convert/convert_nodes.h"
+#include "tensorflow/contrib/tensorrt/convert/utils.h"
#include "tensorflow/contrib/tensorrt/log/trt_logger.h"
-#include "tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h"
+#include "tensorflow/contrib/tensorrt/resources/trt_resource_manager.h"
+#include "tensorflow/contrib/tensorrt/resources/trt_resources.h"
+#include "tensorflow/core/framework/graph_to_functiondef.h"
+#include "tensorflow/core/lib/core/refcount.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/stream_executor.h"
#include "tensorflow/core/platform/types.h"
@@ -25,144 +33,556 @@ limitations under the License.
#include "cuda/include/cuda_runtime_api.h"
namespace tensorflow {
-static ::tensorflow::tensorrt::Logger logger;
-using IRuntime = nvinfer1::IRuntime;
-using Dims = nvinfer1::Dims;
-
namespace tensorrt {
+static Logger logger;
+using ::nvinfer1::IRuntime;
+using ::tensorflow::strings::StrAppend;
+using ::tensorflow::strings::StrCat;
+
+// A helper class that calls done() when destructed; used for asynchronous
+// execution. It allows the native segment and the TRT engine to be executed
+// simultaneously.
+class AsyncHelper : public tensorflow::core::RefCounted {
+ public:
+ AsyncHelper(tensorflow::AsyncOpKernel::DoneCallback done) { done_ = done; }
+ ~AsyncHelper() override { done_(); }
-TRTEngineOp::TRTEngineOp(OpKernelConstruction* context) : OpKernel(context) {
+ private:
+ tensorflow::AsyncOpKernel::DoneCallback done_;
+};
+
+#define TYPECASE(dt, X, Y) \
+ case dt: { \
+ return (void*)X->flat<tensorflow::EnumToDataType<dt>::Type>().data(); \
+ }
+
+void* GetTensorAddress(const Tensor* tensor_ptr) {
+ auto tensor_type = tensor_ptr->dtype();
+ switch (tensor_type) {
+ TYPECASE(tensorflow::DT_FLOAT, tensor_ptr, dest_ptr);
+ TYPECASE(tensorflow::DT_HALF, tensor_ptr, dest_ptr);
+ TYPECASE(tensorflow::DT_INT8, tensor_ptr, dest_ptr);
+ default: {
+ LOG(ERROR) << "Unsupported Data type "
+ << tensorflow::DataTypeString(tensor_type);
+ return nullptr;
+ }
+ }
+}
+
+tensorflow::Status TRTEngineOp::ConstructFunctionHandle(OpKernelContext* ctx) {
+ VLOG(1) << "Constructing function handle";
+ auto lib = ctx->function_library();
+ if (lib == nullptr) {
+ return tensorflow::errors::Internal("Context function library is null");
+ }
+ auto fdef = lib->GetFunctionLibraryDefinition()->Find(funcdef_name_);
+ if (fdef == nullptr) {
+ return tensorflow::errors::Internal("Native FunctionDef ", funcdef_name_,
+ " can't be found in function library");
+ }
+ tensorflow::FunctionLibraryRuntime::InstantiateOptions inst_ops;
+ inst_ops.overlay_lib = nullptr;
+ inst_ops.state_handle = "";
+ inst_ops.target = ctx->device()->name();
+ native_func_ = 0;
+ auto status = lib->Instantiate(funcdef_name_, AttrSlice(&fdef->attr()),
+ inst_ops, &native_func_);
+ if (!status.ok()) {
+ LOG(ERROR) << " Instantiating native function " << funcdef_name_
+ << " failed!";
+ }
+ return status;
+}
+
+TRTEngineOp::TRTEngineOp(OpKernelConstruction* context)
+ : AsyncOpKernel(context) {
// read serialized_engine
OP_REQUIRES_OK(context,
- context->GetAttr("serialized_engine", &serialized_engine_));
+ context->GetAttr("serialized_segment", &serialized_segment_));
+ OP_REQUIRES_OK(context,
+ context->GetAttr("workspace_size_bytes", &workspace_size_));
+ OP_REQUIRES_OK(context, context->GetAttr("static_engine", &static_engine_));
+ if (!static_engine_) {
+ if (!segment_graph_.ParseFromString(serialized_segment_)) {
+ LOG(ERROR) << "Parsing segment graph failed!";
+ context->SetStatus(tensorflow::errors::InvalidArgument(
+ "Failed to parse segment graphdef!"));
+ return;
+ }
+ serialized_segment_.resize(0);
+ }
+ VLOG(1) << "Constructing " << name();
+ string precision_string;
+ OP_REQUIRES_OK(context,
+ context->GetAttr("precision_mode", &precision_string));
+ string calibration_data;
+ OP_REQUIRES_OK(context,
+ context->GetAttr("calibration_data", &calibration_data));
+ OP_REQUIRES_OK(context,
+ context->GetAttr("segment_funcdef_name", &funcdef_name_));
+ if (precision_string == "FP32") {
+ precision_mode_ = convert::FP32MODE;
+ } else if (precision_string == "FP16") {
+ precision_mode_ = convert::FP16MODE;
+ } else if (precision_string == "INT8") {
+ precision_mode_ = convert::INT8MODE;
+ }
+ calibration_mode_ =
+ (precision_mode_ == convert::INT8MODE && calibration_data.size() == 0);
+ if (calibration_data.size()) {
+ calibrator_.reset(new TRTInt8Calibrator(calibration_data));
+ calibration_data.resize(0);
+ }
+ native_func_ = tensorflow::kInvalidHandle;
+ OP_REQUIRES_OK(context, context->GetAttr("max_cached_engines_count",
+ &max_cached_engines_));
+ OP_REQUIRES_OK(context,
+ context->GetAttr("fixed_input_size", &fixed_input_size_));
+ OP_REQUIRES_OK(context, context->GetAttr("cached_engine_batches",
+ &cached_engine_batches_));
+ std::sort(cached_engine_batches_.begin(), cached_engine_batches_.end());
+ if (VLOG_IS_ON(1)) {
+ string s("Engine Batches= ");
+ for (auto i : cached_engine_batches_) {
+ StrAppend(&s, i, " ");
+ }
+ VLOG(1) << s;
+ }
+}
- // register input output node name in trt_sub_graph
- OP_REQUIRES_OK(context, context->GetAttr("input_nodes", &input_nodes_));
- OP_REQUIRES_OK(context, context->GetAttr("output_nodes", &output_nodes_));
+void TRTEngineOp::ExecuteNativeSegment(tensorflow::OpKernelContext* ctx,
+ AsyncHelper* helper) {
+ if (!calibration_mode_) {
+ VLOG(1) << "Executing native engine";
+ }
+ std::vector<Tensor> inputs;
+ std::vector<Tensor>* outputs = new std::vector<Tensor>();
+ if (native_func_ == tensorflow::kInvalidHandle) {
+ auto status = ConstructFunctionHandle(ctx);
+ if (!status.ok()) {
+ LOG(ERROR) << "Couldn't construct function handle " << funcdef_name_;
+ ctx->SetStatus(status);
+ return;
+ }
+ }
+ auto lib = ctx->function_library();
+ tensorflow::FunctionLibraryRuntime::Options opts;
+ opts.step_id = ctx->step_id();
+ opts.rendezvous = ctx->rendezvous();
+ opts.cancellation_manager = ctx->cancellation_manager();
+ opts.runner = ctx->runner();
+ for (int i = 0; i < ctx->num_inputs(); i++) {
+ inputs.push_back(ctx->input(i));
+ }
+  helper->Ref();  // Increment the refcount while the native segment executes.
+ VLOG(1) << "Executing native segment " << name();
+ lib->Run(opts, native_func_, inputs, outputs,
+ [ctx, outputs, helper](const tensorflow::Status& s) {
+ tensorflow::core::ScopedUnref sc(helper);
+ VLOG(1) << "Native Segment completed";
+ if (!s.ok()) {
+ ctx->SetStatus(s);
+ return;
+ }
+ for (size_t t = 0; t < outputs->size(); ++t) {
+ ctx->set_output(t, outputs->at(t));
+ }
+ delete outputs;
+ });
}
-void TRTEngineOp::Compute(OpKernelContext* context) {
- // TODO(samikama) runtime should be taken from a resourcemanager as well.
- // Only engine should be in the op and context and runtime should be taken
- // from resourcemanager
+void TRTEngineOp::ExecuteCalibration(tensorflow::OpKernelContext* ctx,
+ AsyncHelper* helper) {
+ helper->Ref();
+ tensorflow::core::ScopedUnref sc(helper);
+ // TODO(aaroey): remove the ResourceMgr singleton.
+ auto trt_rm = TRTResourceManager::instance();
+ auto res_mgr = trt_rm->getManager("TRTCalibration");
+ TRTCalibrationResource* calib_res = nullptr;
+ auto status = res_mgr->LookupOrCreate(
+ funcdef_name_, "Calibrator", &calib_res,
+ {[ctx, this](TRTCalibrationResource** cr) -> tensorflow::Status {
+ return this->AllocateCalibrationResources(ctx, cr);
+ }});
+ if (!status.ok()) {
+ ctx->SetStatus(status);
+ return;
+ }
+ int num_inputs = ctx->num_inputs();
+ // Pass input data to calibrator
+ std::unordered_map<string, void*> input_data;
+ for (int i = 0; i < num_inputs; i++) {
+ const Tensor& t = ctx->input(i);
+ void* data_address = GetTensorAddress(&t);
+ if (data_address == nullptr) {
+ ctx->SetStatus(tensorflow::errors::InvalidArgument(
+ "Unsupported data type encountered in input ", i));
+ return;
+ }
+ // Check the allocated buffer is sufficient for input
+ const auto device_tensor = dev_tensors_.at(i).AccessTensor(ctx);
+ CHECK_EQ(t.TotalBytes(), device_tensor->TotalBytes());
+ input_data.emplace(StrCat(kInputPHName, i), data_address);
+ }
+ VLOG(2) << "Filled map for sending";
+ // copied from cuda_kernel_helper since it seems only valid in *.cu.cc files
+ const cudaStream_t* stream = CHECK_NOTNULL(
+ reinterpret_cast<const cudaStream_t*>(ctx->op_device_context()
+ ->stream()
+ ->implementation()
+ ->CudaStreamMemberHack()));
+ calib_res->calibrator_->setBatch(input_data, *stream);
+ VLOG(2) << "Passed calibration data";
+ ExecuteNativeSegment(ctx, helper);
+}
- if (!trt_execution_context_ptr_) {
- IRuntime* infer = nvinfer1::createInferRuntime(logger);
-#if NV_TENSORRT_MAJOR > 3
- auto device = context->device();
- auto dev_allocator =
- device->GetAllocator(tensorflow::AllocatorAttributes());
- if (!dev_allocator) {
- LOG(FATAL) << "Can't find device allocator for gpu device "
- << device->name();
- }
- allocator_ = std::make_shared<TRTDeviceAllocator>(dev_allocator);
- infer->setGpuAllocator(allocator_.get());
-#endif
- trt_engine_ptr_.reset(infer->deserializeCudaEngine(
- serialized_engine_.c_str(), serialized_engine_.size(),
- PluginFactoryTensorRT::GetInstance()));
- trt_execution_context_ptr_.reset(trt_engine_ptr_->createExecutionContext());
- // Runtime is safe to delete after engine creation
- infer->destroy();
- serialized_engine_.clear();
+int TRTEngineOp::GetEngineBatch(tensorflow::OpKernelContext* ctx) {
+ int num_batch = ctx->input(0).shape().dim_size(0);
+ int smallest_engine = 0;
+ for (const auto i : cached_engine_batches_) {
+ if (i >= num_batch) {
+ smallest_engine = i;
+ break;
+ }
}
- int num_binding = context->num_inputs() + context->num_outputs();
- std::vector<void*> buffers(num_binding);
+ // TODO(sami): Need an LRU here
+ if (smallest_engine == 0) {
+ if (max_cached_engines_ > cached_engine_batches_.size()) {
+ smallest_engine = num_batch;
+ cached_engine_batches_.push_back(num_batch);
+ VLOG(1) << "Running with batch size " << num_batch;
+ } else {
+ string s("Engine buffer is full. buffer limit= ");
+ StrAppend(&s, max_cached_engines_, ", current entries= ");
+ for (auto i : cached_engine_batches_) StrAppend(&s, i, ", ");
+ StrAppend(&s, "Requested batch= ", num_batch);
+ LOG(ERROR) << s;
+ ctx->SetStatus(tensorflow::errors::ResourceExhausted(
+ "Requested batch size is not available and engine cache is full"));
+ return -1;
+ }
+ }
+ return smallest_engine;
+}
- size_t binding_index;
- int num_batch = 0;
- for (int i = 0; i < context->num_inputs(); i++) {
- // Grab the input tensor
- binding_index = trt_engine_ptr_->getBindingIndex(input_nodes_[i].c_str());
+void TRTEngineOp::ComputeAsync(tensorflow::OpKernelContext* ctx,
+ tensorflow::AsyncOpKernel::DoneCallback done) {
+ auto helper = new AsyncHelper(done);
+ tensorflow::core::ScopedUnref sc(helper);
+ if (calibration_mode_) {
+ ExecuteCalibration(ctx, helper);
+ return;
+ }
+ const int smallest_engine = GetEngineBatch(ctx);
+ if (smallest_engine < 0) return; // GetEngineBatch already set the status.
+
+ const int num_batch = ctx->input(0).shape().dim_size(0);
+ auto& engine_ctx_pair = GetEngine(smallest_engine, ctx);
+ auto& trt_engine_ptr = engine_ctx_pair.first;
+ if (!trt_engine_ptr) {
+ LOG(WARNING) << "Engine retrieval for batch size " << num_batch
+ << " failed Running native segment";
+ ExecuteNativeSegment(ctx, helper);
+ return;
+ }
- const Tensor& input_tensor = context->input(i);
+ const int num_binding = ctx->num_inputs() + ctx->num_outputs();
+ std::vector<void*> buffers(num_binding);
+ for (int i = 0; i < ctx->num_inputs(); i++) {
+ const string inp_name = StrCat(kInputPHName, i);
+ const size_t binding_index =
+ trt_engine_ptr->getBindingIndex(inp_name.c_str());
+
+ const Tensor& input_tensor = ctx->input(i);
const TensorShape& input_shape = input_tensor.shape();
- if (i == 0) {
- num_batch = input_shape.dim_size(0);
- if (num_batch > trt_engine_ptr_->getMaxBatchSize()) {
- LOG(FATAL) << "input tensor batch larger than max_batch_size: "
- << trt_engine_ptr_->getMaxBatchSize();
- }
- } else if (num_batch != input_shape.dim_size(0)) {
- LOG(FATAL) << "input data inconsistent batch size";
- break;
+ if (num_batch != input_shape.dim_size(0)) {
+ LOG(ERROR) << "input data inconsistent batch size";
+ ctx->SetStatus(tensorflow::errors::FailedPrecondition(
+ "Different batch sizes between input tensors"));
+ return;
}
- auto dtype = trt_engine_ptr_->getBindingDataType(binding_index);
+ auto dtype = trt_engine_ptr->getBindingDataType(binding_index);
switch (dtype) {
case nvinfer1::DataType::kFLOAT:
buffers[binding_index] = (void*)(input_tensor.flat<float>().data());
break;
case nvinfer1::DataType::kHALF:
- LOG(FATAL) << "half size is not supported yet!";
- break;
+ LOG(ERROR) << "FP16 inputs are not supported yet!";
+ ctx->SetStatus(tensorflow::errors::InvalidArgument(
+ "FP16 inputs are not supported!"));
+ return;
case nvinfer1::DataType::kINT8:
- LOG(FATAL) << "int8 is not supported yet!";
- break;
+ LOG(ERROR) << "INT8 inputs are not supported yet!";
+ ctx->SetStatus(tensorflow::errors::InvalidArgument(
+ "INT8 inputs are not supported!"));
+ return;
default:
- LOG(FATAL) << "Unknown data type: " << int(dtype);
- break;
+ LOG(ERROR) << "Unknown TRT data type: " << int(dtype);
+ ctx->SetStatus(tensorflow::errors::InvalidArgument(
+ "Unknown output TRT data type! ", static_cast<int>(dtype)));
+ return;
}
}
- for (int i = 0; i < static_cast<int>(output_nodes_.size()); i++) {
- // This is bad that we have to reallocate output buffer every run.
+ for (int i = 0; i < ctx->num_outputs(); i++) {
// Create an output tensor
- binding_index = trt_engine_ptr_->getBindingIndex(output_nodes_[i].c_str());
+ const string output_name = StrCat(kOutputPHName, i);
+ const size_t binding_index =
+ trt_engine_ptr->getBindingIndex(output_name.c_str());
Tensor* output_tensor = nullptr;
TensorShape output_shape;
if (binding_index != -1) {
- auto dims = trt_engine_ptr_->getBindingDimensions(binding_index);
+ auto dims = trt_engine_ptr->getBindingDimensions(binding_index);
std::vector<int> trt_shape(dims.nbDims + 1);
trt_shape[0] = num_batch;
for (int j = 0; j < dims.nbDims; j++) trt_shape[j + 1] = dims.d[j];
- OP_REQUIRES_OK(context,
- TensorShapeUtils::MakeShape(
- trt_shape.data(), trt_shape.size(), &output_shape));
+ OP_REQUIRES_OK(
+ ctx, TensorShapeUtils::MakeShape(trt_shape.data(), trt_shape.size(),
+ &output_shape));
} else {
- LOG(FATAL) << "output node not found, at " << output_nodes_[i];
- break;
+ LOG(ERROR) << "output node not found, at " << output_name;
+ ctx->SetStatus(tensorflow::errors::Internal("output ", output_name,
+ " couldn't be found!"));
+ return;
}
-
- OP_REQUIRES_OK(context,
- context->allocate_output(i, output_shape, &output_tensor));
- auto dtype = trt_engine_ptr_->getBindingDataType(binding_index);
+ auto status = ctx->allocate_output(i, output_shape, &output_tensor);
+ if (!status.ok()) {
+ LOG(ERROR) << "Allocating output failed with " << status;
+ ctx->SetStatus(status);
+ return;
+ }
+ auto dtype = trt_engine_ptr->getBindingDataType(binding_index);
switch (dtype) {
case nvinfer1::DataType::kFLOAT:
buffers[binding_index] =
reinterpret_cast<void*>(output_tensor->flat<float>().data());
break;
case nvinfer1::DataType::kHALF:
- LOG(FATAL) << "half size is not supported yet!";
- break;
+ LOG(ERROR) << "half size is not supported yet!";
+ ctx->SetStatus(tensorflow::errors::InvalidArgument(
+ "Half outputs are not supported!"));
+ return;
case nvinfer1::DataType::kINT8:
- LOG(FATAL) << "int8 is not supported yet!";
- break;
+ LOG(ERROR) << "int8 is not supported yet!";
+ ctx->SetStatus(tensorflow::errors::InvalidArgument(
+ "INT8 outputs are not supported!"));
+ return;
default:
- LOG(FATAL) << "Unknown data type: " << int(dtype);
- break;
+ LOG(ERROR) << "Unknown TRT data type: " << static_cast<int>(dtype);
+ ctx->SetStatus(tensorflow::errors::InvalidArgument(
+ "Unsupported output data type! ", static_cast<int>(dtype)));
+ return;
}
}
// copied from cuda_kernel_helper since it seems only valid in *.cu.cc files
const cudaStream_t* stream = CHECK_NOTNULL(
- reinterpret_cast<const cudaStream_t*>(context->op_device_context()
+ reinterpret_cast<const cudaStream_t*>(ctx->op_device_context()
->stream()
->implementation()
->CudaStreamMemberHack()));
// TODO(jie): trt enqueue does not return error
- auto ret = trt_execution_context_ptr_->enqueue(num_batch, &buffers[0],
- *stream, nullptr);
- VLOG(2) << "enqueue returns: " << ret;
+ auto& trt_execution_context_ptr = engine_ctx_pair.second;
+ auto ret = trt_execution_context_ptr->enqueue(num_batch, &buffers[0], *stream,
+ nullptr);
+ if (!ret) {
+ LOG(ERROR) << "Failed to enqueue batch for TRT engine: " << name();
+ ctx->SetStatus(tensorflow::errors::Internal(
+ "Failed to enqueue batch for TRT engine: ", name()));
+ }
// sync should be done by TF.
}
+
TRTEngineOp::~TRTEngineOp() {
- // Order matters!
- trt_execution_context_ptr_.reset();
- trt_engine_ptr_.reset();
+ // We need to manually destroy the engine and execution context before
+ // the allocator is destructed.
+ for (auto& eng : engine_map_) {
+ eng.second.first.reset();
+ eng.second.second.reset();
+ }
allocator_.reset();
}
+
+nvinfer1::IGpuAllocator* TRTEngineOp::GetAllocator(OpKernelContext* ctx) {
+ if (allocator_) return allocator_.get();
+ auto device = ctx->device();
+ auto alloc = device->GetAllocator(tensorflow::AllocatorAttributes());
+ if (!alloc) {
+ LOG(ERROR) << "Can't find device allocator for gpu device "
+ << device->name();
+ ctx->SetStatus(tensorflow::errors::Internal(
+ "Can't get device allocator for device ", device->name()));
+ return nullptr;
+ }
+ allocator_.reset(new TRTDeviceAllocator(alloc));
+ return allocator_.get();
+}
+
+TRTEngineOp::EngineCtxPair& TRTEngineOp::GetEngine(int batch_size,
+ OpKernelContext* ctx) {
+ static EngineCtxPair null_pair = {
+ TrtUniquePtrType<nvinfer1::ICudaEngine>(nullptr),
+ TrtUniquePtrType<nvinfer1::IExecutionContext>(nullptr)};
+  // TODO(sami): This method needs to be rewritten to use the resource manager,
+  // with an optional LRU mechanism.
+ tensorflow::mutex_lock lock(engine_mutex_);
+
+ if (static_engine_) {
+ if (engine_map_.size()) {
+ if (engine_map_.begin()->first >= batch_size) {
+ return engine_map_.begin()->second;
+ }
+ return null_pair;
+ }
+ TrtUniquePtrType<IRuntime> infer(nvinfer1::createInferRuntime(logger));
+#if NV_TENSORRT_MAJOR > 3
+ auto allocator = GetAllocator(ctx);
+ if (allocator == nullptr) {
+ // GetAllocator already set the Status.
+ return null_pair;
+ }
+ infer->setGpuAllocator(allocator);
+#endif
+ TrtUniquePtrType<nvinfer1::ICudaEngine> static_engine(
+ infer->deserializeCudaEngine(serialized_segment_.c_str(),
+ serialized_segment_.size(), nullptr));
+ auto raw_static_engine = static_engine.get();
+ const auto max_batch_size = raw_static_engine->getMaxBatchSize();
+ engine_map_[max_batch_size] = {
+ std::move(static_engine),
+ TrtUniquePtrType<nvinfer1::IExecutionContext>(
+ raw_static_engine->createExecutionContext())};
+ // Runtime is safe to delete after engine creation
+ serialized_segment_.clear();
+ if (max_batch_size < batch_size) return null_pair;
+ return engine_map_.at(max_batch_size);
+ } // static_engine_
+
+ // Handle the dynamic engine case.
+ auto engine_it = engine_map_.find(batch_size);
+ if (engine_it == engine_map_.end() &&
+ engine_map_.size() < (size_t)max_cached_engines_) {
+ nvinfer1::IGpuAllocator* allocator = nullptr;
+#if NV_TENSORRT_MAJOR > 3
+ allocator = GetAllocator(ctx);
+ if (allocator == nullptr) {
+ // GetAllocator already set the Status.
+ return null_pair;
+ }
+#endif
+ std::vector<tensorflow::PartialTensorShape> shapes;
+ for (int i = 0; i < ctx->num_inputs(); ++i) {
+ shapes.emplace_back(ctx->input(i).shape());
+ }
+ TrtUniquePtrType<nvinfer1::ICudaEngine> engine;
+ bool convert_successfully = false;
+ VLOG(0) << name() << " Constructing a new engine with batch size "
+ << batch_size;
+ // Up to this point, calibrator_ can never be empty, since otherwise it
+ // means calibration_mode_ is true and this path won't get executed.
+ auto status = convert::ConvertGraphDefToEngine(
+ segment_graph_, precision_mode_, batch_size, workspace_size_, shapes,
+ &logger, allocator, calibrator_.get(), &engine, &convert_successfully);
+ if (!status.ok()) {
+ if (convert_successfully) {
+        // This means it failed to build the engine even though the network was
+        // built successfully, probably due to internal issues. In this case we
+        // don't retry in the future.
+ engine_map_[batch_size] = {nullptr, nullptr};
+ }
+ LOG(ERROR) << "Engine creation for batch size " << batch_size
+ << " failed " << status;
+ ctx->SetStatus(tensorflow::errors::Internal("Engine creation failed!"));
+ return null_pair;
+ }
+ VLOG(1) << "Conversion is done";
+ TrtUniquePtrType<nvinfer1::IExecutionContext> exec_context(
+ engine->createExecutionContext());
+ engine_map_[batch_size] = {std::move(engine), std::move(exec_context)};
+ }
+ return engine_map_.at(batch_size);
+}
+
+tensorflow::Status TRTEngineOp::AllocateCalibrationResources(
+ tensorflow::OpKernelContext* ctx, TRTCalibrationResource** cr) {
+ auto cres = new TRTCalibrationResource();
+ *cr = cres;
+ // Get the allocator.
+ auto alloc = ctx->device()->GetAllocator(tensorflow::AllocatorAttributes());
+ if (!alloc) {
+ LOG(WARNING) << "Can't get device allocator will not be able to "
+ "allocate memory from TensorFlow memory pool";
+ cres->allocator_.reset(new TRTCudaAllocator);
+ } else {
+ cres->allocator_.reset(new TRTDeviceAllocator(alloc));
+ }
+ // Get the input shapes.
+ const int batch_size = ctx->input(0).dim_size(0);
+ const int num_inputs = ctx->num_inputs();
+ std::vector<tensorflow::PartialTensorShape> shapes;
+ dev_tensors_.resize(num_inputs);
+ VLOG(1) << " Constructing calibrator";
+ for (int i = 0; i < num_inputs; i++) {
+ // allocate workspace on device for inputs
+ const tensorflow::Tensor& t = ctx->input(i);
+ shapes.emplace_back(t.shape());
+ Tensor* device_tensor;
+ TF_RETURN_IF_ERROR(ctx->allocate_persistent(
+ t.dtype(), t.shape(), &dev_tensors_.at(i), &device_tensor));
+ CHECK_EQ(t.TotalBytes(), device_tensor->TotalBytes());
+ void* device_address = GetTensorAddress(device_tensor);
+ if (device_address == nullptr) {
+ return tensorflow::errors::InvalidArgument(
+ "Unsupported data type encountered in input ", i);
+ }
+ device_buffers_.emplace(
+ StrCat(kInputPHName, i),
+ std::pair<void*, size_t>(device_address, device_tensor->TotalBytes()));
+ }
+ cres->calibrator_.reset(
+ new TRTInt8Calibrator(device_buffers_, batch_size, name()));
+ const string label(name());
+ auto segment_graph = &segment_graph_;
+ const int cuda_gpu_id = ctx->device()->tensorflow_gpu_device_info()->gpu_id;
+ if (cuda_gpu_id < 0) {
+ LOG(ERROR) << "Can't get gpu_device_info from context->device()";
+ return tensorflow::errors::InvalidArgument(
+ "Context->device doesn't contain device info!");
+ }
+ const int64 workspace_size_bytes = workspace_size_;
+ cres->thr_.reset(new std::thread([cres, label, segment_graph, shapes,
+ cuda_gpu_id, workspace_size_bytes]() {
+ VLOG(0) << "Starting calibration thread on device " << cuda_gpu_id
+ << ", Calibration Resource @ " << cres;
+ auto err = cudaSetDevice(cuda_gpu_id);
+ if (err != cudaSuccess) {
+ // TODO(aaroey): should return error here.
+ LOG(ERROR) << "Couldn't set cuda device to " << cuda_gpu_id
+ << " in calibration thread";
+ }
+    // ConvertGraphDefToEngine() will try to build the engine. This thread
+    // will loop inside buildCudaEngine(), consuming the calibration data
+    // that is set by the TF op, and drive the builder until the calibrator
+    // returns false. The engine is discarded after the calibration table is
+    // generated.
+ //
+    // TODO(aaroey): maybe set the max batch size using the python
+    // calibration wrapper class.
+ auto s = convert::ConvertGraphDefToEngine(
+ *segment_graph, convert::INT8MODE, cres->calibrator_->getBatchSize(),
+ workspace_size_bytes, shapes, &cres->logger_, cres->allocator_.get(),
+ cres->calibrator_.get(), &cres->engine_,
+ /*convert_successfully=*/nullptr);
+ if (!s.ok()) {
+ LOG(ERROR) << "Calibration failed: " << s;
+ cres->calibrator_->setDone(); // Ignore further pushes
+ }
+ VLOG(1) << "Calibration loop terminated " << label;
+ }));
+ VLOG(1) << "initialized calibrator resource";
+ return tensorflow::Status::OK();
+}
+
REGISTER_KERNEL_BUILDER(Name("TRTEngineOp").Device(DEVICE_GPU), TRTEngineOp);
} // namespace tensorrt
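GetEngineBatch() picks the smallest cached batch size that can serve the request, and caches the requested size itself while there is still room; only when the cache is full and nothing fits does the op give up with ResourceExhausted. A self-contained sketch of that policy, detached from the OpKernel plumbing (function and variable names are illustrative):

  #include <algorithm>
  #include <vector>

  // Returns the batch size of the engine to use, or -1 when the cache is full
  // and no cached engine is large enough.
  int PickEngineBatch(int requested, std::vector<int>* cached, int max_cached) {
    for (int b : *cached) {              // kept sorted ascending
      if (b >= requested) return b;
    }
    if (static_cast<int>(cached->size()) < max_cached) {
      cached->push_back(requested);      // cache a new engine for this batch
      std::sort(cached->begin(), cached->end());
      return requested;
    }
    return -1;
  }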
diff --git a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.h b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.h
index e613a71422..6fe318be6a 100644
--- a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.h
+++ b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.h
@@ -19,9 +19,14 @@ limitations under the License.
#include <memory>
#include <vector>
+#include "tensorflow/contrib/tensorrt/convert/utils.h"
+#include "tensorflow/contrib/tensorrt/log/trt_logger.h"
#include "tensorflow/contrib/tensorrt/resources/trt_allocator.h"
+#include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/framework/graph.pb.h"
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/platform/mutex.h"
#if GOOGLE_CUDA
#if GOOGLE_TENSORRT
@@ -30,32 +35,95 @@ limitations under the License.
namespace tensorflow {
namespace tensorrt {
-class Logger;
-
+class TRTInt8Calibrator;
+class TRTCalibrationResource;
+class AsyncHelper;
// TODO(Sami): Remove this file?
-class TRTEngineOp : public OpKernel {
+
+// This op can construct a TRT engine on the fly; if engine construction
+// fails, it executes the equivalent subgraph as a TensorFlow function.
+class TRTEngineOp : public AsyncOpKernel {
public:
explicit TRTEngineOp(OpKernelConstruction* context);
- void Compute(OpKernelContext* context) override;
+ void ComputeAsync(OpKernelContext* context,
+ AsyncOpKernel::DoneCallback done) override;
~TRTEngineOp();
private:
- template <typename T>
- struct Destroyer {
- void operator()(T* d) { d->destroy(); }
- };
-
- template <typename T>
- using destroyed_ptr = std::unique_ptr<T, Destroyer<T>>;
- destroyed_ptr<nvinfer1::ICudaEngine> trt_engine_ptr_;
+ // Execute calibration
+ void ExecuteCalibration(OpKernelContext* ctx, AsyncHelper* helper);
+
+ // Construct a function handle for executing native funcdef graph
+ Status ConstructFunctionHandle(OpKernelContext* ctx);
+
+ // Execute replaced native segment as function Op.
+ void ExecuteNativeSegment(OpKernelContext* ctx, AsyncHelper* helper);
+
+ // Allocate necessary resources for calibration
+ Status AllocateCalibrationResources(OpKernelContext* ctx,
+ TRTCalibrationResource** cr);
+
// TODO(samikama): context should go to a resource manager!
- destroyed_ptr<nvinfer1::IExecutionContext> trt_execution_context_ptr_;
+ typedef std::pair<TrtUniquePtrType<nvinfer1::ICudaEngine>,
+ TrtUniquePtrType<nvinfer1::IExecutionContext>>
+ EngineCtxPair;
+ EngineCtxPair& GetEngine(int batch_size, OpKernelContext* ctx);
+ // Return engine batch closest to input batch.
+ int GetEngineBatch(OpKernelContext* ctx);
+
+ nvinfer1::IGpuAllocator* GetAllocator(OpKernelContext* ctx);
+
+ // Map from batch size to the engine and its execution context.
+ std::unordered_map<int, EngineCtxPair> engine_map_;
std::vector<string> input_nodes_;
std::vector<string> output_nodes_;
- std::shared_ptr<nvinfer1::IGpuAllocator> allocator_;
- string serialized_engine_;
+
+ // keep device allocator for TRT.
+ std::unique_ptr<TRTDeviceAllocator> allocator_;
+
+ // serialized protobuf segment or trt engine depending on static_engine_ flag.
+ string serialized_segment_;
+
+ // Name of the function for TF native execution of the segment.
+ string funcdef_name_;
+
+ // GraphDef representation of the segment.
+ GraphDef segment_graph_;
+
+ // Lookup table for temporary staging areas of input tensors for calibration.
+ std::unordered_map<string, std::pair<void*, size_t>> device_buffers_;
+
+ // Temporary staging areas for calibration inputs.
+ std::vector<PersistentTensor> dev_tensors_;
+
+ // Engine Precision mode.
+ int precision_mode_;
+
+ // Whether engine is constructed during the conversion or needs to be
+ // constructed from protobuf segment.
+ bool static_engine_;
+
+ // Whether to calibrate INT8 engine.
+ bool calibration_mode_;
+
+ // Whether non-batch ranks of the inputs are assumed to be fixed or not for
+ // engine construction.
+ bool fixed_input_size_;
+
+ // Batches of the cached engines
+ std::vector<int> cached_engine_batches_;
+
+ // Maximum number of cached engines
+ int max_cached_engines_;
+
+ int64 workspace_size_;
+ mutex engine_mutex_;
+ FunctionLibraryRuntime::Handle native_func_;
+
+ // The finalized calibrator for inference.
+ std::unique_ptr<TRTInt8Calibrator> calibrator_;
};
} // namespace tensorrt
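
As background for the cached-engine fields above: GetEngineBatch() selects an engine batch size for the incoming batch from cached_engine_batches_, bounded by max_cached_engines_, and the op falls back to the native TF segment when an engine cannot be used. A toy Python sketch of such a selection policy follows; it is purely illustrative, the actual rule lives in trt_engine_op.cc and may differ.

def pick_engine_batch(input_batch, cached_batches, max_cached):
  """Toy policy: choose a cached engine batch size that can serve input_batch."""
  fitting = sorted(b for b in cached_batches if b >= input_batch)
  if fitting:
    return fitting[0]  # smallest cached engine that still fits the batch
  if len(cached_batches) < max_cached:
    cached_batches.append(input_batch)  # room left: cache an engine for this size
    return input_batch
  return None  # no usable engine; caller would fall back to the native segment
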
diff --git a/tensorflow/contrib/tensorrt/ops/trt_engine_op.cc b/tensorflow/contrib/tensorrt/ops/trt_engine_op.cc
index 079d73f7be..383635f428 100644
--- a/tensorflow/contrib/tensorrt/ops/trt_engine_op.cc
+++ b/tensorflow/contrib/tensorrt/ops/trt_engine_op.cc
@@ -28,11 +28,19 @@ extern Status TRTEngineOpShapeInference(InferenceContext* c);
}
REGISTER_OP("TRTEngineOp")
- .Attr("serialized_engine: string")
- .Attr("input_nodes: list(string)")
- .Attr("output_nodes: list(string)")
- .Attr("InT: list({float32})")
- .Attr("OutT: list({float32})")
+ .Attr("serialized_segment: string")
+ .Attr("input_shapes: list(shape)")
+ .Attr("output_shapes: list(shape)")
+ .Attr("segment_funcdef_name: string")
+ .Attr("InT: list({int8,float16,float32})")
+ .Attr("OutT: list({int8,float16,float32})")
+ .Attr("static_engine: bool = true")
+ .Attr("fixed_input_size: bool = true")
+ .Attr("cached_engine_batches: list(int) = []")
+ .Attr("max_cached_engines_count: int = 1")
+ .Attr("workspace_size_bytes: int")
+ .Attr("precision_mode: {'FP32', 'FP16', 'INT8', 'INT8CALIB'}")
+ .Attr("calibration_data: string = ''")
.Input("in_tensor: InT")
.Output("out_tensor: OutT")
.SetShapeFn(shape_inference::TRTEngineOpShapeInference);
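
For orientation, these attributes end up on the TRTEngineOp nodes of a converted GraphDef; a short sketch of inspecting them from Python (the helper below is hypothetical and not part of this change):

def summarize_trt_engine_ops(graph_def):
  """Prints the key TRTEngineOp attributes of a graph_pb2.GraphDef."""
  for node in graph_def.node:
    if node.op != "TRTEngineOp":
      continue
    attrs = node.attr
    print("%s: precision_mode=%s static_engine=%s max_cached=%d calibrated=%s" %
          (node.name, attrs["precision_mode"].s, attrs["static_engine"].b,
           attrs["max_cached_engines_count"].i,
           bool(attrs["calibration_data"].s)))
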
diff --git a/tensorflow/contrib/tensorrt/python/trt_convert.py b/tensorflow/contrib/tensorrt/python/trt_convert.py
index 338475d90e..79f512dbcf 100644
--- a/tensorflow/contrib/tensorrt/python/trt_convert.py
+++ b/tensorflow/contrib/tensorrt/python/trt_convert.py
@@ -21,6 +21,8 @@ from __future__ import print_function
# pylint: disable=unused-import,line-too-long
import six as _six
from tensorflow.contrib.tensorrt.wrap_conversion import calib_convert
+from tensorflow.contrib.tensorrt.wrap_conversion import get_linked_tensorrt_version
+from tensorflow.contrib.tensorrt.wrap_conversion import get_loaded_tensorrt_version
from tensorflow.contrib.tensorrt.wrap_conversion import trt_convert
from tensorflow.core.framework import graph_pb2
from tensorflow.core.protobuf import rewriter_config_pb2
@@ -29,7 +31,9 @@ from tensorflow.python.framework import errors_impl as _impl
from tensorflow.python.framework import meta_graph
from tensorflow.python.framework import ops
from tensorflow.python.grappler import tf_optimizer
+from tensorflow.python.platform import tf_logging
from tensorflow.python.util import compat
+
# pylint: enable=unused-import,line-too-long
@@ -40,7 +44,10 @@ def create_inference_graph(input_graph_def,
max_batch_size=1,
max_workspace_size_bytes=2 << 20,
precision_mode="FP32",
- minimum_segment_size=3):
+ minimum_segment_size=3,
+ is_dynamic_op=False,
+ maximum_cached_engines=1,
+ cached_engine_batches=[]):
"""Python wrapper for the TRT transformation.
Args:
@@ -51,6 +58,10 @@ def create_inference_graph(input_graph_def,
precision_mode: one of 'FP32', 'FP16' and 'INT8'
minimum_segment_size: the minimum number of nodes required for a subgraph to
be replaced by TRTEngineOp.
+ is_dynamic_op: whether to generate dynamic TRT ops which will build the TRT
+ network and engine at run time.
+ maximum_cached_engines: max number of cached TRT engines in dynamic TRT ops.
+ cached_engine_batches: batch sizes used to pre-create cached engines.
Returns:
New GraphDef with TRTEngineOps placed in graph replacing subgraphs.
@@ -65,6 +76,30 @@ def create_inference_graph(input_graph_def,
"It should be one of {}").format(
precision_mode, "{'FP32', 'FP16', 'INT8'}"))
mode = supported_precision_modes[precision_mode.upper()]
+ compiled_version = get_linked_tensorrt_version()
+ loaded_version = get_loaded_tensorrt_version()
+ version_mismatch = False
+ if loaded_version[0] < compiled_version[0]:
+ tf_logging.error(
+ "TensorRT version mismatch. Tensorflow was compiled against " +
+ "TensorRT %s but library loaded from environment is TensorRT %s" %
+ (".".join([str(x) for x in compiled_version]),
+ ".".join([str(x) for x in loaded_version])) +
+ ". Please make sure that correct version of TensorRT " +
+ "is available in the system and added to ldconfig or LD_LIBRARY_PATH"
+ )
+ raise RuntimeError("Incompatible TensorRT library version")
+ for i in zip(loaded_version, compiled_version):
+ if i[0] != i[1]:
+ tf_logging.warn("TensorRT mismatch. Compiled against version " +
+ "%s, but loaded %s. Things may not work" %
+ (".".join([str(x) for x in compiled_version]),
+ ".".join([str(x) for x in loaded_version])))
+ version_mismatch = True
+ break
+ if not version_mismatch:
+ tf_logging.info("Running against TensorRT version %s" % ".".join(
+ [str(x) for x in loaded_version]))
def py2bytes(inp):
return inp
@@ -100,7 +135,9 @@ def create_inference_graph(input_graph_def,
# pair or strings where first one is encoded status and the second
# one is the transformed graphs protobuf string.
out = trt_convert(input_graph_def_str, out_names, max_batch_size,
- max_workspace_size_bytes, mode, minimum_segment_size)
+ max_workspace_size_bytes, mode, minimum_segment_size,
+ is_dynamic_op, maximum_cached_engines,
+ cached_engine_batches)
status = to_string(out[0])
output_graph_def_string = out[1]
del input_graph_def_str # Save some memory
@@ -120,11 +157,12 @@ def create_inference_graph(input_graph_def,
return output_graph_def
-def calib_graph_to_infer_graph(calibration_graph_def):
+def calib_graph_to_infer_graph(calibration_graph_def, is_dynamic_op=False):
"""Convert an existing calibration graph to inference graph.
Args:
calibration_graph_def: the calibration GraphDef object with calibration data
+ is_dynamic_op: whether to create dynamic TRT ops from the calibration graph
Returns:
New GraphDef with TRTEngineOps placed in graph replacing calibration nodes.
Raises:
@@ -141,9 +179,16 @@ def calib_graph_to_infer_graph(calibration_graph_def):
to_string = py2string
else:
to_string = py3string
-
+ is_calib_graph = False
+ for n in calibration_graph_def.node:
+ if n.op == "TRTEngineOp":
+ is_calib_graph = is_calib_graph or not n.attr["calibration_data"].s
+ if not is_calib_graph:
+ tf_logging.error(
+ "Not a calib graph. Doesn't seem to contain any calibration nodes.")
+ return None
graph_str = calibration_graph_def.SerializeToString()
- out = calib_convert(graph_str)
+ out = calib_convert(graph_str, is_dynamic_op)
status = to_string(out[0])
output_graph_def_string = out[1]
del graph_str # Save some memory
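
Putting the new conversion parameters together, a minimal usage sketch of the updated Python API (graph and tensor names are placeholders, and the import path may vary by build):

from tensorflow.contrib import tensorrt as trt

# `frozen_graph_def` is assumed to be an already-frozen GraphDef.
calib_graph = trt.create_inference_graph(
    input_graph_def=frozen_graph_def,
    outputs=["output"],
    max_batch_size=8,
    max_workspace_size_bytes=1 << 25,
    precision_mode="INT8",
    minimum_segment_size=3,
    is_dynamic_op=True,            # build TRT engines at run time
    maximum_cached_engines=2,      # keep at most two engines per op
    cached_engine_batches=[4, 8])  # pre-create engines for these batch sizes

# ... run representative data through `calib_graph` in a Session to calibrate ...

# Replace the calibration nodes with INT8 inference engines.
int8_graph = trt.calib_graph_to_infer_graph(calib_graph, is_dynamic_op=True)
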
diff --git a/tensorflow/contrib/tensorrt/resources/trt_allocator.cc b/tensorflow/contrib/tensorrt/resources/trt_allocator.cc
index 0f0508331c..9f115990c3 100644
--- a/tensorflow/contrib/tensorrt/resources/trt_allocator.cc
+++ b/tensorflow/contrib/tensorrt/resources/trt_allocator.cc
@@ -50,7 +50,7 @@ TRTDeviceAllocator::TRTDeviceAllocator(tensorflow::Allocator* allocator)
}
void TRTDeviceAllocator::free(void* memory) {
- VLOG(2) << "Deallocating " << memory;
+ VLOG(2) << "Deallocating @ " << memory;
allocator_->DeallocateRaw(memory);
}
diff --git a/tensorflow/contrib/tensorrt/resources/trt_allocator.h b/tensorflow/contrib/tensorrt/resources/trt_allocator.h
index a0c2540a76..c5d2cec730 100644
--- a/tensorflow/contrib/tensorrt/resources/trt_allocator.h
+++ b/tensorflow/contrib/tensorrt/resources/trt_allocator.h
@@ -16,7 +16,6 @@ limitations under the License.
#ifndef TENSORFLOW_CONTRIB_TENSORRT_RESOURCES_TRT_ALLOCATOR_H_
#define TENSORFLOW_CONTRIB_TENSORRT_RESOURCES_TRT_ALLOCATOR_H_
-
#include "tensorflow/contrib/tensorrt/log/trt_logger.h"
#include "tensorflow/core/framework/allocator.h"
@@ -52,7 +51,9 @@ class TRTDeviceAllocator : public nvinfer1::IGpuAllocator {
// Allocator implementation wrapping TF device allocators.
public:
TRTDeviceAllocator(tensorflow::Allocator* allocator);
- virtual ~TRTDeviceAllocator() {}
+ virtual ~TRTDeviceAllocator() {
+ VLOG(1) << "Destroying allocator attached to " << allocator_->Name();
+ }
void* allocate(uint64_t size, uint64_t alignment, uint32_t flags) override;
void free(void* memory) override;
diff --git a/tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.cc b/tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.cc
index dc7c93f869..32e81858b9 100644
--- a/tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.cc
+++ b/tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.cc
@@ -16,7 +16,6 @@ limitations under the License.
#include "tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.h"
#include <atomic>
-#include <chrono>
#include <unordered_map>
#include "tensorflow/core/platform/logging.h"
@@ -37,15 +36,22 @@ TRTInt8Calibrator::TRTInt8Calibrator(
: batch_size_(batch_size),
done_(false),
dev_buffers_(dev_buffers),
- calib_running_(false),
+ calib_running_(true),
batch_is_set_(false),
engine_name_(engine_name) {}
+TRTInt8Calibrator::TRTInt8Calibrator(const string& calib_data)
+ : batch_size_(0),
+ done_(false),
+ calib_running_(false),
+ batch_is_set_(false),
+ calibration_table_(calib_data) {}
+
bool TRTInt8Calibrator::setBatch(const std::unordered_map<string, void*>& data,
const cudaStream_t stream) {
tensorflow::mutex_lock lock(cond_mtx_);
- while ((calib_running_ || batch_is_set_) &&
- !done_) { // wait while calibration is running
+ // wait while calibration is running.
+ while ((calib_running_ || batch_is_set_) && !done_) {
cond_.wait(lock);
}
if (done_) return false;
@@ -59,8 +65,6 @@ bool TRTInt8Calibrator::setBatch(const std::unordered_map<string, void*>& data,
}
const auto& d = devptr->second;
- // TODO(aaroey): we should not use sync copy on default stream. Make sure
- // stream->ThenMemcpy() is used in future PRs.
// TODO(sami,aaroey): Need to figure out a way to ensure synchronization
// between stream, perhaps using a tensor?
auto status = cudaMemcpyAsync(d.first, it.second, d.second,
@@ -84,13 +88,11 @@ bool TRTInt8Calibrator::getBatch(void** bindings, const char** names,
tensorflow::mutex_lock lock(cond_mtx_);
calib_running_ = false;
cond_.notify_all();
- while ((!batch_is_set_ && !done_)) { // wait until new batch arrives
+ // wait until new batch arrives
+ while ((!batch_is_set_ && !done_)) {
cond_.wait(lock);
-
- }
- if (done_) {
- return false;
}
+ if (done_) return false;
for (int i = 0; i < num_bindings; i++) {
auto it = dev_buffers_.find(names[i]);
@@ -107,7 +109,9 @@ bool TRTInt8Calibrator::getBatch(void** bindings, const char** names,
}
const void* TRTInt8Calibrator::readCalibrationCache(std::size_t& length) {
- return nullptr;
+ if (calibration_table_.empty()) return nullptr;
+ length = calibration_table_.size();
+ return calibration_table_.data();
}
void TRTInt8Calibrator::setDone() {
@@ -117,7 +121,11 @@ void TRTInt8Calibrator::setDone() {
}
void TRTInt8Calibrator::writeCalibrationCache(const void* ptr,
- std::size_t length) {}
+ std::size_t length) {
+ calibration_table_ = string((const char*)ptr, length);
+ VLOG(1) << "Got calibration data for " << engine_name_ << " @" << ptr
+ << " length=" << length;
+}
TRTInt8Calibrator::~TRTInt8Calibrator() {
VLOG(1) << "Destroying calibrator for " << engine_name_;
}
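
The setBatch()/getBatch() pair above implements a producer-consumer handshake guarded by cond_mtx_ and cond_: the TF op thread blocks until the previous batch has been consumed, and the TRT builder thread blocks until a new batch arrives or setDone() is called. A simplified Python analogue of the same pattern (illustrative only; it omits the calib_running_ flag and the CUDA copies):

import threading

class CalibrationQueue(object):
  """Single-slot producer-consumer queue, mirroring the calibrator handshake."""

  def __init__(self):
    self._cond = threading.Condition()  # plays the role of cond_mtx_ / cond_
    self._batch = None
    self._batch_is_set = False
    self._done = False

  def set_batch(self, batch):  # producer side, like setBatch()
    with self._cond:
      while self._batch_is_set and not self._done:
        self._cond.wait()
      if self._done:
        return False
      self._batch = batch
      self._batch_is_set = True
      self._cond.notify_all()
      return True

  def get_batch(self):  # consumer side, like getBatch()
    with self._cond:
      while not self._batch_is_set and not self._done:
        self._cond.wait()
      if self._done:
        return None
      batch, self._batch = self._batch, None
      self._batch_is_set = False
      self._cond.notify_all()
      return batch

  def set_done(self):  # like setDone(): unblock both sides permanently
    with self._cond:
      self._done = True
      self._cond.notify_all()
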
diff --git a/tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.h b/tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.h
index d77aa2c5ab..994312d7c3 100644
--- a/tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.h
+++ b/tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.h
@@ -39,29 +39,48 @@ struct TRTInt8Calibrator : public nvinfer1::IInt8EntropyCalibrator {
TRTInt8Calibrator(
const std::unordered_map<string, std::pair<void*, size_t>>& dev_buffers,
int batch_size, string engine_name);
+
+ TRTInt8Calibrator(const string& calibration_data);
+
+ ~TRTInt8Calibrator();
+
int getBatchSize() const override;
+
bool getBatch(void* bindings[], const char* names[],
int num_bindings) override;
+
bool setBatch(const std::unordered_map<string, void*>& data,
const cudaStream_t stream);
+
void setDone();
+
+ // If this returns a non-null pointer, calibration is skipped.
const void* readCalibrationCache(std::size_t& length) override;
+
void writeCalibrationCache(const void* ptr, std::size_t length) override;
- ~TRTInt8Calibrator();
+
+ const string& getCalibrationTableAsString() { return calibration_table_; }
private:
const int batch_size_;
- tensorflow::mutex cond_mtx_; // mutex for condition_variable
- tensorflow::condition_variable cond_; // condition variable to implement
- // producer-consumer queue for
- // calibration
+
+ // mutex for condition_variable
+ tensorflow::mutex cond_mtx_;
+
+ // condition variable to implement producer-consumer queue for calibration
+ tensorflow::condition_variable cond_;
+
+ // Is calibration finished?
bool done_;
- const std::unordered_map<string, std::pair<void*, size_t>>
- dev_buffers_; // map to keep tensorrt input buffers and sizes keyed with
- // buffer names
+
+ // Map to keep tensorrt input buffers and sizes keyed with buffer names
+ const std::unordered_map<string, std::pair<void*, size_t>> dev_buffers_;
+
bool calib_running_;
bool batch_is_set_;
+
string engine_name_;
+ string calibration_table_;
};
} // namespace tensorrt
diff --git a/tensorflow/contrib/tensorrt/resources/trt_resources.h b/tensorflow/contrib/tensorrt/resources/trt_resources.h
index e3469124ac..b7d5ffd674 100644
--- a/tensorflow/contrib/tensorrt/resources/trt_resources.h
+++ b/tensorflow/contrib/tensorrt/resources/trt_resources.h
@@ -22,6 +22,7 @@ limitations under the License.
#include <thread>
#include <vector>
+#include "tensorflow/contrib/tensorrt/convert/utils.h"
#include "tensorflow/contrib/tensorrt/log/trt_logger.h"
#include "tensorflow/contrib/tensorrt/resources/trt_allocator.h"
#include "tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.h"
@@ -34,50 +35,48 @@ limitations under the License.
namespace tensorflow {
namespace tensorrt {
+
class TRTCalibrationResource : public tensorflow::ResourceBase {
public:
- TRTCalibrationResource()
- : calibrator_(nullptr),
- builder_(nullptr),
- network_(nullptr),
- engine_(nullptr),
- logger_(nullptr),
- thr_(nullptr) {}
-
~TRTCalibrationResource() {
VLOG(0) << "Destroying Calibration Resource " << std::endl << DebugString();
+ builder_.reset();
+ engine_.reset();
+ // We need to manually destroy the builder and engine before the allocator
+ // is destroyed.
+ allocator_.reset();
}
string DebugString() override {
std::stringstream oss;
- oss << " Calibrator = " << std::hex << calibrator_ << std::dec << std::endl
- << " Builder = " << std::hex << builder_ << std::dec << std::endl
- << " Network = " << std::hex << network_ << std::dec << std::endl
- << " Engine = " << std::hex << engine_ << std::dec << std::endl
- << " Logger = " << std::hex << logger_ << std::dec << std::endl
- << " Allocator = " << std::hex << allocator_.get() << std::dec
- << std::endl
- << " Thread = " << std::hex << thr_ << std::dec << std::endl;
+ using std::dec;
+ using std::endl;
+ using std::hex;
+ oss << " Calibrator = " << hex << calibrator_.get() << dec << endl
+ << " Builder = " << hex << builder_.get() << dec << endl
+ << " Engine = " << hex << engine_.get() << dec << endl
+ << " Logger = " << hex << &logger_ << dec << endl
+ << " Allocator = " << hex << allocator_.get() << dec << endl
+ << " Thread = " << hex << thr_.get() << dec << endl;
return oss.str();
}
- TRTInt8Calibrator* calibrator_;
- nvinfer1::IBuilder* builder_;
- nvinfer1::INetworkDefinition* network_;
- nvinfer1::ICudaEngine* engine_;
- std::shared_ptr<nvinfer1::IGpuAllocator> allocator_;
- tensorflow::tensorrt::Logger* logger_;
+ std::unique_ptr<TRTInt8Calibrator> calibrator_;
+ TrtUniquePtrType<nvinfer1::IBuilder> builder_;
+ TrtUniquePtrType<nvinfer1::ICudaEngine> engine_;
+ std::unique_ptr<nvinfer1::IGpuAllocator> allocator_;
+ tensorflow::tensorrt::Logger logger_;
// TODO(sami): Use threadpool threads!
- std::thread* thr_;
+ std::unique_ptr<std::thread> thr_;
};
-class TRTWeightStore : public tensorflow::ResourceBase {
+class TRTWeightStore {
public:
TRTWeightStore() {}
virtual ~TRTWeightStore() { VLOG(1) << "Destroying store" << DebugString(); }
- string DebugString() override {
+ string DebugString() {
std::stringstream oss;
size_t len_bytes = 0;
for (const auto& v : store_) {
diff --git a/tensorflow/contrib/tensorrt/segment/segment.h b/tensorflow/contrib/tensorrt/segment/segment.h
index 1568dd9153..81b4bfe49f 100644
--- a/tensorflow/contrib/tensorrt/segment/segment.h
+++ b/tensorflow/contrib/tensorrt/segment/segment.h
@@ -29,8 +29,9 @@ namespace tensorflow {
namespace tensorrt {
namespace segment {
-// vector of segments, each entry contains a device name and a set of nodes in
-// segment
+// Vector of segments; each entry contains the set of node names in the
+// segment and a device name.
+// TODO(aaroey): use node pointer instead of node name.
using SegmentNodesVector = std::vector<std::pair<std::set<string>, string>>;
struct SegmentOptions {
@@ -48,6 +49,8 @@ struct SegmentOptions {
// in the vector describes a subgraph by giving a set of the names of
// all the NodeDefs in that subgraph.
// @return the status.
+//
+// TODO(aaroey): remove this method.
tensorflow::Status SegmentGraph(
const tensorflow::GraphDef& gdef,
const std::function<bool(const tensorflow::Node*)>& candidate_fn,
diff --git a/tensorflow/contrib/tensorrt/shape_fn/trt_shfn.cc b/tensorflow/contrib/tensorrt/shape_fn/trt_shfn.cc
index f36495f6b6..227ac120dd 100644
--- a/tensorflow/contrib/tensorrt/shape_fn/trt_shfn.cc
+++ b/tensorflow/contrib/tensorrt/shape_fn/trt_shfn.cc
@@ -29,61 +29,35 @@ namespace tensorflow {
namespace shape_inference {
tensorflow::Status TRTEngineOpShapeInference(InferenceContext* context) {
- tensorflow::tensorrt::Logger logger;
- string serialized_engine;
- TF_RETURN_IF_ERROR(context->GetAttr("serialized_engine", &serialized_engine));
- nvinfer1::IRuntime* infer = nvinfer1::createInferRuntime(logger);
- nvinfer1::ICudaEngine* trt_engine = infer->deserializeCudaEngine(
- serialized_engine.c_str(), serialized_engine.size(),
- tensorrt::PluginFactoryTensorRT::GetInstance());
-
- int num_batch = -1;
- std::vector<::tensorflow::DataType> input_type;
- TF_RETURN_IF_ERROR(context->GetAttr("InT", &input_type));
- for (size_t i = 0; i < context->num_inputs(); i++) {
- // Check if input shape is legit
- auto input_shape = context->input(i);
- for (int j = 0; j < context->Rank(input_shape); j++) {
- auto dim_handler = context->Dim(input_shape, j);
- if (j == 0) {
- if (i == 0) {
- num_batch = context->Value(dim_handler);
- } else if (num_batch != context->Value(dim_handler)) {
- // TODO(jie): TensorRT engine requires consistent batch between inputs
- // tensors. Segmenter should be aware of this.
- LOG(FATAL) << "TensorRT engine requires consistent batch size";
- }
- }
- }
+ std::vector<tensorflow::TensorShape> shapes;
+ for (int i = 0; i < context->num_outputs(); ++i) {
+ context->set_output(i, context->UnknownShape());
}
-
- // Arrange input here
- std::vector<string> input_nodes;
- TF_RETURN_IF_ERROR(context->GetAttr("input_nodes", &input_nodes));
-
- // Arrange output here
- std::vector<string> output_nodes;
- TF_RETURN_IF_ERROR(context->GetAttr("output_nodes", &output_nodes));
- for (size_t i = 0; i < output_nodes.size(); i++) {
- int binding_index = trt_engine->getBindingIndex(output_nodes[i].c_str());
- ShapeHandle output_shape;
- std::vector<DimensionHandle> dim_vec;
- dim_vec.emplace_back(context->MakeDim(num_batch));
- if (binding_index != -1) {
- auto dims = trt_engine->getBindingDimensions(binding_index);
- for (int j = 0; j < dims.nbDims; j++) {
- dim_vec.emplace_back(context->MakeDim(dims.d[j]));
- }
- } else {
- LOG(FATAL) << "TensorRT engine cannot find binding: " << output_nodes[i];
- }
- output_shape = context->MakeShape(dim_vec);
- context->set_output(i, output_shape);
+ auto status = context->GetAttr("input_shapes", &shapes);
+ // It is ok not to have shapes.
+ if (!status.ok()) return Status::OK();
+ if ((int)shapes.size() != context->num_inputs()) return Status::OK();
+ bool different_input = false;
+ for (int i = 0; i < context->num_inputs(); ++i) {
+ if (shapes.at(i) != context->input_tensor(i)->shape())
+ different_input = true;
+ }
+ if (different_input) return Status::OK();
+ shapes.resize(0);
+ status = context->GetAttr("output_shapes", &shapes);
+ if (!status.ok()) return Status::OK();
+ if ((int)shapes.size() != context->num_outputs()) return Status::OK();
+ std::vector<ShapeHandle> shape_handles(shapes.size());
+ for (size_t i = 0; i < shapes.size(); ++i) {
+ status =
+ context->MakeShapeFromTensorShape(shapes.at(i), &shape_handles.at(i));
+ if (!status.ok()) return Status::OK();
+ }
+ for (int i = 0; i < context->num_outputs(); ++i) {
+ context->set_output(i, shape_handles.at(i));
}
-
return Status::OK();
}
-
} // namespace shape_inference
} // namespace tensorflow
diff --git a/tensorflow/contrib/tensorrt/test/test_tftrt.py b/tensorflow/contrib/tensorrt/test/test_tftrt.py
index 175ccd8006..090aa8bdb0 100644
--- a/tensorflow/contrib/tensorrt/test/test_tftrt.py
+++ b/tensorflow/contrib/tensorrt/test/test_tftrt.py
@@ -20,6 +20,7 @@ from __future__ import print_function
import argparse
import numpy as np
+import six as _six
# normally we should do import tensorflow as tf and then
# tf.placeholder, tf.constant, tf.nn.conv2d etc but
@@ -35,10 +36,75 @@ from tensorflow.python.framework import dtypes as dtypes
from tensorflow.python.framework import importer as importer
from tensorflow.python.framework import ops as ops
from tensorflow.python.ops import array_ops as aops
+from tensorflow.python.ops import math_ops as mops
from tensorflow.python.ops import nn as nn
from tensorflow.python.ops import nn_ops as nn_ops
+def py2bytes(inp):
+ return inp
+
+
+def py3bytes(inp):
+ return inp.encode("utf-8", errors="surrogateescape")
+
+
+def py2string(inp):
+ return inp
+
+
+def py3string(inp):
+ return inp.decode("utf-8")
+
+
+if _six.PY2:
+ to_bytes = py2bytes
+ to_string = py2string
+else:
+ to_bytes = py3bytes
+ to_string = py3string
+
+
+def get_multi_engine_graph_def(mode="FP32"):
+ """Create a simple graph and return its graph_def."""
+ dtype = dtypes.float32
+ if mode.upper() == "FP16":
+ dtype = dtypes.float16
+ else:
+ pass
+
+ g = ops.Graph()
+ with g.as_default():
+ x = aops.placeholder(shape=[None, 3, 7, 5], name="input", dtype=dtype)
+ with g.name_scope("Global_scope"):
+ with g.name_scope("first_scope"):
+ e = cop.constant(
+ np.random.randn(3, 2, 3, 4), name="weights", dtype=dtype)
+ conv = nn.conv2d(
+ input=x,
+ filter=e,
+ data_format="NCHW",
+ strides=[1, 1, 1, 1],
+ padding="VALID",
+ name="conv")
+ b = cop.constant(np.random.randn(1, 4, 1, 1), name="bias1", dtype=dtype)
+ t = conv * b
+
+ b = cop.constant(np.random.randn(1, 4, 1, 1), name="bias2", dtype=dtype)
+ q = conv / b
+ edge = mops.sin(q)
+ edge1 = mops.cos(conv)
+ with g.name_scope("test_scope"):
+ de = edge + edge1
+ t -= edge1
+ q *= edge
+ t += q
+ t -= de
+ k = aops.squeeze(t, name="output")
+ print(k.dtype)
+ return g.as_graph_def()
+
+
def get_simple_graph_def():
"""Create a simple graph and return its graph_def."""
g = ops.Graph()
@@ -65,7 +131,9 @@ def get_simple_graph_def():
def execute_graph(gdef, dumm_inp):
"""Run given graphdef once."""
print("executing")
- gpu_options = cpb2.GPUOptions(per_process_gpu_memory_fraction=0.50)
+ gpu_options = None
+ if trt.trt_convert.get_linked_tensorrt_version()[0] == 3:
+ gpu_options = cpb2.GPUOptions(per_process_gpu_memory_fraction=0.50)
sessconfig = cpb2.ConfigProto(gpu_options=gpu_options)
ops.reset_default_graph()
g = ops.Graph()
@@ -83,7 +151,9 @@ def execute_graph(gdef, dumm_inp):
# for calibration. For this test script it is random data.
def execute_calibration(gdef, dumm_inp):
"""Run given calibration graph multiple times."""
- gpu_options = cpb2.GPUOptions(per_process_gpu_memory_fraction=0.50)
+ gpu_options = None
+ if trt.trt_convert.get_linked_tensorrt_version()[0] == 3:
+ gpu_options = cpb2.GPUOptions(per_process_gpu_memory_fraction=0.50)
ops.reset_default_graph()
g = ops.Graph()
with g.as_default():
@@ -100,12 +170,17 @@ def execute_calibration(gdef, dumm_inp):
return val
-def user(run_graph=execute_graph, run_calibration=execute_calibration):
+def user(multi_engine,
+ run_graph=execute_graph,
+ run_calibration=execute_calibration):
"""Example function that converts a graph to TFTRT graph."""
-
- inp_dims = (100, 24, 24, 2)
+ if multi_engine:
+ inp_dims = (2, 3, 7, 5)
+ orig_graph = get_multi_engine_graph_def()
+ else:
+ inp_dims = (100, 24, 24, 2)
+ orig_graph = get_simple_graph_def() # use a frozen graph for inference
dummy_input = np.random.random_sample(inp_dims)
- orig_graph = get_simple_graph_def() # use a frozen graph for inference
# Get optimized graph
trt_graph = trt.create_inference_graph(
input_graph_def=orig_graph,
@@ -113,8 +188,10 @@ def user(run_graph=execute_graph, run_calibration=execute_calibration):
max_batch_size=inp_dims[0],
max_workspace_size_bytes=1 << 25,
precision_mode="FP32", # TRT Engine precision "FP32","FP16" or "INT8"
- minimum_segment_size=2 # minimum number of nodes in an engine
- )
+ minimum_segment_size=2, # minimum number of nodes in an engine
+ is_dynamic_op=False,
+ maximum_cached_engines=1,
+ cached_engine_batches=[])
o1 = run_graph(orig_graph, dummy_input)
o2 = run_graph(trt_graph, dummy_input)
o3 = run_graph(trt_graph, dummy_input)
@@ -126,40 +203,51 @@ def user(run_graph=execute_graph, run_calibration=execute_calibration):
max_batch_size=inp_dims[0],
max_workspace_size_bytes=1 << 25,
precision_mode="FP16", # TRT Engine precision "FP32","FP16" or "INT8"
- minimum_segment_size=2 # minimum number of nodes in an engine
- )
+ minimum_segment_size=2, # minimum number of nodes in an engine
+ is_dynamic_op=False,
+ maximum_cached_engines=1,
+ cached_engine_batches=[])
int8_calib_gdef = trt.create_inference_graph(
input_graph_def=orig_graph,
outputs=["output"],
max_batch_size=inp_dims[0],
max_workspace_size_bytes=1 << 25,
precision_mode="INT8", # TRT Engine precision "FP32","FP16" or "INT8"
- minimum_segment_size=2 # minimum number of nodes in an engine
- )
+ minimum_segment_size=2, # minimum number of nodes in an engine
+ is_dynamic_op=False,
+ maximum_cached_engines=1,
+ cached_engine_batches=[])
o4 = run_graph(fp16_graph, dummy_input)
_ = run_calibration(int8_calib_gdef, dummy_input)
int8_graph = trt.calib_graph_to_infer_graph(int8_calib_gdef)
o5 = run_graph(int8_graph, dummy_input)
- assert np.allclose(o1, o4)
- assert np.allclose(o1, o5)
+ print("Is FP32 == FP16? %s (False is possible)" % np.allclose(o1, o4))
+ print("Is FP32 == INT8? %s (False is possible)" % np.allclose(o1, o5))
print("Pass")
-def auto():
+def auto(multi_engine):
"""Run the conversion as an optimization pass."""
- inp_dims = (100, 24, 24, 2)
+ if multi_engine:
+ inp_dims = (2, 3, 7, 5)
+ orig_graph = get_multi_engine_graph_def()
+ else:
+ inp_dims = (100, 24, 24, 2)
+ orig_graph = get_simple_graph_def() # use a frozen graph for inference
dummy_input = np.random.random_sample(inp_dims)
- orig_graph = get_simple_graph_def()
opt_config = rwpb2.RewriterConfig()
+ opt_config.meta_optimizer_iterations = opt_config.ONE
opt_config.optimizers.extend(["constfold", "layout"])
custom_op = opt_config.custom_optimizers.add()
custom_op.name = "TensorRTOptimizer"
custom_op.parameter_map["minimum_segment_size"].i = 3
- custom_op.parameter_map["precision_mode"].s = "FP32"
+ custom_op.parameter_map["precision_mode"].s = to_bytes("FP32")
custom_op.parameter_map["max_batch_size"].i = inp_dims[0]
custom_op.parameter_map["max_workspace_size_bytes"].i = 1 << 25
print(custom_op)
- gpu_options = cpb2.GPUOptions(per_process_gpu_memory_fraction=0.50)
+ gpu_options = None
+ if trt.trt_convert.get_linked_tensorrt_version()[0] == 3:
+ gpu_options = cpb2.GPUOptions(per_process_gpu_memory_fraction=0.50)
graph_options = cpb2.GraphOptions(rewrite_options=opt_config)
sessconfig = cpb2.ConfigProto(
gpu_options=gpu_options, graph_options=graph_options)
@@ -168,7 +256,7 @@ def auto():
ops.reset_default_graph()
with g.as_default():
inp, out = importer.import_graph_def(
- graph_def=orig_graph, return_elements=["input", "output"])
+ graph_def=orig_graph, return_elements=["input", "output"], name="")
inp = inp.outputs[0]
out = out.outputs[0]
with csess.Session(config=sessconfig, graph=g) as sess:
@@ -186,8 +274,14 @@ if "__main__" in __name__:
action="store_true",
help="Do TRT conversion automatically",
default=False)
+ P.add_argument(
+ "--multi-engine",
+ "-m",
+ action="store_true",
+ help="Use a graph that will result in 2 engines",
+ default=False)
flags, unparsed = P.parse_known_args()
if flags.automatic:
- auto()
+ auto(flags.multi_engine)
else:
- user()
+ user(flags.multi_engine)
diff --git a/tensorflow/contrib/tensorrt/trt_conversion.i b/tensorflow/contrib/tensorrt/trt_conversion.i
index 46480e99a1..d51a0b59e2 100644
--- a/tensorflow/contrib/tensorrt/trt_conversion.i
+++ b/tensorflow/contrib/tensorrt/trt_conversion.i
@@ -48,12 +48,53 @@ PyObject* pair_helper(std::pair<string, string>* in) {
}
return tuple;
}
+
+struct version_struct{
+ int vmajor;
+ int vminor;
+ int vpatch;
+};
+
+PyObject* version_helper(version_struct* in) {
+ PyObject *tuple(nullptr);
+ tuple = Py_BuildValue("(iii)", in->vmajor, in->vminor, in->vpatch);
+ if (!tuple) {
+ if (!PyErr_Occurred()) {
+ PyErr_SetString(PyExc_TypeError,
+ "Tuple creation from version structure failed!");
+ }
+ return NULL;
+ }
+ return tuple;
+}
+/* Define converters for vector<int> */
+template<>
+bool _PyObjAs(PyObject *pyobj, int* dest) {
+ *dest = PyLong_AsLong(pyobj);
+ return true;
+}
+
+template<>
+PyObject *_PyObjFrom(const int& src) {
+ return PyLong_FromLong(src);
+}
+
%}
+
+_LIST_OUTPUT_TYPEMAP(int, PyLong_FromLong);
+
%typemap(out) std::pair<string, string> {
PyObject *tuple = pair_helper(&$1);
if (!tuple) SWIG_fail;
$result = tuple;
}
+
+%typemap(out) version_struct {
+ PyObject *tuple = version_helper(&$1);
+ if (!tuple) SWIG_fail;
+ $result = tuple;
+}
+
%{
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/core/status.h"
@@ -65,6 +106,8 @@ PyObject* pair_helper(std::pair<string, string>* in) {
%unignore tensorflow;
%unignore trt_convert;
%unignore calib_convert;
+%unignore get_linked_tensorrt_version;
+%unignore get_loaded_tensorrt_version;
%{
@@ -74,7 +117,10 @@ std::pair<string, string> trt_convert(
size_t max_batch_size,
size_t max_workspace_size_bytes,
int precision_mode,
- int minimum_segment_size
+ int minimum_segment_size,
+ bool is_dyn_op,
+ int max_cached_engines,
+ std::vector<int> cached_engine_batches
// Unfortunately we can't use TF_Status here since it
// is in c/c_api and brings in a lot of other libraries
// which in turn declare ops. These ops are included
@@ -102,11 +148,12 @@ std::pair<string, string> trt_convert(
out_status = "InvalidArgument;Size of the output_names vector is 0";
return std::pair<string, string>{out_status, ""};
}
- tensorflow::GraphDef outGraph;
+ tensorflow::GraphDef out_graph;
tensorflow::Status conversion_status =
tensorflow::tensorrt::convert::ConvertGraphDefToTensorRT(
graph_def, output_names, max_batch_size, max_workspace_size_bytes,
- &outGraph, precision_mode, minimum_segment_size);
+ &out_graph, precision_mode, minimum_segment_size,
+ is_dyn_op, max_cached_engines, cached_engine_batches);
if (!conversion_status.ok()) {
auto retCode = (int)conversion_status.code();
char buff[2000];
@@ -116,7 +163,7 @@ std::pair<string, string> trt_convert(
return std::pair<string, string>{out_status, ""};
}
string result;
- if (!outGraph.SerializeToString(&result)) {
+ if (!out_graph.SerializeToString(&result)) {
out_status = "InvalidArgument;Couldn't serialize output as a GraphDef";
return std::pair<string, string>{out_status, ""};
}
@@ -128,7 +175,8 @@ std::pair<string, string> trt_convert(
#endif // GOOGLE_CUDA && GOOGLE_TENSORRT
}
-std::pair<string, string> calib_convert(string graph_def_string // const tensorflow::GraphDef&
+std::pair<string, string> calib_convert(
+ string graph_def_string, bool is_dyn_op
// unfortunately we can't use TF_Status here since it
// is in c/c_api and brings in a lot of other libraries
// which in turn declare ops. These ops are included
@@ -147,11 +195,11 @@ std::pair<string, string> calib_convert(string graph_def_string // const tenso
out_status = "InvalidArgument;Couldn't interpret input as a GraphDef";
return std::pair<string, string>{out_status, ""};
}
-
- tensorflow::GraphDef outGraph;
+ graph_def_string.resize(0);
+ tensorflow::GraphDef out_graph;
tensorflow::Status conversion_status =
- tensorflow::tensorrt::convert::ConvertCalibGraphToInferGraph(graph_def,
- &outGraph);
+ tensorflow::tensorrt::convert::ConvertCalibGraphToInferGraph(
+ graph_def, &out_graph, is_dyn_op);
if (!conversion_status.ok()) {
auto retCode = (int)conversion_status.code();
char buff[2000];
@@ -161,7 +209,7 @@ std::pair<string, string> calib_convert(string graph_def_string // const tenso
return std::pair<string, string>{out_status, ""};
}
string result;
- if (!outGraph.SerializeToString(&result)) {
+ if (!out_graph.SerializeToString(&result)) {
out_status = "InvalidArgument;Couldn't serialize output as a GraphDef";
return std::pair<string, string>{out_status, ""};
}
@@ -172,15 +220,39 @@ std::pair<string, string> calib_convert(string graph_def_string // const tenso
return std::pair<string, string>{"9;TensorRT is not enabled!", ""};
#endif // GOOGLE_CUDA && GOOGLE_TENSORRT
}
+
+version_struct get_linked_tensorrt_version(){
+ // Return the version at the link time.
+ const auto &lv = tensorflow::tensorrt::convert::GetLinkedTensorRTVersion();
+ version_struct s;
+ s.vmajor = lv[0];
+ s.vminor = lv[1];
+ s.vpatch = lv[2];
+ return s;
+}
+version_struct get_loaded_tensorrt_version(){
+ // Return the version from the loaded library.
+ const auto &lv = tensorflow::tensorrt::convert::GetLoadedTensorRTVersion();
+ version_struct s;
+ s.vmajor = lv[0];
+ s.vminor = lv[1];
+ s.vpatch = lv[2];
+ return s;
+}
+
%}
-std::pair<string, string> calib_convert(string graph_def_string);
+std::pair<string, string> calib_convert(string graph_def_string, bool is_dyn_op);
std::pair<string, string> trt_convert(string graph_def_string,
std::vector<string> output_names,
size_t max_batch_size,
size_t max_workspace_size_bytes,
- int precision_mode, int minimum_segment_size);
-
+ int precision_mode, int minimum_segment_size,
+ bool is_dyn_op,
+ int max_cached_engines,
+ std::vector<int> cached_engine_batches);
+version_struct get_linked_tensorrt_version();
+version_struct get_loaded_tensorrt_version();
%unignoreall
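
The two version helpers exported above are what trt_convert.py uses for its compatibility check; they can also be queried directly (assuming a TensorRT-enabled build):

from tensorflow.contrib.tensorrt.wrap_conversion import get_linked_tensorrt_version
from tensorflow.contrib.tensorrt.wrap_conversion import get_loaded_tensorrt_version

linked = get_linked_tensorrt_version()  # (major, minor, patch) at compile time
loaded = get_loaded_tensorrt_version()  # (major, minor, patch) of the loaded library
if loaded[0] != linked[0]:
  raise RuntimeError("TensorRT major version mismatch: linked %s, loaded %s" %
                     (linked, loaded))
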
diff --git a/tensorflow/contrib/tpu/profiler/BUILD b/tensorflow/contrib/tpu/profiler/BUILD
index 3b2d7adfff..38d1c3049e 100644
--- a/tensorflow/contrib/tpu/profiler/BUILD
+++ b/tensorflow/contrib/tpu/profiler/BUILD
@@ -49,11 +49,11 @@ tf_cc_binary(
":tpu_profiler_analysis_proto_cc",
":tpu_profiler_proto_cc",
":version",
+ "//tensorflow:grpc++",
"//tensorflow/core:framework_internal",
"//tensorflow/core:lib",
"//tensorflow/core/distributed_runtime/rpc:grpc_util",
"//tensorflow/core/platform/cloud:gcs_file_system",
- "@grpc//:grpc++",
],
)
diff --git a/tensorflow/contrib/verbs/BUILD b/tensorflow/contrib/verbs/BUILD
index 1b45584dcb..19cb8983b6 100644
--- a/tensorflow/contrib/verbs/BUILD
+++ b/tensorflow/contrib/verbs/BUILD
@@ -53,12 +53,12 @@ cc_library(
":grpc_verbs_service_impl",
":rdma_mgr",
":verbs_service_proto_cc",
+ "//tensorflow:grpc++",
"//tensorflow/core:lib_internal",
"//tensorflow/core/distributed_runtime:session_mgr",
"//tensorflow/core/distributed_runtime/rpc:async_service_interface",
"//tensorflow/core/distributed_runtime/rpc:grpc_call",
"//tensorflow/core/distributed_runtime/rpc:grpc_util",
- "@grpc//:grpc++",
],
alwayslink = 1,
)
@@ -69,7 +69,7 @@ cc_library(
hdrs = ["grpc_verbs_service_impl.h"],
deps = [
":verbs_service_proto_cc",
- "@grpc//:grpc++",
+ "//tensorflow:grpc++",
],
)
diff --git a/tensorflow/core/api_def/BUILD b/tensorflow/core/api_def/BUILD
index 19d6438809..06b797e32e 100644
--- a/tensorflow/core/api_def/BUILD
+++ b/tensorflow/core/api_def/BUILD
@@ -4,6 +4,7 @@
# The following targets can be used to access ApiDefs:
# :base_api_def
# :python_api_def
+# :java_api_def
package(
default_visibility = ["//visibility:private"],
@@ -29,6 +30,12 @@ filegroup(
visibility = ["//tensorflow:internal"],
)
+filegroup(
+ name = "java_api_def",
+ srcs = glob(["java_api/*"]),
+ visibility = ["//tensorflow:internal"],
+)
+
cc_library(
name = "excluded_ops_lib",
srcs = ["excluded_ops.cc"],
diff --git a/tensorflow/core/api_def/base_api/api_def_SampleDistortedBoundingBox.pbtxt b/tensorflow/core/api_def/base_api/api_def_SampleDistortedBoundingBox.pbtxt
index 6f1121dd37..5ab5917bd3 100644
--- a/tensorflow/core/api_def/base_api/api_def_SampleDistortedBoundingBox.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_SampleDistortedBoundingBox.pbtxt
@@ -68,7 +68,7 @@ END
name: "area_range"
description: <<END
The cropped area of the image must contain a fraction of the
-supplied image within in this range.
+supplied image within this range.
END
}
attr {
diff --git a/tensorflow/core/api_def/base_api/api_def_SampleDistortedBoundingBoxV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_SampleDistortedBoundingBoxV2.pbtxt
index 473aec50aa..663fc582d4 100644
--- a/tensorflow/core/api_def/base_api/api_def_SampleDistortedBoundingBoxV2.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_SampleDistortedBoundingBoxV2.pbtxt
@@ -68,7 +68,7 @@ END
name: "area_range"
description: <<END
The cropped area of the image must contain a fraction of the
-supplied image within in this range.
+supplied image within this range.
END
}
attr {
diff --git a/tensorflow/core/api_def/base_api/api_def_SlideDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_SlideDataset.pbtxt
index 9fabe7863e..c80ee77f73 100644
--- a/tensorflow/core/api_def/base_api/api_def_SlideDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_SlideDataset.pbtxt
@@ -11,7 +11,7 @@ END
name: "stride"
description: <<END
A scalar representing the steps moving the sliding window
-forward in one iteration. It must be in `[1, window_size)`.
+forward in one iteration. It must be positive.
END
}
summary: "Creates a dataset that passes a sliding window over `input_dataset`."
diff --git a/tensorflow/core/api_def/java_api/api_def_Assert.pbtxt b/tensorflow/core/api_def/java_api/api_def_Assert.pbtxt
new file mode 100644
index 0000000000..b1f868897d
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Assert.pbtxt
@@ -0,0 +1,4 @@
+op {
+ graph_op_name: "Assert" #TODO(karllessard) escape that reserved name
+ visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Const.pbtxt b/tensorflow/core/api_def/java_api/api_def_Const.pbtxt
new file mode 100644
index 0000000000..2dbdca34e0
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Const.pbtxt
@@ -0,0 +1,4 @@
+op {
+ graph_op_name: "Const" #TODO(karllessard) escape that reserved name
+ visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Switch.pbtxt b/tensorflow/core/api_def/java_api/api_def_Switch.pbtxt
new file mode 100644
index 0000000000..0d3362a91e
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Switch.pbtxt
@@ -0,0 +1,4 @@
+op {
+ graph_op_name: "Switch" #TODO(karllessard) escape that reserved name
+ visibility: HIDDEN
+}
diff --git a/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc b/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc
index 486f0be698..0b096a14a3 100644
--- a/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc
+++ b/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc
@@ -106,24 +106,24 @@ TEST(DirectSessionWithTrackingAllocTest, CostModelTest) {
EXPECT_EQ(1, shape.dim(1).size());
if (node->name() == y->name()) {
#ifdef INTEL_MKL
- // if MKL is used, it goes through various additional
- // graph rewrite pass. In TF, everytime a graph pass
+ // if MKL is used, it goes through various additional
+ // graph rewrite passes. In TF, every time a graph pass
// happens, "constant" nodes are allocated
// and deallocated. Each allocation calls the
// (FindChunkPtr of BFCAllocator),
- // which increments the value of AllocationId.
- // Thus AllocationId becomes more than 3 and 4 if
- // MKL is used. Now they are 9 and 10 for MKL.
- EXPECT_EQ(19, cm->AllocationId(node, 0));
+ // which increments the value of AllocationId.
+ // Thus AllocationId becomes larger than in the default TF build when
+ // MKL is used. Currently the MKL IDs are 8 more than the TF ones.
+ EXPECT_EQ(29, cm->AllocationId(node, 0));
#else
EXPECT_EQ(21, cm->AllocationId(node, 0));
-#endif
+#endif
} else {
#ifdef INTEL_MKL
- EXPECT_EQ(20, cm->AllocationId(node, 0));
+ EXPECT_EQ(30, cm->AllocationId(node, 0));
#else
EXPECT_EQ(22, cm->AllocationId(node, 0));
-#endif
+#endif
}
}
EXPECT_LE(0, cm->MaxExecutionTime(node));
diff --git a/tensorflow/core/common_runtime/mkl_cpu_allocator.cc b/tensorflow/core/common_runtime/mkl_cpu_allocator.cc
index 43a909466e..4ec85457ad 100644
--- a/tensorflow/core/common_runtime/mkl_cpu_allocator.cc
+++ b/tensorflow/core/common_runtime/mkl_cpu_allocator.cc
@@ -17,6 +17,13 @@ limitations under the License.
#include "tensorflow/core/common_runtime/mkl_cpu_allocator.h"
+#ifdef _WIN32
+// Declare function to avoid unresolved symbol in VS
+i_malloc_t i_malloc;
+i_calloc_t i_calloc;
+i_realloc_t i_realloc;
+i_free_t i_free;
+#endif
namespace tensorflow {
constexpr const char* MklCPUAllocator::kMaxLimitStr;
diff --git a/tensorflow/core/debug/BUILD b/tensorflow/core/debug/BUILD
index 50f8a307d8..36e9b3455a 100644
--- a/tensorflow/core/debug/BUILD
+++ b/tensorflow/core/debug/BUILD
@@ -143,6 +143,7 @@ tf_cuda_library(
":debug_node_key",
":debug_service_proto_cc",
":debugger_event_metadata_proto_cc",
+ "//tensorflow:grpc++",
"//tensorflow/core:core_cpu_internal",
"//tensorflow/core:framework",
"//tensorflow/core:graph",
@@ -150,7 +151,6 @@ tf_cuda_library(
"//tensorflow/core:lib_internal",
"//tensorflow/core:proto_text",
"//tensorflow/core:protos_all_cc",
- "@grpc//:grpc++",
],
alwayslink = 1,
)
@@ -166,11 +166,11 @@ tf_cuda_library(
":debug_io_utils",
":debug_service_proto_cc",
":debugger_event_metadata_proto_cc",
+ "//tensorflow:grpc++",
"//tensorflow/core:framework",
"//tensorflow/core:lib",
"//tensorflow/core:lib_internal",
"//tensorflow/core:protos_all_cc",
- "@grpc//:grpc++",
],
alwayslink = 1,
)
diff --git a/tensorflow/core/distributed_runtime/BUILD b/tensorflow/core/distributed_runtime/BUILD
index 8247651c24..75f8a19e9c 100644
--- a/tensorflow/core/distributed_runtime/BUILD
+++ b/tensorflow/core/distributed_runtime/BUILD
@@ -628,6 +628,7 @@ tf_cuda_cc_test(
":master",
":remote_device",
":worker_interface",
+ "//tensorflow:grpc++",
"//tensorflow/core:core_cpu",
"//tensorflow/core:core_cpu_internal",
"//tensorflow/core:framework",
@@ -649,7 +650,6 @@ tf_cuda_cc_test(
"//tensorflow/core/kernels:dense_update_ops",
"//tensorflow/core/kernels:identity_op",
"//tensorflow/core/kernels:variable_ops",
- "@grpc//:grpc++",
],
)
@@ -667,6 +667,7 @@ tf_cuda_cc_test(
":master",
":remote_device",
":worker_interface",
+ "//tensorflow:grpc++",
"//tensorflow/core:core_cpu",
"//tensorflow/core:core_cpu_internal",
"//tensorflow/core:framework",
@@ -682,7 +683,6 @@ tf_cuda_cc_test(
"//tensorflow/core/distributed_runtime/rpc:grpc_testlib",
"//tensorflow/core/distributed_runtime/rpc:grpc_util",
"//tensorflow/core/distributed_runtime/rpc:grpc_worker_cache",
- "@grpc//:grpc++",
],
)
diff --git a/tensorflow/core/distributed_runtime/eager/BUILD b/tensorflow/core/distributed_runtime/eager/BUILD
index 22d0902af2..055e5dfced 100644
--- a/tensorflow/core/distributed_runtime/eager/BUILD
+++ b/tensorflow/core/distributed_runtime/eager/BUILD
@@ -48,6 +48,8 @@ cc_library(
"eager_service_impl.h",
],
deps = [
+ "//tensorflow:grpc",
+ "//tensorflow:grpc++",
"//tensorflow/c:c_api_internal",
"//tensorflow/c:tf_status_helper",
"//tensorflow/core:core_cpu_internal",
@@ -67,8 +69,6 @@ cc_library(
"//tensorflow/core/distributed_runtime:worker_env",
"//tensorflow/core/distributed_runtime/eager:remote_tensor_handle",
"//tensorflow/core/distributed_runtime/rpc:rpc_rendezvous_mgr",
- "@grpc",
- "@grpc//:grpc++",
],
)
diff --git a/tensorflow/core/distributed_runtime/rpc/BUILD b/tensorflow/core/distributed_runtime/rpc/BUILD
index 382ea336ca..d6c493c022 100644
--- a/tensorflow/core/distributed_runtime/rpc/BUILD
+++ b/tensorflow/core/distributed_runtime/rpc/BUILD
@@ -41,8 +41,8 @@ cc_library(
srcs = ["grpc_util.cc"],
hdrs = ["grpc_util.h"],
deps = [
- "@grpc",
- "@grpc//:grpc++",
+ "//tensorflow:grpc",
+ "//tensorflow:grpc++",
"//tensorflow/core:lib",
# Required to be able to overload TensorResponse parsing.
"//tensorflow/core/distributed_runtime:tensor_coding",
@@ -55,8 +55,8 @@ cc_library(
hdrs = ["grpc_client_cq_tag.h"],
deps = [
":grpc_util",
+ "//tensorflow:grpc++",
"//tensorflow/core:lib",
- "@grpc//:grpc++",
],
)
@@ -67,10 +67,10 @@ cc_library(
deps = [
":grpc_client_cq_tag",
":grpc_util",
+ "//tensorflow:grpc++",
"//tensorflow/core:lib",
"//tensorflow/core/distributed_runtime:call_options",
"//tensorflow/core/distributed_runtime:tensor_coding",
- "@grpc//:grpc++",
],
)
@@ -83,6 +83,7 @@ cc_library(
":grpc_state",
":grpc_util",
":grpc_worker_service_impl",
+ "//tensorflow:grpc++",
"//tensorflow/core:core_cpu_internal",
"//tensorflow/core:lib",
"//tensorflow/core:lib_internal",
@@ -90,7 +91,6 @@ cc_library(
"//tensorflow/core/distributed_runtime:tensor_coding",
"//tensorflow/core/distributed_runtime:worker_cache_logger",
"//tensorflow/core/distributed_runtime:worker_interface",
- "@grpc//:grpc++",
],
)
@@ -100,10 +100,10 @@ cc_library(
hdrs = ["grpc_channel.h"],
deps = [
":grpc_util",
+ "//tensorflow:grpc++",
"//tensorflow/core:framework",
"//tensorflow/core:lib",
"//tensorflow/core:lib_internal",
- "@grpc//:grpc++",
],
)
@@ -112,13 +112,13 @@ cc_library(
srcs = ["grpc_tensor_coding.cc"],
hdrs = ["grpc_tensor_coding.h"],
deps = [
+ "//tensorflow:grpc++",
"//tensorflow/core:core_cpu_internal",
"//tensorflow/core:framework",
"//tensorflow/core:framework_internal",
"//tensorflow/core:lib",
"//tensorflow/core:protos_all_cc",
"//tensorflow/core:worker_proto_cc",
- "@grpc//:grpc++",
],
)
@@ -127,9 +127,9 @@ cc_library(
srcs = [],
hdrs = ["grpc_call.h"],
deps = [
+ "//tensorflow:grpc++",
"//tensorflow/core:lib",
"//tensorflow/core:lib_internal",
- "@grpc//:grpc++",
],
)
@@ -167,6 +167,7 @@ tf_cuda_library(
":grpc_tensor_coding",
":grpc_util",
":grpc_worker_service_impl",
+ "//tensorflow:grpc++",
"//tensorflow/core:core_cpu_internal",
"//tensorflow/core:framework",
"//tensorflow/core:lib",
@@ -180,7 +181,6 @@ tf_cuda_library(
"//tensorflow/core/distributed_runtime:worker_cache",
"//tensorflow/core/distributed_runtime:worker_env",
"//tensorflow/core/distributed_runtime:worker_session",
- "@grpc//:grpc++",
],
)
@@ -190,9 +190,9 @@ cc_library(
hdrs = ["grpc_worker_service_impl.h"],
deps = [
":grpc_util",
+ "//tensorflow:grpc++",
"//tensorflow/core:worker_proto_cc",
"//tensorflow/core/distributed_runtime:tensor_coding",
- "@grpc//:grpc++",
],
)
@@ -220,12 +220,12 @@ cc_library(
":async_service_interface",
":grpc_call",
":grpc_util",
+ "//tensorflow:grpc++",
"//tensorflow/core:lib",
"//tensorflow/core:lib_internal",
"//tensorflow/core:master_proto_cc",
"//tensorflow/core:master_service_proto_cc",
"//tensorflow/core/distributed_runtime:master",
- "@grpc//:grpc++",
],
alwayslink = 1,
)
@@ -259,6 +259,8 @@ cc_library(
":grpc_worker_cache",
":grpc_worker_service",
":rpc_rendezvous_mgr",
+ "//tensorflow:grpc",
+ "//tensorflow:grpc++",
"//tensorflow/core:core_cpu",
"//tensorflow/core:core_cpu_internal",
"//tensorflow/core:framework",
@@ -277,8 +279,6 @@ cc_library(
"//tensorflow/core/distributed_runtime:worker_cache_wrapper",
"//tensorflow/core/distributed_runtime:worker_env",
"//tensorflow/core/distributed_runtime/rpc/eager:grpc_eager_service_impl",
- "@grpc",
- "@grpc//:grpc++",
],
alwayslink = 1,
)
@@ -299,13 +299,13 @@ tf_cc_binary(
],
deps = [
":grpc_server_lib",
+ "//tensorflow:grpc++",
"//tensorflow/core:core_cpu",
"//tensorflow/core:framework_internal",
"//tensorflow/core:lib",
"//tensorflow/core:protos_all_cc",
"//tensorflow/core/distributed_runtime:server_lib",
"//tensorflow/core/kernels:data_flow",
- "@grpc//:grpc++",
],
)
@@ -317,6 +317,7 @@ tf_cc_binary(
],
deps = [
":grpc_server_lib",
+ "//tensorflow:grpc++",
"//tensorflow/core:core_cpu",
"//tensorflow/core:framework_internal",
"//tensorflow/core:lib",
@@ -330,7 +331,6 @@ tf_cc_binary(
"//tensorflow/core/kernels:matmul_op",
"//tensorflow/core/kernels:reduction_ops",
"//tensorflow/core/kernels:variable_ops",
- "@grpc//:grpc++",
],
)
@@ -415,6 +415,7 @@ tf_cc_test(
deps = [
":grpc_tensor_coding",
":grpc_testlib",
+ "//tensorflow:grpc++",
"//tensorflow/core:core_cpu",
"//tensorflow/core:core_cpu_internal",
"//tensorflow/core:framework",
@@ -424,7 +425,6 @@ tf_cc_test(
"//tensorflow/core:test_main",
"//tensorflow/core:testlib",
"//tensorflow/core:worker_proto_cc",
- "@grpc//:grpc++",
],
)
@@ -434,11 +434,11 @@ tf_cc_test(
srcs = ["grpc_util_test.cc"],
deps = [
":grpc_util",
+ "//tensorflow:grpc",
+ "//tensorflow:grpc++",
"//tensorflow/core:test",
"//tensorflow/core:test_main",
"//tensorflow/core:worker_proto_cc",
- "@grpc",
- "@grpc//:grpc++",
],
)
diff --git a/tensorflow/core/distributed_runtime/rpc/eager/BUILD b/tensorflow/core/distributed_runtime/rpc/eager/BUILD
index 8cec497361..d09a85c6a5 100644
--- a/tensorflow/core/distributed_runtime/rpc/eager/BUILD
+++ b/tensorflow/core/distributed_runtime/rpc/eager/BUILD
@@ -11,8 +11,8 @@ cc_library(
srcs = ["grpc_eager_service.cc"],
hdrs = ["grpc_eager_service.h"],
deps = [
+ "//tensorflow:grpc++",
"//tensorflow/core:eager_service_proto_cc",
- "@grpc//:grpc++",
],
)
@@ -21,6 +21,7 @@ cc_library(
srcs = ["grpc_eager_client.cc"],
hdrs = ["grpc_eager_client.h"],
deps = [
+ "//tensorflow:grpc++",
"//tensorflow/core:eager_service_proto_cc",
"//tensorflow/core:lib",
"//tensorflow/core/distributed_runtime/eager:eager_client",
@@ -29,7 +30,6 @@ cc_library(
"//tensorflow/core/distributed_runtime/rpc:grpc_state",
"//tensorflow/core/distributed_runtime/rpc:grpc_util",
"//tensorflow/core/distributed_runtime/rpc/eager:grpc_eager_service",
- "@grpc//:grpc++",
],
)
@@ -39,6 +39,7 @@ cc_library(
hdrs = ["grpc_eager_service_impl.h"],
deps = [
":grpc_eager_service",
+ "//tensorflow:grpc++",
"//tensorflow/core:framework",
"//tensorflow/core:ptr_util",
"//tensorflow/core/distributed_runtime/eager:eager_service_impl",
@@ -47,6 +48,6 @@ cc_library(
"//tensorflow/core/distributed_runtime/rpc:grpc_channel",
"//tensorflow/core/distributed_runtime/rpc:grpc_util",
"//tensorflow/core/distributed_runtime/rpc:grpc_worker_cache",
- "@grpc//:grpc++",
+ "//tensorflow/core/distributed_runtime/rpc:grpc_worker_service",
],
)
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc
index 7a9f3c5198..2c833d11a9 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc
@@ -289,6 +289,12 @@ Status GrpcServer::Init(
nullptr);
}
+Status GrpcServer::Init(
+ ServiceInitFunction service_func,
+ const RendezvousMgrCreationFunction& rendezvous_mgr_func) {
+ return Init(std::move(service_func), rendezvous_mgr_func, nullptr, nullptr);
+}
+
Status GrpcServer::Init() { return Init(nullptr, nullptr, nullptr, nullptr); }
Status GrpcServer::ParseChannelSpec(const WorkerCacheFactoryOptions& options,
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h
index c674da9490..3366246afb 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h
@@ -97,6 +97,9 @@ class GrpcServer : public ServerInterface {
const RendezvousMgrCreationFunction& rendezvous_mgr_func,
const CollectiveMgrCreationFunction& collective_mgr_func);
+ Status Init(ServiceInitFunction service_func,
+ const RendezvousMgrCreationFunction& rendezvous_mgr_func);
+
Status Init();
// A subclass can override this method to support secure credentials.
diff --git a/tensorflow/core/graph/mkl_layout_pass_test.cc b/tensorflow/core/graph/mkl_layout_pass_test.cc
index 7645b4a7f0..fc474c0dc8 100644
--- a/tensorflow/core/graph/mkl_layout_pass_test.cc
+++ b/tensorflow/core/graph/mkl_layout_pass_test.cc
@@ -1901,6 +1901,11 @@ BENCHMARK(BM_MklLayoutRewritePass)->Arg(1000)->Arg(10000);
#else // INTEL_MKL_ML
+// NOTE: Unit tests in this file rely on a topologically sorted graph for
+// printing. But since sibling nodes of a node in the topologically sorted graph
+// can be printed in different orders, tests may fail if the order in which
+// sibling nodes are visited is changed.
+
namespace {
const char kCPUDevice[] = "/job:a/replica:0/task:0/device:CPU:0";
@@ -2572,9 +2577,9 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Concat_Input_Mkl) {
"A(Input);B(Input);C(Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
"DMT/_2(Const);DMT/_3(Const);DMT/_4(Const);E(_MklConv2D);"
"F(_MklConv2D);G(Const);H(_MklConcat);I(Zeta)|A->E;A->I;"
- "A:control->DMT/_2:control;A:control->DMT/_3:control;"
- "B->E:1;C->F;C:control->DMT/_0:control;C:control->DMT/_1:control;"
- "D->F:1;DMT/_0->F:2;DMT/_1->F:3;DMT/_2->E:2;DMT/_3->E:3;"
+ "A:control->DMT/_0:control;A:control->DMT/_1:control;"
+ "B->E:1;C->F;C:control->DMT/_2:control;C:control->DMT/_3:control;"
+ "D->F:1;DMT/_0->E:2;DMT/_1->E:3;DMT/_2->F:2;DMT/_3->F:3;"
"DMT/_4->H:3;E->H:1;E:2->H:4;F->H:2;F:2->H:5;G->H;"
"G:control->DMT/_4:control;H->I:1");
}
@@ -2681,9 +2686,9 @@ TEST_F(MklLayoutPassTest, NodeRewrite_ConcatV2_Input_Mkl) {
"A(Input);B(Input);C(Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
"DMT/_2(Const);DMT/_3(Const);DMT/_4(Const);E(_MklConv2D);"
"F(_MklConv2D);G(Const);H(_MklConcatV2);I(Zeta)|A->E;A->I;"
- "A:control->DMT/_2:control;A:control->DMT/_3:control;B->E:1;C->F;"
- "C:control->DMT/_0:control;C:control->DMT/_1:control;"
- "D->F:1;DMT/_0->F:2;DMT/_1->F:3;DMT/_2->E:2;DMT/_3->E:3;"
+ "A:control->DMT/_0:control;A:control->DMT/_1:control;B->E:1;C->F;"
+ "C:control->DMT/_2:control;C:control->DMT/_3:control;"
+ "D->F:1;DMT/_0->E:2;DMT/_1->E:3;DMT/_2->F:2;DMT/_3->F:3;"
"DMT/_4->H:5;E->H;E:2->H:3;E:control->DMT/_4:control;F->H:1;"
"F:2->H:4;G->H:2;H->I:1");
}
@@ -3060,8 +3065,8 @@ TEST_F(MklLayoutPassTest, LRN_Negative3) {
"C:control->DMT/_1:control;C:control->DMT/_2:control;"
"C:control->DMT/_3:control;C:control->DMT/_4:control;"
"C:control->DMT/_5:control;C:control->DMT/_6:control;"
- "D->E:1;D->F:2;DMT/_0->B:1;DMT/_1->F:3;DMT/_2->F:7;DMT/_3->F:4;"
- "DMT/_4->F:6;DMT/_5->E:4;DMT/_6->E:5;E->G;F->G:1");
+ "D->E:1;D->F:2;DMT/_0->B:1;DMT/_1->E:4;DMT/_2->E:5;DMT/_3->F:3;"
+ "DMT/_4->F:7;DMT/_5->F:4;DMT/_6->F:6;E->G;F->G:1");
}
/* Test MaxPool->MaxPoolGrad replacement by workspace+rewrite nodes. */
diff --git a/tensorflow/core/kernels/data/slide_dataset_op.cc b/tensorflow/core/kernels/data/slide_dataset_op.cc
index 48776cbf61..07cc91f9d5 100644
--- a/tensorflow/core/kernels/data/slide_dataset_op.cc
+++ b/tensorflow/core/kernels/data/slide_dataset_op.cc
@@ -15,6 +15,7 @@ limitations under the License.
#include "tensorflow/core/framework/partial_tensor_shape.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/kernels/data/dataset.h"
+#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/util/batch_util.h"
namespace tensorflow {
@@ -32,16 +33,24 @@ class SlideDatasetOp : public UnaryDatasetOpKernel {
void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
DatasetBase** output) override {
int64 window_size = 0;
- int64 stride = 1;
+ int64 stride = 0;
OP_REQUIRES_OK(
ctx, ParseScalarArgument<int64>(ctx, "window_size", &window_size));
OP_REQUIRES_OK(ctx, ParseScalarArgument<int64>(ctx, "stride", &stride));
OP_REQUIRES(
ctx, window_size > 0,
errors::InvalidArgument("Window size must be greater than zero."));
- OP_REQUIRES(
- ctx, stride > 0 && stride < window_size,
- errors::InvalidArgument("Stride must be in [1, window_size)."));
+ OP_REQUIRES(ctx, stride > 0,
+ errors::InvalidArgument("Stride must be greater than zero."));
+ if (stride == window_size) {
+ LOG(WARNING) << "stride: " << stride
+ << " is equal to window_size: " << window_size
+ << ", to use `batch` instead.";
+ } else if (stride > window_size) {
+ LOG(WARNING) << "stride: " << stride
+ << " is greater than window_size: " << window_size
+ << ", you will lose some data.";
+ }
*output = new Dataset(ctx, window_size, stride, input);
}
@@ -124,12 +133,15 @@ class SlideDatasetOp : public UnaryDatasetOpKernel {
return Status::OK();
}
batch_elements.reserve(window_size);
- const bool first_call = cache_.empty();
- if (first_call) {
- cache_.reserve(window_size);
- } else {
- // Reuse cache in the previous iteration.
- cache_.swap(batch_elements);
+ // Use cache if stride < window_size.
+ if (stride < window_size) {
+ const bool first_call = cache_.empty();
+ if (first_call) {
+ cache_.reserve(window_size);
+ } else {
+ // Reuse cache in the previous iteration.
+ cache_.swap(batch_elements);
+ }
}
// Fill up with new elements.
*end_of_sequence = false;
@@ -149,9 +161,22 @@ class SlideDatasetOp : public UnaryDatasetOpKernel {
DCHECK(*end_of_sequence);
return Status::OK();
}
- // Cache the data used for the next iteration.
- for (size_t i = stride; i < window_size; ++i) {
- cache_.emplace_back(batch_elements[i]);
+
+ if (stride < window_size) {
+ // Cache the data used for the next iteration.
+ for (size_t i = stride; i < window_size; ++i) {
+ cache_.emplace_back(batch_elements[i]);
+ }
+ } else if (stride > window_size) {
+ // Drop the data before the next iteration.
+ std::vector<Tensor> batch_element_tuple;
+ for (size_t i = window_size; i < stride && !*end_of_sequence; ++i) {
+ TF_RETURN_IF_ERROR(input_impl_->GetNext(ctx, &batch_element_tuple,
+ end_of_sequence));
+ if (*end_of_sequence) {
+ input_impl_.reset();
+ }
+ }
}
}
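A minimal, self-contained sketch of the window/stride semantics the kernel above implements: overlapping windows when stride < window_size, `batch`-like behaviour when they are equal, and dropped elements when stride > window_size. The helper and names below are illustrative only and are not part of the kernel.

    #include <iostream>
    #include <vector>

    // Produce windows of `window_size` elements, starting a new window every
    // `stride` elements, mirroring the cache/drop logic of SlideDatasetOp.
    std::vector<std::vector<int>> SlideWindows(const std::vector<int>& input,
                                               size_t window_size, size_t stride) {
      std::vector<std::vector<int>> windows;
      for (size_t start = 0; start + window_size <= input.size(); start += stride) {
        windows.emplace_back(input.begin() + start,
                             input.begin() + start + window_size);
      }
      return windows;
    }

    int main() {
      const std::vector<int> data = {0, 1, 2, 3, 4, 5, 6, 7};
      // stride < window_size: {0,1,2}, {2,3,4}, {4,5,6} -- tail elements reused.
      // With stride = 5 (> window_size) the result would be {0,1,2}, {5,6,7},
      // i.e. elements 3 and 4 are dropped, as the warning above points out.
      for (const auto& w : SlideWindows(data, /*window_size=*/3, /*stride=*/2)) {
        for (int v : w) std::cout << v << ' ';
        std::cout << '\n';
      }
      return 0;
    }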
diff --git a/tensorflow/core/kernels/mkl_conv_ops.cc b/tensorflow/core/kernels/mkl_conv_ops.cc
index f2b14f1278..1d0edb10b3 100644
--- a/tensorflow/core/kernels/mkl_conv_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_ops.cc
@@ -59,7 +59,8 @@ namespace tensorflow {
#ifndef INTEL_MKL_ML
-struct ConvFwdDimensions {
+// This structure aggregates multiple inputs to Conv2DFwd* methods.
+struct MklConvFwdParams {
memory::dims src_dims;
memory::dims filter_dims;
memory::dims bias_dims;
@@ -69,48 +70,56 @@ struct ConvFwdDimensions {
memory::dims padding_left;
memory::dims padding_right;
- ConvFwdDimensions(memory::dims src_dims,
- memory::dims filter_dims, memory::dims bias_dims,
- memory::dims dst_dims, memory::dims strides,
- memory::dims dilations, memory::dims padding_left,
- memory::dims padding_right) :
- src_dims(src_dims), filter_dims(filter_dims),
- bias_dims(bias_dims), dst_dims(dst_dims),
- strides(strides), dilations(dilations),
- padding_left(padding_left), padding_right(padding_right) {
- }
+ MklConvFwdParams(memory::dims src_dims, memory::dims filter_dims,
+ memory::dims bias_dims, memory::dims dst_dims,
+ memory::dims strides, memory::dims dilations,
+ memory::dims padding_left, memory::dims padding_right)
+ : src_dims(src_dims),
+ filter_dims(filter_dims),
+ bias_dims(bias_dims),
+ dst_dims(dst_dims),
+ strides(strides),
+ dilations(dilations),
+ padding_left(padding_left),
+ padding_right(padding_right) {}
};
template <typename T>
-class Conv2DFwd : public DnnOp {
+class MklConv2DFwdPrimitive : public MklPrimitive {
public:
- explicit Conv2DFwd(const ConvFwdDimensions& convFwdDims) {
- fwd_stream_.reset(new stream(stream::kind::eager));
+ explicit MklConv2DFwdPrimitive(const MklConvFwdParams& convFwdDims)
+ : cpu_engine_(engine::cpu, 0) {
+ context_.fwd_stream.reset(new stream(stream::kind::eager));
// create conv primitive
- if (conv_fwd_ == nullptr) {
+ if (context_.conv_fwd == nullptr) {
Setup(convFwdDims);
}
}
- ~Conv2DFwd() {}
+ ~MklConv2DFwdPrimitive() {}
// Convolution forward execute with bias
// src_data: input data buffer of src
// filter_data: input data buffer of filter (weights)
// bias_data: input data buffer of bias
// dst_data: output data buffer of dst
- void Execute(T* src_data, T* filter_data, T* bias_data, T* dst_data) {
- src_mem_->set_data_handle(static_cast<void*>(src_data));
- filter_mem_->set_data_handle(static_cast<void*>(filter_data));
- bias_mem_->set_data_handle(static_cast<void*>(bias_data));
- dst_mem_->set_data_handle(static_cast<void*>(dst_data));
- fwd_stream_->submit(fwd_primitives_);
+ void Execute(const T* src_data, const T* filter_data, const T* bias_data,
+ const T* dst_data) {
+ context_.src_mem->set_data_handle(
+ static_cast<void*>(const_cast<T*>(src_data)));
+ context_.filter_mem->set_data_handle(
+ static_cast<void*>(const_cast<T*>(filter_data)));
+ context_.bias_mem->set_data_handle(
+ static_cast<void*>(const_cast<T*>(bias_data)));
+ context_.dst_mem->set_data_handle(
+ static_cast<void*>(const_cast<T*>(dst_data)));
+ context_.fwd_stream->submit(context_.fwd_primitives);
// after exec, set data handle back
- src_mem_->set_data_handle(DummyData);
- filter_mem_->set_data_handle(DummyData);
- bias_mem_->set_data_handle(DummyData);
- dst_mem_->set_data_handle(DummyData);
+ context_.src_mem->set_data_handle(DummyData);
+ context_.filter_mem->set_data_handle(DummyData);
+ context_.bias_mem->set_data_handle(DummyData);
+ context_.dst_mem->set_data_handle(DummyData);
return;
}
@@ -119,139 +128,177 @@ class Conv2DFwd : public DnnOp {
// src_data: input data buffer of src
// filter_data: input data buffer of filter (weights)
// dst_data: output data buffer of dst
- void Execute(T* src_data, T* filter_data, T* dst_data) {
- src_mem_->set_data_handle(static_cast<void*>(src_data));
- filter_mem_->set_data_handle(static_cast<void*>(filter_data));
- dst_mem_->set_data_handle(static_cast<void*>(dst_data));
- fwd_stream_->submit(fwd_primitives_);
-
- // after exec, set data handle back
- src_mem_->set_data_handle(DummyData);
- filter_mem_->set_data_handle(DummyData);
- dst_mem_->set_data_handle(DummyData);
-
- return;
+ void Execute(const T* src_data, const T* filter_data, const T* dst_data) {
+ context_.src_mem->set_data_handle(
+ static_cast<void*>(const_cast<T*>(src_data)));
+ context_.filter_mem->set_data_handle(
+ static_cast<void*>(const_cast<T*>(filter_data)));
+ context_.dst_mem->set_data_handle(
+ static_cast<void*>(const_cast<T*>(dst_data)));
+ context_.fwd_stream->submit(context_.fwd_primitives);
+
+ // after execution, set data handle back
+ context_.src_mem->set_data_handle(DummyData);
+ context_.filter_mem->set_data_handle(DummyData);
+ context_.dst_mem->set_data_handle(DummyData);
}
- // expected memory format for this primitive instance
- memory::format src_fmt_;
- memory::format filter_fmt_;
+ memory::format GetSrcMemoryFormat() const { return context_.src_fmt; }
- // convolution primitive
- std::shared_ptr<mkldnn::convolution_forward::primitive_desc> fwd_pd_;
- std::shared_ptr<mkldnn::primitive> conv_fwd_;
+ memory::format GetFilterMemoryFormat() const { return context_.filter_fmt; }
+
+ std::shared_ptr<mkldnn::convolution_forward::primitive_desc>
+ GetPrimitiveDesc() const {
+ return context_.fwd_pd;
+ }
private:
- void Setup(const ConvFwdDimensions& convFwdDims) {
+ // Primitive reuse context for Conv2D Fwd op
+ struct ConvFwdContext {
+ // expected memory format for this primitive instance
+ memory::format src_fmt;
+ memory::format filter_fmt;
+
+ // MKLDNN memory
+ std::shared_ptr<mkldnn::memory> src_mem;
+ std::shared_ptr<mkldnn::memory> filter_mem;
+ std::shared_ptr<mkldnn::memory> bias_mem;
+ std::shared_ptr<mkldnn::memory> dst_mem;
+
+ // desc & prmitive desc
+ std::shared_ptr<mkldnn::convolution_forward::desc> fwd_desc;
+
+ // memory desc
+ std::shared_ptr<mkldnn::memory::desc> src_md;
+ std::shared_ptr<mkldnn::memory::desc> filter_md;
+ std::shared_ptr<mkldnn::memory::desc> bias_md;
+ std::shared_ptr<mkldnn::memory::desc> dst_md;
+
+ // convolution primitive
+ std::shared_ptr<mkldnn::convolution_forward::primitive_desc> fwd_pd;
+ std::shared_ptr<mkldnn::primitive> conv_fwd;
+
+ std::shared_ptr<mkldnn::stream> fwd_stream;
+ std::vector<mkldnn::primitive> fwd_primitives;
+
+ ConvFwdContext()
+ : src_fmt(memory::format::any),
+ filter_fmt(memory::format::any),
+ src_mem(nullptr),
+ filter_mem(nullptr),
+ bias_mem(nullptr),
+ dst_mem(nullptr),
+ fwd_desc(nullptr),
+ src_md(nullptr),
+ filter_md(nullptr),
+ bias_md(nullptr),
+ fwd_pd(nullptr),
+ conv_fwd(nullptr),
+ fwd_stream(nullptr) {}
+ };
+
+ void Setup(const MklConvFwdParams& convFwdDims) {
// create memory descriptors for convolution data w/ no specified format
- src_md_.reset(new memory::desc({convFwdDims.src_dims},
- MklDnnType<T>(), memory::format::any));
+ context_.src_md.reset(new memory::desc(
+ {convFwdDims.src_dims}, MklDnnType<T>(), memory::format::any));
- filter_md_.reset(new memory::desc({convFwdDims.filter_dims},
- MklDnnType<T>(), memory::format::any));
+ context_.filter_md.reset(new memory::desc(
+ {convFwdDims.filter_dims}, MklDnnType<T>(), memory::format::any));
- dst_md_.reset(new memory::desc({convFwdDims.dst_dims},
- MklDnnType<T>(), memory::format::any));
+ context_.dst_md.reset(new memory::desc(
+ {convFwdDims.dst_dims}, MklDnnType<T>(), memory::format::any));
if (!convFwdDims.bias_dims.empty())
- bias_md_.reset(new memory::desc({convFwdDims.bias_dims},
- MklDnnType<T>(), memory::format::any));
+ context_.bias_md.reset(new memory::desc(
+ {convFwdDims.bias_dims}, MklDnnType<T>(), memory::format::any));
// create a convolution
if (!convFwdDims.bias_dims.empty()) {
- fwd_desc_.reset(new convolution_forward::desc(prop_kind::forward,
- convolution_direct, *src_md_, *filter_md_, *bias_md_, *dst_md_,
+ context_.fwd_desc.reset(new convolution_forward::desc(
+ prop_kind::forward, convolution_direct, *context_.src_md,
+ *context_.filter_md, *context_.bias_md, *context_.dst_md,
convFwdDims.strides, convFwdDims.dilations, convFwdDims.padding_left,
convFwdDims.padding_right, padding_kind::zero));
} else {
- fwd_desc_.reset(new convolution_forward::desc(prop_kind::forward,
- convolution_direct, *src_md_, *filter_md_, *dst_md_,
- convFwdDims.strides, convFwdDims.dilations, convFwdDims.padding_left,
+ context_.fwd_desc.reset(new convolution_forward::desc(
+ prop_kind::forward, convolution_direct, *context_.src_md,
+ *context_.filter_md, *context_.dst_md, convFwdDims.strides,
+ convFwdDims.dilations, convFwdDims.padding_left,
convFwdDims.padding_right, padding_kind::zero));
}
- fwd_pd_.reset(new convolution_forward::primitive_desc(
- *fwd_desc_, cpu_engine_));
+ context_.fwd_pd.reset(new convolution_forward::primitive_desc(
+ *context_.fwd_desc, cpu_engine_));
// store the expected memory format
- src_fmt_ = static_cast<mkldnn::memory::format>(
- fwd_pd_.get()->src_primitive_desc().desc().data.format);
+ context_.src_fmt = static_cast<mkldnn::memory::format>(
+ context_.fwd_pd.get()->src_primitive_desc().desc().data.format);
- filter_fmt_ = static_cast<mkldnn::memory::format>(
- fwd_pd_.get()->weights_primitive_desc().desc().data.format);
+ context_.filter_fmt = static_cast<mkldnn::memory::format>(
+ context_.fwd_pd.get()->weights_primitive_desc().desc().data.format);
// create memory primitive based on dummy data
- src_mem_.reset(new memory(fwd_pd_.get()->src_primitive_desc(), DummyData));
- filter_mem_.reset(new memory(fwd_pd_.get()->weights_primitive_desc(),
- DummyData));
- dst_mem_.reset(new memory(fwd_pd_.get()->dst_primitive_desc(), DummyData));
+ context_.src_mem.reset(
+ new memory(context_.fwd_pd.get()->src_primitive_desc(), DummyData));
+ context_.filter_mem.reset(
+ new memory(context_.fwd_pd.get()->weights_primitive_desc(), DummyData));
+ context_.dst_mem.reset(
+ new memory(context_.fwd_pd.get()->dst_primitive_desc(), DummyData));
// create convolution primitive and add it to net
if (!convFwdDims.bias_dims.empty()) {
- bias_mem_.reset(new memory({{{convFwdDims.bias_dims}, MklDnnType<T>(),
- memory::format::x}, cpu_engine_}, DummyData));
- conv_fwd_.reset(new convolution_forward(*fwd_pd_, *src_mem_,
- *filter_mem_, *bias_mem_, *dst_mem_));
+ context_.bias_mem.reset(new memory(
+ {{{convFwdDims.bias_dims}, MklDnnType<T>(), memory::format::x},
+ cpu_engine_},
+ DummyData));
+ context_.conv_fwd.reset(new convolution_forward(
+ *context_.fwd_pd, *context_.src_mem, *context_.filter_mem,
+ *context_.bias_mem, *context_.dst_mem));
} else {
- conv_fwd_.reset(new convolution_forward(*fwd_pd_, *src_mem_,
- *filter_mem_, *dst_mem_));
+ context_.conv_fwd.reset(
+ new convolution_forward(*context_.fwd_pd, *context_.src_mem,
+ *context_.filter_mem, *context_.dst_mem));
}
- fwd_primitives_.push_back(*conv_fwd_);
+ context_.fwd_primitives.push_back(*context_.conv_fwd);
return;
}
- // MKLDNN memory
- std::shared_ptr<mkldnn::memory> src_mem_;
- std::shared_ptr<mkldnn::memory> filter_mem_;
- std::shared_ptr<mkldnn::memory> bias_mem_;
- std::shared_ptr<mkldnn::memory> dst_mem_;
-
- std::shared_ptr<mkldnn::stream> fwd_stream_;
- std::vector<mkldnn::primitive> fwd_primitives_;
-
- // desc & prmitive desc
- std::shared_ptr<mkldnn::convolution_forward::desc> fwd_desc_;
-
- // memory desc
- std::shared_ptr<mkldnn::memory::desc> src_md_;
- std::shared_ptr<mkldnn::memory::desc> filter_md_;
- std::shared_ptr<mkldnn::memory::desc> bias_md_;
- std::shared_ptr<mkldnn::memory::desc> dst_md_;
-
- engine cpu_engine_ = engine(engine::cpu, 0);
+ struct ConvFwdContext context_;
+ engine cpu_engine_;
};
template <typename T>
-class Conv2DFwdFactory : public DnnOpFactory<T> {
+class MklConv2DFwdPrimitiveFactory : public MklPrimitiveFactory<T> {
public:
- static Conv2DFwd<T>* Get(const ConvFwdDimensions& convFwdDims) {
- Conv2DFwd<T>* conv2d_fwd = nullptr;
-
- // try to find a suitable one in pool
- conv2d_fwd = dynamic_cast<Conv2DFwd<T>*> (
- Conv2DFwdFactory<T>::GetInstance().GetConv2DFwd(convFwdDims));
-
- if (conv2d_fwd == nullptr) {
- conv2d_fwd = new Conv2DFwd<T>(convFwdDims);
- Conv2DFwdFactory<T>::GetInstance().SetConv2DFwd(
- convFwdDims, conv2d_fwd);
- }
- return conv2d_fwd;
+ static MklConv2DFwdPrimitive<T>* Get(const MklConvFwdParams& convFwdDims) {
+ MklConv2DFwdPrimitive<T>* conv2d_fwd = nullptr;
+
+ // try to find a suitable one in pool
+ conv2d_fwd = dynamic_cast<MklConv2DFwdPrimitive<T>*>(
+ MklConv2DFwdPrimitiveFactory<T>::GetInstance().GetConv2DFwd(
+ convFwdDims));
+
+ if (conv2d_fwd == nullptr) {
+ conv2d_fwd = new MklConv2DFwdPrimitive<T>(convFwdDims);
+ MklConv2DFwdPrimitiveFactory<T>::GetInstance().SetConv2DFwd(convFwdDims,
+ conv2d_fwd);
+ }
+ return conv2d_fwd;
}
private:
- Conv2DFwdFactory() {}
- ~Conv2DFwdFactory() {}
+ MklConv2DFwdPrimitiveFactory() {}
+ ~MklConv2DFwdPrimitiveFactory() {}
static const int kDilationH = 0, kDilationW = 1;
- static Conv2DFwdFactory& GetInstance() {
- static Conv2DFwdFactory instance_;
+ static MklConv2DFwdPrimitiveFactory& GetInstance() {
+ static MklConv2DFwdPrimitiveFactory instance_;
return instance_;
}
- static std::string CreateKey(const ConvFwdDimensions& convFwdDims) {
+ static std::string CreateKey(const MklConvFwdParams& convFwdDims) {
std::string prefix = "conv2d_fwd_";
FactoryKeyCreator key_creator;
key_creator.AddAsKey(prefix);
@@ -266,12 +313,12 @@ class Conv2DFwdFactory : public DnnOpFactory<T> {
return key_creator.GetKey();
}
- DnnOp* GetConv2DFwd(const ConvFwdDimensions& convFwdDims) {
+ MklPrimitive* GetConv2DFwd(const MklConvFwdParams& convFwdDims) {
std::string key = CreateKey(convFwdDims);
return this->GetOp(key);
}
- void SetConv2DFwd(const ConvFwdDimensions& convFwdDims, DnnOp *op) {
+ void SetConv2DFwd(const MklConvFwdParams& convFwdDims, MklPrimitive* op) {
std::string key = CreateKey(convFwdDims);
this->SetOp(key, op);
}
@@ -762,7 +809,6 @@ class MklConv2DOp : public OpKernel {
MklDnnData<T> src(&cpu_engine);
MklDnnData<T> filter(&cpu_engine);
- MklDnnData<T> dst(&cpu_engine); // output
memory::dims src_dims, filter_dims, padding_left, padding_right,
dilations, strides;
@@ -812,7 +858,6 @@ class MklConv2DOp : public OpKernel {
auto src_md = src_mkl_shape.IsMklTensor()
? src_mkl_shape.GetMklLayout()
: memory::desc(src_dims, MklDnnType<T>(), tf_fmt);
- src.SetUsrMem(src_md, &src_tensor);
// Although filter shape (filter_dims) required is in MKL-DNN order,
// the layout is Tensorflow's layout (HWIO).
@@ -820,29 +865,30 @@ class MklConv2DOp : public OpKernel {
? filter_mkl_shape.GetMklLayout()
: memory::desc(filter_dims, MklDnnType<T>(),
memory::format::hwio);
- filter.SetUsrMem(filter_md, &filter_tensor);
// MKLDNN dilation starts from 0.
dilations[kDilationH] -= 1;
dilations[kDilationW] -= 1;
// get a conv2d fwd from primitive pool
- Conv2DFwd<T> *conv2d_fwd = nullptr;
+ MklConv2DFwdPrimitive<T>* conv2d_fwd = nullptr;
if (biasEnabled) {
memory::dims bias_dims = {};
conv_utl.GetBiasSizeInMklOrder(kInputIndex_Bias, &bias_dims);
- ConvFwdDimensions convFwdDims(src_dims, filter_dims, bias_dims,
- dst_dims_mkl_order, strides, dilations, padding_left, padding_right);
- conv2d_fwd = Conv2DFwdFactory<T>::Get(convFwdDims);
+ MklConvFwdParams convFwdDims(src_dims, filter_dims, bias_dims,
+ dst_dims_mkl_order, strides, dilations,
+ padding_left, padding_right);
+ conv2d_fwd = MklConv2DFwdPrimitiveFactory<T>::Get(convFwdDims);
} else {
- ConvFwdDimensions convFwdDims(src_dims, filter_dims, NONE_DIMS,
- dst_dims_mkl_order, strides, dilations, padding_left, padding_right);
- conv2d_fwd = Conv2DFwdFactory<T>::Get(convFwdDims);
+ MklConvFwdParams convFwdDims(src_dims, filter_dims, NONE_DIMS,
+ dst_dims_mkl_order, strides, dilations,
+ padding_left, padding_right);
+ conv2d_fwd = MklConv2DFwdPrimitiveFactory<T>::Get(convFwdDims);
}
// allocate output tensors output_tensor and filter_out_tensor
- std::shared_ptr<mkldnn::convolution_forward::primitive_desc>
- conv_fwd_pd = conv2d_fwd->fwd_pd_;
+ std::shared_ptr<mkldnn::convolution_forward::primitive_desc> conv_fwd_pd =
+ conv2d_fwd->GetPrimitiveDesc();
AllocateOutputTensor(context, *conv_fwd_pd,
dst_dims_mkl_order, tf_fmt, &dst_tensor);
Tensor* filter_out_tensor = nullptr;
@@ -854,20 +900,28 @@ class MklConv2DOp : public OpKernel {
// check whether src/filter need reorder
std::vector<primitive> net;
- if (src_md.data.format != conv2d_fwd->src_fmt_)
- src.CheckReorderToOpMem(
- conv_fwd_pd.get()->src_primitive_desc(), &net);
-
- if (filter_md.data.format != conv2d_fwd->filter_fmt_)
- filter.CheckReorderToOpMem(
- conv_fwd_pd.get()->weights_primitive_desc(),
- filter.GetTensorBuffer(filter_out_tensor), &net);
+ T* src_data = nullptr;
+ if (src_md.data.format != conv2d_fwd->GetSrcMemoryFormat()) {
+ src.SetUsrMem(src_md, &src_tensor);
+ src.CheckReorderToOpMem(conv_fwd_pd.get()->src_primitive_desc(), &net);
+ src_data = static_cast<T*>(src.GetOpMem().get_data_handle());
+ } else {
+ src_data = static_cast<T*>(const_cast<T*>(src_tensor.flat<T>().data()));
+ }
+ T* filter_data = nullptr;
+ if (filter_md.data.format != conv2d_fwd->GetFilterMemoryFormat()) {
+ filter.SetUsrMem(filter_md, &filter_tensor);
+ filter.CheckReorderToOpMem(conv_fwd_pd.get()->weights_primitive_desc(),
+ filter.GetTensorBuffer(filter_out_tensor),
+ &net);
+ filter_data = static_cast<T*>(filter.GetOpMem().get_data_handle());
+ } else {
+ filter_data =
+ static_cast<T*>(const_cast<T*>(filter_tensor.flat<T>().data()));
+ }
+
stream(stream::kind::eager).submit(net).wait();
- T* src_data = static_cast<T*>(
- src.GetOpMem().get_data_handle());
- T* filter_data = static_cast<T*>(
- filter.GetOpMem().get_data_handle());
// execute convolution
if (biasEnabled) {
diff --git a/tensorflow/core/kernels/reduction_gpu_kernels.cu.h b/tensorflow/core/kernels/reduction_gpu_kernels.cu.h
index 6655084045..9af4cc23b6 100644
--- a/tensorflow/core/kernels/reduction_gpu_kernels.cu.h
+++ b/tensorflow/core/kernels/reduction_gpu_kernels.cu.h
@@ -295,7 +295,7 @@ __global__ void ColumnReduceMax16ColumnsKernel(
// 1D array necessary due to bug in CUDA 9 compiler.
// TODO(nluehr) revert to 2D array when compiler is ready.
- // This is the mimic the following, but without any constructors:
+ // This is to mimic the following, but without any constructors:
// __shared__ storage_type<value_type> partial_sums[32 * 33];
__shared__ __align__(
alignof(value_type)) char partial_sums_raw[32 * 33 * sizeof(value_type)];
diff --git a/tensorflow/core/kernels/segment_reduction_ops.h b/tensorflow/core/kernels/segment_reduction_ops.h
index d65692a552..d28e35157b 100644
--- a/tensorflow/core/kernels/segment_reduction_ops.h
+++ b/tensorflow/core/kernels/segment_reduction_ops.h
@@ -16,6 +16,12 @@ limitations under the License.
#ifndef TENSORFLOW_CORE_KERNELS_SEGMENT_REDUCTION_OPS_H_
#define TENSORFLOW_CORE_KERNELS_SEGMENT_REDUCTION_OPS_H_
+// This file requires the following include because it uses CudaAtomicMax:
+// #include "tensorflow/core/util/cuda_kernel_helper.h"
+
+// Unfortunately we can't add the #include, since it breaks compilation for
+// non-GPU targets. This only breaks in clang, because it's more strict for
+// template code and CudaAtomicMax is used in template context.
// This file requires the following include because it uses CudaAtomicMax:
// #include "tensorflow/core/util/cuda_kernel_helper.h"
diff --git a/tensorflow/core/ops/math_ops.cc b/tensorflow/core/ops/math_ops.cc
index 262526846d..c229bd5a41 100644
--- a/tensorflow/core/ops/math_ops.cc
+++ b/tensorflow/core/ops/math_ops.cc
@@ -614,7 +614,13 @@ REGISTER_OP("ApproximateEqual")
.SetIsCommutative()
.Attr("T: numbertype")
.Attr("tolerance: float = 0.00001")
- .SetShapeFn(shape_inference::UnchangedShape);
+ .SetShapeFn([](InferenceContext* c) {
+ // The inputs 'x' and 'y' must have the same shape.
+ ShapeHandle data_x = c->input(0);
+ ShapeHandle data_y = c->input(1);
+ TF_RETURN_IF_ERROR(c->Merge(data_x, data_y, &data_x));
+ return shape_inference::UnchangedShape(c);
+ });
// --------------------------------------------------------------------------
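The new shape function rejects inputs whose shapes cannot be merged. A hedged, standalone sketch of that merge rule (same rank, and each dimension pair either equal or containing an unknown), independent of InferenceContext; the helper below is illustrative only.

    #include <iostream>
    #include <vector>

    constexpr int kUnknownDim = -1;

    // Returns true and writes the merged shape if x and y are compatible.
    bool MergeShapes(const std::vector<int>& x, const std::vector<int>& y,
                     std::vector<int>* merged) {
      if (x.size() != y.size()) return false;  // rank mismatch
      merged->clear();
      for (size_t i = 0; i < x.size(); ++i) {
        if (x[i] == kUnknownDim) {
          merged->push_back(y[i]);
        } else if (y[i] == kUnknownDim || y[i] == x[i]) {
          merged->push_back(x[i]);
        } else {
          return false;  // conflicting known dimensions
        }
      }
      return true;
    }

    int main() {
      std::vector<int> merged;
      std::cout << MergeShapes({2, kUnknownDim}, {2, 3}, &merged) << '\n';  // 1
      std::cout << MergeShapes({2, 3}, {4, 3}, &merged) << '\n';            // 0
      return 0;
    }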
diff --git a/tensorflow/core/platform/cloud/oauth_client.cc b/tensorflow/core/platform/cloud/oauth_client.cc
index e64653a67a..ee6ba7b041 100644
--- a/tensorflow/core/platform/cloud/oauth_client.cc
+++ b/tensorflow/core/platform/cloud/oauth_client.cc
@@ -137,8 +137,8 @@ Status EncodeJwtClaim(StringPiece client_email, StringPiece scope,
const auto expiration_timestamp_sec =
request_timestamp_sec + kRequestedTokenLifetimeSec;
- root["iat"] = request_timestamp_sec;
- root["exp"] = expiration_timestamp_sec;
+ root["iat"] = Json::Value::UInt64(request_timestamp_sec);
+ root["exp"] = Json::Value::UInt64(expiration_timestamp_sec);
// Step 2: represent the JSON as a string.
string claim = root.toStyledString();
diff --git a/tensorflow/core/platform/default/build_config.bzl b/tensorflow/core/platform/default/build_config.bzl
index a319ccbdbe..66ccd81e41 100644
--- a/tensorflow/core/platform/default/build_config.bzl
+++ b/tensorflow/core/platform/default/build_config.bzl
@@ -202,7 +202,10 @@ def cc_proto_library(
)
if use_grpc_plugin:
- cc_libs += ["//external:grpc_lib"]
+ cc_libs += select({
+ "//tensorflow:linux_s390x": ["//external:grpc_lib_unsecure"],
+ "//conditions:default": ["//external:grpc_lib"],
+ })
if default_header:
header_only_name = name
diff --git a/tensorflow/core/platform/windows/port.cc b/tensorflow/core/platform/windows/port.cc
index 174f41a993..f2aaf13bec 100644
--- a/tensorflow/core/platform/windows/port.cc
+++ b/tensorflow/core/platform/windows/port.cc
@@ -171,5 +171,10 @@ int64 AvailableRam() {
return INT64_MAX;
}
+int NumHyperthreadsPerCore() {
+ static const int ht_per_core = tensorflow::port::CPUIDNumSMT();
+ return (ht_per_core > 0) ? ht_per_core : 1;
+}
+
} // namespace port
} // namespace tensorflow
diff --git a/tensorflow/core/profiler/internal/tfprof_timeline.cc b/tensorflow/core/profiler/internal/tfprof_timeline.cc
index b0dd8ce5e0..979b437914 100644
--- a/tensorflow/core/profiler/internal/tfprof_timeline.cc
+++ b/tensorflow/core/profiler/internal/tfprof_timeline.cc
@@ -47,9 +47,9 @@ Json::Value ChromeTraceFormatter::CreateEvent(const string& ph,
event["ph"] = Json::Value(ph);
event["cat"] = Json::Value(category);
event["name"] = Json::Value(name);
- event["pid"] = Json::Value(pid);
- event["tid"] = Json::Value(tid);
- event["ts"] = Json::Value(ts);
+ event["pid"] = Json::Int64(pid);
+ event["tid"] = Json::Int64(tid);
+ event["ts"] = Json::Int64(ts);
return event;
}
@@ -57,7 +57,7 @@ void ChromeTraceFormatter::EmitPID(const string& name, int64 pid) {
Json::Value event(Json::objectValue);
event["name"] = Json::Value("process_name");
event["ph"] = Json::Value("M");
- event["pid"] = Json::Value(pid);
+ event["pid"] = Json::Int64(pid);
Json::Value args(Json::objectValue);
args["name"] = Json::Value(name);
event["args"] = args;
@@ -68,7 +68,7 @@ void ChromeTraceFormatter::EmitRegion(int64 ts, int64 duration, int64 pid,
int64 tid, const string& category,
const string& name, Json::Value args) {
Json::Value event = CreateEvent("X", category, name, pid, tid, ts);
- event["dur"] = Json::Value(duration);
+ event["dur"] = Json::Int64(duration);
event["args"] = std::move(args);
metadata_.push_back(event);
}
@@ -76,14 +76,14 @@ void ChromeTraceFormatter::EmitRegion(int64 ts, int64 duration, int64 pid,
void ChromeTraceFormatter::EmitFlowStart(const string& name, int64 ts,
int64 pid, int64 tid, int64 flow_id) {
Json::Value event = CreateEvent("s", "DataFlow", name, pid, tid, ts);
- event["id"] = flow_id;
+ event["id"] = Json::Int64(flow_id);
events_.push_back(event);
}
void ChromeTraceFormatter::EmitFlowEnd(const string& name, int64 ts, int64 pid,
int64 tid, int64 flow_id) {
Json::Value event = CreateEvent("t", "DataFlow", name, pid, tid, ts);
- event["id"] = flow_id;
+ event["id"] = Json::Int64(flow_id);
events_.push_back(event);
}
@@ -93,7 +93,7 @@ void ChromeTraceFormatter::EmitCounter(
const std::map<int64, std::vector<string>>& tensor_mem) {
Json::Value event = CreateEvent("C", category, "Allocated Bytes", pid, 0, ts);
Json::Value args(Json::objectValue);
- args["Allocator Bytes in Use"] = Json::Value(bytes);
+ args["Allocator Bytes in Use"] = Json::Int64(bytes);
event["args"] = args;
events_.push_back(event);
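The Json::Int64 wrappers above construct each value through jsoncpp's explicit 64-bit typedef; presumably this selects an unambiguous Json::Value constructor when TensorFlow's int64 does not exactly match any of the overloads. A small standalone sketch, assuming jsoncpp is available as <json/json.h>:

    #include <iostream>
    #include <json/json.h>  // jsoncpp

    int main() {
      const long long pid = 1234567890123LL;  // stands in for tensorflow::int64
      Json::Value event(Json::objectValue);
      // Json::Int64 / Json::UInt64 are jsoncpp's 64-bit typedefs, so these
      // assignments pick the 64-bit Value constructors explicitly.
      event["pid"] = Json::Value(Json::Int64(pid));
      event["iat"] = Json::Value(Json::UInt64(1530230000));
      std::cout << event.toStyledString() << std::endl;
      return 0;
    }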
diff --git a/tensorflow/core/util/mkl_util.h b/tensorflow/core/util/mkl_util.h
index 90b6533690..b5e42f5384 100644
--- a/tensorflow/core/util/mkl_util.h
+++ b/tensorflow/core/util/mkl_util.h
@@ -1814,11 +1814,11 @@ class MklDnnData {
}
};
-/// Base class for operations with reuse of DNN primitives
+/// Base class for operations with reuse of primitives
///
-class DnnOp {
+class MklPrimitive {
public:
- virtual ~DnnOp() {}
+ virtual ~MklPrimitive() {}
// Dummy data. Its size, hard-coded as 256 here, does
// not matter since MKL should never operate on this buffer.
@@ -1826,33 +1826,33 @@ class DnnOp {
};
const mkldnn::memory::dims NONE_DIMS = {};
-// This constant is used to declare dummy buffer (size), for MKL primitives
+
template <typename T>
-class DnnOpFactory {
+class MklPrimitiveFactory {
public:
- DnnOpFactory() {}
- ~DnnOpFactory() {}
+ MklPrimitiveFactory() {}
+ ~MklPrimitiveFactory() {}
- DnnOp* GetOp(const std::string& key) {
- auto stream_iter = DnnOpFactory<T>::GetHashMap().find(key);
- if (stream_iter == DnnOpFactory<T>::GetHashMap().end()) {
+ MklPrimitive* GetOp(const std::string& key) {
+ auto stream_iter = MklPrimitiveFactory<T>::GetHashMap().find(key);
+ if (stream_iter == MklPrimitiveFactory<T>::GetHashMap().end()) {
return nullptr;
} else {
return stream_iter->second;
}
}
- void SetOp(const std::string& key, DnnOp* op) {
- auto stream_iter = DnnOpFactory<T>::GetHashMap().find(key);
+ void SetOp(const std::string& key, MklPrimitive* op) {
+ auto stream_iter = MklPrimitiveFactory<T>::GetHashMap().find(key);
- CHECK(stream_iter == DnnOpFactory<T>::GetHashMap().end());
+ CHECK(stream_iter == MklPrimitiveFactory<T>::GetHashMap().end());
- DnnOpFactory<T>::GetHashMap()[key] = op;
+ MklPrimitiveFactory<T>::GetHashMap()[key] = op;
}
private:
- static inline std::unordered_map<std::string, DnnOp*> &GetHashMap() {
- static thread_local std::unordered_map<std::string, DnnOp*> map_;
+ static inline std::unordered_map<std::string, MklPrimitive*>& GetHashMap() {
+ static thread_local std::unordered_map<std::string, MklPrimitive*> map_;
return map_;
}
};
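A self-contained sketch of the reuse pattern behind MklPrimitive and MklPrimitiveFactory, with names of my own choosing (none of the identifiers below are TensorFlow or MKL-DNN symbols): expensive setup runs once per distinct parameter key, the result is cached in a per-thread pool, and later calls only re-bind data buffers.

    #include <iostream>
    #include <memory>
    #include <string>
    #include <unordered_map>
    #include <vector>

    struct ConvParams {
      std::vector<int> src_dims, filter_dims;
      std::string Key() const {
        std::string key = "conv2d_fwd_";
        for (int d : src_dims) key += std::to_string(d) + "_";
        for (int d : filter_dims) key += std::to_string(d) + "_";
        return key;
      }
    };

    class ConvPrimitive {
     public:
      explicit ConvPrimitive(const ConvParams& params) : params_(params) {
        std::cout << "expensive setup for " << params_.Key() << '\n';
      }
      void Execute(const float* src, const float* filter, float* dst) {
        // Only re-binds buffers and runs; the setup cost is not paid again.
        (void)src; (void)filter;
        dst[0] = 0.0f;
      }
     private:
      ConvParams params_;
    };

    class ConvPrimitiveFactory {
     public:
      static ConvPrimitive* Get(const ConvParams& params) {
        static thread_local std::unordered_map<std::string,
                                               std::unique_ptr<ConvPrimitive>> pool;
        auto& slot = pool[params.Key()];
        if (slot == nullptr) slot.reset(new ConvPrimitive(params));
        return slot.get();
      }
    };

    int main() {
      const ConvParams params{{1, 28, 28, 3}, {3, 3, 3, 8}};
      float src[1] = {0}, filter[1] = {0}, dst[1] = {0};
      ConvPrimitiveFactory::Get(params)->Execute(src, filter, dst);  // builds
      ConvPrimitiveFactory::Get(params)->Execute(src, filter, dst);  // reuses
      return 0;
    }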
diff --git a/tensorflow/docs_src/get_started/index.md b/tensorflow/docs_src/get_started/index.md
new file mode 100644
index 0000000000..bd2a80d9ef
--- /dev/null
+++ b/tensorflow/docs_src/get_started/index.md
@@ -0,0 +1,29 @@
+# Get Started
+
+If you are new to machine learning, we recommend taking the following online
+course prior to diving into TensorFlow documentation:
+
+ * [Machine Learning Crash Course](https://developers.google.com/machine-learning/crash-course/),
+ which introduces machine learning concepts and encourages experimentation
+ with existing TensorFlow code.
+
+TensorFlow is a tool for machine learning. While it contains a wide range of
+functionality, TensorFlow is mainly designed for deep neural network models.
+
+The easiest way to get started with TensorFlow is by using Eager Execution.
+
+ * @{$get_started/eager} is for anyone new to machine learning or TensorFlow.
+
+TensorFlow provides many APIs. The remainder of this section focuses on the
+Estimator API, which provides scalable, high-performance models. See the
+@{$estimators} guide.
+
+For more advanced users:
+
+ * The @{$low_level_intro$Low Level Introduction} demonstrates how to use
+ TensorFlow outside of the Estimator framework, for debugging and
+ experimentation.
+ * The @{$guide$Programmer's Guide} details major
+ TensorFlow components.
+ * The @{$tutorials$Tutorials} provide walkthroughs of a variety of
+ TensorFlow models.
diff --git a/tensorflow/docs_src/guide/debugger.md b/tensorflow/docs_src/guide/debugger.md
index 5cf9af904a..dc4db58857 100644
--- a/tensorflow/docs_src/guide/debugger.md
+++ b/tensorflow/docs_src/guide/debugger.md
@@ -17,7 +17,7 @@ how to use the graphical user interface (GUI) of tfdbg, i.e., the
Note: The TensorFlow debugger uses a
[curses](https://en.wikipedia.org/wiki/Curses_\(programming_library\))-based text
user interface. On Mac OS X, the `ncurses` library is required and can be
-installed with `brew install homebrew/dupes/ncurses`. On Windows, curses isn't as
+installed with `brew install ncurses`. On Windows, curses isn't as
well supported, so a [readline](https://en.wikipedia.org/wiki/GNU_Readline)-based
interface can be used with tfdbg by installing `pyreadline` with `pip`. If you
use Anaconda3, you can install it with a command such as
diff --git a/tensorflow/go/attrs.go b/tensorflow/go/attrs.go
new file mode 100644
index 0000000000..f86c5737bc
--- /dev/null
+++ b/tensorflow/go/attrs.go
@@ -0,0 +1,245 @@
+/*
+Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package tensorflow
+
+// #include <stdlib.h>
+// #include "tensorflow/c/c_api.h"
+import "C"
+import (
+ "fmt"
+ "unsafe"
+)
+
+// makeCShape converts a shape specified in C.int64_t into a Shape.
+func makeCShape(shape []C.int64_t) Shape {
+ s := Shape{dims: make([]int64, len(shape))}
+ for i, n := range shape {
+ s.dims[i] = int64(n)
+ }
+ return s
+}
+
+// Attr returns the value of an attribute on op. It returns an error if the
+// attribute does not exist.
+func (op *Operation) Attr(name string) (interface{}, error) {
+ cname := C.CString(name)
+ defer C.free(unsafe.Pointer(cname))
+
+ status := newStatus()
+ meta := C.TF_OperationGetAttrMetadata(op.c, cname, status.c)
+ if err := status.Err(); err != nil {
+ return nil, err
+ }
+
+ if meta.is_list == 1 {
+ return listAttribute(op, cname, meta)
+ }
+ return scalarAttribute(op, cname, meta)
+}
+
+func listAttribute(op *Operation, cname *C.char, meta C.TF_AttrMetadata) (interface{}, error) {
+ status := newStatus()
+
+ switch meta._type {
+ case C.TF_ATTR_STRING:
+ if meta.list_size == 0 {
+ return []string(nil), nil
+ }
+ values := make([]unsafe.Pointer, meta.list_size)
+ lengths := make([]C.size_t, meta.list_size)
+ // Add one element in case total_size is zero.
+ storage := make([]C.char, meta.total_size+1)
+ C.TF_OperationGetAttrStringList(op.c, cname, &values[0], &lengths[0], C.int(meta.list_size), unsafe.Pointer(&storage[0]), C.size_t(meta.total_size), status.c)
+ if err := status.Err(); err != nil {
+ return nil, err
+ }
+ list := make([]string, meta.list_size)
+ for i, val := range values {
+ length := lengths[i]
+ list[i] = C.GoStringN((*C.char)(val), C.int(length))
+ }
+ return list, nil
+
+ case C.TF_ATTR_INT:
+ if meta.list_size == 0 {
+ return []int64(nil), nil
+ }
+ list := make([]C.int64_t, meta.list_size)
+ C.TF_OperationGetAttrIntList(op.c, cname, &list[0], C.int(meta.list_size), status.c)
+ if err := status.Err(); err != nil {
+ return nil, err
+ }
+ vals := make([]int64, meta.list_size)
+ for i, val := range list {
+ vals[i] = int64(val)
+ }
+ return vals, nil
+
+ case C.TF_ATTR_FLOAT:
+ if meta.list_size == 0 {
+ return []float32(nil), nil
+ }
+ list := make([]C.float, meta.list_size)
+ C.TF_OperationGetAttrFloatList(op.c, cname, &list[0], C.int(meta.list_size), status.c)
+ if err := status.Err(); err != nil {
+ return nil, err
+ }
+ vals := make([]float32, meta.list_size)
+ for i, val := range list {
+ vals[i] = float32(val)
+ }
+ return vals, nil
+
+ case C.TF_ATTR_BOOL:
+ if meta.list_size == 0 {
+ return []bool(nil), nil
+ }
+ list := make([]C.uchar, meta.list_size)
+ C.TF_OperationGetAttrBoolList(op.c, cname, &list[0], C.int(meta.list_size), status.c)
+ if err := status.Err(); err != nil {
+ return nil, err
+ }
+ vals := make([]bool, meta.list_size)
+ for i, val := range list {
+ vals[i] = val == 1
+ }
+ return vals, nil
+
+ case C.TF_ATTR_TYPE:
+ if meta.list_size == 0 {
+ return []DataType(nil), nil
+ }
+ list := make([]C.TF_DataType, meta.list_size)
+ C.TF_OperationGetAttrTypeList(op.c, cname, &list[0], C.int(meta.list_size), status.c)
+ if err := status.Err(); err != nil {
+ return nil, err
+ }
+ vals := make([]DataType, meta.list_size)
+ for i, val := range list {
+ vals[i] = DataType(val)
+ }
+ return vals, nil
+
+ case C.TF_ATTR_TENSOR:
+ if meta.list_size == 0 {
+ return []*Tensor(nil), nil
+ }
+ list := make([]*C.TF_Tensor, meta.list_size)
+ C.TF_OperationGetAttrTensorList(op.c, cname, &list[0], C.int(meta.list_size), status.c)
+ if err := status.Err(); err != nil {
+ return nil, err
+ }
+ vals := make([]*Tensor, meta.list_size)
+ for i, t := range list {
+ vals[i] = newTensorFromC(t)
+ }
+ return vals, nil
+
+ case C.TF_ATTR_SHAPE:
+ if meta.list_size == 0 {
+ return []Shape(nil), nil
+ }
+ dims := make([]*C.int64_t, meta.list_size)
+ numDims := make([]C.int, meta.list_size)
+ // Add one element in case total_size is zero.
+ storage := make([]C.int64_t, meta.total_size+1)
+ C.TF_OperationGetAttrShapeList(op.c, cname, &dims[0], &numDims[0], C.int(meta.list_size), &storage[0], C.int(meta.total_size), status.c)
+ if err := status.Err(); err != nil {
+ return nil, err
+ }
+ list := make([]Shape, meta.list_size)
+ for i, dim := range dims {
+ numDim := numDims[i]
+ // If the number of dimensions is unknown, default to empty shape.
+ if numDim < 0 {
+ continue
+ }
+ // A []C.int64_t slice backed by C memory.
+ // See: https://github.com/golang/go/wiki/cgo#turning-c-arrays-into-go-slices
+ slice := (*[1 << 30]C.int64_t)(unsafe.Pointer(dim))[:numDim:numDim]
+ list[i] = makeCShape(slice)
+ }
+ return list, nil
+
+ default:
+ return nil, fmt.Errorf("list type %v not supported", meta._type)
+ }
+}
+
+func scalarAttribute(op *Operation, cname *C.char, meta C.TF_AttrMetadata) (interface{}, error) {
+ status := newStatus()
+
+ switch meta._type {
+ case C.TF_ATTR_STRING:
+ if meta.total_size == 0 {
+ return "", nil
+ }
+ v := make([]C.char, meta.total_size)
+ C.TF_OperationGetAttrString(op.c, cname, unsafe.Pointer(&v[0]), C.size_t(meta.total_size), status.c)
+ if err := status.Err(); err != nil {
+ return nil, err
+ }
+ return C.GoStringN(&v[0], C.int(meta.total_size)), nil
+
+ case C.TF_ATTR_INT:
+ var v C.int64_t
+ C.TF_OperationGetAttrInt(op.c, cname, &v, status.c)
+ return int64(v), status.Err()
+
+ case C.TF_ATTR_FLOAT:
+ var v C.float
+ C.TF_OperationGetAttrFloat(op.c, cname, &v, status.c)
+ return float32(v), status.Err()
+
+ case C.TF_ATTR_BOOL:
+ var v C.uchar
+ C.TF_OperationGetAttrBool(op.c, cname, &v, status.c)
+ return v == 1, status.Err()
+
+ case C.TF_ATTR_TYPE:
+ var v C.TF_DataType
+ C.TF_OperationGetAttrType(op.c, cname, &v, status.c)
+ return DataType(v), status.Err()
+
+ case C.TF_ATTR_TENSOR:
+ var v *C.TF_Tensor
+ C.TF_OperationGetAttrTensor(op.c, cname, &v, status.c)
+ if err := status.Err(); err != nil {
+ return nil, err
+ }
+ return newTensorFromC(v), nil
+
+ case C.TF_ATTR_SHAPE:
+ numDims := meta.total_size
+ // If number of dims is unknown return empty shape to indicate that.
+ if numDims < 0 {
+ return Shape{}, nil
+ }
+ if numDims == 0 {
+ return ScalarShape(), nil
+ }
+ dims := make([]C.int64_t, numDims)
+ C.TF_OperationGetAttrShape(op.c, cname, (*C.int64_t)(unsafe.Pointer(&dims[0])), C.int(numDims), status.c)
+ if err := status.Err(); err != nil {
+ return nil, err
+ }
+ return makeCShape(dims), nil
+
+ default:
+ return nil, fmt.Errorf("type %v not supported", meta._type)
+ }
+}
diff --git a/tensorflow/go/attrs_test.go b/tensorflow/go/attrs_test.go
new file mode 100644
index 0000000000..ea8af221ae
--- /dev/null
+++ b/tensorflow/go/attrs_test.go
@@ -0,0 +1,193 @@
+/*
+Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package tensorflow
+
+import (
+ "fmt"
+ "reflect"
+ "testing"
+)
+
+func TestOperationAttrs(t *testing.T) {
+ g := NewGraph()
+
+ i := 0
+ makeConst := func(v interface{}) Output {
+ op, err := Const(g, fmt.Sprintf("const/%d/%+v", i, v), v)
+ i++
+ if err != nil {
+ t.Fatal(err)
+ }
+ return op
+ }
+
+ makeTensor := func(v interface{}) *Tensor {
+ tensor, err := NewTensor(v)
+ if err != nil {
+ t.Fatal(err)
+ }
+ return tensor
+ }
+
+ cases := []OpSpec{
+ {
+ Name: "type",
+ Type: "Placeholder",
+ Attrs: map[string]interface{}{
+ "dtype": Float,
+ },
+ },
+ {
+ Name: "list(float)",
+ Type: "Bucketize",
+ Input: []Input{
+ makeConst([]float32{1, 2, 3, 4}),
+ },
+ Attrs: map[string]interface{}{
+ "boundaries": []float32{0, 1, 2, 3, 4, 5},
+ },
+ },
+ {
+ Name: "list(float) empty",
+ Type: "Bucketize",
+ Input: []Input{
+ makeConst([]float32{}),
+ },
+ Attrs: map[string]interface{}{
+ "boundaries": []float32(nil),
+ },
+ },
+ /* TODO(ashankar): debug this issue and add it back later.
+ {
+ Name: "list(type),list(shape)",
+ Type: "InfeedEnqueueTuple",
+ Input: []Input{
+ OutputList([]Output{
+ makeConst(float32(1)),
+ makeConst([][]int32{{2}}),
+ }),
+ },
+ Attrs: map[string]interface{}{
+ "dtypes": []DataType{Float, Int32},
+ "shapes": []Shape{ScalarShape(), MakeShape(1, 1)},
+ },
+ },
+ {
+ Name: "list(type),list(shape) empty",
+ Type: "InfeedEnqueueTuple",
+ Input: []Input{
+ OutputList([]Output{
+ makeConst([][]int32{{2}}),
+ }),
+ },
+ Attrs: map[string]interface{}{
+ "dtypes": []DataType{Int32},
+ "shapes": []Shape(nil),
+ },
+ },
+ {
+ Name: "list(type) empty,string empty,int",
+ Type: "_XlaSendFromHost",
+ Input: []Input{
+ OutputList([]Output{}),
+ makeConst(""),
+ },
+ Attrs: map[string]interface{}{
+ "Tinputs": []DataType(nil),
+ "key": "",
+ "device_ordinal": int64(0),
+ },
+ },
+ */
+ {
+ Name: "list(int),int",
+ Type: "StringToHashBucketStrong",
+ Input: []Input{
+ makeConst(""),
+ },
+ Attrs: map[string]interface{}{
+ "num_buckets": int64(2),
+ "key": []int64{1, 2},
+ },
+ },
+ {
+ Name: "list(int) empty,int",
+ Type: "StringToHashBucketStrong",
+ Input: []Input{
+ makeConst(""),
+ },
+ Attrs: map[string]interface{}{
+ "num_buckets": int64(2),
+ "key": ([]int64)(nil),
+ },
+ },
+ {
+ Name: "list(string),type",
+ Type: "TensorSummary",
+ Input: []Input{
+ makeConst(""),
+ },
+ Attrs: map[string]interface{}{
+ "T": String,
+ "labels": []string{"foo", "bar"},
+ },
+ },
+ {
+ Name: "list(string) empty,type",
+ Type: "TensorSummary",
+ Input: []Input{
+ makeConst(""),
+ },
+ Attrs: map[string]interface{}{
+ "T": String,
+ "labels": ([]string)(nil),
+ },
+ },
+ {
+ Name: "tensor",
+ Type: "Const",
+ Attrs: map[string]interface{}{
+ "dtype": String,
+ "value": makeTensor("foo"),
+ },
+ },
+ }
+
+ for i, spec := range cases {
+ op, err := g.AddOperation(spec)
+ if err != nil {
+ t.Fatal(err)
+ }
+ for key, want := range spec.Attrs {
+ out, err := op.Attr(key)
+ if err != nil {
+ t.Fatal(err)
+ }
+ if !reflect.DeepEqual(out, want) {
+ t.Fatalf("%d. %q: Got %#v, wanted %#v", i, key, out, want)
+ }
+ wantT, ok := want.(*Tensor)
+ if ok {
+ wantVal := wantT.Value()
+ outVal := out.(*Tensor).Value()
+ if !reflect.DeepEqual(outVal, wantVal) {
+ t.Fatalf("%d. %q: Got %#v, wanted %#v", i, key, outVal, wantVal)
+ }
+ }
+ }
+ }
+}
diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go
index b2dbdafc5f..6d9cb7c6ec 100644
--- a/tensorflow/go/op/wrappers.go
+++ b/tensorflow/go/op/wrappers.go
@@ -11210,7 +11210,7 @@ func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistorted
// SampleDistortedBoundingBoxAreaRange sets the optional area_range attribute to value.
//
// value: The cropped area of the image must contain a fraction of the
-// supplied image within in this range.
+// supplied image within this range.
// If not specified, defaults to <f:0.05 f:1 >
func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr {
return func(m optionalAttr) {
@@ -17969,9 +17969,10 @@ func SparseFillEmptyRowsGrad(scope *Scope, reverse_index_map tf.Output, grad_val
}
// Computes scaled exponential linear: `scale * alpha * (exp(features) - 1)`
-//
// if < 0, `scale * features` otherwise.
//
+// Assumes the weights have zero mean and variance 1.0 / fan_in.
+//
// See [Self-Normalizing Neural Networks](https://arxiv.org/abs/1706.02515)
func Selu(scope *Scope, features tf.Output) (activations tf.Output) {
if scope.Err() != nil {
@@ -21655,7 +21656,7 @@ func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr {
// generated sequentially as '*tag*/image/0', '*tag*/image/1', etc.
//
// The `bad_color` argument is the color to use in the generated images for
-// non-finite input values. It is a `unit8` 1-D tensor of length `channels`.
+// non-finite input values. It is a `uint8` 1-D tensor of length `channels`.
// Each element must be in the range `[0, 255]` (It represents the value of a
// pixel in the output image). Non-finite values in the input tensor are
// replaced by this tensor in the output image. The default value is the color
@@ -24048,7 +24049,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort
// SampleDistortedBoundingBoxV2AreaRange sets the optional area_range attribute to value.
//
// value: The cropped area of the image must contain a fraction of the
-// supplied image within in this range.
+// supplied image within this range.
// If not specified, defaults to <f:0.05 f:1 >
func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr {
return func(m optionalAttr) {
diff --git a/tensorflow/go/operation.go b/tensorflow/go/operation.go
index 8fcad61f4c..25ec718703 100644
--- a/tensorflow/go/operation.go
+++ b/tensorflow/go/operation.go
@@ -65,6 +65,11 @@ func (op *Operation) Output(i int) Output {
return Output{op, i}
}
+// NumInputs returns the number of inputs of op.
+func (op *Operation) NumInputs() int {
+ return int(C.TF_OperationNumInputs(op.c))
+}
+
// Output represents one of the outputs of an operation in the graph. Has a
// DataType (and eventually a Shape). May be passed as an input argument to a
// function for adding operations to a graph, or to a Session's Run() method to
@@ -123,6 +128,67 @@ func (p Output) c() C.TF_Output {
func (p Output) canBeAnInput() {}
+// Consumers returns the inputs that consume this output.
+func (p Output) Consumers() []Consumer {
+ max := int(C.TF_OperationOutputNumConsumers(p.c()))
+ if max == 0 {
+ return nil
+ }
+ inputs := make([]C.TF_Input, max)
+ n := C.TF_OperationOutputConsumers(p.c(), (*C.TF_Input)(unsafe.Pointer(&inputs[0])), C.int(max))
+ inputs = inputs[:int(n)]
+
+ var consumers []Consumer
+ for _, consumer := range inputs {
+ consumers = append(consumers, Consumer{
+ Index: int(consumer.index),
+ Op: &Operation{
+ c: consumer.oper,
+ g: p.Op.g,
+ },
+ })
+ }
+
+ return consumers
+}
+
+// Consumer identifies a specific input of an operation that consumes the output
+// of another operation.
+type Consumer struct {
+ // Op is the Operation that is consuming the output of another operation.
+ Op *Operation
+
+ // Index is the index of the input within Op that the output of another
+ // operation is connected to.
+ Index int
+}
+
+func (p Consumer) c() C.TF_Input {
+ if p.Op == nil {
+ // Attempt to provide a more useful panic message than "nil
+ // pointer dereference".
+ panic("nil-Operation. Consumer objects should only be created by a call to Output.Consumers")
+ }
+ return C.TF_Input{oper: p.Op.c, index: C.int(p.Index)}
+}
+
+// DataType returns the type of the input.
+func (p Consumer) DataType() DataType {
+ return DataType(C.TF_OperationInputType(p.c()))
+}
+
+// Producer returns the Output that is connected to this Consumer.
+func (p Consumer) Producer() Output {
+ output := C.TF_OperationInput(p.c())
+ return Output{
+ Op: &Operation{
+ c: output.oper,
+ g: p.Op.g,
+ },
+ Index: int(output.index),
+ }
+}
+
// Input is the interface for specifying inputs to an operation being added to
// a Graph.
//
diff --git a/tensorflow/go/operation_test.go b/tensorflow/go/operation_test.go
index 40c951ab8c..06b65bdfb7 100644
--- a/tensorflow/go/operation_test.go
+++ b/tensorflow/go/operation_test.go
@@ -166,6 +166,68 @@ func TestOutputDataTypeAndShape(t *testing.T) {
}
}
+func TestOperationInputs(t *testing.T) {
+ g := NewGraph()
+ x, err := Placeholder(g, "x", Float)
+ if err != nil {
+ t.Fatal(err)
+ }
+ y, err := Placeholder(g, "y", Float)
+ if err != nil {
+ t.Fatal(err)
+ }
+ add, err := Add(g, "add", x, y)
+ if err != nil {
+ t.Fatal(err)
+ }
+ addOp := add.Op
+
+ if out := addOp.NumInputs(); out != 2 {
+ t.Fatalf("Got %d inputs, wanted 2", out)
+ }
+}
+
+func TestOperationConsumers(t *testing.T) {
+ g := NewGraph()
+ x, err := Placeholder(g, "x", Float)
+ if err != nil {
+ t.Fatal(err)
+ }
+ a, err := Neg(g, "a", x)
+ if err != nil {
+ t.Fatal(err)
+ }
+ b, err := Neg(g, "b", x)
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ consumers := []*Operation{a.Op, b.Op}
+
+ xConsumers := x.Consumers()
+ if out := len(xConsumers); out != 2 {
+ t.Fatalf("Got %d consumers, wanted 2", out)
+ }
+
+ for i, consumer := range xConsumers {
+ got := consumer.Op.Name()
+ want := consumers[i].Name()
+ if got != want {
+ t.Fatalf("%d. Got op name %q, wanted %q", i, got, want)
+ }
+
+ got = consumer.Producer().Op.Name()
+ want = x.Op.Name()
+ if got != want {
+ t.Fatalf("%d. Got op name %q, wanted %q", i, got, want)
+ }
+ }
+
+ if len(b.Consumers()) != 0 {
+ t.Fatalf("expected %+v to have no consumers", b)
+ }
+}
+
func forceGC() {
var mem runtime.MemStats
runtime.ReadMemStats(&mem)
diff --git a/tensorflow/java/BUILD b/tensorflow/java/BUILD
index 19d2133a55..73e210fae0 100644
--- a/tensorflow/java/BUILD
+++ b/tensorflow/java/BUILD
@@ -56,6 +56,10 @@ java_library(
srcs = glob(["src/gen/java/org/tensorflow/processor/**/*.java"]),
javacopts = JAVACOPTS,
resources = glob(["src/gen/resources/META-INF/services/javax.annotation.processing.Processor"]),
+ deps = [
+ "@com_google_guava",
+ "@com_squareup_javapoet",
+ ],
)
filegroup(
@@ -70,6 +74,7 @@ tf_java_op_gen_srcjar(
name = "java_op_gen_sources",
api_def_srcs = [
"//tensorflow/core/api_def:base_api_def",
+ "//tensorflow/core/api_def:java_api_def",
],
base_package = "org.tensorflow.op",
gen_tool = ":java_op_gen_tool",
diff --git a/tensorflow/java/maven/.gitignore b/tensorflow/java/maven/.gitignore
index ff080515d5..657e2a60bc 100644
--- a/tensorflow/java/maven/.gitignore
+++ b/tensorflow/java/maven/.gitignore
@@ -11,4 +11,10 @@ tensorflow/src
tensorflow/target
proto/src
proto/target
+hadoop/src
+hadoop/target
+spark-connector/src
+spark-connector/target
+spark-connector/dependency-reduced-pom.xml
+spark-connector/spark-warehouse
pom.xml.versionsBackup
diff --git a/tensorflow/java/maven/README.md b/tensorflow/java/maven/README.md
index c7e8f03806..3e030dcd09 100644
--- a/tensorflow/java/maven/README.md
+++ b/tensorflow/java/maven/README.md
@@ -53,6 +53,12 @@ There are seven artifacts and thus `pom.xml`s involved in this release:
7. [`parentpom`](https://maven.apache.org/pom/index.html): Common settings
shared by all of the above.
+8. `hadoop`: The TensorFlow TFRecord InputFormat/OutputFormat for Apache Hadoop.
+ The source code for this package is available in the [TensorFlow Ecosystem](https://github.com/tensorflow/ecosystem/tree/master/hadoop).
+
+9. `spark-connector`: A Scala library for loading and storing TensorFlow TFRecord
+ using Apache Spark DataFrames. The source code for this package is available
+ in the [TensorFlow Ecosystem](https://github.com/tensorflow/ecosystem/tree/master/spark/spark-tensorflow-connector).
## Updating the release
diff --git a/tensorflow/java/maven/hadoop/pom.xml b/tensorflow/java/maven/hadoop/pom.xml
new file mode 100644
index 0000000000..0642be06fa
--- /dev/null
+++ b/tensorflow/java/maven/hadoop/pom.xml
@@ -0,0 +1,24 @@
+<project
+ xmlns="http://maven.apache.org/POM/4.0.0"
+ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <!-- Placeholder pom which is replaced by TensorFlow ecosystem Hadoop pom during build -->
+ <modelVersion>4.0.0</modelVersion>
+ <description>TensorFlow TFRecord InputFormat/OutputFormat for Apache Hadoop</description>
+ <artifactId>hadoop</artifactId>
+ <packaging>jar</packaging>
+
+ <scm>
+ <url>https://github.com/tensorflow/ecosystem.git</url>
+ <connection>git@github.com:tensorflow/ecosystem.git</connection>
+ <developerConnection>scm:git:https://github.com/tensorflow/ecosystem.git</developerConnection>
+ </scm>
+
+ <url>https://github.com/tensorflow/ecosystem/</url>
+ <parent>
+ <groupId>org.tensorflow</groupId>
+ <artifactId>parentpom</artifactId>
+ <version>1.9.0-rc0</version>
+ <relativePath>../</relativePath>
+ </parent>
+</project> \ No newline at end of file
diff --git a/tensorflow/java/maven/pom.xml b/tensorflow/java/maven/pom.xml
index 3890f3fcaa..b4746794ea 100644
--- a/tensorflow/java/maven/pom.xml
+++ b/tensorflow/java/maven/pom.xml
@@ -32,6 +32,8 @@
<module>libtensorflow_jni_gpu</module>
<module>tensorflow</module>
<module>proto</module>
+ <module>hadoop</module>
+ <module>spark-connector</module>
</modules>
<!-- Two profiles are used:
diff --git a/tensorflow/java/maven/run_inside_container.sh b/tensorflow/java/maven/run_inside_container.sh
index bf19c09b1d..2e771064e4 100644
--- a/tensorflow/java/maven/run_inside_container.sh
+++ b/tensorflow/java/maven/run_inside_container.sh
@@ -19,6 +19,7 @@
RELEASE_URL_PREFIX="https://storage.googleapis.com/tensorflow/libtensorflow"
+TF_ECOSYSTEM_URL="https://github.com/tensorflow/ecosystem.git"
# By default we deploy to both ossrh and bintray. These two
# environment variables can be set to skip either repository.
@@ -44,7 +45,9 @@ clean() {
# (though if run inside a clean docker container, there won't be any dirty
# artifacts lying around)
mvn -q clean
- rm -rf libtensorflow_jni/src libtensorflow_jni/target libtensorflow_jni_gpu/src libtensorflow_jni_gpu/target libtensorflow/src libtensorflow/target tensorflow-android/target
+ rm -rf libtensorflow_jni/src libtensorflow_jni/target libtensorflow_jni_gpu/src libtensorflow_jni_gpu/target \
+ libtensorflow/src libtensorflow/target tensorflow-android/target proto/src proto/target \
+ hadoop/src hadoop/target spark-connector/src spark-connector/target
}
update_version_in_pom() {
@@ -183,6 +186,43 @@ generate_java_protos() {
rm -rf "${DIR}/proto/tmp"
}
+
+# Download the TensorFlow ecosystem source from git.
+# The pom files from this repo do not inherit from the parent pom so the maven version
+# is updated for each module.
+download_tf_ecosystem() {
+ ECOSYSTEM_DIR="/tmp/tensorflow-ecosystem"
+ HADOOP_DIR="${DIR}/hadoop"
+ SPARK_DIR="${DIR}/spark-connector"
+
+ # Clean any previous attempts
+ rm -rf "${ECOSYSTEM_DIR}"
+
+ # Clone the TensorFlow ecosystem project
+ mkdir -p "${ECOSYSTEM_DIR}"
+ cd "${ECOSYSTEM_DIR}"
+ git clone "${TF_ECOSYSTEM_URL}"
+ cd ecosystem
+ git checkout r${TF_VERSION}
+
+ # Copy the TensorFlow Hadoop source
+ cp -r "${ECOSYSTEM_DIR}/ecosystem/hadoop/src" "${HADOOP_DIR}"
+ cp "${ECOSYSTEM_DIR}/ecosystem/hadoop/pom.xml" "${HADOOP_DIR}"
+ cd "${HADOOP_DIR}"
+ update_version_in_pom
+
+ # Copy the TensorFlow Spark connector source
+ cp -r "${ECOSYSTEM_DIR}/ecosystem/spark/spark-tensorflow-connector/src" "${SPARK_DIR}"
+ cp "${ECOSYSTEM_DIR}/ecosystem/spark/spark-tensorflow-connector/pom.xml" "${SPARK_DIR}"
+ cd "${SPARK_DIR}"
+ update_version_in_pom
+
+ # Cleanup
+ rm -rf "${ECOSYSTEM_DIR}"
+
+ cd "${DIR}"
+}
+
# Deploy artifacts using a specific profile.
# Arguments:
# profile - name of selected profile.
@@ -240,7 +280,8 @@ cd "${DIR}"
# Comment lines out appropriately if debugging/tinkering with the release
# process.
# gnupg2 is required for signing
-apt-get -qq update && apt-get -qqq install -y gnupg2
+apt-get -qq update && apt-get -qqq install -y gnupg2 git
+
clean
update_version_in_pom
download_libtensorflow
@@ -248,6 +289,8 @@ download_libtensorflow_jni
download_libtensorflow_jni_gpu
update_tensorflow_android
generate_java_protos
+download_tf_ecosystem
+
# Build the release artifacts
mvn verify
# Push artifacts to repository
diff --git a/tensorflow/java/maven/spark-connector/pom.xml b/tensorflow/java/maven/spark-connector/pom.xml
new file mode 100644
index 0000000000..19c752d08b
--- /dev/null
+++ b/tensorflow/java/maven/spark-connector/pom.xml
@@ -0,0 +1,24 @@
+<project
+ xmlns="http://maven.apache.org/POM/4.0.0"
+ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <!-- Placeholder pom which is replaced by TensorFlow ecosystem Spark pom during build -->
+ <modelVersion>4.0.0</modelVersion>
+ <description>TensorFlow TFRecord connector for Apache Spark DataFrames</description>
+ <artifactId>spark-connector</artifactId>
+ <packaging>jar</packaging>
+
+ <scm>
+ <url>https://github.com/tensorflow/ecosystem.git</url>
+ <connection>git@github.com:tensorflow/ecosystem.git</connection>
+ <developerConnection>scm:git:https://github.com/tensorflow/ecosystem.git</developerConnection>
+ </scm>
+
+ <url>https://github.com/tensorflow/ecosystem/</url>
+ <parent>
+ <groupId>org.tensorflow</groupId>
+ <artifactId>parentpom</artifactId>
+ <version>1.9.0-rc0</version>
+ <relativePath>../</relativePath>
+ </parent>
+</project> \ No newline at end of file
diff --git a/tensorflow/java/src/gen/cc/op_generator.cc b/tensorflow/java/src/gen/cc/op_generator.cc
index 9b171f66ec..d5bd99bdd9 100644
--- a/tensorflow/java/src/gen/cc/op_generator.cc
+++ b/tensorflow/java/src/gen/cc/op_generator.cc
@@ -35,7 +35,7 @@ namespace tensorflow {
namespace java {
namespace {
-const char* kLicense =
+constexpr const char kLicense[] =
"/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.\n"
"\n"
"Licensed under the Apache License, Version 2.0 (the \"License\");\n"
@@ -391,9 +391,12 @@ void GenerateOp(const OpSpec& op, const EndpointSpec& endpoint,
}
if (!op.hidden()) {
// expose the op in the Ops Graph API only if it is visible
- op_class.add_annotation(
- Annotation::Create("Operator", "org.tensorflow.op.annotation")
- .attributes("group = \"" + endpoint.package() + "\""));
+ Annotation oper_annot =
+ Annotation::Create("Operator", "org.tensorflow.op.annotation");
+ if (endpoint.package() != kDefaultEndpointPackage) {
+ oper_annot.attributes("group = \"" + endpoint.package() + "\"");
+ }
+ op_class.add_annotation(oper_annot);
}
// create op class file
const string op_dir_name = io::JoinPath(
diff --git a/tensorflow/java/src/gen/cc/op_specs.h b/tensorflow/java/src/gen/cc/op_specs.h
index ca0ba16745..30ecb8ce53 100644
--- a/tensorflow/java/src/gen/cc/op_specs.h
+++ b/tensorflow/java/src/gen/cc/op_specs.h
@@ -27,6 +27,8 @@ limitations under the License.
namespace tensorflow {
namespace java {
+constexpr const char kDefaultEndpointPackage[] = "core";
+
class EndpointSpec {
public:
// A specification for an operation endpoint
diff --git a/tensorflow/java/src/gen/java/org/tensorflow/processor/OperatorProcessor.java b/tensorflow/java/src/gen/java/org/tensorflow/processor/OperatorProcessor.java
index 11fda4fc22..796d6a62dc 100644
--- a/tensorflow/java/src/gen/java/org/tensorflow/processor/OperatorProcessor.java
+++ b/tensorflow/java/src/gen/java/org/tensorflow/processor/OperatorProcessor.java
@@ -15,19 +15,44 @@ limitations under the License.
package org.tensorflow.processor;
+import com.google.common.base.CaseFormat;
+import com.google.common.base.Strings;
+import com.google.common.collect.HashMultimap;
+import com.google.common.collect.Multimap;
+import com.squareup.javapoet.ClassName;
+import com.squareup.javapoet.FieldSpec;
+import com.squareup.javapoet.JavaFile;
+import com.squareup.javapoet.MethodSpec;
+import com.squareup.javapoet.ParameterSpec;
+import com.squareup.javapoet.TypeName;
+import com.squareup.javapoet.TypeSpec;
+import com.squareup.javapoet.TypeVariableName;
import java.io.IOException;
-import java.io.PrintWriter;
+import java.util.Collection;
import java.util.Collections;
-import java.util.HashSet;
+import java.util.HashMap;
+import java.util.Map;
import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
import javax.annotation.processing.AbstractProcessor;
import javax.annotation.processing.Filer;
import javax.annotation.processing.Messager;
import javax.annotation.processing.ProcessingEnvironment;
import javax.annotation.processing.RoundEnvironment;
import javax.lang.model.SourceVersion;
+import javax.lang.model.element.AnnotationMirror;
+import javax.lang.model.element.AnnotationValue;
import javax.lang.model.element.Element;
+import javax.lang.model.element.ExecutableElement;
+import javax.lang.model.element.Modifier;
import javax.lang.model.element.TypeElement;
+import javax.lang.model.element.TypeParameterElement;
+import javax.lang.model.element.VariableElement;
+import javax.lang.model.type.TypeMirror;
+import javax.lang.model.type.TypeVariable;
+import javax.lang.model.util.ElementFilter;
+import javax.lang.model.util.Elements;
import javax.tools.Diagnostic.Kind;
/**
@@ -55,6 +80,7 @@ public final class OperatorProcessor extends AbstractProcessor {
super.init(processingEnv);
messager = processingEnv.getMessager();
filer = processingEnv.getFiler();
+ elements = processingEnv.getElementUtils();
}
@Override
@@ -98,42 +124,77 @@ public final class OperatorProcessor extends AbstractProcessor {
}
// Collect all classes tagged with our annotation.
- Set<TypeElement> opClasses = new HashSet<TypeElement>();
- if (!collectOpClasses(roundEnv, opClasses, annotation)) {
+ Multimap<String, MethodSpec> groupedMethods = HashMultimap.create();
+ if (!collectOpsMethods(roundEnv, groupedMethods, annotation)) {
return true;
}
// Nothing to do when there are no tagged classes.
- if (opClasses.isEmpty()) {
+ if (groupedMethods.isEmpty()) {
return true;
}
- // TODO:(kbsriram) validate operator classes and generate Op API.
- writeApi();
+ // Validate operator classes and generate Op API.
+ writeApi(groupedMethods);
+
hasRun = true;
return true;
}
@Override
public Set<String> getSupportedAnnotationTypes() {
- return Collections.singleton(String.format("%s.annotation.Operator", OP_PACKAGE));
+ return Collections.singleton("org.tensorflow.op.annotation.Operator");
+ }
+
+ private static final Pattern JAVADOC_TAG_PATTERN =
+ Pattern.compile("@(?:param|return|throws|exception|see)\\s+.*");
+ private static final TypeName T_OPS = ClassName.get("org.tensorflow.op", "Ops");
+ private static final TypeName T_OPERATOR =
+ ClassName.get("org.tensorflow.op.annotation", "Operator");
+ private static final TypeName T_SCOPE = ClassName.get("org.tensorflow.op", "Scope");
+ private static final TypeName T_GRAPH = ClassName.get("org.tensorflow", "Graph");
+ private static final TypeName T_STRING = ClassName.get(String.class);
+
+ private Filer filer;
+ private Messager messager;
+ private Elements elements;
+ private boolean hasRun = false;
+
+ private void error(Element e, String message, Object... args) {
+ if (args != null && args.length > 0) {
+ message = String.format(message, args);
+ }
+ messager.printMessage(Kind.ERROR, message, e);
}
- private void writeApi() {
- // Generate an empty class for now and get the build working correctly. This will be changed to
- // generate the actual API once we've done with build-related changes.
- // TODO:(kbsriram)
- try (PrintWriter writer =
- new PrintWriter(filer.createSourceFile(String.format("%s.Ops", OP_PACKAGE)).openWriter())) {
- writer.println(String.format("package %s;", OP_PACKAGE));
- writer.println("public class Ops{}");
+ private void write(TypeSpec spec) {
+ try {
+ JavaFile.builder("org.tensorflow.op", spec).skipJavaLangImports(true).build().writeTo(filer);
} catch (IOException e) {
- error(null, "Unexpected failure generating API: %s", e.getMessage());
+ throw new AssertionError(e);
+ }
+ }
+
+ private void writeApi(Multimap<String, MethodSpec> groupedMethods) {
+ Map<String, ClassName> groups = new HashMap<>();
+
+    // Generate an API class for each group collected other than the default one (= empty string)
+ for (Map.Entry<String, Collection<MethodSpec>> entry : groupedMethods.asMap().entrySet()) {
+ if (!entry.getKey().isEmpty()) {
+ TypeSpec groupClass = buildGroupClass(entry.getKey(), entry.getValue());
+ write(groupClass);
+ groups.put(entry.getKey(), ClassName.get("org.tensorflow.op", groupClass.name));
+ }
}
+    // Generate the top API class, including any methods that belong to the default group
+ TypeSpec topClass = buildTopClass(groups, groupedMethods.get(""));
+ write(topClass);
}
- private boolean collectOpClasses(
- RoundEnvironment roundEnv, Set<TypeElement> opClasses, TypeElement annotation) {
+ private boolean collectOpsMethods(
+ RoundEnvironment roundEnv,
+ Multimap<String, MethodSpec> groupedMethods,
+ TypeElement annotation) {
boolean result = true;
for (Element e : roundEnv.getElementsAnnotatedWith(annotation)) {
// @Operator can only apply to types, so e must be a TypeElement.
@@ -145,20 +206,251 @@ public final class OperatorProcessor extends AbstractProcessor {
result = false;
continue;
}
- opClasses.add((TypeElement) e);
+ TypeElement opClass = (TypeElement) e;
+ // Skip deprecated operations for now, as we do not guarantee API stability yet
+ if (opClass.getAnnotation(Deprecated.class) == null) {
+ collectOpMethods(groupedMethods, opClass, annotation);
+ }
}
return result;
}
- private void error(Element e, String message, Object... args) {
- if (args != null && args.length > 0) {
- message = String.format(message, args);
+ private void collectOpMethods(
+ Multimap<String, MethodSpec> groupedMethods, TypeElement opClass, TypeElement annotation) {
+ AnnotationMirror am = getAnnotationMirror(opClass, annotation);
+ String groupName = getAnnotationElementValueAsString("group", am);
+ String methodName = getAnnotationElementValueAsString("name", am);
+ ClassName opClassName = ClassName.get(opClass);
+ if (Strings.isNullOrEmpty(methodName)) {
+ methodName = CaseFormat.UPPER_CAMEL.to(CaseFormat.LOWER_CAMEL, opClassName.simpleName());
+ }
+    // Build a method for each @Operator found in the class path. There should be one factory
+    // method per operation, named "create", which takes a scope as its first parameter and,
+    // optionally, a list of arguments
+ for (ExecutableElement opMethod : ElementFilter.methodsIn(opClass.getEnclosedElements())) {
+ if (opMethod.getModifiers().contains(Modifier.STATIC)
+ && opMethod.getSimpleName().contentEquals("create")) {
+ MethodSpec method = buildOpMethod(methodName, opClassName, opMethod);
+ groupedMethods.put(groupName, method);
+ }
}
- messager.printMessage(Kind.ERROR, message, e);
}
- private Filer filer;
- private Messager messager;
- private boolean hasRun = false;
- private static final String OP_PACKAGE = "org.tensorflow.op";
+ private MethodSpec buildOpMethod(
+ String methodName, ClassName opClassName, ExecutableElement factoryMethod) {
+ MethodSpec.Builder builder =
+ MethodSpec.methodBuilder(methodName)
+ .addModifiers(Modifier.PUBLIC)
+ .returns(TypeName.get(factoryMethod.getReturnType()))
+ .varargs(factoryMethod.isVarArgs())
+ .addJavadoc("$L", buildOpMethodJavadoc(opClassName, factoryMethod));
+
+ for (TypeParameterElement tp : factoryMethod.getTypeParameters()) {
+ TypeVariableName tvn = TypeVariableName.get((TypeVariable) tp.asType());
+ builder.addTypeVariable(tvn);
+ }
+ for (TypeMirror thrownType : factoryMethod.getThrownTypes()) {
+ builder.addException(TypeName.get(thrownType));
+ }
+ StringBuilder call = new StringBuilder("return $T.create(scope");
+ boolean first = true;
+ for (VariableElement param : factoryMethod.getParameters()) {
+ ParameterSpec p = ParameterSpec.get(param);
+ if (first) {
+ first = false;
+ continue;
+ }
+ call.append(", ");
+ call.append(p.name);
+ builder.addParameter(p);
+ }
+ call.append(")");
+ builder.addStatement(call.toString(), opClassName);
+ return builder.build();
+ }
+
+ private String buildOpMethodJavadoc(ClassName opClassName, ExecutableElement factoryMethod) {
+ StringBuilder javadoc = new StringBuilder();
+ javadoc
+ .append("Adds an {@link ")
+ .append(opClassName.simpleName())
+ .append("} operation to the graph\n\n");
+
+    // Add all javadoc tags found in the operator factory method except the first one, which in
+    // all cases should be the 'scope' parameter that is implicitly passed by this API
+ Matcher tagMatcher = JAVADOC_TAG_PATTERN.matcher(elements.getDocComment(factoryMethod));
+ boolean firstParam = true;
+
+ while (tagMatcher.find()) {
+ String tag = tagMatcher.group();
+ if (tag.startsWith("@param") && firstParam) {
+ firstParam = false;
+ } else {
+ javadoc.append(tag).append('\n');
+ }
+ }
+ javadoc.append("@see {@link ").append(opClassName).append("}\n");
+
+ return javadoc.toString();
+ }
+
+ private static TypeSpec buildGroupClass(String group, Collection<MethodSpec> methods) {
+ MethodSpec.Builder ctorBuilder =
+ MethodSpec.constructorBuilder()
+ .addParameter(T_SCOPE, "scope")
+ .addStatement("this.scope = scope");
+
+ TypeSpec.Builder builder =
+ TypeSpec.classBuilder(CaseFormat.LOWER_CAMEL.to(CaseFormat.UPPER_CAMEL, group) + "Ops")
+ .addModifiers(Modifier.PUBLIC, Modifier.FINAL)
+ .addJavadoc(
+ "An API for adding {@code $L} operations to a {@link $T Graph}\n\n"
+ + "@see {@link $T}\n",
+ group,
+ T_GRAPH,
+ T_OPS)
+ .addMethods(methods)
+ .addMethod(ctorBuilder.build());
+
+ builder.addField(
+ FieldSpec.builder(T_SCOPE, "scope").addModifiers(Modifier.PRIVATE, Modifier.FINAL).build());
+
+ return builder.build();
+ }
+
+ private static TypeSpec buildTopClass(
+ Map<String, ClassName> groupToClass, Collection<MethodSpec> methods) {
+ MethodSpec.Builder ctorBuilder =
+ MethodSpec.constructorBuilder()
+ .addModifiers(Modifier.PRIVATE)
+ .addParameter(T_SCOPE, "scope")
+ .addStatement("this.scope = scope", T_SCOPE);
+
+ for (Map.Entry<String, ClassName> entry : groupToClass.entrySet()) {
+ ctorBuilder.addStatement("$L = new $T(scope)", entry.getKey(), entry.getValue());
+ }
+
+ TypeSpec.Builder opsBuilder =
+ TypeSpec.classBuilder("Ops")
+ .addModifiers(Modifier.PUBLIC, Modifier.FINAL)
+ .addJavadoc(
+ "An API for building a {@link $T} with operation wrappers\n<p>\n"
+                + "Any operation wrapper found in the classpath properly annotated as an "
+                + "{@link $T @Operator} is exposed\n"
+                + "by this API or one of its subgroups.\n<p>Example usage:\n<pre>{@code\n"
+ + "try (Graph g = new Graph()) {\n"
+ + " Ops ops = new Ops(g);\n"
+ + " // Operations are typed classes with convenience\n"
+ + " // builders in Ops.\n"
+ + " Constant three = ops.constant(3);\n"
+ + " // Single-result operations implement the Operand\n"
+ + " // interface, so this works too.\n"
+ + " Operand four = ops.constant(4);\n"
+ + " // Most builders are found within a group, and accept\n"
+ + " // Operand types as operands\n"
+ + " Operand nine = ops.math().add(four, ops.constant(5));\n"
+ + " // Multi-result operations however offer methods to\n"
+ + " // select a particular result for use.\n"
+ + " Operand result = \n"
+ + " ops.math().add(ops.array().unique(s, a).y(), b);\n"
+ + " // Optional attributes\n"
+ + " ops.math().matMul(a, b, MatMul.transposeA(true));\n"
+ + " // Naming operators\n"
+ + " ops.withName(“foo”).constant(5); // name “foo”\n"
+ + " // Names can exist in a hierarchy\n"
+ + " Ops sub = ops.withSubScope(“sub”);\n"
+ + " sub.withName(“bar”).constant(4); // “sub/bar”\n"
+ + "}\n"
+ + "}</pre>\n",
+ T_GRAPH,
+ T_OPERATOR)
+ .addMethods(methods)
+ .addMethod(ctorBuilder.build());
+
+ opsBuilder.addMethod(
+ MethodSpec.methodBuilder("withSubScope")
+ .addModifiers(Modifier.PUBLIC)
+ .addParameter(T_STRING, "childScopeName")
+ .returns(T_OPS)
+ .addStatement("return new $T(scope.withSubScope(childScopeName))", T_OPS)
+ .addJavadoc(
+ "Returns an API that adds operations to the graph with the provided name prefix.\n"
+ + "\n@see {@link $T#withSubScope(String)}\n",
+ T_SCOPE)
+ .build());
+
+ opsBuilder.addMethod(
+ MethodSpec.methodBuilder("withName")
+ .addModifiers(Modifier.PUBLIC)
+ .addParameter(T_STRING, "opName")
+ .returns(T_OPS)
+ .addStatement("return new Ops(scope.withName(opName))")
+ .addJavadoc(
+ "Returns an API that uses the provided name for an op.\n\n"
+ + "@see {@link $T#withName(String)}\n",
+ T_SCOPE)
+ .build());
+
+ opsBuilder.addField(
+ FieldSpec.builder(T_SCOPE, "scope").addModifiers(Modifier.PRIVATE, Modifier.FINAL).build());
+
+ opsBuilder.addMethod(
+ MethodSpec.methodBuilder("scope")
+ .addModifiers(Modifier.PUBLIC, Modifier.FINAL)
+ .returns(T_SCOPE)
+ .addStatement("return scope")
+ .addJavadoc("Returns the current {@link $T scope} of this API\n", T_SCOPE)
+ .build());
+
+ for (Map.Entry<String, ClassName> entry : groupToClass.entrySet()) {
+ opsBuilder.addField(
+ FieldSpec.builder(entry.getValue(), entry.getKey())
+ .addModifiers(Modifier.PUBLIC, Modifier.FINAL)
+ .build());
+
+ opsBuilder.addMethod(
+ MethodSpec.methodBuilder(entry.getKey())
+ .addModifiers(Modifier.PUBLIC, Modifier.FINAL)
+ .returns(entry.getValue())
+ .addStatement("return $L", entry.getKey())
+ .addJavadoc(
+ "Returns an API for adding {@code $L} operations to the graph\n", entry.getKey())
+ .build());
+ }
+
+ opsBuilder.addMethod(
+ MethodSpec.methodBuilder("create")
+ .addModifiers(Modifier.PUBLIC, Modifier.STATIC)
+ .addParameter(T_GRAPH, "graph")
+ .returns(T_OPS)
+ .addStatement("return new Ops(new $T(graph))", T_SCOPE)
+ .addJavadoc("Creates an API for adding operations to the provided {@code graph}\n")
+ .build());
+
+ return opsBuilder.build();
+ }
+
+ private static AnnotationMirror getAnnotationMirror(Element element, TypeElement annotation) {
+ for (AnnotationMirror am : element.getAnnotationMirrors()) {
+ if (am.getAnnotationType().asElement().equals(annotation)) {
+ return am;
+ }
+ }
+ throw new IllegalArgumentException(
+ "Annotation "
+ + annotation.getSimpleName()
+ + " not present on element "
+ + element.getSimpleName());
+ }
+
+ private static String getAnnotationElementValueAsString(String elementName, AnnotationMirror am) {
+ for (Map.Entry<? extends ExecutableElement, ? extends AnnotationValue> entry :
+ am.getElementValues().entrySet()) {
+ if (entry.getKey().getSimpleName().contentEquals(elementName)) {
+ return entry.getValue().getValue().toString();
+ }
+ }
+ return "";
+ }
}
diff --git a/tensorflow/python/estimator/canned/baseline.py b/tensorflow/python/estimator/canned/baseline.py
index 78d18e41ed..20c7a69b7c 100644
--- a/tensorflow/python/estimator/canned/baseline.py
+++ b/tensorflow/python/estimator/canned/baseline.py
@@ -24,10 +24,10 @@ Example:
classifier = BaselineClassifier(n_classes=3)
# Input builders
-def input_fn_train: # returns x, y (where y represents label's class index).
+def input_fn_train(): # returns x, y (where y represents label's class index).
pass
-def input_fn_eval: # returns x, y (where y represents label's class index).
+def input_fn_eval(): # returns x, y (where y represents label's class index).
pass
# Fit model.
diff --git a/tensorflow/python/estimator/export/export.py b/tensorflow/python/estimator/export/export.py
index 010c0f3f59..ca26341445 100644
--- a/tensorflow/python/estimator/export/export.py
+++ b/tensorflow/python/estimator/export/export.py
@@ -333,11 +333,7 @@ def build_raw_serving_input_receiver_fn(features, default_batch_size=None):
"""A serving_input_receiver_fn that expects features to be fed directly."""
receiver_tensors = _placeholders_from_receiver_tensors_dict(
features, default_batch_size)
-
- # TODO(b/34885899): remove the unnecessary copy
- # The features provided are simply the placeholders, but we defensively copy
- # the dict because it may be mutated.
- return ServingInputReceiver(receiver_tensors, receiver_tensors.copy())
+ return ServingInputReceiver(receiver_tensors, receiver_tensors)
return serving_input_receiver_fn
diff --git a/tensorflow/python/keras/datasets/boston_housing.py b/tensorflow/python/keras/datasets/boston_housing.py
index 4c4cab8c08..eeb7cbc44a 100644
--- a/tensorflow/python/keras/datasets/boston_housing.py
+++ b/tensorflow/python/keras/datasets/boston_housing.py
@@ -45,10 +45,9 @@ def load_data(path='boston_housing.npz', test_split=0.2, seed=113):
origin=origin_folder + 'boston_housing.npz',
file_hash=
'f553886a1f8d56431e820c5b82552d9d95cfcb96d1e678153f8839538947dff5')
- f = np.load(path)
- x = f['x']
- y = f['y']
- f.close()
+ with np.load(path) as f:
+ x = f['x']
+ y = f['y']
np.random.seed(seed)
indices = np.arange(len(x))
diff --git a/tensorflow/python/keras/datasets/mnist.py b/tensorflow/python/keras/datasets/mnist.py
index 03564accc7..a96b581960 100644
--- a/tensorflow/python/keras/datasets/mnist.py
+++ b/tensorflow/python/keras/datasets/mnist.py
@@ -47,8 +47,8 @@ def load_data(path='mnist.npz'):
path,
origin=origin_folder + 'mnist.npz',
file_hash='8a61469f7ea1b51cbae51d4f78837e45')
- f = np.load(path)
- x_train, y_train = f['x_train'], f['y_train']
- x_test, y_test = f['x_test'], f['y_test']
- f.close()
- return (x_train, y_train), (x_test, y_test)
+ with np.load(path) as f:
+ x_train, y_train = f['x_train'], f['y_train']
+ x_test, y_test = f['x_test'], f['y_test']
+
+ return (x_train, y_train), (x_test, y_test)
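A quick sketch of the context-manager pattern the dataset loaders now use; the archive name below is hypothetical, and NumPy's `NpzFile` is assumed to support `with` (true for the NumPy versions TensorFlow requires):

    import numpy as np

    arr = np.arange(6).reshape(2, 3)
    np.savez('sample.npz', x=arr, y=arr * 2)

    # NpzFile closes its underlying file handle when the block exits,
    # replacing the explicit f.close() used before this change.
    with np.load('sample.npz') as f:
        x, y = f['x'], f['y']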
diff --git a/tensorflow/python/keras/datasets/reuters.py b/tensorflow/python/keras/datasets/reuters.py
index 2120b4b242..cb796bb06c 100644
--- a/tensorflow/python/keras/datasets/reuters.py
+++ b/tensorflow/python/keras/datasets/reuters.py
@@ -130,7 +130,5 @@ def get_word_index(path='reuters_word_index.json'):
path,
origin=origin_folder + 'reuters_word_index.json',
file_hash='4d44cc38712099c9e383dc6e5f11a921')
- f = open(path)
- data = json.load(f)
- f.close()
- return data
+ with open(path) as f:
+ return json.load(f)
diff --git a/tensorflow/python/keras/layers/__init__.py b/tensorflow/python/keras/layers/__init__.py
index 3234c05be0..e3a686f45d 100644
--- a/tensorflow/python/keras/layers/__init__.py
+++ b/tensorflow/python/keras/layers/__init__.py
@@ -87,9 +87,11 @@ from tensorflow.python.keras.layers.local import LocallyConnected2D
# Merge layers.
from tensorflow.python.keras.layers.merge import Add
+from tensorflow.python.keras.layers.merge import Subtract
from tensorflow.python.keras.layers.merge import Multiply
from tensorflow.python.keras.layers.merge import Average
from tensorflow.python.keras.layers.merge import Maximum
+from tensorflow.python.keras.layers.merge import Minimum
from tensorflow.python.keras.layers.merge import Concatenate
from tensorflow.python.keras.layers.merge import Dot
from tensorflow.python.keras.layers.merge import add
diff --git a/tensorflow/python/keras/layers/merge.py b/tensorflow/python/keras/layers/merge.py
index 770665c5fb..f295af3fe0 100644
--- a/tensorflow/python/keras/layers/merge.py
+++ b/tensorflow/python/keras/layers/merge.py
@@ -250,6 +250,7 @@ class Add(_Merge):
return output
+@tf_export('keras.layers.Subtract')
class Subtract(_Merge):
"""Layer that subtracts two inputs.
@@ -336,6 +337,7 @@ class Maximum(_Merge):
return output
+@tf_export('keras.layers.Minimum')
class Minimum(_Merge):
"""Layer that computes the minimum (element-wise) a list of inputs.
@@ -586,6 +588,7 @@ def add(inputs, **kwargs):
return Add(**kwargs)(inputs)
+@tf_export('keras.layers.subtract')
def subtract(inputs, **kwargs):
"""Functional interface to the `Subtract` layer.
@@ -656,6 +659,7 @@ def maximum(inputs, **kwargs):
return Maximum(**kwargs)(inputs)
+@tf_export('keras.layers.minimum')
def minimum(inputs, **kwargs):
"""Functional interface to the `Minimum` layer.
diff --git a/tensorflow/python/kernel_tests/dynamic_stitch_op_test.py b/tensorflow/python/kernel_tests/dynamic_stitch_op_test.py
index 159cba5fa3..c4d4ce780b 100644
--- a/tensorflow/python/kernel_tests/dynamic_stitch_op_test.py
+++ b/tensorflow/python/kernel_tests/dynamic_stitch_op_test.py
@@ -27,7 +27,6 @@ from tensorflow.python.ops import data_flow_ops
from tensorflow.python.ops import gradients_impl
import tensorflow.python.ops.data_flow_grad # pylint: disable=unused-import
from tensorflow.python.platform import test
-from tensorflow.python.framework import dtypes
class DynamicStitchTestBase(object):
diff --git a/tensorflow/python/lib/core/numpy.h b/tensorflow/python/lib/core/numpy.h
index 25322b458b..d4621d61ee 100644
--- a/tensorflow/python/lib/core/numpy.h
+++ b/tensorflow/python/lib/core/numpy.h
@@ -29,7 +29,9 @@ limitations under the License.
#define NO_IMPORT_ARRAY
#endif
+// Place `<locale>` before <Python.h> to avoid build failure on macOS.
#include <Python.h>
+#include <locale>
#include "numpy/arrayobject.h"
#include "numpy/ufuncobject.h"
diff --git a/tensorflow/python/lib/core/py_util.cc b/tensorflow/python/lib/core/py_util.cc
index dcda1f4a44..6b6c82015f 100644
--- a/tensorflow/python/lib/core/py_util.cc
+++ b/tensorflow/python/lib/core/py_util.cc
@@ -15,7 +15,9 @@ limitations under the License.
#include "tensorflow/python/lib/core/py_util.h"
+// Place `<locale>` before <Python.h> to avoid build failure on macOS.
#include <Python.h>
+#include <locale>
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/strings/strcat.h"
diff --git a/tensorflow/python/ops/image_ops_impl.py b/tensorflow/python/ops/image_ops_impl.py
index f27d9224c1..a2eae452ae 100644
--- a/tensorflow/python/ops/image_ops_impl.py
+++ b/tensorflow/python/ops/image_ops_impl.py
@@ -57,6 +57,7 @@ ops.NotDifferentiable('NonMaxSuppression')
ops.NotDifferentiable('NonMaxSuppressionV2')
+# pylint: disable=invalid-name
def _assert(cond, ex_type, msg):
"""A polymorphic assert, works with tensors and boolean expressions.
@@ -945,7 +946,7 @@ def resize_images(images,
Resized images will be distorted if their original aspect ratio is not
the same as `size`. To avoid distortions see
- @{tf.image.resize_image_with_crop_or_pad}.
+ @{tf.image.resize_image_with_pad}.
`method` can be one of:
@@ -1069,6 +1070,106 @@ def resize_images(images,
return images
+@tf_export('image.resize_image_with_pad')
+def resize_image_with_pad(image,
+ target_height,
+ target_width,
+ method=ResizeMethod.BILINEAR):
+ """Resizes and pads an image to a target width and height.
+
+ Resizes an image to a target width and height by keeping
+ the aspect ratio the same without distortion. If the target
+ dimensions don't match the image dimensions, the image
+ is resized and then padded with zeroes to match requested
+ dimensions.
+
+ Args:
+ image: 4-D Tensor of shape `[batch, height, width, channels]` or
+ 3-D Tensor of shape `[height, width, channels]`.
+ target_height: Target height.
+ target_width: Target width.
+    method: Method to use for resizing the image. See `resize_images()`.
+
+ Raises:
+ ValueError: if `target_height` or `target_width` are zero or negative.
+
+ Returns:
+ Resized and padded image.
+    If `image` was 4-D, a 4-D float Tensor of shape
+    `[batch, new_height, new_width, channels]`.
+    If `image` was 3-D, a 3-D float Tensor of shape
+    `[new_height, new_width, channels]`.
+ """
+ with ops.name_scope(None, 'resize_image_with_pad', [image]):
+ image = ops.convert_to_tensor(image, name='image')
+ image_shape = image.get_shape()
+ is_batch = True
+ if image_shape.ndims == 3:
+ is_batch = False
+ image = array_ops.expand_dims(image, 0)
+ elif image_shape.ndims is None:
+ is_batch = False
+ image = array_ops.expand_dims(image, 0)
+ image.set_shape([None] * 4)
+ elif image_shape.ndims != 4:
+ raise ValueError('\'image\' must have either 3 or 4 dimensions.')
+
+ assert_ops = _CheckAtLeast3DImage(image, require_static=False)
+ assert_ops += _assert(target_width > 0, ValueError,
+ 'target_width must be > 0.')
+ assert_ops += _assert(target_height > 0, ValueError,
+ 'target_height must be > 0.')
+
+ image = control_flow_ops.with_dependencies(assert_ops, image)
+
+ def max_(x, y):
+ if _is_tensor(x) or _is_tensor(y):
+ return math_ops.maximum(x, y)
+ else:
+ return max(x, y)
+
+ _, height, width, _ = _ImageDimensions(image, rank=4)
+
+ # convert values to float, to ease divisions
+ f_height = math_ops.cast(height, dtype=dtypes.float64)
+ f_width = math_ops.cast(width, dtype=dtypes.float64)
+ f_target_height = math_ops.cast(target_height, dtype=dtypes.float64)
+ f_target_width = math_ops.cast(target_width, dtype=dtypes.float64)
+
+ # Find the ratio by which the image must be adjusted
+ # to fit within the target
+ ratio = max_(f_width / f_target_width, f_height / f_target_height)
+ resized_height_float = f_height / ratio
+ resized_width_float = f_width / ratio
+ resized_height = math_ops.cast(
+ math_ops.floor(resized_height_float), dtype=dtypes.int32)
+ resized_width = math_ops.cast(
+ math_ops.floor(resized_width_float), dtype=dtypes.int32)
+
+ padding_height = (f_target_height - resized_height_float) / 2
+ padding_width = (f_target_width - resized_width_float) / 2
+ f_padding_height = math_ops.floor(padding_height)
+ f_padding_width = math_ops.floor(padding_width)
+ p_height = max_(0, math_ops.cast(f_padding_height, dtype=dtypes.int32))
+ p_width = max_(0, math_ops.cast(f_padding_width, dtype=dtypes.int32))
+
+ # Resize first, then pad to meet requested dimensions
+ resized = resize_images(image, [resized_height, resized_width], method)
+
+ padded = pad_to_bounding_box(resized, p_height, p_width, target_height,
+ target_width)
+
+ if padded.get_shape().ndims is None:
+ raise ValueError('padded contains no shape.')
+
+ _ImageDimensions(padded, rank=4)
+
+ if not is_batch:
+ padded = array_ops.squeeze(padded, squeeze_dims=[0])
+
+ return padded
+
+
@tf_export('image.per_image_standardization')
def per_image_standardization(image):
"""Linearly scales `image` to have zero mean and unit norm.
diff --git a/tensorflow/python/ops/image_ops_test.py b/tensorflow/python/ops/image_ops_test.py
index 2a6ab26e96..cf9761803b 100644
--- a/tensorflow/python/ops/image_ops_test.py
+++ b/tensorflow/python/ops/image_ops_test.py
@@ -2680,6 +2680,102 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
self._assertResizeCheckShape(x, x_shape, [3840, 2160], [3840, 2160, 3])
+class ResizeImageWithPadTest(test_util.TensorFlowTestCase):
+
+ def _ResizeImageWithPad(self, x, target_height, target_width,
+ use_tensor_inputs):
+ if use_tensor_inputs:
+ target_height = ops.convert_to_tensor(target_height)
+ target_width = ops.convert_to_tensor(target_width)
+ x_tensor = array_ops.placeholder(x.dtype, shape=[None] * x.ndim)
+ feed_dict = {x_tensor: x}
+ else:
+ x_tensor = x
+ feed_dict = {}
+
+ y = image_ops.resize_image_with_pad(x_tensor, target_height,
+ target_width)
+ if not use_tensor_inputs:
+ self.assertTrue(y.get_shape().is_fully_defined())
+
+ with self.test_session(use_gpu=True):
+ return y.eval(feed_dict=feed_dict)
+
+ def _assertReturns(self,
+ x,
+ x_shape,
+ y,
+ y_shape,
+ use_tensor_inputs_options=None):
+ use_tensor_inputs_options = use_tensor_inputs_options or [False, True]
+ target_height, target_width, _ = y_shape
+ x = np.array(x).reshape(x_shape)
+ y = np.array(y).reshape(y_shape)
+
+ for use_tensor_inputs in use_tensor_inputs_options:
+ y_tf = self._ResizeImageWithPad(x, target_height, target_width,
+ use_tensor_inputs)
+ self.assertAllClose(y, y_tf)
+
+ def _assertRaises(self,
+ x,
+ x_shape,
+ target_height,
+ target_width,
+ err_msg,
+ use_tensor_inputs_options=None):
+ use_tensor_inputs_options = use_tensor_inputs_options or [False, True]
+ x = np.array(x).reshape(x_shape)
+
+ for use_tensor_inputs in use_tensor_inputs_options:
+ try:
+ self._ResizeImageWithPad(x, target_height, target_width,
+ use_tensor_inputs)
+ except Exception as e: # pylint: disable=broad-except
+ if err_msg not in str(e):
+ raise
+ else:
+ raise AssertionError("Exception not raised: %s" % err_msg)
+
+ def _assertShapeInference(self, pre_shape, height, width, post_shape):
+ image = array_ops.placeholder(dtypes.float32, shape=pre_shape)
+ y = image_ops.resize_image_with_pad(image, height, width)
+ self.assertEqual(y.get_shape().as_list(), post_shape)
+
+ def testNoOp(self):
+ x_shape = [10, 10, 10]
+ x = np.random.uniform(size=x_shape)
+
+ self._assertReturns(x, x_shape, x, x_shape)
+
+ def testPad(self):
+ # Reduce vertical dimension
+ x = [1, 2, 3, 4, 5, 6, 7, 8]
+ x_shape = [2, 4, 1]
+
+ y = [0, 1, 3, 0]
+ y_shape = [1, 4, 1]
+
+ self._assertReturns(x, x_shape, y, y_shape)
+
+ # Reduce horizontal dimension
+ x = [1, 2, 3, 4, 5, 6, 7, 8]
+ x_shape = [2, 4, 1]
+
+ y = [1, 3, 0, 0]
+ y_shape = [2, 2, 1]
+
+ self._assertReturns(x, x_shape, y, y_shape)
+
+ x = [1, 2, 3, 4, 5, 6, 7, 8]
+ x_shape = [2, 4, 1]
+
+ y = [1, 3]
+ y_shape = [1, 2, 1]
+
+ self._assertReturns(x, x_shape, y, y_shape)
+
+
class ResizeImageWithCropOrPadTest(test_util.TensorFlowTestCase):
def _ResizeImageWithCropOrPad(self, x, target_height, target_width,
diff --git a/tensorflow/python/ops/math_ops_test.py b/tensorflow/python/ops/math_ops_test.py
index 8417d8a7b1..6b709e5e7f 100644
--- a/tensorflow/python/ops/math_ops_test.py
+++ b/tensorflow/python/ops/math_ops_test.py
@@ -235,6 +235,15 @@ class ApproximateEqualTest(test_util.TensorFlowTestCase):
z_tf = self.evaluate(math_ops.approximate_equal(x, y, tolerance=0.0001))
self.assertAllEqual(z, z_tf)
+ def testApproximateEqualShape(self):
+ for dtype in [np.float32, np.double]:
+ x = np.array([1, 2], dtype=dtype)
+ y = np.array([[1, 2]], dtype=dtype)
+ # The inputs 'x' and 'y' must have the same shape.
+ with self.assertRaisesRegexp(
+ ValueError, "Shapes must be equal rank, but are 1 and 2"):
+ math_ops.approximate_equal(x, y)
+
class ScalarMulTest(test_util.TensorFlowTestCase):
diff --git a/tensorflow/python/ops/special_math_ops.py b/tensorflow/python/ops/special_math_ops.py
index d06b0c318d..9a10abfcf7 100644
--- a/tensorflow/python/ops/special_math_ops.py
+++ b/tensorflow/python/ops/special_math_ops.py
@@ -201,6 +201,8 @@ def einsum(equation, *inputs, **kwargs):
indices in its subscript, or
- the input shapes are inconsistent along a particular axis.
"""
+ equation = equation.replace(' ', '')
+
name = kwargs.pop('name', None)
if kwargs:
raise TypeError('invalid keyword arguments for this function: ' + ', '.join(
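Since the equation string is now stripped of spaces, spaced subscripts are accepted; a small sketch, assuming TF 1.x where `tf.einsum` dispatches to this function:

    import numpy as np
    import tensorflow as tf

    a = tf.constant(np.random.rand(2, 3))
    b = tf.constant(np.random.rand(3, 4))

    # Previously this spaced form was rejected; spaces are now stripped internally.
    c = tf.einsum('ij, jk -> ik', a, b)

    with tf.Session() as sess:
        print(sess.run(c).shape)  # (2, 4)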
diff --git a/tensorflow/python/ops/special_math_ops_test.py b/tensorflow/python/ops/special_math_ops_test.py
index 8646e48571..9bc4098d5b 100644
--- a/tensorflow/python/ops/special_math_ops_test.py
+++ b/tensorflow/python/ops/special_math_ops_test.py
@@ -241,6 +241,12 @@ class EinsumTest(test.TestCase):
'iJ,Jk->ik',
'iJ,Ki->JK',
'iJk,Jklm->Jk'
+ 'ij, jk, kl -> il',
+ 'a, ab, abc -> abc',
+ 'ab, ab, cd, cd, ef, ef -> ',
+ 'abc, bac',
+ 'iJ, Ki -> JK',
+ 'iJk, Jklm -> Jk'
]
long_cases = [
@@ -249,6 +255,8 @@ class EinsumTest(test.TestCase):
'ea,fb,gc,hd,abcd->efgh',
'ea,fb,abcd,gc,hd->efgh',
'abhe,hidj,jgba,hiab,gab',
+ 'efc, dbc, acf, fd -> abe',
+ 'abhe, hidj, jgba, hiab, gab',
]
invalid_cases = [
@@ -319,7 +327,7 @@ class EinsumTest(test.TestCase):
input_axes, _, _ = axes.partition('->')
for idx in input_axes.split(','):
- shape = [all_axes[ax] for ax in idx]
+ shape = [all_axes[ax] for ax in idx if ax.isalpha()]
input_vals.append(np.random.random(shape))
input_tensors = [constant_op.constant(val) for val in input_vals]
diff --git a/tensorflow/python/ops/state_ops.py b/tensorflow/python/ops/state_ops.py
index 08b7cda73b..8cb6a0537e 100644
--- a/tensorflow/python/ops/state_ops.py
+++ b/tensorflow/python/ops/state_ops.py
@@ -394,7 +394,7 @@ def scatter_add(ref, indices, updates, use_locking=False, name=None):
A tensor of indices into the first dimension of `ref`.
updates: A `Tensor`. Must have the same type as `ref`.
A tensor of updated values to store in `ref`.
- use_locking: An optional `bool`. Defaults to `True`.
+ use_locking: An optional `bool`. Defaults to `False`.
If True, the assignment will be protected by a lock;
otherwise the behavior is undefined, but may exhibit less contention.
name: A name for the operation (optional).
@@ -458,7 +458,7 @@ def scatter_nd_add(ref, indices, updates, use_locking=False, name=None):
A tensor of indices into ref.
updates: A `Tensor`. Must have the same type as `ref`.
A tensor of updated values to add to ref.
- use_locking: An optional `bool`. Defaults to `True`.
+ use_locking: An optional `bool`. Defaults to `False`.
An optional bool. Defaults to True. If True, the assignment will
be protected by a lock; otherwise the behavior is undefined,
but may exhibit less contention.
diff --git a/tensorflow/python/training/checkpoint_utils.py b/tensorflow/python/training/checkpoint_utils.py
index c2f0e9d3e6..5b372e82b3 100644
--- a/tensorflow/python/training/checkpoint_utils.py
+++ b/tensorflow/python/training/checkpoint_utils.py
@@ -147,7 +147,7 @@ def init_from_checkpoint(ckpt_dir_or_file, assignment_map):
partitioner=lambda shape, dtype: [5, 1])
# Initialize all variables in `new_scope_1` from `old_scope_1`.
- init_from_checkpoint('/tmp/model.ckpt', {'old_scope_1/', 'new_scope_1'})
+ init_from_checkpoint('/tmp/model.ckpt', {'old_scope_1/': 'new_scope_1'})
# Use names to specify which variables to initialize from checkpoint.
init_from_checkpoint('/tmp/model.ckpt',
diff --git a/tensorflow/tf_framework_version_script.lds b/tensorflow/tf_framework_version_script.lds
new file mode 100644
index 0000000000..d4977f88c0
--- /dev/null
+++ b/tensorflow/tf_framework_version_script.lds
@@ -0,0 +1,11 @@
+VERS_1.0 {
+ # Hide libjpeg symbols to avoid symbol conflict with OpenCV
+ local:
+ jpeg_*;
+ jinit_*;
+ jdiv_round_up;
+ jround_up;
+ jzero_far;
+ jcopy_*;
+ jsimd_*;
+};
diff --git a/tensorflow/tools/api/golden/tensorflow.image.pbtxt b/tensorflow/tools/api/golden/tensorflow.image.pbtxt
index 5398d3cf28..e89b4dbffd 100644
--- a/tensorflow/tools/api/golden/tensorflow.image.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.image.pbtxt
@@ -177,6 +177,10 @@ tf_module {
argspec: "args=[\'image\', \'target_height\', \'target_width\'], varargs=None, keywords=None, defaults=None"
}
member_method {
+ name: "resize_image_with_pad"
+ argspec: "args=[\'image\', \'target_height\', \'target_width\', \'method\'], varargs=None, keywords=None, defaults=[\'0\'], "
+ }
+ member_method {
name: "resize_images"
argspec: "args=[\'images\', \'size\', \'method\', \'align_corners\', \'preserve_aspect_ratio\'], varargs=None, keywords=None, defaults=[\'0\', \'False\', \'False\'], "
}
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-minimum.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-minimum.pbtxt
new file mode 100644
index 0000000000..56e32e9d36
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-minimum.pbtxt
@@ -0,0 +1,176 @@
+path: "tensorflow.keras.layers.Minimum"
+tf_class {
+ is_instance: "<class \'tensorflow.python.keras.layers.merge.Minimum\'>"
+ is_instance: "<class \'tensorflow.python.keras.layers.merge._Merge\'>"
+ is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+ is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+ is_instance: "<type \'object\'>"
+ member {
+ name: "activity_regularizer"
+ mtype: "<type \'property\'>"
+ }
+ member {
+ name: "dtype"
+ mtype: "<type \'property\'>"
+ }
+ member {
+ name: "inbound_nodes"
+ mtype: "<type \'property\'>"
+ }
+ member {
+ name: "input"
+ mtype: "<type \'property\'>"
+ }
+ member {
+ name: "input_mask"
+ mtype: "<type \'property\'>"
+ }
+ member {
+ name: "input_shape"
+ mtype: "<type \'property\'>"
+ }
+ member {
+ name: "losses"
+ mtype: "<type \'property\'>"
+ }
+ member {
+ name: "name"
+ mtype: "<type \'property\'>"
+ }
+ member {
+ name: "non_trainable_variables"
+ mtype: "<type \'property\'>"
+ }
+ member {
+ name: "non_trainable_weights"
+ mtype: "<type \'property\'>"
+ }
+ member {
+ name: "outbound_nodes"
+ mtype: "<type \'property\'>"
+ }
+ member {
+ name: "output"
+ mtype: "<type \'property\'>"
+ }
+ member {
+ name: "output_mask"
+ mtype: "<type \'property\'>"
+ }
+ member {
+ name: "output_shape"
+ mtype: "<type \'property\'>"
+ }
+ member {
+ name: "trainable_variables"
+ mtype: "<type \'property\'>"
+ }
+ member {
+ name: "trainable_weights"
+ mtype: "<type \'property\'>"
+ }
+ member {
+ name: "updates"
+ mtype: "<type \'property\'>"
+ }
+ member {
+ name: "variables"
+ mtype: "<type \'property\'>"
+ }
+ member {
+ name: "weights"
+ mtype: "<type \'property\'>"
+ }
+ member_method {
+ name: "__init__"
+ argspec: "args=[\'self\'], varargs=None, keywords=kwargs, defaults=None"
+ }
+ member_method {
+ name: "add_loss"
+ argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+ }
+ member_method {
+ name: "add_update"
+ argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+ }
+ member_method {
+ name: "add_variable"
+ argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+ }
+ member_method {
+ name: "add_weight"
+ argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
+ }
+ member_method {
+ name: "apply"
+ argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+ }
+ member_method {
+ name: "build"
+ argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+ }
+ member_method {
+ name: "call"
+ argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+ }
+ member_method {
+ name: "compute_mask"
+ argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+ }
+ member_method {
+ name: "compute_output_shape"
+ argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+ }
+ member_method {
+ name: "count_params"
+ argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+ }
+ member_method {
+ name: "from_config"
+ argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+ }
+ member_method {
+ name: "get_config"
+ argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+ }
+ member_method {
+ name: "get_input_at"
+ argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+ }
+ member_method {
+ name: "get_input_mask_at"
+ argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+ }
+ member_method {
+ name: "get_input_shape_at"
+ argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+ }
+ member_method {
+ name: "get_losses_for"
+ argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+ }
+ member_method {
+ name: "get_output_at"
+ argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+ }
+ member_method {
+ name: "get_output_mask_at"
+ argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+ }
+ member_method {
+ name: "get_output_shape_at"
+ argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+ }
+ member_method {
+ name: "get_updates_for"
+ argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+ }
+ member_method {
+ name: "get_weights"
+ argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+ }
+ member_method {
+ name: "set_weights"
+ argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+ }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-subtract.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-subtract.pbtxt
new file mode 100644
index 0000000000..35ad87ad5d
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-subtract.pbtxt
@@ -0,0 +1,176 @@
+path: "tensorflow.keras.layers.Subtract"
+tf_class {
+ is_instance: "<class \'tensorflow.python.keras.layers.merge.Subtract\'>"
+ is_instance: "<class \'tensorflow.python.keras.layers.merge._Merge\'>"
+ is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+ is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+ is_instance: "<type \'object\'>"
+ member {
+ name: "activity_regularizer"
+ mtype: "<type \'property\'>"
+ }
+ member {
+ name: "dtype"
+ mtype: "<type \'property\'>"
+ }
+ member {
+ name: "inbound_nodes"
+ mtype: "<type \'property\'>"
+ }
+ member {
+ name: "input"
+ mtype: "<type \'property\'>"
+ }
+ member {
+ name: "input_mask"
+ mtype: "<type \'property\'>"
+ }
+ member {
+ name: "input_shape"
+ mtype: "<type \'property\'>"
+ }
+ member {
+ name: "losses"
+ mtype: "<type \'property\'>"
+ }
+ member {
+ name: "name"
+ mtype: "<type \'property\'>"
+ }
+ member {
+ name: "non_trainable_variables"
+ mtype: "<type \'property\'>"
+ }
+ member {
+ name: "non_trainable_weights"
+ mtype: "<type \'property\'>"
+ }
+ member {
+ name: "outbound_nodes"
+ mtype: "<type \'property\'>"
+ }
+ member {
+ name: "output"
+ mtype: "<type \'property\'>"
+ }
+ member {
+ name: "output_mask"
+ mtype: "<type \'property\'>"
+ }
+ member {
+ name: "output_shape"
+ mtype: "<type \'property\'>"
+ }
+ member {
+ name: "trainable_variables"
+ mtype: "<type \'property\'>"
+ }
+ member {
+ name: "trainable_weights"
+ mtype: "<type \'property\'>"
+ }
+ member {
+ name: "updates"
+ mtype: "<type \'property\'>"
+ }
+ member {
+ name: "variables"
+ mtype: "<type \'property\'>"
+ }
+ member {
+ name: "weights"
+ mtype: "<type \'property\'>"
+ }
+ member_method {
+ name: "__init__"
+ argspec: "args=[\'self\'], varargs=None, keywords=kwargs, defaults=None"
+ }
+ member_method {
+ name: "add_loss"
+ argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+ }
+ member_method {
+ name: "add_update"
+ argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+ }
+ member_method {
+ name: "add_variable"
+ argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+ }
+ member_method {
+ name: "add_weight"
+ argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
+ }
+ member_method {
+ name: "apply"
+ argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+ }
+ member_method {
+ name: "build"
+ argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+ }
+ member_method {
+ name: "call"
+ argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+ }
+ member_method {
+ name: "compute_mask"
+ argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+ }
+ member_method {
+ name: "compute_output_shape"
+ argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+ }
+ member_method {
+ name: "count_params"
+ argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+ }
+ member_method {
+ name: "from_config"
+ argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+ }
+ member_method {
+ name: "get_config"
+ argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+ }
+ member_method {
+ name: "get_input_at"
+ argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+ }
+ member_method {
+ name: "get_input_mask_at"
+ argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+ }
+ member_method {
+ name: "get_input_shape_at"
+ argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+ }
+ member_method {
+ name: "get_losses_for"
+ argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+ }
+ member_method {
+ name: "get_output_at"
+ argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+ }
+ member_method {
+ name: "get_output_mask_at"
+ argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+ }
+ member_method {
+ name: "get_output_shape_at"
+ argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+ }
+ member_method {
+ name: "get_updates_for"
+ argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+ }
+ member_method {
+ name: "get_weights"
+ argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+ }
+ member_method {
+ name: "set_weights"
+ argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+ }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.pbtxt
index 0df5a1b91e..9d7e5bb8c7 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.pbtxt
@@ -281,6 +281,10 @@ tf_module {
mtype: "<type \'type\'>"
}
member {
+ name: "Minimum"
+ mtype: "<type \'type\'>"
+ }
+ member {
name: "Multiply"
mtype: "<type \'type\'>"
}
@@ -353,6 +357,10 @@ tf_module {
mtype: "<type \'type\'>"
}
member {
+ name: "Subtract"
+ mtype: "<type \'type\'>"
+ }
+ member {
name: "ThresholdedReLU"
mtype: "<type \'type\'>"
}
@@ -413,7 +421,15 @@ tf_module {
argspec: "args=[\'inputs\'], varargs=None, keywords=kwargs, defaults=None"
}
member_method {
+ name: "minimum"
+ argspec: "args=[\'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+ }
+ member_method {
name: "multiply"
argspec: "args=[\'inputs\'], varargs=None, keywords=kwargs, defaults=None"
}
+ member_method {
+ name: "subtract"
+ argspec: "args=[\'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+ }
}
diff --git a/tensorflow/tools/ci_build/Dockerfile.cpu.ppc64le b/tensorflow/tools/ci_build/Dockerfile.cpu.ppc64le
new file mode 100644
index 0000000000..f496ac59b6
--- /dev/null
+++ b/tensorflow/tools/ci_build/Dockerfile.cpu.ppc64le
@@ -0,0 +1,19 @@
+FROM ubuntu:16.04
+
+LABEL maintainer="William Irons <wdirons@us.ibm.com>"
+
+# Copy and run the install scripts.
+COPY install/*.sh /install/
+RUN /install/install_bootstrap_deb_packages.sh
+RUN add-apt-repository -y ppa:openjdk-r/ppa
+RUN /install/install_deb_packages.sh
+RUN apt-get update && apt-get install -y libopenblas-dev
+RUN /install/install_pip_packages.sh
+RUN /install/install_bazel_from_source.sh
+RUN /install/install_proto3.sh
+RUN /install/install_buildifier_from_source.sh
+RUN /install/install_auditwheel.sh
+RUN /install/install_golang_ppc64le.sh
+
+# Set up the master bazelrc configuration file.
+COPY install/.bazelrc /etc/bazel.bazelrc
diff --git a/tensorflow/tools/ci_build/Dockerfile.gpu.ppc64le b/tensorflow/tools/ci_build/Dockerfile.gpu.ppc64le
new file mode 100644
index 0000000000..3eddc56550
--- /dev/null
+++ b/tensorflow/tools/ci_build/Dockerfile.gpu.ppc64le
@@ -0,0 +1,27 @@
+FROM nvidia/cuda-ppc64le:9.0-cudnn7-devel-ubuntu16.04
+
+LABEL maintainer="William Irons <wdirons@us.ibm.com>"
+
+# In the Ubuntu 16.04 images, cudnn is placed in system paths. Move them to
+# /usr/local/cuda
+RUN cp -P /usr/include/cudnn.h /usr/local/cuda/include
+RUN cp -P /usr/lib/powerpc64le-linux-gnu/libcudnn* /usr/local/cuda/lib64
+
+# Copy and run the install scripts.
+COPY install/*.sh /install/
+ARG DEBIAN_FRONTEND=noninteractive
+RUN /install/install_bootstrap_deb_packages.sh
+RUN add-apt-repository -y ppa:openjdk-r/ppa
+RUN /install/install_deb_packages.sh
+RUN apt-get update && apt-get install -y libopenblas-dev
+RUN /install/install_pip_packages.sh
+RUN /install/install_bazel_from_source.sh
+RUN /install/install_golang_ppc64le.sh
+
+# Set up the master bazelrc configuration file.
+COPY install/.bazelrc /etc/bazel.bazelrc
+ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH
+
+# Configure the build for our CUDA configuration.
+ENV TF_NEED_CUDA 1
+ENV TF_CUDA_COMPUTE_CAPABILITIES 3.0
diff --git a/tensorflow/tools/ci_build/ci_build.sh b/tensorflow/tools/ci_build/ci_build.sh
index 1f0fd0387a..f6a50d3d4c 100755
--- a/tensorflow/tools/ci_build/ci_build.sh
+++ b/tensorflow/tools/ci_build/ci_build.sh
@@ -79,7 +79,7 @@ if [[ "${CONTAINER_TYPE}" == "cmake" ]]; then
fi
# Use nvidia-docker if the container is GPU.
-if [[ "${CONTAINER_TYPE}" == "gpu" ]]; then
+if [[ "${CONTAINER_TYPE}" == gpu* ]]; then
DOCKER_BINARY="nvidia-docker"
else
DOCKER_BINARY="docker"
@@ -99,7 +99,7 @@ BUILD_TAG="${BUILD_TAG:-tf_ci}"
# Add extra params for cuda devices and libraries for GPU container.
# And clear them if we are not building for GPU.
-if [[ "${CONTAINER_TYPE}" != "gpu" ]]; then
+if [[ "${CONTAINER_TYPE}" != gpu* ]]; then
GPU_EXTRA_PARAMS=""
fi
diff --git a/tensorflow/tools/ci_build/ci_parameterized_build.sh b/tensorflow/tools/ci_build/ci_parameterized_build.sh
index 90bd8bc3d0..300ba8ea0b 100755
--- a/tensorflow/tools/ci_build/ci_parameterized_build.sh
+++ b/tensorflow/tools/ci_build/ci_parameterized_build.sh
@@ -258,9 +258,9 @@ function set_script_variable() {
# Process container type
-if [[ ${CTYPE} == "cpu" ]] || [[ ${CTYPE} == "debian.jessie.cpu" ]]; then
+if [[ ${CTYPE} == cpu* ]] || [[ ${CTYPE} == "debian.jessie.cpu" ]]; then
:
-elif [[ ${CTYPE} == "gpu" ]]; then
+elif [[ ${CTYPE} == gpu* ]]; then
set_script_variable TF_NEED_CUDA 1
if [[ $TF_CUDA_CLANG == "1" ]]; then
@@ -418,12 +418,12 @@ if [[ ${TF_BUILD_IS_PIP} == "no_pip" ]] ||
BAZEL_TARGET=${TF_BUILD_BAZEL_TARGET}
fi
- if [[ ${CTYPE} == "cpu" ]] || \
+ if [[ ${CTYPE} == cpu* ]] || \
[[ ${CTYPE} == "debian.jessie.cpu" ]]; then
# CPU only command, fully parallel.
NO_PIP_MAIN_CMD="${MAIN_CMD} ${BAZEL_CMD} ${OPT_FLAG} ${EXTRA_ARGS} -- "\
"${BAZEL_TARGET}"
- elif [[ ${CTYPE} == "gpu" ]]; then
+ elif [[ ${CTYPE} == gpu* ]]; then
# GPU only command, run as many jobs as the GPU count only.
NO_PIP_MAIN_CMD="${BAZEL_CMD} ${OPT_FLAG} "\
"--local_test_jobs=${TF_GPU_COUNT} "\
diff --git a/tensorflow/tools/ci_build/install/install_bazel_from_source.sh b/tensorflow/tools/ci_build/install/install_bazel_from_source.sh
new file mode 100755
index 0000000000..ddad00c5f0
--- /dev/null
+++ b/tensorflow/tools/ci_build/install/install_bazel_from_source.sh
@@ -0,0 +1,40 @@
+#!/usr/bin/env bash
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# This script is used to install bazel on non-x86_64 systems.
+# It compiles bazel from source and installs it in /usr/local/bin.
+
+# Select bazel version.
+BAZEL_VERSION="0.11.0"
+
+set +e
+local_bazel_ver=$(bazel version 2>&1 | grep -i label | awk '{print $3}')
+
+if [[ "$local_bazel_ver" == "$BAZEL_VERSION" ]]; then
+ exit 0
+fi
+
+set -e
+
+# Compile bazel from source
+mkdir -p /bazel
+cd /bazel
+
+curl -fSsL -O https://github.com/bazelbuild/bazel/releases/download/$BAZEL_VERSION/bazel-$BAZEL_VERSION-dist.zip
+unzip bazel-$BAZEL_VERSION-dist.zip
+bash ./compile.sh
+cp output/bazel /usr/local/bin/
+rm -rf /bazel
diff --git a/tensorflow/tools/ci_build/install/install_buildifier_from_source.sh b/tensorflow/tools/ci_build/install/install_buildifier_from_source.sh
new file mode 100755
index 0000000000..a93c258fad
--- /dev/null
+++ b/tensorflow/tools/ci_build/install/install_buildifier_from_source.sh
@@ -0,0 +1,30 @@
+#!/usr/bin/env bash
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+set -e
+BUILDTOOLS_VERSION="0.11.1"
+
+# Clone buildtools
+git clone -b $BUILDTOOLS_VERSION https://github.com/bazelbuild/buildtools
+cd buildtools
+
+# Build buildifier
+bazel build //buildifier
+sudo mv bazel-bin/buildifier/linux*stripped/buildifier /usr/local/bin
+
+# Build buildozer
+bazel build //buildozer
+sudo mv bazel-bin/buildozer/linux*stripped/buildozer /usr/local/bin
diff --git a/tensorflow/tools/ci_build/install/install_golang_ppc64le.sh b/tensorflow/tools/ci_build/install/install_golang_ppc64le.sh
new file mode 100755
index 0000000000..47d23a59b3
--- /dev/null
+++ b/tensorflow/tools/ci_build/install/install_golang_ppc64le.sh
@@ -0,0 +1,22 @@
+#!/usr/bin/env bash
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+set -ex
+
+GOLANG_URL="https://storage.googleapis.com/golang/go1.10.linux-ppc64le.tar.gz"
+
+sudo mkdir -p /usr/local
+wget -q -O - "${GOLANG_URL}" | sudo tar -C /usr/local -xz
diff --git a/tensorflow/tools/ci_build/install/install_pip_packages.sh b/tensorflow/tools/ci_build/install/install_pip_packages.sh
index fbed4574e0..221b5b80fb 100755
--- a/tensorflow/tools/ci_build/install/install_pip_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_pip_packages.sh
@@ -110,6 +110,10 @@ pip3 install --upgrade gast
pip2 install --upgrade termcolor
pip3 install --upgrade termcolor
+# Install last working version of setuptools.
+pip2 install --upgrade setuptools==39.1.0
+pip3 install --upgrade setuptools==39.1.0
+
# Keras
pip2 install keras_applications==1.0.2
pip3 install keras_applications==1.0.2
diff --git a/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh b/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh
index 037fc0e2e1..45a30c6e82 100755
--- a/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh
@@ -81,6 +81,9 @@ pip3.5 install --upgrade astor
pip3.5 install --upgrade gast
pip3.5 install --upgrade termcolor
+# Install last working version of setuptools.
+pip3.5 install --upgrade setuptools==39.1.0
+
# Keras
pip3.5 install keras_applications==1.0.2
pip3.5 install keras_preprocessing==1.0.1
diff --git a/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh b/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh
index 8fd65a3ee2..d66b2aa18a 100755
--- a/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh
@@ -97,11 +97,11 @@ pip3 install --upgrade astor
pip3 install --upgrade gast
pip3 install --upgrade termcolor
+# Install last working version of setuptools.
+pip3 install --upgrade setuptools==39.1.0
+
# Keras
pip3.5 install keras_applications==1.0.2
pip3.5 install keras_preprocessing==1.0.1
-# Install last working version of setuptools.
-pip3 install --upgrade setuptools==39.1.0
-
# LINT.ThenChange(//tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh)
diff --git a/tensorflow/tools/ci_build/linux/gpu/run_mkl.sh b/tensorflow/tools/ci_build/linux/gpu/run_mkl.sh
new file mode 100755
index 0000000000..50ee07e727
--- /dev/null
+++ b/tensorflow/tools/ci_build/linux/gpu/run_mkl.sh
@@ -0,0 +1,47 @@
+#!/usr/bin/env bash
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# ==============================================================================
+
+set -e
+set -x
+
+N_JOBS=$(grep -c ^processor /proc/cpuinfo)
+
+echo ""
+echo "Bazel will use ${N_JOBS} concurrent job(s)."
+echo ""
+
+# Run configure.
+export PYTHON_BIN_PATH=`which python2`
+
+export TF_NEED_CUDA=1
+export TF_CUDA_VERSION=9.0
+export TF_CUDNN_VERSION=7
+export TF_CUDA_COMPUTE_CAPABILITIES=3.7
+
+yes "" | $PYTHON_BIN_PATH configure.py
+
+# Run bazel test command. Double test timeouts to avoid flakes.
+# Setting KMP_BLOCKTIME to 0 lets OpenMP threads sleep right after parallel execution
+# in an MKL primitive. This reduces the effect of oversubscribing OpenMP threads when
+# multiple tests run concurrently.
+bazel test --config=cuda --test_tag_filters=-no_oss,-oss_serial,-no_gpu,-benchmark-test \
+ --test_lang_filters=cc,py -k --jobs="${N_JOBS}" \
+ --test_timeout 300,450,1200,3600 --build_tests_only --test_env=KMP_BLOCKTIME=0 \
+ --config=mkl --config=opt --test_output=errors --local_test_jobs=8 \
+ --run_under=//tensorflow/tools/ci_build/gpu_build:parallel_gpu_execute -- \
+ //tensorflow/... -//tensorflow/compiler/... -//tensorflow/contrib/...
+
diff --git a/tensorflow/tools/ci_build/linux/mkl/basic-mkl-gpu-test.sh b/tensorflow/tools/ci_build/linux/mkl/basic-mkl-gpu-test.sh
new file mode 100755
index 0000000000..68354bf7c1
--- /dev/null
+++ b/tensorflow/tools/ci_build/linux/mkl/basic-mkl-gpu-test.sh
@@ -0,0 +1,29 @@
+#!/usr/bin/env bash
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+#
+# Usage: basic-mkl-gpu-test.sh
+
+# Helper function to traverse directories up until given file is found.
+function upsearch () {
+ test / == "$PWD" && return || \
+ test -e "$1" && echo "$PWD" && return || \
+ cd .. && upsearch "$1"
+}
+
+# Set up WORKSPACE.
+WORKSPACE="${WORKSPACE:-$(upsearch WORKSPACE)}"
+
+BUILD_TAG=mkl-gpu-ci-test CI_BUILD_USER_FORCE_BADNAME=yes ${WORKSPACE}/tensorflow/tools/ci_build/ci_build.sh gpu tensorflow/tools/ci_build/linux/gpu/run_mkl.sh
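
The `upsearch` helper walks parent directories until it finds the named file (here the Bazel WORKSPACE marker). A rough Python equivalent, assuming the search starts from the current directory; `find_upwards` is an illustrative name only.

# Sketch: walk up from start_dir until a directory containing `name` is found,
# mirroring the upsearch() shell helper above. Returns None at the filesystem root.
import os

def find_upwards(name, start_dir="."):
    path = os.path.abspath(start_dir)
    while True:
        if os.path.exists(os.path.join(path, name)):
            return path
        parent = os.path.dirname(path)
        if parent == path:  # reached "/" without finding it
            return None
        path = parent

# Example: workspace = find_upwards("WORKSPACE")
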
diff --git a/tensorflow/tools/git/gen_git_source.py b/tensorflow/tools/git/gen_git_source.py
index 73dee98bae..cc2288a7fa 100755
--- a/tensorflow/tools/git/gen_git_source.py
+++ b/tensorflow/tools/git/gen_git_source.py
@@ -164,14 +164,17 @@ def get_git_version(git_base_path, git_tag_override):
"git", str("--git-dir=%s/.git" % git_base_path),
str("--work-tree=" + git_base_path), "describe", "--long", "--tags"
]).strip())
- if git_tag_override:
+ if git_tag_override and val:
split_val = val.split("-")
- if len(split_val) != 3:
+ if len(split_val) < 3:
raise Exception(
("Expected git version in format 'TAG-COMMITS AFTER TAG-HASH' "
"but got '%s'") % val)
- split_val[0] = git_tag_override
- val = bytes("-".join(split_val))
+ # The tag name itself may contain "-", but the final two "-" are always the
+ # ones inserted by the git describe command.
+ abbrev_commit = split_val[-1]
+ val = bytes(
+ "-".join([git_tag_override, "0", abbrev_commit]))
return val if val else unknown_label
except (subprocess.CalledProcessError, OSError):
return unknown_label
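
The new logic keeps only the abbreviated commit hash from the `git describe --long --tags` output and rebuilds the version string as TAG_OVERRIDE-0-HASH, so tags that themselves contain "-" no longer trip the three-way split. A standalone sketch of that parsing, assuming a describe string is already in hand; the function name and example values are illustrative.

# Sketch: rebuild a version string from `git describe --long --tags` output when
# the tag is overridden. Only the final component (the abbreviated commit) is
# kept; anything before the last two "-" may belong to the tag name itself.
def override_tag(describe_output, tag_override):
    parts = describe_output.strip().split("-")
    if len(parts) < 3:
        raise ValueError("unexpected git describe output: %r" % describe_output)
    abbrev_commit = parts[-1]
    return "-".join([tag_override, "0", abbrev_commit])

# e.g. override_tag("v1.9.0-rc0-1258-gabc1234", "v1.9.0") -> "v1.9.0-0-gabc1234"
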
diff --git a/tensorflow/tools/lib_package/BUILD b/tensorflow/tools/lib_package/BUILD
index 77f83b77a0..05c23cd3ee 100644
--- a/tensorflow/tools/lib_package/BUILD
+++ b/tensorflow/tools/lib_package/BUILD
@@ -130,7 +130,7 @@ genrule(
"@highwayhash//:LICENSE",
"@jemalloc//:COPYING",
"@jpeg//:LICENSE.md",
- "@libxsmm_archive//:LICENSE",
+ "@libxsmm_archive//:LICENSE.md",
"@llvm//:LICENSE.TXT",
"@lmdb//:LICENSE",
"@local_config_sycl//sycl:LICENSE.text",
@@ -168,7 +168,7 @@ genrule(
"@highwayhash//:LICENSE",
"@jemalloc//:COPYING",
"@jpeg//:LICENSE.md",
- "@libxsmm_archive//:LICENSE",
+ "@libxsmm_archive//:LICENSE.md",
"@llvm//:LICENSE.TXT",
"@lmdb//:LICENSE",
"@local_config_sycl//sycl:LICENSE.text",
diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD
index 6cfd271968..a0caf42331 100644
--- a/tensorflow/tools/pip_package/BUILD
+++ b/tensorflow/tools/pip_package/BUILD
@@ -147,7 +147,7 @@ filegroup(
"@jemalloc//:COPYING",
"@jpeg//:LICENSE.md",
"@kafka//:LICENSE",
- "@libxsmm_archive//:LICENSE",
+ "@libxsmm_archive//:LICENSE.md",
"@lmdb//:LICENSE",
"@local_config_nccl//:LICENSE",
"@local_config_sycl//sycl:LICENSE.text",
diff --git a/tensorflow/tools/pip_package/build_pip_package.sh b/tensorflow/tools/pip_package/build_pip_package.sh
index f7e42ce536..9e41514cfa 100755
--- a/tensorflow/tools/pip_package/build_pip_package.sh
+++ b/tensorflow/tools/pip_package/build_pip_package.sh
@@ -24,9 +24,15 @@ function real_path() {
function cp_external() {
local src_dir=$1
local dest_dir=$2
- for f in `find "$src_dir" -maxdepth 1 -mindepth 1 ! -name '*local_config_cuda*' ! -name '*local_config_tensorrt*' ! -name '*org_tensorflow*'`; do
- cp -R "$f" "$dest_dir"
+
+ pushd .
+ cd "$src_dir"
+ for f in `find . ! -type d ! -name '*.py' ! -name '*local_config_cuda*' ! -name '*local_config_tensorrt*' ! -name '*org_tensorflow*'`; do
+ mkdir -p "${dest_dir}/$(dirname ${f})"
+ cp "${f}" "${dest_dir}/$(dirname ${f})/"
done
+ popd
+
mkdir -p "${dest_dir}/local_config_cuda/cuda/cuda/"
cp "${src_dir}/local_config_cuda/cuda/cuda/cuda_config.h" "${dest_dir}/local_config_cuda/cuda/cuda/"
}
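
With this change, cp_external copies individual non-Python files into the destination while recreating each file's relative directory, instead of copying whole top-level trees. A hedged Python sketch of the same walk-and-copy, with the exclusion patterns taken from the script; `copy_external_headers` is an illustrative name, not part of the patch.

# Sketch: copy every non-.py file under src_dir into dest_dir, preserving the
# relative layout and skipping the same trees the shell function excludes.
import os
import shutil

EXCLUDED = ("local_config_cuda", "local_config_tensorrt", "org_tensorflow")

def copy_external_headers(src_dir, dest_dir):
    for root, _, files in os.walk(src_dir):
        rel_root = os.path.relpath(root, src_dir)
        if any(pattern in rel_root for pattern in EXCLUDED):
            continue
        for name in files:
            if name.endswith(".py"):
                continue
            target_dir = os.path.join(dest_dir, rel_root)
            os.makedirs(target_dir, exist_ok=True)
            shutil.copy(os.path.join(root, name), target_dir)
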
@@ -49,6 +55,8 @@ function prepare_src() {
TMPDIR="$1"
mkdir -p "$TMPDIR"
+ EXTERNAL_INCLUDES="${TMPDIR}/tensorflow/include/external"
+
echo $(date) : "=== Preparing sources in dir: ${TMPDIR}"
if [ ! -d bazel-bin/tensorflow ]; then
@@ -66,10 +74,9 @@ function prepare_src() {
cp -R \
bazel-bin/tensorflow/tools/pip_package/simple_console_for_window_unzip/runfiles/org_tensorflow/tensorflow \
"${TMPDIR}"
- mkdir "${TMPDIR}/external"
cp_external \
bazel-bin/tensorflow/tools/pip_package/simple_console_for_window_unzip/runfiles \
- "${TMPDIR}/external"
+ "${EXTERNAL_INCLUDES}/"
RUNFILES=bazel-bin/tensorflow/tools/pip_package/simple_console_for_window_unzip/runfiles/org_tensorflow
else
RUNFILES=bazel-bin/tensorflow/tools/pip_package/build_pip_package.runfiles/org_tensorflow
@@ -78,10 +85,9 @@ function prepare_src() {
cp -R \
bazel-bin/tensorflow/tools/pip_package/build_pip_package.runfiles/org_tensorflow/tensorflow \
"${TMPDIR}"
- mkdir "${TMPDIR}/external"
cp_external \
bazel-bin/tensorflow/tools/pip_package/build_pip_package.runfiles/org_tensorflow/external \
- "${TMPDIR}/external"
+ "${EXTERNAL_INCLUDES}"
# Copy MKL libs over so they can be loaded at runtime
so_lib_dir=$(ls $RUNFILES | grep solib) || true
if [ -n "${so_lib_dir}" ]; then
@@ -96,10 +102,9 @@ function prepare_src() {
cp -R \
bazel-bin/tensorflow/tools/pip_package/build_pip_package.runfiles/org_tensorflow/tensorflow \
"${TMPDIR}"
- mkdir "${TMPDIR}/external"
cp_external \
bazel-bin/tensorflow/tools/pip_package/build_pip_package.runfiles \
- "${TMPDIR}/external"
+ "${EXTERNAL_INCLUDES}"
# Copy MKL libs over so they can be loaded at runtime
so_lib_dir=$(ls $RUNFILES | grep solib) || true
if [ -n "${so_lib_dir}" ]; then
diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py
index 253802b959..55cd4f37c6 100644
--- a/tensorflow/tools/pip_package/setup.py
+++ b/tensorflow/tools/pip_package/setup.py
@@ -53,7 +53,7 @@ REQUIRED_PACKAGES = [
'gast >= 0.2.0',
'numpy >= 1.13.3',
'six >= 1.10.0',
- 'protobuf >= 3.6.0',
+ 'protobuf >= 3.4.0',
'setuptools <= 39.1.0',
'tensorboard >= 1.8.0, < 1.9.0',
'termcolor >= 1.1.0',
@@ -84,7 +84,7 @@ else:
if 'tf_nightly' in project_name:
for i, pkg in enumerate(REQUIRED_PACKAGES):
if 'tensorboard' in pkg:
- REQUIRED_PACKAGES[i] = 'tb-nightly >= 1.9.0a0, < 1.10.0a0'
+ REQUIRED_PACKAGES[i] = 'tb-nightly >= 1.10.0a0, < 1.11.0a0'
break
# weakref.finalize and enum were introduced in Python 3.4
diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index 3a5e0d1163..ba679e0055 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -131,11 +131,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
tf_http_archive(
name = "libxsmm_archive",
urls = [
- "https://mirror.bazel.build/github.com/hfp/libxsmm/archive/1.8.1.tar.gz",
- "https://github.com/hfp/libxsmm/archive/1.8.1.tar.gz",
+ "https://mirror.bazel.build/github.com/hfp/libxsmm/archive/1.9.tar.gz",
+ "https://github.com/hfp/libxsmm/archive/1.9.tar.gz",
],
- sha256 = "2ade869c3f42f23b5263c7d594aa3c7e5e61ac6a3afcaf5d6e42899d2a7986ce",
- strip_prefix = "libxsmm-1.8.1",
+ sha256 = "cd8532021352b4a0290d209f7f9bfd7c2411e08286a893af3577a43457287bfa",
+ strip_prefix = "libxsmm-1.9",
build_file = clean_dep("//third_party:libxsmm.BUILD"),
)
@@ -155,12 +155,12 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
tf_http_archive(
name = "com_googlesource_code_re2",
urls = [
- "https://mirror.bazel.build/github.com/google/re2/archive/26cd968b735e227361c9703683266f01e5df7857.tar.gz",
- "https://github.com/google/re2/archive/26cd968b735e227361c9703683266f01e5df7857.tar.gz",
+ "https://mirror.bazel.build/github.com/google/re2/archive/2018-04-01.tar.gz",
+ "https://github.com/google/re2/archive/2018-04-01.tar.gz",
],
- sha256 = "e57eeb837ac40b5be37b2c6197438766e73343ffb32368efea793dfd8b28653b",
- strip_prefix = "re2-26cd968b735e227361c9703683266f01e5df7857",
+ sha256 = "2f945446b71336e7f5a2bcace1abcf0b23fbba368266c6a1be33de3de3b3c912",
+ strip_prefix = "re2-2018-04-01",
)
tf_http_archive(
@@ -200,6 +200,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
urls = [
"https://mirror.bazel.build/www.nasm.us/pub/nasm/releasebuilds/2.12.02/nasm-2.12.02.tar.bz2",
"http://pkgs.fedoraproject.org/repo/pkgs/nasm/nasm-2.12.02.tar.bz2/d15843c3fb7db39af80571ee27ec6fad/nasm-2.12.02.tar.bz2",
+ "http://www.nasm.us/pub/nasm/releasebuilds/2.12.02/nasm-2.12.02.tar.bz2",
],
sha256 = "00b0891c678c065446ca59bcee64719d0096d54d6886e6e472aeee2e170ae324",
strip_prefix = "nasm-2.12.02",
@@ -298,11 +299,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
tf_http_archive(
name = "absl_py",
urls = [
- "https://mirror.bazel.build/github.com/abseil/abseil-py/archive/ea8c4d2ddbf3fba610c4d613260561699b776db8.tar.gz",
- "https://github.com/abseil/abseil-py/archive/ea8c4d2ddbf3fba610c4d613260561699b776db8.tar.gz",
+ "https://mirror.bazel.build/github.com/abseil/abseil-py/archive/pypi-v0.2.2.tar.gz",
+ "https://github.com/abseil/abseil-py/archive/pypi-v0.2.2.tar.gz",
],
- sha256 = "c30b48e0d2580ef1412e55c5c0e1dab8db2ee4ab56e2075eccff29c90c7c7059",
- strip_prefix = "abseil-py-ea8c4d2ddbf3fba610c4d613260561699b776db8",
+ sha256 = "95160f778a62c7a60ddeadc7bf2d83f85a23a27359814aca12cf949e896fa82c",
+ strip_prefix = "abseil-py-pypi-v0.2.2",
)
tf_http_archive(
@@ -392,12 +393,12 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
tf_http_archive(
name = "pcre",
- sha256 = "ccdf7e788769838f8285b3ee672ed573358202305ee361cfec7a4a4fb005bbc7",
+ sha256 = "69acbc2fbdefb955d42a4c606dfde800c2885711d2979e356c0636efde9ec3b5",
urls = [
- "https://mirror.bazel.build/ftp.exim.org/pub/pcre/pcre-8.39.tar.gz",
- "http://ftp.exim.org/pub/pcre/pcre-8.39.tar.gz",
+ "https://mirror.bazel.build/ftp.exim.org/pub/pcre/pcre-8.42.tar.gz",
+ "http://ftp.exim.org/pub/pcre/pcre-8.42.tar.gz",
],
- strip_prefix = "pcre-8.39",
+ strip_prefix = "pcre-8.42",
build_file = clean_dep("//third_party:pcre.BUILD"),
)
@@ -415,12 +416,12 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
tf_http_archive(
name = "curl",
- sha256 = "ff3e80c1ca6a068428726cd7dd19037a47cc538ce58ef61c59587191039b2ca6",
+ sha256 = "e9c37986337743f37fd14fe8737f246e97aec94b39d1b71e8a5973f72a9fc4f5",
urls = [
- "https://mirror.bazel.build/curl.haxx.se/download/curl-7.49.1.tar.gz",
- "https://curl.haxx.se/download/curl-7.49.1.tar.gz",
+ "https://mirror.bazel.build/curl.haxx.se/download/curl-7.60.0.tar.gz",
+ "https://curl.haxx.se/download/curl-7.60.0.tar.gz",
],
- strip_prefix = "curl-7.49.1",
+ strip_prefix = "curl-7.60.0",
build_file = clean_dep("//third_party:curl.BUILD"),
)
@@ -462,22 +463,22 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
tf_http_archive(
name = "lmdb",
urls = [
- "https://mirror.bazel.build/github.com/LMDB/lmdb/archive/LMDB_0.9.19.tar.gz",
- "https://github.com/LMDB/lmdb/archive/LMDB_0.9.19.tar.gz",
+ "https://mirror.bazel.build/github.com/LMDB/lmdb/archive/LMDB_0.9.22.tar.gz",
+ "https://github.com/LMDB/lmdb/archive/LMDB_0.9.22.tar.gz",
],
- sha256 = "108532fb94c6f227558d45be3f3347b52539f0f58290a7bb31ec06c462d05326",
- strip_prefix = "lmdb-LMDB_0.9.19/libraries/liblmdb",
+ sha256 = "f3927859882eb608868c8c31586bb7eb84562a40a6bf5cc3e13b6b564641ea28",
+ strip_prefix = "lmdb-LMDB_0.9.22/libraries/liblmdb",
build_file = clean_dep("//third_party:lmdb.BUILD"),
)
tf_http_archive(
name = "jsoncpp_git",
urls = [
- "https://mirror.bazel.build/github.com/open-source-parsers/jsoncpp/archive/11086dd6a7eba04289944367ca82cea71299ed70.tar.gz",
- "https://github.com/open-source-parsers/jsoncpp/archive/11086dd6a7eba04289944367ca82cea71299ed70.tar.gz",
+ "https://mirror.bazel.build/github.com/open-source-parsers/jsoncpp/archive/1.8.4.tar.gz",
+ "https://github.com/open-source-parsers/jsoncpp/archive/1.8.4.tar.gz",
],
- sha256 = "07d34db40593d257324ec5fb9debc4dc33f29f8fb44e33a2eeb35503e61d0fe2",
- strip_prefix = "jsoncpp-11086dd6a7eba04289944367ca82cea71299ed70",
+ sha256 = "c49deac9e0933bcb7044f08516861a2d560988540b23de2ac1ad443b219afdb6",
+ strip_prefix = "jsoncpp-1.8.4",
build_file = clean_dep("//third_party:jsoncpp.BUILD"),
)
@@ -627,6 +628,16 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
licenses = ["notice"], # Apache 2.0
)
+ java_import_external(
+ name = "com_squareup_javapoet",
+ jar_sha256 = "5bb5abdfe4366c15c0da3332c57d484e238bd48260d6f9d6acf2b08fdde1efea",
+ jar_urls = [
+ "http://mirror.bazel.build/repo1.maven.org/maven2/com/squareup/javapoet/1.9.0/javapoet-1.9.0.jar",
+ "http://repo1.maven.org/maven2/com/squareup/javapoet/1.9.0/javapoet-1.9.0.jar",
+ ],
+ licenses = ["notice"], # Apache 2.0
+ )
+
tf_http_archive(
name = "com_google_pprof",
urls = [
@@ -684,11 +695,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
tf_http_archive(
name = "flatbuffers",
- strip_prefix = "flatbuffers-971a68110e4fc1bace10fcb6deeb189e7e1a34ce",
- sha256 = "874088d2ee0d9f8524191f77209556415f03dd44e156276edf19e5b90ceb5f55",
+ strip_prefix = "flatbuffers-1.9.0",
+ sha256 = "5ca5491e4260cacae30f1a5786d109230db3f3a6e5a0eb45d0d0608293d247e3",
urls = [
- "https://mirror.bazel.build/github.com/google/flatbuffers/archive/971a68110e4fc1bace10fcb6deeb189e7e1a34ce.tar.gz",
- "https://github.com/google/flatbuffers/archive/971a68110e4fc1bace10fcb6deeb189e7e1a34ce.tar.gz",
+ "https://mirror.bazel.build/github.com/google/flatbuffers/archive/v1.9.0.tar.gz",
+ "https://github.com/google/flatbuffers/archive/v1.9.0.tar.gz",
],
build_file = clean_dep("//third_party/flatbuffers:flatbuffers.BUILD"),
)
@@ -793,6 +804,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
actual = "@grpc//:grpc++",
)
+ native.bind(
+ name = "grpc_lib_unsecure",
+ actual = "@grpc//:grpc++_unsecure",
+ )
+
# Needed by gRPC
native.bind(
name = "libssl",
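
Every tf_http_archive entry touched above follows the same pattern: a bazel-mirror URL first, the canonical upstream URL as fallback, plus a sha256 and strip_prefix pinning the release. A hedged Python sketch of what that fetch-and-verify amounts to, purely for illustration; the real mechanism is Bazel's tf_http_archive repository rule, not this helper.

# Sketch: fetch an archive from a list of mirrors and verify its sha256,
# roughly what the tf_http_archive entries above declare. Illustrative only.
import hashlib
import urllib.request

def fetch_verified(urls, expected_sha256):
    last_error = None
    for url in urls:
        try:
            data = urllib.request.urlopen(url).read()
        except OSError as err:
            last_error = err
            continue  # fall through to the next mirror
        if hashlib.sha256(data).hexdigest() == expected_sha256:
            return data
        last_error = ValueError("sha256 mismatch for %s" % url)
    if last_error is None:
        raise RuntimeError("no URLs provided")
    raise last_error
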
diff --git a/third_party/curl.BUILD b/third_party/curl.BUILD
index 4def6f9489..1638b72161 100644
--- a/third_party/curl.BUILD
+++ b/third_party/curl.BUILD
@@ -7,6 +7,7 @@ exports_files(["COPYING"])
CURL_WIN_COPTS = [
"/Iexternal/curl/lib",
+ "/DBUILDING_LIBCURL",
"/DHAVE_CONFIG_H",
"/DCURL_DISABLE_FTP",
"/DCURL_DISABLE_NTLM",
@@ -49,6 +50,8 @@ cc_library(
"lib/curl_addrinfo.c",
"lib/curl_addrinfo.h",
"lib/curl_base64.h",
+ "lib/curl_ctype.c",
+ "lib/curl_ctype.h",
"lib/curl_des.h",
"lib/curl_endian.h",
"lib/curl_fnmatch.c",
@@ -75,6 +78,7 @@ cc_library(
"lib/curl_sec.h",
"lib/curl_setup.h",
"lib/curl_setup_once.h",
+ "lib/curl_sha256.h",
"lib/curl_sspi.c",
"lib/curl_sspi.h",
"lib/curl_threads.c",
@@ -134,6 +138,8 @@ cc_library(
"lib/md5.c",
"lib/memdebug.c",
"lib/memdebug.h",
+ "lib/mime.c",
+ "lib/mime.h",
"lib/mprintf.c",
"lib/multi.c",
"lib/multihandle.h",
@@ -153,8 +159,8 @@ cc_library(
"lib/pop3.h",
"lib/progress.c",
"lib/progress.h",
- "lib/rawstr.c",
- "lib/rawstr.h",
+ "lib/rand.c",
+ "lib/rand.h",
"lib/rtsp.c",
"lib/rtsp.h",
"lib/security.c",
@@ -162,8 +168,11 @@ cc_library(
"lib/select.h",
"lib/sendf.c",
"lib/sendf.h",
+ "lib/setopt.c",
+ "lib/setopt.h",
"lib/setup-os400.h",
"lib/setup-vms.h",
+ "lib/sha256.c",
"lib/share.c",
"lib/share.h",
"lib/sigpipe.h",
@@ -179,10 +188,10 @@ cc_library(
"lib/splay.c",
"lib/splay.h",
"lib/ssh.h",
+ "lib/strcase.c",
+ "lib/strcase.h",
"lib/strdup.c",
"lib/strdup.h",
- "lib/strequal.c",
- "lib/strequal.h",
"lib/strerror.c",
"lib/strerror.h",
"lib/strtok.c",
@@ -241,13 +250,12 @@ cc_library(
}),
hdrs = [
"include/curl/curl.h",
- "include/curl/curlbuild.h",
- "include/curl/curlrules.h",
"include/curl/curlver.h",
"include/curl/easy.h",
"include/curl/mprintf.h",
"include/curl/multi.h",
"include/curl/stdcheaders.h",
+ "include/curl/system.h",
"include/curl/typecheck-gcc.h",
],
copts = select({
@@ -256,6 +264,7 @@ cc_library(
"//conditions:default": [
"-Iexternal/curl/lib",
"-D_GNU_SOURCE",
+ "-DBUILDING_LIBCURL",
"-DHAVE_CONFIG_H",
"-DCURL_DISABLE_FTP",
"-DCURL_DISABLE_NTLM", # turning it off in configure is not enough
@@ -676,6 +685,7 @@ genrule(
"# define SIZEOF_INT 4",
"# define SIZEOF_LONG 8",
"# define SIZEOF_OFF_T 8",
+ "# define SIZEOF_CURL_OFF_T 8",
"# define SIZEOF_SHORT 2",
"# define SIZEOF_SIZE_T 8",
"# define SIZEOF_TIME_T 8",
diff --git a/third_party/flatbuffers/flatbuffers.BUILD b/third_party/flatbuffers/flatbuffers.BUILD
index 824c97be60..639dff2cd0 100644
--- a/third_party/flatbuffers/flatbuffers.BUILD
+++ b/third_party/flatbuffers/flatbuffers.BUILD
@@ -98,6 +98,8 @@ cc_binary(
"grpc/src/compiler/cpp_generator.h",
"grpc/src/compiler/go_generator.cc",
"grpc/src/compiler/go_generator.h",
+ "grpc/src/compiler/java_generator.cc",
+ "grpc/src/compiler/java_generator.h",
"grpc/src/compiler/schema_interface.h",
"src/flatc_main.cpp",
"src/idl_gen_cpp.cpp",
diff --git a/third_party/jsoncpp.BUILD b/third_party/jsoncpp.BUILD
index 65f98410b2..cf3cba0555 100644
--- a/third_party/jsoncpp.BUILD
+++ b/third_party/jsoncpp.BUILD
@@ -6,7 +6,6 @@ cc_library(
name = "jsoncpp",
srcs = [
"include/json/assertions.h",
- "src/lib_json/json_batchallocator.h",
"src/lib_json/json_reader.cpp",
"src/lib_json/json_tool.h",
"src/lib_json/json_value.cpp",
@@ -20,9 +19,13 @@ cc_library(
"include/json/json.h",
"include/json/reader.h",
"include/json/value.h",
+ "include/json/version.h",
"include/json/writer.h",
],
- copts = ["-DJSON_USE_EXCEPTION=0"],
+ copts = [
+ "-DJSON_USE_EXCEPTION=0",
+ "-DJSON_HAS_INT64",
+ ],
includes = ["include"],
visibility = ["//visibility:public"],
deps = [":private"],
diff --git a/third_party/libxsmm.BUILD b/third_party/libxsmm.BUILD
index 78ed1f4e16..ee49d281ab 100644
--- a/third_party/libxsmm.BUILD
+++ b/third_party/libxsmm.BUILD
@@ -3,7 +3,7 @@
licenses(["notice"]) # BSD 3-clause
-exports_files(["LICENSE"])
+exports_files(["LICENSE.md"])
# Arguments to ./scripts/libxsmm_interface.py, see that file for detailed description.
# precision: SP & DP