aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorGravatar Tim Shen <timshen@google.com>2018-08-15 17:08:11 -0700
committerGravatar TensorFlower Gardener <gardener@tensorflow.org>2018-08-15 17:12:37 -0700
commitc3ef5d7034a50ca1b500c6fabea9250d38628884 (patch)
tree851410c7e0e9112bad6635b3404ba8a82fecacd2
parenta10219e1de775ca16281f1b597f7bf4d60d0585f (diff)
Implement cuDNN algorithm cross checking.
PiperOrigin-RevId: 208910038
-rw-r--r--tensorflow/compiler/xla/service/gpu/BUILD1
-rw-r--r--tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc51
2 files changed, 52 insertions, 0 deletions
diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD
index 19575c7905..cb9b937854 100644
--- a/tensorflow/compiler/xla/service/gpu/BUILD
+++ b/tensorflow/compiler/xla/service/gpu/BUILD
@@ -362,6 +362,7 @@ cc_library(
hdrs = ["cudnn_convolution_algorithm_picker.h"],
deps = [
":backend_configs",
+ ":buffer_comparator",
":cudnn_convolution_runner",
":gpu_executable",
":ir_emission_utils",
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc
index 7d93bdfc8b..5a2377c32f 100644
--- a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc
@@ -16,6 +16,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.h"
#include "tensorflow/compiler/xla/literal_util.h"
#include "tensorflow/compiler/xla/service/gpu/backend_configs.pb.h"
+#include "tensorflow/compiler/xla/service/gpu/buffer_comparator.h"
#include "tensorflow/compiler/xla/service/gpu/convolution_thunk.h"
#include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
#include "tensorflow/core/lib/gtl/optional.h"
@@ -177,6 +178,12 @@ CudnnConvolutionAlgorithmPicker::PickBestAlgorithm(
CudnnConvKind kind, const Shape& input_shape, const Shape& filter_shape,
const Shape& output_shape, const Window& window,
const ConvolutionDimensionNumbers& dnums, HloInstruction* instr) {
+ CHECK_EQ(input_shape.element_type(), filter_shape.element_type());
+ CHECK_EQ(input_shape.element_type(), output_shape.element_type());
+ // TODO(timshen): for now only check fp16. It can be expanded to other types,
+ // with some work on the HLO routines.
+ const bool cross_check_enabled = input_shape.element_type() == xla::F16;
+
// Don't run this function concurrently on the same GPU.
//
// This is a bit of a hack and doesn't protect us against arbitrary concurrent
@@ -225,11 +232,27 @@ CudnnConvolutionAlgorithmPicker::PickBestAlgorithm(
.ThenMemZero(&output_buf, output_buf.size())
.BlockHostUntilDone());
+ DeviceMemoryBase* result_buf = [&] {
+ switch (kind) {
+ case CudnnConvKind::kBackwardFilter:
+ return &filter_buf;
+ case CudnnConvKind::kBackwardInput:
+ return &input_buf;
+ case CudnnConvKind::kForward:
+ return &output_buf;
+ }
+ }();
+
const bool use_winograd_nonfused = ShouldIncludeWinogradNonfusedAlgo(
input_shape, output_shape, dnums, stream_exec_);
se::dnn::ProfileResult best_result;
int64 best_result_bytes_used = 0;
+ optional<F16BufferComparator> comparator;
+ // Use the first algorithm that's supported as reference. There isn't a
+ // particular reason to use it, as any algorithm suffices. It doesn't make
+ // this algorithm considered correct, though.
+ optional<AlgorithmDesc> first_algorithm;
for (const AlgorithmDesc& alg :
GetAlgorithms(kind, use_winograd_nonfused, stream_exec_)) {
ScratchAllocator scratch_allocator(device_ordinal, allocator);
@@ -245,6 +268,34 @@ CudnnConvolutionAlgorithmPicker::PickBestAlgorithm(
.ok();
if (launch_ok && profile_result.is_valid()) {
+ if (comparator.has_value()) {
+ StatusOr<bool> result = comparator->CompareEqual(
+ se::DeviceMemory<Eigen::half>(*result_buf));
+ if (!result.ok()) {
+ LOG(ERROR) << "Unable to compare "
+ << AlgorithmToString(*first_algorithm) << " against "
+ << AlgorithmToString(alg) << " for " << instr->ToString()
+ << ": " << result.status();
+ } else if (!result.ValueOrDie()) {
+ LOG(ERROR) << "Results mismatch between different convolution "
+ "algorithms. This is likely a bug in convolution, or "
+ "an excessive loss of precision in convolution. "
+ << instr->ToString() << " for "
+ << AlgorithmToString(*first_algorithm) << " vs "
+ << AlgorithmToString(alg);
+ }
+ } else if (cross_check_enabled) {
+ auto comp = F16BufferComparator::Create(
+ se::DeviceMemory<Eigen::half>(*result_buf), compiler_, allocator,
+ &stream);
+ if (comp.ok()) {
+ comparator.emplace(comp.ConsumeValueOrDie());
+ first_algorithm.emplace(alg);
+ } else {
+ LOG(ERROR) << "Fail to initialize buffer comparator: "
+ << comp.status() << ", instruction: " << instr->ToString();
+ }
+ }
int64 scratch_bytes_used = scratch_allocator.TotalAllocatedBytes();
VLOG(3) << "Run of algorithm " << AlgorithmToString(alg)
<< " succeeded, taking " << profile_result.elapsed_time_in_ms()