aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
-rw-r--r--tensorflow/BUILD1
-rw-r--r--tensorflow/compiler/xla/tests/cpu/BUILD99
-rw-r--r--tensorflow/compiler/xla/tests/cpu/cpu_bytesizeof_test.cc37
-rw-r--r--tensorflow/compiler/xla/tests/cpu/cpu_codegen_test.h30
-rw-r--r--tensorflow/compiler/xla/tests/cpu/cpu_external_constants_test.cc73
-rw-r--r--tensorflow/compiler/xla/tests/cpu/cpu_fusion_test.cc330
-rw-r--r--tensorflow/compiler/xla/tests/cpu/cpu_intrinsic_test.cc150
7 files changed, 720 insertions, 0 deletions
diff --git a/tensorflow/BUILD b/tensorflow/BUILD
index 5bb31d7df1..065e61efca 100644
--- a/tensorflow/BUILD
+++ b/tensorflow/BUILD
@@ -330,6 +330,7 @@ filegroup(
"//tensorflow/compiler/xla/service/interpreter:all_files",
"//tensorflow/compiler/xla/service/llvm_ir:all_files",
"//tensorflow/compiler/xla/tests:all_files",
+ "//tensorflow/compiler/xla/tests/cpu:all_files",
"//tensorflow/compiler/xla/tools:all_files",
"//tensorflow/contrib:all_files",
"//tensorflow/contrib/all_reduce:all_files",
diff --git a/tensorflow/compiler/xla/tests/cpu/BUILD b/tensorflow/compiler/xla/tests/cpu/BUILD
new file mode 100644
index 0000000000..e0253b6a6b
--- /dev/null
+++ b/tensorflow/compiler/xla/tests/cpu/BUILD
@@ -0,0 +1,99 @@
+# Description:
+# Tests for CPU, in C++, against the XLA API, using the in-process
+# client library.
+
+licenses(["notice"]) # Apache 2.0
+
+package(
+ default_visibility = [":friends"],
+)
+
+package_group(
+ name = "friends",
+ includes = [
+ "//tensorflow/compiler/xla:friends",
+ ],
+)
+
+load("//tensorflow:tensorflow.bzl", "tf_cc_test")
+
+tf_cc_test(
+ name = "cpu_fusion_test",
+ srcs = ["cpu_fusion_test.cc"],
+ deps = [
+ "//tensorflow/compiler/xla:literal_util",
+ "//tensorflow/compiler/xla:shape_util",
+ "//tensorflow/compiler/xla:util",
+ "//tensorflow/compiler/xla:xla_data_proto",
+ "//tensorflow/compiler/xla/service:cpu_plugin",
+ "//tensorflow/compiler/xla/service:hlo",
+ "//tensorflow/compiler/xla/service/cpu:cpu_instruction_fusion",
+ "//tensorflow/compiler/xla/tests:hlo_test_base",
+ "//tensorflow/compiler/xla/tests:literal_test_util",
+ "//tensorflow/core:test",
+ "//tensorflow/core:test_main",
+ ],
+)
+
+tf_cc_test(
+ name = "cpu_bytesizeof_test",
+ srcs = ["cpu_bytesizeof_test.cc"],
+ deps = [
+ "//tensorflow/compiler/xla:shape_util",
+ "//tensorflow/compiler/xla/service/llvm_ir:llvm_util",
+ "//tensorflow/core:test",
+ "//tensorflow/core:test_main",
+ ],
+)
+
+tf_cc_test(
+ name = "cpu_external_constants_test",
+ srcs = ["cpu_external_constants_test.cc"],
+ deps = [
+ ":cpu_codegen_test",
+ "//tensorflow/compiler/xla:array2d",
+ "//tensorflow/compiler/xla:shape_util",
+ "//tensorflow/compiler/xla/service:hlo",
+ "//tensorflow/compiler/xla/tests:filecheck",
+ "//tensorflow/core:test",
+ ],
+)
+
+cc_library(
+ name = "cpu_codegen_test",
+ testonly = True,
+ hdrs = ["cpu_codegen_test.h"],
+ deps = [
+ "//tensorflow/compiler/xla/service:cpu_plugin",
+ "//tensorflow/compiler/xla/tests:llvm_irgen_test_base",
+ "//tensorflow/core:test",
+ "//tensorflow/core:test_main",
+ ],
+)
+
+tf_cc_test(
+ name = "cpu_intrinsic_test",
+ srcs = ["cpu_intrinsic_test.cc"],
+ deps = [
+ ":cpu_codegen_test",
+ "//tensorflow/compiler/xla/service:hlo",
+ "//tensorflow/compiler/xla/service/cpu:cpu_compiler",
+ "//tensorflow/core:lib",
+ "//tensorflow/core:test",
+ "//tensorflow/core:test_main",
+ ],
+)
+
+# -----------------------------------------------------------------------------
+
+filegroup(
+ name = "all_files",
+ srcs = glob(
+ ["**/*"],
+ exclude = [
+ "**/METADATA",
+ "**/OWNERS",
+ ],
+ ),
+ visibility = ["//tensorflow:__subpackages__"],
+)
diff --git a/tensorflow/compiler/xla/tests/cpu/cpu_bytesizeof_test.cc b/tensorflow/compiler/xla/tests/cpu/cpu_bytesizeof_test.cc
new file mode 100644
index 0000000000..3f2bbbd076
--- /dev/null
+++ b/tensorflow/compiler/xla/tests/cpu/cpu_bytesizeof_test.cc
@@ -0,0 +1,37 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/core/platform/test.h"
+
+class CpuByteSizeOfTest : public ::testing::Test {};
+
+TEST_F(CpuByteSizeOfTest, ARM32) {
+ llvm::DataLayout data_layout(
+ "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64");
+ auto tuple_shape =
+ xla::ShapeUtil::MakeTupleShape({xla::ShapeUtil::MakeShape(xla::F32, {})});
+ EXPECT_EQ(xla::llvm_ir::ByteSizeOf(tuple_shape, data_layout),
+ data_layout.getPointerSize());
+}
+
+TEST_F(CpuByteSizeOfTest, ARM64) {
+ llvm::DataLayout data_layout("e-m:e-i64:64-i128:128-n32:64-S128");
+ auto tuple_shape =
+ xla::ShapeUtil::MakeTupleShape({xla::ShapeUtil::MakeShape(xla::F32, {})});
+ EXPECT_EQ(xla::llvm_ir::ByteSizeOf(tuple_shape, data_layout),
+ data_layout.getPointerSize());
+}
diff --git a/tensorflow/compiler/xla/tests/cpu/cpu_codegen_test.h b/tensorflow/compiler/xla/tests/cpu/cpu_codegen_test.h
new file mode 100644
index 0000000000..a6ca00b07d
--- /dev/null
+++ b/tensorflow/compiler/xla/tests/cpu/cpu_codegen_test.h
@@ -0,0 +1,30 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_TESTS_CPU_CPU_CODEGEN_TEST_H_
+#define TENSORFLOW_COMPILER_XLA_TESTS_CPU_CPU_CODEGEN_TEST_H_
+
+#include "tensorflow/compiler/xla/tests/llvm_irgen_test_base.h"
+
+namespace xla {
+namespace cpu {
+
+// Tests that verify IR emitted by the CPU backend is as expected.
+class CpuCodegenTest : public LLVMIRGenTestBase {};
+
+}  // namespace cpu
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_TESTS_CPU_CPU_CODEGEN_TEST_H_
diff --git a/tensorflow/compiler/xla/tests/cpu/cpu_external_constants_test.cc b/tensorflow/compiler/xla/tests/cpu/cpu_external_constants_test.cc
new file mode 100644
index 0000000000..14f223e05e
--- /dev/null
+++ b/tensorflow/compiler/xla/tests/cpu/cpu_external_constants_test.cc
@@ -0,0 +1,73 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <memory>
+#include <utility>
+
+#include "tensorflow/compiler/xla/array2d.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/tests/cpu/cpu_codegen_test.h"
+#include "tensorflow/compiler/xla/tests/filecheck.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace xla {
+namespace cpu {
+namespace {
+class CpuExternalConstantsTest : public CpuCodegenTest {
+ public:
+ void TestWithArray(int64 rows, int64 cols, const char* filecheck_pattern) {
+ HloComputation::Builder builder(TestName());
+
+ Array2D<float> backing_array(rows, cols);
+ backing_array.FillUnique();
+
+ auto shape = ShapeUtil::MakeShape(F32, {rows, cols});
+
+ HloInstruction* constant =
+ builder.AddInstruction(HloInstruction::CreateConstant(
+ Literal::CreateR2FromArray2D(backing_array)));
+ HloInstruction* param =
+ builder.AddInstruction(HloInstruction::CreateParameter(0, shape, "x"));
+ builder.AddInstruction(
+ HloInstruction::CreateBinary(shape, HloOpcode::kAdd, param, constant));
+
+ std::unique_ptr<HloModule> module = CreateNewModule();
+ module->AddEntryComputation(builder.Build());
+
+ CompileAndVerifyIr(std::move(module), filecheck_pattern,
+ /*match_optimized_ir=*/false);
+ }
+};
+
+TEST_F(CpuExternalConstantsTest, Basic) {
+ TestWithArray(/*rows=*/1024, /*cols=*/1024, R"(
+CHECK: @constant_global_0 = external constant [1024 x [1024 x float]], align 16
+)");
+}
+
+TEST_F(CpuExternalConstantsTest, BasicNegative) {
+ // The constant array in this test case is small enough that there is no need
+ // to externalize it.
+ TestWithArray(/*rows=*/4, /*cols=*/4, R"(
+CHECK-NOT: @constant_global_0 = external constant [4 x [4 x float]], align 8
+CHECK: @0 = private constant [4 x [4 x float]] {{.*}}, align 8
+)");
+}
+} // namespace
+} // namespace cpu
+} // namespace xla
diff --git a/tensorflow/compiler/xla/tests/cpu/cpu_fusion_test.cc b/tensorflow/compiler/xla/tests/cpu/cpu_fusion_test.cc
new file mode 100644
index 0000000000..9231d3960e
--- /dev/null
+++ b/tensorflow/compiler/xla/tests/cpu/cpu_fusion_test.cc
@@ -0,0 +1,330 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <memory>
+#include <utility>
+#include <vector>
+
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/ptr_util.h"
+#include "tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/compiler/xla/tests/literal_test_util.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace xla {
+namespace cpu {
+namespace {
+
+class CpuFusionTest : public HloTestBase {
+ protected:
+ CpuFusionTest() {}
+
+ ErrorSpec error_spec_{0.0001, 1e-5};
+};
+
+TEST_F(CpuFusionTest, FuseTwoElementwiseOps) {
+ auto builder = HloComputation::Builder(TestName());
+ auto input_literal1 = Literal::CreateR1<float>({1.0, 2.0, 3.0});
+ auto input_literal2 = Literal::CreateR1<float>({-2.0, -42.0, 2.0});
+ Shape vshape = input_literal1->shape();
+
+ auto input1 = builder.AddInstruction(
+ HloInstruction::CreateConstant(std::move(input_literal1)));
+ auto input2 = builder.AddInstruction(
+ HloInstruction::CreateConstant(std::move(input_literal2)));
+
+ auto add1 = builder.AddInstruction(
+ HloInstruction::CreateBinary(vshape, HloOpcode::kAdd, input1, input2));
+ builder.AddInstruction(
+ HloInstruction::CreateUnary(vshape, HloOpcode::kNegate, add1));
+
+ auto module = CreateNewModule();
+ module->AddEntryComputation(builder.Build());
+
+ CpuInstructionFusion fusion;
+ EXPECT_TRUE(fusion.Run(module.get()).ValueOrDie());
+
+ // The computation root instruction was fused. Verify the fusion instruction
+ // is now the root.
+ auto computation = module->entry_computation();
+ auto fusion_instruction = computation->root_instruction();
+ EXPECT_EQ(HloOpcode::kFusion, fusion_instruction->opcode());
+ EXPECT_EQ(HloOpcode::kNegate,
+ fusion_instruction->fused_expression_root()->opcode());
+ // There should be four fused instructions: 2 parameters, the add, and the
+ // negate.
+ EXPECT_EQ(4, fusion_instruction->fused_instruction_count());
+
+ // Compile and execute the computation.
+ auto result = ExecuteAndTransfer(std::move(module), {});
+
+ // Check the output correctness.
+ LiteralTestUtil::ExpectR1Near<float>({1.0, 40.0, -5.0}, *result, error_spec_);
+}
+
+TEST_F(CpuFusionTest, FuseElementwiseOpChain) {
+ auto builder = HloComputation::Builder(TestName());
+ auto input_literal = Literal::CreateR1<float>({-1.5, -2.5, -3.0});
+ Shape vshape = input_literal->shape();
+
+ auto input = builder.AddInstruction(
+ HloInstruction::CreateConstant(std::move(input_literal)));
+ auto negate = builder.AddInstruction(
+ HloInstruction::CreateUnary(vshape, HloOpcode::kNegate, input));
+ auto ceil = builder.AddInstruction(
+ HloInstruction::CreateUnary(vshape, HloOpcode::kCeil, negate));
+ auto exp = builder.AddInstruction(
+ HloInstruction::CreateUnary(vshape, HloOpcode::kExp, ceil));
+ auto floor = builder.AddInstruction(
+ HloInstruction::CreateUnary(vshape, HloOpcode::kFloor, exp));
+ auto two = builder.AddInstruction(
+ HloInstruction::CreateConstant(Literal::CreateR0<float>(2.0)));
+ builder.AddInstruction(
+ HloInstruction::CreateBinary(vshape, HloOpcode::kMultiply, two, floor));
+
+ auto module = CreateNewModule();
+ module->AddEntryComputation(builder.Build());
+
+ CpuInstructionFusion fusion;
+ EXPECT_TRUE(fusion.Run(module.get()).ValueOrDie());
+
+ // The computation root instruction was fused. Verify the fusion instruction
+ // is now the root.
+ auto computation = module->entry_computation();
+ auto fusion_instruction = computation->root_instruction();
+ EXPECT_EQ(HloOpcode::kFusion, fusion_instruction->opcode());
+ EXPECT_EQ(HloOpcode::kMultiply,
+ fusion_instruction->fused_expression_root()->opcode());
+ // There should be 7 fused instructions: 2 parameters and the fused
+ // operations.
+ EXPECT_EQ(7, fusion_instruction->fused_instruction_count());
+
+ // Compile and execute the computation.
+ auto result = ExecuteAndTransfer(std::move(module), {});
+
+ // Check the output correctness.
+ LiteralTestUtil::ExpectR1Near<float>({14.0, 40.0, 40.0}, *result,
+ error_spec_);
+}
+
+TEST_F(CpuFusionTest, ElementwiseOpChainWithNonfusableInstruction) {
+ // Test a chain of fusable ops with a non-fusable op (a reduce) thrown in the
+ // middle.
+ auto module = CreateNewModule();
+ auto builder = HloComputation::Builder(TestName());
+ auto input_literal = Literal::CreateR1<float>({-1.5, -2.5, -3.0});
+ Shape vshape = input_literal->shape();
+
+ auto input = builder.AddInstruction(
+ HloInstruction::CreateConstant(std::move(input_literal)));
+ auto negate = builder.AddInstruction(
+ HloInstruction::CreateUnary(vshape, HloOpcode::kNegate, input));
+ auto ceil = builder.AddInstruction(
+ HloInstruction::CreateUnary(vshape, HloOpcode::kCeil, negate));
+
+ auto cshape = ShapeUtil::MakeShape(F32, {6});
+ auto concatenate = builder.AddInstruction(
+ HloInstruction::CreateConcatenate(cshape, {ceil, ceil}, /*dimension=*/0));
+
+ // Build an x+y computation to use in a reduce.
+ Shape r0f32 = ShapeUtil::MakeShape(F32, {});
+ auto embedded_builder = HloComputation::Builder("f32+f32");
+ embedded_builder.AddInstruction(HloInstruction::CreateBinary(
+ r0f32, HloOpcode::kAdd,
+ embedded_builder.AddInstruction(
+ HloInstruction::CreateParameter(0, r0f32, "x")),
+ embedded_builder.AddInstruction(
+ HloInstruction::CreateParameter(1, r0f32, "y"))));
+ auto add_f32 = module->AddEmbeddedComputation(embedded_builder.Build());
+
+ // This is a nop reduction.
+ auto reduce = builder.AddInstruction(HloInstruction::CreateReduce(
+ cshape,
+ builder.AddInstruction(HloInstruction::CreateReshape(
+ ShapeUtil::MakeShape(F32, {6, 1}), concatenate)),
+ /*init_value=*/
+ builder.AddInstruction(
+ HloInstruction::CreateConstant(Literal::CreateR0<float>(0))),
+ /*dimensions_to_reduce=*/{1}, add_f32));
+
+ auto exp = builder.AddInstruction(
+ HloInstruction::CreateUnary(cshape, HloOpcode::kExp, reduce));
+ auto floor = builder.AddInstruction(
+ HloInstruction::CreateUnary(cshape, HloOpcode::kFloor, exp));
+ auto two = builder.AddInstruction(
+ HloInstruction::CreateConstant(Literal::CreateR0<float>(2.0)));
+ builder.AddInstruction(
+ HloInstruction::CreateBinary(cshape, HloOpcode::kMultiply, two, floor));
+
+ module->AddEntryComputation(builder.Build());
+
+ CpuInstructionFusion fusion;
+ EXPECT_TRUE(fusion.Run(module.get()).ValueOrDie());
+
+ // The computation root instruction was fused. Verify the fusion instruction
+ // is now the root.
+ auto computation = module->entry_computation();
+
+ auto fusion_instruction1 = computation->root_instruction();
+ EXPECT_EQ(HloOpcode::kFusion, fusion_instruction1->opcode());
+ EXPECT_EQ(HloOpcode::kMultiply,
+ fusion_instruction1->fused_expression_root()->opcode());
+ // There should be 5 fused instructions in the root fusion instruction: 2
+ // parameters, multiply, floor, and exp.
+ EXPECT_EQ(5, fusion_instruction1->fused_instruction_count())
+ << fusion_instruction1->fused_instructions_computation()->ToString();
+
+ auto fusion_instruction2 = reduce->operand(0);
+  EXPECT_EQ(HloOpcode::kFusion, fusion_instruction2->opcode());
+ EXPECT_EQ(HloOpcode::kReshape,
+ fusion_instruction2->fused_expression_root()->opcode());
+ // There should be 5 fused instructions in the second fusion instruction: 1
+ // parameter, negate, ceil, concat, and reshape.
+ EXPECT_EQ(5, fusion_instruction2->fused_instruction_count())
+ << fusion_instruction2->fused_instructions_computation()->ToString();
+
+ // Compile and execute the computation.
+ auto result = ExecuteAndTransfer(std::move(module), {});
+
+ // Check the output correctness.
+ LiteralTestUtil::ExpectR1Near<float>({14.0, 40.0, 40.0, 14.0, 40.0, 40.0},
+ *result, error_spec_);
+}
+
+TEST_F(CpuFusionTest, TestOperandOrderToAvoidDuplication) {
+ // Test that the operands of an instruction to be fused are considered in the
+ // proper order to avoid duplication. Test input:
+ //
+ // constant = {...}
+ // negate = neg(constant)
+ // ceil = ceil(negate)
+  //   add1 = multiply(negate, ceil)
+  //   add2 = multiply(ceil, negate)
+ //
+ // In this example, the operands of both add1 and add2 should be fused in the
+ // order {ceil, negate} even though they have different orders in their
+ // operand vectors. Test for this problem by counting the number of nodes in
+ // each fusion instruction to ensure that negate is not duplicated.
+ auto builder = HloComputation::Builder(TestName());
+ auto input_literal = Literal::CreateR1<float>({1.0, 2.0, 3.0});
+ Shape vshape = input_literal->shape();
+
+ auto constant = builder.AddInstruction(
+ HloInstruction::CreateConstant(std::move(input_literal)));
+ auto negate = builder.AddInstruction(
+ HloInstruction::CreateUnary(vshape, HloOpcode::kNegate, constant));
+ auto ceil = builder.AddInstruction(
+ HloInstruction::CreateUnary(vshape, HloOpcode::kCeil, negate));
+
+ auto add1 = builder.AddInstruction(
+ HloInstruction::CreateBinary(vshape, HloOpcode::kMultiply, negate, ceil));
+ auto add2 = builder.AddInstruction(
+ HloInstruction::CreateBinary(vshape, HloOpcode::kMultiply, ceil, negate));
+
+  // Tie together the two multiplies with a tuple to create a single root.
+ auto result =
+ builder.AddInstruction(HloInstruction::CreateTuple({add1, add2}));
+
+ // Create computation and module.
+ auto module = CreateNewModule();
+ module->AddEntryComputation(builder.Build());
+
+ // Run fusion.
+ CpuInstructionFusion fusion;
+ EXPECT_TRUE(fusion.Run(module.get()).ValueOrDie());
+
+ auto fusion1 = result->operand(0);
+ auto fusion2 = result->operand(1);
+ EXPECT_EQ(HloOpcode::kFusion, fusion1->opcode());
+ EXPECT_EQ(HloOpcode::kFusion, fusion2->opcode());
+
+  // Each fusion instruction should have 4 fused instructions inside: multiply,
+  // ceil, negate, and the fused parameter.
+ EXPECT_EQ(4, fusion1->fused_instruction_count());
+ EXPECT_EQ(4, fusion2->fused_instruction_count());
+
+ // Each fusion instruction should have one parameter and the parameter should
+ // be the constant.
+ EXPECT_EQ(1, fusion1->operand_count());
+ EXPECT_EQ(constant, fusion1->operand(0));
+ EXPECT_EQ(1, fusion2->operand_count());
+ EXPECT_EQ(constant, fusion2->operand(0));
+}
+
+TEST_F(CpuFusionTest, DoNotDuplicateExpensiveOps) {
+ // Verify that expensive operations will not be fused if the fusion results in
+ // duplication. Test code:
+ //
+ // constant = 42.0
+ // exp1 = exp(constant)
+ // negate1 = negate(exp1)
+ // exp2 = exp(constant)
+ // negate2 = negate(exp2)
+ // tuple = tuple(negate1, negate2, exp2)
+ //
+ // exp1 should be fused down into negate1, but exp2 will not be fused into
+ // negate2 because this will result in duplication of the expensive exp
+ // computation. The duplication is caused by the other use of exp2 in the
+ // tuple.
+ auto builder = HloComputation::Builder(TestName());
+ auto input_literal1 = Literal::CreateR1<float>({1.0, 2.0, 3.0});
+ auto input_literal2 = Literal::CreateR1<float>({-2.0, -42.0, 2.0});
+ auto constant = builder.AddInstruction(
+ HloInstruction::CreateConstant(Literal::CreateR0<float>(42.0)));
+ Shape shape = constant->shape();
+
+ auto exp1 = builder.AddInstruction(
+ HloInstruction::CreateUnary(shape, HloOpcode::kExp, constant));
+ auto negate1 = builder.AddInstruction(
+ HloInstruction::CreateUnary(shape, HloOpcode::kNegate, exp1));
+
+ auto exp2 = builder.AddInstruction(
+ HloInstruction::CreateUnary(shape, HloOpcode::kExp, constant));
+ auto negate2 = builder.AddInstruction(
+ HloInstruction::CreateUnary(shape, HloOpcode::kNegate, exp2));
+
+ auto tuple = builder.AddInstruction(
+ HloInstruction::CreateTuple({negate1, negate2, exp2}));
+
+ auto module = CreateNewModule();
+ module->AddEntryComputation(builder.Build());
+
+ CpuInstructionFusion fusion;
+ EXPECT_TRUE(fusion.Run(module.get()).ValueOrDie());
+
+ // The only fusion instruction should be operand 0 of the tuple (formerly
+ // negate1).
+ EXPECT_EQ(HloOpcode::kFusion, tuple->operand(0)->opcode());
+ EXPECT_EQ(HloOpcode::kNegate, tuple->operand(1)->opcode());
+ EXPECT_EQ(HloOpcode::kExp, tuple->operand(2)->opcode());
+
+ auto fusion_inst = tuple->operand(0);
+  // There should be three fused instructions: negate1, exp1, and the fused
+  // parameter.
+ EXPECT_EQ(3, fusion_inst->fused_instruction_count());
+ EXPECT_EQ(1, fusion_inst->operand_count());
+ EXPECT_EQ(constant, fusion_inst->operand(0));
+}
+
+} // namespace
+} // namespace cpu
+} // namespace xla
diff --git a/tensorflow/compiler/xla/tests/cpu/cpu_intrinsic_test.cc b/tensorflow/compiler/xla/tests/cpu/cpu_intrinsic_test.cc
new file mode 100644
index 0000000000..15a8a44e4c
--- /dev/null
+++ b/tensorflow/compiler/xla/tests/cpu/cpu_intrinsic_test.cc
@@ -0,0 +1,150 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <algorithm>
+#include <cctype>
+#include <string>
+
+#include "tensorflow/compiler/xla/service/cpu/cpu_compiler.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/tests/cpu/cpu_codegen_test.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace xla {
+namespace cpu {
+namespace {
+
+const char* const kTriple_x86_64 = "x86_64-pc-linux";
+const char* const kTriple_android_arm = "armv7-none-android";
+
+struct IntrinsicTestSpec {
+ HloOpcode opcode;
+ tensorflow::StringPiece triple;
+ tensorflow::StringPiece features;
+ tensorflow::StringPiece check_lines;
+};
+
+// Tests that unary functions get lowered using intrinsic calls.
+class CpuUnaryIntrinsicTest
+ : public CpuCodegenTest,
+ public ::testing::WithParamInterface<IntrinsicTestSpec> {
+ public:
+ static string Name(const ::testing::TestParamInfo<IntrinsicTestSpec>& info) {
+ auto spec = info.param;
+
+ string opcode = HloOpcodeString(spec.opcode);
+ opcode[0] = toupper(opcode[0]);
+
+ string triple{spec.triple.data(), spec.triple.size()};
+ if (triple == kTriple_x86_64) {
+ triple = "x86_64";
+ } else if (triple == kTriple_android_arm) {
+ triple = "android_arm";
+ } else {
+ triple = "Unknown";
+ }
+
+ string features{spec.features.data(), spec.features.size()};
+ if (!features.empty()) {
+ std::replace_if(features.begin(), features.end(),
+ [](char c) { return c != '_' && !isalnum(c); }, '_');
+ } else {
+ features = "";
+ }
+
+ return tensorflow::strings::StrCat(opcode.c_str(), "_On_", triple.c_str(),
+ features.empty() ? "" : "_With",
+ features.c_str());
+ }
+};
+
+// Creates a module with a call to the unary op, and tests if the
+// compiler replaced it with a call to the intrinsic.
+TEST_P(CpuUnaryIntrinsicTest, DoIt) {
+ HloComputation::Builder builder(TestName());
+ IntrinsicTestSpec spec = GetParam();
+
+ auto param_shape = ShapeUtil::MakeShape(F32, {1024});
+ HloInstruction* param = builder.AddInstruction(
+ HloInstruction::CreateParameter(0, param_shape, "input"));
+ builder.AddInstruction(
+ HloInstruction::CreateUnary(param_shape, spec.opcode, param));
+ std::unique_ptr<HloComputation> computation = builder.Build();
+
+ string triple{spec.triple.data(), spec.triple.size()};
+ string features{spec.features.data(), spec.features.size()};
+
+ CpuAotCompilationOptions options{
+ /*triple=*/triple, /*cpu_name=*/"", /*features=*/features,
+ /*entry_point_name=*/"entry",
+ /*relocation_model=*/CpuAotCompilationOptions::RelocationModel::Static};
+
+ auto hlo_module = CreateNewModule();
+ hlo_module->AddEntryComputation(std::move(computation));
+
+ string check_lines{spec.check_lines.data(), spec.check_lines.size()};
+
+ CompileAheadOfTimeAndVerifyIr(std::move(hlo_module), options, check_lines,
+ /*match_optimized_ir=*/true);
+}
+
+IntrinsicTestSpec CpuUnaryIntrinsicTestCases[] = {
+ IntrinsicTestSpec{
+ HloOpcode::kExp, kTriple_x86_64, "+sse4.1",
+ R"(CHECK: call fast <4 x float> @__xla_cpu_runtime_ExpV4F32SSE(<4 x float> %wide.load))"},
+
+ IntrinsicTestSpec{
+ HloOpcode::kExp, kTriple_x86_64, "+avx",
+ R"(CHECK: call fast <8 x float> @__xla_cpu_runtime_ExpV8F32AVX(<8 x float> %wide.load))"},
+
+ IntrinsicTestSpec{
+ HloOpcode::kExp, kTriple_android_arm, "+neon",
+ R"(CHECK: call fast <4 x float> @__xla_cpu_runtime_ExpV4F32NEON(<4 x float> %wide.load))"},
+
+ IntrinsicTestSpec{
+ HloOpcode::kLog, kTriple_x86_64, "+sse4.1",
+ R"(CHECK: call fast <4 x float> @__xla_cpu_runtime_LogV4F32SSE(<4 x float> %wide.load))"},
+
+ IntrinsicTestSpec{
+ HloOpcode::kLog, kTriple_x86_64, "+avx",
+ R"(CHECK: call fast <8 x float> @__xla_cpu_runtime_LogV8F32AVX(<8 x float> %wide.load))"},
+
+ IntrinsicTestSpec{
+ HloOpcode::kLog, kTriple_android_arm, "+neon",
+ R"(CHECK: call fast <4 x float> @__xla_cpu_runtime_LogV4F32NEON(<4 x float> %wide.load))"},
+
+ // Tanh is inlined, so we match a line from it instead of a function call.
+
+ IntrinsicTestSpec{
+ HloOpcode::kTanh, kTriple_x86_64, "",
+ R"(CHECK: fcmp fast uge <4 x float> %wide.load, <float -9.000000e+00, float -9.000000e+00, float -9.000000e+00, float -9.000000e+00>)"},
+
+ IntrinsicTestSpec{
+ HloOpcode::kTanh, kTriple_x86_64, "+avx",
+ R"(CHECK: fcmp fast uge <8 x float> %wide.load, <float -9.000000e+00, float -9.000000e+00, float -9.000000e+00, float -9.000000e+00, float -9.000000e+00, float -9.000000e+00, float -9.000000e+00, float -9.000000e+00>)"},
+
+ IntrinsicTestSpec{
+ HloOpcode::kTanh, kTriple_android_arm, "",
+ R"(CHECK: fcmp fast uge <4 x float> %wide.load, <float -9.000000e+00, float -9.000000e+00, float -9.000000e+00, float -9.000000e+00>)"}};
+
+INSTANTIATE_TEST_CASE_P(CpuUnaryIntrinsicTestInstantiation,
+ CpuUnaryIntrinsicTest,
+ ::testing::ValuesIn(CpuUnaryIntrinsicTestCases),
+ CpuUnaryIntrinsicTest::Name);
+
+} // namespace
+} // namespace cpu
+} // namespace xla