diff options
author | 2016-05-26 15:14:00 -0800 | |
---|---|---|
committer | 2016-05-26 16:18:36 -0700 | |
commit | 36357e7e1127873165694a38e3a989df4e0b6ffe (patch) | |
tree | eaf00810dfaefdc7f308a088829a3798af23e7bc | |
parent | e1b4934bb59904ee4dd243a34cc8356ff6bd266d (diff) |
Added support for half floats to the batch normalization op
Change: 123368006
-rw-r--r-- | tensorflow/core/framework/tensor_testutil.h | 19 | ||||
-rw-r--r-- | tensorflow/core/kernels/batch_norm_op.cc | 42 | ||||
-rw-r--r-- | tensorflow/core/kernels/batch_norm_op.h | 4 | ||||
-rw-r--r-- | tensorflow/core/kernels/batch_norm_op_gpu.cu.cc | 2 | ||||
-rw-r--r-- | tensorflow/core/kernels/batch_norm_op_test.cc | 27 | ||||
-rw-r--r-- | tensorflow/core/kernels/ops_testutil.h | 22 |
6 files changed, 96 insertions, 20 deletions
diff --git a/tensorflow/core/framework/tensor_testutil.h b/tensorflow/core/framework/tensor_testutil.h index 8d14c25261..4efa7298d1 100644 --- a/tensorflow/core/framework/tensor_testutil.h +++ b/tensorflow/core/framework/tensor_testutil.h @@ -60,6 +60,19 @@ void FillValues(Tensor* tensor, gtl::ArraySlice<T> vals) { } } +// Fills in '*tensor' with 'vals', converting the types as needed. +template <typename T, typename SrcType> +void FillValues(Tensor* tensor, std::initializer_list<SrcType> vals) { + auto flat = tensor->flat<T>(); + CHECK_EQ(flat.size(), vals.size()); + if (flat.size() > 0) { + size_t i = 0; + for (auto itr = vals.begin(); itr != vals.end(); ++itr, ++i) { + flat(i) = T(*itr); + } + } +} + // Fills in '*tensor' with a sequence of value of val, val+1, val+2, ... // Tensor x(&alloc, DT_FLOAT, TensorShape({2, 2})); // test::FillIota<float>(&x, 1.0); @@ -100,7 +113,8 @@ namespace internal { template <typename T> struct is_floating_point_type { - static const bool value = std::is_same<T, float>::value || + static const bool value = std::is_same<T, Eigen::half>::value || + std::is_same<T, float>::value || std::is_same<T, double>::value || std::is_same<T, std::complex<float> >::value || std::is_same<T, std::complex<double> >::value; @@ -175,7 +189,8 @@ struct Expector<T, true> { static void Near(const T& a, const T& b, const double abs_err) { if (a != b) { // Takes care of inf. - EXPECT_LE(std::abs(a - b), abs_err) << "a = " << a << " b = " << b; + EXPECT_LE(double(Eigen::numext::abs(a - b)), abs_err) << "a = " << a + << " b = " << b; } } diff --git a/tensorflow/core/kernels/batch_norm_op.cc b/tensorflow/core/kernels/batch_norm_op.cc index 2389af050a..01936e9fd5 100644 --- a/tensorflow/core/kernels/batch_norm_op.cc +++ b/tensorflow/core/kernels/batch_norm_op.cc @@ -33,8 +33,10 @@ template <typename Device, typename T> class BatchNormOp : public OpKernel { public: explicit BatchNormOp(OpKernelConstruction* context) : OpKernel(context) { + float variance_epsilon; OP_REQUIRES_OK(context, - context->GetAttr("variance_epsilon", &variance_epsilon_)); + context->GetAttr("variance_epsilon", &variance_epsilon)); + variance_epsilon_ = T(variance_epsilon); OP_REQUIRES_OK(context, context->GetAttr("scale_after_normalization", &scale_after_normalization_)); } @@ -73,7 +75,7 @@ class BatchNormOp : public OpKernel { } private: - float variance_epsilon_; + T variance_epsilon_; bool scale_after_normalization_; }; @@ -81,8 +83,10 @@ template <typename Device, typename T> class BatchNormGradOp : public OpKernel { public: explicit BatchNormGradOp(OpKernelConstruction* context) : OpKernel(context) { + float variance_epsilon; OP_REQUIRES_OK(context, - context->GetAttr("variance_epsilon", &variance_epsilon_)); + context->GetAttr("variance_epsilon", &variance_epsilon)); + variance_epsilon_ = T(variance_epsilon); OP_REQUIRES_OK(context, context->GetAttr("scale_after_normalization", &scale_after_normalization_)); } @@ -145,7 +149,7 @@ class BatchNormGradOp : public OpKernel { } private: - float variance_epsilon_; + T variance_epsilon_; bool scale_after_normalization_; }; @@ -155,6 +159,7 @@ class BatchNormGradOp : public OpKernel { .TypeConstraint<T>("T"), \ BatchNormOp<CPUDevice, T>); +REGISTER_KERNEL(Eigen::half); REGISTER_KERNEL(float); REGISTER_KERNEL(double); #undef REGISTER_KERNEL @@ -168,12 +173,13 @@ namespace functor { const GPUDevice& d, typename TTypes<T, 4>::ConstTensor input, \ typename TTypes<T>::ConstVec mean, typename TTypes<T>::ConstVec var, \ typename TTypes<T>::ConstVec beta, typename TTypes<T>::ConstVec gamma, \ - float variance_epsilon, bool scale_after_normalization, \ + T variance_epsilon, bool scale_after_normalization, \ typename TTypes<T, 4>::Tensor output); \ extern template struct BatchNorm<GPUDevice, T>; #define DECLARE_GPU_SPECS(T) DECLARE_GPU_SPEC(T); +DECLARE_GPU_SPECS(Eigen::half); DECLARE_GPU_SPECS(float); #undef DECLARE_GPU_SPEC } // namespace functor @@ -185,6 +191,7 @@ DECLARE_GPU_SPECS(float); .TypeConstraint<T>("T"), \ BatchNormOp<GPUDevice, T>); +REGISTER_GPU_KERNEL(Eigen::half); REGISTER_GPU_KERNEL(float); #undef REGISTER_GPU_KERNEL @@ -196,6 +203,7 @@ REGISTER_GPU_KERNEL(float); .TypeConstraint<T>("T"), \ BatchNormGradOp<CPUDevice, T>); +REGISTER_KERNEL(Eigen::half); REGISTER_KERNEL(float); REGISTER_KERNEL(double); #undef REGISTER_KERNEL @@ -203,21 +211,22 @@ REGISTER_KERNEL(double); #if GOOGLE_CUDA // Forward declarations of the functor specializations for GPU. namespace functor { -#define DECLARE_GPU_SPEC(T) \ - template <> \ - void BatchNormGrad<GPUDevice, T>::operator()( \ - const GPUDevice& d, typename TTypes<T, 4>::ConstTensor input, \ - typename TTypes<T>::ConstVec mean, typename TTypes<T>::ConstVec var, \ - typename TTypes<T>::ConstVec gamma, \ - typename TTypes<T, 4>::ConstTensor out_backprop, float variance_epsilon, \ - bool scale_after_normalization, typename TTypes<T, 4>::Tensor dx, \ - typename TTypes<T>::Vec dm, typename TTypes<T>::Vec dv, \ - typename TTypes<T>::Vec db, typename TTypes<T>::Vec dg, \ - typename TTypes<T>::Vec scratch1, typename TTypes<T>::Vec scratch2); \ +#define DECLARE_GPU_SPEC(T) \ + template <> \ + void BatchNormGrad<GPUDevice, T>::operator()( \ + const GPUDevice& d, typename TTypes<T, 4>::ConstTensor input, \ + typename TTypes<T>::ConstVec mean, typename TTypes<T>::ConstVec var, \ + typename TTypes<T>::ConstVec gamma, \ + typename TTypes<T, 4>::ConstTensor out_backprop, T variance_epsilon, \ + bool scale_after_normalization, typename TTypes<T, 4>::Tensor dx, \ + typename TTypes<T>::Vec dm, typename TTypes<T>::Vec dv, \ + typename TTypes<T>::Vec db, typename TTypes<T>::Vec dg, \ + typename TTypes<T>::Vec scratch1, typename TTypes<T>::Vec scratch2); \ extern template struct BatchNormGrad<GPUDevice, T>; #define DECLARE_GPU_SPECS(T) DECLARE_GPU_SPEC(T); +DECLARE_GPU_SPECS(Eigen::half); DECLARE_GPU_SPECS(float); #undef DECLARE_GPU_SPEC } // namespace functor @@ -229,6 +238,7 @@ DECLARE_GPU_SPECS(float); .TypeConstraint<T>("T"), \ BatchNormGradOp<GPUDevice, T>); +REGISTER_GPU_KERNEL(Eigen::half); REGISTER_GPU_KERNEL(float); #undef REGISTER_GPU_KERNEL diff --git a/tensorflow/core/kernels/batch_norm_op.h b/tensorflow/core/kernels/batch_norm_op.h index baef68125e..94707e9be9 100644 --- a/tensorflow/core/kernels/batch_norm_op.h +++ b/tensorflow/core/kernels/batch_norm_op.h @@ -29,7 +29,7 @@ struct BatchNorm { typename TTypes<T>::ConstVec mean, typename TTypes<T>::ConstVec var, typename TTypes<T>::ConstVec beta, - typename TTypes<T>::ConstVec gamma, float variance_epsilon, + typename TTypes<T>::ConstVec gamma, T variance_epsilon, bool scale_after_normalization, typename TTypes<T, 4>::Tensor output) { const int depth = mean.dimension(0); @@ -77,7 +77,7 @@ struct BatchNormGrad { typename TTypes<T>::ConstVec var, typename TTypes<T>::ConstVec gamma, typename TTypes<T, 4>::ConstTensor out_backprop, - float variance_epsilon, bool scale_after_normalization, + T variance_epsilon, bool scale_after_normalization, typename TTypes<T, 4>::Tensor dx, typename TTypes<T>::Vec dm, typename TTypes<T>::Vec dv, typename TTypes<T>::Vec db, typename TTypes<T>::Vec dg, typename TTypes<T>::Vec scratch1, diff --git a/tensorflow/core/kernels/batch_norm_op_gpu.cu.cc b/tensorflow/core/kernels/batch_norm_op_gpu.cu.cc index 26433c9d12..2379cb612b 100644 --- a/tensorflow/core/kernels/batch_norm_op_gpu.cu.cc +++ b/tensorflow/core/kernels/batch_norm_op_gpu.cu.cc @@ -25,7 +25,9 @@ namespace tensorflow { typedef Eigen::GpuDevice GPUDevice; template struct functor::BatchNorm<GPUDevice, float>; +template struct functor::BatchNorm<GPUDevice, Eigen::half>; template struct functor::BatchNormGrad<GPUDevice, float>; +template struct functor::BatchNormGrad<GPUDevice, Eigen::half>; } // namespace tensorflow diff --git a/tensorflow/core/kernels/batch_norm_op_test.cc b/tensorflow/core/kernels/batch_norm_op_test.cc index e70bcc5b4c..9b7bf6d149 100644 --- a/tensorflow/core/kernels/batch_norm_op_test.cc +++ b/tensorflow/core/kernels/batch_norm_op_test.cc @@ -59,4 +59,31 @@ TEST_F(BatchNormOpTest, Simple) { test::ExpectTensorNear<float>(expected, *GetOutput(0), 0.01); } +TEST_F(BatchNormOpTest, Fp16) { + TF_EXPECT_OK( + NodeDefBuilder("batch_norm_op", "BatchNormWithGlobalNormalization") + .Input(FakeInput(DT_HALF)) + .Input(FakeInput(DT_HALF)) + .Input(FakeInput(DT_HALF)) + .Input(FakeInput(DT_HALF)) + .Input(FakeInput(DT_HALF)) + .Attr("scale_after_normalization", false) + .Attr("variance_epsilon", 0.001) + .Finalize(node_def())); + TF_EXPECT_OK(InitOpWithGraphVersion(8)); + AddInputFromList<Eigen::half>(TensorShape({1, 1, 6, 2}), + {1, 4, 2, 5, 3, 6, -1, -4, -2, -5, -3, -6}); + AddInputFromList<Eigen::half>(TensorShape({2}), {10, 20}); + AddInputFromList<Eigen::half>(TensorShape({2}), {0.25, 0.5}); + AddInputFromList<Eigen::half>(TensorShape({2}), {0.1, 0.6}); + AddInputFromList<Eigen::half>(TensorShape({2}), {0.0, 0.0}); + TF_ASSERT_OK(RunOpKernel()); + + Tensor expected(allocator(), DT_HALF, TensorShape({1, 1, 6, 2})); + test::FillValues<Eigen::half>( + &expected, {-17.86, -22.00, -15.87, -20.59, -13.87, -19.18, -21.86, + -33.31, -23.85, -34.72, -25.85, -36.13}); + test::ExpectTensorNear<Eigen::half>(expected, *GetOutput(0), 0.1); +} + } // namespace tensorflow diff --git a/tensorflow/core/kernels/ops_testutil.h b/tensorflow/core/kernels/ops_testutil.h index 521168cb17..b2b926e78d 100644 --- a/tensorflow/core/kernels/ops_testutil.h +++ b/tensorflow/core/kernels/ops_testutil.h @@ -150,6 +150,28 @@ class OpsTestBase : public ::testing::Test { } } + // Convenience function to add an input and populate it with the elements from + // an initializer list converting the types as needed. + template <typename T, typename SrcType> + void AddInputFromList(const TensorShape& shape, + std::initializer_list<SrcType> data) { + CHECK_GT(input_types_.size(), inputs_.size()) + << "Adding more inputs than types; perhaps you need to call MakeOp"; + bool is_ref = IsRefType(input_types_[inputs_.size()]); + Tensor* input = new Tensor(device_->GetAllocator(AllocatorAttributes()), + DataTypeToEnum<T>::v(), shape); + test::FillValues<T>(input, data); + tensors_.push_back(input); + if (is_ref) { + CHECK_EQ(RemoveRefType(input_types_[inputs_.size()]), + DataTypeToEnum<T>::v()); + inputs_.push_back({&lock_for_refs_, input}); + } else { + CHECK_EQ(input_types_[inputs_.size()], DataTypeToEnum<T>::v()); + inputs_.push_back({nullptr, input}); + } + } + // Runs an operation producing 'num_outputs' outputs. // // Returns the context's status after running the operation. |