-rw-r--r--  tensorflow/core/kernels/conv_grad_filter_ops.cc      |   7
-rw-r--r--  tensorflow/core/kernels/conv_grad_input_ops.cc       |   7
-rw-r--r--  tensorflow/core/kernels/conv_ops.cc                  |   7
-rw-r--r--  tensorflow/core/kernels/conv_ops_gpu_2.cu.cc         |   3
-rw-r--r--  tensorflow/core/kernels/conv_ops_gpu_3.cu.cc         |   3
-rw-r--r--  tensorflow/core/kernels/depthwise_conv_op.cc         |   4
-rw-r--r--  tensorflow/core/kernels/eigen_spatial_convolutions.h | 182
-rw-r--r--  tensorflow/core/ops/nn_ops.cc                        |   6
-rw-r--r--  tensorflow/python/kernel_tests/conv_ops_test.py      |   4
9 files changed, 215 insertions, 8 deletions
diff --git a/tensorflow/core/kernels/conv_grad_filter_ops.cc b/tensorflow/core/kernels/conv_grad_filter_ops.cc
index e6ae595291..66ee474ca3 100644
--- a/tensorflow/core/kernels/conv_grad_filter_ops.cc
+++ b/tensorflow/core/kernels/conv_grad_filter_ops.cc
@@ -520,6 +520,7 @@ class Conv2DCustomBackpropFilterOp : public OpKernel {
 TF_CALL_half(REGISTER_CPU_KERNELS);
 TF_CALL_float(REGISTER_CPU_KERNELS);
+TF_CALL_double(REGISTER_CPU_KERNELS);
 #undef REGISTER_CPU_KERNELS
 
 // GPU definitions.
@@ -1017,6 +1018,7 @@ namespace functor {
       typename TTypes<T, 4, int>::Tensor out, TensorFormat data_format); \
   extern template struct PadInput<GPUDevice, T, int, 4>;
+DECLARE_GPU_SPEC(double);
 DECLARE_GPU_SPEC(float);
 DECLARE_GPU_SPEC(Eigen::half);
 #undef DECLARE_GPU_SPEC
@@ -1024,6 +1026,11 @@ DECLARE_GPU_SPEC(Eigen::half);
 
 REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropFilter")
                             .Device(DEVICE_GPU)
+                            .TypeConstraint<double>("T")
+                            .HostMemory("filter_sizes"),
+                        Conv2DSlowBackpropFilterOp<GPUDevice, double>);
+REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropFilter")
+                            .Device(DEVICE_GPU)
                             .TypeConstraint<float>("T")
                             .HostMemory("filter_sizes"),
                         Conv2DSlowBackpropFilterOp<GPUDevice, float>);
diff --git a/tensorflow/core/kernels/conv_grad_input_ops.cc b/tensorflow/core/kernels/conv_grad_input_ops.cc
index 15c55e4d99..71ea0d5d72 100644
--- a/tensorflow/core/kernels/conv_grad_input_ops.cc
+++ b/tensorflow/core/kernels/conv_grad_input_ops.cc
@@ -592,6 +592,7 @@ class Conv2DCustomBackpropInputOp : public OpKernel {
 TF_CALL_half(REGISTER_CPU_KERNELS);
 TF_CALL_float(REGISTER_CPU_KERNELS);
+TF_CALL_double(REGISTER_CPU_KERNELS);
 #undef REGISTER_CPU_KERNELS
 
 // GPU definitions.
@@ -1090,6 +1091,7 @@ namespace functor {
       typename TTypes<T, 4, int>::Tensor out, TensorFormat data_format); \
   extern template struct PadInput<GPUDevice, T, int, 4>;
+DECLARE_GPU_SPEC(double);
 DECLARE_GPU_SPEC(float);
 DECLARE_GPU_SPEC(Eigen::half);
 #undef DECLARE_GPU_SPEC
@@ -1097,6 +1099,11 @@ DECLARE_GPU_SPEC(Eigen::half);
 
 REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropInput")
                             .Device(DEVICE_GPU)
+                            .TypeConstraint<double>("T")
+                            .HostMemory("input_sizes"),
+                        Conv2DSlowBackpropInputOp<GPUDevice, double>);
+REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropInput")
+                            .Device(DEVICE_GPU)
                             .TypeConstraint<float>("T")
                             .HostMemory("input_sizes"),
                         Conv2DSlowBackpropInputOp<GPUDevice, float>);
diff --git a/tensorflow/core/kernels/conv_ops.cc b/tensorflow/core/kernels/conv_ops.cc
index 47f6907c04..88843e4da7 100644
--- a/tensorflow/core/kernels/conv_ops.cc
+++ b/tensorflow/core/kernels/conv_ops.cc
@@ -446,10 +446,11 @@ class Conv2DOp : public BinaryOp<T> {
 #if !defined(USE_GEMM_FOR_CONV)
 TF_CALL_half(REGISTER_CPU);
 TF_CALL_float(REGISTER_CPU);
+TF_CALL_double(REGISTER_CPU);
 #endif  // USE_GEMM_FOR_CONV
 
 // To be used inside depthwise_conv_op.cc.
-template class LaunchConv2DOp<CPUDevice, float>;
+template struct LaunchConv2DOp<CPUDevice, float>;
 
 #if GOOGLE_CUDA
 
 int64 GetCudnnWorkspaceLimit(const string& envvar_in_mb,
@@ -810,6 +811,7 @@ namespace functor {
       typename TTypes<T, 4, int>::Tensor out, TensorFormat data_format); \
   extern template struct PadInput<GPUDevice, T, int, 4>
+DECLARE_GPU_SPEC(double);
 DECLARE_GPU_SPEC(float);
 DECLARE_GPU_SPEC(Eigen::half);
 #undef DECLARE_GPU_SPEC
@@ -822,6 +824,9 @@ REGISTER_KERNEL_BUILDER(
 REGISTER_KERNEL_BUILDER(
     Name("Conv2D").Device(DEVICE_GPU).TypeConstraint<float>("T"),
     Conv2DOp<GPUDevice, float>);
+REGISTER_KERNEL_BUILDER(
+    Name("Conv2D").Device(DEVICE_GPU).TypeConstraint<double>("T"),
+    Conv2DOp<GPUDevice, double>);
 
 // To be used inside depthwise_conv_op.cc.
 template class LaunchConv2DOp<GPUDevice, float>;
diff --git a/tensorflow/core/kernels/conv_ops_gpu_2.cu.cc b/tensorflow/core/kernels/conv_ops_gpu_2.cu.cc
index b5dd26a9e4..52859af950 100644
--- a/tensorflow/core/kernels/conv_ops_gpu_2.cu.cc
+++ b/tensorflow/core/kernels/conv_ops_gpu_2.cu.cc
@@ -25,6 +25,9 @@ limitations under the License.
 namespace tensorflow {
 
 typedef Eigen::GpuDevice GPUDevice;
+template struct functor::InflatePadAndShuffle<GPUDevice, double, 4, int>;
+template struct functor::InflatePadAndShuffle<GPUDevice, double, 4,
+                                              Eigen::DenseIndex>;
 template struct functor::InflatePadAndShuffle<GPUDevice, float, 4, int>;
 template struct functor::InflatePadAndShuffle<GPUDevice, float, 4,
                                               Eigen::DenseIndex>;
diff --git a/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc b/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc
index a376534bad..2503b475dc 100644
--- a/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc
+++ b/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc
@@ -1039,9 +1039,11 @@ template struct functor::SwapDimension0And2InTensor3<GPUDevice, double2,
                                                      /*conjugate=*/true>;
 
 // For 2d ops.
+template struct functor::TransformFilter<GPUDevice, double, int, 4>;
 template struct functor::TransformFilter<GPUDevice, float, int, 4>;
 template struct functor::TransformFilter<GPUDevice, Eigen::half, int, 4>;
 
+template struct functor::ReverseTransformFilter<GPUDevice, double, 4>;
 template struct functor::ReverseTransformFilter<GPUDevice, float, 4>;
 template struct functor::ReverseTransformFilter<GPUDevice, Eigen::half, 4>;
@@ -1054,6 +1056,7 @@ template struct functor::NCHWToNHWC<GPUDevice, float, 4>;
 template struct functor::NCHWToNHWC<GPUDevice, Eigen::half, 4>;
 
 template struct functor::PadInput<GPUDevice, int, int, 4>;
+template struct functor::PadInput<GPUDevice, double, int, 4>;
 template struct functor::PadInput<GPUDevice, float, int, 4>;
 template struct functor::PadInput<GPUDevice, Eigen::half, int, 4>;
diff --git a/tensorflow/core/kernels/depthwise_conv_op.cc b/tensorflow/core/kernels/depthwise_conv_op.cc
index c060b2e14d..6dedb1a61e 100644
--- a/tensorflow/core/kernels/depthwise_conv_op.cc
+++ b/tensorflow/core/kernels/depthwise_conv_op.cc
@@ -241,7 +241,7 @@ struct LaunchDepthwiseConvOp<CPUDevice, T> {
 };
 
 // Extern template instantiated in conv_ops.cc.
-extern template class LaunchConv2DOp<CPUDevice, float>;
+extern template struct LaunchConv2DOp<CPUDevice, float>;
 
 #if GOOGLE_CUDA
 
@@ -251,7 +251,7 @@ extern template struct LaunchDepthwiseConvOp<GPUDevice, float>;
 extern template struct LaunchDepthwiseConvOp<GPUDevice, double>;
 
 // Extern template instantiated in conv_ops.cc.
-extern template class LaunchConv2DOp<GPUDevice, float>;
+extern template struct LaunchConv2DOp<GPUDevice, float>;
 
 #endif
diff --git a/tensorflow/core/kernels/eigen_spatial_convolutions.h b/tensorflow/core/kernels/eigen_spatial_convolutions.h
index 1acbe3a658..a4dff4b91c 100644
--- a/tensorflow/core/kernels/eigen_spatial_convolutions.h
+++ b/tensorflow/core/kernels/eigen_spatial_convolutions.h
@@ -797,6 +797,188 @@ struct gemm_pack_rhs<
   }
 };
 
+// Template specialization for packet_size = 2. We must special-case packet
+// blocks with nr > packet_size, e.g. PacketBlock<Packet2d, 4>.
+template <typename NewDimension, DenseIndex Rows, DenseIndex Cols,
+          typename ArgType, typename Device, typename Scalar, typename Index,
+          typename nocontract_t, typename contract_t, bool inner_dim_contiguous,
+          bool inner_dim_reordered, int Alignment, int nr>
+struct gemm_pack_rhs<
+    Scalar, Index,
+    TensorContractionSubMapper<
+        Scalar, Index, Rhs,
+        TensorEvaluator<
+            const TensorReshapingOp<
+                NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >,
+            Device>,
+        nocontract_t, contract_t, 2, inner_dim_contiguous, inner_dim_reordered,
+        Alignment>,
+    nr, ColMajor, false, false> {
+  typedef TensorContractionSubMapper<
+      Scalar, Index, Rhs,
+      TensorEvaluator<
+          const TensorReshapingOp<
+              NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >,
+          Device>,
+      nocontract_t, contract_t, 2, inner_dim_contiguous, inner_dim_reordered,
+      Alignment>
+      SubMapper;
+  typedef SubMapper DataMapper;
+
+  EIGEN_DEVICE_FUNC
+  static inline Index ceil_div(Index a, Index b) { return (a + b - 1) / b; }
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_DONT_INLINE void operator()(Scalar* block, const DataMapper& rhs,
+                                    Index depth, Index cols, Index stride = 0,
+                                    Index offset = 0) const {
+    eigen_assert(stride == 0);
+    eigen_assert(offset == 0);
+
+    EIGEN_STATIC_ASSERT((nr == 4), YOU_MADE_A_PROGRAMMING_MISTAKE);
+    typedef typename packet_traits<Scalar>::type Packet;
+
+    const int packet_size = 2;
+    const Index packet_cols4 = (cols / 4) * 4;
+    const Index peeled_k = (depth / packet_size) * packet_size;
+    const bool non_standard_patches = rhs.nonStandardPatches();
+
+    for (Index j2 = 0; j2 < packet_cols4; j2 += 4) {
+      const SubMapper dm0 = rhs.getLinearMapper(0, j2 + 0);
+      const SubMapper dm1 = rhs.getLinearMapper(0, j2 + 1);
+      const SubMapper dm2 = rhs.getLinearMapper(0, j2 + 2);
+      const SubMapper dm3 = rhs.getLinearMapper(0, j2 + 3);
+
+      Index k = 0;
+      if (!non_standard_patches) {
+        const Index patch_depth = rhs.patchDepth();
+        if ((patch_depth % packet_size) == 0) {
+          const Index patch_cols = rhs.patchCols();
+          const Index patch_rows = rhs.patchRows();
+
+          const Index startCol = rhs.colOffset();
+          const Index max_cols = std::min<Index>(
+              ceil_div(peeled_k, patch_rows * patch_depth) + startCol,
+              patch_cols);
+
+          for (Index c = startCol; c < max_cols; ++c) {
+            eigen_assert(k < peeled_k);
+            const Index startRow = (c == startCol) ? rhs.rowOffset() : 0;
+            const Index max_rows = std::min<Index>(
+                ceil_div(peeled_k - c * patch_rows * patch_depth, patch_depth) +
+                    startRow,
+                patch_rows);
+
+            const bool pad_col0 = dm0.padCol(c);
+            const bool pad_col1 = dm1.padCol(c);
+            const bool pad_col2 = dm2.padCol(c);
+            const bool pad_col3 = dm3.padCol(c);
+            for (Index r = startRow; r < max_rows; ++r) {
+              eigen_assert(k < peeled_k);
+              const bool pad0 = pad_col0 || dm0.padRow(r);
+              const bool pad1 = pad_col1 || dm1.padRow(r);
+              const bool pad2 = pad_col2 || dm2.padRow(r);
+              const bool pad3 = pad_col3 || dm3.padRow(r);
+
+              const Index idx0 = dm0.baseIndex(r, c);
+              const Index idx1 = dm1.baseIndex(r, c);
+              const Index idx2 = dm2.baseIndex(r, c);
+              const Index idx3 = dm3.baseIndex(r, c);
+
+              const Index startDepth =
+                  ((c == startCol) && (r == startRow)) ? rhs.depthOffset() : 0;
+              const Index max_depth =
+                  std::min<Index>(peeled_k - c * patch_rows * patch_depth -
+                                      r * patch_depth + startDepth,
+                                  patch_depth);
+              eigen_assert((max_depth - startDepth) % packet_size == 0);
+              for (Index d = startDepth; d < max_depth; d += packet_size) {
+                eigen_assert(k < peeled_k);
+                PacketBlock<Packet, 2> kernel0;
+                PacketBlock<Packet, 2> kernel1;
+                kernel0.packet[0] = pad0 ? pset1<Packet>(Scalar(0))
+                                         : rhs.packetNoPadding(d, idx0);
+                kernel0.packet[1] = pad1 ? pset1<Packet>(Scalar(0))
+                                         : rhs.packetNoPadding(d, idx1);
+                kernel1.packet[0] = pad2 ? pset1<Packet>(Scalar(0))
+                                         : rhs.packetNoPadding(d, idx2);
+                kernel1.packet[1] = pad3 ? pset1<Packet>(Scalar(0))
+                                         : rhs.packetNoPadding(d, idx3);
+                ptranspose(kernel0);
+                ptranspose(kernel1);
+                pstoreu(block + 0 * packet_size, kernel0.packet[0]);
+                pstoreu(block + 1 * packet_size, kernel1.packet[0]);
+                pstoreu(block + 2 * packet_size, kernel0.packet[1]);
+                pstoreu(block + 3 * packet_size, kernel1.packet[1]);
+                block += 4 * packet_size;
+                k += packet_size;
+              }
+            }
+          }
+
+          for (; k < peeled_k; k += packet_size) {
+            PacketBlock<Packet, 2> kernel0;
+            PacketBlock<Packet, 2> kernel1;
+            kernel0.packet[0] = dm0.loadPacketFast(k);
+            kernel0.packet[1] = dm1.loadPacketFast(k);
+            kernel1.packet[0] = dm2.loadPacketFast(k);
+            kernel1.packet[1] = dm3.loadPacketFast(k);
+            ptranspose(kernel0);
+            ptranspose(kernel1);
+            pstoreu(block + 0 * packet_size, kernel0.packet[0]);
+            pstoreu(block + 1 * packet_size, kernel1.packet[0]);
+            pstoreu(block + 2 * packet_size, kernel0.packet[1]);
+            pstoreu(block + 3 * packet_size, kernel1.packet[1]);
+            block += 4 * packet_size;
+          }
+        } else {
+          for (; k < peeled_k; k += packet_size) {
+            PacketBlock<Packet, 2> kernel0;
+            PacketBlock<Packet, 2> kernel1;
+            kernel0.packet[0] = dm0.loadPacketStandard(k);
+            kernel0.packet[1] = dm1.loadPacketStandard(k);
+            kernel1.packet[0] = dm2.loadPacketStandard(k);
+            kernel1.packet[1] = dm3.loadPacketStandard(k);
+            ptranspose(kernel0);
+            ptranspose(kernel1);
+            pstoreu(block + 0 * packet_size, kernel0.packet[0]);
+            pstoreu(block + 1 * packet_size, kernel1.packet[0]);
+            pstoreu(block + 2 * packet_size, kernel0.packet[1]);
+            pstoreu(block + 3 * packet_size, kernel1.packet[1]);
+            block += 4 * packet_size;
+          }
+        }
+      }
+      if (!rhs.nonStandardPatches()) {
+        for (; k < depth; k++) {
+          block[0] = dm0.loadCoeffStandard(k);
+          block[1] = dm1.loadCoeffStandard(k);
+          block[2] = dm2.loadCoeffStandard(k);
+          block[3] = dm3.loadCoeffStandard(k);
+          block += 4;
+        }
+      } else {
+        for (; k < depth; k++) {
+          block[0] = dm0(k);
+          block[1] = dm1(k);
+          block[2] = dm2(k);
+          block[3] = dm3(k);
+          block += 4;
+        }
+      }
+    }
+
+    // copy the remaining columns one at a time (nr==1)
+    for (Index j2 = packet_cols4; j2 < cols; ++j2) {
+      const SubMapper dm0 = rhs.getLinearMapper(0, j2);
+      for (Index k = 0; k < depth; k++) {
+        *block = dm0(k);
+        block += 1;
+      }
+    }
+  }
+};
+
 // Special case for non-vectorized types such as float16.
 template <typename NewDimension, DenseIndex Rows, DenseIndex Cols,
           typename ArgType, typename Device, typename Scalar, typename Index,
diff --git a/tensorflow/core/ops/nn_ops.cc b/tensorflow/core/ops/nn_ops.cc
index d6a0f38033..1f4e9753c3 100644
--- a/tensorflow/core/ops/nn_ops.cc
+++ b/tensorflow/core/ops/nn_ops.cc
@@ -266,7 +266,7 @@ REGISTER_OP("Conv2D")
     .Input("input: T")
     .Input("filter: T")
     .Output("output: T")
-    .Attr("T: {half, bfloat16, float}")
+    .Attr("T: {half, bfloat16, float, double}")
     .Attr("strides: list(int)")
     .Attr("use_cudnn_on_gpu: bool = true")
     .Attr(GetPaddingAttrString())
@@ -279,7 +279,7 @@ REGISTER_OP("Conv2DBackpropInput")
     .Input("filter: T")
     .Input("out_backprop: T")
     .Output("output: T")
-    .Attr("T: {half, bfloat16, float}")
+    .Attr("T: {half, bfloat16, float, double}")
     .Attr("strides: list(int)")
     .Attr("use_cudnn_on_gpu: bool = true")
     .Attr(GetPaddingAttrString())
@@ -301,7 +301,7 @@ REGISTER_OP("Conv2DBackpropFilter")
     .Input("filter_sizes: int32")
     .Input("out_backprop: T")
     .Output("output: T")
-    .Attr("T: {half, bfloat16, float}")
+    .Attr("T: {half, bfloat16, float, double}")
     .Attr("strides: list(int)")
     .Attr("use_cudnn_on_gpu: bool = true")
     .Attr(GetPaddingAttrString())
diff --git a/tensorflow/python/kernel_tests/conv_ops_test.py b/tensorflow/python/kernel_tests/conv_ops_test.py
index 25525cc128..a291bef0ad 100644
--- a/tensorflow/python/kernel_tests/conv_ops_test.py
+++ b/tensorflow/python/kernel_tests/conv_ops_test.py
@@ -159,11 +159,11 @@ class Conv2DTest(test.TestCase):
 
   def _DtypesToTest(self, use_gpu):
     if use_gpu and not test_util.CudaSupportsHalfMatMulAndConv():
-      return [dtypes.float32]
+      return [dtypes.float32, dtypes.float64]
     else:
       # It is important that float32 comes before float16 here,
       # as we will be using its gradients as reference for fp16 gradients.
-      return [dtypes.float32, dtypes.float16]
+      return [dtypes.float32, dtypes.float16, dtypes.float64]
 
   def _SetupValuesForDevice(self, tensor_in_sizes, filter_in_sizes, dilations,
                             strides, padding, data_format, dtype, use_gpu):
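The comment on the new gemm_pack_rhs specialization is terse, so here is a scalar model of the packing order it produces. This is a hypothetical, Eigen-free sketch, not code from the commit: for a 2-wide packet (Packet2d for double) and nr == 4 columns, each pair of ptranspose calls followed by the four pstoreu stores interleaves the four columns within every 2-deep packet, so all coefficients for one depth index land adjacent in the packed block.

# Scalar model (hypothetical; names mirror the C++ above) of the packed-rhs
# layout emitted by the packet_size == 2, nr == 4 loop.
depth, cols, packet_size = 4, 4, 2
rhs = [[10 * c + d for d in range(depth)] for c in range(cols)]  # 4 columns

block = []
for d in range(0, depth, packet_size):
    # ptranspose(kernel0), ptranspose(kernel1) plus the four pstoreu calls
    # reduce to this column interleave within each 2-deep packet:
    for i in range(packet_size):
        for c in range(cols):
            block.append(rhs[c][d + i])

print(block)  # [0, 10, 20, 30, 1, 11, 21, 31, 2, 12, 22, 32, 3, 13, 23, 33]

This column-interleaved order is the layout Eigen's gebp kernel consumes for an nr == 4 packed rhs, which is why the specialization can feed the generic contraction kernel unchanged.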
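End to end, the change makes float64 a first-class dtype for 2-D convolution. A minimal sketch of the user-visible effect, using the same TF 1.x Python API as conv_ops_test.py above (shapes and values are illustrative, not from the commit):

import numpy as np
import tensorflow as tf

x = tf.constant(np.random.rand(1, 5, 5, 2), dtype=tf.float64)  # NHWC input
f = tf.constant(np.random.rand(3, 3, 2, 4), dtype=tf.float64)  # HWIO filter

# Forward pass dispatches to the double Conv2D kernel registered above.
y = tf.nn.conv2d(x, f, strides=[1, 1, 1, 1], padding="SAME")

# Gradients route through Conv2DBackpropInput / Conv2DBackpropFilter,
# which this change also registers for double on CPU and GPU.
gx, gf = tf.gradients(y, [x, f])

with tf.Session() as sess:
    out, gin, gfil = sess.run([y, gx, gf])
    print(out.dtype, gin.dtype, gfil.dtype)  # float64 float64 float64

Before this change, the op definition's attr "T: {half, bfloat16, float}" rejected float64 tensors at graph-construction time.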