-rw-r--r--  tensorflow/core/kernels/conv_grad_filter_ops.cc        7
-rw-r--r--  tensorflow/core/kernels/conv_grad_input_ops.cc         7
-rw-r--r--  tensorflow/core/kernels/conv_ops.cc                    7
-rw-r--r--  tensorflow/core/kernels/conv_ops_gpu_2.cu.cc           3
-rw-r--r--  tensorflow/core/kernels/conv_ops_gpu_3.cu.cc           3
-rw-r--r--  tensorflow/core/kernels/depthwise_conv_op.cc           4
-rw-r--r--  tensorflow/core/kernels/eigen_spatial_convolutions.h 182
-rw-r--r--  tensorflow/core/ops/nn_ops.cc                           6
-rw-r--r--  tensorflow/python/kernel_tests/conv_ops_test.py         4
9 files changed, 215 insertions, 8 deletions
diff --git a/tensorflow/core/kernels/conv_grad_filter_ops.cc b/tensorflow/core/kernels/conv_grad_filter_ops.cc
index e6ae595291..66ee474ca3 100644
--- a/tensorflow/core/kernels/conv_grad_filter_ops.cc
+++ b/tensorflow/core/kernels/conv_grad_filter_ops.cc
@@ -520,6 +520,7 @@ class Conv2DCustomBackpropFilterOp : public OpKernel {
TF_CALL_half(REGISTER_CPU_KERNELS);
TF_CALL_float(REGISTER_CPU_KERNELS);
+TF_CALL_double(REGISTER_CPU_KERNELS);
#undef REGISTER_CPU_KERNELS
// GPU definitions.
@@ -1017,6 +1018,7 @@ namespace functor {
typename TTypes<T, 4, int>::Tensor out, TensorFormat data_format); \
extern template struct PadInput<GPUDevice, T, int, 4>;
+DECLARE_GPU_SPEC(double);
DECLARE_GPU_SPEC(float);
DECLARE_GPU_SPEC(Eigen::half);
#undef DECLARE_GPU_SPEC
@@ -1024,6 +1026,11 @@ DECLARE_GPU_SPEC(Eigen::half);
REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropFilter")
.Device(DEVICE_GPU)
+ .TypeConstraint<double>("T")
+ .HostMemory("filter_sizes"),
+ Conv2DSlowBackpropFilterOp<GPUDevice, double>);
+REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropFilter")
+ .Device(DEVICE_GPU)
.TypeConstraint<float>("T")
.HostMemory("filter_sizes"),
Conv2DSlowBackpropFilterOp<GPUDevice, float>);
diff --git a/tensorflow/core/kernels/conv_grad_input_ops.cc b/tensorflow/core/kernels/conv_grad_input_ops.cc
index 15c55e4d99..71ea0d5d72 100644
--- a/tensorflow/core/kernels/conv_grad_input_ops.cc
+++ b/tensorflow/core/kernels/conv_grad_input_ops.cc
@@ -592,6 +592,7 @@ class Conv2DCustomBackpropInputOp : public OpKernel {
TF_CALL_half(REGISTER_CPU_KERNELS);
TF_CALL_float(REGISTER_CPU_KERNELS);
+TF_CALL_double(REGISTER_CPU_KERNELS);
#undef REGISTER_CPU_KERNELS
// GPU definitions.
@@ -1090,6 +1091,7 @@ namespace functor {
typename TTypes<T, 4, int>::Tensor out, TensorFormat data_format); \
extern template struct PadInput<GPUDevice, T, int, 4>;
+DECLARE_GPU_SPEC(double);
DECLARE_GPU_SPEC(float);
DECLARE_GPU_SPEC(Eigen::half);
#undef DECLARE_GPU_SPEC
@@ -1097,6 +1099,11 @@ DECLARE_GPU_SPEC(Eigen::half);
REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropInput")
.Device(DEVICE_GPU)
+ .TypeConstraint<double>("T")
+ .HostMemory("input_sizes"),
+ Conv2DSlowBackpropInputOp<GPUDevice, double>);
+REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropInput")
+ .Device(DEVICE_GPU)
.TypeConstraint<float>("T")
.HostMemory("input_sizes"),
Conv2DSlowBackpropInputOp<GPUDevice, float>);
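Together with the Conv2DBackpropFilter registrations above, these gradient kernels become reachable from Python once the op attrs allow double (see the nn_ops.cc change below). A minimal sketch of how the new float64 path could be exercised, assuming the TF 1.x graph API; tensor shapes are arbitrary and chosen only for illustration:

    import numpy as np
    import tensorflow as tf

    # float64 operands now dispatch to the double Conv2D kernels registered above.
    x = tf.constant(np.random.rand(1, 5, 5, 2), dtype=tf.float64)
    w = tf.constant(np.random.rand(3, 3, 2, 4), dtype=tf.float64)
    y = tf.nn.conv2d(x, w, strides=[1, 1, 1, 1], padding="SAME")

    # Differentiating the output reaches Conv2DBackpropInput and
    # Conv2DBackpropFilter in double precision.
    dx, dw = tf.gradients(y, [x, w])

    with tf.Session() as sess:
        print(sess.run([dx, dw]))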
diff --git a/tensorflow/core/kernels/conv_ops.cc b/tensorflow/core/kernels/conv_ops.cc
index 47f6907c04..88843e4da7 100644
--- a/tensorflow/core/kernels/conv_ops.cc
+++ b/tensorflow/core/kernels/conv_ops.cc
@@ -446,10 +446,11 @@ class Conv2DOp : public BinaryOp<T> {
#if !defined(USE_GEMM_FOR_CONV)
TF_CALL_half(REGISTER_CPU);
TF_CALL_float(REGISTER_CPU);
+TF_CALL_double(REGISTER_CPU);
#endif // USE_GEMM_FOR_CONV
// To be used inside depthwise_conv_op.cc.
-template class LaunchConv2DOp<CPUDevice, float>;
+template struct LaunchConv2DOp<CPUDevice, float>;
#if GOOGLE_CUDA
int64 GetCudnnWorkspaceLimit(const string& envvar_in_mb,
@@ -810,6 +811,7 @@ namespace functor {
typename TTypes<T, 4, int>::Tensor out, TensorFormat data_format); \
extern template struct PadInput<GPUDevice, T, int, 4>
+DECLARE_GPU_SPEC(double);
DECLARE_GPU_SPEC(float);
DECLARE_GPU_SPEC(Eigen::half);
#undef DECLARE_GPU_SPEC
@@ -822,6 +824,9 @@ REGISTER_KERNEL_BUILDER(
REGISTER_KERNEL_BUILDER(
Name("Conv2D").Device(DEVICE_GPU).TypeConstraint<float>("T"),
Conv2DOp<GPUDevice, float>);
+REGISTER_KERNEL_BUILDER(
+ Name("Conv2D").Device(DEVICE_GPU).TypeConstraint<double>("T"),
+ Conv2DOp<GPUDevice, double>);
// To be used inside depthwise_conv_op.cc.
template class LaunchConv2DOp<GPUDevice, float>;
diff --git a/tensorflow/core/kernels/conv_ops_gpu_2.cu.cc b/tensorflow/core/kernels/conv_ops_gpu_2.cu.cc
index b5dd26a9e4..52859af950 100644
--- a/tensorflow/core/kernels/conv_ops_gpu_2.cu.cc
+++ b/tensorflow/core/kernels/conv_ops_gpu_2.cu.cc
@@ -25,6 +25,9 @@ limitations under the License.
namespace tensorflow {
typedef Eigen::GpuDevice GPUDevice;
+template struct functor::InflatePadAndShuffle<GPUDevice, double, 4, int>;
+template struct functor::InflatePadAndShuffle<GPUDevice, double, 4,
+ Eigen::DenseIndex>;
template struct functor::InflatePadAndShuffle<GPUDevice, float, 4, int>;
template struct functor::InflatePadAndShuffle<GPUDevice, float, 4,
Eigen::DenseIndex>;
diff --git a/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc b/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc
index a376534bad..2503b475dc 100644
--- a/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc
+++ b/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc
@@ -1039,9 +1039,11 @@ template struct functor::SwapDimension0And2InTensor3<GPUDevice, double2,
/*conjugate=*/true>;
// For 2d ops.
+template struct functor::TransformFilter<GPUDevice, double, int, 4>;
template struct functor::TransformFilter<GPUDevice, float, int, 4>;
template struct functor::TransformFilter<GPUDevice, Eigen::half, int, 4>;
+template struct functor::ReverseTransformFilter<GPUDevice, double, 4>;
template struct functor::ReverseTransformFilter<GPUDevice, float, 4>;
template struct functor::ReverseTransformFilter<GPUDevice, Eigen::half, 4>;
@@ -1054,6 +1056,7 @@ template struct functor::NCHWToNHWC<GPUDevice, float, 4>;
template struct functor::NCHWToNHWC<GPUDevice, Eigen::half, 4>;
template struct functor::PadInput<GPUDevice, int, int, 4>;
+template struct functor::PadInput<GPUDevice, double, int, 4>;
template struct functor::PadInput<GPUDevice, float, int, 4>;
template struct functor::PadInput<GPUDevice, Eigen::half, int, 4>;
diff --git a/tensorflow/core/kernels/depthwise_conv_op.cc b/tensorflow/core/kernels/depthwise_conv_op.cc
index c060b2e14d..6dedb1a61e 100644
--- a/tensorflow/core/kernels/depthwise_conv_op.cc
+++ b/tensorflow/core/kernels/depthwise_conv_op.cc
@@ -241,7 +241,7 @@ struct LaunchDepthwiseConvOp<CPUDevice, T> {
};
// Extern template instantiated in conv_ops.cc.
-extern template class LaunchConv2DOp<CPUDevice, float>;
+extern template struct LaunchConv2DOp<CPUDevice, float>;
#if GOOGLE_CUDA
@@ -251,7 +251,7 @@ extern template struct LaunchDepthwiseConvOp<GPUDevice, float>;
extern template struct LaunchDepthwiseConvOp<GPUDevice, double>;
// Extern template instantiated in conv_ops.cc.
-extern template class LaunchConv2DOp<GPUDevice, float>;
+extern template struct LaunchConv2DOp<GPUDevice, float>;
#endif
diff --git a/tensorflow/core/kernels/eigen_spatial_convolutions.h b/tensorflow/core/kernels/eigen_spatial_convolutions.h
index 1acbe3a658..a4dff4b91c 100644
--- a/tensorflow/core/kernels/eigen_spatial_convolutions.h
+++ b/tensorflow/core/kernels/eigen_spatial_convolutions.h
@@ -797,6 +797,188 @@ struct gemm_pack_rhs<
}
};
+// Template specialization for packet_size = 2. We must special-case packet
+// blocks with nr > packet_size, e.g. PacketBlock<Packet2d, 4>.
+template <typename NewDimension, DenseIndex Rows, DenseIndex Cols,
+ typename ArgType, typename Device, typename Scalar, typename Index,
+ typename nocontract_t, typename contract_t, bool inner_dim_contiguous,
+ bool inner_dim_reordered, int Alignment, int nr>
+struct gemm_pack_rhs<
+ Scalar, Index,
+ TensorContractionSubMapper<
+ Scalar, Index, Rhs,
+ TensorEvaluator<
+ const TensorReshapingOp<
+ NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >,
+ Device>,
+ nocontract_t, contract_t, 2, inner_dim_contiguous, inner_dim_reordered,
+ Alignment>,
+ nr, ColMajor, false, false> {
+ typedef TensorContractionSubMapper<
+ Scalar, Index, Rhs,
+ TensorEvaluator<
+ const TensorReshapingOp<
+ NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >,
+ Device>,
+ nocontract_t, contract_t, 2, inner_dim_contiguous, inner_dim_reordered,
+ Alignment>
+ SubMapper;
+ typedef SubMapper DataMapper;
+
+ EIGEN_DEVICE_FUNC
+ static inline Index ceil_div(Index a, Index b) { return (a + b - 1) / b; }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_DONT_INLINE void operator()(Scalar* block, const DataMapper& rhs,
+ Index depth, Index cols, Index stride = 0,
+ Index offset = 0) const {
+ eigen_assert(stride == 0);
+ eigen_assert(offset == 0);
+
+ EIGEN_STATIC_ASSERT((nr == 4), YOU_MADE_A_PROGRAMMING_MISTAKE);
+ typedef typename packet_traits<Scalar>::type Packet;
+
+ const int packet_size = 2;
+ const Index packet_cols4 = (cols / 4) * 4;
+ const Index peeled_k = (depth / packet_size) * packet_size;
+ const bool non_standard_patches = rhs.nonStandardPatches();
+
+ for (Index j2 = 0; j2 < packet_cols4; j2 += 4) {
+ const SubMapper dm0 = rhs.getLinearMapper(0, j2 + 0);
+ const SubMapper dm1 = rhs.getLinearMapper(0, j2 + 1);
+ const SubMapper dm2 = rhs.getLinearMapper(0, j2 + 2);
+ const SubMapper dm3 = rhs.getLinearMapper(0, j2 + 3);
+
+ Index k = 0;
+ if (!non_standard_patches) {
+ const Index patch_depth = rhs.patchDepth();
+ if ((patch_depth % packet_size) == 0) {
+ const Index patch_cols = rhs.patchCols();
+ const Index patch_rows = rhs.patchRows();
+
+ const Index startCol = rhs.colOffset();
+ const Index max_cols = std::min<Index>(
+ ceil_div(peeled_k, patch_rows * patch_depth) + startCol,
+ patch_cols);
+
+ for (Index c = startCol; c < max_cols; ++c) {
+ eigen_assert(k < peeled_k);
+ const Index startRow = (c == startCol) ? rhs.rowOffset() : 0;
+ const Index max_rows = std::min<Index>(
+ ceil_div(peeled_k - c * patch_rows * patch_depth, patch_depth) +
+ startRow,
+ patch_rows);
+
+ const bool pad_col0 = dm0.padCol(c);
+ const bool pad_col1 = dm1.padCol(c);
+ const bool pad_col2 = dm2.padCol(c);
+ const bool pad_col3 = dm3.padCol(c);
+ for (Index r = startRow; r < max_rows; ++r) {
+ eigen_assert(k < peeled_k);
+ const bool pad0 = pad_col0 || dm0.padRow(r);
+ const bool pad1 = pad_col1 || dm1.padRow(r);
+ const bool pad2 = pad_col2 || dm2.padRow(r);
+ const bool pad3 = pad_col3 || dm3.padRow(r);
+
+ const Index idx0 = dm0.baseIndex(r, c);
+ const Index idx1 = dm1.baseIndex(r, c);
+ const Index idx2 = dm2.baseIndex(r, c);
+ const Index idx3 = dm3.baseIndex(r, c);
+
+ const Index startDepth =
+ ((c == startCol) && (r == startRow)) ? rhs.depthOffset() : 0;
+ const Index max_depth =
+ std::min<Index>(peeled_k - c * patch_rows * patch_depth -
+ r * patch_depth + startDepth,
+ patch_depth);
+ eigen_assert((max_depth - startDepth) % packet_size == 0);
+ for (Index d = startDepth; d < max_depth; d += packet_size) {
+ eigen_assert(k < peeled_k);
+ PacketBlock<Packet, 2> kernel0;
+ PacketBlock<Packet, 2> kernel1;
+ kernel0.packet[0] = pad0 ? pset1<Packet>(Scalar(0))
+ : rhs.packetNoPadding(d, idx0);
+ kernel0.packet[1] = pad1 ? pset1<Packet>(Scalar(0))
+ : rhs.packetNoPadding(d, idx1);
+ kernel1.packet[0] = pad2 ? pset1<Packet>(Scalar(0))
+ : rhs.packetNoPadding(d, idx2);
+ kernel1.packet[1] = pad3 ? pset1<Packet>(Scalar(0))
+ : rhs.packetNoPadding(d, idx3);
+ ptranspose(kernel0);
+ ptranspose(kernel1);
+ pstoreu(block + 0 * packet_size, kernel0.packet[0]);
+ pstoreu(block + 1 * packet_size, kernel1.packet[0]);
+ pstoreu(block + 2 * packet_size, kernel0.packet[1]);
+ pstoreu(block + 3 * packet_size, kernel1.packet[1]);
+ block += 4 * packet_size;
+ k += packet_size;
+ }
+ }
+ }
+
+ for (; k < peeled_k; k += packet_size) {
+ PacketBlock<Packet, 2> kernel0;
+ PacketBlock<Packet, 2> kernel1;
+ kernel0.packet[0] = dm0.loadPacketFast(k);
+ kernel0.packet[1] = dm1.loadPacketFast(k);
+ kernel1.packet[0] = dm2.loadPacketFast(k);
+ kernel1.packet[1] = dm3.loadPacketFast(k);
+ ptranspose(kernel0);
+ ptranspose(kernel1);
+ pstoreu(block + 0 * packet_size, kernel0.packet[0]);
+ pstoreu(block + 1 * packet_size, kernel1.packet[0]);
+ pstoreu(block + 2 * packet_size, kernel0.packet[1]);
+ pstoreu(block + 3 * packet_size, kernel1.packet[1]);
+ block += 4 * packet_size;
+ }
+ } else {
+ for (; k < peeled_k; k += packet_size) {
+ PacketBlock<Packet, 2> kernel0;
+ PacketBlock<Packet, 2> kernel1;
+ kernel0.packet[0] = dm0.loadPacketStandard(k);
+ kernel0.packet[1] = dm1.loadPacketStandard(k);
+ kernel1.packet[0] = dm2.loadPacketStandard(k);
+ kernel1.packet[1] = dm3.loadPacketStandard(k);
+ ptranspose(kernel0);
+ ptranspose(kernel1);
+ pstoreu(block + 0 * packet_size, kernel0.packet[0]);
+ pstoreu(block + 1 * packet_size, kernel1.packet[0]);
+ pstoreu(block + 2 * packet_size, kernel0.packet[1]);
+ pstoreu(block + 3 * packet_size, kernel1.packet[1]);
+ block += 4 * packet_size;
+ }
+ }
+ }
+ if (!rhs.nonStandardPatches()) {
+ for (; k < depth; k++) {
+ block[0] = dm0.loadCoeffStandard(k);
+ block[1] = dm1.loadCoeffStandard(k);
+ block[2] = dm2.loadCoeffStandard(k);
+ block[3] = dm3.loadCoeffStandard(k);
+ block += 4;
+ }
+ } else {
+ for (; k < depth; k++) {
+ block[0] = dm0(k);
+ block[1] = dm1(k);
+ block[2] = dm2(k);
+ block[3] = dm3(k);
+ block += 4;
+ }
+ }
+ }
+
+ // copy the remaining columns one at a time (nr==1)
+ for (Index j2 = packet_cols4; j2 < cols; ++j2) {
+ const SubMapper dm0 = rhs.getLinearMapper(0, j2);
+ for (Index k = 0; k < depth; k++) {
+ *block = dm0(k);
+ block += 1;
+ }
+ }
+ }
+};
+
// Special case for non-vectorized types such as float16.
template <typename NewDimension, DenseIndex Rows, DenseIndex Cols,
typename ArgType, typename Device, typename Scalar, typename Index,
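The comment above the new gemm_pack_rhs specialization refers to the interleaving done by the two ptranspose'd PacketBlock<Packet, 2> pairs: each step consumes two depth entries from four patch columns and writes them out column-interleaved into the packed block. A hypothetical, scalar Python model of the resulting layout (names invented for illustration; the real code does this with SIMD packet loads and stores):

    def pack_rhs_block(cols, depth):
        """Model of the packet_size == 2, nr == 4 packing order (illustrative)."""
        block = []
        for d in range(0, depth, 2):          # one packet covers two depth entries
            for c in range(4):                # stores of kernel0/kernel1 .packet[0]
                block.append(cols[c][d])
            for c in range(4):                # stores of kernel0/kernel1 .packet[1]
                block.append(cols[c][d + 1])
        return block

    cols = [[10 * c + d for d in range(4)] for c in range(4)]
    print(pack_rhs_block(cols, 4))
    # [0, 10, 20, 30, 1, 11, 21, 31, 2, 12, 22, 32, 3, 13, 23, 33]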
diff --git a/tensorflow/core/ops/nn_ops.cc b/tensorflow/core/ops/nn_ops.cc
index d6a0f38033..1f4e9753c3 100644
--- a/tensorflow/core/ops/nn_ops.cc
+++ b/tensorflow/core/ops/nn_ops.cc
@@ -266,7 +266,7 @@ REGISTER_OP("Conv2D")
.Input("input: T")
.Input("filter: T")
.Output("output: T")
- .Attr("T: {half, bfloat16, float}")
+ .Attr("T: {half, bfloat16, float, double}")
.Attr("strides: list(int)")
.Attr("use_cudnn_on_gpu: bool = true")
.Attr(GetPaddingAttrString())
@@ -279,7 +279,7 @@ REGISTER_OP("Conv2DBackpropInput")
.Input("filter: T")
.Input("out_backprop: T")
.Output("output: T")
- .Attr("T: {half, bfloat16, float}")
+ .Attr("T: {half, bfloat16, float, double}")
.Attr("strides: list(int)")
.Attr("use_cudnn_on_gpu: bool = true")
.Attr(GetPaddingAttrString())
@@ -301,7 +301,7 @@ REGISTER_OP("Conv2DBackpropFilter")
.Input("filter_sizes: int32")
.Input("out_backprop: T")
.Output("output: T")
- .Attr("T: {half, bfloat16, float}")
+ .Attr("T: {half, bfloat16, float, double}")
.Attr("strides: list(int)")
.Attr("use_cudnn_on_gpu: bool = true")
.Attr(GetPaddingAttrString())
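With double added to the "T" attr, graph construction itself accepts float64 convolutions instead of rejecting them at op-creation time. A small check, again assuming the TF 1.x API; the placeholder and variable shapes below are illustrative only:

    import tensorflow as tf

    x = tf.placeholder(tf.float64, [None, 28, 28, 1])
    w = tf.get_variable("conv_w", [5, 5, 1, 32], dtype=tf.float64)
    y = tf.nn.conv2d(x, w, strides=[1, 1, 1, 1], padding="VALID")
    print(y.dtype)  # <dtype: 'float64'>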
diff --git a/tensorflow/python/kernel_tests/conv_ops_test.py b/tensorflow/python/kernel_tests/conv_ops_test.py
index 25525cc128..a291bef0ad 100644
--- a/tensorflow/python/kernel_tests/conv_ops_test.py
+++ b/tensorflow/python/kernel_tests/conv_ops_test.py
@@ -159,11 +159,11 @@ class Conv2DTest(test.TestCase):
def _DtypesToTest(self, use_gpu):
if use_gpu and not test_util.CudaSupportsHalfMatMulAndConv():
- return [dtypes.float32]
+ return [dtypes.float32, dtypes.float64]
else:
# It is important that float32 comes before float16 here,
# as we will be using its gradients as reference for fp16 gradients.
- return [dtypes.float32, dtypes.float16]
+ return [dtypes.float32, dtypes.float16, dtypes.float64]
def _SetupValuesForDevice(self, tensor_in_sizes, filter_in_sizes, dilations,
strides, padding, data_format, dtype, use_gpu):
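A sketch of how a test built on the expanded _DtypesToTest list might compare results across dtypes, using float32 (first in the list) as the reference and a looser tolerance for float16; _RunConv and the tolerance values are hypothetical and not part of the actual test file:

    def _VerifyValuesAcrossDtypes(self, tensor_in_sizes, filter_in_sizes,
                                  strides, padding, use_gpu):
        reference = None
        for dtype in self._DtypesToTest(use_gpu):
            value = self._RunConv(tensor_in_sizes, filter_in_sizes, strides,
                                  padding, dtype, use_gpu)
            if reference is None:
                reference = value  # float32 comes first and serves as reference
            tol = 1e-3 if dtype == dtypes.float16 else 1e-5
            self.assertAllClose(reference, value, atol=tol, rtol=tol)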