diff options
author | TensorFlower Gardener <gardener@tensorflow.org> | 2018-09-30 22:19:40 -0700 |
---|---|---|
committer | TensorFlower Gardener <gardener@tensorflow.org> | 2018-09-30 22:19:40 -0700 |
commit | 987954ce50583409e54828a044e0866bcfdbd88a (patch) | |
tree | cf8c6a3b2348e325b6c29cc675938e33de24b494 /tensorflow/core/kernels | |
parent | 76c4853b50f201b4a809ac66746c798e049b294c (diff) | |
parent | 0136c7307f036290fa3ca308c1a9c67c053d903f (diff) |
Merge pull request #22571 from Intel-tensorflow:agramesh/fix_mkl_slice
PiperOrigin-RevId: 215161850
Diffstat (limited to 'tensorflow/core/kernels')
-rw-r--r-- | tensorflow/core/kernels/slice_op.cc | 195 |
1 files changed, 0 insertions, 195 deletions
diff --git a/tensorflow/core/kernels/slice_op.cc b/tensorflow/core/kernels/slice_op.cc index 97f77e45b6..a006c69297 100644 --- a/tensorflow/core/kernels/slice_op.cc +++ b/tensorflow/core/kernels/slice_op.cc @@ -228,191 +228,6 @@ class SliceOp : public OpKernel { } }; -#ifdef INTEL_MKL -template <typename Device, typename T> -class MklSliceOp : public OpKernel { - public: - explicit MklSliceOp(OpKernelConstruction* context) : OpKernel(context) {} - - void Compute(OpKernelContext* context) override { - TensorShape output_shape; - gtl::InlinedVector<int64, 4> begin; - gtl::InlinedVector<int64, 4> size; - Tensor* result = nullptr; - bool done = false; - SharedSliceCommonCases<T>(context, &output_shape, &begin, &size, &result, - &done); - if (!context->status().ok() || done == true) return; - - const Tensor& input = context->input(0); - const int input_dims = input.dims(); - - if (output_shape.num_elements() > 0) { - if (std::is_same<Device, CPUDevice>::value && input_dims == 2 && - DataTypeCanUseMemcpy(DataTypeToEnum<T>::v())) { - auto input = context->input(0).tensor<T, 2>(); - auto output = result->tensor<T, 2>(); - // TODO(agarwal): Consider multi-threading this loop for cases where - // size[0] is very large. - for (int i = 0; i < size[0]; ++i) { - const int64 row = begin[0] + i; - if (i + 1 < size[0]) { - port::prefetch<port::PREFETCH_HINT_T0>(&output(i + 1, 0)); - port::prefetch<port::PREFETCH_HINT_T0>(&input(row + 1, begin[1])); - } - memcpy(&output(i, 0), &input(row, begin[1]), size[1] * sizeof(T)); - } - return; - } -#define HANDLE_DIM(NDIM) \ - if (input_dims == NDIM) { \ - HandleCase<NDIM>(context, begin, size, result); \ - return; \ - } - - HANDLE_DIM(1); - HANDLE_DIM(2); - HANDLE_DIM(3); - HANDLE_DIM(4); - HANDLE_DIM(5); - HANDLE_DIM(6); - HANDLE_DIM(7); - -#undef HANDLE_DIM - - OP_REQUIRES( - context, false, - errors::Unimplemented("SliceOp : Unhandled input dimensions")); - } - } - - private: - // Helper function for DoesSliceShapeDifferInOnly1D. Checks if the following - // criteria matches for slice_dim: if indices for slice are 0 in all dims - // except slice_dim and if sizes of all the dimensions of the slice are same - // as the sizes of all the dimensions of the input except slice_dim, then - // returns True. Otherwise, returns False. - bool DoesSliceShapeDifferInOnly1DHelper(const TensorShape& input_shape, - const gtl::ArraySlice<int64>& begin, - const gtl::ArraySlice<int64>& size, - int slice_dim) { - for (int dim = 0; dim < 4; dim++) { - if (dim != slice_dim && - (begin[dim] != 0 || size[dim] != input_shape.dim_size(dim))) { - return false; - } - } - return true; - } - - // Is 'input' tensor being sliced over a single dimension out of 4? - // - // This check is applicable in the context of Slice of a 4-D tensor in - // NHWC or NCHW format over channel dimension. - // - // If indices for slice are 0 in all dims except one dimension and if sizes of - // all dimensions of slice are same as sizes of all dimensions of inputs - // except that dimension, then we are slicing over a single dimension. - // - // Returns True if Slicing over a single dimension, and sets slice_dim - // to the number of the dimension that satisfies criteria. - bool DoesSliceShapeDifferInOnly1D(const TensorShape& input_shape, - const gtl::ArraySlice<int64>& begin, - const gtl::ArraySlice<int64>& size, - int* slice_dim) { - for (int dim = 0; dim < 4; dim++) { - if (DoesSliceShapeDifferInOnly1DHelper(input_shape, begin, size, dim)) { - *slice_dim = dim; - return true; - } - } - return false; - } - - template <int NDIM> - void HandleCase(OpKernelContext* context, const gtl::ArraySlice<int64>& begin, - const gtl::ArraySlice<int64>& size, Tensor* result) { - int slice_dim = -1; - TensorShape in_shape = context->input(0).shape(); - // Special case for handling 4-D tensor slice when shape of the slice - // differs from the input tensor in only 1 out of 4 dimensions. - // This case arises in the context of Slice of 4-D tensor in NHWC or NCHW - // format over channel dimension. - if (NDIM == 4 && - DoesSliceShapeDifferInOnly1D(in_shape, begin, size, &slice_dim)) { - size_t in_strides[4] = { - (size_t)in_shape.dim_size(1) * in_shape.dim_size(2) * - in_shape.dim_size(3), - (size_t)in_shape.dim_size(2) * in_shape.dim_size(3), - (size_t)in_shape.dim_size(3), (size_t)1}; - - size_t out_strides[4] = {(size_t)size[1] * size[2] * size[3], - (size_t)size[2] * size[3], (size_t)size[3], - (size_t)1}; - - T* in_buf = const_cast<T*>( - const_cast<const T*>(context->input(0).flat<T>().data())); - T* op_buf = result->flat<T>().data(); - - if (slice_dim == 1) { - /* data format = NCHW */ - -#pragma omp parallel for - for (ssize_t d0 = begin[0]; d0 < begin[0] + size[0]; d0++) { - T* ip = in_buf + (d0 * in_strides[0]); - T* op = op_buf + ((d0 - begin[0]) * out_strides[0]); -#pragma omp parallel for - for (ssize_t d1 = begin[1]; d1 < begin[1] + size[1]; d1++) { - T* ip1 = ip + (d1 * in_strides[1]); - T* op1 = op + ((d1 - begin[1]) * out_strides[1]); - // For NCHW, H and W will be contiguous. So we can copy - // both with one memcpy. - memcpy(static_cast<void*>(op1), static_cast<void*>(ip1), - sizeof(T) * in_strides[1]); - } - } - return; - } else if (slice_dim == 3) { - /* data_format = NHWC */ - -#pragma omp parallel for - for (ssize_t d0 = begin[0]; d0 < begin[0] + size[0]; d0++) { - T* ip = in_buf + (d0 * in_strides[0]); - T* op = op_buf + ((d0 - begin[0]) * out_strides[0]); -#pragma omp parallel for - for (ssize_t d1 = begin[1]; d1 < begin[1] + size[1]; d1++) { - T* ip1 = ip + (d1 * in_strides[1]); - T* op1 = op + ((d1 - begin[1]) * out_strides[1]); -#pragma omp parallel for - for (ssize_t d2 = begin[2]; d2 < begin[2] + size[2]; d2++) { - T* ip2 = ip1 + (d2 * in_strides[2]); - T* ip3 = ip2 + begin[3]; - T* op2 = op1 + ((d2 - begin[2]) * out_strides[2]); - T* op3 = op2; - memcpy(static_cast<void*>(op3), static_cast<void*>(ip3), - sizeof(T) * size[3]); - } - } - } - return; - } - // slice_dim is not 1 or 3, then we fallback to Eigen implementation. - } - - Eigen::DSizes<Eigen::DenseIndex, NDIM> indices; - Eigen::DSizes<Eigen::DenseIndex, NDIM> sizes; - for (int i = 0; i < NDIM; ++i) { - indices[i] = begin[i]; - sizes[i] = size[i]; - } - - functor::Slice<Device, T, NDIM>()( - context->eigen_device<Device>(), result->tensor<T, NDIM>(), - context->input(0).tensor<T, NDIM>(), indices, sizes); - } -}; -#endif // INTEL_MKL - // Forward declarations of the functor specializations for declared in the // sharded source files. namespace functor { @@ -440,15 +255,6 @@ TF_CALL_ALL_TYPES(DECLARE_FOR_N); #undef DECLARE_CPU_SPEC } // namespace functor -#if defined(INTEL_MKL) && defined(ENABLE_MKL) -#define REGISTER_SLICE(type) \ - REGISTER_KERNEL_BUILDER(Name("Slice") \ - .Device(DEVICE_CPU) \ - .TypeConstraint<type>("T") \ - .HostMemory("begin") \ - .HostMemory("size"), \ - MklSliceOp<CPUDevice, type>) -#else #define REGISTER_SLICE(type) \ REGISTER_KERNEL_BUILDER(Name("Slice") \ .Device(DEVICE_CPU) \ @@ -456,7 +262,6 @@ TF_CALL_ALL_TYPES(DECLARE_FOR_N); .HostMemory("begin") \ .HostMemory("size"), \ SliceOp<CPUDevice, type>) -#endif // INTEL_MKL && ENABLE_MKL TF_CALL_POD_STRING_TYPES(REGISTER_SLICE); TF_CALL_QUANTIZED_TYPES(REGISTER_SLICE); |