author     TensorFlower Gardener <gardener@tensorflow.org>  2018-09-30 22:19:40 -0700
committer  TensorFlower Gardener <gardener@tensorflow.org>  2018-09-30 22:19:40 -0700
commit     987954ce50583409e54828a044e0866bcfdbd88a (patch)
tree       cf8c6a3b2348e325b6c29cc675938e33de24b494 /tensorflow/core
parent     76c4853b50f201b4a809ac66746c798e049b294c (diff)
parent     0136c7307f036290fa3ca308c1a9c67c053d903f (diff)
Merge pull request #22571 from Intel-tensorflow:agramesh/fix_mkl_slice
PiperOrigin-RevId: 215161850
Diffstat (limited to 'tensorflow/core')
-rw-r--r--  tensorflow/core/kernels/slice_op.cc | 195
1 file changed, 0 insertions(+), 195 deletions(-)
diff --git a/tensorflow/core/kernels/slice_op.cc b/tensorflow/core/kernels/slice_op.cc
index 97f77e45b6..a006c69297 100644
--- a/tensorflow/core/kernels/slice_op.cc
+++ b/tensorflow/core/kernels/slice_op.cc
@@ -228,191 +228,6 @@ class SliceOp : public OpKernel {
}
};
-#ifdef INTEL_MKL
-template <typename Device, typename T>
-class MklSliceOp : public OpKernel {
- public:
- explicit MklSliceOp(OpKernelConstruction* context) : OpKernel(context) {}
-
- void Compute(OpKernelContext* context) override {
- TensorShape output_shape;
- gtl::InlinedVector<int64, 4> begin;
- gtl::InlinedVector<int64, 4> size;
- Tensor* result = nullptr;
- bool done = false;
- SharedSliceCommonCases<T>(context, &output_shape, &begin, &size, &result,
- &done);
- if (!context->status().ok() || done == true) return;
-
- const Tensor& input = context->input(0);
- const int input_dims = input.dims();
-
- if (output_shape.num_elements() > 0) {
- if (std::is_same<Device, CPUDevice>::value && input_dims == 2 &&
- DataTypeCanUseMemcpy(DataTypeToEnum<T>::v())) {
- auto input = context->input(0).tensor<T, 2>();
- auto output = result->tensor<T, 2>();
- // TODO(agarwal): Consider multi-threading this loop for cases where
- // size[0] is very large.
- for (int i = 0; i < size[0]; ++i) {
- const int64 row = begin[0] + i;
- if (i + 1 < size[0]) {
- port::prefetch<port::PREFETCH_HINT_T0>(&output(i + 1, 0));
- port::prefetch<port::PREFETCH_HINT_T0>(&input(row + 1, begin[1]));
- }
- memcpy(&output(i, 0), &input(row, begin[1]), size[1] * sizeof(T));
- }
- return;
- }
-#define HANDLE_DIM(NDIM) \
- if (input_dims == NDIM) { \
- HandleCase<NDIM>(context, begin, size, result); \
- return; \
- }
-
- HANDLE_DIM(1);
- HANDLE_DIM(2);
- HANDLE_DIM(3);
- HANDLE_DIM(4);
- HANDLE_DIM(5);
- HANDLE_DIM(6);
- HANDLE_DIM(7);
-
-#undef HANDLE_DIM
-
- OP_REQUIRES(
- context, false,
- errors::Unimplemented("SliceOp : Unhandled input dimensions"));
- }
- }
-
- private:
- // Helper function for DoesSliceShapeDifferInOnly1D. Returns true iff the
- // slice begins at 0 in every dimension except slice_dim, and the slice sizes
- // match the input sizes in every dimension except slice_dim. Otherwise
- // returns false.
- bool DoesSliceShapeDifferInOnly1DHelper(const TensorShape& input_shape,
- const gtl::ArraySlice<int64>& begin,
- const gtl::ArraySlice<int64>& size,
- int slice_dim) {
- for (int dim = 0; dim < 4; dim++) {
- if (dim != slice_dim &&
- (begin[dim] != 0 || size[dim] != input_shape.dim_size(dim))) {
- return false;
- }
- }
- return true;
- }
-
- // Is 'input' tensor being sliced over a single dimension out of 4?
- //
- // This check is applicable in the context of Slice of a 4-D tensor in
- // NHWC or NCHW format over channel dimension.
- //
- // If the slice begins at 0 in all dimensions except one, and its sizes match
- // the input sizes in all dimensions except that one, then we are slicing
- // over a single dimension.
- //
- // Returns true if slicing over a single dimension, and sets slice_dim to the
- // index of the dimension that satisfies the criteria.
- bool DoesSliceShapeDifferInOnly1D(const TensorShape& input_shape,
- const gtl::ArraySlice<int64>& begin,
- const gtl::ArraySlice<int64>& size,
- int* slice_dim) {
- for (int dim = 0; dim < 4; dim++) {
- if (DoesSliceShapeDifferInOnly1DHelper(input_shape, begin, size, dim)) {
- *slice_dim = dim;
- return true;
- }
- }
- return false;
- }
-
- template <int NDIM>
- void HandleCase(OpKernelContext* context, const gtl::ArraySlice<int64>& begin,
- const gtl::ArraySlice<int64>& size, Tensor* result) {
- int slice_dim = -1;
- TensorShape in_shape = context->input(0).shape();
- // Special case for handling 4-D tensor slice when shape of the slice
- // differs from the input tensor in only 1 out of 4 dimensions.
- // This case arises in the context of Slice of 4-D tensor in NHWC or NCHW
- // format over channel dimension.
- if (NDIM == 4 &&
- DoesSliceShapeDifferInOnly1D(in_shape, begin, size, &slice_dim)) {
- size_t in_strides[4] = {
- (size_t)in_shape.dim_size(1) * in_shape.dim_size(2) *
- in_shape.dim_size(3),
- (size_t)in_shape.dim_size(2) * in_shape.dim_size(3),
- (size_t)in_shape.dim_size(3), (size_t)1};
-
- size_t out_strides[4] = {(size_t)size[1] * size[2] * size[3],
- (size_t)size[2] * size[3], (size_t)size[3],
- (size_t)1};
-
- T* in_buf = const_cast<T*>(
- const_cast<const T*>(context->input(0).flat<T>().data()));
- T* op_buf = result->flat<T>().data();
-
- if (slice_dim == 1) {
- /* data format = NCHW */
-
-#pragma omp parallel for
- for (ssize_t d0 = begin[0]; d0 < begin[0] + size[0]; d0++) {
- T* ip = in_buf + (d0 * in_strides[0]);
- T* op = op_buf + ((d0 - begin[0]) * out_strides[0]);
-#pragma omp parallel for
- for (ssize_t d1 = begin[1]; d1 < begin[1] + size[1]; d1++) {
- T* ip1 = ip + (d1 * in_strides[1]);
- T* op1 = op + ((d1 - begin[1]) * out_strides[1]);
- // For NCHW, H and W will be contiguous. So we can copy
- // both with one memcpy.
- memcpy(static_cast<void*>(op1), static_cast<void*>(ip1),
- sizeof(T) * in_strides[1]);
- }
- }
- return;
- } else if (slice_dim == 3) {
- /* data_format = NHWC */
-
-#pragma omp parallel for
- for (ssize_t d0 = begin[0]; d0 < begin[0] + size[0]; d0++) {
- T* ip = in_buf + (d0 * in_strides[0]);
- T* op = op_buf + ((d0 - begin[0]) * out_strides[0]);
-#pragma omp parallel for
- for (ssize_t d1 = begin[1]; d1 < begin[1] + size[1]; d1++) {
- T* ip1 = ip + (d1 * in_strides[1]);
- T* op1 = op + ((d1 - begin[1]) * out_strides[1]);
-#pragma omp parallel for
- for (ssize_t d2 = begin[2]; d2 < begin[2] + size[2]; d2++) {
- T* ip2 = ip1 + (d2 * in_strides[2]);
- T* ip3 = ip2 + begin[3];
- T* op2 = op1 + ((d2 - begin[2]) * out_strides[2]);
- T* op3 = op2;
- memcpy(static_cast<void*>(op3), static_cast<void*>(ip3),
- sizeof(T) * size[3]);
- }
- }
- }
- return;
- }
-     // If slice_dim is neither 1 nor 3, fall back to the Eigen implementation.
- }
-
- Eigen::DSizes<Eigen::DenseIndex, NDIM> indices;
- Eigen::DSizes<Eigen::DenseIndex, NDIM> sizes;
- for (int i = 0; i < NDIM; ++i) {
- indices[i] = begin[i];
- sizes[i] = size[i];
- }
-
- functor::Slice<Device, T, NDIM>()(
- context->eigen_device<Device>(), result->tensor<T, NDIM>(),
- context->input(0).tensor<T, NDIM>(), indices, sizes);
- }
-};
-#endif // INTEL_MKL
-
// Forward declarations of the functor specializations declared in the
// sharded source files.
namespace functor {
@@ -440,15 +255,6 @@ TF_CALL_ALL_TYPES(DECLARE_FOR_N);
#undef DECLARE_CPU_SPEC
} // namespace functor
-#if defined(INTEL_MKL) && defined(ENABLE_MKL)
-#define REGISTER_SLICE(type) \
- REGISTER_KERNEL_BUILDER(Name("Slice") \
- .Device(DEVICE_CPU) \
- .TypeConstraint<type>("T") \
- .HostMemory("begin") \
- .HostMemory("size"), \
- MklSliceOp<CPUDevice, type>)
-#else
#define REGISTER_SLICE(type) \
REGISTER_KERNEL_BUILDER(Name("Slice") \
.Device(DEVICE_CPU) \
@@ -456,7 +262,6 @@ TF_CALL_ALL_TYPES(DECLARE_FOR_N);
.HostMemory("begin") \
.HostMemory("size"), \
SliceOp<CPUDevice, type>)
-#endif // INTEL_MKL && ENABLE_MKL
TF_CALL_POD_STRING_TYPES(REGISTER_SLICE);
TF_CALL_QUANTIZED_TYPES(REGISTER_SLICE);
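
For context, the removed MklSliceOp fast path rested on one observation: if a
4-D slice differs from its input in exactly one dimension, every run along the
trailing dimensions is contiguous in row-major memory, so the slice reduces to
a loop of memcpy calls instead of a generic Eigen strided copy. The standalone
sketch below illustrates that idea; it is not TensorFlow code, and the float
element type, raw-array shapes, and helper names (SliceDiffersInOnly1D,
CopyChannelSliceNCHW) are assumptions made for the example.

#include <cstddef>
#include <cstdint>
#include <cstring>

// Returns true and sets *slice_dim if (begin, size) selects the full extent
// of the input in every dimension except exactly one. Mirrors the intent of
// DoesSliceShapeDifferInOnly1D in the removed kernel.
static bool SliceDiffersInOnly1D(const int64_t shape[4],
                                 const int64_t begin[4],
                                 const int64_t size[4], int* slice_dim) {
  for (int cand = 0; cand < 4; ++cand) {
    bool matches = true;
    for (int d = 0; d < 4; ++d) {
      if (d != cand && (begin[d] != 0 || size[d] != shape[d])) {
        matches = false;
        break;
      }
    }
    if (matches) {
      *slice_dim = cand;
      return true;
    }
  }
  return false;
}

// Channel slice of NCHW data (slice_dim == 1): for each (n, c) pair the whole
// H*W plane is contiguous, so a single memcpy moves it. Assumes (begin, size)
// passed the SliceDiffersInOnly1D check with slice_dim == 1, i.e. the N, H,
// and W extents of the slice equal those of the input.
static void CopyChannelSliceNCHW(const float* in, float* out,
                                 const int64_t shape[4],
                                 const int64_t begin[4],
                                 const int64_t size[4]) {
  const size_t plane = static_cast<size_t>(shape[2]) * shape[3];   // H * W
  const size_t in_batch = static_cast<size_t>(shape[1]) * plane;   // C * H * W
  const size_t out_batch = static_cast<size_t>(size[1]) * plane;
  for (int64_t n = 0; n < size[0]; ++n) {
    for (int64_t c = 0; c < size[1]; ++c) {
      const float* src =
          in + (begin[0] + n) * in_batch + (begin[1] + c) * plane;
      float* dst = out + n * out_batch + c * plane;
      std::memcpy(dst, src, plane * sizeof(float));
    }
  }
}

The removed kernel parallelized the equivalent loops with OpenMP pragmas and
fell back to the generic functor::Slice Eigen path whenever the
single-dimension test failed; after this merge, the plain SliceOp path handles
all cases on CPU.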