From f3c89936e97c99dead1ca3310246691c1b221adf Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower"
Date: Mon, 26 Jun 2017 12:54:12 -0700
Subject: Merge changes from github.

END_PUBLIC

Note: this CL will break builds. cl/159887762 to follow to fix all the
breakages.

---
Commit 2336cdf7f authored by Maxwell Paul Brickner
Committed by gunan:
Updated link to use HTTPS (#10998)

Howdy! I just updated a link to use https instead of http. Thanks!

---
Commit ad0892df1 authored by Luke Iwanski
Committed by Luke Iwanski:
[OpenCL] Fixes run_metadata_test for SYCL

This test is designed to test CUDA-specific behavior.

---
Commit 6b37a0725 authored by Todd Wang
Committed by GitHub:
Update comments

---
Commit 1699d904a authored by John Lawson
Committed by Luke Iwanski:
[OpenCL] Fixes CUDA-specific test run on SYCL (#56)

The testBadParentValuesOnGPU test should only be run on CUDA devices, as it
checks for particular CUDA behaviour. We don't actually provide a SYCL kernel
for GatherTree, so it's not a problem that the tests don't target SYCL.

---
Commit 3c1946230 authored by myPrecious
Committed by Shanqing Cai:
Java API to get the size of specified input list of operations. (#10865)

* Java API to get the size of specified input list of operations
* Remove unnecessary explanation to avoid introducing a new term to users.

---
Commit e911c7480 authored by Luke Iwanski
Committed by Luke Iwanski:
[OpenCL] REGISTER -> REGISTER6

---
Commit fbf6c4cec authored by superryanguo
Committed by superryanguo:
Simplify the Quickstart section; the weblink is better

---
Commit 72e2918cc authored by Taehoon Lee
Committed by Taehoon Lee:
Fix typos

---
Commit 90c4406b7 authored by Rishabh Patel
Committed by GitHub:
Correct the learning rate as per the code snippet

---
Commit 03da61134 authored by Todd Wang
Committed by GitHub:
Update ir_array.cc

---
Commit 2df6cd3ac authored by Todd Wang
Committed by GitHub:
Another try

---
Commit af0cbace1 authored by Luke Iwanski
Committed by Benoit Steiner:
[OpenCL] Transpose to go through Eigen (#10321)

---
Commit fc7361081 authored by Luke Iwanski
Committed by Benoit Steiner:
[OpenCL] Registers RGBToHSV and HSVToRGB (#91) (#10848)

* [OpenCL] Added RGBToHSV and HSVToRGB
* Aligning '\'

---
Commit 832894ef8 authored by Luke Iwanski
Committed by Benoit Steiner:
[OpenCL] Registers AdjustContrastv2 (#10949)

* [OpenCL] Registers AdjustContrastv2 (#93)
* [OpenCL] Extended adjust_contrast_op_benchmark_test for OpenCL (#96)
* [OpenCL] Extended adjust_contrast_op_benchmark_test for OpenCL
* Simplified to #ifndef
* Changed to "#if GOOGLE_CUDA"
* Update adjust_contrast_op_benchmark_test.cc
* Added comments

---
Commit cb4c2f8d1 authored by Yifei Feng
Committed by Yifei Feng:
Make TransferBufferToInFeed not virtual so it compiles.

---
Commit e89f04d80 authored by Yifei Feng
Committed by Yifei Feng:
Fix calling Literal member functions.

---
Commit 15a8df724 authored by Yifei Feng
Committed by Yifei Feng:
Fix mac build; clone from meheff's change:
[XLA] Change return type of DeviceAssignment::Deserialize to fix build
breakage on mac.

The mac build had the following error:

  error: incomplete type 'xla::DeviceAssignment' used in type trait expression

This was due to a static method returning a StatusOr<DeviceAssignment> inside
of the definition of DeviceAssignment.

---
Commit a54d43fa4 authored by Yifei Feng
Committed by Yifei Feng:
Replace LiteralUtil with Literal in compiler/plugin/executor

---
Commit 88a6bb80c authored by Guenther Schmuelling
Committed by Guenther Schmuelling:
Expand inline for debug builds to limit number of symbols

---
Commit 62fb49d31 authored by Yifei Feng
Committed by Yifei Feng:
Fix visibility error for contrib/remote_fused_graph/pylib/BUILD.

---
Commit 4c75252f2 authored by Mark Neumann
Committed by Mark Neumann:
Fix initial test values to avoid numerical instability

---
Commit b58d98353 authored by sj6077
Committed by Benoit Steiner:
Fixes for AutoParallel bug (#10368)

* Fix the bug that auto_parallel could replicate variable snapshot name
* Use NodeName in grappler:utils instead of substr, convert
  variables->variable_def of grappler item
* Remove variable_def from grappler item, exclude snapshot nodes from
  dont_replicate_nodes in auto_parallel

---
Commit a286b7db8 authored by Yifei Feng
Committed by Yifei Feng:
Make debug_test slice integer.

---
Commit 97fcfdfa6 authored by Toby Boyd
Committed by GitHub:
Fixed path to seq2seq.py and minor formatting

---
Commit 63c1befb8 authored by Anish Shah
Committed by Anish Shah:
Improve docs for tf.nn.depthwise_conv2d_native

---
Commit 8d42202b2 authored by Yong Tang
Committed by Yong Tang:
Fix mismatched delete in mkl_tfconv_op.cc

This fixes a mismatched new[]/delete in mkl_tfconv_op.cc. (The file went
through clang-format, so there are some additional changes.)

Signed-off-by: Yong Tang

---
Commit 26301bd55 authored by Danny Goodman
Committed by Danny Goodman:
Fix error format

---
Commit b3f33ad46 authored by Yao Zhang
Committed by TensorFlower Gardener:
Make changes to prepare for the fused option of batch norm to be set to None
(None means using fused batch norm if possible).

PiperOrigin-RevId: 159649743

---
Commit a4a469832 authored by A. Unique TensorFlower
Committed by TensorFlower Gardener:
[XLA] Add tests for select ops and while loops that produce tuples that
contain predicates.

PiperOrigin-RevId: 159645900

---
Commit 980d3f2be authored by A. Unique TensorFlower
Committed by TensorFlower Gardener:
Use C API to implement Operation.name property

This name property is used in many existing tests, including those that
already run with the C API enabled (math_ops_test, framework_ops_test,
session_test, session_partial_run_test, math_ops_test_gpu, etc.).

PiperOrigin-RevId: 159645767

---
Commit 26239c706 authored by A. Unique TensorFlower
Committed by TensorFlower Gardener:
Previously we didn't have an implementation of BatchNormInference and
BatchNormTraining, which gave a linker error if anyone ever tried to call
them. A dummy implementation is friendlier than a linker error.

PiperOrigin-RevId: 159645612

---
Commit f671c5caa authored by A. Unique TensorFlower
Committed by TensorFlower Gardener:
BEGIN_PUBLIC
Automated g4 rollback of changelist 159570549

PiperOrigin-RevId: 160182040
---
 tensorflow/core/kernels/slice_op.cc | 258 +++++++++++++++++++++++++++++++++---
 1 file changed, 242 insertions(+), 16 deletions(-)

(limited to 'tensorflow/core/kernels/slice_op.cc')

diff --git a/tensorflow/core/kernels/slice_op.cc b/tensorflow/core/kernels/slice_op.cc
index ee6f9a28cd..d46701749b 100644
--- a/tensorflow/core/kernels/slice_op.cc
+++ b/tensorflow/core/kernels/slice_op.cc
@@ -118,6 +118,43 @@ static void SharedValidation(OpKernelContext* context,
   }
 }
 
+// Extracted out code in SliceOp::Compute so that MklSliceOp can reuse this
+// generic code
+template <typename T>
+static void SharedSliceCommonCases(OpKernelContext* context,
+                                   TensorShape* output_shape,
+                                   gtl::InlinedVector<int64, 4>* begin,
+                                   gtl::InlinedVector<int64, 4>* size,
+                                   Tensor** result,
+                                   bool* done) {
+  bool is_identity = true;
+  bool slice_dim0 = true;
+  *done = false;
+
+  SharedValidation(context, output_shape, &is_identity, &slice_dim0, begin,
+                   size);
+  if (!context->status().ok()) return;
+  const Tensor& input = context->input(0);
+  if (is_identity) {
+    VLOG(1) << "Slice identity";
+    context->set_output(0, input);
+    *done = true;
+    return;
+  }
+
+  if (slice_dim0 && IsDim0SliceAligned<T>(input.shape(), (*begin)[0],
+                                          (*size)[0])) {
+    VLOG(1) << "Slice dim 0: " << input.shape().DebugString();
+    CHECK_GE(input.dims(), 1);  // Otherwise, is_identity should be true.
+    context->set_output(0, input.Slice((*begin)[0], (*begin)[0] + (*size)[0]));
+    *done = true;
+    return;
+  }
+
+  OP_REQUIRES_OK(context, context->allocate_output(0, *output_shape, result));
+}
+
+
 template <typename Device, typename T>
 class SliceOp : public OpKernel {
  public:
@@ -125,29 +162,89 @@ class SliceOp : public OpKernel {
 
   void Compute(OpKernelContext* context) override {
     TensorShape output_shape;
-    bool is_identity = true;
-    bool slice_dim0 = true;
     gtl::InlinedVector<int64, 4> begin;
     gtl::InlinedVector<int64, 4> size;
-    SharedValidation(context, &output_shape, &is_identity, &slice_dim0, &begin,
-                     &size);
-    if (!context->status().ok()) return;
+    Tensor* result = nullptr;
+    bool done = false;
+    SharedSliceCommonCases<T>(context, &output_shape, &begin, &size, &result,
+                              &done);
+    if (!context->status().ok() || done == true) return;
+
     const Tensor& input = context->input(0);
-    if (is_identity) {
-      VLOG(1) << "Slice identity";
-      context->set_output(0, input);
-      return;
+    const int input_dims = input.dims();
+
+    if (output_shape.num_elements() > 0) {
+      if (std::is_same<Device, CPUDevice>::value && input_dims == 2 &&
+          DataTypeCanUseMemcpy(DataTypeToEnum<T>::v())) {
+        auto input = context->input(0).tensor<T, 2>();
+        auto output = result->tensor<T, 2>();
+        // TODO(agarwal): Consider multi-threading this loop for cases where
+        // size[0] is very large.
+        for (int i = 0; i < size[0]; ++i) {
+          const int64 row = begin[0] + i;
+          if (i + 1 < size[0]) {
+            port::prefetch<port::PREFETCH_HINT_T0>(&output(i + 1, 0));
+            port::prefetch<port::PREFETCH_HINT_T0>(&input(row + 1, begin[1]));
+          }
+          memcpy(&output(i, 0), &input(row, begin[1]), size[1] * sizeof(T));
+        }
+        return;
+      }
+#define HANDLE_DIM(NDIM)                            \
+  if (input_dims == NDIM) {                         \
+    HandleCase<NDIM>(context, begin, size, result); \
+    return;                                         \
+  }
+
+      HANDLE_DIM(1);
+      HANDLE_DIM(2);
+      HANDLE_DIM(3);
+      HANDLE_DIM(4);
+      HANDLE_DIM(5);
+      HANDLE_DIM(6);
+      HANDLE_DIM(7);
+
+#undef HANDLE_DIM
+
+      OP_REQUIRES(context, false, errors::Unimplemented(
+                      "SliceOp : Unhandled input dimensions"));
     }
+  }
 
-    if (slice_dim0 && IsDim0SliceAligned<T>(input.shape(), begin[0], size[0])) {
-      VLOG(1) << "Slice dim 0: " << input.shape().DebugString();
-      CHECK_GE(input.dims(), 1);  // Otherwise, is_identity should be true.
-      context->set_output(0, input.Slice(begin[0], begin[0] + size[0]));
-      return;
+ private:
+  template <int NDIM>
+  void HandleCase(OpKernelContext* context, const gtl::ArraySlice<int64>& begin,
+                  const gtl::ArraySlice<int64>& size, Tensor* result) {
+    Eigen::DSizes<Eigen::DenseIndex, NDIM> indices;
+    Eigen::DSizes<Eigen::DenseIndex, NDIM> sizes;
+    for (int i = 0; i < NDIM; ++i) {
+      indices[i] = begin[i];
+      sizes[i] = size[i];
     }
+    functor::Slice<Device, T, NDIM>()(
+        context->eigen_device<Device>(), result->tensor<T, NDIM>(),
+        context->input(0).tensor<T, NDIM>(), indices, sizes);
+  }
+};
+
+#ifdef INTEL_MKL
+template <typename Device, typename T>
+class MklSliceOp : public OpKernel {
+ public:
+  explicit MklSliceOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    TensorShape output_shape;
+    gtl::InlinedVector<int64, 4> begin;
+    gtl::InlinedVector<int64, 4> size;
     Tensor* result = nullptr;
-    OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &result));
+    bool done = false;
+    SharedSliceCommonCases<T>(context, &output_shape, &begin, &size, &result,
+                              &done);
+    if (!context->status().ok() || done == true) return;
+
+    const Tensor& input = context->input(0);
     const int input_dims = input.dims();
 
     if (output_shape.num_elements() > 0) {
@@ -189,9 +286,123 @@ class SliceOp : public OpKernel {
   }
 
  private:
+  // Helper function for DoesSliceShapeDifferInOnly1D. Checks if the following
+  // criteria matches for slice_dim: if indices for slice are 0 in all dims
+  // except slice_dim and if sizes of all the dimensions of the slice are same
+  // as the sizes of all the dimensions of the input except slice_dim, then
+  // returns True. Otherwise, returns False.
+  bool DoesSliceShapeDifferInOnly1DHelper(const TensorShape& input_shape,
+                                          const gtl::ArraySlice<int64>& begin,
+                                          const gtl::ArraySlice<int64>& size,
+                                          int slice_dim) {
+    for (int dim = 0; dim < 4; dim++) {
+      if (dim != slice_dim &&
+          (begin[dim] != 0 || size[dim] != input_shape.dim_size(dim))) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  // Is 'input' tensor being sliced over a single dimension out of 4?
+  //
+  // This check is applicable in the context of Slice of a 4-D tensor in
+  // NHWC or NCHW format over channel dimension.
+  //
+  // If indices for slice are 0 in all dims except one dimension and if sizes of
+  // all dimensions of slice are same as sizes of all dimensions of inputs
+  // except that dimension, then we are slicing over a single dimension.
+  //
+  // Returns True if Slicing over a single dimension, and sets slice_dim
+  // to the number of the dimension that satisfies criteria.
+  bool DoesSliceShapeDifferInOnly1D(const TensorShape& input_shape,
+                                    const gtl::ArraySlice<int64>& begin,
+                                    const gtl::ArraySlice<int64>& size,
+                                    int* slice_dim) {
+    for (int dim = 0; dim < 4; dim++) {
+      if (DoesSliceShapeDifferInOnly1DHelper(input_shape, begin, size, dim)) {
+        *slice_dim = dim;
+        return true;
+      }
+    }
+    return false;
+  }
+
   template <int NDIM>
-  void HandleCase(OpKernelContext* context, const gtl::ArraySlice<int64>& begin,
+  void HandleCase(OpKernelContext* context,
+                  const gtl::ArraySlice<int64>& begin,
                   const gtl::ArraySlice<int64>& size, Tensor* result) {
+    int slice_dim = -1;
+    TensorShape in_shape = context->input(0).shape();
+    // Special case for handling 4-D tensor slice when shape of the slice
+    // differs from the input tensor in only 1 out of 4 dimensions.
+    // This case arises in the context of Slice of 4-D tensor in NHWC or NCHW
+    // format over channel dimension.
+    if (NDIM == 4 &&
+        DoesSliceShapeDifferInOnly1D(in_shape, begin, size, &slice_dim)) {
+      size_t in_strides[4] = { (size_t) in_shape.dim_size(1) *
+                                        in_shape.dim_size(2) *
+                                        in_shape.dim_size(3),
+                               (size_t) in_shape.dim_size(2) *
+                                        in_shape.dim_size(3),
+                               (size_t) in_shape.dim_size(3),
+                               (size_t) 1
+                             };
+
+      size_t out_strides[4] = { (size_t) size[1] * size[2] * size[3],
+                                (size_t) size[2] * size[3],
+                                (size_t) size[3],
+                                (size_t) 1 };
+
+      T *in_buf = const_cast<T*>(const_cast<const T*>(
+                                 context->input(0).flat<T>().data()));
+      T *op_buf = result->flat<T>().data();
+
+      if (slice_dim == 1) {
+        /* data format = NCHW */
+
+        #pragma omp parallel for
+        for (size_t d0 = begin[0]; d0 < begin[0] + size[0]; d0++) {
+          T *ip = in_buf + (d0 * in_strides[0]);
+          T *op = op_buf + ((d0 - begin[0]) * out_strides[0]);
+          #pragma omp parallel for
+          for (size_t d1 = begin[1]; d1 < begin[1] + size[1]; d1++) {
+            T *ip1 = ip + (d1 * in_strides[1]);
+            T *op1 = op + ((d1 - begin[1]) * out_strides[1]);
+            // For NCHW, H and W will be contiguous. So we can copy
+            // both with one memcpy.
+            memcpy(static_cast<void*>(op1), static_cast<void*>(ip1),
+                   sizeof(T) * in_strides[1]);
+          }
+        }
+        return;
+      } else if (slice_dim == 3) {
+        /* data_format = NHWC */
+
+        #pragma omp parallel for
+        for (size_t d0 = begin[0]; d0 < begin[0] + size[0]; d0++) {
+          T *ip = in_buf + (d0 * in_strides[0]);
+          T *op = op_buf + ((d0 - begin[0]) * out_strides[0]);
+          #pragma omp parallel for
+          for (size_t d1 = begin[1]; d1 < begin[1] + size[1]; d1++) {
+            T *ip1 = ip + (d1 * in_strides[1]);
+            T *op1 = op + ((d1 - begin[1]) * out_strides[1]);
+            #pragma omp parallel for
+            for (size_t d2 = begin[2]; d2 < begin[2] + size[2]; d2++) {
+              T *ip2 = ip1 + (d2 * in_strides[2]);
+              T *ip3 = ip2 + begin[3];
+              T *op2 = op1 + ((d2 - begin[2]) * out_strides[2]);
+              T *op3 = op2;
+              memcpy(static_cast<void*>(op3), static_cast<void*>(ip3),
+                     sizeof(T) * size[3]);
+            }
+          }
+        }
+        return;
+      }
+      // slice_dim is not 1 or 3, then we fallback to Eigen implementation.
+    }
+
     Eigen::DSizes<Eigen::DenseIndex, NDIM> indices;
     Eigen::DSizes<Eigen::DenseIndex, NDIM> sizes;
     for (int i = 0; i < NDIM; ++i) {
@@ -204,6 +415,7 @@ class SliceOp : public OpKernel {
         context->input(0).tensor<T, NDIM>(), indices, sizes);
   }
 };
+#endif
 
 // Forward declarations of the functor specializations for declared in the
 // sharded source files.
@@ -233,6 +445,7 @@ DECLARE_FOR_N(bfloat16);
 #undef DECLARE_CPU_SPEC
 }  // namespace functor
 
+#ifndef INTEL_MKL
 #define REGISTER_SLICE(type)                             \
   REGISTER_KERNEL_BUILDER(Name("Slice")                  \
                               .Device(DEVICE_CPU)        \
@@ -244,8 +457,21 @@ DECLARE_FOR_N(bfloat16);
 TF_CALL_POD_STRING_TYPES(REGISTER_SLICE);
 TF_CALL_QUANTIZED_TYPES(REGISTER_SLICE);
 REGISTER_SLICE(bfloat16);
+#undef REGISTER_SLICE
+#else
+#define REGISTER_SLICE(type)                             \
+  REGISTER_KERNEL_BUILDER(Name("Slice")                  \
+                              .Device(DEVICE_CPU)        \
+                              .TypeConstraint<type>("T") \
+                              .HostMemory("begin")       \
+                              .HostMemory("size"),       \
+                          MklSliceOp<CPUDevice, type>)
+TF_CALL_POD_STRING_TYPES(REGISTER_SLICE);
+TF_CALL_QUANTIZED_TYPES(REGISTER_SLICE);
+REGISTER_SLICE(bfloat16);
 #undef REGISTER_SLICE
+#endif  // INTEL_MKL
 
 #if GOOGLE_CUDA
 // Forward declarations of the functor specializations for GPU.
--
cgit v1.2.3
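
Editor's note: the core idea behind the MklSliceOp fast path above is that when a
4-D tensor is sliced along exactly one dimension, the slice decomposes into plain
memcpy calls over row-major strides. Below is a minimal, self-contained sketch of
that idea for the NCHW channel-slice case (slice_dim == 1). This is illustrative
code, not TensorFlow code: the shapes, names, and the single merged memcpy per
batch (the patch instead loops per channel under OpenMP) are all assumptions made
for brevity.

```cpp
#include <cstdio>
#include <cstring>
#include <vector>

int main() {
  // Hypothetical 4-D NCHW input tensor, filled with its own flat indices.
  const int N = 2, C = 4, H = 3, W = 3;
  std::vector<float> in(N * C * H * W);
  for (size_t i = 0; i < in.size(); ++i) in[i] = static_cast<float>(i);

  // Slice: begin = {0, 1, 0, 0}, size = {N, 2, H, W}.
  // Only dim 1 (channels) differs from the input, so the fast path applies.
  const int begin_c = 1, size_c = 2;
  std::vector<float> out(N * size_c * H * W);

  // Row-major strides, mirroring in_strides/out_strides in the patch.
  // For NCHW, each channel's H*W plane is one contiguous block.
  const size_t in_strides[4] = {(size_t)C * H * W, (size_t)H * W, (size_t)W, 1};
  const size_t out_strides[4] = {(size_t)size_c * H * W, (size_t)H * W,
                                 (size_t)W, 1};

  for (int n = 0; n < N; ++n) {
    // Start of the first requested channel plane within batch n.
    const float* ip = in.data() + n * in_strides[0] + begin_c * in_strides[1];
    float* op = out.data() + n * out_strides[0];
    // The size_c selected channel planes are themselves contiguous, so one
    // memcpy per batch suffices here (the patch issues one per channel).
    std::memcpy(op, ip, sizeof(float) * size_c * in_strides[1]);
  }

  // First output element is the first element of channel 1, i.e. index H*W.
  std::printf("out[0] = %g (expect %g)\n", out[0], (float)(H * W));
  return 0;
}
```

If the slice touched more than one dimension, the planes would no longer be
contiguous and the code would fall back to a strided copy, which is exactly why
the patch gates this path on DoesSliceShapeDifferInOnly1D and otherwise defers
to the Eigen implementation.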