diff options
author | 2017-08-04 15:30:53 -0700 | |
---|---|---|
committer | 2017-08-04 15:34:51 -0700 | |
commit | 0815de21239955e346b562e899640649c8d2b9cb (patch) | |
tree | 064a9da2aa7d84db1095944fb9394a838d47c2a2 /tensorflow/core | |
parent | 0ba2a1f6db399cbb5be3e71acdad1123af29348a (diff) |
Merge changes from github.
END_PUBLIC
---
Commit cf375f067 authored by A. Unique TensorFlower<gardener@tensorflow.org>
Committed by Benoit Steiner<benoitsteiner@users.noreply.github.com>:
Adds cudnn_rnn_ops_op_lib and cudnn_rnn_kernels to contrib_ops_op_lib and
contrib_kernels respectively.
PiperOrigin-RevId: 164170971
---
Commit 95ec58e27 authored by Asim Shankar<ashankar@google.com>
Committed by Benoit Steiner<benoitsteiner@users.noreply.github.com>:
C API: Make TF_TensorFromTensor return an error instead of just logging it.
PiperOrigin-RevId: 164167582
---
Commit 15175c870 authored by Jonathan Hseu<jhseu@google.com>
Committed by Benoit Steiner<benoitsteiner@users.noreply.github.com>:
Build fixes.
- Allow var_list as a positional argument in CrossShardOptimizer.
- Set the number of shards to 1 when not running on TPU, to allow evaluate() and predict() on CPU/GPU to work.
PiperOrigin-RevId: 164161640
---
Commit bd3e894f7 authored by Yao Zhang<yaozhang@google.com>
Committed by Benoit Steiner<benoitsteiner@users.noreply.github.com>:
Support freeze mode for fused batch norm.
PiperOrigin-RevId: 164149032
---
Commit e6b6b84c0 authored by Asim Shankar<ashankar@google.com>
Committed by Benoit Steiner<benoitsteiner@users.noreply.github.com>:
C API: TF_Tensors will always be in host memory.
This change undoes some experimentation in
commit 22651083406ca01ac9d481e3367a3510d25f88cd
and restores TF_Tensor behavior to what it was prior to that change.
PiperOrigin-RevId: 164146670
---
Commit 8bf3f88f7 authored by Peter Hawkins<phawkins@google.com>
Committed by Benoit Steiner<benoitsteiner@users.noreply.github.com>:
[TF:XLA] Add _XLASend and _XLARecv TF ops that wrap the XLA Send/Recv HLO ops.
PiperOrigin-RevId: 164124764
---
Commit 626d3200f authored by Peter Hawkins<phawkins@google.com>
Committed by Benoit Steiner<benoitsteiner@users.noreply.github.com>:
[XLA] Add test blacklist mechanism for XLA C++ unit tests.
PiperOrigin-RevId: 164124423
---
Commit 359cc5f5e authored by A. Unique TensorFlower<gardener@tensorflow.org>
Committed by Benoit Steiner<benoitsteiner@users.noreply.github.com>:
Document dict ordering in nest and make it consistent with sonnet.
PiperOrigin-RevId: 164114335
---
Commit 05813b531 authored by A. Unique TensorFlower<gardener@tensorflow.org>
Committed by Benoit Steiner<benoitsteiner@users.noreply.github.com>:
Go: Update generated wrapper functions for TensorFlow ops.
PiperOrigin-RevId: 164089206
---
Commit c451f465d authored by Anna R<annarev@google.com>
Committed by Benoit Steiner<benoitsteiner@users.noreply.github.com>:
BEGIN_PUBLIC
Automated g4 rollback of changelist 164078808
PiperOrigin-RevId: 164318935
Diffstat (limited to 'tensorflow/core')
-rw-r--r-- | tensorflow/core/kernels/BUILD | 2 | ||||
-rw-r--r-- | tensorflow/core/kernels/pooling_ops_3d.cc | 28 | ||||
-rw-r--r-- | tensorflow/core/kernels/pooling_ops_3d.h | 14 | ||||
-rw-r--r-- | tensorflow/core/kernels/pooling_ops_3d_sycl.h | 759 | ||||
-rw-r--r-- | tensorflow/core/ops/image_ops.cc | 6 | ||||
-rw-r--r-- | tensorflow/core/platform/default/gpu_tracer.cc | 2 | ||||
-rw-r--r-- | tensorflow/core/profiler/internal/tfprof_node.cc | 13 |
7 files changed, 801 insertions, 23 deletions
diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index 6887d8cfb6..fdf45beed3 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -3035,7 +3035,7 @@ tf_kernel_library( "maxpooling_op.h", "pooling_ops_3d.h", "pooling_ops_common.h", - ], + ] + if_sycl(["pooling_ops_3d_sycl.h"]), gpu_srcs = [ "avgpooling_op.h", "avgpooling_op_gpu.cu.cc", diff --git a/tensorflow/core/kernels/pooling_ops_3d.cc b/tensorflow/core/kernels/pooling_ops_3d.cc index 538dca24ae..a406317213 100644 --- a/tensorflow/core/kernels/pooling_ops_3d.cc +++ b/tensorflow/core/kernels/pooling_ops_3d.cc @@ -37,10 +37,18 @@ limitations under the License. #include "tensorflow/core/kernels/cudnn_pooling_gpu.h" #include "tensorflow/core/kernels/pooling_ops_3d_gpu.h" #endif + +#ifdef TENSORFLOW_USE_SYCL +#include "tensorflow/core/kernels/pooling_ops_3d_sycl.h" +#endif // TENSORFLOW_USE_SYCL + namespace tensorflow { typedef Eigen::ThreadPoolDevice CPUDevice; typedef Eigen::GpuDevice GPUDevice; +#ifdef TENSORFLOW_USE_SYCL +typedef Eigen::SyclDevice SYCLDevice; +#endif // TENSORFLOW_USE_SYCL Pool3dParameters::Pool3dParameters(OpKernelContext* context, const std::vector<int32>& ksize, @@ -89,11 +97,6 @@ TensorShape Pool3dParameters::forward_output_shape() { {{out_plane, out_height, out_width}}, depth); } -enum PoolingType { MAX, AVG }; - -template <typename Device, typename T, PoolingType Type> -struct LaunchPoolingOp; - template <typename T> struct LaunchPoolingOp<CPUDevice, T, AVG> { static void launch(OpKernelContext* context, const Tensor& tensor_in, @@ -200,9 +203,6 @@ class Pooling3DOp : public UnaryOp<T> { TensorFormat data_format_; }; -template <typename Device, typename T> -struct LaunchMaxPooling3dGradOp; - template <typename T> struct LaunchMaxPooling3dGradOp<CPUDevice, T> { static void launch(OpKernelContext* context, const Tensor& tensor_in, @@ -377,9 +377,6 @@ class MaxPooling3dGradOp : public OpKernel { TensorFormat data_format_; }; -template 
<typename Device, typename T> -struct LaunchAvgPooling3dGradOp; - template <typename T> struct LaunchAvgPooling3dGradOp<CPUDevice, T> { static void launch(OpKernelContext* context, @@ -541,9 +538,6 @@ class AvgPooling3dGradOp : public OpKernel { TensorFormat data_format_; }; -template <typename Device, typename T> -struct LaunchMaxPooling3dGradGradOp; - template <typename T> struct LaunchMaxPooling3dGradGradOp<CPUDevice, T> { static void launch(OpKernelContext* context, const Pool3dParameters& params, @@ -837,6 +831,12 @@ TF_CALL_float(REGISTER_GPU_KERNELS) TF_CALL_half(REGISTER_GPU_KERNELS) #endif // GOOGLE_CUDA +#ifdef TENSORFLOW_USE_SYCL +#define REGISTER_SYCL_KERNELS(T) REGISTER_KERNELS(SYCL, T) +TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SYCL_KERNELS) +#undef REGISTER_SYCL_KERNELS +#endif // TENSORFLOW_USE_SYCL + #undef REGISTER_KERNELS } // namespace tensorflow diff --git a/tensorflow/core/kernels/pooling_ops_3d.h b/tensorflow/core/kernels/pooling_ops_3d.h index 7954e2cf83..d1be3ba407 100644 --- a/tensorflow/core/kernels/pooling_ops_3d.h +++ b/tensorflow/core/kernels/pooling_ops_3d.h @@ -22,6 +22,20 @@ limitations under the License. namespace tensorflow { +enum PoolingType { MAX, AVG }; + +template <typename Device, typename T, PoolingType Type> +struct LaunchPoolingOp; + +template <typename Device, typename T> +struct LaunchAvgPooling3dGradOp; + +template <typename Device, typename T> +struct LaunchMaxPooling3dGradOp; + +template <typename Device, typename T> +struct LaunchMaxPooling3dGradGradOp; + // A helper class to manage sizes and shapes for 3d pooling operations. struct Pool3dParameters { // Updates context->status if there is an invalid input. diff --git a/tensorflow/core/kernels/pooling_ops_3d_sycl.h b/tensorflow/core/kernels/pooling_ops_3d_sycl.h new file mode 100644 index 0000000000..d8cbc589a1 --- /dev/null +++ b/tensorflow/core/kernels/pooling_ops_3d_sycl.h @@ -0,0 +1,759 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#if !TENSORFLOW_USE_SYCL +#error This file must only be included when building with SYCL support +#endif + +#ifndef TENSORFLOW_CORE_KERNELS_POOLING_OP_3D_SYCL_H_ +#define TENSORFLOW_CORE_KERNELS_POOLING_OP_3D_SYCL_H_ + +#include "tensorflow/core/kernels/pooling_ops_3d.h" + +namespace tensorflow { + +typedef Eigen::SyclDevice SYCLDevice; + +// Helper struct to contain the various pool parameters used in the SYCL +// pooling kernels. Similar to the Pool3dParameters, but with a number of +// convenient constructors. 
+struct SYCL3DPoolParams { + SYCL3DPoolParams(const int depth, const int batch, const int in_planes, + const int in_rows, const int in_cols, const int out_planes, + const int out_rows, const int out_cols, + const std::array<int64, 3>& window, + const std::array<int64, 3>& stride, + const std::array<int64, 3>& padding) + : depth_(depth), + batch_(batch), + in_planes_(in_planes), + in_rows_(in_rows), + in_cols_(in_cols), + window_planes_(window[2]), + window_rows_(window[1]), + window_cols_(window[0]), + stride_planes_(stride[2]), + stride_rows_(stride[1]), + stride_cols_(stride[0]), + out_planes_(out_planes), + out_rows_(out_rows), + out_cols_(out_cols), + pad_planes_(padding[2]), + pad_rows_(padding[1]), + pad_cols_(padding[0]) {} + + SYCL3DPoolParams(const int depth, const int batch, const int in_planes, + const int in_rows, const int in_cols, + const std::array<int64, 3>& out_shape, + const std::array<int64, 3>& window, + const std::array<int64, 3>& stride, + const std::array<int64, 3>& padding) + : SYCL3DPoolParams(depth, batch, in_planes, in_rows, in_cols, + out_shape[2], out_shape[1], out_shape[0], window, + stride, padding) {} + + SYCL3DPoolParams(const Pool3dParameters& params) + : depth_(params.depth), + batch_(params.tensor_in_batch), + in_planes_(params.tensor_in_planes), + in_rows_(params.tensor_in_rows), + in_cols_(params.tensor_in_cols), + window_planes_(params.window_planes), + window_rows_(params.window_rows), + window_cols_(params.window_cols), + stride_planes_(params.plane_stride), + stride_rows_(params.row_stride), + stride_cols_(params.col_stride), + out_planes_(params.out_plane), + out_rows_(params.out_height), + out_cols_(params.out_width), + pad_planes_(params.pad_planes), + pad_rows_(params.pad_rows), + pad_cols_(params.pad_cols) {} + + const int depth_; + const int batch_; + const int in_planes_; + const int in_rows_; + const int in_cols_; + + const int window_planes_; + const int window_rows_; + const int window_cols_; + + const int 
stride_planes_; + const int stride_rows_; + const int stride_cols_; + + const int out_planes_; + const int out_rows_; + const int out_cols_; + + const int pad_planes_; + const int pad_rows_; + const int pad_cols_; +}; +// MaxPool3d SYCL kernel. Expects the number of threads to be equal to the +// number of elements in the output tensor. +// +// For each output element, find the corresponding input window and run over +// all values in the window to find the maximum value. This value is then +// copied into that output element. +template <typename T> +class MaxPool3DSYCL { + using write_accessor = + cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::write, + cl::sycl::access::target::global_buffer>; + using read_accessor = + cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::read, + cl::sycl::access::target::global_buffer>; + + public: + MaxPool3DSYCL(const int depth, const int batch, const int in_planes, + const int in_rows, const int in_cols, const int out_planes, + const int out_rows, const int out_cols, + const std::array<int64, 3>& window, + const std::array<int64, 3>& stride, + const std::array<int64, 3>& padding, + const read_accessor input_accessor, + write_accessor output_accessor) + : p_(depth, batch, in_planes, in_rows, in_cols, out_planes, out_rows, + out_cols, window, stride, padding), + input_accessor_(input_accessor), + output_accessor_(output_accessor) {} + void operator()(cl::sycl::item<1> item) { + T* input_data = ConvertToActualTypeSycl(T, input_accessor_); + T* output_data = ConvertToActualTypeSycl(T, output_accessor_); + + int index = item.get_linear_id(); + int n = index; + int d = n % p_.depth_; + n /= p_.depth_; + int cstart = (n % p_.out_cols_) * p_.stride_cols_ - p_.pad_cols_; + int cend = std::min(cstart + p_.window_cols_, p_.in_cols_); + cstart = std::max(cstart, 0); + n /= p_.out_cols_; + int rstart = (n % p_.out_rows_) * p_.stride_rows_ - p_.pad_rows_; + int rend = std::min(rstart + p_.window_rows_, p_.in_rows_); + rstart = 
std::max(rstart, 0); + n /= p_.out_rows_; + int pstart = (n % p_.out_planes_) * p_.stride_planes_ - p_.pad_planes_; + int pend = std::min(pstart + p_.window_planes_, p_.in_planes_); + pstart = std::max(pstart, 0); + n /= p_.out_planes_; + T maxval = Eigen::NumTraits<T>::lowest(); + const T* input_data_n = + input_data + n * p_.in_planes_ * p_.in_cols_ * p_.in_rows_ * p_.depth_; + for (int p = pstart; p < pend; ++p) { + for (int r = rstart; r < rend; ++r) { + for (int c = cstart; c < cend; ++c) { + int idx = ((p * p_.in_rows_ + r) * p_.in_cols_ + c) * p_.depth_ + d; + if (input_data_n[idx] > maxval) { + maxval = input_data_n[idx]; + } + } + } + } + output_data[index] = maxval; + } + + private: + const SYCL3DPoolParams p_; + const read_accessor input_accessor_; + write_accessor output_accessor_; +}; +template <typename T> +struct LaunchPoolingOp<SYCLDevice, T, MAX> { + static void launch(OpKernelContext* context, const Tensor& tensor_in, + const std::array<int64, 3>& window, + const std::array<int64, 3>& stride, + const std::array<int64, 3>& padding, + TensorFormat data_format, Padding padding_type, + Tensor* output) { + const SYCLDevice& device = context->eigen_device<SYCLDevice>(); + const int out_planes = GetTensorDim(*output, data_format, '0'); + const int out_rows = GetTensorDim(*output, data_format, '1'); + const int out_cols = GetTensorDim(*output, data_format, '2'); + const int batch = GetTensorDim(tensor_in, data_format, 'N'); + const int in_planes = GetTensorDim(tensor_in, data_format, '0'); + const int in_rows = GetTensorDim(tensor_in, data_format, '1'); + const int in_cols = GetTensorDim(tensor_in, data_format, '2'); + const int depth = GetTensorDim(tensor_in, data_format, 'C'); + + const int num_threads = output->NumElements(); + + auto input_buffer = + device.get_sycl_buffer(tensor_in.template flat<T>().data()); + auto output_buffer = + device.get_sycl_buffer(output->template flat<T>().data()); + + device.sycl_queue().submit([&](cl::sycl::handler& cgh) 
{ + auto input_access = + input_buffer.template get_access<cl::sycl::access::mode::read>(cgh); + auto output_access = + output_buffer.template get_access<cl::sycl::access::mode::write>(cgh); + MaxPool3DSYCL<T> max_pool(depth, batch, in_planes, in_rows, in_cols, + out_planes, out_rows, out_cols, window, stride, + padding, input_access, output_access); + + cgh.parallel_for(cl::sycl::range<1>(num_threads), max_pool); + }); + } +}; +// MaxPool3DGrad SYCL kernel. Expects the number of threads to be equal to the +// number of elements in the output backprop tenor (i.e. the number of elements +// in the input data tensor). +// +// For each output backprop element we compute the possible window of values in +// the input backprop tensor which might contribute to this element. Then for +// each error in this window, compute the corresponding input window which was +// pooled into that element in the output. Walk through this input window to +// determine whether the input value is the first maximum value, and so the +// error should be propagated back to the corresponding backprop element. 
+template <typename T> +class MaxPool3DGradSYCL { + using write_accessor = + cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::write, + cl::sycl::access::target::global_buffer>; + using read_accessor = + cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::read, + cl::sycl::access::target::global_buffer>; + + public: + MaxPool3DGradSYCL(const int depth, const int batch, const int in_planes, + const int in_rows, const int in_cols, + const std::array<int64, 3>& output_shape, + const std::array<int64, 3>& window, + const std::array<int64, 3>& stride, + const std::array<int64, 3>& padding, + const read_accessor input_data_accessor, + const read_accessor output_data_accessor, + const read_accessor input_backprop_accessor, + write_accessor output_backprop_accessor) + : p_(depth, batch, in_planes, in_rows, in_cols, output_shape, window, + stride, padding), + input_data_accessor_(input_data_accessor), + output_data_accessor_(output_data_accessor), + input_backprop_accessor_(input_backprop_accessor), + output_backprop_accessor_(output_backprop_accessor) {} + void operator()(cl::sycl::item<1> item) { + T* input_data = ConvertToActualTypeSycl(T, input_data_accessor_); + T* output_data = ConvertToActualTypeSycl(T, output_data_accessor_); + T* input_backprop = ConvertToActualTypeSycl(T, input_backprop_accessor_); + T* output_backprop = ConvertToActualTypeSycl(T, output_backprop_accessor_); + + const int index = item.get_linear_id(); + T output_value = 0; + int n = index; + const int d = n % p_.depth_; + n /= p_.depth_; + const int c = (n % p_.in_cols_) + p_.pad_cols_; + const int poolcstart = + (c < p_.window_cols_) ? 0 : (c - p_.window_cols_) / p_.stride_cols_ + 1; + const int poolcend = std::min(c / p_.stride_cols_ + 1, p_.out_cols_); + n /= p_.in_cols_; + const int r = (n % p_.in_rows_) + p_.pad_rows_; + const int poolrstart = + (r < p_.window_rows_) ? 
0 : (r - p_.window_rows_) / p_.stride_rows_ + 1; + const int poolrend = std::min(r / p_.stride_rows_ + 1, p_.out_rows_); + n /= p_.in_rows_; + const int p = (n % p_.in_planes_) + p_.pad_planes_; + const int poolpstart = + (p < p_.window_planes_) + ? 0 + : (p - p_.window_planes_) / p_.stride_planes_ + 1; + const int poolpend = std::min(p / p_.stride_planes_ + 1, p_.out_planes_); + n /= p_.in_planes_; + const int index_no_n = + index - n * p_.in_planes_ * p_.in_cols_ * p_.in_rows_ * p_.depth_; + + const T* input_data_n = + input_data + n * p_.in_planes_ * p_.in_cols_ * p_.in_rows_ * p_.depth_; + const T* output_data_n = + output_data + + n * p_.out_planes_ * p_.out_cols_ * p_.out_rows_ * p_.depth_; + const T* input_backprop_n = + input_backprop + + n * p_.out_planes_ * p_.out_cols_ * p_.out_rows_ * p_.depth_; + for (int poolp = poolpstart; poolp < poolpend; ++poolp) { + int pstart = poolp * p_.stride_planes_ - p_.pad_planes_; + const int pend = std::min(pstart + p_.window_planes_, p_.in_planes_); + pstart = std::max(pstart, 0); + + for (int poolr = poolrstart; poolr < poolrend; ++poolr) { + int rstart = poolr * p_.stride_rows_ - p_.pad_rows_; + const int rend = std::min(rstart + p_.window_rows_, p_.in_rows_); + rstart = std::max(rstart, 0); + + for (int poolc = poolcstart; poolc < poolcend; ++poolc) { + int cstart = poolc * p_.stride_cols_ - p_.pad_cols_; + const int cend = std::min(cstart + p_.window_cols_, p_.in_cols_); + cstart = std::max(cstart, 0); + + const int output_data_idx = + ((poolp * p_.out_rows_ + poolr) * p_.out_cols_ + poolc) * + p_.depth_ + + d; + bool should_continue = true; + bool is_max = (input_data[index] == output_data_n[output_data_idx]); + for (int win_p = pstart; win_p < pend && should_continue; ++win_p) { + for (int win_r = rstart; win_r < rend && should_continue; ++win_r) { + for (int win_c = cstart; win_c < cend && should_continue; + ++win_c) { + const int input_data_idx = + ((win_p * p_.in_rows_ + win_r) * p_.in_cols_ + win_c) * + 
p_.depth_ + + d; + if (input_data_idx == index_no_n) { + should_continue = false; + } else if (input_data_n[input_data_idx] == + output_data_n[output_data_idx]) { + should_continue = false; + is_max = false; + } + } + } + } + if (is_max) { + output_value += input_backprop_n[output_data_idx]; + } + } + } + } + output_backprop[index] = output_value; + } + + private: + const SYCL3DPoolParams p_; + + const read_accessor input_data_accessor_; + const read_accessor output_data_accessor_; + const read_accessor input_backprop_accessor_; + write_accessor output_backprop_accessor_; +}; +template <typename T> +struct LaunchMaxPooling3dGradOp<SYCLDevice, T> { + static void launch(OpKernelContext* context, const Tensor& tensor_in, + const Tensor& tensor_out, const Tensor& out_backprop, + const std::array<int64, 3>& window, + const std::array<int64, 3>& stride, + const std::array<int64, 3>& out, + const std::array<int64, 3>& padding, + TensorFormat data_format, Tensor* output) { + const SYCLDevice& device = context->eigen_device<SYCLDevice>(); + const int batch = GetTensorDim(tensor_in, data_format, 'N'); + const int in_planes = GetTensorDim(tensor_in, data_format, '0'); + const int in_rows = GetTensorDim(tensor_in, data_format, '1'); + const int in_cols = GetTensorDim(tensor_in, data_format, '2'); + const int depth = GetTensorDim(tensor_in, data_format, 'C'); + + const int output_size = output->NumElements(); + + auto input_data_buffer = + device.get_sycl_buffer(tensor_in.template flat<T>().data()); + auto output_data_buffer = + device.get_sycl_buffer(tensor_out.template flat<T>().data()); + auto input_backprop_buffer = + device.get_sycl_buffer(out_backprop.template flat<T>().data()); + auto output_backprop_buffer = + device.get_sycl_buffer(output->template flat<T>().data()); + + device.sycl_queue().submit([&](cl::sycl::handler& cgh) { + auto input_data_access = + input_data_buffer.template get_access<cl::sycl::access::mode::read>( + cgh); + auto output_data_access = + 
output_data_buffer.template get_access<cl::sycl::access::mode::read>( + cgh); + auto input_backprop_access = + input_backprop_buffer + .template get_access<cl::sycl::access::mode::read>(cgh); + auto output_backprop_access = + output_backprop_buffer + .template get_access<cl::sycl::access::mode::write>(cgh); + MaxPool3DGradSYCL<T> max_pool( + depth, batch, in_planes, in_rows, in_cols, out, window, stride, + padding, input_data_access, output_data_access, input_backprop_access, + output_backprop_access); + + cgh.parallel_for(cl::sycl::range<1>(output_size), max_pool); + }); + } +}; +// MaxPool3DGradGrad SYCL kernel. Expects the number of threads to be equal to +// the number of elements in the output backprop tensor, i.e. the number of +// elements in the output tensor. +// +// For each element in the output backprop tensor, find the corresponding input +// window, and compare the input and output data to find the index of the +// maximum value in the input tensor. This is then the index of the gradient to +// pass through to the output backprop tensor. 
+template <typename T> +class MaxPool3DGradGradSYCL { + using write_accessor = + cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::write, + cl::sycl::access::target::global_buffer>; + using read_accessor = + cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::read, + cl::sycl::access::target::global_buffer>; + + public: + MaxPool3DGradGradSYCL(const Pool3dParameters& params, + const read_accessor input_data_accessor, + const read_accessor output_data_accessor, + const read_accessor input_backprop_accessor, + write_accessor output_backprop_accessor) + : p_(params), + input_data_accessor_(input_data_accessor), + output_data_accessor_(output_data_accessor), + input_backprop_accessor_(input_backprop_accessor), + output_backprop_accessor_(output_backprop_accessor) {} + void operator()(cl::sycl::item<1> item) { + T* input_data = ConvertToActualTypeSycl(T, input_data_accessor_); + T* output_data = ConvertToActualTypeSycl(T, output_data_accessor_); + T* input_backprop = ConvertToActualTypeSycl(T, input_backprop_accessor_); + T* output_backprop = ConvertToActualTypeSycl(T, output_backprop_accessor_); + + int index = item.get_linear_id(); + int n = index; + int d = n % p_.depth_; + n /= p_.depth_; + int cstart = (n % p_.out_cols_) * p_.stride_cols_ - p_.pad_cols_; + int cend = std::min(cstart + p_.window_cols_, p_.in_cols_); + cstart = std::max(cstart, 0); + n /= p_.out_cols_; + int rstart = (n % p_.out_rows_) * p_.stride_rows_ - p_.pad_rows_; + int rend = std::min(rstart + p_.window_rows_, p_.in_rows_); + rstart = std::max(rstart, 0); + n /= p_.out_rows_; + int pstart = (n % p_.out_planes_) * p_.stride_planes_ - p_.pad_planes_; + int pend = std::min(pstart + p_.window_planes_, p_.in_planes_); + pstart = std::max(pstart, 0); + n /= p_.out_planes_; + int maxidx = -1; + bool should_stop = false; + const T* input_data_n = + input_data + n * p_.in_planes_ * p_.in_cols_ * p_.in_rows_ * p_.depth_; + for (int p = pstart; p < pend && !should_stop; ++p) { + for (int r = 
rstart; r < rend && !should_stop; ++r) { + for (int c = cstart; c < cend && !should_stop; ++c) { + int idx = ((p * p_.in_rows_ + r) * p_.in_cols_ + c) * p_.depth_ + d; + if (output_data[index] == input_data_n[idx]) { + maxidx = idx; + should_stop = true; + } + } + } + } + if (maxidx != -1) { + output_backprop[index] = input_backprop[n * p_.in_planes_ * p_.in_rows_ * + p_.in_cols_ * p_.depth_ + + maxidx]; + } + } + + private: + const SYCL3DPoolParams p_; + + const read_accessor input_data_accessor_; + const read_accessor output_data_accessor_; + const read_accessor input_backprop_accessor_; + write_accessor output_backprop_accessor_; +}; +template <typename T> +struct LaunchMaxPooling3dGradGradOp<SYCLDevice, T> { + static void launch(OpKernelContext* context, const Pool3dParameters& params, + const Tensor& tensor_in, const Tensor& tensor_out, + const Tensor& out_backprop, Tensor* output) { + const SYCLDevice& device = context->eigen_device<SYCLDevice>(); + + const int num_threads = output->NumElements(); + + auto input_data_buffer = + device.get_sycl_buffer(tensor_in.template flat<T>().data()); + auto output_data_buffer = + device.get_sycl_buffer(tensor_out.template flat<T>().data()); + auto input_backprop_buffer = + device.get_sycl_buffer(out_backprop.template flat<T>().data()); + auto output_backprop_buffer = + device.get_sycl_buffer(output->template flat<T>().data()); + + device.sycl_queue().submit([&](cl::sycl::handler& cgh) { + auto input_data_access = + input_data_buffer.template get_access<cl::sycl::access::mode::read>( + cgh); + auto output_data_access = + output_data_buffer.template get_access<cl::sycl::access::mode::read>( + cgh); + auto input_backprop_access = + input_backprop_buffer + .template get_access<cl::sycl::access::mode::read>(cgh); + auto output_backprop_access = + output_backprop_buffer + .template get_access<cl::sycl::access::mode::write>(cgh); + MaxPool3DGradGradSYCL<T> functor( + params, input_data_access, output_data_access, 
input_backprop_access, + output_backprop_access); + + cgh.parallel_for(cl::sycl::range<1>(num_threads), functor); + }); + } +}; +// AvgPool3D SYCL kernel. Expects the number of threads to be equal to the +// number of elements in the output tensor. +// +// For each output value find the corresponding input window, and run through +// the window accumulating the values to form an average. We divide each value +// before accumulating to prevent the accumulator from becoming significantly +// bigger than the values we are adding and so decrease any errors. +template <typename T> +class AvgPool3DSYCL { + using write_accessor = + cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::write, + cl::sycl::access::target::global_buffer>; + using read_accessor = + cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::read, + cl::sycl::access::target::global_buffer>; + + public: + AvgPool3DSYCL(const int depth, const int batch, const int in_planes, + const int in_rows, const int in_cols, const int out_planes, + const int out_rows, const int out_cols, + const std::array<int64, 3>& window, + const std::array<int64, 3>& stride, + const std::array<int64, 3>& padding, + const read_accessor input_accessor, + write_accessor output_accessor) + : p_(depth, batch, in_planes, in_rows, in_cols, out_planes, out_rows, + out_cols, window, stride, padding), + input_accessor_(input_accessor), + output_accessor_(output_accessor) {} + void operator()(cl::sycl::item<1> item) { + T* input_data = ConvertToActualTypeSycl(T, input_accessor_); + T* output_data = ConvertToActualTypeSycl(T, output_accessor_); + + int index = item.get_linear_id(); + int n = index; + int d = n % p_.depth_; + n /= p_.depth_; + int cstart = (n % p_.out_cols_) * p_.stride_cols_ - p_.pad_cols_; + int cend = std::min(cstart + p_.window_cols_, p_.in_cols_); + cstart = std::max(cstart, 0); + n /= p_.out_cols_; + int rstart = (n % p_.out_rows_) * p_.stride_rows_ - p_.pad_rows_; + int rend = std::min(rstart + p_.window_rows_, 
p_.in_rows_); + rstart = std::max(rstart, 0); + n /= p_.out_rows_; + int pstart = (n % p_.out_planes_) * p_.stride_planes_ - p_.pad_planes_; + int pend = std::min(pstart + p_.window_planes_, p_.in_planes_); + pstart = std::max(pstart, 0); + n /= p_.out_planes_; + T accum = T(0); + T count = + static_cast<T>((pend - pstart) * (rend - rstart) * (cend - cstart)); + const T* input_data_n = + input_data + n * p_.in_planes_ * p_.in_cols_ * p_.in_rows_ * p_.depth_; + for (int p = pstart; p < pend; ++p) { + for (int r = rstart; r < rend; ++r) { + for (int c = cstart; c < cend; ++c) { + int idx = ((p * p_.in_rows_ + r) * p_.in_cols_ + c) * p_.depth_ + d; + accum += input_data_n[idx] / count; + } + } + } + output_data[index] = accum; + } + + private: + const SYCL3DPoolParams p_; + const read_accessor input_accessor_; + write_accessor output_accessor_; +}; +template <typename T> +struct LaunchPoolingOp<SYCLDevice, T, AVG> { + static void launch(OpKernelContext* context, const Tensor& tensor_in, + const std::array<int64, 3>& window, + const std::array<int64, 3>& stride, + const std::array<int64, 3>& padding, + TensorFormat data_format, Padding padding_type, + Tensor* output) { + const SYCLDevice& device = context->eigen_device<SYCLDevice>(); + const int out_planes = GetTensorDim(*output, data_format, '0'); + const int out_rows = GetTensorDim(*output, data_format, '1'); + const int out_cols = GetTensorDim(*output, data_format, '2'); + const int batch = GetTensorDim(tensor_in, data_format, 'N'); + const int in_planes = GetTensorDim(tensor_in, data_format, '0'); + const int in_rows = GetTensorDim(tensor_in, data_format, '1'); + const int in_cols = GetTensorDim(tensor_in, data_format, '2'); + const int depth = GetTensorDim(tensor_in, data_format, 'C'); + + const int num_threads = output->NumElements(); + + auto input_buffer = + device.get_sycl_buffer(tensor_in.template flat<T>().data()); + auto output_buffer = + device.get_sycl_buffer(output->template flat<T>().data()); + + 
device.sycl_queue().submit([&](cl::sycl::handler& cgh) { + auto input_access = + input_buffer.template get_access<cl::sycl::access::mode::read>(cgh); + auto output_access = + output_buffer.template get_access<cl::sycl::access::mode::write>(cgh); + AvgPool3DSYCL<T> avg_pool(depth, batch, in_planes, in_rows, in_cols, + out_planes, out_rows, out_cols, window, stride, + padding, input_access, output_access); + + cgh.parallel_for(cl::sycl::range<1>(num_threads), avg_pool); + }); + } +}; +// AvgPool3DGrad SYCL kernel. Expects the number of threads to be equal to the +// number of elements in the output backprop tensor, i.e. the number of +// elements in the input tensor. +// +// For each output backprop index find a window in the input backprop tensor +// which corresponds to all the values of the output which were affected by the +// input value at this index. Then for each gradient in this window, compute +// the size of the input window which was averaged to give this output, and use +// this size to scale the gradient accordingly. Add this scaled gradient to the +// output backprop value. 
+template <typename T> +class AvgPool3DGradSYCL { + using write_accessor = + cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::write, + cl::sycl::access::target::global_buffer>; + using read_accessor = + cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::read, + cl::sycl::access::target::global_buffer>; + + public: + AvgPool3DGradSYCL(const int depth, const int batch, const int in_planes, + const int in_rows, const int in_cols, + const std::array<int64, 3>& out_shape, + const std::array<int64, 3>& window, + const std::array<int64, 3>& stride, + const std::array<int64, 3>& padding, + const read_accessor input_backprop_accessor, + write_accessor output_backprop_accessor) + : p_(depth, batch, in_planes, in_rows, in_cols, out_shape, window, stride, + padding), + input_backprop_accessor_(input_backprop_accessor), + output_backprop_accessor_(output_backprop_accessor) {} + void operator()(cl::sycl::item<1> item) { + T* input_backprop = ConvertToActualTypeSycl(T, input_backprop_accessor_); + T* output_backprop = ConvertToActualTypeSycl(T, output_backprop_accessor_); + + const int index = item.get_linear_id(); + int n = index; + const int d = n % p_.depth_; + n /= p_.depth_; + const int c = (n % p_.in_cols_) + p_.pad_cols_; + const int poolcstart = + (c < p_.window_cols_) ? 0 : (c - p_.window_cols_) / p_.stride_cols_ + 1; + const int poolcend = std::min(c / p_.stride_cols_ + 1, p_.out_cols_); + n /= p_.in_cols_; + const int r = (n % p_.in_rows_) + p_.pad_rows_; + const int poolrstart = + (r < p_.window_rows_) ? 0 : (r - p_.window_rows_) / p_.stride_rows_ + 1; + const int poolrend = std::min(r / p_.stride_rows_ + 1, p_.out_rows_); + n /= p_.in_rows_; + const int p = (n % p_.in_planes_) + p_.pad_planes_; + const int poolpstart = + (p < p_.window_planes_) + ? 
0 + : (p - p_.window_planes_) / p_.stride_planes_ + 1; + const int poolpend = std::min(p / p_.stride_planes_ + 1, p_.out_planes_); + n /= p_.in_planes_; + + T gradient = T(0); + const T* input_backprop_n = + input_backprop + + n * p_.out_planes_ * p_.out_cols_ * p_.out_rows_ * p_.depth_; + for (int poolp = poolpstart; poolp < poolpend; ++poolp) { + int pstart = poolp * p_.stride_planes_ - p_.pad_planes_; + const int pend = std::min(pstart + p_.window_planes_, p_.in_planes_); + pstart = std::max(pstart, 0); + const int plane_window_size = pend - pstart; + for (int poolr = poolrstart; poolr < poolrend; ++poolr) { + int rstart = poolr * p_.stride_rows_ - p_.pad_rows_; + const int rend = std::min(rstart + p_.window_rows_, p_.in_rows_); + rstart = std::max(rstart, 0); + const int row_window_size = rend - rstart; + for (int poolc = poolcstart; poolc < poolcend; ++poolc) { + const int idx = + ((poolp * p_.out_rows_ + poolr) * p_.out_cols_ + poolc) * + p_.depth_ + + d; + int cstart = poolc * p_.stride_cols_ - p_.pad_cols_; + const int cend = std::min(cstart + p_.window_cols_, p_.in_cols_); + cstart = std::max(cstart, 0); + const int col_window_size = cend - cstart; + const int window_size = + plane_window_size * row_window_size * col_window_size; + gradient += input_backprop_n[idx] / static_cast<T>(window_size); + } + } + } + output_backprop[index] = gradient; + } + + private: + const SYCL3DPoolParams p_; + const read_accessor input_backprop_accessor_; + write_accessor output_backprop_accessor_; +}; +template <typename T> +struct LaunchAvgPooling3dGradOp<SYCLDevice, T> { + static void launch(OpKernelContext* context, + const TensorShape& tensor_in_shape, + const Tensor& out_backprop, + const std::array<int64, 3>& window, + const std::array<int64, 3>& stride, + const std::array<int64, 3>& output_shape, + const std::array<int64, 3>& padding, + TensorFormat data_format, Tensor* output) { + const SYCLDevice& device = context->eigen_device<SYCLDevice>(); + const int batch = 
GetTensorDim(tensor_in_shape, data_format, 'N'); + const int in_planes = GetTensorDim(tensor_in_shape, data_format, '0'); + const int in_rows = GetTensorDim(tensor_in_shape, data_format, '1'); + const int in_cols = GetTensorDim(tensor_in_shape, data_format, '2'); + const int depth = GetTensorDim(tensor_in_shape, data_format, 'C'); + + const int num_threads = output->NumElements(); + + auto input_backprop_buffer = + device.get_sycl_buffer(out_backprop.template flat<T>().data()); + auto output_backprop_buffer = + device.get_sycl_buffer(output->template flat<T>().data()); + + device.sycl_queue().submit([&](cl::sycl::handler& cgh) { + auto input_backprop_access = + input_backprop_buffer + .template get_access<cl::sycl::access::mode::read>(cgh); + auto output_backprop_access = + output_backprop_buffer + .template get_access<cl::sycl::access::mode::write>(cgh); + AvgPool3DGradSYCL<T> functor( + depth, batch, in_planes, in_rows, in_cols, output_shape, window, + stride, padding, input_backprop_access, output_backprop_access); + + cgh.parallel_for(cl::sycl::range<1>(num_threads), functor); + }); + } +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_POOLING_OP_3D_SYCL_H_ diff --git a/tensorflow/core/ops/image_ops.cc b/tensorflow/core/ops/image_ops.cc index 5b71f046c3..1bfa37f5a7 100644 --- a/tensorflow/core/ops/image_ops.cc +++ b/tensorflow/core/ops/image_ops.cc @@ -710,9 +710,9 @@ bounding box in `boxes` are encoded as `[y_min, x_min, y_max, x_max]`. The bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and height of the underlying image. -For example, if an image is 100 x 200 pixels and the bounding box is -`[0.1, 0.2, 0.5, 0.9]`, the bottom-left and upper-right coordinates of the -bounding box will be `(10, 40)` to `(50, 180)`. 
+For example, if an image is 100 x 200 pixels (height x width) and the bounding +box is `[0.1, 0.2, 0.5, 0.9]`, the upper-left and bottom-right coordinates of +the bounding box will be `(40, 10)` to `(100, 50)` (in (x,y) coordinates). Parts of the bounding box may fall outside the image. diff --git a/tensorflow/core/platform/default/gpu_tracer.cc b/tensorflow/core/platform/default/gpu_tracer.cc index 86ab70afdd..50c27b3cf6 100644 --- a/tensorflow/core/platform/default/gpu_tracer.cc +++ b/tensorflow/core/platform/default/gpu_tracer.cc @@ -205,7 +205,7 @@ Status CUPTIManager::DisableTrace() { CUPTI_CALL(ActivityDisable(CUPTI_ACTIVITY_KIND_MEMCPY)); CUPTI_CALL(ActivityDisable(CUPTI_ACTIVITY_KIND_MEMCPY2)); CUPTI_CALL(ActivityDisable(CUPTI_ACTIVITY_KIND_MEMSET)); - CUPTI_CALL(ActivityFlushAll(0)); + CUPTI_CALL(ActivityFlushAll(CUPTI_ACTIVITY_FLAG_FLUSH_FORCED)); { // Don't acquire this lock until Flush returns, since Flush // will potentially cause callbacks into BufferCompleted. diff --git a/tensorflow/core/profiler/internal/tfprof_node.cc b/tensorflow/core/profiler/internal/tfprof_node.cc index 69198019cd..70b91c37e4 100644 --- a/tensorflow/core/profiler/internal/tfprof_node.cc +++ b/tensorflow/core/profiler/internal/tfprof_node.cc @@ -25,7 +25,7 @@ bool CountAsAcceleratorTime(const string& device) { } bool CountAsCPUTime(const string& device) { - return RE2::FullMatch(device, ".*/(gpu|cpu):\\d+"); + return RE2::FullMatch(device, ".*/(gpu|cpu|device:sycl):\\d+"); } bool IsCanonicalDevice(const string& device) { return CountAsCPUTime(device); } @@ -145,7 +145,7 @@ void TFGraphNode::AddStepStat(int64 step, const string& device, // See run_metadata_test.py // It can be /job:0/replica:0/xxxx/gpu:0, or simply /gpu:0. // It can has some ad-hoc suffix, such as /stream:xx or /memcpy:xx. 
- if (IsCanonicalDevice(device)) { + if (IsCanonicalDevice(dev)) { if (!canonical_device_.empty()) { if (canonical_device_ != dev) { fprintf(stderr, "Unexpected: graph node changed device: %s->%s.\n", @@ -155,7 +155,11 @@ void TFGraphNode::AddStepStat(int64 step, const string& device, } else { canonical_device_ = dev; // TODO(xpan): Support things other than gpu? - host_device_ = StringReplace(dev, "gpu:\\d+", "cpu:0"); + if (dev.find("sycl") != dev.npos) { + host_device_ = StringReplace(dev, "device:sycl:\\d+", "cpu:0"); + } else { + host_device_ = StringReplace(dev, "gpu:\\d+", "cpu:0"); + } AddOpType(canonical_device_); } } @@ -229,7 +233,8 @@ TensorShapeProto VecToShapeProto(const std::vector<int64> shape_vec) { } bool IsPlacedOnAccelerator(const string& device) { - return device.find("gpu") != device.npos; + return device.find("gpu") != device.npos || + device.find("sycl") != device.npos; } } // namespace tfprof } // namespace tensorflow |