author    Benoit Steiner <bsteiner@google.com>  2017-08-04 15:30:53 -0700
committer TensorFlower Gardener <gardener@tensorflow.org>  2017-08-04 15:34:51 -0700
commit 0815de21239955e346b562e899640649c8d2b9cb
tree   064a9da2aa7d84db1095944fb9394a838d47c2a2 /tensorflow/core
parent 0ba2a1f6db399cbb5be3e71acdad1123af29348a
Merge changes from github.
END_PUBLIC

---
Commit cf375f067 authored by A. Unique TensorFlower<gardener@tensorflow.org>
Committed by Benoit Steiner<benoitsteiner@users.noreply.github.com>:
Adds cudnn_rnn_ops_op_lib and cudnn_rnn_kernels to contrib_ops_op_lib and
contrib_kernels respectively.

PiperOrigin-RevId: 164170971

---
Commit 95ec58e27 authored by Asim Shankar<ashankar@google.com>
Committed by Benoit Steiner<benoitsteiner@users.noreply.github.com>:
C API: Make TF_TensorFromTensor return an error instead of just logging it.

PiperOrigin-RevId: 164167582

---
Commit 15175c870 authored by Jonathan Hseu<jhseu@google.com>
Committed by Benoit Steiner<benoitsteiner@users.noreply.github.com>:
Build fixes.
- Allow var_list as a positional argument in CrossShardOptimizer.
- Set the number of shards to 1 when not running on TPU, to allow evaluate()
  and predict() on CPU/GPU to work.

PiperOrigin-RevId: 164161640

---
Commit bd3e894f7 authored by Yao Zhang<yaozhang@google.com>
Committed by Benoit Steiner<benoitsteiner@users.noreply.github.com>:
Support freeze mode for fused batch norm.

PiperOrigin-RevId: 164149032

---
Commit e6b6b84c0 authored by Asim Shankar<ashankar@google.com>
Committed by Benoit Steiner<benoitsteiner@users.noreply.github.com>:
C API: TF_Tensors will always be in host memory.
This change undoes some experimentation in commit
22651083406ca01ac9d481e3367a3510d25f88cd and restores TF_Tensor behavior to
what it was prior to that change.

PiperOrigin-RevId: 164146670

---
Commit 8bf3f88f7 authored by Peter Hawkins<phawkins@google.com>
Committed by Benoit Steiner<benoitsteiner@users.noreply.github.com>:
[TF:XLA] Add _XLASend and _XLARecv TF ops that wrap the XLA Send/Recv HLO ops.

PiperOrigin-RevId: 164124764

---
Commit 626d3200f authored by Peter Hawkins<phawkins@google.com>
Committed by Benoit Steiner<benoitsteiner@users.noreply.github.com>:
[XLA] Add test blacklist mechanism for XLA C++ unit tests.

PiperOrigin-RevId: 164124423

---
Commit 359cc5f5e authored by A. Unique TensorFlower<gardener@tensorflow.org>
Committed by Benoit Steiner<benoitsteiner@users.noreply.github.com>:
Document dict ordering in nest and make it consistent with sonnet.

PiperOrigin-RevId: 164114335

---
Commit 05813b531 authored by A. Unique TensorFlower<gardener@tensorflow.org>
Committed by Benoit Steiner<benoitsteiner@users.noreply.github.com>:
Go: Update generated wrapper functions for TensorFlow ops.

PiperOrigin-RevId: 164089206

---
Commit c451f465d authored by Anna R<annarev@google.com>
Committed by Benoit Steiner<benoitsteiner@users.noreply.github.com>:
BEGIN_PUBLIC
Automated g4 rollback of changelist 164078808

PiperOrigin-RevId: 164318935
Diffstat (limited to 'tensorflow/core')
 -rw-r--r--  tensorflow/core/kernels/BUILD                      |   2
 -rw-r--r--  tensorflow/core/kernels/pooling_ops_3d.cc          |  28
 -rw-r--r--  tensorflow/core/kernels/pooling_ops_3d.h           |  14
 -rw-r--r--  tensorflow/core/kernels/pooling_ops_3d_sycl.h      | 759
 -rw-r--r--  tensorflow/core/ops/image_ops.cc                   |   6
 -rw-r--r--  tensorflow/core/platform/default/gpu_tracer.cc     |   2
 -rw-r--r--  tensorflow/core/profiler/internal/tfprof_node.cc   |  13
 7 files changed, 801 insertions(+), 23 deletions(-)
diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index 6887d8cfb6..fdf45beed3 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -3035,7 +3035,7 @@ tf_kernel_library(
"maxpooling_op.h",
"pooling_ops_3d.h",
"pooling_ops_common.h",
- ],
+ ] + if_sycl(["pooling_ops_3d_sycl.h"]),
gpu_srcs = [
"avgpooling_op.h",
"avgpooling_op_gpu.cu.cc",
diff --git a/tensorflow/core/kernels/pooling_ops_3d.cc b/tensorflow/core/kernels/pooling_ops_3d.cc
index 538dca24ae..a406317213 100644
--- a/tensorflow/core/kernels/pooling_ops_3d.cc
+++ b/tensorflow/core/kernels/pooling_ops_3d.cc
@@ -37,10 +37,18 @@ limitations under the License.
#include "tensorflow/core/kernels/cudnn_pooling_gpu.h"
#include "tensorflow/core/kernels/pooling_ops_3d_gpu.h"
#endif
+
+#ifdef TENSORFLOW_USE_SYCL
+#include "tensorflow/core/kernels/pooling_ops_3d_sycl.h"
+#endif // TENSORFLOW_USE_SYCL
+
namespace tensorflow {
typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;
+#ifdef TENSORFLOW_USE_SYCL
+typedef Eigen::SyclDevice SYCLDevice;
+#endif // TENSORFLOW_USE_SYCL
Pool3dParameters::Pool3dParameters(OpKernelContext* context,
const std::vector<int32>& ksize,
@@ -89,11 +97,6 @@ TensorShape Pool3dParameters::forward_output_shape() {
{{out_plane, out_height, out_width}}, depth);
}
-enum PoolingType { MAX, AVG };
-
-template <typename Device, typename T, PoolingType Type>
-struct LaunchPoolingOp;
-
template <typename T>
struct LaunchPoolingOp<CPUDevice, T, AVG> {
static void launch(OpKernelContext* context, const Tensor& tensor_in,
@@ -200,9 +203,6 @@ class Pooling3DOp : public UnaryOp<T> {
TensorFormat data_format_;
};
-template <typename Device, typename T>
-struct LaunchMaxPooling3dGradOp;
-
template <typename T>
struct LaunchMaxPooling3dGradOp<CPUDevice, T> {
static void launch(OpKernelContext* context, const Tensor& tensor_in,
@@ -377,9 +377,6 @@ class MaxPooling3dGradOp : public OpKernel {
TensorFormat data_format_;
};
-template <typename Device, typename T>
-struct LaunchAvgPooling3dGradOp;
-
template <typename T>
struct LaunchAvgPooling3dGradOp<CPUDevice, T> {
static void launch(OpKernelContext* context,
@@ -541,9 +538,6 @@ class AvgPooling3dGradOp : public OpKernel {
TensorFormat data_format_;
};
-template <typename Device, typename T>
-struct LaunchMaxPooling3dGradGradOp;
-
template <typename T>
struct LaunchMaxPooling3dGradGradOp<CPUDevice, T> {
static void launch(OpKernelContext* context, const Pool3dParameters& params,
@@ -837,6 +831,12 @@ TF_CALL_float(REGISTER_GPU_KERNELS) TF_CALL_half(REGISTER_GPU_KERNELS)
#endif // GOOGLE_CUDA
+#ifdef TENSORFLOW_USE_SYCL
+#define REGISTER_SYCL_KERNELS(T) REGISTER_KERNELS(SYCL, T)
+TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SYCL_KERNELS)
+#undef REGISTER_SYCL_KERNELS
+#endif // TENSORFLOW_USE_SYCL
+
#undef REGISTER_KERNELS
} // namespace tensorflow
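
The REGISTER_KERNELS macro referenced by REGISTER_SYCL_KERNELS above is defined
earlier in pooling_ops_3d.cc and is not shown in this hunk. As a rough sketch of
the usual TensorFlow registration pattern it follows (the macro name and exact
builder chain here are assumed, not taken from the diff), the SYCL expansion for
the forward max-pool op would look something like:

// Hypothetical sketch; the real REGISTER_KERNELS presumably also registers
// the grad and grad-grad kernels for the same device and type.
#define REGISTER_SYCL_MAX_POOL3D(T)                    \
  REGISTER_KERNEL_BUILDER(Name("MaxPool3D")            \
                              .Device(DEVICE_SYCL)     \
                              .TypeConstraint<T>("T"), \
                          Pooling3DOp<SYCLDevice, T, MAX>)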
diff --git a/tensorflow/core/kernels/pooling_ops_3d.h b/tensorflow/core/kernels/pooling_ops_3d.h
index 7954e2cf83..d1be3ba407 100644
--- a/tensorflow/core/kernels/pooling_ops_3d.h
+++ b/tensorflow/core/kernels/pooling_ops_3d.h
@@ -22,6 +22,20 @@ limitations under the License.
namespace tensorflow {
+enum PoolingType { MAX, AVG };
+
+template <typename Device, typename T, PoolingType Type>
+struct LaunchPoolingOp;
+
+template <typename Device, typename T>
+struct LaunchAvgPooling3dGradOp;
+
+template <typename Device, typename T>
+struct LaunchMaxPooling3dGradOp;
+
+template <typename Device, typename T>
+struct LaunchMaxPooling3dGradGradOp;
+
// A helper class to manage sizes and shapes for 3d pooling operations.
struct Pool3dParameters {
// Updates context->status if there is an invalid input.
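
This hunk moves the launcher forward declarations out of pooling_ops_3d.cc and
into the header, so that a device-specific header such as pooling_ops_3d_sycl.h
can specialize them without redeclaring the primary templates. A minimal sketch
of that declare-here, specialize-elsewhere pattern (hypothetical names):

// ops.h: only the primary template is declared.
template <typename Device, typename T>
struct Launch;

// ops_device.h: each backend supplies its own specialization.
struct MyDevice {};
template <typename T>
struct Launch<MyDevice, T> {
  static void launch(const T* in, T* out, int n) {
    for (int i = 0; i < n; ++i) out[i] = in[i];  // device-specific work here
  }
};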
diff --git a/tensorflow/core/kernels/pooling_ops_3d_sycl.h b/tensorflow/core/kernels/pooling_ops_3d_sycl.h
new file mode 100644
index 0000000000..d8cbc589a1
--- /dev/null
+++ b/tensorflow/core/kernels/pooling_ops_3d_sycl.h
@@ -0,0 +1,759 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if !TENSORFLOW_USE_SYCL
+#error This file must only be included when building with SYCL support
+#endif
+
+#ifndef TENSORFLOW_CORE_KERNELS_POOLING_OP_3D_SYCL_H_
+#define TENSORFLOW_CORE_KERNELS_POOLING_OP_3D_SYCL_H_
+
+#include "tensorflow/core/kernels/pooling_ops_3d.h"
+
+namespace tensorflow {
+
+typedef Eigen::SyclDevice SYCLDevice;
+
+// Helper struct to contain the various pool parameters used in the SYCL
+// pooling kernels. Similar to the Pool3dParameters, but with a number of
+// convenient constructors.
+struct SYCL3DPoolParams {
+ SYCL3DPoolParams(const int depth, const int batch, const int in_planes,
+ const int in_rows, const int in_cols, const int out_planes,
+ const int out_rows, const int out_cols,
+ const std::array<int64, 3>& window,
+ const std::array<int64, 3>& stride,
+ const std::array<int64, 3>& padding)
+ : depth_(depth),
+ batch_(batch),
+ in_planes_(in_planes),
+ in_rows_(in_rows),
+ in_cols_(in_cols),
+ window_planes_(window[2]),
+ window_rows_(window[1]),
+ window_cols_(window[0]),
+ stride_planes_(stride[2]),
+ stride_rows_(stride[1]),
+ stride_cols_(stride[0]),
+ out_planes_(out_planes),
+ out_rows_(out_rows),
+ out_cols_(out_cols),
+ pad_planes_(padding[2]),
+ pad_rows_(padding[1]),
+ pad_cols_(padding[0]) {}
+
+ SYCL3DPoolParams(const int depth, const int batch, const int in_planes,
+ const int in_rows, const int in_cols,
+ const std::array<int64, 3>& out_shape,
+ const std::array<int64, 3>& window,
+ const std::array<int64, 3>& stride,
+ const std::array<int64, 3>& padding)
+ : SYCL3DPoolParams(depth, batch, in_planes, in_rows, in_cols,
+ out_shape[2], out_shape[1], out_shape[0], window,
+ stride, padding) {}
+
+ SYCL3DPoolParams(const Pool3dParameters& params)
+ : depth_(params.depth),
+ batch_(params.tensor_in_batch),
+ in_planes_(params.tensor_in_planes),
+ in_rows_(params.tensor_in_rows),
+ in_cols_(params.tensor_in_cols),
+ window_planes_(params.window_planes),
+ window_rows_(params.window_rows),
+ window_cols_(params.window_cols),
+ stride_planes_(params.plane_stride),
+ stride_rows_(params.row_stride),
+ stride_cols_(params.col_stride),
+ out_planes_(params.out_plane),
+ out_rows_(params.out_height),
+ out_cols_(params.out_width),
+ pad_planes_(params.pad_planes),
+ pad_rows_(params.pad_rows),
+ pad_cols_(params.pad_cols) {}
+
+ const int depth_;
+ const int batch_;
+ const int in_planes_;
+ const int in_rows_;
+ const int in_cols_;
+
+ const int window_planes_;
+ const int window_rows_;
+ const int window_cols_;
+
+ const int stride_planes_;
+ const int stride_rows_;
+ const int stride_cols_;
+
+ const int out_planes_;
+ const int out_rows_;
+ const int out_cols_;
+
+ const int pad_planes_;
+ const int pad_rows_;
+ const int pad_cols_;
+};
+// MaxPool3d SYCL kernel. Expects the number of threads to be equal to the
+// number of elements in the output tensor.
+//
+// For each output element, find the corresponding input window and run over
+// all values in the window to find the maximum value. This value is then
+// copied into that output element.
+template <typename T>
+class MaxPool3DSYCL {
+ using write_accessor =
+ cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::write,
+ cl::sycl::access::target::global_buffer>;
+ using read_accessor =
+ cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::read,
+ cl::sycl::access::target::global_buffer>;
+
+ public:
+ MaxPool3DSYCL(const int depth, const int batch, const int in_planes,
+ const int in_rows, const int in_cols, const int out_planes,
+ const int out_rows, const int out_cols,
+ const std::array<int64, 3>& window,
+ const std::array<int64, 3>& stride,
+ const std::array<int64, 3>& padding,
+ const read_accessor input_accessor,
+ write_accessor output_accessor)
+ : p_(depth, batch, in_planes, in_rows, in_cols, out_planes, out_rows,
+ out_cols, window, stride, padding),
+ input_accessor_(input_accessor),
+ output_accessor_(output_accessor) {}
+ void operator()(cl::sycl::item<1> item) {
+ T* input_data = ConvertToActualTypeSycl(T, input_accessor_);
+ T* output_data = ConvertToActualTypeSycl(T, output_accessor_);
+
+ int index = item.get_linear_id();
+ int n = index;
+ int d = n % p_.depth_;
+ n /= p_.depth_;
+ int cstart = (n % p_.out_cols_) * p_.stride_cols_ - p_.pad_cols_;
+ int cend = std::min(cstart + p_.window_cols_, p_.in_cols_);
+ cstart = std::max(cstart, 0);
+ n /= p_.out_cols_;
+ int rstart = (n % p_.out_rows_) * p_.stride_rows_ - p_.pad_rows_;
+ int rend = std::min(rstart + p_.window_rows_, p_.in_rows_);
+ rstart = std::max(rstart, 0);
+ n /= p_.out_rows_;
+ int pstart = (n % p_.out_planes_) * p_.stride_planes_ - p_.pad_planes_;
+ int pend = std::min(pstart + p_.window_planes_, p_.in_planes_);
+ pstart = std::max(pstart, 0);
+ n /= p_.out_planes_;
+ T maxval = Eigen::NumTraits<T>::lowest();
+ const T* input_data_n =
+ input_data + n * p_.in_planes_ * p_.in_cols_ * p_.in_rows_ * p_.depth_;
+ for (int p = pstart; p < pend; ++p) {
+ for (int r = rstart; r < rend; ++r) {
+ for (int c = cstart; c < cend; ++c) {
+ int idx = ((p * p_.in_rows_ + r) * p_.in_cols_ + c) * p_.depth_ + d;
+ if (input_data_n[idx] > maxval) {
+ maxval = input_data_n[idx];
+ }
+ }
+ }
+ }
+ output_data[index] = maxval;
+ }
+
+ private:
+ const SYCL3DPoolParams p_;
+ const read_accessor input_accessor_;
+ write_accessor output_accessor_;
+};
+template <typename T>
+struct LaunchPoolingOp<SYCLDevice, T, MAX> {
+ static void launch(OpKernelContext* context, const Tensor& tensor_in,
+ const std::array<int64, 3>& window,
+ const std::array<int64, 3>& stride,
+ const std::array<int64, 3>& padding,
+ TensorFormat data_format, Padding padding_type,
+ Tensor* output) {
+ const SYCLDevice& device = context->eigen_device<SYCLDevice>();
+ const int out_planes = GetTensorDim(*output, data_format, '0');
+ const int out_rows = GetTensorDim(*output, data_format, '1');
+ const int out_cols = GetTensorDim(*output, data_format, '2');
+ const int batch = GetTensorDim(tensor_in, data_format, 'N');
+ const int in_planes = GetTensorDim(tensor_in, data_format, '0');
+ const int in_rows = GetTensorDim(tensor_in, data_format, '1');
+ const int in_cols = GetTensorDim(tensor_in, data_format, '2');
+ const int depth = GetTensorDim(tensor_in, data_format, 'C');
+
+ const int num_threads = output->NumElements();
+
+ auto input_buffer =
+ device.get_sycl_buffer(tensor_in.template flat<T>().data());
+ auto output_buffer =
+ device.get_sycl_buffer(output->template flat<T>().data());
+
+ device.sycl_queue().submit([&](cl::sycl::handler& cgh) {
+ auto input_access =
+ input_buffer.template get_access<cl::sycl::access::mode::read>(cgh);
+ auto output_access =
+ output_buffer.template get_access<cl::sycl::access::mode::write>(cgh);
+ MaxPool3DSYCL<T> max_pool(depth, batch, in_planes, in_rows, in_cols,
+ out_planes, out_rows, out_cols, window, stride,
+ padding, input_access, output_access);
+
+ cgh.parallel_for(cl::sycl::range<1>(num_threads), max_pool);
+ });
+ }
+};
+// MaxPool3DGrad SYCL kernel. Expects the number of threads to be equal to the
+// number of elements in the output backprop tensor (i.e. the number of elements
+// in the input data tensor).
+//
+// For each output backprop element we compute the possible window of values in
+// the input backprop tensor which might contribute to this element. Then for
+// each error in this window, compute the corresponding input window which was
+// pooled into that element in the output. Walk through this input window to
+// determine whether the input value is the first maximum value, and so the
+// error should be propagated back to the corresponding backprop element.
+template <typename T>
+class MaxPool3DGradSYCL {
+ using write_accessor =
+ cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::write,
+ cl::sycl::access::target::global_buffer>;
+ using read_accessor =
+ cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::read,
+ cl::sycl::access::target::global_buffer>;
+
+ public:
+ MaxPool3DGradSYCL(const int depth, const int batch, const int in_planes,
+ const int in_rows, const int in_cols,
+ const std::array<int64, 3>& output_shape,
+ const std::array<int64, 3>& window,
+ const std::array<int64, 3>& stride,
+ const std::array<int64, 3>& padding,
+ const read_accessor input_data_accessor,
+ const read_accessor output_data_accessor,
+ const read_accessor input_backprop_accessor,
+ write_accessor output_backprop_accessor)
+ : p_(depth, batch, in_planes, in_rows, in_cols, output_shape, window,
+ stride, padding),
+ input_data_accessor_(input_data_accessor),
+ output_data_accessor_(output_data_accessor),
+ input_backprop_accessor_(input_backprop_accessor),
+ output_backprop_accessor_(output_backprop_accessor) {}
+ void operator()(cl::sycl::item<1> item) {
+ T* input_data = ConvertToActualTypeSycl(T, input_data_accessor_);
+ T* output_data = ConvertToActualTypeSycl(T, output_data_accessor_);
+ T* input_backprop = ConvertToActualTypeSycl(T, input_backprop_accessor_);
+ T* output_backprop = ConvertToActualTypeSycl(T, output_backprop_accessor_);
+
+ const int index = item.get_linear_id();
+ T output_value = 0;
+ int n = index;
+ const int d = n % p_.depth_;
+ n /= p_.depth_;
+ const int c = (n % p_.in_cols_) + p_.pad_cols_;
+ const int poolcstart =
+ (c < p_.window_cols_) ? 0 : (c - p_.window_cols_) / p_.stride_cols_ + 1;
+ const int poolcend = std::min(c / p_.stride_cols_ + 1, p_.out_cols_);
+ n /= p_.in_cols_;
+ const int r = (n % p_.in_rows_) + p_.pad_rows_;
+ const int poolrstart =
+ (r < p_.window_rows_) ? 0 : (r - p_.window_rows_) / p_.stride_rows_ + 1;
+ const int poolrend = std::min(r / p_.stride_rows_ + 1, p_.out_rows_);
+ n /= p_.in_rows_;
+ const int p = (n % p_.in_planes_) + p_.pad_planes_;
+ const int poolpstart =
+ (p < p_.window_planes_)
+ ? 0
+ : (p - p_.window_planes_) / p_.stride_planes_ + 1;
+ const int poolpend = std::min(p / p_.stride_planes_ + 1, p_.out_planes_);
+ n /= p_.in_planes_;
+ const int index_no_n =
+ index - n * p_.in_planes_ * p_.in_cols_ * p_.in_rows_ * p_.depth_;
+
+ const T* input_data_n =
+ input_data + n * p_.in_planes_ * p_.in_cols_ * p_.in_rows_ * p_.depth_;
+ const T* output_data_n =
+ output_data +
+ n * p_.out_planes_ * p_.out_cols_ * p_.out_rows_ * p_.depth_;
+ const T* input_backprop_n =
+ input_backprop +
+ n * p_.out_planes_ * p_.out_cols_ * p_.out_rows_ * p_.depth_;
+ for (int poolp = poolpstart; poolp < poolpend; ++poolp) {
+ int pstart = poolp * p_.stride_planes_ - p_.pad_planes_;
+ const int pend = std::min(pstart + p_.window_planes_, p_.in_planes_);
+ pstart = std::max(pstart, 0);
+
+ for (int poolr = poolrstart; poolr < poolrend; ++poolr) {
+ int rstart = poolr * p_.stride_rows_ - p_.pad_rows_;
+ const int rend = std::min(rstart + p_.window_rows_, p_.in_rows_);
+ rstart = std::max(rstart, 0);
+
+ for (int poolc = poolcstart; poolc < poolcend; ++poolc) {
+ int cstart = poolc * p_.stride_cols_ - p_.pad_cols_;
+ const int cend = std::min(cstart + p_.window_cols_, p_.in_cols_);
+ cstart = std::max(cstart, 0);
+
+ const int output_data_idx =
+ ((poolp * p_.out_rows_ + poolr) * p_.out_cols_ + poolc) *
+ p_.depth_ +
+ d;
+ bool should_continue = true;
+ bool is_max = (input_data[index] == output_data_n[output_data_idx]);
+ for (int win_p = pstart; win_p < pend && should_continue; ++win_p) {
+ for (int win_r = rstart; win_r < rend && should_continue; ++win_r) {
+ for (int win_c = cstart; win_c < cend && should_continue;
+ ++win_c) {
+ const int input_data_idx =
+ ((win_p * p_.in_rows_ + win_r) * p_.in_cols_ + win_c) *
+ p_.depth_ +
+ d;
+ if (input_data_idx == index_no_n) {
+ should_continue = false;
+ } else if (input_data_n[input_data_idx] ==
+ output_data_n[output_data_idx]) {
+ should_continue = false;
+ is_max = false;
+ }
+ }
+ }
+ }
+ if (is_max) {
+ output_value += input_backprop_n[output_data_idx];
+ }
+ }
+ }
+ }
+ output_backprop[index] = output_value;
+ }
+
+ private:
+ const SYCL3DPoolParams p_;
+
+ const read_accessor input_data_accessor_;
+ const read_accessor output_data_accessor_;
+ const read_accessor input_backprop_accessor_;
+ write_accessor output_backprop_accessor_;
+};
+template <typename T>
+struct LaunchMaxPooling3dGradOp<SYCLDevice, T> {
+ static void launch(OpKernelContext* context, const Tensor& tensor_in,
+ const Tensor& tensor_out, const Tensor& out_backprop,
+ const std::array<int64, 3>& window,
+ const std::array<int64, 3>& stride,
+ const std::array<int64, 3>& out,
+ const std::array<int64, 3>& padding,
+ TensorFormat data_format, Tensor* output) {
+ const SYCLDevice& device = context->eigen_device<SYCLDevice>();
+ const int batch = GetTensorDim(tensor_in, data_format, 'N');
+ const int in_planes = GetTensorDim(tensor_in, data_format, '0');
+ const int in_rows = GetTensorDim(tensor_in, data_format, '1');
+ const int in_cols = GetTensorDim(tensor_in, data_format, '2');
+ const int depth = GetTensorDim(tensor_in, data_format, 'C');
+
+ const int output_size = output->NumElements();
+
+ auto input_data_buffer =
+ device.get_sycl_buffer(tensor_in.template flat<T>().data());
+ auto output_data_buffer =
+ device.get_sycl_buffer(tensor_out.template flat<T>().data());
+ auto input_backprop_buffer =
+ device.get_sycl_buffer(out_backprop.template flat<T>().data());
+ auto output_backprop_buffer =
+ device.get_sycl_buffer(output->template flat<T>().data());
+
+ device.sycl_queue().submit([&](cl::sycl::handler& cgh) {
+ auto input_data_access =
+ input_data_buffer.template get_access<cl::sycl::access::mode::read>(
+ cgh);
+ auto output_data_access =
+ output_data_buffer.template get_access<cl::sycl::access::mode::read>(
+ cgh);
+ auto input_backprop_access =
+ input_backprop_buffer
+ .template get_access<cl::sycl::access::mode::read>(cgh);
+ auto output_backprop_access =
+ output_backprop_buffer
+ .template get_access<cl::sycl::access::mode::write>(cgh);
+ MaxPool3DGradSYCL<T> max_pool(
+ depth, batch, in_planes, in_rows, in_cols, out, window, stride,
+ padding, input_data_access, output_data_access, input_backprop_access,
+ output_backprop_access);
+
+ cgh.parallel_for(cl::sycl::range<1>(output_size), max_pool);
+ });
+ }
+};
+// MaxPool3DGradGrad SYCL kernel. Expects the number of threads to be equal to
+// the number of elements in the output backprop tensor, i.e. the number of
+// elements in the output tensor.
+//
+// For each element in the output backprop tensor, find the corresponding input
+// window, and compare the input and output data to find the index of the
+// maximum value in the input tensor. This is then the index of the gradient to
+// pass through to the output backprop tensor.
+template <typename T>
+class MaxPool3DGradGradSYCL {
+ using write_accessor =
+ cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::write,
+ cl::sycl::access::target::global_buffer>;
+ using read_accessor =
+ cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::read,
+ cl::sycl::access::target::global_buffer>;
+
+ public:
+ MaxPool3DGradGradSYCL(const Pool3dParameters& params,
+ const read_accessor input_data_accessor,
+ const read_accessor output_data_accessor,
+ const read_accessor input_backprop_accessor,
+ write_accessor output_backprop_accessor)
+ : p_(params),
+ input_data_accessor_(input_data_accessor),
+ output_data_accessor_(output_data_accessor),
+ input_backprop_accessor_(input_backprop_accessor),
+ output_backprop_accessor_(output_backprop_accessor) {}
+ void operator()(cl::sycl::item<1> item) {
+ T* input_data = ConvertToActualTypeSycl(T, input_data_accessor_);
+ T* output_data = ConvertToActualTypeSycl(T, output_data_accessor_);
+ T* input_backprop = ConvertToActualTypeSycl(T, input_backprop_accessor_);
+ T* output_backprop = ConvertToActualTypeSycl(T, output_backprop_accessor_);
+
+ int index = item.get_linear_id();
+ int n = index;
+ int d = n % p_.depth_;
+ n /= p_.depth_;
+ int cstart = (n % p_.out_cols_) * p_.stride_cols_ - p_.pad_cols_;
+ int cend = std::min(cstart + p_.window_cols_, p_.in_cols_);
+ cstart = std::max(cstart, 0);
+ n /= p_.out_cols_;
+ int rstart = (n % p_.out_rows_) * p_.stride_rows_ - p_.pad_rows_;
+ int rend = std::min(rstart + p_.window_rows_, p_.in_rows_);
+ rstart = std::max(rstart, 0);
+ n /= p_.out_rows_;
+ int pstart = (n % p_.out_planes_) * p_.stride_planes_ - p_.pad_planes_;
+ int pend = std::min(pstart + p_.window_planes_, p_.in_planes_);
+ pstart = std::max(pstart, 0);
+ n /= p_.out_planes_;
+ int maxidx = -1;
+ bool should_stop = false;
+ const T* input_data_n =
+ input_data + n * p_.in_planes_ * p_.in_cols_ * p_.in_rows_ * p_.depth_;
+ for (int p = pstart; p < pend && !should_stop; ++p) {
+ for (int r = rstart; r < rend && !should_stop; ++r) {
+ for (int c = cstart; c < cend && !should_stop; ++c) {
+ int idx = ((p * p_.in_rows_ + r) * p_.in_cols_ + c) * p_.depth_ + d;
+ if (output_data[index] == input_data_n[idx]) {
+ maxidx = idx;
+ should_stop = true;
+ }
+ }
+ }
+ }
+ if (maxidx != -1) {
+ output_backprop[index] = input_backprop[n * p_.in_planes_ * p_.in_rows_ *
+ p_.in_cols_ * p_.depth_ +
+ maxidx];
+ }
+ }
+
+ private:
+ const SYCL3DPoolParams p_;
+
+ const read_accessor input_data_accessor_;
+ const read_accessor output_data_accessor_;
+ const read_accessor input_backprop_accessor_;
+ write_accessor output_backprop_accessor_;
+};
+template <typename T>
+struct LaunchMaxPooling3dGradGradOp<SYCLDevice, T> {
+ static void launch(OpKernelContext* context, const Pool3dParameters& params,
+ const Tensor& tensor_in, const Tensor& tensor_out,
+ const Tensor& out_backprop, Tensor* output) {
+ const SYCLDevice& device = context->eigen_device<SYCLDevice>();
+
+ const int num_threads = output->NumElements();
+
+ auto input_data_buffer =
+ device.get_sycl_buffer(tensor_in.template flat<T>().data());
+ auto output_data_buffer =
+ device.get_sycl_buffer(tensor_out.template flat<T>().data());
+ auto input_backprop_buffer =
+ device.get_sycl_buffer(out_backprop.template flat<T>().data());
+ auto output_backprop_buffer =
+ device.get_sycl_buffer(output->template flat<T>().data());
+
+ device.sycl_queue().submit([&](cl::sycl::handler& cgh) {
+ auto input_data_access =
+ input_data_buffer.template get_access<cl::sycl::access::mode::read>(
+ cgh);
+ auto output_data_access =
+ output_data_buffer.template get_access<cl::sycl::access::mode::read>(
+ cgh);
+ auto input_backprop_access =
+ input_backprop_buffer
+ .template get_access<cl::sycl::access::mode::read>(cgh);
+ auto output_backprop_access =
+ output_backprop_buffer
+ .template get_access<cl::sycl::access::mode::write>(cgh);
+ MaxPool3DGradGradSYCL<T> functor(
+ params, input_data_access, output_data_access, input_backprop_access,
+ output_backprop_access);
+
+ cgh.parallel_for(cl::sycl::range<1>(num_threads), functor);
+ });
+ }
+};
+// AvgPool3D SYCL kernel. Expects the number of threads to be equal to the
+// number of elements in the output tensor.
+//
+// For each output value find the corresponding input window, and run through
+// the window accumulating the values to form an average. We divide each value
+// before accumulating to prevent the accumulator from becoming significantly
+// bigger than the values we are adding and so decrease any errors.
+template <typename T>
+class AvgPool3DSYCL {
+ using write_accessor =
+ cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::write,
+ cl::sycl::access::target::global_buffer>;
+ using read_accessor =
+ cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::read,
+ cl::sycl::access::target::global_buffer>;
+
+ public:
+ AvgPool3DSYCL(const int depth, const int batch, const int in_planes,
+ const int in_rows, const int in_cols, const int out_planes,
+ const int out_rows, const int out_cols,
+ const std::array<int64, 3>& window,
+ const std::array<int64, 3>& stride,
+ const std::array<int64, 3>& padding,
+ const read_accessor input_accessor,
+ write_accessor output_accessor)
+ : p_(depth, batch, in_planes, in_rows, in_cols, out_planes, out_rows,
+ out_cols, window, stride, padding),
+ input_accessor_(input_accessor),
+ output_accessor_(output_accessor) {}
+ void operator()(cl::sycl::item<1> item) {
+ T* input_data = ConvertToActualTypeSycl(T, input_accessor_);
+ T* output_data = ConvertToActualTypeSycl(T, output_accessor_);
+
+ int index = item.get_linear_id();
+ int n = index;
+ int d = n % p_.depth_;
+ n /= p_.depth_;
+ int cstart = (n % p_.out_cols_) * p_.stride_cols_ - p_.pad_cols_;
+ int cend = std::min(cstart + p_.window_cols_, p_.in_cols_);
+ cstart = std::max(cstart, 0);
+ n /= p_.out_cols_;
+ int rstart = (n % p_.out_rows_) * p_.stride_rows_ - p_.pad_rows_;
+ int rend = std::min(rstart + p_.window_rows_, p_.in_rows_);
+ rstart = std::max(rstart, 0);
+ n /= p_.out_rows_;
+ int pstart = (n % p_.out_planes_) * p_.stride_planes_ - p_.pad_planes_;
+ int pend = std::min(pstart + p_.window_planes_, p_.in_planes_);
+ pstart = std::max(pstart, 0);
+ n /= p_.out_planes_;
+ T accum = T(0);
+ T count =
+ static_cast<T>((pend - pstart) * (rend - rstart) * (cend - cstart));
+ const T* input_data_n =
+ input_data + n * p_.in_planes_ * p_.in_cols_ * p_.in_rows_ * p_.depth_;
+ for (int p = pstart; p < pend; ++p) {
+ for (int r = rstart; r < rend; ++r) {
+ for (int c = cstart; c < cend; ++c) {
+ int idx = ((p * p_.in_rows_ + r) * p_.in_cols_ + c) * p_.depth_ + d;
+ accum += input_data_n[idx] / count;
+ }
+ }
+ }
+ output_data[index] = accum;
+ }
+
+ private:
+ const SYCL3DPoolParams p_;
+ const read_accessor input_accessor_;
+ write_accessor output_accessor_;
+};
+template <typename T>
+struct LaunchPoolingOp<SYCLDevice, T, AVG> {
+ static void launch(OpKernelContext* context, const Tensor& tensor_in,
+ const std::array<int64, 3>& window,
+ const std::array<int64, 3>& stride,
+ const std::array<int64, 3>& padding,
+ TensorFormat data_format, Padding padding_type,
+ Tensor* output) {
+ const SYCLDevice& device = context->eigen_device<SYCLDevice>();
+ const int out_planes = GetTensorDim(*output, data_format, '0');
+ const int out_rows = GetTensorDim(*output, data_format, '1');
+ const int out_cols = GetTensorDim(*output, data_format, '2');
+ const int batch = GetTensorDim(tensor_in, data_format, 'N');
+ const int in_planes = GetTensorDim(tensor_in, data_format, '0');
+ const int in_rows = GetTensorDim(tensor_in, data_format, '1');
+ const int in_cols = GetTensorDim(tensor_in, data_format, '2');
+ const int depth = GetTensorDim(tensor_in, data_format, 'C');
+
+ const int num_threads = output->NumElements();
+
+ auto input_buffer =
+ device.get_sycl_buffer(tensor_in.template flat<T>().data());
+ auto output_buffer =
+ device.get_sycl_buffer(output->template flat<T>().data());
+
+ device.sycl_queue().submit([&](cl::sycl::handler& cgh) {
+ auto input_access =
+ input_buffer.template get_access<cl::sycl::access::mode::read>(cgh);
+ auto output_access =
+ output_buffer.template get_access<cl::sycl::access::mode::write>(cgh);
+ AvgPool3DSYCL<T> avg_pool(depth, batch, in_planes, in_rows, in_cols,
+ out_planes, out_rows, out_cols, window, stride,
+ padding, input_access, output_access);
+
+ cgh.parallel_for(cl::sycl::range<1>(num_threads), avg_pool);
+ });
+ }
+};
+// AvgPool3DGrad SYCL kernel. Expects the number of threads to be equal to the
+// number of elements in the output backprop tensor, i.e. the number of
+// elements in the input tensor.
+//
+// For each output backprop index find a window in the input backprop tensor
+// which corresponds to all the values of the output which were affected by the
+// input value at this index. Then for each gradient in this window, compute
+// the size of the input window which was averaged to give this output, and use
+// this size to scale the gradient accordingly. Add this scaled gradient to the
+// output backprop value.
+template <typename T>
+class AvgPool3DGradSYCL {
+ using write_accessor =
+ cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::write,
+ cl::sycl::access::target::global_buffer>;
+ using read_accessor =
+ cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::read,
+ cl::sycl::access::target::global_buffer>;
+
+ public:
+ AvgPool3DGradSYCL(const int depth, const int batch, const int in_planes,
+ const int in_rows, const int in_cols,
+ const std::array<int64, 3>& out_shape,
+ const std::array<int64, 3>& window,
+ const std::array<int64, 3>& stride,
+ const std::array<int64, 3>& padding,
+ const read_accessor input_backprop_accessor,
+ write_accessor output_backprop_accessor)
+ : p_(depth, batch, in_planes, in_rows, in_cols, out_shape, window, stride,
+ padding),
+ input_backprop_accessor_(input_backprop_accessor),
+ output_backprop_accessor_(output_backprop_accessor) {}
+ void operator()(cl::sycl::item<1> item) {
+ T* input_backprop = ConvertToActualTypeSycl(T, input_backprop_accessor_);
+ T* output_backprop = ConvertToActualTypeSycl(T, output_backprop_accessor_);
+
+ const int index = item.get_linear_id();
+ int n = index;
+ const int d = n % p_.depth_;
+ n /= p_.depth_;
+ const int c = (n % p_.in_cols_) + p_.pad_cols_;
+ const int poolcstart =
+ (c < p_.window_cols_) ? 0 : (c - p_.window_cols_) / p_.stride_cols_ + 1;
+ const int poolcend = std::min(c / p_.stride_cols_ + 1, p_.out_cols_);
+ n /= p_.in_cols_;
+ const int r = (n % p_.in_rows_) + p_.pad_rows_;
+ const int poolrstart =
+ (r < p_.window_rows_) ? 0 : (r - p_.window_rows_) / p_.stride_rows_ + 1;
+ const int poolrend = std::min(r / p_.stride_rows_ + 1, p_.out_rows_);
+ n /= p_.in_rows_;
+ const int p = (n % p_.in_planes_) + p_.pad_planes_;
+ const int poolpstart =
+ (p < p_.window_planes_)
+ ? 0
+ : (p - p_.window_planes_) / p_.stride_planes_ + 1;
+ const int poolpend = std::min(p / p_.stride_planes_ + 1, p_.out_planes_);
+ n /= p_.in_planes_;
+
+ T gradient = T(0);
+ const T* input_backprop_n =
+ input_backprop +
+ n * p_.out_planes_ * p_.out_cols_ * p_.out_rows_ * p_.depth_;
+ for (int poolp = poolpstart; poolp < poolpend; ++poolp) {
+ int pstart = poolp * p_.stride_planes_ - p_.pad_planes_;
+ const int pend = std::min(pstart + p_.window_planes_, p_.in_planes_);
+ pstart = std::max(pstart, 0);
+ const int plane_window_size = pend - pstart;
+ for (int poolr = poolrstart; poolr < poolrend; ++poolr) {
+ int rstart = poolr * p_.stride_rows_ - p_.pad_rows_;
+ const int rend = std::min(rstart + p_.window_rows_, p_.in_rows_);
+ rstart = std::max(rstart, 0);
+ const int row_window_size = rend - rstart;
+ for (int poolc = poolcstart; poolc < poolcend; ++poolc) {
+ const int idx =
+ ((poolp * p_.out_rows_ + poolr) * p_.out_cols_ + poolc) *
+ p_.depth_ +
+ d;
+ int cstart = poolc * p_.stride_cols_ - p_.pad_cols_;
+ const int cend = std::min(cstart + p_.window_cols_, p_.in_cols_);
+ cstart = std::max(cstart, 0);
+ const int col_window_size = cend - cstart;
+ const int window_size =
+ plane_window_size * row_window_size * col_window_size;
+ gradient += input_backprop_n[idx] / static_cast<T>(window_size);
+ }
+ }
+ }
+ output_backprop[index] = gradient;
+ }
+
+ private:
+ const SYCL3DPoolParams p_;
+ const read_accessor input_backprop_accessor_;
+ write_accessor output_backprop_accessor_;
+};
+template <typename T>
+struct LaunchAvgPooling3dGradOp<SYCLDevice, T> {
+ static void launch(OpKernelContext* context,
+ const TensorShape& tensor_in_shape,
+ const Tensor& out_backprop,
+ const std::array<int64, 3>& window,
+ const std::array<int64, 3>& stride,
+ const std::array<int64, 3>& output_shape,
+ const std::array<int64, 3>& padding,
+ TensorFormat data_format, Tensor* output) {
+ const SYCLDevice& device = context->eigen_device<SYCLDevice>();
+ const int batch = GetTensorDim(tensor_in_shape, data_format, 'N');
+ const int in_planes = GetTensorDim(tensor_in_shape, data_format, '0');
+ const int in_rows = GetTensorDim(tensor_in_shape, data_format, '1');
+ const int in_cols = GetTensorDim(tensor_in_shape, data_format, '2');
+ const int depth = GetTensorDim(tensor_in_shape, data_format, 'C');
+
+ const int num_threads = output->NumElements();
+
+ auto input_backprop_buffer =
+ device.get_sycl_buffer(out_backprop.template flat<T>().data());
+ auto output_backprop_buffer =
+ device.get_sycl_buffer(output->template flat<T>().data());
+
+ device.sycl_queue().submit([&](cl::sycl::handler& cgh) {
+ auto input_backprop_access =
+ input_backprop_buffer
+ .template get_access<cl::sycl::access::mode::read>(cgh);
+ auto output_backprop_access =
+ output_backprop_buffer
+ .template get_access<cl::sycl::access::mode::write>(cgh);
+ AvgPool3DGradSYCL<T> functor(
+ depth, batch, in_planes, in_rows, in_cols, output_shape, window,
+ stride, padding, input_backprop_access, output_backprop_access);
+
+ cgh.parallel_for(cl::sycl::range<1>(num_threads), functor);
+ });
+ }
+};
+
+} // namespace tensorflow
+
+#endif // TENSORFLOW_CORE_KERNELS_POOLING_OP_3D_SYCL_H_
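
All of the SYCL kernels above recover (batch, plane, row, col, channel)
coordinates from the flat work-item id by repeated modulo and divide, assuming
an NDHWC layout in which the channel (depth) dimension varies fastest. A
self-contained sketch of that decomposition with a worked check (standalone,
not part of the patch):

#include <cassert>

// Decompose a flat NDHWC index; depth varies fastest, then cols, rows, planes.
void Decompose(int index, int planes, int rows, int cols, int depth,
               int* n, int* p, int* r, int* c, int* d) {
  *d = index % depth;   index /= depth;
  *c = index % cols;    index /= cols;
  *r = index % rows;    index /= rows;
  *p = index % planes;  index /= planes;
  *n = index;
}

int main() {
  // For planes=2, rows=3, cols=4, depth=5, the element (n=1, p=1, r=2, c=3,
  // d=4) has flat index ((((1*2+1)*3+2)*4+3)*5+4) = 239.
  int n, p, r, c, d;
  Decompose(239, /*planes=*/2, /*rows=*/3, /*cols=*/4, /*depth=*/5,
            &n, &p, &r, &c, &d);
  assert(n == 1 && p == 1 && r == 2 && c == 3 && d == 4);
  return 0;
}

The forward kernels apply this to the output index and clamp the resulting
input window to the tensor bounds; the grad kernels apply it to the input index
and then walk the set of output windows that could have touched that input.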
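
The MaxPool3DGrad kernel routes each incoming gradient only to the first
occurrence, in plane-row-col scan order, of the maximum within its window, so
tied maxima do not receive the gradient twice. A toy 1-D sketch of that rule
(illustrative only, not from the patch):

#include <cstdio>

int main() {
  const float in[4] = {1.f, 7.f, 7.f, 3.f};  // window with a tied maximum
  const float out = 7.f;                     // pooled output for this window
  const float grad_out = 1.f;                // incoming gradient
  float grad_in[4] = {0.f, 0.f, 0.f, 0.f};
  for (int i = 0; i < 4; ++i) {
    if (in[i] == out) {  // first maximum in scan order wins
      grad_in[i] += grad_out;
      break;
    }
  }
  for (int i = 0; i < 4; ++i) printf("%g ", grad_in[i]);  // prints: 0 1 0 0
  printf("\n");
  return 0;
}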
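
The AvgPool3D comment motivates dividing each input by the window count before
accumulating: the running total then stays on the scale of the final average
rather than growing with the window size, which avoids overflow and limits
rounding error. A minimal illustration of the difference (illustrative values,
not from the patch):

#include <cstdio>

int main() {
  // Average two values near FLT_MAX (~3.4e38).
  const float a = 3.0e38f, b = 3.0e38f;
  float sum_then_divide = (a + b) / 2.0f;       // a + b overflows to +inf
  float divide_then_sum = a / 2.0f + b / 2.0f;  // stays finite: 3.0e38
  printf("sum then divide: %g\n", sum_then_divide);
  printf("divide then sum: %g\n", divide_then_sum);
  return 0;
}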
diff --git a/tensorflow/core/ops/image_ops.cc b/tensorflow/core/ops/image_ops.cc
index 5b71f046c3..1bfa37f5a7 100644
--- a/tensorflow/core/ops/image_ops.cc
+++ b/tensorflow/core/ops/image_ops.cc
@@ -710,9 +710,9 @@ bounding box in `boxes` are encoded as `[y_min, x_min, y_max, x_max]`. The
bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and
height of the underlying image.
-For example, if an image is 100 x 200 pixels and the bounding box is
-`[0.1, 0.2, 0.5, 0.9]`, the bottom-left and upper-right coordinates of the
-bounding box will be `(10, 40)` to `(50, 180)`.
+For example, if an image is 100 x 200 pixels (height x width) and the bounding
+box is `[0.1, 0.2, 0.5, 0.9]`, the upper-left and bottom-right coordinates of
+the bounding box will be `(40, 10)` to `(180, 50)` (in (x,y) coordinates).
Parts of the bounding box may fall outside the image.
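
To make the corrected example concrete: with a 100 x 200 (height x width) image
and box = [y_min, x_min, y_max, x_max] = [0.1, 0.2, 0.5, 0.9], the pixel
corners come out as x_min*width = 40, y_min*height = 10, x_max*width = 180,
y_max*height = 50. A quick standalone check (not part of the patch):

#include <cstdio>

int main() {
  const float box[4] = {0.1f, 0.2f, 0.5f, 0.9f};  // [y_min, x_min, y_max, x_max]
  const int height = 100, width = 200;
  printf("upper-left   (x, y) = (%g, %g)\n", box[1] * width, box[0] * height);
  printf("bottom-right (x, y) = (%g, %g)\n", box[3] * width, box[2] * height);
  // Prints (40, 10) and (180, 50).
  return 0;
}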
diff --git a/tensorflow/core/platform/default/gpu_tracer.cc b/tensorflow/core/platform/default/gpu_tracer.cc
index 86ab70afdd..50c27b3cf6 100644
--- a/tensorflow/core/platform/default/gpu_tracer.cc
+++ b/tensorflow/core/platform/default/gpu_tracer.cc
@@ -205,7 +205,7 @@ Status CUPTIManager::DisableTrace() {
CUPTI_CALL(ActivityDisable(CUPTI_ACTIVITY_KIND_MEMCPY));
CUPTI_CALL(ActivityDisable(CUPTI_ACTIVITY_KIND_MEMCPY2));
CUPTI_CALL(ActivityDisable(CUPTI_ACTIVITY_KIND_MEMSET));
- CUPTI_CALL(ActivityFlushAll(0));
+ CUPTI_CALL(ActivityFlushAll(CUPTI_ACTIVITY_FLAG_FLUSH_FORCED));
{
// Don't acquire this lock until Flush returns, since Flush
// will potentially cause callbacks into BufferCompleted.
diff --git a/tensorflow/core/profiler/internal/tfprof_node.cc b/tensorflow/core/profiler/internal/tfprof_node.cc
index 69198019cd..70b91c37e4 100644
--- a/tensorflow/core/profiler/internal/tfprof_node.cc
+++ b/tensorflow/core/profiler/internal/tfprof_node.cc
@@ -25,7 +25,7 @@ bool CountAsAcceleratorTime(const string& device) {
}
bool CountAsCPUTime(const string& device) {
- return RE2::FullMatch(device, ".*/(gpu|cpu):\\d+");
+ return RE2::FullMatch(device, ".*/(gpu|cpu|device:sycl):\\d+");
}
bool IsCanonicalDevice(const string& device) { return CountAsCPUTime(device); }
@@ -145,7 +145,7 @@ void TFGraphNode::AddStepStat(int64 step, const string& device,
// See run_metadata_test.py
// It can be /job:0/replica:0/xxxx/gpu:0, or simply /gpu:0.
// It can have some ad-hoc suffix, such as /stream:xx or /memcpy:xx.
- if (IsCanonicalDevice(device)) {
+ if (IsCanonicalDevice(dev)) {
if (!canonical_device_.empty()) {
if (canonical_device_ != dev) {
fprintf(stderr, "Unexpected: graph node changed device: %s->%s.\n",
@@ -155,7 +155,11 @@ void TFGraphNode::AddStepStat(int64 step, const string& device,
} else {
canonical_device_ = dev;
// TODO(xpan): Support things other than gpu?
- host_device_ = StringReplace(dev, "gpu:\\d+", "cpu:0");
+ if (dev.find("sycl") != dev.npos) {
+ host_device_ = StringReplace(dev, "device:sycl:\\d+", "cpu:0");
+ } else {
+ host_device_ = StringReplace(dev, "gpu:\\d+", "cpu:0");
+ }
AddOpType(canonical_device_);
}
}
@@ -229,7 +233,8 @@ TensorShapeProto VecToShapeProto(const std::vector<int64> shape_vec) {
}
bool IsPlacedOnAccelerator(const string& device) {
- return device.find("gpu") != device.npos;
+ return device.find("gpu") != device.npos ||
+ device.find("sycl") != device.npos;
}
} // namespace tfprof
} // namespace tensorflow
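
The widened regex in CountAsCPUTime means SYCL device names are now counted
the same way as CPU and GPU names. A small standalone check using RE2 directly
(device strings modeled on the comments above):

#include <cstdio>
#include "re2/re2.h"

int main() {
  const char* pattern = ".*/(gpu|cpu|device:sycl):\\d+";
  const char* devices[] = {
      "/job:0/replica:0/task:0/gpu:0",          // matched before and after
      "/job:0/replica:0/task:0/device:sycl:0",  // matched only by the new regex
      "/job:0/replica:0/task:0/stream:12",      // still not matched
  };
  for (const char* dev : devices) {
    printf("%-42s -> %s\n", dev,
           RE2::FullMatch(dev, pattern) ? "counted" : "skipped");
  }
  return 0;
}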