author    A. Unique TensorFlower <gardener@tensorflow.org>  2018-01-30 10:43:03 -0800
committer TensorFlower Gardener <gardener@tensorflow.org>   2018-01-30 12:33:54 -0800
commit    4463d105a8a4a83642b9709ba79310e8f4ddf577 (patch)
tree      240e9a0a9a6b9ad956c704776a33126ba00cbfe8 /tensorflow/contrib/resampler
parent    8f0e7207774279f4fe50f4d6c4fbd576e2941463 (diff)
Cleanup: Ran clang-format on all files matching tensorflow/contrib/.../*.{cc,h}.
PiperOrigin-RevId: 183855242
Diffstat (limited to 'tensorflow/contrib/resampler')
-rw-r--r--  tensorflow/contrib/resampler/kernels/resampler_ops.cc         | 223
-rw-r--r--  tensorflow/contrib/resampler/kernels/resampler_ops.h          |  39
-rw-r--r--  tensorflow/contrib/resampler/kernels/resampler_ops_gpu.cu.cc  | 188
3 files changed, 180 insertions(+), 270 deletions(-)
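
The three files below implement the Resampler op: given an NHWC data tensor and a warp field of (x, y) coordinates, each sampling point is bilinearly interpolated from the four surrounding pixels. The commit is formatting-only, so none of that logic changes; for orientation, here is a minimal host-side sketch of the sampling the kernels perform (hypothetical helper, not the TensorFlow API):

#include <cmath>
#include <vector>

// Bilinear sample of single-batch NHWC data at fractional (x, y). Taps that
// fall outside the image read as zero, matching get_data_point() in the diff.
float BilinearSample(const std::vector<float>& data, int height, int width,
                     int channels, float x, float y, int chan) {
  auto at = [&](int xi, int yi) -> float {
    const bool in_range = xi >= 0 && yi >= 0 && xi < width && yi < height;
    return in_range ? data[channels * (yi * width + xi) + chan] : 0.f;
  };
  const int fx = static_cast<int>(std::floor(x));
  const int fy = static_cast<int>(std::floor(y));
  const int cx = fx + 1, cy = fy + 1;
  const float dx = cx - x, dy = cy - y;  // dx, dy weight the floor corner
  return dx * dy * at(fx, fy) + (1 - dx) * (1 - dy) * at(cx, cy) +
         dx * (1 - dy) * at(fx, cy) + (1 - dx) * dy * at(cx, fy);
}
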
diff --git a/tensorflow/contrib/resampler/kernels/resampler_ops.cc b/tensorflow/contrib/resampler/kernels/resampler_ops.cc
index e02c1b6a2b..63c72836d7 100644
--- a/tensorflow/contrib/resampler/kernels/resampler_ops.cc
+++ b/tensorflow/contrib/resampler/kernels/resampler_ops.cc
@@ -36,17 +36,12 @@ using GPUDevice = Eigen::GpuDevice;
namespace functor {
template <typename T>
-struct Resampler2DFunctor<CPUDevice, T>{
- void operator ()(::tensorflow::OpKernelContext* ctx,
- const CPUDevice& d,
- const T* __restrict__ data,
- const T* __restrict__ warp,
- T* __restrict__ output,
- const int batch_size,
- const int data_height,
- const int data_width,
- const int data_channels,
- const int num_sampling_points){
+struct Resampler2DFunctor<CPUDevice, T> {
+ void operator()(::tensorflow::OpKernelContext* ctx, const CPUDevice& d,
+ const T* __restrict__ data, const T* __restrict__ warp,
+ T* __restrict__ output, const int batch_size,
+ const int data_height, const int data_width,
+ const int data_channels, const int num_sampling_points) {
const int warp_batch_stride = num_sampling_points * 2;
const int data_batch_stride = data_height * data_width * data_channels;
const int output_batch_stride = num_sampling_points * data_channels;
@@ -59,24 +54,19 @@ struct Resampler2DFunctor<CPUDevice, T>{
// The functions take care of performing the relevant pointer
// arithmetics abstracting away the low level details in the
// main loop over samples. Note that data is stored in NHWC format.
- auto set_output = [&](const int sample_id,
- const int channel,
+ auto set_output = [&](const int sample_id, const int channel,
const T value) {
- output[batch_id * output_batch_stride +
- sample_id * data_channels +
+ output[batch_id * output_batch_stride + sample_id * data_channels +
channel] = value;
};
- auto get_data_point = [&](const int x,
- const int y,
- const int chan) {
+ auto get_data_point = [&](const int x, const int y, const int chan) {
const bool point_is_in_range =
(x >= 0 && y >= 0 && x <= data_width - 1 && y <= data_height - 1);
return point_is_in_range
- ? data[batch_id * data_batch_stride +
- data_channels * (y * data_width + x) +
- chan]
- : zero;
+ ? data[batch_id * data_batch_stride +
+ data_channels * (y * data_width + x) + chan]
+ : zero;
};
for (int sample_id = 0; sample_id < num_sampling_points; ++sample_id) {
@@ -89,8 +79,7 @@ struct Resampler2DFunctor<CPUDevice, T>{
// The effect is that the sampled signal smoothly goes to 0 outside
// the original input domain, rather than presenting a jump
// discontinuity at the image boundaries.
- if (x > static_cast<T>(-1.0) &&
- y > static_cast<T>(-1.0) &&
+ if (x > static_cast<T>(-1.0) && y > static_cast<T>(-1.0) &&
x < static_cast<T>(data_width) &&
y < static_cast<T>(data_height)) {
// Precompute floor (f) and ceil (c) values for x and y.
@@ -103,12 +92,10 @@ struct Resampler2DFunctor<CPUDevice, T>{
for (int chan = 0; chan < data_channels; ++chan) {
const T img_fxfy = dx * dy * get_data_point(fx, fy, chan);
- const T img_cxcy = (one - dx) * (one - dy) *
- get_data_point(cx, cy, chan);
- const T img_fxcy = dx * (one - dy) *
- get_data_point(fx, cy, chan);
- const T img_cxfy = (one - dx) * dy *
- get_data_point(cx, fy, chan);
+ const T img_cxcy =
+ (one - dx) * (one - dy) * get_data_point(cx, cy, chan);
+ const T img_fxcy = dx * (one - dy) * get_data_point(fx, cy, chan);
+ const T img_cxfy = (one - dx) * dy * get_data_point(cx, fy, chan);
set_output(sample_id, chan,
img_fxfy + img_cxcy + img_fxcy + img_cxfy);
}
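
A worked example of the boundary convention above: with data_width = 10 and a sample at x = -0.5, y = 2.0, we get fx = -1, cx = 0, dx = cx - x = 0.5 and fy = 2, cy = 3, dy = 1.0. The three taps touching column -1 or row 3 either read zero or carry zero weight, leaving output = (1 - dx) * dy * data(0, 2) = 0.5 * data(0, 2). As x runs from -1 to 0 that weight ramps linearly from 0 to 1, which is the smooth decay to zero the comment describes.
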
@@ -125,8 +112,8 @@ struct Resampler2DFunctor<CPUDevice, T>{
// estimate of the cost of each work unit is needed to correctly shard the
// workload. Shard assumes each cost unit is 1ns, minimum cost per shard
// being 10us.
- const int64 cost = static_cast<int64>(num_sampling_points) *
- data_channels * 1000;
+ const int64 cost =
+ static_cast<int64>(num_sampling_points) * data_channels * 1000;
auto worker_threads = *(ctx->device()->tensorflow_cpu_worker_threads());
::tensorflow::Shard(worker_threads.num_threads, worker_threads.workers,
batch_size, cost, resample_batches);
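
To make the cost model concrete: with num_sampling_points = 1000 and data_channels = 3, the estimate is 1000 * 3 * 1000 = 3,000,000 cost units, i.e. about 3 ms per batch element under Shard's 1 ns-per-unit assumption. That is well above the 10 us minimum shard size, so each batch element can be scheduled as its own work unit; only for very small warp fields would Shard coalesce several batch elements into one shard.
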
@@ -138,8 +125,8 @@ struct Resampler2DFunctor<CPUDevice, T>{
template <typename Device, typename T>
class ResamplerOp : public ::tensorflow::OpKernel {
public:
- explicit ResamplerOp(::tensorflow::OpKernelConstruction* context) :
- ::tensorflow::OpKernel(context) {}
+ explicit ResamplerOp(::tensorflow::OpKernelConstruction* context)
+ : ::tensorflow::OpKernel(context) {}
void Compute(::tensorflow::OpKernelContext* ctx) override {
const ::tensorflow::Tensor& data = ctx->input(0);
@@ -158,16 +145,17 @@ class ResamplerOp : public ::tensorflow::OpKernel {
::tensorflow::errors::InvalidArgument(
"warp should be at least a matrix, got shape ",
warp_shape.DebugString()));
- OP_REQUIRES(ctx, warp_shape.dim_size(warp_shape.dims()-1) == 2,
+ OP_REQUIRES(ctx, warp_shape.dim_size(warp_shape.dims() - 1) == 2,
::tensorflow::errors::Unimplemented(
"Only bilinear interpolation is supported, warping "
"coordinates must be 2D; warp shape last entry should be "
- "2, but shape vector is: ", warp_shape.DebugString()));
+ "2, but shape vector is: ",
+ warp_shape.DebugString()));
OP_REQUIRES(ctx, data_shape.dim_size(0) == warp_shape.dim_size(0),
::tensorflow::errors::InvalidArgument(
"Batch size of data and warp tensor must be the same, but "
- "input shapes are: ", data_shape.DebugString(), ", ",
- warp_shape.DebugString()));
+ "input shapes are: ",
+ data_shape.DebugString(), ", ", warp_shape.DebugString()));
const int batch_size = data_shape.dim_size(0);
const int data_height = data_shape.dim_size(1);
const int data_width = data_shape.dim_size(2);
@@ -180,16 +168,10 @@ class ResamplerOp : public ::tensorflow::OpKernel {
// Execute kernel only for nonempty output; otherwise Eigen crashes on GPU.
if (num_sampling_points > 0) {
- functor::Resampler2DFunctor<Device, T>()(ctx,
- ctx->eigen_device<Device>(),
- data.flat<T>().data(),
- warp.flat<T>().data(),
- output->flat<T>().data(),
- batch_size,
- data_height,
- data_width,
- data_channels,
- num_sampling_points);
+ functor::Resampler2DFunctor<Device, T>()(
+ ctx, ctx->eigen_device<Device>(), data.flat<T>().data(),
+ warp.flat<T>().data(), output->flat<T>().data(), batch_size,
+ data_height, data_width, data_channels, num_sampling_points);
}
}
@@ -197,12 +179,9 @@ class ResamplerOp : public ::tensorflow::OpKernel {
TF_DISALLOW_COPY_AND_ASSIGN(ResamplerOp);
};
-
-#define REGISTER(TYPE) \
- REGISTER_KERNEL_BUILDER( \
- Name("Resampler") \
- .Device(DEVICE_CPU) \
- .TypeConstraint<TYPE>("T"), \
+#define REGISTER(TYPE) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("Resampler").Device(DEVICE_CPU).TypeConstraint<TYPE>("T"), \
ResamplerOp<CPUDevice, TYPE>);
TF_CALL_half(REGISTER);
@@ -211,40 +190,32 @@ TF_CALL_double(REGISTER);
#undef REGISTER
#if GOOGLE_CUDA
-#define REGISTER(TYPE) \
- REGISTER_KERNEL_BUILDER(Name("Resampler") \
- .Device(DEVICE_GPU) \
- .TypeConstraint<TYPE>("T"), \
- ResamplerOp<GPUDevice, TYPE>)
+#define REGISTER(TYPE) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("Resampler").Device(DEVICE_GPU).TypeConstraint<TYPE>("T"), \
+ ResamplerOp<GPUDevice, TYPE>)
TF_CALL_float(REGISTER);
TF_CALL_double(REGISTER);
#undef REGISTER
#endif // GOOGLE_CUDA
-
namespace functor {
template <typename T>
-struct ResamplerGrad2DFunctor<CPUDevice, T>{
- void operator ()(::tensorflow::OpKernelContext* ctx,
- const CPUDevice& d,
- const T* __restrict__ data,
- const T* __restrict__ warp,
- const T* __restrict__ grad_output,
- T* __restrict__ grad_data,
- T* __restrict__ grad_warp,
- const int batch_size,
- const int data_height,
- const int data_width,
- const int data_channels,
- const int num_sampling_points){
+struct ResamplerGrad2DFunctor<CPUDevice, T> {
+ void operator()(::tensorflow::OpKernelContext* ctx, const CPUDevice& d,
+ const T* __restrict__ data, const T* __restrict__ warp,
+ const T* __restrict__ grad_output, T* __restrict__ grad_data,
+ T* __restrict__ grad_warp, const int batch_size,
+ const int data_height, const int data_width,
+ const int data_channels, const int num_sampling_points) {
// Set gradients to 0, because the kernel incrementally updates the
// tensor entries by adding partial contributions.
- const int resampler_output_size = batch_size * num_sampling_points *
- data_channels;
+ const int resampler_output_size =
+ batch_size * num_sampling_points * data_channels;
const int grad_warp_size = resampler_output_size / data_channels * 2;
- const int grad_data_size = data_height * data_width * data_channels *
- batch_size;
+ const int grad_data_size =
+ data_height * data_width * data_channels * batch_size;
memset(grad_data, 0, sizeof(T) * grad_data_size);
memset(grad_warp, 0, sizeof(T) * grad_warp_size);
@@ -260,35 +231,29 @@ struct ResamplerGrad2DFunctor<CPUDevice, T>{
// The functions take care of performing the relevant pointer
// arithmetics abstracting away the low level details in the
// main loop over samples. Note that data is stored in NHWC format.
- auto get_data_point = [&](const int x,
- const int y,
- const int chan) {
+ auto get_data_point = [&](const int x, const int y, const int chan) {
const bool point_is_in_range =
- (x >= 0 && y >= 0 && x <= data_width - 1 && y <= data_height - 1);
+ (x >= 0 && y >= 0 && x <= data_width - 1 && y <= data_height - 1);
return point_is_in_range
- ? data[batch_id * data_batch_stride +
- data_channels * (y * data_width + x) +
- chan]
- : zero;
+ ? data[batch_id * data_batch_stride +
+ data_channels * (y * data_width + x) + chan]
+ : zero;
};
auto update_grad_data = [&](const int x, const int y, const int chan,
const T value) {
const bool point_is_in_range =
(x >= 0 && y >= 0 && x <= data_width - 1 && y <= data_height - 1);
- if (point_is_in_range){
+ if (point_is_in_range) {
grad_data[batch_id * data_batch_stride +
- data_channels * (y * data_width + x) +
- chan] += value;
+ data_channels * (y * data_width + x) + chan] += value;
}
};
- auto update_grad_warp = [&](const int sample_id,
- const int channel,
+ auto update_grad_warp = [&](const int sample_id, const int channel,
const T value) {
- grad_warp[batch_id * warp_batch_stride +
- sample_id * 2 +
- channel] += value;
+ grad_warp[batch_id * warp_batch_stride + sample_id * 2 + channel] +=
+ value;
};
for (int sample_id = 0; sample_id < num_sampling_points; ++sample_id) {
@@ -301,8 +266,7 @@ struct ResamplerGrad2DFunctor<CPUDevice, T>{
// The effect is that the sampled signal smoothly goes to 0 outside
// the original input domain, rather than presenting a jump
// discontinuity at the image boundaries.
- if (x > static_cast<T>(-1.0) &&
- y > static_cast<T>(-1.0) &&
+ if (x > static_cast<T>(-1.0) && y > static_cast<T>(-1.0) &&
x < static_cast<T>(data_width) &&
y < static_cast<T>(data_height)) {
// Precompute floor (f) and ceil (c) values for x and y.
@@ -316,27 +280,25 @@ struct ResamplerGrad2DFunctor<CPUDevice, T>{
for (int chan = 0; chan < data_channels; ++chan) {
const T grad_output_value =
grad_output[batch_id * output_batch_stride +
- sample_id * data_channels +
- chan];
+ sample_id * data_channels + chan];
const T img_fxfy = get_data_point(fx, fy, chan);
const T img_cxcy = get_data_point(cx, cy, chan);
const T img_fxcy = get_data_point(fx, cy, chan);
const T img_cxfy = get_data_point(cx, fy, chan);
// Update partial gradients wrt relevant warp field entries
- update_grad_warp(sample_id, 0,
- grad_output_value *
- ((one - dy) * (img_cxcy - img_fxcy) +
- dy * (img_cxfy - img_fxfy)));
+ update_grad_warp(
+ sample_id, 0,
+ grad_output_value * ((one - dy) * (img_cxcy - img_fxcy) +
+ dy * (img_cxfy - img_fxfy)));
- update_grad_warp(sample_id, 1,
- grad_output_value *
- ((one - dx) * (img_cxcy - img_cxfy) +
- dx * (img_fxcy - img_fxfy)));
+ update_grad_warp(
+ sample_id, 1,
+ grad_output_value * ((one - dx) * (img_cxcy - img_cxfy) +
+ dx * (img_fxcy - img_fxfy)));
// Update partial gradients wrt sampled data
- update_grad_data(fx, fy, chan,
- grad_output_value * dx * dy);
+ update_grad_data(fx, fy, chan, grad_output_value * dx * dy);
update_grad_data(cx, cy, chan,
grad_output_value * (one - dx) * (one - dy));
update_grad_data(fx, cy, chan,
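
The warp gradients above follow directly from differentiating the bilinear formula. Writing the forward output as

  out = dx*dy*I(fx,fy) + (1-dx)*(1-dy)*I(cx,cy) + dx*(1-dy)*I(fx,cy) + (1-dx)*dy*I(cx,fy)

with dx = cx - x and dy = cy - y (so d(dx)/dx = d(dy)/dy = -1), the chain rule gives

  d(out)/dx = (1-dy)*(I(cx,cy) - I(fx,cy)) + dy*(I(cx,fy) - I(fx,fy))
  d(out)/dy = (1-dx)*(I(cx,cy) - I(cx,fy)) + dx*(I(fx,cy) - I(fx,fy))

which, scaled by grad_output_value, are exactly the two update_grad_warp expressions; the update_grad_data calls use the bilinear weights themselves, since out is linear in each tap.
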
@@ -355,8 +317,8 @@ struct ResamplerGrad2DFunctor<CPUDevice, T>{
// being 10us.
// TODO(fviola): Check out if there is a better way of doing this.
auto worker_threads = *(ctx->device()->tensorflow_cpu_worker_threads());
- const int64 cost = static_cast<int64>(num_sampling_points) *
- data_channels * 1000;
+ const int64 cost =
+ static_cast<int64>(num_sampling_points) * data_channels * 1000;
::tensorflow::Shard(worker_threads.num_threads, worker_threads.workers,
batch_size, cost, update_grads_for_batches);
}
@@ -364,12 +326,11 @@ struct ResamplerGrad2DFunctor<CPUDevice, T>{
} // namespace functor
-
template <typename Device, typename T>
class ResamplerGradOp : public ::tensorflow::OpKernel {
public:
- explicit ResamplerGradOp(::tensorflow::OpKernelConstruction* context) :
- ::tensorflow::OpKernel(context) {}
+ explicit ResamplerGradOp(::tensorflow::OpKernelConstruction* context)
+ : ::tensorflow::OpKernel(context) {}
void Compute(::tensorflow::OpKernelContext* ctx) override {
const ::tensorflow::Tensor& data = ctx->input(0);
@@ -383,7 +344,7 @@ class ResamplerGradOp : public ::tensorflow::OpKernel {
"tensor must be a batch of 2d data; data shape should have "
"4 entries corresponding to [batch_size, data_height, "
"data_width, data_channels], but is: ",
- data_shape.DebugString()));
+ data_shape.DebugString()));
const int batch_size = data_shape.dim_size(0);
const int data_height = data_shape.dim_size(1);
const int data_width = data_shape.dim_size(2);
@@ -394,7 +355,7 @@ class ResamplerGradOp : public ::tensorflow::OpKernel {
::tensorflow::errors::InvalidArgument(
"warp should be at least a matrix, got shape ",
warp_shape.DebugString()));
- OP_REQUIRES(ctx, warp_shape.dim_size(warp_shape.dims()-1) == 2,
+ OP_REQUIRES(ctx, warp_shape.dim_size(warp_shape.dims() - 1) == 2,
::tensorflow::errors::Unimplemented(
"Only bilinear interpolation is supported, warping "
"coordinates must be 2D; warp shape last entry should be "
@@ -417,18 +378,11 @@ class ResamplerGradOp : public ::tensorflow::OpKernel {
OP_REQUIRES_OK(ctx, ctx->allocate_output(1, warp.shape(), &grad_warp));
// Execute kernel only for nonempty output; otherwise Eigen crashes on GPU.
if (num_sampling_points > 0) {
- functor::ResamplerGrad2DFunctor<Device, T>()(ctx,
- ctx->eigen_device<Device>(),
- data.flat<T>().data(),
- warp.flat<T>().data(),
- grad_output.flat<T>().data(),
- grad_data->flat<T>().data(),
- grad_warp->flat<T>().data(),
- batch_size,
- data_height,
- data_width,
- data_channels,
- num_sampling_points);
+ functor::ResamplerGrad2DFunctor<Device, T>()(
+ ctx, ctx->eigen_device<Device>(), data.flat<T>().data(),
+ warp.flat<T>().data(), grad_output.flat<T>().data(),
+ grad_data->flat<T>().data(), grad_warp->flat<T>().data(), batch_size,
+ data_height, data_width, data_channels, num_sampling_points);
}
}
@@ -436,11 +390,9 @@ class ResamplerGradOp : public ::tensorflow::OpKernel {
TF_DISALLOW_COPY_AND_ASSIGN(ResamplerGradOp);
};
-#define REGISTER(TYPE) \
- REGISTER_KERNEL_BUILDER( \
- Name("ResamplerGrad") \
- .Device(DEVICE_CPU) \
- .TypeConstraint<TYPE>("T"), \
+#define REGISTER(TYPE) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("ResamplerGrad").Device(DEVICE_CPU).TypeConstraint<TYPE>("T"), \
ResamplerGradOp<CPUDevice, TYPE>);
TF_CALL_half(REGISTER);
@@ -449,11 +401,10 @@ TF_CALL_double(REGISTER);
#undef REGISTER
#if GOOGLE_CUDA
-#define REGISTER(TYPE) \
- REGISTER_KERNEL_BUILDER(Name("ResamplerGrad") \
- .Device(DEVICE_GPU) \
- .TypeConstraint<TYPE>("T"), \
- ResamplerGradOp<GPUDevice, TYPE>)
+#define REGISTER(TYPE) \
+ REGISTER_KERNEL_BUILDER( \
+ Name("ResamplerGrad").Device(DEVICE_GPU).TypeConstraint<TYPE>("T"), \
+ ResamplerGradOp<GPUDevice, TYPE>)
// Disable half and double precision since atomicAdds are not supported
// TF_CALL_half(REGISTER);
// TF_CALL_double(REGISTER);
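
The registration comment above points at why the GPU gradient path is float-only here: the kernel scatters partial gradients with atomicAdd, and several sampling points can land in the same bilinear footprint. A minimal sketch of the race that the atomics prevent (hypothetical kernels, not from this file):

__global__ void ScatterAddNaive(float* grad, const int* idx,
                                const float* val, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) grad[idx[i]] += val[i];  // lost updates when idx[i] collides
}

__global__ void ScatterAddAtomic(float* grad, const int* idx,
                                 const float* val, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) atomicAdd(&grad[idx[i]], val[i]);  // safe under collisions
}

float atomicAdd is available on all CUDA devices, while double support needs compute capability 6.0 and half-precision atomics arrived later still, hence the disabled registrations.
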
diff --git a/tensorflow/contrib/resampler/kernels/resampler_ops.h b/tensorflow/contrib/resampler/kernels/resampler_ops.h
index 85d3676efa..7fe3b9c0df 100644
--- a/tensorflow/contrib/resampler/kernels/resampler_ops.h
+++ b/tensorflow/contrib/resampler/kernels/resampler_ops.h
@@ -29,38 +29,25 @@ namespace functor {
// Helper functor for the Resampler Op in 2D
template <typename Device, typename T>
-struct Resampler2DFunctor{
- void operator ()(::tensorflow::OpKernelContext* ctx,
- const Device& d,
- const T* __restrict__ data,
- const T* __restrict__ warp,
- T* __restrict__ output,
- const int batch_size,
- const int data_height,
- const int data_width,
- const int data_channels,
- const int num_sampling_points);
+struct Resampler2DFunctor {
+ void operator()(::tensorflow::OpKernelContext* ctx, const Device& d,
+ const T* __restrict__ data, const T* __restrict__ warp,
+ T* __restrict__ output, const int batch_size,
+ const int data_height, const int data_width,
+ const int data_channels, const int num_sampling_points);
};
-
// Helper functor for the Resampler Gradient Op in 2D
template <typename Device, typename T>
-struct ResamplerGrad2DFunctor{
- void operator ()(::tensorflow::OpKernelContext* ctx,
- const Device& d,
- const T* __restrict__ data,
- const T* __restrict__ warp,
- const T* __restrict__ grad_output,
- T* __restrict__ grad_data,
- T* __restrict__ grad_warp,
- const int batch_size,
- const int data_height,
- const int data_width,
- const int data_channels,
- const int num_sampling_points);
+struct ResamplerGrad2DFunctor {
+ void operator()(::tensorflow::OpKernelContext* ctx, const Device& d,
+ const T* __restrict__ data, const T* __restrict__ warp,
+ const T* __restrict__ grad_output, T* __restrict__ grad_data,
+ T* __restrict__ grad_warp, const int batch_size,
+ const int data_height, const int data_width,
+ const int data_channels, const int num_sampling_points);
};
-
} // namespace functor
} // namespace tensorflow
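
The header only declares the primary functor templates; the definitions live in per-device partial specializations (CPU in resampler_ops.cc, GPU in resampler_ops_gpu.cu.cc), and ResamplerOp<Device, T> simply instantiates Resampler2DFunctor<Device, T>. A minimal sketch of this dispatch pattern, with hypothetical names standing in for the Eigen device types:

struct MyCpu {};
struct MyGpu {};

template <typename Device, typename T>
struct ScaleFunctor;  // primary template: declared, never defined

template <typename T>
struct ScaleFunctor<MyCpu, T> {  // defined in the CPU translation unit
  void operator()(T* out, const T* in, int n, T k) const {
    for (int i = 0; i < n; ++i) out[i] = k * in[i];
  }
};

// A ScaleFunctor<MyGpu, T> specialization would be compiled by nvcc in the
// .cu.cc file and launch a kernel; linking that object file in is what backs
// the DEVICE_GPU kernel registration.

Leaving the primary template undefined turns a missing device implementation into a build error rather than silent wrong behavior.
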
diff --git a/tensorflow/contrib/resampler/kernels/resampler_ops_gpu.cu.cc b/tensorflow/contrib/resampler/kernels/resampler_ops_gpu.cu.cc
index 636847a212..3c07051f68 100644
--- a/tensorflow/contrib/resampler/kernels/resampler_ops_gpu.cu.cc
+++ b/tensorflow/contrib/resampler/kernels/resampler_ops_gpu.cu.cc
@@ -31,18 +31,15 @@ using GPUDevice = Eigen::GpuDevice;
namespace {
-#define GET_DATA_POINT(x, y) \
- data[batch_id * data_batch_stride + \
- data_channels * (y * data_width + x) + \
+#define GET_DATA_POINT(x, y) \
+ data[batch_id * data_batch_stride + data_channels * (y * data_width + x) + \
chan]
template <typename T>
__global__ void Resampler2DKernel(const T* __restrict__ data,
const T* __restrict__ warp,
- T* __restrict__ output,
- const int batch_size,
- const int data_height,
- const int data_width,
+ T* __restrict__ output, const int batch_size,
+ const int data_height, const int data_width,
const int data_channels,
const int num_sampling_points) {
const int output_data_size = batch_size * num_sampling_points * data_channels;
@@ -75,10 +72,8 @@ __global__ void Resampler2DKernel(const T* __restrict__ data,
// The effect is that the sampled signal smoothly goes to 0 outside
// the original input domain, rather than presenting a jump
// discontinuity at the image boundaries.
- if (x > static_cast<T>(-1.0) &&
- y > static_cast<T>(-1.0) &&
- x < static_cast<T>(data_width) &&
- y < static_cast<T>(data_height)) {
+ if (x > static_cast<T>(-1.0) && y > static_cast<T>(-1.0) &&
+ x < static_cast<T>(data_width) && y < static_cast<T>(data_height)) {
// Precompute floor (f) and ceil (c) values for x and y.
const int fx = std::floor(static_cast<float>(x));
const int fy = std::floor(static_cast<float>(y));
@@ -87,21 +82,20 @@ __global__ void Resampler2DKernel(const T* __restrict__ data,
const T dx = static_cast<T>(cx) - x;
const T dy = static_cast<T>(cy) - y;
- const T img_fxfy = (fx >= 0 && fy >= 0)
- ? dx * dy * GET_DATA_POINT(fx, fy)
- : zero;
+ const T img_fxfy =
+ (fx >= 0 && fy >= 0) ? dx * dy * GET_DATA_POINT(fx, fy) : zero;
const T img_cxcy = (cx <= data_width - 1 && cy <= data_height - 1)
- ? (one - dx) * (one - dy) * GET_DATA_POINT(cx, cy)
- : zero;
+ ? (one - dx) * (one - dy) * GET_DATA_POINT(cx, cy)
+ : zero;
const T img_fxcy = (fx >= 0 && cy <= data_height - 1)
- ? dx * (one - dy) * GET_DATA_POINT(fx, cy)
- : zero;
+ ? dx * (one - dy) * GET_DATA_POINT(fx, cy)
+ : zero;
const T img_cxfy = (cx <= data_width - 1 && fy >= 0)
- ? (one - dx) * dy * GET_DATA_POINT(cx, fy)
- : zero;
+ ? (one - dx) * dy * GET_DATA_POINT(cx, fy)
+ : zero;
output[out_index] = img_fxfy + img_cxcy + img_fxcy + img_cxfy;
} else {
@@ -115,24 +109,20 @@ __global__ void Resampler2DKernel(const T* __restrict__ data,
namespace functor {
template <typename T>
-struct Resampler2DFunctor<GPUDevice, T>{
- void operator ()(::tensorflow::OpKernelContext* ctx,
- const GPUDevice& d,
- const T* __restrict__ data,
- const T* __restrict__ warp,
- T* __restrict__ output,
- const int batch_size,
- const int data_height,
- const int data_width,
- const int data_channels,
- const int num_sampling_points) {
- const int output_data_size = batch_size * num_sampling_points * data_channels;
- ::tensorflow::CudaLaunchConfig config =
- ::tensorflow::GetCudaLaunchConfig(output_data_size, d);
- Resampler2DKernel<T>
- <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
- data, warp, output, batch_size, data_height, data_width,
- data_channels, num_sampling_points);
+struct Resampler2DFunctor<GPUDevice, T> {
+ void operator()(::tensorflow::OpKernelContext* ctx, const GPUDevice& d,
+ const T* __restrict__ data, const T* __restrict__ warp,
+ T* __restrict__ output, const int batch_size,
+ const int data_height, const int data_width,
+ const int data_channels, const int num_sampling_points) {
+ const int output_data_size =
+ batch_size * num_sampling_points * data_channels;
+ ::tensorflow::CudaLaunchConfig config =
+ ::tensorflow::GetCudaLaunchConfig(output_data_size, d);
+ Resampler2DKernel<T>
+ <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+ data, warp, output, batch_size, data_height, data_width,
+ data_channels, num_sampling_points);
}
};
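
The launch above pairs GetCudaLaunchConfig with CUDA_1D_KERNEL_LOOP in the kernel body; the loop is the standard grid-stride pattern, so any (block_count, thread_per_block) choice covers all output_data_size elements. A minimal sketch of what that macro expands to in spirit (hypothetical kernel):

__global__ void GridStrideExample(float* out, int total) {
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < total;
       i += blockDim.x * gridDim.x) {
    out[i] = 2.f * out[i];  // each thread handles indices i, i+stride, ...
  }
}
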
@@ -145,26 +135,20 @@ template struct Resampler2DFunctor<GPUDevice, double>;
namespace {
-#define UPDATE_GRAD_DATA_POINT(x, y, v) \
- atomicAdd(grad_data + (batch_id * data_batch_stride + \
- data_channels * (y * data_width + x) + \
- chan), \
+#define UPDATE_GRAD_DATA_POINT(x, y, v) \
+ atomicAdd(grad_data + (batch_id * data_batch_stride + \
+ data_channels * (y * data_width + x) + chan), \
v)
-
template <typename T>
-__global__ void ResamplerGrad2DKernel(const T* __restrict__ data,
- const T* __restrict__ warp,
- const T* __restrict__ grad_output,
- T* __restrict__ grad_data,
- T* __restrict__ grad_warp,
- const int batch_size,
- const int data_height,
- const int data_width,
- const int data_channels,
- const int num_sampling_points) {
- const int resampler_output_size = batch_size * num_sampling_points *
- data_channels;
+__global__ void ResamplerGrad2DKernel(
+ const T* __restrict__ data, const T* __restrict__ warp,
+ const T* __restrict__ grad_output, T* __restrict__ grad_data,
+ T* __restrict__ grad_warp, const int batch_size, const int data_height,
+ const int data_width, const int data_channels,
+ const int num_sampling_points) {
+ const int resampler_output_size =
+ batch_size * num_sampling_points * data_channels;
CUDA_1D_KERNEL_LOOP(index, resampler_output_size) {
const int out_index = index;
@@ -199,10 +183,8 @@ __global__ void ResamplerGrad2DKernel(const T* __restrict__ data,
// The effect is that the sampled signal smoothly goes to 0 outside
// the original input domain, rather than presenting a jump
// discontinuity at the image boundaries.
- if (x > static_cast<T>(-1.0) &&
- y > static_cast<T>(-1.0) &&
- x < static_cast<T>(data_width) &&
- y < static_cast<T>(data_height)) {
+ if (x > static_cast<T>(-1.0) && y > static_cast<T>(-1.0) &&
+ x < static_cast<T>(data_width) && y < static_cast<T>(data_height)) {
// Precompute floor (f) and ceil (c) values for x and y.
const int fx = std::floor(static_cast<float>(x));
const int fy = std::floor(static_cast<float>(y));
@@ -211,21 +193,17 @@ __global__ void ResamplerGrad2DKernel(const T* __restrict__ data,
const T dx = static_cast<T>(cx) - x;
const T dy = static_cast<T>(cy) - y;
- const T img_fxfy = (fx >= 0 && fy >= 0)
- ? GET_DATA_POINT(fx, fy)
- : zero;
+ const T img_fxfy = (fx >= 0 && fy >= 0) ? GET_DATA_POINT(fx, fy) : zero;
const T img_cxcy = (cx <= data_width - 1 && cy <= data_height - 1)
- ? GET_DATA_POINT(cx, cy)
- : zero;
+ ? GET_DATA_POINT(cx, cy)
+ : zero;
- const T img_fxcy = (fx >= 0 && cy <= data_height - 1)
- ? GET_DATA_POINT(fx, cy)
- : zero;
+ const T img_fxcy =
+ (fx >= 0 && cy <= data_height - 1) ? GET_DATA_POINT(fx, cy) : zero;
- const T img_cxfy = (cx <= data_width - 1 && fy >= 0)
- ? GET_DATA_POINT(cx, fy)
- : zero;
+ const T img_cxfy =
+ (cx <= data_width - 1 && fy >= 0) ? GET_DATA_POINT(cx, fy) : zero;
// Update partial gradients wrt relevant warp field entries
atomicAdd(grad_warp + warp_id_x,
@@ -241,7 +219,7 @@ __global__ void ResamplerGrad2DKernel(const T* __restrict__ data,
}
if (cx <= data_width - 1 && cy <= data_height - 1) {
UPDATE_GRAD_DATA_POINT(cx, cy,
- grad_output_value * (one - dx) * (one - dy));
+ grad_output_value * (one - dx) * (one - dy));
}
if (fx >= 0 && cy <= data_height - 1) {
UPDATE_GRAD_DATA_POINT(fx, cy, grad_output_value * dx * (one - dy));
@@ -261,43 +239,37 @@ __global__ void ResamplerGrad2DKernel(const T* __restrict__ data,
namespace functor {
template <typename T>
-struct ResamplerGrad2DFunctor<GPUDevice, T>{
- void operator ()(::tensorflow::OpKernelContext* ctx,
- const GPUDevice& d,
- const T* __restrict__ data,
- const T* __restrict__ warp,
- const T* __restrict__ grad_output,
- T* __restrict__ grad_data,
- T* __restrict__ grad_warp,
- const int batch_size,
- const int data_height,
- const int data_width,
- const int data_channels,
- const int num_sampling_points) {
- // Set gradients to 0, because the kernel incrementally updates the
- // tensor entries by adding partial contributions.
- const int grad_warp_size = batch_size * num_sampling_points * 2;
- const int grad_data_size = batch_size * data_height * data_width *
- data_channels;
-
- ::tensorflow::CudaLaunchConfig config =
- ::tensorflow::GetCudaLaunchConfig(grad_warp_size, d);
- ::tensorflow::SetZero
- <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
- grad_warp_size, grad_warp);
-
- config = ::tensorflow::GetCudaLaunchConfig(grad_data_size, d);
- ::tensorflow::SetZero
- <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
- grad_data_size, grad_data);
-
- const int resampler_output_size = batch_size * num_sampling_points *
- data_channels;
- config = ::tensorflow::GetCudaLaunchConfig(resampler_output_size, d);
- ResamplerGrad2DKernel<T>
- <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
- data, warp, grad_output, grad_data, grad_warp, batch_size,
- data_height, data_width, data_channels, num_sampling_points);
+struct ResamplerGrad2DFunctor<GPUDevice, T> {
+ void operator()(::tensorflow::OpKernelContext* ctx, const GPUDevice& d,
+ const T* __restrict__ data, const T* __restrict__ warp,
+ const T* __restrict__ grad_output, T* __restrict__ grad_data,
+ T* __restrict__ grad_warp, const int batch_size,
+ const int data_height, const int data_width,
+ const int data_channels, const int num_sampling_points) {
+ // Set gradients to 0, because the kernel incrementally updates the
+ // tensor entries by adding partial contributions.
+ const int grad_warp_size = batch_size * num_sampling_points * 2;
+ const int grad_data_size =
+ batch_size * data_height * data_width * data_channels;
+
+ ::tensorflow::CudaLaunchConfig config =
+ ::tensorflow::GetCudaLaunchConfig(grad_warp_size, d);
+ ::tensorflow::
+ SetZero<<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+ grad_warp_size, grad_warp);
+
+ config = ::tensorflow::GetCudaLaunchConfig(grad_data_size, d);
+ ::tensorflow::
+ SetZero<<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+ grad_data_size, grad_data);
+
+ const int resampler_output_size =
+ batch_size * num_sampling_points * data_channels;
+ config = ::tensorflow::GetCudaLaunchConfig(resampler_output_size, d);
+ ResamplerGrad2DKernel<T>
+ <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+ data, warp, grad_output, grad_data, grad_warp, batch_size,
+ data_height, data_width, data_channels, num_sampling_points);
}
};
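
Note that the two SetZero launches and ResamplerGrad2DKernel are all enqueued on d.stream(); work on a single CUDA stream executes in issue order, so the zero-fills are guaranteed to finish before the gradient kernel starts accumulating, with no explicit synchronization. A sketch of the same guarantee with hypothetical kernels:

__global__ void Zero(float* b, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) b[i] = 0.f;
}
__global__ void AddOne(float* b, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) b[i] += 1.f;
}
// On one stream, launch order is execution order:
//   Zero<<<blocks, threads, 0, stream>>>(buf, n);
//   AddOne<<<blocks, threads, 0, stream>>>(buf, n);  // sees the zeroed buf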