author     A. Unique TensorFlower <gardener@tensorflow.org>  2018-01-30 10:43:03 -0800
committer  TensorFlower Gardener <gardener@tensorflow.org>   2018-01-30 12:33:54 -0800
commit     4463d105a8a4a83642b9709ba79310e8f4ddf577 (patch)
tree       240e9a0a9a6b9ad956c704776a33126ba00cbfe8 /tensorflow/contrib/resampler
parent     8f0e7207774279f4fe50f4d6c4fbd576e2941463 (diff)
Cleanup: Ran clang-format on all *.{cc,h} files in tensorflow/contrib/...
PiperOrigin-RevId: 183855242
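The change is formatting-only; no behavior changes. For illustration (the functions below are made-up names, not taken from this commit), clang-format under the Google style TensorFlow uses packs one-parameter-per-line signatures up to the 80-column limit, tightens `operator ()` to `operator()`, and turns `){` into `) {`:

    // Hypothetical example, for illustration only.
    // Before formatting: one parameter per line, "){" brace placement.
    int AreaBefore(const int data_height,
                   const int data_width){
      return data_height * data_width;
    }

    // After clang-format (Google style): parameters packed, ") {".
    int AreaAfter(const int data_height, const int data_width) {
      return data_height * data_width;
    }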
Diffstat (limited to 'tensorflow/contrib/resampler')

 tensorflow/contrib/resampler/kernels/resampler_ops.cc
 tensorflow/contrib/resampler/kernels/resampler_ops.h
 tensorflow/contrib/resampler/kernels/resampler_ops_gpu.cu.cc

3 files changed, 180 insertions(+), 270 deletions(-)
diff --git a/tensorflow/contrib/resampler/kernels/resampler_ops.cc b/tensorflow/contrib/resampler/kernels/resampler_ops.cc
index e02c1b6a2b..63c72836d7 100644
--- a/tensorflow/contrib/resampler/kernels/resampler_ops.cc
+++ b/tensorflow/contrib/resampler/kernels/resampler_ops.cc
@@ -36,17 +36,12 @@ using GPUDevice = Eigen::GpuDevice;
 namespace functor {
 
 template <typename T>
-struct Resampler2DFunctor<CPUDevice, T>{
-  void operator ()(::tensorflow::OpKernelContext* ctx,
-                   const CPUDevice& d,
-                   const T* __restrict__ data,
-                   const T* __restrict__ warp,
-                   T* __restrict__ output,
-                   const int batch_size,
-                   const int data_height,
-                   const int data_width,
-                   const int data_channels,
-                   const int num_sampling_points){
+struct Resampler2DFunctor<CPUDevice, T> {
+  void operator()(::tensorflow::OpKernelContext* ctx, const CPUDevice& d,
+                  const T* __restrict__ data, const T* __restrict__ warp,
+                  T* __restrict__ output, const int batch_size,
+                  const int data_height, const int data_width,
+                  const int data_channels, const int num_sampling_points) {
     const int warp_batch_stride = num_sampling_points * 2;
     const int data_batch_stride = data_height * data_width * data_channels;
     const int output_batch_stride = num_sampling_points * data_channels;
@@ -59,24 +54,19 @@ struct Resampler2DFunctor<CPUDevice, T>{
       // The functions take care of performing the relevant pointer
       // arithmetics abstracting away the low level details in the
       // main loop over samples. Note that data is stored in NHWC format.
-      auto set_output = [&](const int sample_id,
-                            const int channel,
+      auto set_output = [&](const int sample_id, const int channel,
                             const T value) {
-        output[batch_id * output_batch_stride +
-               sample_id * data_channels +
+        output[batch_id * output_batch_stride + sample_id * data_channels +
                channel] = value;
       };
 
-      auto get_data_point = [&](const int x,
-                                const int y,
-                                const int chan) {
+      auto get_data_point = [&](const int x, const int y, const int chan) {
        const bool point_is_in_range =
            (x >= 0 && y >= 0 && x <= data_width - 1 && y <= data_height - 1);
        return point_is_in_range
-               ? data[batch_id * data_batch_stride +
-                      data_channels * (y * data_width + x) +
-                      chan]
-               : zero;
+                   ? data[batch_id * data_batch_stride +
+                          data_channels * (y * data_width + x) + chan]
+                   : zero;
      };
 
       for (int sample_id = 0; sample_id < num_sampling_points; ++sample_id) {
@@ -89,8 +79,7 @@ struct Resampler2DFunctor<CPUDevice, T>{
         // The effect is that the sampled signal smoothly goes to 0 outside
         // the original input domain, rather than presenting a jump
         // discontinuity at the image boundaries.
-        if (x > static_cast<T>(-1.0) &&
-            y > static_cast<T>(-1.0) &&
+        if (x > static_cast<T>(-1.0) && y > static_cast<T>(-1.0) &&
             x < static_cast<T>(data_width) &&
             y < static_cast<T>(data_height)) {
           // Precompute floor (f) and ceil (c) values for x and y.
@@ -103,12 +92,10 @@ struct Resampler2DFunctor<CPUDevice, T>{
 
           for (int chan = 0; chan < data_channels; ++chan) {
             const T img_fxfy = dx * dy * get_data_point(fx, fy, chan);
-            const T img_cxcy = (one - dx) * (one - dy) *
-                               get_data_point(cx, cy, chan);
-            const T img_fxcy = dx * (one - dy) *
-                               get_data_point(fx, cy, chan);
-            const T img_cxfy = (one - dx) * dy *
-                               get_data_point(cx, fy, chan);
+            const T img_cxcy =
+                (one - dx) * (one - dy) * get_data_point(cx, cy, chan);
+            const T img_fxcy = dx * (one - dy) * get_data_point(fx, cy, chan);
+            const T img_cxfy = (one - dx) * dy * get_data_point(cx, fy, chan);
             set_output(sample_id, chan,
                        img_fxfy + img_cxcy + img_fxcy + img_cxfy);
           }
@@ -125,8 +112,8 @@ struct Resampler2DFunctor<CPUDevice, T>{
     // estimate of the cost of each work unit is needed to correctly shard the
     // workload. Shard assumes each cost unit is 1ns, minimum cost per shard
     // being 10us.
-    const int64 cost = static_cast<int64>(num_sampling_points) *
-                       data_channels * 1000;
+    const int64 cost =
+        static_cast<int64>(num_sampling_points) * data_channels * 1000;
     auto worker_threads = *(ctx->device()->tensorflow_cpu_worker_threads());
     ::tensorflow::Shard(worker_threads.num_threads, worker_threads.workers,
                         batch_size, cost, resample_batches);
@@ -138,8 +125,8 @@ struct Resampler2DFunctor<CPUDevice, T>{
 template <typename Device, typename T>
 class ResamplerOp : public ::tensorflow::OpKernel {
  public:
-  explicit ResamplerOp(::tensorflow::OpKernelConstruction* context) :
-      ::tensorflow::OpKernel(context) {}
+  explicit ResamplerOp(::tensorflow::OpKernelConstruction* context)
+      : ::tensorflow::OpKernel(context) {}
 
   void Compute(::tensorflow::OpKernelContext* ctx) override {
     const ::tensorflow::Tensor& data = ctx->input(0);
@@ -158,16 +145,17 @@ class ResamplerOp : public ::tensorflow::OpKernel {
                 ::tensorflow::errors::InvalidArgument(
                     "warp should be at least a matrix, got shape ",
                     warp_shape.DebugString()));
-    OP_REQUIRES(ctx, warp_shape.dim_size(warp_shape.dims()-1) == 2,
+    OP_REQUIRES(ctx, warp_shape.dim_size(warp_shape.dims() - 1) == 2,
                 ::tensorflow::errors::Unimplemented(
                     "Only bilinear interpolation is supported, warping "
                     "coordinates must be 2D; warp shape last entry should be "
-                    "2, but shape vector is: ", warp_shape.DebugString()));
+                    "2, but shape vector is: ",
+                    warp_shape.DebugString()));
     OP_REQUIRES(ctx, data_shape.dim_size(0) == warp_shape.dim_size(0),
                 ::tensorflow::errors::InvalidArgument(
                     "Batch size of data and warp tensor must be the same, but "
-                    "input shapes are: ", data_shape.DebugString(), ", ",
-                    warp_shape.DebugString()));
+                    "input shapes are: ",
+                    data_shape.DebugString(), ", ", warp_shape.DebugString()));
     const int batch_size = data_shape.dim_size(0);
     const int data_height = data_shape.dim_size(1);
     const int data_width = data_shape.dim_size(2);
@@ -180,16 +168,10 @@ class ResamplerOp : public ::tensorflow::OpKernel {
 
     // Execute kernel only for nonempty output; otherwise Eigen crashes on GPU.
     if (num_sampling_points > 0) {
-      functor::Resampler2DFunctor<Device, T>()(ctx,
-                                               ctx->eigen_device<Device>(),
-                                               data.flat<T>().data(),
-                                               warp.flat<T>().data(),
-                                               output->flat<T>().data(),
-                                               batch_size,
-                                               data_height,
-                                               data_width,
-                                               data_channels,
-                                               num_sampling_points);
+      functor::Resampler2DFunctor<Device, T>()(
+          ctx, ctx->eigen_device<Device>(), data.flat<T>().data(),
+          warp.flat<T>().data(), output->flat<T>().data(), batch_size,
+          data_height, data_width, data_channels, num_sampling_points);
     }
   }
@@ -197,12 +179,9 @@ class ResamplerOp : public ::tensorflow::OpKernel {
   TF_DISALLOW_COPY_AND_ASSIGN(ResamplerOp);
 };
 
-
-#define REGISTER(TYPE)                \
-  REGISTER_KERNEL_BUILDER(            \
-      Name("Resampler")               \
-          .Device(DEVICE_CPU)         \
-          .TypeConstraint<TYPE>("T"), \
+#define REGISTER(TYPE)                                                \
+  REGISTER_KERNEL_BUILDER(                                            \
+      Name("Resampler").Device(DEVICE_CPU).TypeConstraint<TYPE>("T"), \
       ResamplerOp<CPUDevice, TYPE>);
 
 TF_CALL_half(REGISTER);
@@ -211,40 +190,32 @@ TF_CALL_double(REGISTER);
 #undef REGISTER
 
 #if GOOGLE_CUDA
-#define REGISTER(TYPE)                                    \
-  REGISTER_KERNEL_BUILDER(Name("Resampler")               \
-                              .Device(DEVICE_GPU)         \
-                              .TypeConstraint<TYPE>("T"), \
-                          ResamplerOp<GPUDevice, TYPE>)
+#define REGISTER(TYPE)                                                \
+  REGISTER_KERNEL_BUILDER(                                            \
+      Name("Resampler").Device(DEVICE_GPU).TypeConstraint<TYPE>("T"), \
+      ResamplerOp<GPUDevice, TYPE>)
 
 TF_CALL_float(REGISTER);
 TF_CALL_double(REGISTER);
 #undef REGISTER
 #endif  // GOOGLE_CUDA
 
-
 namespace functor {
 template <typename T>
-struct ResamplerGrad2DFunctor<CPUDevice, T>{
-  void operator ()(::tensorflow::OpKernelContext* ctx,
-                   const CPUDevice& d,
-                   const T* __restrict__ data,
-                   const T* __restrict__ warp,
-                   const T* __restrict__ grad_output,
-                   T* __restrict__ grad_data,
-                   T* __restrict__ grad_warp,
-                   const int batch_size,
-                   const int data_height,
-                   const int data_width,
-                   const int data_channels,
-                   const int num_sampling_points){
+struct ResamplerGrad2DFunctor<CPUDevice, T> {
+  void operator()(::tensorflow::OpKernelContext* ctx, const CPUDevice& d,
+                  const T* __restrict__ data, const T* __restrict__ warp,
+                  const T* __restrict__ grad_output, T* __restrict__ grad_data,
+                  T* __restrict__ grad_warp, const int batch_size,
+                  const int data_height, const int data_width,
+                  const int data_channels, const int num_sampling_points) {
     // Set gradients to 0, because the kernel incrementally updates the
     // tensor entries by adding partial contributions.
-    const int resampler_output_size = batch_size * num_sampling_points *
-                                      data_channels;
+    const int resampler_output_size =
+        batch_size * num_sampling_points * data_channels;
     const int grad_warp_size = resampler_output_size / data_channels * 2;
-    const int grad_data_size = data_height * data_width * data_channels *
-                               batch_size;
+    const int grad_data_size =
+        data_height * data_width * data_channels * batch_size;
 
     memset(grad_data, 0, sizeof(T) * grad_data_size);
     memset(grad_warp, 0, sizeof(T) * grad_warp_size);
@@ -260,35 +231,29 @@ struct ResamplerGrad2DFunctor<CPUDevice, T>{
       // The functions take care of performing the relevant pointer
       // arithmetics abstracting away the low level details in the
       // main loop over samples. Note that data is stored in NHWC format.
-      auto get_data_point = [&](const int x,
-                                const int y,
-                                const int chan) {
+      auto get_data_point = [&](const int x, const int y, const int chan) {
        const bool point_is_in_range =
-          (x >= 0 && y >= 0 && x <= data_width - 1 && y <= data_height - 1);
+            (x >= 0 && y >= 0 && x <= data_width - 1 && y <= data_height - 1);
        return point_is_in_range
-               ? data[batch_id * data_batch_stride +
-                      data_channels * (y * data_width + x) +
-                      chan]
-               : zero;
+                   ? data[batch_id * data_batch_stride +
+                          data_channels * (y * data_width + x) + chan]
+                   : zero;
      };
 
       auto update_grad_data = [&](const int x, const int y, const int chan,
                                   const T value) {
         const bool point_is_in_range =
             (x >= 0 && y >= 0 && x <= data_width - 1 && y <= data_height - 1);
-        if (point_is_in_range){
+        if (point_is_in_range) {
           grad_data[batch_id * data_batch_stride +
-                    data_channels * (y * data_width + x) +
-                    chan] += value;
+                    data_channels * (y * data_width + x) + chan] += value;
         }
       };
 
-      auto update_grad_warp = [&](const int sample_id,
-                                  const int channel,
+      auto update_grad_warp = [&](const int sample_id, const int channel,
                                   const T value) {
-        grad_warp[batch_id * warp_batch_stride +
-                  sample_id * 2 +
-                  channel] += value;
+        grad_warp[batch_id * warp_batch_stride + sample_id * 2 + channel] +=
+            value;
       };
 
       for (int sample_id = 0; sample_id < num_sampling_points; ++sample_id) {
@@ -301,8 +266,7 @@ struct ResamplerGrad2DFunctor<CPUDevice, T>{
         // The effect is that the sampled signal smoothly goes to 0 outside
         // the original input domain, rather than presenting a jump
         // discontinuity at the image boundaries.
-        if (x > static_cast<T>(-1.0) &&
-            y > static_cast<T>(-1.0) &&
+        if (x > static_cast<T>(-1.0) && y > static_cast<T>(-1.0) &&
             x < static_cast<T>(data_width) &&
             y < static_cast<T>(data_height)) {
           // Precompute floor (f) and ceil (c) values for x and y.
@@ -316,27 +280,25 @@ struct ResamplerGrad2DFunctor<CPUDevice, T>{
           for (int chan = 0; chan < data_channels; ++chan) {
             const T grad_output_value =
                 grad_output[batch_id * output_batch_stride +
-                            sample_id * data_channels +
-                            chan];
+                            sample_id * data_channels + chan];
             const T img_fxfy = get_data_point(fx, fy, chan);
             const T img_cxcy = get_data_point(cx, cy, chan);
             const T img_fxcy = get_data_point(fx, cy, chan);
             const T img_cxfy = get_data_point(cx, fy, chan);
 
             // Update partial gradients wrt relevant warp field entries
-            update_grad_warp(sample_id, 0,
-                             grad_output_value *
-                             ((one - dy) * (img_cxcy - img_fxcy) +
-                              dy * (img_cxfy - img_fxfy)));
+            update_grad_warp(
+                sample_id, 0,
+                grad_output_value * ((one - dy) * (img_cxcy - img_fxcy) +
+                                     dy * (img_cxfy - img_fxfy)));
 
-            update_grad_warp(sample_id, 1,
-                             grad_output_value *
-                             ((one - dx) * (img_cxcy - img_cxfy) +
-                              dx * (img_fxcy - img_fxfy)));
+            update_grad_warp(
+                sample_id, 1,
+                grad_output_value * ((one - dx) * (img_cxcy - img_cxfy) +
+                                     dx * (img_fxcy - img_fxfy)));
 
             // Update partial gradients wrt sampled data
-            update_grad_data(fx, fy, chan,
-                             grad_output_value * dx * dy);
+            update_grad_data(fx, fy, chan, grad_output_value * dx * dy);
             update_grad_data(cx, cy, chan,
                              grad_output_value * (one - dx) * (one - dy));
             update_grad_data(fx, cy, chan,
@@ -355,8 +317,8 @@ struct ResamplerGrad2DFunctor<CPUDevice, T>{
     // being 10us.
     // TODO(fviola): Check out if there is a better way of doing this.
     auto worker_threads = *(ctx->device()->tensorflow_cpu_worker_threads());
-    const int64 cost = static_cast<int64>(num_sampling_points) *
-                       data_channels * 1000;
+    const int64 cost =
+        static_cast<int64>(num_sampling_points) * data_channels * 1000;
     ::tensorflow::Shard(worker_threads.num_threads, worker_threads.workers,
                         batch_size, cost, update_grads_for_batches);
   }
@@ -364,12 +326,11 @@ struct ResamplerGrad2DFunctor<CPUDevice, T>{
 }  // namespace functor
 
-
 template <typename Device, typename T>
 class ResamplerGradOp : public ::tensorflow::OpKernel {
  public:
-  explicit ResamplerGradOp(::tensorflow::OpKernelConstruction* context) :
-      ::tensorflow::OpKernel(context) {}
+  explicit ResamplerGradOp(::tensorflow::OpKernelConstruction* context)
+      : ::tensorflow::OpKernel(context) {}
 
   void Compute(::tensorflow::OpKernelContext* ctx) override {
     const ::tensorflow::Tensor& data = ctx->input(0);
@@ -383,7 +344,7 @@ class ResamplerGradOp : public ::tensorflow::OpKernel {
                     "tensor must be a batch of 2d data; data shape should have "
                     "4 entries corresponding to [batch_size, data_height, "
                     "data_width, data_channels], but is: ",
-                     data_shape.DebugString()));
+                    data_shape.DebugString()));
     const int batch_size = data_shape.dim_size(0);
     const int data_height = data_shape.dim_size(1);
     const int data_width = data_shape.dim_size(2);
@@ -394,7 +355,7 @@ class ResamplerGradOp : public ::tensorflow::OpKernel {
                 ::tensorflow::errors::InvalidArgument(
                     "warp should be at least a matrix, got shape ",
                     warp_shape.DebugString()));
-    OP_REQUIRES(ctx, warp_shape.dim_size(warp_shape.dims()-1) == 2,
+    OP_REQUIRES(ctx, warp_shape.dim_size(warp_shape.dims() - 1) == 2,
                 ::tensorflow::errors::Unimplemented(
                     "Only bilinear interpolation is supported, warping "
                     "coordinates must be 2D; warp shape last entry should be "
@@ -417,18 +378,11 @@ class ResamplerGradOp : public ::tensorflow::OpKernel {
     OP_REQUIRES_OK(ctx, ctx->allocate_output(1, warp.shape(), &grad_warp));
 
     // Execute kernel only for nonempty output; otherwise Eigen crashes on GPU.
     if (num_sampling_points > 0) {
-      functor::ResamplerGrad2DFunctor<Device, T>()(ctx,
-                                                   ctx->eigen_device<Device>(),
-                                                   data.flat<T>().data(),
-                                                   warp.flat<T>().data(),
-                                                   grad_output.flat<T>().data(),
-                                                   grad_data->flat<T>().data(),
-                                                   grad_warp->flat<T>().data(),
-                                                   batch_size,
-                                                   data_height,
-                                                   data_width,
-                                                   data_channels,
-                                                   num_sampling_points);
+      functor::ResamplerGrad2DFunctor<Device, T>()(
+          ctx, ctx->eigen_device<Device>(), data.flat<T>().data(),
+          warp.flat<T>().data(), grad_output.flat<T>().data(),
+          grad_data->flat<T>().data(), grad_warp->flat<T>().data(), batch_size,
+          data_height, data_width, data_channels, num_sampling_points);
     }
   }
@@ -436,11 +390,9 @@ class ResamplerGradOp : public ::tensorflow::OpKernel {
   TF_DISALLOW_COPY_AND_ASSIGN(ResamplerGradOp);
 };
 
-#define REGISTER(TYPE)                \
-  REGISTER_KERNEL_BUILDER(            \
-      Name("ResamplerGrad")           \
-          .Device(DEVICE_CPU)         \
-          .TypeConstraint<TYPE>("T"), \
+#define REGISTER(TYPE)                                                    \
+  REGISTER_KERNEL_BUILDER(                                                \
+      Name("ResamplerGrad").Device(DEVICE_CPU).TypeConstraint<TYPE>("T"), \
      ResamplerGradOp<CPUDevice, TYPE>);
 
 TF_CALL_half(REGISTER);
@@ -449,11 +401,10 @@ TF_CALL_double(REGISTER);
 #undef REGISTER
 
 #if GOOGLE_CUDA
-#define REGISTER(TYPE)                                        \
-  REGISTER_KERNEL_BUILDER(Name("ResamplerGrad")               \
-                              .Device(DEVICE_GPU)             \
-                              .TypeConstraint<TYPE>("T"),     \
-                          ResamplerGradOp<GPUDevice, TYPE>)
+#define REGISTER(TYPE)                                                    \
+  REGISTER_KERNEL_BUILDER(                                                \
+      Name("ResamplerGrad").Device(DEVICE_GPU).TypeConstraint<TYPE>("T"), \
+      ResamplerGradOp<GPUDevice, TYPE>)
 // Disable half and double precision since atomicAdds are not supported
 // TF_CALL_half(REGISTER);
 // TF_CALL_double(REGISTER);
diff --git a/tensorflow/contrib/resampler/kernels/resampler_ops.h b/tensorflow/contrib/resampler/kernels/resampler_ops.h
index 85d3676efa..7fe3b9c0df 100644
--- a/tensorflow/contrib/resampler/kernels/resampler_ops.h
+++ b/tensorflow/contrib/resampler/kernels/resampler_ops.h
@@ -29,38 +29,25 @@ namespace functor {
 
 // Helper functor for the Resampler Op in 2D
 template <typename Device, typename T>
-struct Resampler2DFunctor{
-  void operator ()(::tensorflow::OpKernelContext* ctx,
-                   const Device& d,
-                   const T* __restrict__ data,
-                   const T* __restrict__ warp,
-                   T* __restrict__ output,
-                   const int batch_size,
-                   const int data_height,
-                   const int data_width,
-                   const int data_channels,
-                   const int num_sampling_points);
+struct Resampler2DFunctor {
+  void operator()(::tensorflow::OpKernelContext* ctx, const Device& d,
+                  const T* __restrict__ data, const T* __restrict__ warp,
+                  T* __restrict__ output, const int batch_size,
+                  const int data_height, const int data_width,
+                  const int data_channels, const int num_sampling_points);
 };
 
-
 // Helper functor for the Resampler Gradient Op in 2D
 template <typename Device, typename T>
-struct ResamplerGrad2DFunctor{
-  void operator ()(::tensorflow::OpKernelContext* ctx,
-                   const Device& d,
-                   const T* __restrict__ data,
-                   const T* __restrict__ warp,
-                   const T* __restrict__ grad_output,
-                   T* __restrict__ grad_data,
-                   T* __restrict__ grad_warp,
-                   const int batch_size,
-                   const int data_height,
-                   const int data_width,
-                   const int data_channels,
-                   const int num_sampling_points);
+struct ResamplerGrad2DFunctor {
+  void operator()(::tensorflow::OpKernelContext* ctx, const Device& d,
+                  const T* __restrict__ data, const T* __restrict__ warp,
+                  const T* __restrict__ grad_output, T* __restrict__ grad_data,
+                  T* __restrict__ grad_warp, const int batch_size,
+                  const int data_height, const int data_width,
+                  const int data_channels, const int num_sampling_points);
 };
 
-
 }  // namespace functor
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/resampler/kernels/resampler_ops_gpu.cu.cc b/tensorflow/contrib/resampler/kernels/resampler_ops_gpu.cu.cc
index 636847a212..3c07051f68 100644
--- a/tensorflow/contrib/resampler/kernels/resampler_ops_gpu.cu.cc
+++ b/tensorflow/contrib/resampler/kernels/resampler_ops_gpu.cu.cc
@@ -31,18 +31,15 @@ using GPUDevice = Eigen::GpuDevice;
 
 namespace {
-#define GET_DATA_POINT(x, y) \
-  data[batch_id * data_batch_stride + \
-       data_channels * (y * data_width + x) + \
+#define GET_DATA_POINT(x, y)                                                 \
+  data[batch_id * data_batch_stride + data_channels * (y * data_width + x) + \
        chan]
 
 template <typename T>
 __global__ void Resampler2DKernel(const T* __restrict__ data,
                                   const T* __restrict__ warp,
-                                  T* __restrict__ output,
-                                  const int batch_size,
-                                  const int data_height,
-                                  const int data_width,
+                                  T* __restrict__ output, const int batch_size,
+                                  const int data_height, const int data_width,
                                   const int data_channels,
                                   const int num_sampling_points) {
   const int output_data_size =
       batch_size * num_sampling_points * data_channels;
@@ -75,10 +72,8 @@ __global__ void Resampler2DKernel(const T* __restrict__ data,
     // The effect is that the sampled signal smoothly goes to 0 outside
     // the original input domain, rather than presenting a jump
     // discontinuity at the image boundaries.
-    if (x > static_cast<T>(-1.0) &&
-        y > static_cast<T>(-1.0) &&
-        x < static_cast<T>(data_width) &&
-        y < static_cast<T>(data_height)) {
+    if (x > static_cast<T>(-1.0) && y > static_cast<T>(-1.0) &&
+        x < static_cast<T>(data_width) && y < static_cast<T>(data_height)) {
       // Precompute floor (f) and ceil (c) values for x and y.
       const int fx = std::floor(static_cast<float>(x));
       const int fy = std::floor(static_cast<float>(y));
@@ -87,21 +82,20 @@ __global__ void Resampler2DKernel(const T* __restrict__ data,
       const T dx = static_cast<T>(cx) - x;
       const T dy = static_cast<T>(cy) - y;
 
-      const T img_fxfy = (fx >= 0 && fy >= 0)
-                         ? dx * dy * GET_DATA_POINT(fx, fy)
-                         : zero;
+      const T img_fxfy =
+          (fx >= 0 && fy >= 0) ? dx * dy * GET_DATA_POINT(fx, fy) : zero;
 
       const T img_cxcy = (cx <= data_width - 1 && cy <= data_height - 1)
-                         ? (one - dx) * (one - dy) * GET_DATA_POINT(cx, cy)
-                         : zero;
+                             ? (one - dx) * (one - dy) * GET_DATA_POINT(cx, cy)
+                             : zero;
 
      const T img_fxcy = (fx >= 0 && cy <= data_height - 1)
-                         ? dx * (one - dy) * GET_DATA_POINT(fx, cy)
-                         : zero;
+                             ? dx * (one - dy) * GET_DATA_POINT(fx, cy)
+                             : zero;
 
      const T img_cxfy = (cx <= data_width - 1 && fy >= 0)
-                         ? (one - dx) * dy * GET_DATA_POINT(cx, fy)
-                         : zero;
+                             ? (one - dx) * dy * GET_DATA_POINT(cx, fy)
+                             : zero;
 
       output[out_index] = img_fxfy + img_cxcy + img_fxcy + img_cxfy;
     } else {
@@ -115,24 +109,20 @@ __global__ void Resampler2DKernel(const T* __restrict__ data,
 namespace functor {
 
 template <typename T>
-struct Resampler2DFunctor<GPUDevice, T>{
-  void operator ()(::tensorflow::OpKernelContext* ctx,
-                   const GPUDevice& d,
-                   const T* __restrict__ data,
-                   const T* __restrict__ warp,
-                   T* __restrict__ output,
-                   const int batch_size,
-                   const int data_height,
-                   const int data_width,
-                   const int data_channels,
-                   const int num_sampling_points) {
-    const int output_data_size = batch_size * num_sampling_points * data_channels;
-    ::tensorflow::CudaLaunchConfig config =
-        ::tensorflow::GetCudaLaunchConfig(output_data_size, d);
-    Resampler2DKernel<T>
-        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-            data, warp, output, batch_size, data_height, data_width,
-            data_channels, num_sampling_points);
+struct Resampler2DFunctor<GPUDevice, T> {
+  void operator()(::tensorflow::OpKernelContext* ctx, const GPUDevice& d,
+                  const T* __restrict__ data, const T* __restrict__ warp,
+                  T* __restrict__ output, const int batch_size,
+                  const int data_height, const int data_width,
+                  const int data_channels, const int num_sampling_points) {
+    const int output_data_size =
+        batch_size * num_sampling_points * data_channels;
+    ::tensorflow::CudaLaunchConfig config =
+        ::tensorflow::GetCudaLaunchConfig(output_data_size, d);
+    Resampler2DKernel<T>
+        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+            data, warp, output, batch_size, data_height, data_width,
+            data_channels, num_sampling_points);
   }
 };
@@ -145,26 +135,20 @@ template struct Resampler2DFunctor<GPUDevice, double>;
 
 namespace {
 
-#define UPDATE_GRAD_DATA_POINT(x, y, v) \
-  atomicAdd(grad_data + (batch_id * data_batch_stride + \
-                         data_channels * (y * data_width + x) + \
-                         chan), \
+#define UPDATE_GRAD_DATA_POINT(x, y, v)                                \
+  atomicAdd(grad_data + (batch_id * data_batch_stride +                \
+                         data_channels * (y * data_width + x) + chan), \
             v)
 
-
 template <typename T>
-__global__ void ResamplerGrad2DKernel(const T* __restrict__ data,
-                                      const T* __restrict__ warp,
-                                      const T* __restrict__ grad_output,
-                                      T* __restrict__ grad_data,
-                                      T* __restrict__ grad_warp,
-                                      const int batch_size,
-                                      const int data_height,
-                                      const int data_width,
-                                      const int data_channels,
-                                      const int num_sampling_points) {
-  const int resampler_output_size = batch_size * num_sampling_points *
-                                    data_channels;
+__global__ void ResamplerGrad2DKernel(
+    const T* __restrict__ data, const T* __restrict__ warp,
+    const T* __restrict__ grad_output, T* __restrict__ grad_data,
+    T* __restrict__ grad_warp, const int batch_size, const int data_height,
+    const int data_width, const int data_channels,
+    const int num_sampling_points) {
+  const int resampler_output_size =
+      batch_size * num_sampling_points * data_channels;
 
   CUDA_1D_KERNEL_LOOP(index, resampler_output_size) {
     const int out_index = index;
@@ -199,10 +183,8 @@ __global__ void ResamplerGrad2DKernel(const T* __restrict__ data,
     // The effect is that the sampled signal smoothly goes to 0 outside
     // the original input domain, rather than presenting a jump
     // discontinuity at the image boundaries.
-    if (x > static_cast<T>(-1.0) &&
-        y > static_cast<T>(-1.0) &&
-        x < static_cast<T>(data_width) &&
-        y < static_cast<T>(data_height)) {
+    if (x > static_cast<T>(-1.0) && y > static_cast<T>(-1.0) &&
+        x < static_cast<T>(data_width) && y < static_cast<T>(data_height)) {
       // Precompute floor (f) and ceil (c) values for x and y.
       const int fx = std::floor(static_cast<float>(x));
       const int fy = std::floor(static_cast<float>(y));
@@ -211,21 +193,17 @@ __global__ void ResamplerGrad2DKernel(const T* __restrict__ data,
       const T dx = static_cast<T>(cx) - x;
       const T dy = static_cast<T>(cy) - y;
 
-      const T img_fxfy = (fx >= 0 && fy >= 0)
-                         ? GET_DATA_POINT(fx, fy)
-                         : zero;
+      const T img_fxfy = (fx >= 0 && fy >= 0) ? GET_DATA_POINT(fx, fy) : zero;
 
       const T img_cxcy = (cx <= data_width - 1 && cy <= data_height - 1)
-                         ? GET_DATA_POINT(cx, cy)
-                         : zero;
+                             ? GET_DATA_POINT(cx, cy)
+                             : zero;
 
-      const T img_fxcy = (fx >= 0 && cy <= data_height - 1)
-                         ? GET_DATA_POINT(fx, cy)
-                         : zero;
+      const T img_fxcy =
+          (fx >= 0 && cy <= data_height - 1) ? GET_DATA_POINT(fx, cy) : zero;
 
-      const T img_cxfy = (cx <= data_width - 1 && fy >= 0)
-                         ? GET_DATA_POINT(cx, fy)
-                         : zero;
+      const T img_cxfy =
+          (cx <= data_width - 1 && fy >= 0) ? GET_DATA_POINT(cx, fy) : zero;
 
       // Update partial gradients wrt relevant warp field entries
       atomicAdd(grad_warp + warp_id_x,
@@ -241,7 +219,7 @@ __global__ void ResamplerGrad2DKernel(const T* __restrict__ data,
       }
       if (cx <= data_width - 1 && cy <= data_height - 1) {
         UPDATE_GRAD_DATA_POINT(cx, cy,
-                              grad_output_value * (one - dx) * (one - dy));
+                               grad_output_value * (one - dx) * (one - dy));
       }
       if (fx >= 0 && cy <= data_height - 1) {
         UPDATE_GRAD_DATA_POINT(fx, cy, grad_output_value * dx * (one - dy));
@@ -261,43 +239,37 @@ __global__ void ResamplerGrad2DKernel(const T* __restrict__ data,
 namespace functor {
 
 template <typename T>
-struct ResamplerGrad2DFunctor<GPUDevice, T>{
-  void operator ()(::tensorflow::OpKernelContext* ctx,
-                   const GPUDevice& d,
-                   const T* __restrict__ data,
-                   const T* __restrict__ warp,
-                   const T* __restrict__ grad_output,
-                   T* __restrict__ grad_data,
-                   T* __restrict__ grad_warp,
-                   const int batch_size,
-                   const int data_height,
-                   const int data_width,
-                   const int data_channels,
-                   const int num_sampling_points) {
-    // Set gradients to 0, because the kernel incrementally updates the
-    // tensor entries by adding partial contributions.
-    const int grad_warp_size = batch_size * num_sampling_points * 2;
-    const int grad_data_size = batch_size * data_height * data_width *
-                               data_channels;
-
-    ::tensorflow::CudaLaunchConfig config =
-        ::tensorflow::GetCudaLaunchConfig(grad_warp_size, d);
-    ::tensorflow::SetZero
-        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-            grad_warp_size, grad_warp);
-
-    config = ::tensorflow::GetCudaLaunchConfig(grad_data_size, d);
-    ::tensorflow::SetZero
-        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-            grad_data_size, grad_data);
-
-    const int resampler_output_size = batch_size * num_sampling_points *
-                                      data_channels;
-    config = ::tensorflow::GetCudaLaunchConfig(resampler_output_size, d);
-    ResamplerGrad2DKernel<T>
-        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-            data, warp, grad_output, grad_data, grad_warp, batch_size,
-            data_height, data_width, data_channels, num_sampling_points);
+struct ResamplerGrad2DFunctor<GPUDevice, T> {
+  void operator()(::tensorflow::OpKernelContext* ctx, const GPUDevice& d,
+                  const T* __restrict__ data, const T* __restrict__ warp,
+                  const T* __restrict__ grad_output, T* __restrict__ grad_data,
+                  T* __restrict__ grad_warp, const int batch_size,
+                  const int data_height, const int data_width,
+                  const int data_channels, const int num_sampling_points) {
+    // Set gradients to 0, because the kernel incrementally updates the
+    // tensor entries by adding partial contributions.
+    const int grad_warp_size = batch_size * num_sampling_points * 2;
+    const int grad_data_size =
+        batch_size * data_height * data_width * data_channels;
+
+    ::tensorflow::CudaLaunchConfig config =
+        ::tensorflow::GetCudaLaunchConfig(grad_warp_size, d);
+    ::tensorflow::
+        SetZero<<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+            grad_warp_size, grad_warp);
+
+    config = ::tensorflow::GetCudaLaunchConfig(grad_data_size, d);
+    ::tensorflow::
+        SetZero<<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+            grad_data_size, grad_data);
+
+    const int resampler_output_size =
+        batch_size * num_sampling_points * data_channels;
+    config = ::tensorflow::GetCudaLaunchConfig(resampler_output_size, d);
+    ResamplerGrad2DKernel<T>
+        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+            data, warp, grad_output, grad_data, grad_warp, batch_size,
+            data_height, data_width, data_channels, num_sampling_points);
   }
 };
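For readers unfamiliar with these kernels, the computation being reformatted (unchanged by this commit) is plain bilinear sampling with a soft boundary: warp points outside [-1, width) x [-1, height) produce zero, so the sampled signal fades to zero at the image border instead of jumping. A minimal standalone C++ sketch of the forward rule (hypothetical helper, not TensorFlow API):

    #include <cmath>
    #include <vector>

    // Bilinear sample of an H x W x C image stored HWC, following the
    // convention in resampler_ops: fx/fy are floor coordinates, cx/cy are
    // fx+1/fy+1, and dx/dy weight toward the floor corner. Out-of-range
    // neighbors contribute zero.
    float BilinearSample(const std::vector<float>& data, int height, int width,
                         int channels, float x, float y, int chan) {
      if (!(x > -1.0f && y > -1.0f && x < width && y < height)) return 0.0f;
      const int fx = static_cast<int>(std::floor(x));
      const int fy = static_cast<int>(std::floor(y));
      const int cx = fx + 1, cy = fy + 1;
      const float dx = cx - x, dy = cy - y;
      auto get = [&](int px, int py) -> float {
        const bool in_range = px >= 0 && py >= 0 && px < width && py < height;
        return in_range ? data[channels * (py * width + px) + chan] : 0.0f;
      };
      return dx * dy * get(fx, fy) + (1 - dx) * (1 - dy) * get(cx, cy) +
             dx * (1 - dy) * get(fx, cy) + (1 - dx) * dy * get(cx, fy);
    }

The gradient kernels above implement the partial derivatives of this expression: with respect to `warp` via the `(img_* - img_*)` differences, and with respect to `data` via the four corner weights.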