diff options
author: A. Unique TensorFlower <gardener@tensorflow.org> | 2017-02-07 09:34:15 -0800
committer: TensorFlower Gardener <gardener@tensorflow.org> | 2017-02-07 09:54:56 -0800
commit: ab0426e203cd81fb2247475eccdfcec0ddb8bf39 (patch)
tree: 99095ea742171f9a218471eb30a0dbc26bf45ef1 /tensorflow/core/kernels/resize_bilinear_op.cc
parent: 661058e52460b36417573e2c5a73de9a8b9e5edb (diff)
Improve performance of ResizeBilinear:
- use pointer arithmetic instead of accessing data through eigen single-cell
expressions.
- use only the scale_down case, renamed to resize_image. This simplifies the code,
at the cost of slightly worse performance for the scale_up case with one
channel (note though that special-casing single-channel case would make it 3x
faster than the AFTER times below).
- pull some common arithmetic out of loops manually, so that more common
indexing operations don't require multiplication.
- remove some assignments in inner loops.
(the first of these had the largest effect).
Add some more test and benchmark cases. Also fix a typo in the ResizeArea test.
Change: 146795860
Diffstat (limited to 'tensorflow/core/kernels/resize_bilinear_op.cc')
-rw-r--r-- | tensorflow/core/kernels/resize_bilinear_op.cc | 299 |
1 file changed, 81 insertions, 218 deletions
diff --git a/tensorflow/core/kernels/resize_bilinear_op.cc b/tensorflow/core/kernels/resize_bilinear_op.cc index 2c5aeaada4..d9cb993a4b 100644 --- a/tensorflow/core/kernels/resize_bilinear_op.cc +++ b/tensorflow/core/kernels/resize_bilinear_op.cc @@ -72,43 +72,19 @@ struct CachedInterpolation { // 1-D linear iterpolation scale (see: // https://en.wikipedia.org/wiki/Bilinear_interpolation) float lerp; - // How many consecutive points use the same lower & upper indices - int consecutive; }; -enum ImageScalePattern { SCALE_UP, SIMILAR, SCALE_DOWN }; - -inline ImageScalePattern compute_image_scale_pattern(const int64 out_height, - const int64 out_width, - const int64 in_height, - const int64 in_width) { - if (in_height * 2 < out_height || in_width * 2 < out_width) { - return SCALE_UP; - } else if (out_height * 2 < in_height || out_width * 2 < in_width) { - return SCALE_DOWN; - } else { - return SIMILAR; - } -} - -inline void compute_interpolation_weights(const ImageScalePattern scale_pattern, - const int64 out_size, +inline void compute_interpolation_weights(const int64 out_size, const int64 in_size, const float scale, CachedInterpolation* interpolation) { interpolation[out_size].lower = 0; interpolation[out_size].upper = 0; - interpolation[out_size].consecutive = 0; for (int64 i = out_size - 1; i >= 0; --i) { const float in = i * scale; interpolation[i].lower = static_cast<int64>(in); interpolation[i].upper = std::min(interpolation[i].lower + 1, in_size - 1); interpolation[i].lerp = in - interpolation[i].lower; - interpolation[i].consecutive = - interpolation[i + 1].lower == interpolation[i].lower && - interpolation[i + 1].upper == interpolation[i].upper - ? 
interpolation[i + 1].consecutive + 1 - : 1; } } @@ -125,200 +101,97 @@ inline float compute_lerp(const float top_left, const float top_right, } template <typename T> -inline float image_lerp(const T* input_image, int64 in_x_lower, - int64 in_x_upper, float xs_lerp, int64 in_y_lower, - int64 in_y_upper, float ys_lerp, int c) { - const float top_left(input_image[in_y_lower + in_x_lower + c]); - const float top_right(input_image[in_y_lower + in_x_upper + c]); - const float bottom_left(input_image[in_y_upper + in_x_lower + c]); - const float bottom_right(input_image[in_y_upper + in_x_upper + c]); - return compute_lerp(top_left, top_right, bottom_left, bottom_right, xs_lerp, - ys_lerp); -} - -template <typename T> -void scale_down_image( +void resize_image( typename TTypes<T, 4>::ConstTensor images, const int batch_size, - const int64 out_height, const int64 out_width, const int channels, + const int64 in_height, const int64 in_width, const int64 out_height, + const int64 out_width, const int channels, const std::vector<CachedInterpolation>& xs, const std::vector<CachedInterpolation>& ys, typename TTypes<float, 4>::Tensor output) TF_ATTRIBUTE_NOINLINE; template <typename T> -void scale_down_image(typename TTypes<T, 4>::ConstTensor images, - const int batch_size, const int64 out_height, - const int64 out_width, const int channels, - const std::vector<CachedInterpolation>& xs_vec, - const std::vector<CachedInterpolation>& ys, - typename TTypes<float, 4>::Tensor output) { - // Do not eagerly convert all input data points, as we ignore most. 
+void resize_image(typename TTypes<T, 4>::ConstTensor images, + const int batch_size, const int64 in_height, + const int64 in_width, const int64 out_height, + const int64 out_width, const int channels, + const std::vector<CachedInterpolation>& xs_vec, + const std::vector<CachedInterpolation>& ys, + typename TTypes<float, 4>::Tensor output) { + const int64 in_row_size = in_width * channels; + const int64 in_batch_num_values = in_height * in_row_size; + const int64 out_row_size = out_width * channels; + + const T* input_b_ptr = images.data(); + const CachedInterpolation* xs = xs_vec.data(); + if (channels == 3) { + float* output_y_ptr = output.data(); for (int b = 0; b < batch_size; ++b) { - // Compute the interpolation for (int64 y = 0; y < out_height; ++y) { - const int64 ys_lower = ys[y].lower; - const int64 ys_upper = ys[y].upper; + const T* ys_input_lower_ptr = input_b_ptr + ys[y].lower * in_row_size; + const T* ys_input_upper_ptr = input_b_ptr + ys[y].upper * in_row_size; const float ys_lerp = ys[y].lerp; - const CachedInterpolation* xs_ptr = xs_vec.data(); for (int64 x = 0; x < out_width; ++x) { - const int64 xs_lower = xs_ptr->lower; - const int64 xs_upper = xs_ptr->upper; - const float xs_lerp = xs_ptr->lerp; - xs_ptr++; - - const float top_left0(images(b, ys_lower, xs_lower, 0)); - const float top_right0(images(b, ys_lower, xs_upper, 0)); - const float bottom_left0(images(b, ys_upper, xs_lower, 0)); - const float bottom_right0(images(b, ys_upper, xs_upper, 0)); - const float out0 = compute_lerp(top_left0, top_right0, bottom_left0, - bottom_right0, xs_lerp, ys_lerp); - - const float top_left1(images(b, ys_lower, xs_lower, 1)); - const float top_right1(images(b, ys_lower, xs_upper, 1)); - const float bottom_left1(images(b, ys_upper, xs_lower, 1)); - const float bottom_right1(images(b, ys_upper, xs_upper, 1)); - const float out1 = compute_lerp(top_left1, top_right1, bottom_left1, - bottom_right1, xs_lerp, ys_lerp); - - const float top_left2(images(b, ys_lower, 
xs_lower, 2)); - const float top_right2(images(b, ys_lower, xs_upper, 2)); - const float bottom_left2(images(b, ys_upper, xs_lower, 2)); - const float bottom_right2(images(b, ys_upper, xs_upper, 2)); - const float out2 = compute_lerp(top_left2, top_right2, bottom_left2, - bottom_right2, xs_lerp, ys_lerp); - - float* dest = &output(b, y, x, 0); - dest[0] = out0; - dest[1] = out1; - dest[2] = out2; + const int64 xs_lower = xs[x].lower; + const int64 xs_upper = xs[x].upper; + const float xs_lerp = xs[x].lerp; + + // Read channel 0. + const float top_left0(ys_input_lower_ptr[xs_lower + 0]); + const float top_right0(ys_input_lower_ptr[xs_upper + 0]); + const float bottom_left0(ys_input_upper_ptr[xs_lower + 0]); + const float bottom_right0(ys_input_upper_ptr[xs_upper + 0]); + + // Read channel 1. + const float top_left1(ys_input_lower_ptr[xs_lower + 1]); + const float top_right1(ys_input_lower_ptr[xs_upper + 1]); + const float bottom_left1(ys_input_upper_ptr[xs_lower + 1]); + const float bottom_right1(ys_input_upper_ptr[xs_upper + 1]); + + // Read channel 2. + const float top_left2(ys_input_lower_ptr[xs_lower + 2]); + const float top_right2(ys_input_lower_ptr[xs_upper + 2]); + const float bottom_left2(ys_input_upper_ptr[xs_lower + 2]); + const float bottom_right2(ys_input_upper_ptr[xs_upper + 2]); + + // Compute output. 
+ output_y_ptr[x * channels + 0] = + compute_lerp(top_left0, top_right0, bottom_left0, bottom_right0, + xs_lerp, ys_lerp); + output_y_ptr[x * channels + 1] = + compute_lerp(top_left1, top_right1, bottom_left1, bottom_right1, + xs_lerp, ys_lerp); + output_y_ptr[x * channels + 2] = + compute_lerp(top_left2, top_right2, bottom_left2, bottom_right2, + xs_lerp, ys_lerp); } + output_y_ptr += out_row_size; } + input_b_ptr += in_batch_num_values; } } else { + float* output_y_ptr = output.data(); for (int b = 0; b < batch_size; ++b) { - // Compute the interpolation for (int64 y = 0; y < out_height; ++y) { - const CachedInterpolation* xs = xs_vec.data(); + const T* ys_input_lower_ptr = input_b_ptr + ys[y].lower * in_row_size; + const T* ys_input_upper_ptr = input_b_ptr + ys[y].upper * in_row_size; + const float ys_lerp = ys[y].lerp; for (int64 x = 0; x < out_width; ++x) { + auto xs_lower = xs[x].lower; + auto xs_upper = xs[x].upper; + auto xs_lerp = xs[x].lerp; for (int c = 0; c < channels; ++c) { - const float top_left(images(b, ys[y].lower, xs[x].lower, c)); - const float top_right(images(b, ys[y].lower, xs[x].upper, c)); - const float bottom_left(images(b, ys[y].upper, xs[x].lower, c)); - const float bottom_right(images(b, ys[y].upper, xs[x].upper, c)); - output(b, y, x, c) = - compute_lerp(top_left, top_right, bottom_left, bottom_right, - xs[x].lerp, ys[y].lerp); - } - } - } - } - } -} - -template <typename T> -void scale_up_image( - const T* input_image, const int batch_index, const int64 out_height, - const int64 out_width, const int channels, const int64 in_height, - const int64 in_width, const std::vector<CachedInterpolation>& xs, - const std::vector<CachedInterpolation>& ys, - typename TTypes<float, 4>::Tensor output) TF_ATTRIBUTE_NOINLINE; - -template <typename T> -void scale_up_image(const T* input_image, const int batch_index, - const int64 out_height, const int64 out_width, - const int channels, const int64 in_height, - const int64 in_width, - const 
std::vector<CachedInterpolation>& xs, - const std::vector<CachedInterpolation>& ys, - typename TTypes<float, 4>::Tensor output) { - for (int64 y = 0; y < out_height; y += ys[y].consecutive) { - const int64 in_y_lower = ys[y].lower * in_width * channels; - const int64 in_y_upper = ys[y].upper * in_width * channels; - for (int64 x = 0; x < out_width; x += xs[x].consecutive) { - const int64 in_x_lower = xs[x].lower * channels; - const int64 in_x_upper = xs[x].upper * channels; - for (int c = 0; c < channels; ++c) { - const float top_left(input_image[in_y_lower + in_x_lower + c]); - const float top_right(input_image[in_y_lower + in_x_upper + c]); - const float bottom_left(input_image[in_y_upper + in_x_lower + c]); - const float bottom_right(input_image[in_y_upper + in_x_upper + c]); - for (int64 y_inner = y; y_inner < y + ys[y].consecutive; ++y_inner) { - for (int64 x_inner = x; x_inner < x + xs[x].consecutive; ++x_inner) { - output(batch_index, y_inner, x_inner, c) = + const float top_left(ys_input_lower_ptr[xs_lower + c]); + const float top_right(ys_input_lower_ptr[xs_upper + c]); + const float bottom_left(ys_input_upper_ptr[xs_lower + c]); + const float bottom_right(ys_input_upper_ptr[xs_upper + c]); + output_y_ptr[x * channels + c] = compute_lerp(top_left, top_right, bottom_left, bottom_right, - xs[x_inner].lerp, ys[y_inner].lerp); + xs_lerp, ys_lerp); } } + output_y_ptr += out_row_size; } - } - } -} - -template <typename T> -void scale_similar_image( - const T* input_image, const int b, const int64 out_height, - const int64 out_width, const int channels, const int64 in_height, - const int64 in_width, const std::vector<CachedInterpolation>& xs_vec, - const std::vector<CachedInterpolation>& ys, - typename TTypes<float, 4>::Tensor output) TF_ATTRIBUTE_NOINLINE; -template <typename T> -void scale_similar_image(const T* input_image, const int b, - const int64 out_height, const int64 out_width, - const int channels, const int64 in_height, - const int64 in_width, - const 
std::vector<CachedInterpolation>& xs_vec, - const std::vector<CachedInterpolation>& ys, - typename TTypes<float, 4>::Tensor output) { - if (channels == 3) { - // Compute the interpolation - for (int64 y = 0; y < out_height; ++y) { - const int64 in_y_lower = ys[y].lower * in_width * channels; - const int64 in_y_upper = ys[y].upper * in_width * channels; - const float ys_lerp = ys[y].lerp; - // Similar-sized images do not have a set of inner loops. - const CachedInterpolation* xs_ptr = xs_vec.data(); - for (int64 x = 0; x < out_width; ++x) { - const int64 in_x_lower = xs_ptr->lower * 3; - const int64 in_x_upper = xs_ptr->upper * 3; - const float xs_lerp = xs_ptr->lerp; - xs_ptr++; - - const float out0 = - image_lerp(input_image, in_x_lower, in_x_upper, xs_lerp, in_y_lower, - in_y_upper, ys_lerp, 0); - const float out1 = - image_lerp(input_image, in_x_lower, in_x_upper, xs_lerp, in_y_lower, - in_y_upper, ys_lerp, 1); - const float out2 = - image_lerp(input_image, in_x_lower, in_x_upper, xs_lerp, in_y_lower, - in_y_upper, ys_lerp, 2); - float* dest = &output(b, y, x, 0); - dest[0] = out0; - dest[1] = out1; - dest[2] = out2; - } - } - } else { - // Compute the interpolation - for (int64 y = 0; y < out_height; ++y) { - const int64 in_y_lower = ys[y].lower * in_width * channels; - const int64 in_y_upper = ys[y].upper * in_width * channels; - const float ys_lerp = ys[y].lerp; - // Similar-sized images do not have a set of inner loops. 
- const CachedInterpolation* xs_ptr = xs_vec.data(); - for (int64 x = 0; x < out_width; ++x) { - const int64 in_x_lower = xs_ptr->lower * channels; - const int64 in_x_upper = xs_ptr->upper * channels; - const float xs_lerp = xs_ptr->lerp; - xs_ptr++; - for (int c = 0; c < channels; ++c) { - const float top_left(input_image[in_y_lower + in_x_lower + c]); - const float top_right(input_image[in_y_lower + in_x_upper + c]); - const float bottom_left(input_image[in_y_upper + in_x_lower + c]); - const float bottom_right(input_image[in_y_upper + in_x_upper + c]); - output(b, y, x, c) = compute_lerp(top_left, top_right, bottom_left, - bottom_right, xs_lerp, ys_lerp); - } - } + input_b_ptr += in_batch_num_values; } } } @@ -346,32 +219,22 @@ struct ResizeBilinear<CPUDevice, T> { return; } - const ImageScalePattern scale_pattern = - compute_image_scale_pattern(out_height, out_width, in_height, in_width); std::vector<CachedInterpolation> ys(out_height + 1); std::vector<CachedInterpolation> xs(out_width + 1); // Compute the cached interpolation weights on the x and y dimensions. - compute_interpolation_weights(scale_pattern, out_height, in_height, - height_scale, ys.data()); - compute_interpolation_weights(scale_pattern, out_width, in_width, - width_scale, xs.data()); - - if (scale_pattern == SCALE_UP) { - for (int b = 0; b < batch_size; ++b) { - scale_up_image<T>(&images(b, 0, 0, 0), b, out_height, out_width, - channels, in_height, in_width, xs, ys, output); - } - } else if (scale_pattern == SCALE_DOWN) { - // Do not eagerly convert all input data points, as we ignore most. 
- scale_down_image<T>(images, batch_size, out_height, out_width, channels, - xs, ys, output); - } else { - for (int b = 0; b < batch_size; ++b) { - scale_similar_image<T>(&images(b, 0, 0, 0), b, out_height, out_width, - channels, in_height, in_width, xs, ys, output); - } + compute_interpolation_weights(out_height, in_height, height_scale, + ys.data()); + compute_interpolation_weights(out_width, in_width, width_scale, xs.data()); + + // Scale x interpolation weights to avoid a multiplication during iteration. + for (int i = 0; i < xs.size(); ++i) { + xs[i].lower *= channels; + xs[i].upper *= channels; } + + resize_image<T>(images, batch_size, in_height, in_width, out_height, + out_width, channels, xs, ys, output); } }; } // namespace functor |