diff options
Diffstat (limited to 'tensorflow/examples/android/jni/object_tracking/image_neon.cc')
-rw-r--r-- | tensorflow/examples/android/jni/object_tracking/image_neon.cc | 270 |
1 file changed, 270 insertions, 0 deletions
/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// NEON implementations of Image methods for compatible devices. Control
// should never enter this compilation unit on incompatible devices.

#ifdef __ARM_NEON

#include <arm_neon.h>

#include "tensorflow/core/platform/types.h"

#include "tensorflow/examples/android/jni/object_tracking/image-inl.h"
#include "tensorflow/examples/android/jni/object_tracking/image.h"
#include "tensorflow/examples/android/jni/object_tracking/image_utils.h"
#include "tensorflow/examples/android/jni/object_tracking/utils.h"

// Only tensorflow::uint8 is used from the tensorflow namespace; a targeted
// using-declaration avoids pulling the whole namespace into this TU.
using tensorflow::uint8;

namespace tf_tracking {

// Downsamples one 32-pixel-wide column strip of the source image by a factor
// of 2 in each dimension, averaging each 2x2 input block into one output
// pixel, and writes the resulting 16-wide strip into this image.
//
// original: top-left of the source image data.
// stride:   row stride (in bytes) of the source image.
// orig_x:   x offset (in source pixels) of the strip to process; must leave
//           at least 32 readable bytes per row at that offset.
template <>
void Image<uint8>::Downsample2x32ColumnsNeon(const uint8* const original,
                                             const int stride,
                                             const int orig_x) {
  // Divide input x offset by 2 to find output offset.
  const int new_x = orig_x >> 1;

  // Initial offset into top row.
  const uint8* offset = original + orig_x;

  // This points to the leftmost pixel of our 16 horizontally arranged
  // pixels in the destination data.
  uint8* ptr_dst = (*this)[0] + new_x;

  // Sum along vertical columns.
  // Process 32x2 input pixels and 16x1 output pixels per iteration.
  for (int new_y = 0; new_y < height_; ++new_y) {
    uint16x8_t accum1 = vdupq_n_u16(0);
    uint16x8_t accum2 = vdupq_n_u16(0);

    // Go top to bottom across the two rows of input pixels that make up
    // this output row.
    for (int row_num = 0; row_num < 2; ++row_num) {
      // First 16 bytes.
      {
        // Load 16 bytes of data from current offset.
        const uint8x16_t curr_data1 = vld1q_u8(offset);

        // Pairwise add and accumulate into accum vectors (16 bit to account
        // for values above 255).
        accum1 = vpadalq_u8(accum1, curr_data1);
      }

      // Second 16 bytes.
      {
        // Load 16 bytes of data from current offset.
        const uint8x16_t curr_data2 = vld1q_u8(offset + 16);

        // Pairwise add and accumulate into accum vectors (16 bit to account
        // for values above 255).
        accum2 = vpadalq_u8(accum2, curr_data2);
      }

      // Move offset down one row.
      offset += stride;
    }

    // Divide by 4 (number of input pixels per output
    // pixel) and narrow data from 16 bits per pixel to 8 bpp.
    const uint8x8_t tmp_pix1 = vqshrn_n_u16(accum1, 2);
    const uint8x8_t tmp_pix2 = vqshrn_n_u16(accum2, 2);

    // Concatenate 8x1 pixel strips into 16x1 pixel strip.
    const uint8x16_t allpixels = vcombine_u8(tmp_pix1, tmp_pix2);

    // Copy all pixels from composite 16x1 vector into output strip.
    vst1q_u8(ptr_dst, allpixels);

    ptr_dst += stride_;
  }
}

// Downsamples one 32-pixel-wide column strip of the source image by a factor
// of 4 in each dimension, averaging each 4x4 input block into one output
// pixel, and writes the resulting 8-wide strip into this image.
//
// original: top-left of the source image data.
// stride:   row stride (in bytes) of the source image.
// orig_x:   x offset (in source pixels) of the strip to process; must leave
//           at least 32 readable bytes per row at that offset.
template <>
void Image<uint8>::Downsample4x32ColumnsNeon(const uint8* const original,
                                             const int stride,
                                             const int orig_x) {
  // Divide input x offset by 4 to find output offset.
  const int new_x = orig_x >> 2;

  // Initial offset into top row.
  const uint8* offset = original + orig_x;

  // This points to the leftmost pixel of our 8 horizontally arranged
  // pixels in the destination data.
  uint8* ptr_dst = (*this)[0] + new_x;

  // Sum along vertical columns.
  // Process 32x4 input pixels and 8x1 output pixels per iteration.
  for (int new_y = 0; new_y < height_; ++new_y) {
    uint16x8_t accum1 = vdupq_n_u16(0);
    uint16x8_t accum2 = vdupq_n_u16(0);

    // Go top to bottom across the four rows of input pixels that make up
    // this output row.
    for (int row_num = 0; row_num < 4; ++row_num) {
      // First 16 bytes.
      {
        // Load 16 bytes of data from current offset.
        const uint8x16_t curr_data1 = vld1q_u8(offset);

        // Pairwise add and accumulate into accum vectors (16 bit to account
        // for values above 255).
        accum1 = vpadalq_u8(accum1, curr_data1);
      }

      // Second 16 bytes.
      {
        // Load 16 bytes of data from current offset.
        const uint8x16_t curr_data2 = vld1q_u8(offset + 16);

        // Pairwise add and accumulate into accum vectors (16 bit to account
        // for values above 255).
        accum2 = vpadalq_u8(accum2, curr_data2);
      }

      // Move offset down one row.
      offset += stride;
    }

    // Add and widen, then divide by 16 (number of input pixels per output
    // pixel) and narrow data from 32 bits per pixel to 16 bpp.
    const uint16x4_t tmp_pix1 = vqshrn_n_u32(vpaddlq_u16(accum1), 4);
    const uint16x4_t tmp_pix2 = vqshrn_n_u32(vpaddlq_u16(accum2), 4);

    // Combine 4x1 pixel strips into 8x1 pixel strip and narrow from
    // 16 bits to 8 bits per pixel.
    const uint8x8_t allpixels = vmovn_u16(vcombine_u16(tmp_pix1, tmp_pix2));

    // Copy all pixels from composite 8x1 vector into output strip.
    vst1_u8(ptr_dst, allpixels);

    ptr_dst += stride_;
  }
}


// Hardware accelerated downsampling method for supported devices.
// Requires that image size be a multiple of 16 pixels in each dimension,
// and that downsampling be by a factor of 2 or 4.
template <>
void Image<uint8>::DownsampleAveragedNeon(const uint8* const original,
                                          const int stride, const int factor) {
  // TODO(andrewharp): stride is a bad approximation for the src image's width.
  // Better to pass that in directly.
  SCHECK(width_ * factor <= stride, "Uh oh!");
  const int last_starting_index = width_ * factor - 32;

  // We process 32 input pixels lengthwise at a time.
  // The output per pass of this loop is an 8 wide by downsampled height tall
  // pixel strip.
  int orig_x = 0;
  for (; orig_x <= last_starting_index; orig_x += 32) {
    if (factor == 2) {
      Downsample2x32ColumnsNeon(original, stride, orig_x);
    } else {
      Downsample4x32ColumnsNeon(original, stride, orig_x);
    }
  }

  // If a last pass is required, push it to the left enough so that it never
  // goes out of bounds. This will result in some extra computation on devices
  // whose frame widths are multiples of 16 and not 32.
  if (orig_x < last_starting_index + 32) {
    if (factor == 2) {
      Downsample2x32ColumnsNeon(original, stride, last_starting_index);
    } else {
      Downsample4x32ColumnsNeon(original, stride, last_starting_index);
    }
  }
}


// Puts the image gradient matrix about a pixel into the 2x2 float array G.
// vals_x should be an array of the window x gradient values, whose indices
// can be in any order but are parallel to the vals_y entries.
// See http://robots.stanford.edu/cs223b04/algo_tracking.pdf for more details.
//
// NOTE(review): this accumulates with += into G[0], G[1], and G[3]; it
// presumably relies on the caller zero-initializing G first — confirm at the
// call site.
void CalculateGNeon(const float* const vals_x, const float* const vals_y,
                    const int num_vals, float* const G) {
  // Running sums.
  float32x4_t xx = vdupq_n_f32(0.0f);
  float32x4_t xy = vdupq_n_f32(0.0f);
  float32x4_t yy = vdupq_n_f32(0.0f);

  // Maximum index we can load 4 consecutive values from.
  // e.g. if there are 81 values, our last full pass can be from index 77:
  // 81-4=>77 (77, 78, 79, 80)
  const int max_i = num_vals - 4;

  // Defined here because we want to keep track of how many values were
  // processed by NEON, so that we can finish off the remainder the normal
  // way.
  int i = 0;

  // Process values 4 at a time, accumulating the sums of
  // the pixel-wise x*x, x*y, and y*y values.
  // ACLE float32_t is float, so vals_x/vals_y load directly with no cast.
  for (; i <= max_i; i += 4) {
    // Load xs
    const float32x4_t x = vld1q_f32(vals_x + i);

    // Multiply x*x and accumulate.
    xx = vmlaq_f32(xx, x, x);

    // Load ys
    const float32x4_t y = vld1q_f32(vals_y + i);

    // Multiply x*y and accumulate.
    xy = vmlaq_f32(xy, x, y);

    // Multiply y*y and accumulate.
    yy = vmlaq_f32(yy, y, y);
  }

  // Plain locals (the originals were function-local statics, which made this
  // function needlessly non-reentrant and unsafe to call from two threads).
  float32_t xx_vals[4];
  float32_t xy_vals[4];
  float32_t yy_vals[4];

  vst1q_f32(xx_vals, xx);
  vst1q_f32(xy_vals, xy);
  vst1q_f32(yy_vals, yy);

  // Accumulated values are stored in sets of 4; we have to manually add
  // the last bits together.
  for (int j = 0; j < 4; ++j) {
    G[0] += xx_vals[j];
    G[1] += xy_vals[j];
    G[3] += yy_vals[j];
  }

  // Finishes off last few values (< 4) from above.
  for (; i < num_vals; ++i) {
    G[0] += Square(vals_x[i]);
    G[1] += vals_x[i] * vals_y[i];
    G[3] += Square(vals_y[i]);
  }

  // The matrix is symmetric, so this is a given.
  G[2] = G[1];
}

}  // namespace tf_tracking

#endif