Diffstat (limited to 'tensorflow/examples/android/jni/object_tracking/image_neon.cc')
-rw-r--r--  tensorflow/examples/android/jni/object_tracking/image_neon.cc  270
1 file changed, 270 insertions, 0 deletions
diff --git a/tensorflow/examples/android/jni/object_tracking/image_neon.cc b/tensorflow/examples/android/jni/object_tracking/image_neon.cc
new file mode 100644
index 0000000000..ddd8447bf3
--- /dev/null
+++ b/tensorflow/examples/android/jni/object_tracking/image_neon.cc
@@ -0,0 +1,270 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// NEON implementations of Image methods for compatible devices. Control
+// should never enter this compilation unit on incompatible devices.
+
+#ifdef __ARM_NEON
+
+#include <arm_neon.h>
+
+#include "tensorflow/core/platform/types.h"
+
+#include "tensorflow/examples/android/jni/object_tracking/image-inl.h"
+#include "tensorflow/examples/android/jni/object_tracking/image.h"
+#include "tensorflow/examples/android/jni/object_tracking/image_utils.h"
+#include "tensorflow/examples/android/jni/object_tracking/utils.h"
+
+using namespace tensorflow;
+
+namespace tf_tracking {
+
+// Does the bulk of the 2x downsampling work: averages each 2x2 block of
+// input pixels in a 32-pixel-wide column strip into one output pixel.
+template <>
+void Image<uint8>::Downsample2x32ColumnsNeon(const uint8* const original,
+ const int stride,
+ const int orig_x) {
+ // Divide input x offset by 2 to find output offset.
+ const int new_x = orig_x >> 1;
+
+ // Initial offset into top row.
+ const uint8* offset = original + orig_x;
+
+  // This points to the leftmost pixel of our 16 horizontally arranged
+  // pixels in the destination data.
+ uint8* ptr_dst = (*this)[0] + new_x;
+
+ // Sum along vertical columns.
+ // Process 32x2 input pixels and 16x1 output pixels per iteration.
+ for (int new_y = 0; new_y < height_; ++new_y) {
+ uint16x8_t accum1 = vdupq_n_u16(0);
+ uint16x8_t accum2 = vdupq_n_u16(0);
+
+    // Go top to bottom across the two rows of input pixels that make up
+    // this output row.
+ for (int row_num = 0; row_num < 2; ++row_num) {
+ // First 16 bytes.
+ {
+ // Load 16 bytes of data from current offset.
+ const uint8x16_t curr_data1 = vld1q_u8(offset);
+
+ // Pairwise add and accumulate into accum vectors (16 bit to account
+ // for values above 255).
+ accum1 = vpadalq_u8(accum1, curr_data1);
+ }
+
+ // Second 16 bytes.
+ {
+ // Load 16 bytes of data from current offset.
+ const uint8x16_t curr_data2 = vld1q_u8(offset + 16);
+
+ // Pairwise add and accumulate into accum vectors (16 bit to account
+ // for values above 255).
+ accum2 = vpadalq_u8(accum2, curr_data2);
+ }
+
+ // Move offset down one row.
+ offset += stride;
+ }
+
+ // Divide by 4 (number of input pixels per output
+ // pixel) and narrow data from 16 bits per pixel to 8 bpp.
+ const uint8x8_t tmp_pix1 = vqshrn_n_u16(accum1, 2);
+ const uint8x8_t tmp_pix2 = vqshrn_n_u16(accum2, 2);
+
+ // Concatenate 8x1 pixel strips into 16x1 pixel strip.
+ const uint8x16_t allpixels = vcombine_u8(tmp_pix1, tmp_pix2);
+
+ // Copy all pixels from composite 16x1 vector into output strip.
+ vst1q_u8(ptr_dst, allpixels);
+
+ ptr_dst += stride_;
+ }
+}
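+
+// For reference, the scalar arithmetic the loop above performs for each
+// output pixel (an illustrative sketch only, with made-up in/out names):
+//
+//   out[new_y][new_x + i] =
+//       (in[2 * new_y][orig_x + 2 * i]     + in[2 * new_y][orig_x + 2 * i + 1] +
+//        in[2 * new_y + 1][orig_x + 2 * i] + in[2 * new_y + 1][orig_x + 2 * i + 1]) / 4;
+//
+// vpadalq_u8 provides the horizontal pairwise sums, the row loop provides
+// the vertical sums, and vqshrn_n_u16(..., 2) performs the divide by 4
+// while narrowing back to 8 bits.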
+
+// Does the bulk of the 4x downsampling work: averages each 4x4 block of
+// input pixels in a 32-pixel-wide column strip into one output pixel.
+template <>
+void Image<uint8>::Downsample4x32ColumnsNeon(const uint8* const original,
+ const int stride,
+ const int orig_x) {
+ // Divide input x offset by 4 to find output offset.
+ const int new_x = orig_x >> 2;
+
+ // Initial offset into top row.
+ const uint8* offset = original + orig_x;
+
+ // This points to the leftmost pixel of our 8 horizontally arranged
+ // pixels in the destination data.
+ uint8* ptr_dst = (*this)[0] + new_x;
+
+ // Sum along vertical columns.
+ // Process 32x4 input pixels and 8x1 output pixels per iteration.
+ for (int new_y = 0; new_y < height_; ++new_y) {
+ uint16x8_t accum1 = vdupq_n_u16(0);
+ uint16x8_t accum2 = vdupq_n_u16(0);
+
+ // Go top to bottom across the four rows of input pixels that make up
+ // this output row.
+ for (int row_num = 0; row_num < 4; ++row_num) {
+ // First 16 bytes.
+ {
+ // Load 16 bytes of data from current offset.
+ const uint8x16_t curr_data1 = vld1q_u8(offset);
+
+ // Pairwise add and accumulate into accum vectors (16 bit to account
+ // for values above 255).
+ accum1 = vpadalq_u8(accum1, curr_data1);
+ }
+
+ // Second 16 bytes.
+ {
+ // Load 16 bytes of data from current offset.
+ const uint8x16_t curr_data2 = vld1q_u8(offset + 16);
+
+ // Pairwise add and accumulate into accum vectors (16 bit to account
+ // for values above 255).
+ accum2 = vpadalq_u8(accum2, curr_data2);
+ }
+
+ // Move offset down one row.
+ offset += stride;
+ }
+
+ // Add and widen, then divide by 16 (number of input pixels per output
+ // pixel) and narrow data from 32 bits per pixel to 16 bpp.
+ const uint16x4_t tmp_pix1 = vqshrn_n_u32(vpaddlq_u16(accum1), 4);
+ const uint16x4_t tmp_pix2 = vqshrn_n_u32(vpaddlq_u16(accum2), 4);
+
+ // Combine 4x1 pixel strips into 8x1 pixel strip and narrow from
+ // 16 bits to 8 bits per pixel.
+ const uint8x8_t allpixels = vmovn_u16(vcombine_u16(tmp_pix1, tmp_pix2));
+
+ // Copy all pixels from composite 8x1 vector into output strip.
+ vst1_u8(ptr_dst, allpixels);
+
+ ptr_dst += stride_;
+ }
+}
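+
+// A note on the widening above, for readers following the lane arithmetic:
+// after four rows of vpadalq_u8, each 16-bit lane holds the sum of a 2-wide
+// by 4-tall block of input pixels (at most 8 * 255 = 2040, which fits in 16
+// bits). vpaddlq_u16 then pairwise-adds neighboring lanes into 32 bits,
+// yielding the full 4x4 block sum (at most 16 * 255 = 4080) before the
+// divide by 16.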
+
+
+// Hardware accelerated downsampling method for supported devices.
+// Requires that image size be a multiple of 16 pixels in each dimension,
+// and that downsampling be by a factor of 2 or 4.
+template <>
+void Image<uint8>::DownsampleAveragedNeon(const uint8* const original,
+ const int stride, const int factor) {
+  // TODO(andrewharp): stride is a bad approximation for the src image's width.
+  // Better to pass that in directly.
+  SCHECK(width_ * factor <= stride,
+         "Destination width * factor must not exceed the source stride!");
+ const int last_starting_index = width_ * factor - 32;
+
+  // We process 32 input pixels lengthwise at a time.
+  // Each pass of this loop produces a (32 / factor)-wide strip of output
+  // pixels (16 wide for 2x, 8 wide for 4x) that is downsampled-height tall.
+ int orig_x = 0;
+ for (; orig_x <= last_starting_index; orig_x += 32) {
+ if (factor == 2) {
+ Downsample2x32ColumnsNeon(original, stride, orig_x);
+ } else {
+ Downsample4x32ColumnsNeon(original, stride, orig_x);
+ }
+ }
+
+  // If a last pass is required, push it to the left enough so that it never
+  // goes out of bounds. On images whose widths are multiples of 16 but not
+  // 32, this recomputes some overlapping columns, which is harmless since
+  // the same averaged values are simply written again.
+ if (orig_x < last_starting_index + 32) {
+ if (factor == 2) {
+ Downsample2x32ColumnsNeon(original, stride, last_starting_index);
+ } else {
+ Downsample4x32ColumnsNeon(original, stride, last_starting_index);
+ }
+ }
+}
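+
+// A typical call might look like the following. This is a hypothetical
+// sketch: the variable names are assumptions for illustration, not
+// identifiers taken from image.h.
+//
+//   Image<uint8> half(full_width / 2, full_height / 2);
+//   half.DownsampleAveragedNeon(full_data, full_stride, 2);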
+
+
+// Puts the image gradient matrix about a pixel into the 2x2 float array G.
+// vals_x should be an array of the window x gradient values, whose indices
+// can be in any order but are parallel to the vals_y entries. G is
+// accumulated into, so it must be zero-initialized by the caller.
+// See http://robots.stanford.edu/cs223b04/algo_tracking.pdf for more details.
+void CalculateGNeon(const float* const vals_x, const float* const vals_y,
+ const int num_vals, float* const G) {
+ const float32_t* const arm_vals_x = (const float32_t*) vals_x;
+ const float32_t* const arm_vals_y = (const float32_t*) vals_y;
+
+ // Running sums.
+ float32x4_t xx = vdupq_n_f32(0.0f);
+ float32x4_t xy = vdupq_n_f32(0.0f);
+ float32x4_t yy = vdupq_n_f32(0.0f);
+
+ // Maximum index we can load 4 consecutive values from.
+ // e.g. if there are 81 values, our last full pass can be from index 77:
+ // 81-4=>77 (77, 78, 79, 80)
+ const int max_i = num_vals - 4;
+
+ // Defined here because we want to keep track of how many values were
+ // processed by NEON, so that we can finish off the remainder the normal
+ // way.
+ int i = 0;
+
+ // Process values 4 at a time, accumulating the sums of
+ // the pixel-wise x*x, x*y, and y*y values.
+ for (; i <= max_i; i += 4) {
+ // Load xs
+ float32x4_t x = vld1q_f32(arm_vals_x + i);
+
+ // Multiply x*x and accumulate.
+ xx = vmlaq_f32(xx, x, x);
+
+ // Load ys
+ float32x4_t y = vld1q_f32(arm_vals_y + i);
+
+ // Multiply x*y and accumulate.
+ xy = vmlaq_f32(xy, x, y);
+
+ // Multiply y*y and accumulate.
+ yy = vmlaq_f32(yy, y, y);
+ }
+
+  float32_t xx_vals[4];
+  float32_t xy_vals[4];
+  float32_t yy_vals[4];
+
+ vst1q_f32(xx_vals, xx);
+ vst1q_f32(xy_vals, xy);
+ vst1q_f32(yy_vals, yy);
+
+  // Accumulated values are stored in sets of 4; manually add the four lanes
+  // of each vector together.
+ for (int j = 0; j < 4; ++j) {
+ G[0] += xx_vals[j];
+ G[1] += xy_vals[j];
+ G[3] += yy_vals[j];
+ }
+
+  // Finish off the last few values (< 4) that the vectorized loop above
+  // did not cover.
+ for (; i < num_vals; ++i) {
+ G[0] += Square(vals_x[i]);
+ G[1] += vals_x[i] * vals_y[i];
+ G[3] += Square(vals_y[i]);
+ }
+
+  // The matrix is symmetric, so the off-diagonal entries must be equal.
+ G[2] = G[1];
+}
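+
+// In matrix form, the loops above compute the 2x2 structure tensor used by
+// Lucas-Kanade style trackers (see the paper linked above):
+//
+//   G = sum_i | x_i * x_i   x_i * y_i |
+//             | x_i * y_i   y_i * y_i |
+//
+// where x_i = vals_x[i] and y_i = vals_y[i] over the window.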
+
+} // namespace tf_tracking
+
+#endif  // __ARM_NEON
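
For sanity-checking the NEON path above, a plain scalar reference of the same
block-averaging downsample can be useful. The sketch below is illustrative
only; the function name and signature are ours and do not appear in this
commit. It averages each factor x factor input block exactly as the
intrinsics do, assuming width and height are multiples of factor:

#include <cstdint>

// Scalar reference: averages each factor x factor block of src into dst.
// dst must be (width / factor) wide and (height / factor) tall.
void DownsampleAveragedScalar(const uint8_t* src, const int src_stride,
                              const int width, const int height,
                              const int factor, uint8_t* dst,
                              const int dst_stride) {
  const int area = factor * factor;
  for (int y = 0; y < height / factor; ++y) {
    for (int x = 0; x < width / factor; ++x) {
      int sum = 0;
      // Sum the factor x factor block of input pixels under this output pixel.
      for (int dy = 0; dy < factor; ++dy) {
        for (int dx = 0; dx < factor; ++dx) {
          sum += src[(y * factor + dy) * src_stride + (x * factor + dx)];
        }
      }
      dst[y * dst_stride + x] = static_cast<uint8_t>(sum / area);
    }
  }
}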