Diffstat (limited to 'tensorflow/contrib/lite/kernels/internal/optimized/neon_tensor_utils.cc')
-rw-r--r-- | tensorflow/contrib/lite/kernels/internal/optimized/neon_tensor_utils.cc | 337
1 file changed, 337 insertions, 0 deletions
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/neon_tensor_utils.cc b/tensorflow/contrib/lite/kernels/internal/optimized/neon_tensor_utils.cc
new file mode 100644
index 0000000000..bf0bdfb1fb
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/neon_tensor_utils.cc
@@ -0,0 +1,337 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <string.h>
+
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/kernels/activation_functor.h"
+#include "tensorflow/contrib/lite/kernels/internal/optimized/tensor_utils_impl.h"
+
+#ifdef USE_NEON
+
+#include <arm_neon.h>
+#define kFloatWeightsPerNeonLane 4
+
+namespace tflite {
+namespace tensor_utils {
+
+void NeonMatrixBatchVectorMultiplyAccumulate(const float* matrix, int m_rows,
+                                             int m_cols, const float* vector,
+                                             int n_batch, float* result,
+                                             int result_stride) {
+  // If m_cols is not divisible by kFloatWeightsPerNeonLane, we cannot use the
+  // main vectorized loop, and we need to process the remainder sequentially.
+  // postamble_start shows the start index where this should happen.
+  const int postamble_start =
+      m_cols - (m_cols & (kFloatWeightsPerNeonLane - 1));
+
+  // The array used to cache the vector.
+  float32x4_t* vector_cache_float32x4 =
+      new float32x4_t[(m_cols / kFloatWeightsPerNeonLane) *
+                      sizeof(float32x4_t)];
+  const int kUnrollSize = 2;
+  for (int b = 0; b < n_batch; b++) {
+    float* result_in_batch = result + b * m_rows * result_stride;
+    const float* vector_in_batch = vector + b * m_cols;
+
+    const float* matrix_ptr0 = matrix;
+    // If there is only 1 row, we don't want to assign an illegal pointer.
+    const float* matrix_ptr1 = nullptr;
+    if (m_rows > 1) {
+      matrix_ptr1 = matrix + m_cols;
+    }
+
+    // Cache the vector.
+    for (int c = 0; c < postamble_start; c += kFloatWeightsPerNeonLane) {
+      vector_cache_float32x4[c >> 2] = vld1q_f32(vector_in_batch + c);
+    }
+
+    // Main matrix-by-vector multiplication loop, which handles two rows of
+    // the matrix per iteration.
+    for (int r = 0; r < (m_rows & ~(kUnrollSize - 1)); r += kUnrollSize) {
+      float32x4_t acc0_32x4 = vmovq_n_f32(0.0);
+      float32x4_t acc1_32x4 = vmovq_n_f32(0.0);
+      for (int c = 0; c < postamble_start; c += kFloatWeightsPerNeonLane) {
+        float32x4_t temp = vector_cache_float32x4[c >> 2];
+        // Load 4 float values from each of the two matrix rows.
+        float32x4_t v0_f32x4 = vld1q_f32(matrix_ptr0 + c);
+        float32x4_t v1_f32x4 = vld1q_f32(matrix_ptr1 + c);
+        // Vector multiply-accumulate 4 floats.
+        acc0_32x4 = vmlaq_f32(acc0_32x4, v0_f32x4, temp);
+        acc1_32x4 = vmlaq_f32(acc1_32x4, v1_f32x4, temp);
+      }
+      // Add the 4 intermediate sum values to get the final dot-product value
+      // for each row.
+      *result_in_batch +=
+          (vgetq_lane_f32(acc0_32x4, 0) + vgetq_lane_f32(acc0_32x4, 1) +
+           vgetq_lane_f32(acc0_32x4, 2) + vgetq_lane_f32(acc0_32x4, 3));
+      *(result_in_batch + result_stride) +=
+          (vgetq_lane_f32(acc1_32x4, 0) + vgetq_lane_f32(acc1_32x4, 1) +
+           vgetq_lane_f32(acc1_32x4, 2) + vgetq_lane_f32(acc1_32x4, 3));
+      for (int c = postamble_start; c < m_cols; c++) {
+        *result_in_batch += matrix_ptr0[c] * vector_in_batch[c];
+        *(result_in_batch + result_stride) +=
+            matrix_ptr1[c] * vector_in_batch[c];
+      }
+      matrix_ptr0 += kUnrollSize * m_cols;
+      matrix_ptr1 += kUnrollSize * m_cols;
+      result_in_batch += kUnrollSize * result_stride;
+    }
+    for (int r = (m_rows & ~(kUnrollSize - 1)); r < m_rows; r++) {
+      float32x4_t acc0_32x4 = vmovq_n_f32(0.0);
+      for (int c = 0; c < postamble_start; c += kFloatWeightsPerNeonLane) {
+        float32x4_t temp = vector_cache_float32x4[c >> 2];
+        // Load 4 float values from the matrix row.
+        float32x4_t v0_f32x4 = vld1q_f32(matrix_ptr0 + c);
+        // Vector multiply-accumulate 4 floats.
+        acc0_32x4 = vmlaq_f32(acc0_32x4, v0_f32x4, temp);
+      }
+      // Add the 4 intermediate sum values to get the final dot-product value
+      // for this row.
+      *result_in_batch +=
+          (vgetq_lane_f32(acc0_32x4, 0) + vgetq_lane_f32(acc0_32x4, 1) +
+           vgetq_lane_f32(acc0_32x4, 2) + vgetq_lane_f32(acc0_32x4, 3));
+      for (int c = postamble_start; c < m_cols; c++) {
+        *result_in_batch += matrix_ptr0[c] * vector_in_batch[c];
+      }
+      matrix_ptr0 += m_cols;
+      result_in_batch += result_stride;
+    }
+  }
+  delete[] vector_cache_float32x4;
+}
+
+void NeonVectorVectorCwiseProduct(const float* vector1, const float* vector2,
+                                  int v_size, float* result) {
+  // If v_size is not divisible by kFloatWeightsPerNeonLane, we cannot use the
+  // main vectorized loop, and we need to process the remainder sequentially.
+  // postamble_start shows the start index where this should happen.
+  const int postamble_start =
+      v_size - (v_size & (kFloatWeightsPerNeonLane - 1));
+  for (int v = 0; v < postamble_start; v += kFloatWeightsPerNeonLane) {
+    // Load 4 float values from vector1 and vector2.
+    float32x4_t v1_f32x4 = vld1q_f32(vector1 + v);
+    float32x4_t v2_f32x4 = vld1q_f32(vector2 + v);
+    // Vector multiply 4 floats.
+    float32x4_t mul_32x4 = vmulq_f32(v1_f32x4, v2_f32x4);
+    // Save to result array.
+    vst1q_f32(&result[v], mul_32x4);
+  }
+  for (int v = postamble_start; v < v_size; v++) {
+    result[v] = vector1[v] * vector2[v];
+  }
+}
+
+void NeonVectorVectorCwiseProductAccumulate(const float* vector1,
+                                            const float* vector2, int v_size,
+                                            float* result) {
+  // If v_size is not divisible by kFloatWeightsPerNeonLane, we cannot use the
+  // main vectorized loop, and we need to process the remainder sequentially.
+  // postamble_start shows the start index where this should happen.
+  const int postamble_start =
+      v_size - (v_size & (kFloatWeightsPerNeonLane - 1));
+  for (int v = 0; v < postamble_start; v += kFloatWeightsPerNeonLane) {
+    // Load 4 float values from vector1, vector2 and the accumulator.
+    float32x4_t v1_f32x4 = vld1q_f32(vector1 + v);
+    float32x4_t v2_f32x4 = vld1q_f32(vector2 + v);
+    float32x4_t acc_32x4 = vld1q_f32(result + v);
+    // Vector multiply-accumulate 4 floats.
+    acc_32x4 = vmlaq_f32(acc_32x4, v1_f32x4, v2_f32x4);
+    // Save to result array.
+    vst1q_f32(&result[v], acc_32x4);
+  }
+  for (int v = postamble_start; v < v_size; v++) {
+    result[v] += vector1[v] * vector2[v];
+  }
+}
+
+void NeonVectorBatchVectorCwiseProductAccumulate(const float* vector,
+                                                 int v_size,
+                                                 const float* batch_vector,
+                                                 int n_batch, float* result) {
+  // If v_size is not divisible by kFloatWeightsPerNeonLane, we cannot use the
+  // main vectorized loop, and we need to process the remainder sequentially.
+  // postamble_start shows the start index where this should happen.
+  const int postamble_start =
+      v_size - (v_size & (kFloatWeightsPerNeonLane - 1));
+
+  // The array used to cache the vector.
+  float32x4_t* vector_cache_float32x4 =
+      new float32x4_t[(v_size / kFloatWeightsPerNeonLane) *
+                      sizeof(float32x4_t)];
+  for (int v = 0; v < postamble_start; v += kFloatWeightsPerNeonLane) {
+    vector_cache_float32x4[v >> 2] = vld1q_f32(vector + v);
+  }
+
+  float* result_ptr = result;
+  const float* batch_vector_ptr = batch_vector;
+  for (int b = 0; b < n_batch; b++) {
+    for (int v = 0; v < postamble_start; v += kFloatWeightsPerNeonLane) {
+      // Load from memory to vectors.
+      float32x4_t result_f32x4 = vld1q_f32(result_ptr + v);
+      float32x4_t batch_vector_f32x4 = vld1q_f32(batch_vector_ptr + v);
+      // Multiply-accumulate.
+      result_f32x4 = vmlaq_f32(result_f32x4, batch_vector_f32x4,
+                               vector_cache_float32x4[v >> 2]);
+      // Store.
+      vst1q_f32(result_ptr + v, result_f32x4);
+    }
+    // Postamble loop.
+    for (int v = postamble_start; v < v_size; v++) {
+      result_ptr[v] += vector[v] * batch_vector_ptr[v];
+    }
+    // Update the pointers.
+    result_ptr += v_size;
+    batch_vector_ptr += v_size;
+  }
+  delete[] vector_cache_float32x4;
+}
+
+void NeonSub1Vector(const float* vector, int v_size, float* result) {
+  // If v_size is not divisible by kFloatWeightsPerNeonLane, we cannot use the
+  // main vectorized loop, and we need to process the remainder sequentially.
+  // postamble_start shows the start index where this should happen.
+  const int postamble_start =
+      v_size - (v_size & (kFloatWeightsPerNeonLane - 1));
+
+  float32x4_t one_f32x4 = vmovq_n_f32(1.0);
+  for (int v = 0; v < postamble_start; v += kFloatWeightsPerNeonLane) {
+    // Load 4 float values from the input and subtract them from 1.
+    float32x4_t v_f32x4 = vld1q_f32(vector + v);
+    float32x4_t result_f32x4 = vsubq_f32(one_f32x4, v_f32x4);
+    // Save to output.
+    vst1q_f32(result + v, result_f32x4);
+  }
+  for (int v = postamble_start; v < v_size; v++) {
+    result[v] = 1.0f - vector[v];
+  }
+}
+
+void NeonClipVector(const float* vector, int v_size, float abs_limit,
+                    float* result) {
+  // If v_size is not divisible by kFloatWeightsPerNeonLane, we cannot use the
+  // main vectorized loop, and we need to process the remainder sequentially.
+  // postamble_start shows the start index where this should happen.
+  const int postamble_start =
+      v_size - (v_size & (kFloatWeightsPerNeonLane - 1));
+
+  // Replicate abs_limit and -abs_limit in two vectors.
+  const float32x4_t abs_limit_f32x4 = vmovq_n_f32(abs_limit);
+  const float32x4_t neg_abs_limit_f32x4 = vmovq_n_f32(-abs_limit);
+
+  for (int v = 0; v < postamble_start; v += kFloatWeightsPerNeonLane) {
+    // Load from memory to vector.
+    float32x4_t v_f32x4 = vld1q_f32(vector + v);
+    // Clip between abs_limit and -abs_limit.
+    float32x4_t result_f32x4 = vminq_f32(abs_limit_f32x4, v_f32x4);
+    result_f32x4 = vmaxq_f32(neg_abs_limit_f32x4, result_f32x4);
+    // Save to output.
+    vst1q_f32(result + v, result_f32x4);
+  }
+  // Postamble loop.
+  for (int v = postamble_start; v < v_size; v++) {
+    result[v] = (abs_limit < vector[v]) ? abs_limit : vector[v];
+    result[v] = (-abs_limit > result[v]) ? -abs_limit : result[v];
+  }
+}
+
+float NeonVectorVectorDotProduct(const float* vector1, const float* vector2,
+                                 int v_size) {
+  // If v_size is not divisible by kFloatWeightsPerNeonLane, we cannot use the
+  // main vectorized loop, and we need to process the remainder sequentially.
+  // postamble_start shows the start index where this should happen.
+  const int postamble_start =
+      v_size - (v_size & (kFloatWeightsPerNeonLane - 1));
+  float32x4_t acc_32x4 = vmovq_n_f32(0.0);
+  for (int v = 0; v < postamble_start; v += kFloatWeightsPerNeonLane) {
+    // Load 4 float values from vector1 and vector2.
+    float32x4_t v1_f32x4 = vld1q_f32(vector1 + v);
+    float32x4_t v2_f32x4 = vld1q_f32(vector2 + v);
+    // Vector multiply-accumulate 4 floats.
+    acc_32x4 = vmlaq_f32(acc_32x4, v1_f32x4, v2_f32x4);
+  }
+
+  float result = (vgetq_lane_f32(acc_32x4, 0) + vgetq_lane_f32(acc_32x4, 1) +
+                  vgetq_lane_f32(acc_32x4, 2) + vgetq_lane_f32(acc_32x4, 3));
+  // Postamble loop.
+  for (int v = postamble_start; v < v_size; v++) {
+    result += vector1[v] * vector2[v];
+  }
+  return result;
+}
+
+void NeonBatchVectorBatchVectorDotProduct(const float* vector1,
+                                          const float* vector2, int v_size,
+                                          int n_batch, float* result,
+                                          int result_stride) {
+  float* result_ptr = result;
+  const float* vector1_ptr = vector1;
+  const float* vector2_ptr = vector2;
+  for (int b = 0; b < n_batch; b++) {
+    *result_ptr = NeonVectorVectorDotProduct(vector1_ptr, vector2_ptr, v_size);
+    vector1_ptr += v_size;
+    vector2_ptr += v_size;
+    result_ptr += result_stride;
+  }
+}
+
+void NeonReductionSumVector(const float* input_vector, float* output_vector,
+                            int output_size, int reduction_size) {
+  const float* input_vector_ptr = input_vector;
+  for (int o = 0; o < output_size; o++) {
+    // If reduction_size is not divisible by kFloatWeightsPerNeonLane, we
+    // cannot use the main vectorized loop, and we need to process the
+    // remainder sequentially. postamble_start shows the start index where
+    // this should happen.
+    const int postamble_start =
+        reduction_size - (reduction_size & (kFloatWeightsPerNeonLane - 1));
+    float32x4_t sum_f32x4 = vmovq_n_f32(0.0);
+    for (int r = 0; r < postamble_start; r += kFloatWeightsPerNeonLane) {
+      float32x4_t v1_f32x4 = vld1q_f32(input_vector_ptr + r);
+      sum_f32x4 = vaddq_f32(sum_f32x4, v1_f32x4);
+    }
+    output_vector[o] +=
+        (vgetq_lane_f32(sum_f32x4, 0) + vgetq_lane_f32(sum_f32x4, 1) +
+         vgetq_lane_f32(sum_f32x4, 2) + vgetq_lane_f32(sum_f32x4, 3));
+    input_vector_ptr += postamble_start;
+
+    // Postamble loop.
+    for (int r = postamble_start; r < reduction_size; r++) {
+      output_vector[o] += *input_vector_ptr++;
+    }
+  }
+}
+
+void NeonVectorShiftLeft(float* vector, int v_size, float shift_value) {
+  // This variable keeps track of the next-to-last index being copied, to make
+  // sure we do not read past the vector boundary.
+  int last_index_copy = kFloatWeightsPerNeonLane;
+  int current_index_copy = 0;
+  while (last_index_copy < v_size) {
+    float32x4_t v_f32x4 = vld1q_f32(vector + current_index_copy + 1);
+    vst1q_f32(vector + current_index_copy, v_f32x4);
+    current_index_copy += kFloatWeightsPerNeonLane;
+    last_index_copy += kFloatWeightsPerNeonLane;
+  }
+  // Postamble loop.
+  for (int i = current_index_copy; i < v_size - 1; i++) {
+    vector[i] = vector[i + 1];
+  }
+  vector[v_size - 1] = shift_value;
+}
+
+}  // namespace tensor_utils
+}  // namespace tflite
+
+#endif  // USE_NEON
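Note on the structure of these kernels: most routines in this file use the same pattern of a main NEON loop that consumes kFloatWeightsPerNeonLane (4) floats per iteration, followed by a scalar "postamble" loop for the remainder; for example, with v_size = 10 the expression v_size - (v_size & (kFloatWeightsPerNeonLane - 1)) gives postamble_start = 8, so elements 8 and 9 are handled scalar. The sketch below is a hypothetical, portable scalar reference (not part of this commit, and not TFLite's own portable implementation) that illustrates what NeonMatrixBatchVectorMultiplyAccumulate accumulates into result, including the result_stride layout, and can be used to sanity-check the NEON path on any platform.

#include <cstdio>

// Hypothetical scalar reference, for illustration only. The function name and
// the main() driver are assumptions, not identifiers from the diff above.
void ReferenceMatrixBatchVectorMultiplyAccumulate(const float* matrix,
                                                  int m_rows, int m_cols,
                                                  const float* vector,
                                                  int n_batch, float* result,
                                                  int result_stride) {
  for (int b = 0; b < n_batch; ++b) {
    // Per-batch views of the input vector and the strided output slice.
    const float* vector_in_batch = vector + b * m_cols;
    float* result_in_batch = result + b * m_rows * result_stride;
    for (int r = 0; r < m_rows; ++r) {
      const float* row = matrix + r * m_cols;
      // Accumulate the dot product of matrix row r with the batch vector.
      for (int c = 0; c < m_cols; ++c) {
        *result_in_batch += row[c] * vector_in_batch[c];
      }
      result_in_batch += result_stride;
    }
  }
}

int main() {
  // 2x3 matrix times a single batch vector of ones, unit result stride.
  const float matrix[] = {1, 2, 3, 4, 5, 6};
  const float vector[] = {1, 1, 1};
  float result[2] = {0.f, 0.f};
  ReferenceMatrixBatchVectorMultiplyAccumulate(matrix, /*m_rows=*/2,
                                               /*m_cols=*/3, vector,
                                               /*n_batch=*/1, result,
                                               /*result_stride=*/1);
  std::printf("%f %f\n", result[0], result[1]);  // Expected: 6.0 15.0
  return 0;
}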