aboutsummaryrefslogtreecommitdiffhomepage
path: root/tensorflow/contrib/lite/kernels/internal
diff options
context:
space:
mode:
authorGravatar A. Unique TensorFlower <gardener@tensorflow.org>2018-08-15 16:53:20 -0700
committerGravatar TensorFlower Gardener <gardener@tensorflow.org>2018-08-15 16:57:24 -0700
commitd4d93a84497a406bfaebb8176c699ae810bc5ff5 (patch)
treeb66400477cecfb2d77358764ba705902db9c3dd6 /tensorflow/contrib/lite/kernels/internal
parent058b499b07e72a3ad21bb4be1a69412a731036bc (diff)
NEON optimizations for most common-Mul uint8 kernel.
PiperOrigin-RevId: 208907698
Diffstat (limited to 'tensorflow/contrib/lite/kernels/internal')
-rw-r--r--tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h53
-rw-r--r--tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h1
2 files changed, 53 insertions, 1 deletions
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
index 2d172315da..da585e5550 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
@@ -2946,7 +2946,58 @@ void BroadcastMul(const T* input1_data, const Dims<4>& input1_dims,
inline void MulElementwise(int size, const ArithmeticParams& params,
const uint8* input1_data, const uint8* input2_data,
uint8* output_data) {
- for (int i = 0; i < size; ++i) {
+ int i = 0;
+ TFLITE_DCHECK_GT(params.input1_offset, -256);
+ TFLITE_DCHECK_LT(params.input1_offset, 256);
+ TFLITE_DCHECK_GT(params.input2_offset, -256);
+ TFLITE_DCHECK_LT(params.input2_offset, 256);
+ TFLITE_DCHECK_GT(params.output_offset, -256);
+ TFLITE_DCHECK_LT(params.output_offset, 256);
+#ifdef USE_NEON
+ const auto input1_offset_vector = vdupq_n_s16(params.input1_offset);
+ const auto input2_offset_vector = vdupq_n_s16(params.input2_offset);
+ const auto output_offset_vector = vdupq_n_s16(params.output_offset);
+ const auto output_activation_min_vector =
+ vdup_n_u8(params.quantized_activation_min);
+ const auto output_activation_max_vector =
+ vdup_n_u8(params.quantized_activation_max);
+ for (; i <= size - 8; i += 8) {
+ // We load / store 8 at a time, multiplying as two sets of 4 int32s.
+ const auto input1_val_original = vld1_u8(input1_data + i);
+ const auto input2_val_original = vld1_u8(input2_data + i);
+ const auto input1_val_s16 =
+ vreinterpretq_s16_u16(vmovl_u8(input1_val_original));
+ const auto input2_val_s16 =
+ vreinterpretq_s16_u16(vmovl_u8(input2_val_original));
+ const auto input1_val = vaddq_s16(input1_val_s16, input1_offset_vector);
+ const auto input2_val = vaddq_s16(input2_val_s16, input2_offset_vector);
+
+ const auto input1_val_low = vget_low_s16(input1_val);
+ const auto input1_val_high = vget_high_s16(input1_val);
+ const auto input2_val_low = vget_low_s16(input2_val);
+ const auto input2_val_high = vget_high_s16(input2_val);
+
+ auto p1 = vmull_s16(input2_val_low, input1_val_low);
+ auto p2 = vmull_s16(input2_val_high, input1_val_high);
+
+ p1 = vqrdmulhq_n_s32(p1, params.output_multiplier);
+ p2 = vqrdmulhq_n_s32(p2, params.output_multiplier);
+ using gemmlowp::RoundingDivideByPOT;
+ p1 = RoundingDivideByPOT(p1, -params.output_shift);
+ p2 = RoundingDivideByPOT(p2, -params.output_shift);
+
+ const auto p1_narrowed = vmovn_s32(p1);
+ const auto p2_narrowed = vmovn_s32(p2);
+ const auto p =
+ vaddq_s16(vcombine_s16(p1_narrowed, p2_narrowed), output_offset_vector);
+ const auto clamped =
+ vmax_u8(output_activation_min_vector,
+ vmin_u8(output_activation_max_vector, vqmovun_s16(p)));
+ vst1_u8(output_data + i, clamped);
+ }
+#endif // NEON
+
+ for (; i < size; ++i) {
const int32 input1_val = params.input1_offset + input1_data[i];
const int32 input2_val = params.input2_offset + input2_data[i];
const int32 unclamped_result =
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
index cb254f36cc..5634b8384a 100644
--- a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
@@ -1494,6 +1494,7 @@ inline void BroadcastMul4DSlow(const ArithmeticParams& params,
NdArrayDesc<4> desc1;
NdArrayDesc<4> desc2;
+ // The input shapes are extended as part of NdArrayDesc initialization.
NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
&desc2);
RuntimeShape extended_output_shape =