diff options
author | A. Unique TensorFlower <gardener@tensorflow.org> | 2018-08-15 16:53:20 -0700 |
---|---|---|
committer | TensorFlower Gardener <gardener@tensorflow.org> | 2018-08-15 16:57:24 -0700 |
commit | d4d93a84497a406bfaebb8176c699ae810bc5ff5 (patch) | |
tree | b66400477cecfb2d77358764ba705902db9c3dd6 /tensorflow/contrib/lite/kernels/internal | |
parent | 058b499b07e72a3ad21bb4be1a69412a731036bc (diff) |
NEON optimizations for the most common Mul uint8 kernel.
PiperOrigin-RevId: 208907698
Diffstat (limited to 'tensorflow/contrib/lite/kernels/internal')
-rw-r--r-- | tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h | 53 | ||||
-rw-r--r-- | tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h | 1 |
2 files changed, 53 insertions, 1 deletion
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h index 2d172315da..da585e5550 100644 --- a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h +++ b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h @@ -2946,7 +2946,58 @@ void BroadcastMul(const T* input1_data, const Dims<4>& input1_dims, inline void MulElementwise(int size, const ArithmeticParams& params, const uint8* input1_data, const uint8* input2_data, uint8* output_data) { - for (int i = 0; i < size; ++i) { + int i = 0; + TFLITE_DCHECK_GT(params.input1_offset, -256); + TFLITE_DCHECK_LT(params.input1_offset, 256); + TFLITE_DCHECK_GT(params.input2_offset, -256); + TFLITE_DCHECK_LT(params.input2_offset, 256); + TFLITE_DCHECK_GT(params.output_offset, -256); + TFLITE_DCHECK_LT(params.output_offset, 256); +#ifdef USE_NEON + const auto input1_offset_vector = vdupq_n_s16(params.input1_offset); + const auto input2_offset_vector = vdupq_n_s16(params.input2_offset); + const auto output_offset_vector = vdupq_n_s16(params.output_offset); + const auto output_activation_min_vector = + vdup_n_u8(params.quantized_activation_min); + const auto output_activation_max_vector = + vdup_n_u8(params.quantized_activation_max); + for (; i <= size - 8; i += 8) { + // We load / store 8 at a time, multiplying as two sets of 4 int32s. 
+ const auto input1_val_original = vld1_u8(input1_data + i); + const auto input2_val_original = vld1_u8(input2_data + i); + const auto input1_val_s16 = + vreinterpretq_s16_u16(vmovl_u8(input1_val_original)); + const auto input2_val_s16 = + vreinterpretq_s16_u16(vmovl_u8(input2_val_original)); + const auto input1_val = vaddq_s16(input1_val_s16, input1_offset_vector); + const auto input2_val = vaddq_s16(input2_val_s16, input2_offset_vector); + + const auto input1_val_low = vget_low_s16(input1_val); + const auto input1_val_high = vget_high_s16(input1_val); + const auto input2_val_low = vget_low_s16(input2_val); + const auto input2_val_high = vget_high_s16(input2_val); + + auto p1 = vmull_s16(input2_val_low, input1_val_low); + auto p2 = vmull_s16(input2_val_high, input1_val_high); + + p1 = vqrdmulhq_n_s32(p1, params.output_multiplier); + p2 = vqrdmulhq_n_s32(p2, params.output_multiplier); + using gemmlowp::RoundingDivideByPOT; + p1 = RoundingDivideByPOT(p1, -params.output_shift); + p2 = RoundingDivideByPOT(p2, -params.output_shift); + + const auto p1_narrowed = vmovn_s32(p1); + const auto p2_narrowed = vmovn_s32(p2); + const auto p = + vaddq_s16(vcombine_s16(p1_narrowed, p2_narrowed), output_offset_vector); + const auto clamped = + vmax_u8(output_activation_min_vector, + vmin_u8(output_activation_max_vector, vqmovun_s16(p))); + vst1_u8(output_data + i, clamped); + } +#endif // NEON + + for (; i < size; ++i) { const int32 input1_val = params.input1_offset + input1_data[i]; const int32 input2_val = params.input2_offset + input2_data[i]; const int32 unclamped_result = diff --git a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h index cb254f36cc..5634b8384a 100644 --- a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h +++ b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h @@ -1494,6 +1494,7 @@ inline void BroadcastMul4DSlow(const ArithmeticParams& 
params, NdArrayDesc<4> desc1; NdArrayDesc<4> desc2; + // The input shapes are extended as part of NdArrayDesc initialization. NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, &desc2); RuntimeShape extended_output_shape = |