Arm32/64 kernel optimizations:

- QuantizeV2 - Dequantize - QuantizedBiasAdd - QuantizeDownAndShrinkRange - QuantizedRelu - QuantizedRelu6 - QuantizedMatMul - QuantizedConv. The optimizations are controlled by three knobs: (1) meta::SetEnabled(bool) -- turns the codepath on/off, on by default; (2) meta::SetUseLocalContext(bool) -- true -- the codepath will use its own internal fine-grained worker pool that offers a performance improvement over the standard tensorflow worker pool. This worker pool is not compatible with other ops. Per use-case performance testing is recommended. -- false (default) -- use the standard tf worker pool instance; (3) meta::SetNumThreads(int) -- no. of compute threads when the internal worker pool is used. If 0, use intra_parallelism_count; if x > 0, then use x threads. Change: 137197327
author: A. Unique TensorFlower <gardener@tensorflow.org> 2016-10-25 12:45:42 -0800
committer: TensorFlower Gardener <gardener@tensorflow.org> 2016-10-25 13:54:46 -0700
commit: 1f6a46596aa2c9c88800a84d85b2b785209a546d (patch)
tree: 4706dd7890017534a5f419037fc127dfbc4fa113 /tensorflow/core/kernels/quantized_matmul_op.cc
parent: 89a96067306258f6ed5eac5ea04801d0e6b213f9 (diff)
1 files changed, 19 insertions, 8 deletions
diff --git a/tensorflow/core/kernels/quantized_matmul_op.cc b/tensorflow/core/kernels/quantized_matmul_op.cc
index 0ce9e37642..4abcae0d35 100644
--- a/tensorflow/core/kernels/quantized_matmul_op.cc
+++ b/tensorflow/core/kernels/quantized_matmul_op.cc
@@ -15,11 +15,14 @@ limitations under the License.
 
 // Implements a quantized eight-bit version of the matmul operation.
 
+#define EIGEN_USE_THREADS
+
 #include "public/gemmlowp.h"
-#include "tensorflow/core/kernels/quantization_utils.h"
-#include "tensorflow/core/kernels/reference_gemm.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/kernels/meta_support.h"
+#include "tensorflow/core/kernels/quantization_utils.h"
+#include "tensorflow/core/kernels/reference_gemm.h"
 #include "tensorflow/core/lib/core/errors.h"
 
 namespace tensorflow {
@@ -125,12 +128,20 @@ class QuantizedMatMulOp : public OpKernel {
     const size_t ldb = b.dim_size(1);
     const size_t ldc = n;
 
-    // The gemmlowp optimized library only works for a particular set of data
-    // types, so check if we meet those requirements and
-    // fall back to a slower reference implementation if not.
-    if (std::is_same<T1, quint8>() && std::is_same<T2, quint8>() &&
-        std::is_same<Toutput, qint32>() && (offset_c == 0) && (mult_c == 1) &&
-        (shift_c == 0) && (transpose_c == false)) {
+    if (meta::IsSupportedAndEnabled() && std::is_same<T1, quint8>() &&
+        std::is_same<T2, quint8>() && std::is_same<Toutput, qint32>() &&
+        (offset_c == 0) && (mult_c == 1) && (shift_c == 0) &&
+        (transpose_c == false)) {
+      // Gemmlowp/meta code path works on 32 & 64 bit Arm with NEON Simd and
+      // allows optimized quantized 8bit to 32bit gemm.
+      meta::QuantizedGemm(context, transpose_a_, transpose_b_, a_data, b_data,
+                          c_data, m, n, k, offset_a, offset_b, lda, ldb, ldc);
+    } else if (std::is_same<T1, quint8>() && std::is_same<T2, quint8>() &&
+               std::is_same<Toutput, qint32>() && (offset_c == 0) &&
+               (mult_c == 1) && (shift_c == 0) && (transpose_c == false)) {
+      // The gemmlowp optimized library only works for a particular set of data
+      // types, so check if we meet those requirements and fall back to a slower
+      // reference implementation if not.
       if (transpose_a_) {
         if (transpose_b_) {
           GemmlowpMultiply<true, true, false>(context, a_data, b_data, c_data,
author	A. Unique TensorFlower <gardener@tensorflow.org>	2016-10-25 12:45:42 -0800
committer	TensorFlower Gardener <gardener@tensorflow.org>	2016-10-25 13:54:46 -0700
commit	1f6a46596aa2c9c88800a84d85b2b785209a546d (patch)
tree	4706dd7890017534a5f419037fc127dfbc4fa113 /tensorflow/core/kernels/quantized_matmul_op.cc
parent	89a96067306258f6ed5eac5ea04801d0e6b213f9 (diff)