Arm32/64 kernel optimizations:

- QuantizeV2
- Dequantize
- QuantizedBiasAdd
- QuantizeDownAndShrinkRange
- QuantizedRelu
- QuantizedRelu6
- QuantizedMatMul
- QuantizedConv

The optimizations are controlled by three knobs:

meta::SetEnabled(bool) -- turns the codepath on/off; on by default.

meta::SetUseLocalContext(bool)
  -- true -- the codepath will use its own internal fine-grained worker pool, which offers a performance improvement over the standard TensorFlow worker pool. This worker pool is not compatible with other ops. Per-use-case performance testing is recommended.
  -- false (default) -- use the standard TensorFlow worker pool instance.

meta::SetNumThreads(int) -- number of compute threads when the internal worker pool is used. If 0, use intra_parallelism_count; if x > 0, use x threads.

Change: 137448955
author: A. Unique TensorFlower <gardener@tensorflow.org> 2016-10-27 14:24:07 -0800
committer: TensorFlower Gardener <gardener@tensorflow.org> 2016-10-27 15:32:39 -0700
commit: 16cda320d92cfbfc6870140691ae2c5e6286688c (patch)
tree: 87a60a261560dd840f6be4c4fec89d15c5532da5 /tensorflow/core/kernels/quantized_bias_add_op.cc
parent: 71b993a63c9f4c62d45303623f926219066902cc (diff)
1 files changed, 21 insertions, 4 deletions
diff --git a/tensorflow/core/kernels/quantized_bias_add_op.cc b/tensorflow/core/kernels/quantized_bias_add_op.cc
index 0b34bfcad8..5457d290c2 100644
--- a/tensorflow/core/kernels/quantized_bias_add_op.cc
+++ b/tensorflow/core/kernels/quantized_bias_add_op.cc
@@ -15,11 +15,14 @@ limitations under the License.
 
 // Implements a quantized eight-bit version of the bias addition operation.
 
-#include "tensorflow/core/kernels/quantization_utils.h"
+#define EIGEN_USE_THREADS
+
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/kernels/meta_support.h"
 #include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/kernels/quantization_utils.h"
 #include "tensorflow/core/lib/core/errors.h"
 
 namespace tensorflow {
@@ -60,9 +63,23 @@ class QuantizedBiasAddOp : public OpKernel {
 
     float total_min;
     float total_max;
-    QuantizedAddUsingEigen<T1, T2, T3>(
-        context->template eigen_device<CPUDevice>(), input, input_min,
-        input_max, bias, bias_min, bias_max, output, &total_min, &total_max);
+
+    if (meta::IsSupportedAndEnabled() && std::is_same<T1, quint8>() &&
+        std::is_same<T2, quint8>() && std::is_same<T3, qint32>()) {
+      auto input_ui8_array = input.flat<quint8>();
+      auto bias_ui8_array = bias.flat<quint8>();
+      GetOutputMinAndMaxForQuantizedAdd(input_min, input_max, bias_min,
+                                        bias_max, &total_min, &total_max);
+      meta::QuantizedBiasAdd(context, input_ui8_array.data(),
+                             input_ui8_array.size(), bias_ui8_array.data(),
+                             bias_ui8_array.size(), input_min, input_max,
+                             bias_min, bias_max, total_min, total_max,
+                             output->flat<qint32>().data());
+    } else {
+      QuantizedAddUsingEigen<T1, T2, T3>(
+          context->template eigen_device<CPUDevice>(), input, input_min,
+          input_max, bias, bias_min, bias_max, output, &total_min, &total_max);
+    }
 
     Tensor* output_min = nullptr;
     OP_REQUIRES_OK(context, context->allocate_output(1, {}, &output_min));
author	A. Unique TensorFlower <gardener@tensorflow.org>	2016-10-27 14:24:07 -0800
committer	TensorFlower Gardener <gardener@tensorflow.org>	2016-10-27 15:32:39 -0700
commit	16cda320d92cfbfc6870140691ae2c5e6286688c (patch)
tree	87a60a261560dd840f6be4c4fec89d15c5532da5 /tensorflow/core/kernels/quantized_bias_add_op.cc
parent	71b993a63c9f4c62d45303623f926219066902cc (diff)