aboutsummaryrefslogtreecommitdiffhomepage
path: root/tensorflow/core/kernels/quantized_conv_ops.cc
diff options
context:
space:
mode:
authorGravatar A. Unique TensorFlower <gardener@tensorflow.org>2016-10-25 12:45:42 -0800
committerGravatar TensorFlower Gardener <gardener@tensorflow.org>2016-10-25 13:54:46 -0700
commit1f6a46596aa2c9c88800a84d85b2b785209a546d (patch)
tree4706dd7890017534a5f419037fc127dfbc4fa113 /tensorflow/core/kernels/quantized_conv_ops.cc
parent89a96067306258f6ed5eac5ea04801d0e6b213f9 (diff)
Arm32/64 kernel optimizations:
- QuantizeV2 - Dequantize - QuantizedBiasAdd - QuantizeDownAndShrinkRange - QuantizedRelu - QuantizedRelu6 - QuantizedMatMul - QuantizedConv The optimizations are controlled by three knobs: meta::SetEnabled(bool) -- turns the codepath on/off, on by default meta::SetUseLocalContext(bool) -- true -- the codepath will use its own internal fine-grained worker pool that offers a performance improvement over the standard TensorFlow worker pool. This worker pool is not compatible with other ops. Per-use-case performance testing is recommended. -- false (default) -- use the standard TensorFlow worker pool instance meta::SetNumThreads(int) -- number of compute threads when the internal worker pool is used. If 0, use intra_parallelism_count; if x > 0, use x threads. Change: 137197327
Diffstat (limited to 'tensorflow/core/kernels/quantized_conv_ops.cc')
-rw-r--r--tensorflow/core/kernels/quantized_conv_ops.cc27
1 files changed, 19 insertions, 8 deletions
diff --git a/tensorflow/core/kernels/quantized_conv_ops.cc b/tensorflow/core/kernels/quantized_conv_ops.cc
index fb69d770c0..2405c55c5b 100644
--- a/tensorflow/core/kernels/quantized_conv_ops.cc
+++ b/tensorflow/core/kernels/quantized_conv_ops.cc
@@ -18,12 +18,15 @@ limitations under the License.
#include <algorithm>
#include <vector>
+#define EIGEN_USE_THREADS
+
#include "public/gemmlowp.h"
-#include "tensorflow/core/kernels/quantization_utils.h"
-#include "tensorflow/core/kernels/reference_gemm.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/kernels/meta_support.h"
#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/kernels/quantization_utils.h"
+#include "tensorflow/core/kernels/reference_gemm.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/util/padding.h"
@@ -338,12 +341,20 @@ class Im2ColConvFunctor {
const int lda = filter_value_count;
const int ldb = filter_count;
const int ldc = filter_count;
- // The gemmlowp optimized library only works for a particular set of data
- // types, so check if we meet those requirements and
- // fall back to a slower reference implementation if not.
- if (std::is_same<T1, quint8>() && std::is_same<T2, quint8>() &&
- std::is_same<T3, qint32>() && (output_offset == 0) &&
- (output_mult == 1) && (output_shift == 0)) {
+
+ if (meta::IsSupportedAndEnabled() && std::is_same<T1, quint8>() &&
+ std::is_same<T2, quint8>() && std::is_same<T3, qint32>() &&
+ (output_offset == 0) && (output_mult == 1) && (output_shift == 0) &&
+ (transpose_c == false)) {
+ meta::QuantizedGemm(op_context, transpose_a, transpose_b,
+ im2col_buffer.get(), filter_data, output_data, m, n,
+ k, -input_offset, -filter_offset, lda, ldb, ldc);
+ } else if (std::is_same<T1, quint8>() && std::is_same<T2, quint8>() &&
+ std::is_same<T3, qint32>() && (output_offset == 0) &&
+ (output_mult == 1) && (output_shift == 0)) {
+ // The gemmlowp optimized library only works for a particular set of data
+ // types, so check if we meet those requirements and
+ // fall back to a slower reference implementation if not.
const uint8* im2col_data_as_uint8 = &(im2col_buffer.get()->value);
const uint8* filter_data_as_uint8 = &(filter_data->value);
int32* output_data_as_int32 = &(output_data->value);