diff options
author | 2016-10-25 16:41:49 -0800 | |
---|---|---|
committer | 2016-10-25 17:50:36 -0700 | |
commit | 3e3633c8b5e2817d502de6dd892c5495cb5e85a3 (patch) | |
tree | 155a5e19087fabf81e6b1b84cf4cd08d707f6fbf /tensorflow/core/kernels/meta_support.cc | |
parent | 5271c1a05785d0e7a44d8c46951dfbce6e7e9662 (diff) |
Automated rollback of change 137197327
Change: 137225083
Diffstat (limited to 'tensorflow/core/kernels/meta_support.cc')
-rw-r--r-- | tensorflow/core/kernels/meta_support.cc | 373 |
1 files changed, 0 insertions, 373 deletions
diff --git a/tensorflow/core/kernels/meta_support.cc b/tensorflow/core/kernels/meta_support.cc deleted file mode 100644 index bd46506c71..0000000000 --- a/tensorflow/core/kernels/meta_support.cc +++ /dev/null @@ -1,373 +0,0 @@ -/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#define EIGEN_USE_THREADS - -#include "tensorflow/core/kernels/meta_support.h" - -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/kernels/quantization_utils.h" -#include "tensorflow/core/platform/logging.h" -#include "tensorflow/core/platform/mutex.h" - -#if (defined(GEMMLOWP_NEON_32) || defined(GEMMLOWP_NEON_64)) && \ - !defined(TENSORFLOW_DISABLE_META) -#define TENSORFLOW_USE_META (1) -#endif - -namespace tensorflow { -namespace meta { - -namespace { - -int g_num_threads = 0; -bool g_enabled = true; -bool g_use_local_context = false; - -#ifdef TENSORFLOW_USE_META - -uint8_t* GetScratch() { - static uint8_t* scratch = new uint8_t[2048 * 1024]; - return scratch; -} - -gemmlowp::WorkersPool* GetWorkersPool() { - static gemmlowp::WorkersPool* pool = new gemmlowp::WorkersPool(); - return pool; -} - -mutex& GetMutex() { - static mutex mu; - return mu; -} - -int GetWorkersCount(OpKernelContext* tf_context) { - if (g_num_threads == 0) { - return tf_context->device()->tensorflow_cpu_worker_threads()->num_threads; - } - return g_num_threads; -} - -typedef gemmlowp::meta::SimpleContext<gemmlowp::WorkersPool> LocalContext; - -template <typename Context, typename Params> -void MultiThreadGemm(Context* context, const Params& params) { - if (params.m <= 4) { - gemmlowp::meta::Gemm<gemmlowp::meta::GemmExecutorPackLHSCacheFriendly<>, - Params, 1, 8, 8>(params); - } else { - if (params.m >= params.n) { - gemmlowp::meta::MultiThreadGemm< - Context, gemmlowp::meta::GemmExecutorPackRHSCacheFriendly<>, Params, - 2, 4, 8>(context, params); - } else { - gemmlowp::meta::MultiThreadGemm< - Context, gemmlowp::meta::GemmExecutorPackLHSCacheFriendly<>, Params, - 2, 4, 8>(context, params); - } - } -} - -template <typename LeftStream, typename RightStream> -void QuantizedGemmImpl(OpKernelContext* tf_context, const quint8* a_data, - const quint8* b_data, qint32* c_data, int m, int n, - int k, int offset_a, int offset_b, int lda, int ldb, - int ldc) { - typedef gemmlowp::meta::GemmParams< - uint8_t, int32_t, LeftStream, RightStream, - gemmlowp::meta::QuantizedStaticPreprocessedAsInt32, - gemmlowp::meta::RowMajor> - Params; - Params params; - - params.m = m; - params.n = n; - params.k = k; - - params.lhs = reinterpret_cast<const uint8_t*>(&(a_data->value)); - params.rhs = reinterpret_cast<const uint8_t*>(&(b_data->value)); - params.result = reinterpret_cast<int32_t*>(&(c_data->value)); - params.scratch = GetScratch(); - - params.left_stream.count = k; - params.left_stream.stride = lda; - params.left_stream.multiplicative_sum_offset = offset_b; - params.left_stream.additive_sum_offset = k * offset_a * offset_b; - - params.right_stream.count = k; - params.right_stream.stride = ldb; - params.right_stream.multiplicative_sum_offset = offset_a; - params.right_stream.additive_sum_offset = 0; - - params.fused_kernel.kernel.count = k; - params.fused_kernel.output_stream.stride = ldc * sizeof(int32_t); - - if (g_use_local_context) { - LocalContext local_context(GetWorkersCount(tf_context), GetWorkersPool()); - MultiThreadGemm<LocalContext, Params>(&local_context, params); - } else { - auto& workers = *(tf_context->device()->tensorflow_cpu_worker_threads()); - TensorflowGemmContext context(workers.num_threads, workers.workers); - MultiThreadGemm<TensorflowGemmContext, Params>(&context, params); - } -} - -template <typename Params, int kernel_size> -void MultiThreadTransform1D(OpKernelContext* tf_context, const Params& params) { - if (g_use_local_context) { - LocalContext local_context(GetWorkersCount(tf_context), GetWorkersPool()); - gemmlowp::meta::MultiThreadTransform1D<LocalContext, Params, kernel_size>( - &local_context, params); - } else { - auto& workers = *(tf_context->device()->tensorflow_cpu_worker_threads()); - TensorflowGemmContext context(workers.num_threads, workers.workers); - gemmlowp::meta::MultiThreadTransform1D<TensorflowGemmContext, Params, - kernel_size>(&context, params); - } -} - -template <typename QuantizedType> -double CalculateRangeScale(float min, float max) { - const int bits = sizeof(QuantizedType) * 8; - return static_cast<double>(max - min) / - ((static_cast<int64_t>(1) << bits) - 1); -} - -template <typename QuantizedType> -double CalculateOneOverRangeScale(float min, float max) { - if (min == max) { - return 0.0; - } - const int bits = sizeof(QuantizedType) * 8; - return static_cast<double>((static_cast<int64_t>(1) << bits) - 1) / - (max - min); -} - -#endif // TENSORFLOW_USE_META - -} // namespace - -void SetNumThreads(int num_threads) { g_num_threads = num_threads; } - -int GetNumThreads() { return g_num_threads; } - -void SetUseLocalContext(bool use_local_context) { - g_use_local_context = use_local_context; -} - -bool GetUseLocalContext() { return g_use_local_context; } - -bool IsSupported() { -#if defined(TENSORFLOW_USE_META) - return true; -#else - return false; -#endif -} - -bool IsEnabled() { return g_enabled; } - -void SetEnabled(bool enabled) { g_enabled = enabled; } - -bool IsSupportedAndEnabled() { return IsSupported() && IsEnabled(); } - -void QuantizedGemm(OpKernelContext* tf_context, bool transpose_a, - bool transpose_b, const quint8* a_data, const quint8* b_data, - qint32* c_data, int m, int n, int k, int offset_a, - int offset_b, int lda, int ldb, int ldc) { -#ifdef TENSORFLOW_USE_META - mutex_lock library_lock(GetMutex()); - if (transpose_a) { - if (transpose_b) { - QuantizedGemmImpl<gemmlowp::meta::ColumnMajorWithSum, - gemmlowp::meta::RowMajorWithSum>( - tf_context, a_data, b_data, c_data, m, n, k, offset_a, offset_b, lda, - ldb, ldc); - } else { - QuantizedGemmImpl<gemmlowp::meta::ColumnMajorWithSum, - gemmlowp::meta::ColumnMajorWithSum>( - tf_context, a_data, b_data, c_data, m, n, k, offset_a, offset_b, lda, - ldb, ldc); - } - } else { - if (transpose_b) { - QuantizedGemmImpl<gemmlowp::meta::RowMajorWithSum, - gemmlowp::meta::RowMajorWithSum>( - tf_context, a_data, b_data, c_data, m, n, k, offset_a, offset_b, lda, - ldb, ldc); - } else { - QuantizedGemmImpl<gemmlowp::meta::RowMajorWithSum, - gemmlowp::meta::ColumnMajorWithSum>( - tf_context, a_data, b_data, c_data, m, n, k, offset_a, offset_b, lda, - ldb, ldc); - } - } -#else - LOG(FATAL) << "QuantizedGemm: Meta fastpath not supported."; -#endif -} - -void Requantize(OpKernelContext* tf_context, const qint32* input, int count, - float input_min, float input_max, float output_min, - float output_max, quint8* output) { -#ifdef TENSORFLOW_USE_META - mutex_lock library_lock(GetMutex()); - typedef gemmlowp::meta::Transform1DParams<int32_t, uint8_t, - gemmlowp::meta::Requantize> - Params; - - Params params; - params.input = reinterpret_cast<const int32_t*>(input); - params.output = reinterpret_cast<uint8_t*>(output); - params.kernel.count = count; - params.kernel.input_range_min = input_min; - params.kernel.output_range_min = output_min; - params.kernel.input_range_scale = - CalculateRangeScale<int32_t>(input_min, input_max); - params.kernel.one_over_output_range_scale = - CalculateOneOverRangeScale<uint8_t>(output_min, output_max); - params.kernel.input_range_offset = - static_cast<float>(std::numeric_limits<int32_t>::lowest()); - - // After adding the output_range_offset the value is cast from float to uint. - // The float to int/uint cast in NEON uses round toward 0. To keep the - // rounding consistent with Eigen, which uses round toward closest, we can - // add 0.5f and exploit the fact that we only operate on non negative values. - // TODO(maciekc): fix the actual kernel in gemmlowp/meta - params.kernel.output_range_offset = - static_cast<float>(std::numeric_limits<uint8_t>::lowest()) + 0.5f; - - MultiThreadTransform1D<Params, 16>(tf_context, params); -#else - LOG(FATAL) << "Requantize: Meta fastpath not supported."; -#endif -} - -void Dequantize(OpKernelContext* tf_context, const quint8* input, int count, - float range_min, float range_max, float* output) { -#ifdef TENSORFLOW_USE_META - mutex_lock library_lock(GetMutex()); - typedef gemmlowp::meta::Transform1DParams<uint8_t, float, - gemmlowp::meta::Dequantize> - Params; - - Params params; - params.input = reinterpret_cast<const uint8_t*>(input); - params.output = reinterpret_cast<float*>(output); - params.kernel.count = count; - params.kernel.range_min = range_min; - params.kernel.range_scale = - CalculateRangeScale<uint8_t>(range_min, range_max); - params.kernel.range_offset = - static_cast<float>(std::numeric_limits<uint8_t>::lowest()); - - MultiThreadTransform1D<Params, 16>(tf_context, params); -#else - LOG(FATAL) << "Dequantize: Meta fastpath not supported."; -#endif -} - -void Quantize(OpKernelContext* tf_context, const float* input, int count, - float range_min, float range_max, quint8* output) { -#ifdef TENSORFLOW_USE_META - mutex_lock library_lock(GetMutex()); - typedef gemmlowp::meta::Transform1DParams<float, uint8_t, - gemmlowp::meta::Quantize> - Params; - - Params params; - params.input = reinterpret_cast<const float*>(input); - params.output = reinterpret_cast<uint8_t*>(output); - params.kernel.count = count; - params.kernel.range_min = range_min; - params.kernel.range_scale = - CalculateOneOverRangeScale<uint8_t>(range_min, range_max); - - // After adding the range_offset the value is cast from float to uint. - // The float to int/uint cast in NEON uses round toward 0. To keep the - // rounding consistent with Eigen, which uses round toward closest, we can - // add 0.5f and exploit the fact that we only operate on non negative values. - // TODO(maciekc): fix the the actual kernel in gemmlowp/meta - params.kernel.range_offset = - static_cast<float>(std::numeric_limits<uint8_t>::lowest()) + 0.5f; - - MultiThreadTransform1D<Params, 16>(tf_context, params); -#else - LOG(FATAL) << "Quantize: Meta fastpath not supported."; -#endif -} - -void QuantizedBiasAdd(OpKernelContext* tf_context, const quint8* input, - int input_count, const quint8* bias, int bias_count, - float input_min, float input_max, float bias_min, - float bias_max, float output_min, float output_max, - qint32* output) { -#ifdef TENSORFLOW_USE_META - mutex_lock library_lock(GetMutex()); - typedef gemmlowp::meta::Transform1DParams<uint8_t, int32_t, - gemmlowp::meta::BiasAdd<uint8_t>> - Params; - - Params params; - params.input = reinterpret_cast<const uint8_t*>(input); - params.output = reinterpret_cast<int32_t*>(output); - params.kernel.bias = reinterpret_cast<const uint8_t*>(bias); - params.kernel.count = bias_count; - params.kernel.rows = input_count / bias_count; - params.kernel.input_range_min = input_min; - params.kernel.bias_range_min = bias_min; - params.kernel.input_range_scale = - CalculateRangeScale<uint8_t>(input_min, input_max); - params.kernel.bias_range_scale = - CalculateRangeScale<uint8_t>(bias_min, bias_max); - params.kernel.input_range_offset = 0; - params.kernel.bias_range_offset = 0; - params.kernel.output_range_min = output_min; - params.kernel.one_over_output_range_scale = - CalculateOneOverRangeScale<int32_t>(output_min, output_max); - params.kernel.output_range_offset = - static_cast<float>(std::numeric_limits<int32_t>::lowest()); - - // TODO(maciekc): add multithreading to bias add. - // Right now this kernel does not support multi threaded execution. - gemmlowp::meta::Transform1D<Params, 16>(params); -#else - LOG(FATAL) << "QuantizedBiasAdd: Meta fastpath not supported."; -#endif -} - -void Clamp(OpKernelContext* tf_context, const quint8* input, int count, - quint8 clamp_min, quint8 clamp_max, quint8* output) { -#ifdef TENSORFLOW_USE_META - mutex_lock library_lock(GetMutex()); - typedef gemmlowp::meta::Transform1DParams<uint8_t, uint8_t, - gemmlowp::meta::MinMax<uint8_t>> - Params; - - Params params; - params.input = reinterpret_cast<const uint8_t*>(input); - params.output = reinterpret_cast<uint8_t*>(output); - params.kernel.count = count; - params.kernel.min = clamp_min; - params.kernel.max = clamp_max; - - MultiThreadTransform1D<Params, 16>(tf_context, params); -#else - LOG(FATAL) << "Clamp: Meta fastpath not supported."; -#endif -} - -} // namespace meta -} // namespace tensorflow |