author     Pete Warden <petewarden@google.com>              2018-09-24 15:54:32 -0700
committer  TensorFlower Gardener <gardener@tensorflow.org>  2018-09-24 16:02:13 -0700
commit     1ff157d82dac29f5a3a3197b2664208f6ed6ba06 (patch)
tree       c751f5a665a27c660809c4884eb07f69b4983244 /tensorflow/contrib/lite/kernels
parent     9c58005ec86297a1d0a17dc4f7ad7cbae9c47e4b (diff)
Portability preparation for more cross-platform prototyping.
PiperOrigin-RevId: 214346240
Diffstat (limited to 'tensorflow/contrib/lite/kernels')
9 files changed, 756 insertions, 601 deletions
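The core of the change is visible in compatibility.h below: the TFLITE_DCHECK/TFLITE_CHECK macros stop calling assert()/abort() directly and route through op_macros.h instead, so a bare-metal target can supply its own failure behavior. As a minimal sketch of the pattern this prepares for — the TfLiteHalt() fallback and the TF_LITE_MCU_DEBUG_LOG switch are illustrative assumptions, not the contents of op_macros.h:

    // check_macro_sketch.cc -- illustrative only, not part of the commit.
    #ifdef TF_LITE_MCU_DEBUG_LOG
    // Hypothetical bare-metal fallback: no fprintf/abort available, so spin
    // forever where a debugger can catch the failure.
    inline void TfLiteHalt() {
      while (true) {
      }
    }
    #define TFLITE_ABORT TfLiteHalt()
    #define TFLITE_ASSERT_FALSE TFLITE_ABORT
    #else
    #include <cassert>
    #include <cstdlib>
    #define TFLITE_ABORT abort()
    #define TFLITE_ASSERT_FALSE assert(false)
    #endif

    // Usage is unchanged for kernel code either way:
    #define TFLITE_DCHECK(condition) (condition) ? (void)0 : TFLITE_ASSERT_FALSE

    int main() {
      int dims = 4;
      TFLITE_DCHECK(dims <= 4);  // no-op when the condition holds
      return 0;
    }

Because the ternary keeps TFLITE_DCHECK an expression rather than a statement, existing call sites compile unchanged no matter which failure path is selected.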
diff --git a/tensorflow/contrib/lite/kernels/internal/BUILD b/tensorflow/contrib/lite/kernels/internal/BUILD
index 195474e7fd..afb5ec05df 100644
--- a/tensorflow/contrib/lite/kernels/internal/BUILD
+++ b/tensorflow/contrib/lite/kernels/internal/BUILD
@@ -43,7 +43,10 @@ cc_library(
         "compatibility.h",
         "types.h",
     ],
-    deps = ["@com_google_absl//absl/base:core_headers"],
+    deps = [
+        "//tensorflow/contrib/lite/kernels:op_macros",
+        "@com_google_absl//absl/base:core_headers",
+    ],
 )
 
 config_setting(
@@ -260,6 +263,7 @@ cc_library(
     deps = [
         ":round",
         ":types",
+        "//tensorflow/contrib/lite/kernels:op_macros",
     ],
 )
 
@@ -291,7 +295,9 @@ cc_library(
         "common.h",
         "reference/depthwiseconv_float.h",
         "reference/depthwiseconv_uint8.h",
+        "reference/fully_connected.h",
         "reference/reference_ops.h",
+        "reference/softmax.h",
     ],
     deps = [
         ":quantization_util",
@@ -300,6 +306,7 @@ cc_library(
         ":types",
         "@gemmlowp",
         "//tensorflow/contrib/lite/c:c_api_internal",
+        "//tensorflow/contrib/lite/kernels:op_macros",
     ] + select({
         ":haswell": tflite_deps_intel,
         ":ios_x86_64": tflite_deps_intel,
@@ -320,8 +327,10 @@ cc_library(
         "common.h",
         "reference/depthwiseconv_float.h",
         "reference/depthwiseconv_uint8.h",
+        "reference/fully_connected.h",
         "reference/legacy_reference_ops.h",
         "reference/reference_ops.h",
+        "reference/softmax.h",
     ],
     deps = [
         ":quantization_util",
@@ -330,6 +339,7 @@ cc_library(
         ":types",
         "@gemmlowp",
         "//tensorflow/contrib/lite/c:c_api_internal",
+        "//tensorflow/contrib/lite/kernels:op_macros",
     ] + select({
         ":haswell": tflite_deps_intel,
         ":ios_x86_64": tflite_deps_intel,
@@ -462,6 +472,7 @@ cc_library(
         "@com_google_absl//absl/base:core_headers",
         "//tensorflow/contrib/lite/c:c_api_internal",
         "@arm_neon_2_x86_sse",
+        "//tensorflow/contrib/lite/kernels:op_macros",
         "@gemmlowp",
     ] + select({
         ":arm": [
diff --git a/tensorflow/contrib/lite/kernels/internal/compatibility.h b/tensorflow/contrib/lite/kernels/internal/compatibility.h
index 93fc6b6a76..b87cf2b60d 100644
--- a/tensorflow/contrib/lite/kernels/internal/compatibility.h
+++ b/tensorflow/contrib/lite/kernels/internal/compatibility.h
@@ -15,65 +15,65 @@ limitations under the License.
 #ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_COMPATIBILITY_H_
 #define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_COMPATIBILITY_H_
 
-#include <cassert>
 #include <cstdint>
-#include <cstdlib>
+
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
 
 #ifndef TFLITE_DCHECK
-#define TFLITE_DCHECK(condition) (condition) ? (void)0 : assert(false)
+#define TFLITE_DCHECK(condition) (condition) ? (void)0 : TFLITE_ASSERT_FALSE
 #endif
 
 #ifndef TFLITE_DCHECK_EQ
-#define TFLITE_DCHECK_EQ(x, y) ((x) == (y)) ? (void)0 : assert(false)
+#define TFLITE_DCHECK_EQ(x, y) ((x) == (y)) ? (void)0 : TFLITE_ASSERT_FALSE
 #endif
 
 #ifndef TFLITE_DCHECK_NE
-#define TFLITE_DCHECK_NE(x, y) ((x) != (y)) ? (void)0 : assert(false)
+#define TFLITE_DCHECK_NE(x, y) ((x) != (y)) ? (void)0 : TFLITE_ASSERT_FALSE
 #endif
 
 #ifndef TFLITE_DCHECK_GE
-#define TFLITE_DCHECK_GE(x, y) ((x) >= (y)) ? (void)0 : assert(false)
+#define TFLITE_DCHECK_GE(x, y) ((x) >= (y)) ? (void)0 : TFLITE_ASSERT_FALSE
 #endif
 
 #ifndef TFLITE_DCHECK_GT
-#define TFLITE_DCHECK_GT(x, y) ((x) > (y)) ? (void)0 : assert(false)
+#define TFLITE_DCHECK_GT(x, y) ((x) > (y)) ? (void)0 : TFLITE_ASSERT_FALSE
 #endif
 
 #ifndef TFLITE_DCHECK_LE
-#define TFLITE_DCHECK_LE(x, y) ((x) <= (y)) ? (void)0 : assert(false)
+#define TFLITE_DCHECK_LE(x, y) ((x) <= (y)) ? (void)0 : TFLITE_ASSERT_FALSE
 #endif
 
 #ifndef TFLITE_DCHECK_LT
-#define TFLITE_DCHECK_LT(x, y) ((x) < (y)) ? (void)0 : assert(false)
+#define TFLITE_DCHECK_LT(x, y) ((x) < (y)) ? (void)0 : TFLITE_ASSERT_FALSE
 #endif
 
 // TODO(ahentz): Clean up: We should stick to the DCHECK versions.
 #ifndef TFLITE_CHECK
-#define TFLITE_CHECK(condition) (condition) ? (void)0 : abort()
+#define TFLITE_CHECK(condition) (condition) ? (void)0 : TFLITE_ABORT
 #endif
 
 #ifndef TFLITE_CHECK_EQ
-#define TFLITE_CHECK_EQ(x, y) ((x) == (y)) ? (void)0 : abort()
+#define TFLITE_CHECK_EQ(x, y) ((x) == (y)) ? (void)0 : TFLITE_ABORT
 #endif
 
 #ifndef TFLITE_CHECK_NE
-#define TFLITE_CHECK_NE(x, y) ((x) != (y)) ? (void)0 : abort()
+#define TFLITE_CHECK_NE(x, y) ((x) != (y)) ? (void)0 : TFLITE_ABORT
 #endif
 
 #ifndef TFLITE_CHECK_GE
-#define TFLITE_CHECK_GE(x, y) ((x) >= (y)) ? (void)0 : abort()
+#define TFLITE_CHECK_GE(x, y) ((x) >= (y)) ? (void)0 : TFLITE_ABORT
 #endif
 
 #ifndef TFLITE_CHECK_GT
-#define TFLITE_CHECK_GT(x, y) ((x) > (y)) ? (void)0 : abort()
+#define TFLITE_CHECK_GT(x, y) ((x) > (y)) ? (void)0 : TFLITE_ABORT
 #endif
 
 #ifndef TFLITE_CHECK_LE
-#define TFLITE_CHECK_LE(x, y) ((x) <= (y)) ? (void)0 : abort()
+#define TFLITE_CHECK_LE(x, y) ((x) <= (y)) ? (void)0 : TFLITE_ABORT
 #endif
 
 #ifndef TFLITE_CHECK_LT
-#define TFLITE_CHECK_LT(x, y) ((x) < (y)) ? (void)0 : abort()
+#define TFLITE_CHECK_LT(x, y) ((x) < (y)) ? (void)0 : TFLITE_ABORT
 #endif
 
 // TODO(ahentz): Clean up.
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/depthwiseconv_uint8.h b/tensorflow/contrib/lite/kernels/internal/reference/depthwiseconv_uint8.h
index ecc655cf99..e8fc566502 100644
--- a/tensorflow/contrib/lite/kernels/internal/reference/depthwiseconv_uint8.h
+++ b/tensorflow/contrib/lite/kernels/internal/reference/depthwiseconv_uint8.h
@@ -18,7 +18,6 @@ limitations under the License.
 #include <algorithm>
 
 #include "fixedpoint/fixedpoint.h"
-#include "public/gemmlowp.h"
 #include "tensorflow/contrib/lite/kernels/internal/common.h"
 #include "tensorflow/contrib/lite/kernels/internal/compatibility.h"
 #include "tensorflow/contrib/lite/kernels/internal/types.h"
@@ -35,7 +34,6 @@ inline void DepthwiseConv(
     const uint8* filter_data, const RuntimeShape& bias_shape,
     const int32* bias_data, const RuntimeShape& output_shape,
     uint8* output_data) {
-  gemmlowp::ScopedProfilingLabel label("DepthwiseConv/8bit");
   const int stride_width = params.stride_width;
   const int stride_height = params.stride_height;
   const int dilation_width_factor = params.dilation_width_factor;
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/fully_connected.h b/tensorflow/contrib/lite/kernels/internal/reference/fully_connected.h
new file mode 100644
index 0000000000..23325e8c4c
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/internal/reference/fully_connected.h
@@ -0,0 +1,460 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_FULLY_CONNECTED_H_
+#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_FULLY_CONNECTED_H_
+
+#include "fixedpoint/fixedpoint.h"
+#include "tensorflow/contrib/lite/kernels/internal/common.h"
+#include "tensorflow/contrib/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/contrib/lite/kernels/internal/round.h"
+#include "tensorflow/contrib/lite/kernels/internal/types.h"
+
+namespace tflite {
+namespace reference_ops {
+
+const int kReverseShift = -1;
+
+inline void FullyConnected(
+    const FullyConnectedParams& params, const RuntimeShape& input_shape,
+    const float* input_data, const RuntimeShape& weights_shape,
+    const float* weights_data, const RuntimeShape& bias_shape,
+    const float* bias_data, const RuntimeShape& output_shape,
+    float* output_data) {
+  const float output_activation_min = params.float_activation_min;
+  const float output_activation_max = params.float_activation_max;
+  // TODO(benoitjacob): This really should be:
+  //     const int batches = ArraySize(output_dims, 1);
+  // but the current --variable_batch hack consists in overwriting the 3rd
+  // dimension with the runtime batch size, as we don't keep track for each
+  // array of which dimension is the batch dimension in it.
+  const int output_dims_count = output_shape.DimensionsCount();
+  const int weights_dims_count = weights_shape.DimensionsCount();
+  const int batches = FlatSizeSkipDim(output_shape, output_dims_count - 1);
+  const int output_depth = MatchingDim(weights_shape, weights_dims_count - 2,
+                                       output_shape, output_dims_count - 1);
+  const int accum_depth = weights_shape.Dims(weights_dims_count - 1);
+  for (int b = 0; b < batches; ++b) {
+    for (int out_c = 0; out_c < output_depth; ++out_c) {
+      float total = 0.f;
+      for (int d = 0; d < accum_depth; ++d) {
+        total += input_data[b * accum_depth + d] *
+                 weights_data[out_c * accum_depth + d];
+      }
+      float bias_value = 0.0f;
+      if (bias_data) {
+        bias_value = bias_data[out_c];
+      }
+      output_data[out_c + output_depth * b] = ActivationFunctionWithMinMax(
+          total + bias_value, output_activation_min, output_activation_max);
+    }
+  }
+}
+
+// TODO(b/80418076): Move to legacy ops file, update invocations.
+// Legacy.
+inline void FullyConnected(const float* input_data, const Dims<4>& input_dims,
+                           const float* weights_data,
+                           const Dims<4>& weights_dims, const float* bias_data,
+                           const Dims<4>& bias_dims,
+                           float output_activation_min,
+                           float output_activation_max, float* output_data,
+                           const Dims<4>& output_dims) {
+  tflite::FullyConnectedParams op_params;
+  op_params.float_activation_min = output_activation_min;
+  op_params.float_activation_max = output_activation_max;
+
+  FullyConnected(op_params, DimsToShape(input_dims), input_data,
+                 DimsToShape(weights_dims), weights_data,
+                 DimsToShape(bias_dims), bias_data, DimsToShape(output_dims),
+                 output_data);
+}
+
+// TODO(b/80418076): Move to legacy ops file, update invocations.
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void FullyConnected(const float* input_data, const Dims<4>& input_dims,
+                    const float* weights_data, const Dims<4>& weights_dims,
+                    const float* bias_data, const Dims<4>& bias_dims,
+                    float* output_data, const Dims<4>& output_dims) {
+  float output_activation_min, output_activation_max;
+  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
+  FullyConnected(input_data, input_dims, weights_data, weights_dims, bias_data,
+                 bias_dims, output_activation_min, output_activation_max,
+                 output_data, output_dims);
+}
+
+inline void FullyConnected(
+    const FullyConnectedParams& params, const RuntimeShape& input_shape,
+    const uint8* input_data, const RuntimeShape& filter_shape,
+    const uint8* filter_data, const RuntimeShape& bias_shape,
+    const int32* bias_data, const RuntimeShape& output_shape,
+    uint8* output_data, void* gemm_context) {
+  (void)gemm_context;  // only used in optimized code.
+  const int32 input_offset = params.input_offset;
+  const int32 filter_offset = params.weights_offset;
+  const int32 output_offset = params.output_offset;
+  const int32 output_multiplier = params.output_multiplier;
+  const int output_shift = params.output_shift;
+  const int32 output_activation_min = params.quantized_activation_min;
+  const int32 output_activation_max = params.quantized_activation_max;
+  TFLITE_DCHECK_GE(filter_shape.DimensionsCount(), 2);
+  TFLITE_DCHECK_GE(output_shape.DimensionsCount(), 1);
+
+  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+  // TODO(benoitjacob): This really should be:
+  //     const int batches = ArraySize(output_dims, 1);
+  // but the current --variable_batch hack consists in overwriting the 3rd
+  // dimension with the runtime batch size, as we don't keep track for each
+  // array of which dimension is the batch dimension in it.
+  const int output_dim_count = output_shape.DimensionsCount();
+  const int filter_dim_count = filter_shape.DimensionsCount();
+  const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1);
+  const int output_depth = MatchingDim(filter_shape, filter_dim_count - 2,
+                                       output_shape, output_dim_count - 1);
+  const int accum_depth = filter_shape.Dims(filter_dim_count - 1);
+  for (int b = 0; b < batches; ++b) {
+    for (int out_c = 0; out_c < output_depth; ++out_c) {
+      int32 acc = 0;
+      for (int d = 0; d < accum_depth; ++d) {
+        int32 input_val = input_data[b * accum_depth + d];
+        int32 filter_val = filter_data[out_c * accum_depth + d];
+        acc += (filter_val + filter_offset) * (input_val + input_offset);
+      }
+      if (bias_data) {
+        acc += bias_data[out_c];
+      }
+      acc = MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift);
+      acc += output_offset;
+      acc = std::max(acc, output_activation_min);
+      acc = std::min(acc, output_activation_max);
+      output_data[out_c + output_depth * b] = static_cast<uint8>(acc);
+    }
+  }
+}
+
+// TODO(b/80418076): Move to legacy ops file, update invocations.
+// Legacy.
+inline void FullyConnected(const uint8* input_data, const Dims<4>& input_dims,
+                           int32 input_offset, const uint8* filter_data,
+                           const Dims<4>& filter_dims, int32 filter_offset,
+                           const int32* bias_data, const Dims<4>& bias_dims,
+                           int32 output_offset, int32 output_multiplier,
+                           int output_shift, int32 output_activation_min,
+                           int32 output_activation_max, uint8* output_data,
+                           const Dims<4>& output_dims, void* gemm_context) {
+  tflite::FullyConnectedParams op_params;
+  op_params.input_offset = input_offset;
+  op_params.weights_offset = filter_offset;
+  op_params.output_offset = output_offset;
+  op_params.output_multiplier = output_multiplier;
+  // Legacy ops used mixed left and right shifts. Now all are +ve-means-left.
+  op_params.output_shift = kReverseShift * output_shift;
+  op_params.quantized_activation_min = output_activation_min;
+  op_params.quantized_activation_max = output_activation_max;
+
+  FullyConnected(op_params, DimsToShape(input_dims), input_data,
+                 DimsToShape(filter_dims), filter_data, DimsToShape(bias_dims),
+                 bias_data, DimsToShape(output_dims), output_data,
+                 gemm_context);
+}
+
+inline void FullyConnected(
+    const FullyConnectedParams& params, const RuntimeShape& input_shape,
+    const uint8* input_data, const RuntimeShape& filter_shape,
+    const uint8* filter_data, const RuntimeShape& bias_shape,
+    const int32* bias_data, const RuntimeShape& output_shape,
+    int16* output_data, void* gemm_context) {
+  (void)gemm_context;  // only used in optimized code.
+  const int32 input_offset = params.input_offset;
+  const int32 filter_offset = params.weights_offset;
+  const int32 output_offset = params.output_offset;
+  const int32 output_multiplier = params.output_multiplier;
+  const int output_shift = params.output_shift;
+  const int32 output_activation_min = params.quantized_activation_min;
+  const int32 output_activation_max = params.quantized_activation_max;
+
+  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+  TFLITE_DCHECK_EQ(output_offset, 0);
+  // TODO(benoitjacob): This really should be:
+  //     const int batches = ArraySize(output_dims, 1);
+  // but the current --variable_batch hack consists in overwriting the 3rd
+  // dimension with the runtime batch size, as we don't keep track for each
+  // array of which dimension is the batch dimension in it.
+  const int output_dim_count = output_shape.DimensionsCount();
+  const int filter_dim_count = filter_shape.DimensionsCount();
+  const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1);
+  const int output_depth = MatchingDim(filter_shape, filter_dim_count - 2,
+                                       output_shape, output_dim_count - 1);
+  const int accum_depth = filter_shape.Dims(filter_dim_count - 1);
+  for (int b = 0; b < batches; ++b) {
+    for (int out_c = 0; out_c < output_depth; ++out_c) {
+      // Internal accumulation.
+      // Initialize accumulator with the bias-value.
+      int32 accum = bias_data[out_c];
+      // Accumulation loop.
+      for (int d = 0; d < accum_depth; ++d) {
+        int16 input_val = input_data[b * accum_depth + d] + input_offset;
+        int16 filter_val = filter_data[out_c * accum_depth + d] + filter_offset;
+        accum += filter_val * input_val;
+      }
+      // Down-scale the final int32 accumulator to the scale used by our
+      // (16-bit, typically 3 integer bits) fixed-point format. The quantized
+      // multiplier and shift here have been pre-computed offline
+      // (e.g. by toco).
+      accum =
+          MultiplyByQuantizedMultiplier(accum, output_multiplier, output_shift);
+      // Saturate, cast to int16, and store to output array.
+      accum = std::max(accum, output_activation_min - output_offset);
+      accum = std::min(accum, output_activation_max - output_offset);
+      accum += output_offset;
+      output_data[out_c + output_depth * b] = accum;
+    }
+  }
+}
+
+// TODO(b/80418076): Move to legacy ops file, update invocations.
+// Legacy.
+inline void FullyConnected(const uint8* input_data, const Dims<4>& input_dims,
+                           int32 input_offset, const uint8* filter_data,
+                           const Dims<4>& filter_dims, int32 filter_offset,
+                           const int32* bias_data, const Dims<4>& bias_dims,
+                           int32 output_offset, int32 output_multiplier,
+                           int output_shift, int32 output_activation_min,
+                           int32 output_activation_max, int16* output_data,
+                           const Dims<4>& output_dims, void* gemm_context) {
+  tflite::FullyConnectedParams op_params;
+  op_params.input_offset = input_offset;
+  op_params.weights_offset = filter_offset;
+  op_params.output_offset = output_offset;
+  op_params.output_multiplier = output_multiplier;
+  // Legacy ops used mixed left and right shifts. Now all are +ve-means-left.
+  op_params.output_shift = kReverseShift * output_shift;
+  op_params.quantized_activation_min = output_activation_min;
+  op_params.quantized_activation_max = output_activation_max;
+
+  FullyConnected(op_params, DimsToShape(input_dims), input_data,
+                 DimsToShape(filter_dims), filter_data, DimsToShape(bias_dims),
+                 bias_data, DimsToShape(output_dims), output_data,
+                 gemm_context);
+}
+
+inline void ShuffledFullyConnected(
+    const FullyConnectedParams& params, const RuntimeShape& input_shape,
+    const uint8* input_data, const RuntimeShape& weights_shape,
+    const uint8* shuffled_weights_data, const RuntimeShape& bias_shape,
+    const int32* bias_data, const RuntimeShape& output_shape,
+    int16* output_data, uint8* shuffled_input_workspace_data,
+    void* gemm_context) {
+  (void)gemm_context;  // only used in optimized code.
+  const int32 output_multiplier = params.output_multiplier;
+  const int output_shift = params.output_shift;
+  const int32 output_activation_min = params.quantized_activation_min;
+  const int32 output_activation_max = params.quantized_activation_max;
+  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+
+  TFLITE_DCHECK_GE(input_shape.DimensionsCount(), 1);
+  TFLITE_DCHECK_GE(weights_shape.DimensionsCount(), 2);
+  TFLITE_DCHECK_GE(output_shape.DimensionsCount(), 1);
+  // TODO(benoitjacob): This really should be:
+  //     const int batches = ArraySize(output_dims, 1);
+  // but the current --variable_batch hack consists in overwriting the 3rd
+  // dimension with the runtime batch size, as we don't keep track for each
+  // array of which dimension is the batch dimension in it.
+  const int output_dim_count = output_shape.DimensionsCount();
+  const int weights_dim_count = weights_shape.DimensionsCount();
+  const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1);
+  const int output_depth = MatchingDim(weights_shape, weights_dim_count - 2,
+                                       output_shape, output_dim_count - 1);
+  const int accum_depth = weights_shape.Dims(weights_dim_count - 1);
+  TFLITE_DCHECK((accum_depth % 16) == 0);
+  TFLITE_DCHECK((output_depth % 4) == 0);
+
+  // Shuffling and xoring of input activations into the workspace buffer
+  uint8* shuffled_input_workspace_ptr = shuffled_input_workspace_data;
+  if (batches == 1) {
+    for (int i = 0; i < accum_depth; i++) {
+      shuffled_input_workspace_data[i] = input_data[i] ^ 0x80;
+    }
+  } else if (batches == 4) {
+    for (int c = 0; c < accum_depth; c += 16) {
+      for (int b = 0; b < 4; b++) {
+        const uint8* src_data_ptr = input_data + b * accum_depth + c;
+        for (int j = 0; j < 16; j++) {
+          uint8 src_val = *src_data_ptr++;
+          // Flip the sign bit, so that the kernel will only need to
+          // reinterpret these uint8 values as int8, getting for free the
+          // subtraction of the zero_point value 128.
+          uint8 dst_val = src_val ^ 0x80;
+          *shuffled_input_workspace_ptr++ = dst_val;
+        }
+      }
+    }
+  } else {
+    TFLITE_DCHECK(false);
+    return;
+  }
+
+  // Actual computation
+  if (batches == 1) {
+    int16* output_ptr = output_data;
+    // Shuffled weights have had their sign bit (0x80) pre-flipped (xor'd)
+    // so that just reinterpreting them as int8 values is equivalent to
+    // subtracting 128 from them, thus implementing for free the subtraction of
+    // the zero_point value 128.
+    const int8* shuffled_weights_ptr =
+        reinterpret_cast<const int8*>(shuffled_weights_data);
+    // Likewise, we preshuffled and pre-xored the input data above.
+    const int8* shuffled_input_data =
+        reinterpret_cast<const int8*>(shuffled_input_workspace_data);
+    for (int c = 0; c < output_depth; c += 4) {
+      // Internal accumulation.
+      // Initialize accumulator with the bias-value.
+      int32 accum[4] = {0};
+      // Accumulation loop.
+      for (int d = 0; d < accum_depth; d += 16) {
+        for (int i = 0; i < 4; i++) {
+          for (int j = 0; j < 16; j++) {
+            int8 input_val = shuffled_input_data[d + j];
+            int8 weights_val = *shuffled_weights_ptr++;
+            accum[i] += weights_val * input_val;
+          }
+        }
+      }
+      for (int i = 0; i < 4; i++) {
+        // Add bias value
+        int32 acc = accum[i] + bias_data[c + i];
+        // Down-scale the final int32 accumulator to the scale used by our
+        // (16-bit, typically 3 integer bits) fixed-point format. The quantized
+        // multiplier and shift here have been pre-computed offline
+        // (e.g. by toco).
+        acc =
+            MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift);
+        // Saturate, cast to int16, and store to output array.
+        acc = std::max(acc, output_activation_min);
+        acc = std::min(acc, output_activation_max);
+        output_ptr[c + i] = acc;
+      }
+    }
+  } else if (batches == 4) {
+    int16* output_ptr = output_data;
+    // Shuffled weights have had their sign bit (0x80) pre-flipped (xor'd)
+    // so that just reinterpreting them as int8 values is equivalent to
+    // subtracting 128 from them, thus implementing for free the subtraction of
+    // the zero_point value 128.
+    const int8* shuffled_weights_ptr =
+        reinterpret_cast<const int8*>(shuffled_weights_data);
+    // Likewise, we preshuffled and pre-xored the input data above.
+    const int8* shuffled_input_data =
+        reinterpret_cast<const int8*>(shuffled_input_workspace_data);
+    for (int c = 0; c < output_depth; c += 4) {
+      const int8* shuffled_input_ptr = shuffled_input_data;
+      // Accumulation loop.
+      // Internal accumulation.
+      // Initialize accumulator with the bias-value.
+      int32 accum[4][4];
+      for (int i = 0; i < 4; i++) {
+        for (int b = 0; b < 4; b++) {
+          accum[i][b] = 0;
+        }
+      }
+      for (int d = 0; d < accum_depth; d += 16) {
+        for (int i = 0; i < 4; i++) {
+          for (int b = 0; b < 4; b++) {
+            for (int j = 0; j < 16; j++) {
+              int8 input_val = shuffled_input_ptr[16 * b + j];
+              int8 weights_val = shuffled_weights_ptr[16 * i + j];
+              accum[i][b] += weights_val * input_val;
+            }
+          }
+        }
+        shuffled_input_ptr += 64;
+        shuffled_weights_ptr += 64;
+      }
+      for (int i = 0; i < 4; i++) {
+        for (int b = 0; b < 4; b++) {
+          // Add bias value
+          int32 acc = accum[i][b] + bias_data[c + i];
+          // Down-scale the final int32 accumulator to the scale used by our
+          // (16-bit, typically 3 integer bits) fixed-point format. The
+          // quantized multiplier and shift here have been pre-computed offline
+          // (e.g. by toco).
+          acc = MultiplyByQuantizedMultiplier(acc, output_multiplier,
+                                              output_shift);
+          // Saturate, cast to int16, and store to output array.
+          acc = std::max(acc, output_activation_min);
+          acc = std::min(acc, output_activation_max);
+          output_ptr[b * output_depth + c + i] = acc;
+        }
+      }
+    }
+  } else {
+    TFLITE_DCHECK(false);
+    return;
+  }
+}
+
+// TODO(b/80418076): Move to legacy ops file, update invocations.
+// Legacy.
+inline void ShuffledFullyConnected(
+    const uint8* input_data, const Dims<4>& input_dims,
+    const uint8* shuffled_weights_data, const Dims<4>& weights_dims,
+    const int32* bias_data, const Dims<4>& bias_dims, int32 output_multiplier,
+    int output_shift, int32 output_activation_min, int32 output_activation_max,
+    int16* output_data, const Dims<4>& output_dims,
+    uint8* shuffled_input_workspace_data, void* gemm_context) {
+  tflite::FullyConnectedParams op_params;
+  op_params.output_multiplier = output_multiplier;
+  // Legacy ops used mixed left and right shifts. Now all are +ve-means-left.
+  op_params.output_shift = kReverseShift * output_shift;
+  op_params.quantized_activation_min = output_activation_min;
+  op_params.quantized_activation_max = output_activation_max;
+
+  ShuffledFullyConnected(op_params, DimsToShape(input_dims), input_data,
+                         DimsToShape(weights_dims), shuffled_weights_data,
+                         DimsToShape(bias_dims), bias_data,
+                         DimsToShape(output_dims), output_data,
+                         shuffled_input_workspace_data, gemm_context);
+}
+
+// TODO(b/80418076): Move to legacy ops file, update invocations.
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void FullyConnected(const uint8* input_data, const Dims<4>& input_dims,
+                    int32 input_offset, const uint8* filter_data,
+                    const Dims<4>& filter_dims, int32 filter_offset,
+                    const int32* bias_data, const Dims<4>& bias_dims,
+                    int32 output_offset, int32 output_multiplier,
+                    int output_shift, int32 output_activation_min,
+                    int32 output_activation_max, uint8* output_data,
+                    const Dims<4>& output_dims, void* gemm_context) {
+  static_assert(Ac == FusedActivationFunctionType::kNone ||
+                    Ac == FusedActivationFunctionType::kRelu ||
+                    Ac == FusedActivationFunctionType::kRelu6 ||
+                    Ac == FusedActivationFunctionType::kRelu1,
+                "");
+  if (Ac == FusedActivationFunctionType::kNone) {
+    TFLITE_DCHECK_EQ(output_activation_min, 0);
+    TFLITE_DCHECK_EQ(output_activation_max, 255);
+  }
+  FullyConnected(input_data, input_dims, input_offset, filter_data, filter_dims,
+                 filter_offset, bias_data, bias_dims, output_offset,
+                 output_multiplier, output_shift, output_activation_min,
+                 output_activation_max, output_data, output_dims, gemm_context);
+}
+
+}  // namespace reference_ops
+}  // namespace tflite
+
+#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_FULLY_CONNECTED_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
index 5bfa3bd084..7a5535489a 100644
--- a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
@@ -28,6 +28,8 @@ limitations under the License.
 #include "public/gemmlowp.h"
 #include "tensorflow/contrib/lite/kernels/internal/common.h"
 #include "tensorflow/contrib/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/contrib/lite/kernels/internal/reference/fully_connected.h"
+#include "tensorflow/contrib/lite/kernels/internal/reference/softmax.h"
 #include "tensorflow/contrib/lite/kernels/internal/round.h"
 #include "tensorflow/contrib/lite/kernels/internal/strided_slice_logic.h"
 #include "tensorflow/contrib/lite/kernels/internal/types.h"
@@ -98,13 +100,6 @@ gemmlowp::FixedPoint<tRawType, tIntegerBits> SaturatingSub(
 
 namespace reference_ops {
 
-// TODO(b/80247582) Remove this constant.
-// This will be phased out as the shifts are revised with more thought. Use of a
-// constant enables us to track progress on this work.
-//
-// Used mainly to convert from old-style shifts (right) to new-style (left).
-static constexpr int kReverseShift = -1;
-
 inline void ShapeFromDims(const tflite::Dims<4>& dims, RuntimeShape* shape) {
   shape->BuildFrom(
       {dims.sizes[3], dims.sizes[2], dims.sizes[1], dims.sizes[0]});
@@ -181,7 +176,7 @@ inline void Conv(const ConvParams& params, const RuntimeShape& input_shape,
   TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
   TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
 
-  (void)im2col_data;  // only used in optimized code.
+  (void)im2col_data;   // only used in optimized code.
   (void)im2col_shape;  // only used in optimized code.
 
   const int batches = MatchingDim(input_shape, 0, output_shape, 0);
   const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3);
@@ -606,437 +601,6 @@ inline void SpaceToDepth(const tflite::SpaceToDepthParams& op_params,
     }
   }
 }
 
-inline void FullyConnected(
-    const FullyConnectedParams& params, const RuntimeShape& input_shape,
-    const float* input_data, const RuntimeShape& weights_shape,
-    const float* weights_data, const RuntimeShape& bias_shape,
-    const float* bias_data, const RuntimeShape& output_shape,
-    float* output_data) {
-  const float output_activation_min = params.float_activation_min;
-  const float output_activation_max = params.float_activation_max;
-  // TODO(benoitjacob): This really should be:
-  //     const int batches = ArraySize(output_dims, 1);
-  // but the current --variable_batch hack consists in overwriting the 3rd
-  // dimension with the runtime batch size, as we don't keep track for each
-  // array of which dimension is the batch dimension in it.
-  const int output_dims_count = output_shape.DimensionsCount();
-  const int weights_dims_count = weights_shape.DimensionsCount();
-  const int batches = FlatSizeSkipDim(output_shape, output_dims_count - 1);
-  const int output_depth = MatchingDim(weights_shape, weights_dims_count - 2,
-                                       output_shape, output_dims_count - 1);
-  const int accum_depth = weights_shape.Dims(weights_dims_count - 1);
-  for (int b = 0; b < batches; ++b) {
-    for (int out_c = 0; out_c < output_depth; ++out_c) {
-      float total = 0.f;
-      for (int d = 0; d < accum_depth; ++d) {
-        total += input_data[b * accum_depth + d] *
-                 weights_data[out_c * accum_depth + d];
-      }
-      float bias_value = 0.0f;
-      if (bias_data) {
-        bias_value = bias_data[out_c];
-      }
-      output_data[out_c + output_depth * b] = ActivationFunctionWithMinMax(
-          total + bias_value, output_activation_min, output_activation_max);
-    }
-  }
-}
-
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// Legacy.
-inline void FullyConnected(const float* input_data, const Dims<4>& input_dims,
-                           const float* weights_data,
-                           const Dims<4>& weights_dims, const float* bias_data,
-                           const Dims<4>& bias_dims,
-                           float output_activation_min,
-                           float output_activation_max, float* output_data,
-                           const Dims<4>& output_dims) {
-  tflite::FullyConnectedParams op_params;
-  op_params.float_activation_min = output_activation_min;
-  op_params.float_activation_max = output_activation_max;
-
-  FullyConnected(op_params, DimsToShape(input_dims), input_data,
-                 DimsToShape(weights_dims), weights_data,
-                 DimsToShape(bias_dims), bias_data, DimsToShape(output_dims),
-                 output_data);
-}
-
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// legacy, for compatibility with old checked-in code
-template <FusedActivationFunctionType Ac>
-void FullyConnected(const float* input_data, const Dims<4>& input_dims,
-                    const float* weights_data, const Dims<4>& weights_dims,
-                    const float* bias_data, const Dims<4>& bias_dims,
-                    float* output_data, const Dims<4>& output_dims) {
-  float output_activation_min, output_activation_max;
-  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
-  FullyConnected(input_data, input_dims, weights_data, weights_dims, bias_data,
-                 bias_dims, output_activation_min, output_activation_max,
-                 output_data, output_dims);
-}
-
-inline void FullyConnected(
-    const FullyConnectedParams& params, const RuntimeShape& input_shape,
-    const uint8* input_data, const RuntimeShape& filter_shape,
-    const uint8* filter_data, const RuntimeShape& bias_shape,
-    const int32* bias_data, const RuntimeShape& output_shape,
-    uint8* output_data, gemmlowp::GemmContext* gemm_context) {
-  (void)gemm_context;  // only used in optimized code.
-  const int32 input_offset = params.input_offset;
-  const int32 filter_offset = params.weights_offset;
-  const int32 output_offset = params.output_offset;
-  const int32 output_multiplier = params.output_multiplier;
-  const int output_shift = params.output_shift;
-  const int32 output_activation_min = params.quantized_activation_min;
-  const int32 output_activation_max = params.quantized_activation_max;
-  TFLITE_DCHECK_GE(filter_shape.DimensionsCount(), 2);
-  TFLITE_DCHECK_GE(output_shape.DimensionsCount(), 1);
-
-  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
-  // TODO(benoitjacob): This really should be:
-  //     const int batches = ArraySize(output_dims, 1);
-  // but the current --variable_batch hack consists in overwriting the 3rd
-  // dimension with the runtime batch size, as we don't keep track for each
-  // array of which dimension is the batch dimension in it.
-  const int output_dim_count = output_shape.DimensionsCount();
-  const int filter_dim_count = filter_shape.DimensionsCount();
-  const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1);
-  const int output_depth = MatchingDim(filter_shape, filter_dim_count - 2,
-                                       output_shape, output_dim_count - 1);
-  const int accum_depth = filter_shape.Dims(filter_dim_count - 1);
-  for (int b = 0; b < batches; ++b) {
-    for (int out_c = 0; out_c < output_depth; ++out_c) {
-      int32 acc = 0;
-      for (int d = 0; d < accum_depth; ++d) {
-        int32 input_val = input_data[b * accum_depth + d];
-        int32 filter_val = filter_data[out_c * accum_depth + d];
-        acc += (filter_val + filter_offset) * (input_val + input_offset);
-      }
-      if (bias_data) {
-        acc += bias_data[out_c];
-      }
-      acc = MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift);
-      acc += output_offset;
-      acc = std::max(acc, output_activation_min);
-      acc = std::min(acc, output_activation_max);
-      output_data[out_c + output_depth * b] = static_cast<uint8>(acc);
-    }
-  }
-}
-
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// Legacy.
-inline void FullyConnected(const uint8* input_data, const Dims<4>& input_dims,
-                           int32 input_offset, const uint8* filter_data,
-                           const Dims<4>& filter_dims, int32 filter_offset,
-                           const int32* bias_data, const Dims<4>& bias_dims,
-                           int32 output_offset, int32 output_multiplier,
-                           int output_shift, int32 output_activation_min,
-                           int32 output_activation_max, uint8* output_data,
-                           const Dims<4>& output_dims,
-                           gemmlowp::GemmContext* gemm_context) {
-  tflite::FullyConnectedParams op_params;
-  op_params.input_offset = input_offset;
-  op_params.weights_offset = filter_offset;
-  op_params.output_offset = output_offset;
-  op_params.output_multiplier = output_multiplier;
-  // Legacy ops used mixed left and right shifts. Now all are +ve-means-left.
-  op_params.output_shift = kReverseShift * output_shift;
-  op_params.quantized_activation_min = output_activation_min;
-  op_params.quantized_activation_max = output_activation_max;
-
-  FullyConnected(op_params, DimsToShape(input_dims), input_data,
-                 DimsToShape(filter_dims), filter_data, DimsToShape(bias_dims),
-                 bias_data, DimsToShape(output_dims), output_data,
-                 gemm_context);
-}
-
-inline void FullyConnected(
-    const FullyConnectedParams& params, const RuntimeShape& input_shape,
-    const uint8* input_data, const RuntimeShape& filter_shape,
-    const uint8* filter_data, const RuntimeShape& bias_shape,
-    const int32* bias_data, const RuntimeShape& output_shape,
-    int16* output_data, gemmlowp::GemmContext* gemm_context) {
-  (void)gemm_context;  // only used in optimized code.
-  const int32 input_offset = params.input_offset;
-  const int32 filter_offset = params.weights_offset;
-  const int32 output_offset = params.output_offset;
-  const int32 output_multiplier = params.output_multiplier;
-  const int output_shift = params.output_shift;
-  const int32 output_activation_min = params.quantized_activation_min;
-  const int32 output_activation_max = params.quantized_activation_max;
-
-  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
-  TFLITE_DCHECK_EQ(output_offset, 0);
-  // TODO(benoitjacob): This really should be:
-  //     const int batches = ArraySize(output_dims, 1);
-  // but the current --variable_batch hack consists in overwriting the 3rd
-  // dimension with the runtime batch size, as we don't keep track for each
-  // array of which dimension is the batch dimension in it.
-  const int output_dim_count = output_shape.DimensionsCount();
-  const int filter_dim_count = filter_shape.DimensionsCount();
-  const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1);
-  const int output_depth = MatchingDim(filter_shape, filter_dim_count - 2,
-                                       output_shape, output_dim_count - 1);
-  const int accum_depth = filter_shape.Dims(filter_dim_count - 1);
-  for (int b = 0; b < batches; ++b) {
-    for (int out_c = 0; out_c < output_depth; ++out_c) {
-      // Internal accumulation.
-      // Initialize accumulator with the bias-value.
-      int32 accum = bias_data[out_c];
-      // Accumulation loop.
-      for (int d = 0; d < accum_depth; ++d) {
-        int16 input_val = input_data[b * accum_depth + d] + input_offset;
-        int16 filter_val = filter_data[out_c * accum_depth + d] + filter_offset;
-        accum += filter_val * input_val;
-      }
-      // Down-scale the final int32 accumulator to the scale used by our
-      // (16-bit, typically 3 integer bits) fixed-point format. The quantized
-      // multiplier and shift here have been pre-computed offline
-      // (e.g. by toco).
-      accum =
-          MultiplyByQuantizedMultiplier(accum, output_multiplier, output_shift);
-      // Saturate, cast to int16, and store to output array.
-      accum = std::max(accum, output_activation_min - output_offset);
-      accum = std::min(accum, output_activation_max - output_offset);
-      accum += output_offset;
-      output_data[out_c + output_depth * b] = accum;
-    }
-  }
-}
-
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// Legacy.
-inline void FullyConnected(const uint8* input_data, const Dims<4>& input_dims,
-                           int32 input_offset, const uint8* filter_data,
-                           const Dims<4>& filter_dims, int32 filter_offset,
-                           const int32* bias_data, const Dims<4>& bias_dims,
-                           int32 output_offset, int32 output_multiplier,
-                           int output_shift, int32 output_activation_min,
-                           int32 output_activation_max, int16* output_data,
-                           const Dims<4>& output_dims,
-                           gemmlowp::GemmContext* gemm_context) {
-  tflite::FullyConnectedParams op_params;
-  op_params.input_offset = input_offset;
-  op_params.weights_offset = filter_offset;
-  op_params.output_offset = output_offset;
-  op_params.output_multiplier = output_multiplier;
-  // Legacy ops used mixed left and right shifts. Now all are +ve-means-left.
-  op_params.output_shift = kReverseShift * output_shift;
-  op_params.quantized_activation_min = output_activation_min;
-  op_params.quantized_activation_max = output_activation_max;
-
-  FullyConnected(op_params, DimsToShape(input_dims), input_data,
-                 DimsToShape(filter_dims), filter_data, DimsToShape(bias_dims),
-                 bias_data, DimsToShape(output_dims), output_data,
-                 gemm_context);
-}
-
-inline void ShuffledFullyConnected(
-    const FullyConnectedParams& params, const RuntimeShape& input_shape,
-    const uint8* input_data, const RuntimeShape& weights_shape,
-    const uint8* shuffled_weights_data, const RuntimeShape& bias_shape,
-    const int32* bias_data, const RuntimeShape& output_shape,
-    int16* output_data, uint8* shuffled_input_workspace_data,
-    gemmlowp::GemmContext* gemm_context) {
-  (void)gemm_context;  // only used in optimized code.
-  const int32 output_multiplier = params.output_multiplier;
-  const int output_shift = params.output_shift;
-  const int32 output_activation_min = params.quantized_activation_min;
-  const int32 output_activation_max = params.quantized_activation_max;
-  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
-
-  TFLITE_DCHECK_GE(input_shape.DimensionsCount(), 1);
-  TFLITE_DCHECK_GE(weights_shape.DimensionsCount(), 2);
-  TFLITE_DCHECK_GE(output_shape.DimensionsCount(), 1);
-  // TODO(benoitjacob): This really should be:
-  //     const int batches = ArraySize(output_dims, 1);
-  // but the current --variable_batch hack consists in overwriting the 3rd
-  // dimension with the runtime batch size, as we don't keep track for each
-  // array of which dimension is the batch dimension in it.
-  const int output_dim_count = output_shape.DimensionsCount();
-  const int weights_dim_count = weights_shape.DimensionsCount();
-  const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1);
-  const int output_depth = MatchingDim(weights_shape, weights_dim_count - 2,
-                                       output_shape, output_dim_count - 1);
-  const int accum_depth = weights_shape.Dims(weights_dim_count - 1);
-  TFLITE_DCHECK((accum_depth % 16) == 0);
-  TFLITE_DCHECK((output_depth % 4) == 0);
-
-  // Shuffling and xoring of input activations into the workspace buffer
-  uint8* shuffled_input_workspace_ptr = shuffled_input_workspace_data;
-  if (batches == 1) {
-    for (int i = 0; i < accum_depth; i++) {
-      shuffled_input_workspace_data[i] = input_data[i] ^ 0x80;
-    }
-  } else if (batches == 4) {
-    for (int c = 0; c < accum_depth; c += 16) {
-      for (int b = 0; b < 4; b++) {
-        const uint8* src_data_ptr = input_data + b * accum_depth + c;
-        for (int j = 0; j < 16; j++) {
-          uint8 src_val = *src_data_ptr++;
-          // Flip the sign bit, so that the kernel will only need to
-          // reinterpret these uint8 values as int8, getting for free the
-          // subtraction of the zero_point value 128.
-          uint8 dst_val = src_val ^ 0x80;
-          *shuffled_input_workspace_ptr++ = dst_val;
-        }
-      }
-    }
-  } else {
-    TFLITE_DCHECK(false);
-    return;
-  }
-
-  // Actual computation
-  if (batches == 1) {
-    int16* output_ptr = output_data;
-    // Shuffled weights have had their sign bit (0x80) pre-flipped (xor'd)
-    // so that just reinterpreting them as int8 values is equivalent to
-    // subtracting 128 from them, thus implementing for free the subtraction of
-    // the zero_point value 128.
-    const int8* shuffled_weights_ptr =
-        reinterpret_cast<const int8*>(shuffled_weights_data);
-    // Likewise, we preshuffled and pre-xored the input data above.
-    const int8* shuffled_input_data =
-        reinterpret_cast<const int8*>(shuffled_input_workspace_data);
-    for (int c = 0; c < output_depth; c += 4) {
-      // Internal accumulation.
-      // Initialize accumulator with the bias-value.
-      int32 accum[4] = {0};
-      // Accumulation loop.
-      for (int d = 0; d < accum_depth; d += 16) {
-        for (int i = 0; i < 4; i++) {
-          for (int j = 0; j < 16; j++) {
-            int8 input_val = shuffled_input_data[d + j];
-            int8 weights_val = *shuffled_weights_ptr++;
-            accum[i] += weights_val * input_val;
-          }
-        }
-      }
-      for (int i = 0; i < 4; i++) {
-        // Add bias value
-        int acc = accum[i] + bias_data[c + i];
-        // Down-scale the final int32 accumulator to the scale used by our
-        // (16-bit, typically 3 integer bits) fixed-point format. The quantized
-        // multiplier and shift here have been pre-computed offline
-        // (e.g. by toco).
-        acc =
-            MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift);
-        // Saturate, cast to int16, and store to output array.
-        acc = std::max(acc, output_activation_min);
-        acc = std::min(acc, output_activation_max);
-        output_ptr[c + i] = acc;
-      }
-    }
-  } else if (batches == 4) {
-    int16* output_ptr = output_data;
-    // Shuffled weights have had their sign bit (0x80) pre-flipped (xor'd)
-    // so that just reinterpreting them as int8 values is equivalent to
-    // subtracting 128 from them, thus implementing for free the subtraction of
-    // the zero_point value 128.
-    const int8* shuffled_weights_ptr =
-        reinterpret_cast<const int8*>(shuffled_weights_data);
-    // Likewise, we preshuffled and pre-xored the input data above.
-    const int8* shuffled_input_data =
-        reinterpret_cast<const int8*>(shuffled_input_workspace_data);
-    for (int c = 0; c < output_depth; c += 4) {
-      const int8* shuffled_input_ptr = shuffled_input_data;
-      // Accumulation loop.
-      // Internal accumulation.
-      // Initialize accumulator with the bias-value.
-      int32 accum[4][4];
-      for (int i = 0; i < 4; i++) {
-        for (int b = 0; b < 4; b++) {
-          accum[i][b] = 0;
-        }
-      }
-      for (int d = 0; d < accum_depth; d += 16) {
-        for (int i = 0; i < 4; i++) {
-          for (int b = 0; b < 4; b++) {
-            for (int j = 0; j < 16; j++) {
-              int8 input_val = shuffled_input_ptr[16 * b + j];
-              int8 weights_val = shuffled_weights_ptr[16 * i + j];
-              accum[i][b] += weights_val * input_val;
-            }
-          }
-        }
-        shuffled_input_ptr += 64;
-        shuffled_weights_ptr += 64;
-      }
-      for (int i = 0; i < 4; i++) {
-        for (int b = 0; b < 4; b++) {
-          // Add bias value
-          int acc = accum[i][b] + bias_data[c + i];
-          // Down-scale the final int32 accumulator to the scale used by our
-          // (16-bit, typically 3 integer bits) fixed-point format. The
-          // quantized multiplier and shift here have been pre-computed offline
-          // (e.g. by toco).
-          acc = MultiplyByQuantizedMultiplier(acc, output_multiplier,
-                                              output_shift);
-          // Saturate, cast to int16, and store to output array.
-          acc = std::max(acc, output_activation_min);
-          acc = std::min(acc, output_activation_max);
-          output_ptr[b * output_depth + c + i] = acc;
-        }
-      }
-    }
-  } else {
-    TFLITE_DCHECK(false);
-    return;
-  }
-}
-
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// Legacy.
-inline void ShuffledFullyConnected(
-    const uint8* input_data, const Dims<4>& input_dims,
-    const uint8* shuffled_weights_data, const Dims<4>& weights_dims,
-    const int32* bias_data, const Dims<4>& bias_dims, int32 output_multiplier,
-    int output_shift, int32 output_activation_min, int32 output_activation_max,
-    int16* output_data, const Dims<4>& output_dims,
-    uint8* shuffled_input_workspace_data, gemmlowp::GemmContext* gemm_context) {
-  tflite::FullyConnectedParams op_params;
-  op_params.output_multiplier = output_multiplier;
-  // Legacy ops used mixed left and right shifts. Now all are +ve-means-left.
-  op_params.output_shift = kReverseShift * output_shift;
-  op_params.quantized_activation_min = output_activation_min;
-  op_params.quantized_activation_max = output_activation_max;
-
-  ShuffledFullyConnected(op_params, DimsToShape(input_dims), input_data,
-                         DimsToShape(weights_dims), shuffled_weights_data,
-                         DimsToShape(bias_dims), bias_data,
-                         DimsToShape(output_dims), output_data,
-                         shuffled_input_workspace_data, gemm_context);
-}
-
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// legacy, for compatibility with old checked-in code
-template <FusedActivationFunctionType Ac>
-void FullyConnected(const uint8* input_data, const Dims<4>& input_dims,
-                    int32 input_offset, const uint8* filter_data,
-                    const Dims<4>& filter_dims, int32 filter_offset,
-                    const int32* bias_data, const Dims<4>& bias_dims,
-                    int32 output_offset, int32 output_multiplier,
-                    int output_shift, int32 output_activation_min,
-                    int32 output_activation_max, uint8* output_data,
-                    const Dims<4>& output_dims,
-                    gemmlowp::GemmContext* gemm_context) {
-  static_assert(Ac == FusedActivationFunctionType::kNone ||
-                    Ac == FusedActivationFunctionType::kRelu ||
-                    Ac == FusedActivationFunctionType::kRelu6 ||
-                    Ac == FusedActivationFunctionType::kRelu1,
-                "");
-  if (Ac == FusedActivationFunctionType::kNone) {
-    TFLITE_DCHECK_EQ(output_activation_min, 0);
-    TFLITE_DCHECK_EQ(output_activation_max, 255);
-  }
-  FullyConnected(input_data, input_dims, input_offset, filter_data, filter_dims,
-                 filter_offset, bias_data, bias_dims, output_offset,
-                 output_multiplier, output_shift, output_activation_min,
-                 output_activation_max, output_data, output_dims, gemm_context);
-}
-
 inline void Relu(const RuntimeShape& input_shape, const float* input_data,
                  const RuntimeShape& output_shape, float* output_data) {
   const int flat_size = MatchingFlatSize(input_shape, output_shape);
@@ -3238,144 +2802,6 @@ inline void LocalResponseNormalization(
     }
   }
 }
 
-inline void Softmax(const SoftmaxParams& params,
-                    const RuntimeShape& input_shape, const float* input_data,
-                    const RuntimeShape& output_shape, float* output_data) {
-  const int trailing_dim = input_shape.DimensionsCount() - 1;
-  const int outer_size =
-      MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
-  const int depth =
-      MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
-
-  for (int i = 0; i < outer_size; ++i) {
-    // Find max element value which we'll use to ensure numerical stability
-    // taking advantage of the following equality:
-    // exp(x[i])/sum(exp(x[i])) == exp(x[i]+C)/sum(exp(x[i]+C))
-    float max = std::numeric_limits<float>::lowest();
-    for (int c = 0; c < depth; ++c) {
-      max = std::max(max, input_data[i * depth + c]);
-    }
-
-    // Compute sum.
-    float sum = 0.f;
-    for (int c = 0; c < depth; ++c) {
-      sum += std::exp((input_data[i * depth + c] - max) * params.beta);
-    }
-
-    // Compute result.
-    for (int c = 0; c < depth; ++c) {
-      output_data[i * depth + c] =
-          std::exp((input_data[i * depth + c] - max) * params.beta) / sum;
-    }
-  }
-}
-
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// Legacy.
-inline void Softmax(const float* input_data, const RuntimeShape& input_shape,
-                    float beta, float* output_data,
-                    const RuntimeShape& output_shape) {
-  SoftmaxParams params;
-  params.beta = beta;
-  Softmax(params, input_shape, input_data, output_shape, output_data);
-}
-
-inline void Softmax(const SoftmaxParams& params,
-                    const RuntimeShape& input_shape, const uint8* input_data,
-                    const RuntimeShape& output_shape, uint8* output_data) {
-  const int32 input_beta_multiplier = params.input_multiplier;
-  const int32 input_beta_left_shift = params.input_left_shift;
-  const int diff_min = params.diff_min;
-  // The representation chosen for the input to the exp() function is Q5.26.
-  // We need to leave extra space since values that we skip might be as large as
-  // -32 before multiplying by input_beta_multiplier, and therefore as large as
-  // -16 afterwards. Note that exp(-8) is definitely not insignificant to
-  // accumulation, but exp(-16) definitely is.
-  static const int kScaledDiffIntegerBits = 5;
-  static const int kAccumulationIntegerBits = 12;
-  using FixedPointScaledDiff =
-      gemmlowp::FixedPoint<int32, kScaledDiffIntegerBits>;
-  using FixedPointAccum = gemmlowp::FixedPoint<int32, kAccumulationIntegerBits>;
-  using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;
-
-  const int trailing_dim = input_shape.DimensionsCount() - 1;
-  const int outer_size =
-      MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
-  const int depth =
-      MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
-
-  for (int i = 0; i < outer_size; ++i) {
-    uint8 max_in_row = 0;
-    for (int c = 0; c < depth; ++c) {
-      max_in_row = std::max(max_in_row, input_data[i * depth + c]);
-    }
-
-    FixedPointAccum sum_of_exps = FixedPointAccum::Zero();
-    for (int c = 0; c < depth; ++c) {
-      int32 input_diff =
-          static_cast<int32>(input_data[i * depth + c]) - max_in_row;
-      if (input_diff >= diff_min) {
-        const int32 input_diff_rescaled =
-            MultiplyByQuantizedMultiplierGreaterThanOne(
-                input_diff, input_beta_multiplier, input_beta_left_shift);
-        const FixedPointScaledDiff scaled_diff_f8 =
-            FixedPointScaledDiff::FromRaw(input_diff_rescaled);
-        sum_of_exps = sum_of_exps + gemmlowp::Rescale<kAccumulationIntegerBits>(
-                                        exp_on_negative_values(scaled_diff_f8));
-      }
-    }
-
-    int32 fixed_sum_of_exps = sum_of_exps.raw();
-    int headroom_plus_one =
-        CountLeadingZeros(static_cast<uint32>(fixed_sum_of_exps));
-    // This is the number of bits to the left of the binary point above 1.0.
-    // Consider fixed_sum_of_exps=1.25. In that case shifted_scale=0.8 and
-    // no later adjustment will be needed.
-    int num_bits_over_unit = kAccumulationIntegerBits - headroom_plus_one;
-    int32 shifted_sum_minus_one = static_cast<int32>(
-        (static_cast<uint32>(fixed_sum_of_exps) << headroom_plus_one) -
-        (static_cast<uint32>(1) << 31));
-
-    FixedPoint0 shifted_scale = gemmlowp::one_over_one_plus_x_for_x_in_0_1(
-        FixedPoint0::FromRaw(shifted_sum_minus_one));
-
-    for (int c = 0; c < depth; ++c) {
-      int32 input_diff =
-          static_cast<int32>(input_data[i * depth + c]) - max_in_row;
-      if (input_diff >= diff_min) {
-        const int32 input_diff_rescaled =
-            MultiplyByQuantizedMultiplierGreaterThanOne(
-                input_diff, input_beta_multiplier, input_beta_left_shift);
-        const FixedPointScaledDiff scaled_diff_f8 =
-            FixedPointScaledDiff::FromRaw(input_diff_rescaled);
-
-        FixedPoint0 exp_in_0 = exp_on_negative_values(scaled_diff_f8);
-        int32 unsat_output = gemmlowp::RoundingDivideByPOT(
-            (shifted_scale * exp_in_0).raw(), num_bits_over_unit + 31 - 8);
-
-        output_data[i * depth + c] = static_cast<uint8>(
-            std::max(std::min(unsat_output, static_cast<int32>(255)), 0));
-
-      } else {
-        output_data[i * depth + c] = 0;
-      }
-    }
-  }
-}
-
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// Legacy
-inline void Softmax(const uint8* input_data, const RuntimeShape& input_shape,
-                    int32 input_beta_multiplier, int32 input_beta_left_shift,
-                    int diff_min, uint8* output_data,
-                    const RuntimeShape& output_shape) {
-  SoftmaxParams params;
-  params.input_multiplier = input_beta_multiplier;
-  params.input_left_shift = input_beta_left_shift;
-  params.diff_min = diff_min;
-  Softmax(params, input_shape, input_data, output_shape, output_data);
-}
-
 inline void LogSoftmax(const SoftmaxParams& params,
                        const RuntimeShape& input_shape, const float* input_data,
                        const RuntimeShape& output_shape, float* output_data) {
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/softmax.h b/tensorflow/contrib/lite/kernels/internal/reference/softmax.h
new file mode 100644
index 0000000000..006174e8db
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/internal/reference/softmax.h
@@ -0,0 +1,202 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_SOFTMAX_H_
+#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_SOFTMAX_H_
+
+#include "fixedpoint/fixedpoint.h"
+#include "tensorflow/contrib/lite/kernels/internal/common.h"
+#include "tensorflow/contrib/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/contrib/lite/kernels/internal/round.h"
+#include "tensorflow/contrib/lite/kernels/internal/types.h"
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace reference_ops {
+
+inline void Softmax(const SoftmaxParams& params,
+                    const RuntimeShape& input_shape, const float* input_data,
+                    const RuntimeShape& output_shape, float* output_data) {
+  const int trailing_dim = input_shape.DimensionsCount() - 1;
+  const int outer_size =
+      MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
+  const int depth =
+      MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
+
+  for (int i = 0; i < outer_size; ++i) {
+    // Find max element value which we'll use to ensure numerical stability
+    // taking advantage of the following equality:
+    // exp(x[i])/sum(exp(x[i])) == exp(x[i]+C)/sum(exp(x[i]+C))
+    float max = std::numeric_limits<float>::lowest();
+    for (int c = 0; c < depth; ++c) {
+      max = std::max(max, input_data[i * depth + c]);
+    }
+
+    // Compute sum.
+    float sum = 0.f;
+    for (int c = 0; c < depth; ++c) {
+      sum += std::exp((input_data[i * depth + c] - max) * params.beta);
+    }
+
+    // Compute result.
+    for (int c = 0; c < depth; ++c) {
+      output_data[i * depth + c] =
+          std::exp((input_data[i * depth + c] - max) * params.beta) / sum;
+    }
+  }
+}
+
+// TODO(b/80418076): Move to legacy ops file, update invocations.
+// Legacy.
+inline void Softmax(const float* input_data, const RuntimeShape& input_shape,
+                    float beta, float* output_data,
+                    const RuntimeShape& output_shape) {
+  SoftmaxParams params;
+  params.beta = beta;
+  Softmax(params, input_shape, input_data, output_shape, output_data);
+}
+
+inline void Softmax(const SoftmaxParams& params,
+                    const RuntimeShape& input_shape, const uint8* input_data,
+                    const RuntimeShape& output_shape, uint8* output_data) {
+  const int32 input_beta_multiplier = params.input_multiplier;
+  const int32 input_beta_left_shift = params.input_left_shift;
+  const int diff_min = params.diff_min;
+  // The representation chosen for the input to the exp() function is Q5.26.
+  // We need to leave extra space since values that we skip might be as large as
+  // -32 before multiplying by input_beta_multiplier, and therefore as large as
+  // -16 afterwards. Note that exp(-8) is definitely not insignificant to
+  // accumulation, but exp(-16) definitely is.
+  static const int kScaledDiffIntegerBits = 5;
+  static const int kAccumulationIntegerBits = 12;
+  using FixedPointScaledDiff =
+      gemmlowp::FixedPoint<int32, kScaledDiffIntegerBits>;
+  using FixedPointAccum = gemmlowp::FixedPoint<int32, kAccumulationIntegerBits>;
+  using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;
+
+  const int trailing_dim = input_shape.DimensionsCount() - 1;
+  const int outer_size =
+      MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
+  const int depth =
+      MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
+
+  for (int i = 0; i < outer_size; ++i) {
+    uint8 max_in_row = 0;
+    for (int c = 0; c < depth; ++c) {
+      max_in_row = std::max(max_in_row, input_data[i * depth + c]);
+    }
+
+    FixedPointAccum sum_of_exps = FixedPointAccum::Zero();
+    for (int c = 0; c < depth; ++c) {
+      int32 input_diff =
+          static_cast<int32>(input_data[i * depth + c]) - max_in_row;
+      if (input_diff >= diff_min) {
+        const int32 input_diff_rescaled =
+            MultiplyByQuantizedMultiplierGreaterThanOne(
+                input_diff, input_beta_multiplier, input_beta_left_shift);
+        const FixedPointScaledDiff scaled_diff_f8 =
+            FixedPointScaledDiff::FromRaw(input_diff_rescaled);
+        sum_of_exps = sum_of_exps + gemmlowp::Rescale<kAccumulationIntegerBits>(
+                                        exp_on_negative_values(scaled_diff_f8));
+      }
+    }
+
+    int32 fixed_sum_of_exps = sum_of_exps.raw();
+    int headroom_plus_one =
+        CountLeadingZeros(static_cast<uint32>(fixed_sum_of_exps));
+    // This is the number of bits to the left of the binary point above 1.0.
+    // Consider fixed_sum_of_exps=1.25. In that case shifted_scale=0.8 and
+    // no later adjustment will be needed.
+
+// TODO(b/80418076): Move to legacy ops file, update invocations.
+// Legacy
+inline void Softmax(const uint8* input_data, const RuntimeShape& input_shape,
+                    int32 input_beta_multiplier, int32 input_beta_left_shift,
+                    int diff_min, uint8* output_data,
+                    const RuntimeShape& output_shape) {
+  SoftmaxParams params;
+  params.input_multiplier = input_beta_multiplier;
+  params.input_left_shift = input_beta_left_shift;
+  params.diff_min = diff_min;
+  Softmax(params, input_shape, input_data, output_shape, output_data);
+}
+
+// Performs softmax along the input of size (input_size * batch_size).
+inline void Softmax(const float* in, const int input_size, const int batch_size,
+                    const float beta, float* out) {
+  // TF_LITE_ASSERT(input_size > 0);
+
+  // For each batch
+  for (int b = 0; b < batch_size; b++) {
+    // Find the max coeff.
+    float max_coeff = in[0];
+    for (int i = 1; i < input_size; i++) {
+      if (in[i] > max_coeff) max_coeff = in[i];
+    }
+
+    // Compute the normalized sum of exps.
+    float exp_sum = 0.0;
+    for (int i = 0; i < input_size; i++) {
+      out[i] = std::exp((in[i] - max_coeff) * beta);
+      exp_sum += out[i];
+    }
+
+    // Divide by the sum of exps.
+    float reciprocal_sum_exp = 1.f / exp_sum;
+    for (int i = 0; i < input_size; i++) {
+      out[i] *= reciprocal_sum_exp;
+    }
+
+    // Advance in and out pointers for the next batch.
+    in += input_size;
+    out += input_size;
+  }
+}
+
+}  // namespace reference_ops
+}  // namespace tflite
+
+#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_SOFTMAX_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/types.h b/tensorflow/contrib/lite/kernels/internal/types.h
index 3e0308721e..a3a5994c9c 100644
--- a/tensorflow/contrib/lite/kernels/internal/types.h
+++ b/tensorflow/contrib/lite/kernels/internal/types.h
@@ -15,8 +15,8 @@ limitations under the License.
 #ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_TYPES_H_
 #define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_TYPES_H_
 
+#include <algorithm>
 #include <cstring>
-#include <iterator>
 
 #include "absl/base/macros.h"
 #include "tensorflow/contrib/lite/kernels/internal/compatibility.h"
@@ -126,7 +126,11 @@ class RuntimeShape {
   explicit RuntimeShape(int dimensions_count) : size_(dimensions_count) {
     if (dimensions_count > kMaxSmallSize) {
+#ifdef TF_LITE_STATIC_MEMORY
+      TFLITE_CHECK(false && "No shape resizing supported on this platform");
+#else   // TF_LITE_STATIC_MEMORY
       dims_pointer_ = new int32[dimensions_count];
+#endif  // TF_LITE_STATIC_MEMORY
     }
   }
@@ -161,7 +165,11 @@ class RuntimeShape {
   ~RuntimeShape() {
     if (size_ > kMaxSmallSize) {
+#ifdef TF_LITE_STATIC_MEMORY
+      TFLITE_CHECK(false && "No shape resizing supported on this platform");
+#else   // TF_LITE_STATIC_MEMORY
       delete[] dims_pointer_;
+#endif  // TF_LITE_STATIC_MEMORY
     }
   }
@@ -192,11 +200,19 @@ class RuntimeShape {
   inline void Resize(int dimensions_count) {
     if (size_ > kMaxSmallSize) {
+#ifdef TF_LITE_STATIC_MEMORY
+      TFLITE_CHECK(false && "No shape resizing supported on this platform");
+#else   // TF_LITE_STATIC_MEMORY
       delete[] dims_pointer_;
+#endif  // TF_LITE_STATIC_MEMORY
     }
     size_ = dimensions_count;
     if (dimensions_count > kMaxSmallSize) {
+#ifdef TF_LITE_STATIC_MEMORY
+      TFLITE_CHECK(false && "No shape resizing supported on this platform");
+#else   // TF_LITE_STATIC_MEMORY
       dims_pointer_ = new int32[dimensions_count];
+#endif  // TF_LITE_STATIC_MEMORY
     }
   }
diff --git a/tensorflow/contrib/lite/kernels/kernel_util.cc b/tensorflow/contrib/lite/kernels/kernel_util.cc
index 08f942c933..503ef28459 100644
--- a/tensorflow/contrib/lite/kernels/kernel_util.cc
+++ b/tensorflow/contrib/lite/kernels/kernel_util.cc
@@ -107,6 +107,9 @@ bool HaveSameShapes(const TfLiteTensor* input1, const TfLiteTensor* input2) {
   return TfLiteIntArrayEqual(input1->dims, input2->dims);
 }
 
+// TODO(petewarden): Having macros around this is ugly, look at other strategies
+// before replicating this approach elsewhere.
+#ifndef TF_LITE_STATIC_MEMORY
 TfLiteStatus CalculateShapeForBroadcast(TfLiteContext* context,
                                         const TfLiteTensor* input1,
                                         const TfLiteTensor* input2,
@@ -125,5 +128,6 @@ TfLiteStatus CalculateShapeForBroadcast(TfLiteContext* context,
   *output_shape = shape.release();
   return kTfLiteOk;
 }
+#endif  // TF_LITE_STATIC_MEMORY
 
 }  // namespace tflite
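The TF_LITE_STATIC_MEMORY guards above turn RuntimeShape's heap fallback into a hard failure: dimension counts up to kMaxSmallSize live in an inline array, and anything larger, which would need new[], is simply unsupported on static-memory builds. A stripped-down sketch of that small-buffer pattern (SmallShape and its kMaxSmallSize value are illustrative, not the TFLite class):

    #include <cstdint>
    #include <cstdio>
    #include <cstdlib>

    // Small dimension counts need no allocation; the heap fallback for
    // larger shapes compiles out entirely on static-memory targets.
    class SmallShape {
     public:
      static const int kMaxSmallSize = 4;

      explicit SmallShape(int dimensions_count) : size_(dimensions_count) {
        if (dimensions_count > kMaxSmallSize) {
    #ifdef TF_LITE_STATIC_MEMORY
          // Mirrors TFLITE_CHECK(false && ...): large shapes are a hard error.
          std::fprintf(stderr, "No shape resizing supported on this platform\n");
          std::abort();
    #else
          dims_pointer_ = new int32_t[dimensions_count];
    #endif
        }
      }

      ~SmallShape() {
    #ifndef TF_LITE_STATIC_MEMORY
        if (size_ > kMaxSmallSize) delete[] dims_pointer_;
    #endif
      }

      // Returns inline storage for small shapes, heap storage otherwise.
      int32_t* DimsData() {
        return size_ > kMaxSmallSize ? dims_pointer_ : dims_;
      }

     private:
      int size_;
      union {
        int32_t dims_[kMaxSmallSize];
        int32_t* dims_pointer_;
      };
    };

The union means small and large shapes share the same field; only size_ decides which member is live.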
diff --git a/tensorflow/contrib/lite/kernels/op_macros.h b/tensorflow/contrib/lite/kernels/op_macros.h
index d66364c4d8..11e814daee 100644
--- a/tensorflow/contrib/lite/kernels/op_macros.h
+++ b/tensorflow/contrib/lite/kernels/op_macros.h
@@ -15,17 +15,55 @@ limitations under the License.
 #ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_OP_MACROS_H_
 #define TENSORFLOW_CONTRIB_LITE_KERNELS_OP_MACROS_H_
 
+// If we're on a platform without standard IO functions, fall back to a
+// non-portable function.
+#ifdef TF_LITE_MCU_DEBUG_LOG
+
+// This header is pulled in from the support library at
+// https://github.com/google/stm32_bare_lib
+#include <debug_log.h>
+
+#define DEBUG_LOG(x) \
+  do {               \
+    DebugLog(x);     \
+  } while (0)
+
+inline void InfiniteLoop() {
+  DEBUG_LOG("HALTED\n");
+  while (1) {
+  }
+}
+#define TFLITE_ASSERT_FALSE InfiniteLoop();
+#define TFLITE_ABORT InfiniteLoop();
+
+#else  // TF_LITE_MCU_DEBUG_LOG
+
+#include <cassert>
 #include <cstdio>
+#include <cstdlib>
 
-#define TF_LITE_FATAL(msg) \
-  do { \
-    fprintf(stderr, "%s\n", (msg)); \
-    exit(1); \
+#define DEBUG_LOG(x)            \
+  do {                          \
+    fprintf(stderr, "%s", (x)); \
   } while (0)
+
+#define TFLITE_ASSERT_FALSE assert(false)
+#define TFLITE_ABORT abort()
+
+#endif  // TF_LITE_MCU_DEBUG_LOG
+
+#define TF_LITE_FATAL(msg)  \
+  do {                      \
+    DEBUG_LOG(msg);         \
+    DEBUG_LOG("\nFATAL\n"); \
+    TFLITE_ABORT;           \
+  } while (0)
+
 #define TF_LITE_ASSERT(x)        \
   do {                           \
     if (!(x)) TF_LITE_FATAL(#x); \
   } while (0)
+
 #define TF_LITE_ASSERT_EQ(x, y)                            \
   do {                                                     \
     if ((x) != (y)) TF_LITE_FATAL(#x " didn't equal " #y); \
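With these macros in place, kernel code can assert identically on hosted and bare-metal builds: normally TF_LITE_FATAL prints to stderr and abort()s, while under TF_LITE_MCU_DEBUG_LOG it routes through DebugLog() and parks in InfiniteLoop(), since an MCU target has neither stderr nor exit(). A usage sketch (CheckedCopy is a made-up example, not a TFLite function):

    #include "tensorflow/contrib/lite/kernels/op_macros.h"

    // Copies n floats, halting with a diagnostic if either pointer is null.
    // TF_LITE_ASSERT stringizes the failed condition into the fatal message,
    // so a null src reports "src != nullptr" before aborting (or looping
    // forever on an MCU build).
    inline void CheckedCopy(const float* src, float* dst, int n) {
      TF_LITE_ASSERT(src != nullptr);
      TF_LITE_ASSERT(dst != nullptr);
      for (int i = 0; i < n; ++i) {
        dst[i] = src[i];
      }
    }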