author     Pete Warden <petewarden@google.com>              2018-09-24 15:54:32 -0700
committer  TensorFlower Gardener <gardener@tensorflow.org>  2018-09-24 16:02:13 -0700
commit  1ff157d82dac29f5a3a3197b2664208f6ed6ba06 (patch)
tree    c751f5a665a27c660809c4884eb07f69b4983244 /tensorflow/contrib/lite/kernels
parent  9c58005ec86297a1d0a17dc4f7ad7cbae9c47e4b (diff)
Portability preparation for more cross-platform prototyping.
PiperOrigin-RevId: 214346240
Diffstat (limited to 'tensorflow/contrib/lite/kernels')
-rw-r--r--  tensorflow/contrib/lite/kernels/internal/BUILD                            |  13
-rw-r--r--  tensorflow/contrib/lite/kernels/internal/compatibility.h                  |  32
-rw-r--r--  tensorflow/contrib/lite/kernels/internal/reference/depthwiseconv_uint8.h  |   2
-rw-r--r--  tensorflow/contrib/lite/kernels/internal/reference/fully_connected.h      | 460
-rw-r--r--  tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h        | 580
-rw-r--r--  tensorflow/contrib/lite/kernels/internal/reference/softmax.h              | 202
-rw-r--r--  tensorflow/contrib/lite/kernels/internal/types.h                          |  18
-rw-r--r--  tensorflow/contrib/lite/kernels/kernel_util.cc                            |   4
-rw-r--r--  tensorflow/contrib/lite/kernels/op_macros.h                               |  46
9 files changed, 756 insertions, 601 deletions
diff --git a/tensorflow/contrib/lite/kernels/internal/BUILD b/tensorflow/contrib/lite/kernels/internal/BUILD
index 195474e7fd..afb5ec05df 100644
--- a/tensorflow/contrib/lite/kernels/internal/BUILD
+++ b/tensorflow/contrib/lite/kernels/internal/BUILD
@@ -43,7 +43,10 @@ cc_library(
"compatibility.h",
"types.h",
],
- deps = ["@com_google_absl//absl/base:core_headers"],
+ deps = [
+ "//tensorflow/contrib/lite/kernels:op_macros",
+ "@com_google_absl//absl/base:core_headers",
+ ],
)
config_setting(
@@ -260,6 +263,7 @@ cc_library(
deps = [
":round",
":types",
+ "//tensorflow/contrib/lite/kernels:op_macros",
],
)
@@ -291,7 +295,9 @@ cc_library(
"common.h",
"reference/depthwiseconv_float.h",
"reference/depthwiseconv_uint8.h",
+ "reference/fully_connected.h",
"reference/reference_ops.h",
+ "reference/softmax.h",
],
deps = [
":quantization_util",
@@ -300,6 +306,7 @@ cc_library(
":types",
"@gemmlowp",
"//tensorflow/contrib/lite/c:c_api_internal",
+ "//tensorflow/contrib/lite/kernels:op_macros",
] + select({
":haswell": tflite_deps_intel,
":ios_x86_64": tflite_deps_intel,
@@ -320,8 +327,10 @@ cc_library(
"common.h",
"reference/depthwiseconv_float.h",
"reference/depthwiseconv_uint8.h",
+ "reference/fully_connected.h",
"reference/legacy_reference_ops.h",
"reference/reference_ops.h",
+ "reference/softmax.h",
],
deps = [
":quantization_util",
@@ -330,6 +339,7 @@ cc_library(
":types",
"@gemmlowp",
"//tensorflow/contrib/lite/c:c_api_internal",
+ "//tensorflow/contrib/lite/kernels:op_macros",
] + select({
":haswell": tflite_deps_intel,
":ios_x86_64": tflite_deps_intel,
@@ -462,6 +472,7 @@ cc_library(
"@com_google_absl//absl/base:core_headers",
"//tensorflow/contrib/lite/c:c_api_internal",
"@arm_neon_2_x86_sse",
+ "//tensorflow/contrib/lite/kernels:op_macros",
"@gemmlowp",
] + select({
":arm": [
diff --git a/tensorflow/contrib/lite/kernels/internal/compatibility.h b/tensorflow/contrib/lite/kernels/internal/compatibility.h
index 93fc6b6a76..b87cf2b60d 100644
--- a/tensorflow/contrib/lite/kernels/internal/compatibility.h
+++ b/tensorflow/contrib/lite/kernels/internal/compatibility.h
@@ -15,65 +15,65 @@ limitations under the License.
#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_COMPATIBILITY_H_
#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_COMPATIBILITY_H_
-#include <cassert>
#include <cstdint>
-#include <cstdlib>
+
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
#ifndef TFLITE_DCHECK
-#define TFLITE_DCHECK(condition) (condition) ? (void)0 : assert(false)
+#define TFLITE_DCHECK(condition) (condition) ? (void)0 : TFLITE_ASSERT_FALSE
#endif
#ifndef TFLITE_DCHECK_EQ
-#define TFLITE_DCHECK_EQ(x, y) ((x) == (y)) ? (void)0 : assert(false)
+#define TFLITE_DCHECK_EQ(x, y) ((x) == (y)) ? (void)0 : TFLITE_ASSERT_FALSE
#endif
#ifndef TFLITE_DCHECK_NE
-#define TFLITE_DCHECK_NE(x, y) ((x) != (y)) ? (void)0 : assert(false)
+#define TFLITE_DCHECK_NE(x, y) ((x) != (y)) ? (void)0 : TFLITE_ASSERT_FALSE
#endif
#ifndef TFLITE_DCHECK_GE
-#define TFLITE_DCHECK_GE(x, y) ((x) >= (y)) ? (void)0 : assert(false)
+#define TFLITE_DCHECK_GE(x, y) ((x) >= (y)) ? (void)0 : TFLITE_ASSERT_FALSE
#endif
#ifndef TFLITE_DCHECK_GT
-#define TFLITE_DCHECK_GT(x, y) ((x) > (y)) ? (void)0 : assert(false)
+#define TFLITE_DCHECK_GT(x, y) ((x) > (y)) ? (void)0 : TFLITE_ASSERT_FALSE
#endif
#ifndef TFLITE_DCHECK_LE
-#define TFLITE_DCHECK_LE(x, y) ((x) <= (y)) ? (void)0 : assert(false)
+#define TFLITE_DCHECK_LE(x, y) ((x) <= (y)) ? (void)0 : TFLITE_ASSERT_FALSE
#endif
#ifndef TFLITE_DCHECK_LT
-#define TFLITE_DCHECK_LT(x, y) ((x) < (y)) ? (void)0 : assert(false)
+#define TFLITE_DCHECK_LT(x, y) ((x) < (y)) ? (void)0 : TFLITE_ASSERT_FALSE
#endif
// TODO(ahentz): Clean up: We should stick to the DCHECK versions.
#ifndef TFLITE_CHECK
-#define TFLITE_CHECK(condition) (condition) ? (void)0 : abort()
+#define TFLITE_CHECK(condition) (condition) ? (void)0 : TFLITE_ABORT
#endif
#ifndef TFLITE_CHECK_EQ
-#define TFLITE_CHECK_EQ(x, y) ((x) == (y)) ? (void)0 : abort()
+#define TFLITE_CHECK_EQ(x, y) ((x) == (y)) ? (void)0 : TFLITE_ABORT
#endif
#ifndef TFLITE_CHECK_NE
-#define TFLITE_CHECK_NE(x, y) ((x) != (y)) ? (void)0 : abort()
+#define TFLITE_CHECK_NE(x, y) ((x) != (y)) ? (void)0 : TFLITE_ABORT
#endif
#ifndef TFLITE_CHECK_GE
-#define TFLITE_CHECK_GE(x, y) ((x) >= (y)) ? (void)0 : abort()
+#define TFLITE_CHECK_GE(x, y) ((x) >= (y)) ? (void)0 : TFLITE_ABORT
#endif
#ifndef TFLITE_CHECK_GT
-#define TFLITE_CHECK_GT(x, y) ((x) > (y)) ? (void)0 : abort()
+#define TFLITE_CHECK_GT(x, y) ((x) > (y)) ? (void)0 : TFLITE_ABORT
#endif
#ifndef TFLITE_CHECK_LE
-#define TFLITE_CHECK_LE(x, y) ((x) <= (y)) ? (void)0 : abort()
+#define TFLITE_CHECK_LE(x, y) ((x) <= (y)) ? (void)0 : TFLITE_ABORT
#endif
#ifndef TFLITE_CHECK_LT
-#define TFLITE_CHECK_LT(x, y) ((x) < (y)) ? (void)0 : abort()
+#define TFLITE_CHECK_LT(x, y) ((x) < (y)) ? (void)0 : TFLITE_ABORT
#endif
// TODO(ahentz): Clean up.
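
The check macros above now route through TFLITE_ASSERT_FALSE and TFLITE_ABORT, which this change expects op_macros.h to supply, instead of calling assert() and abort() directly; since every definition is still wrapped in #ifndef, a platform port can also predefine its own versions. A minimal usage sketch, assuming a default build where the macros keep their previous assert()/abort() behaviour (the LoadScale function itself is hypothetical, not part of the commit):

#include <cstdint>

#include "tensorflow/contrib/lite/kernels/internal/compatibility.h"

// Sketch only: illustrates the debug vs. hard check variants.
int32_t LoadScale(const int32_t* table, int size, int index) {
  TFLITE_CHECK(table != nullptr);  // hard check: previously abort(), now TFLITE_ABORT
  TFLITE_DCHECK_GE(index, 0);      // debug check: previously assert(false),
  TFLITE_DCHECK_LT(index, size);   // now TFLITE_ASSERT_FALSE
  return table[index];
}
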
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/depthwiseconv_uint8.h b/tensorflow/contrib/lite/kernels/internal/reference/depthwiseconv_uint8.h
index ecc655cf99..e8fc566502 100644
--- a/tensorflow/contrib/lite/kernels/internal/reference/depthwiseconv_uint8.h
+++ b/tensorflow/contrib/lite/kernels/internal/reference/depthwiseconv_uint8.h
@@ -18,7 +18,6 @@ limitations under the License.
#include <algorithm>
#include "fixedpoint/fixedpoint.h"
-#include "public/gemmlowp.h"
#include "tensorflow/contrib/lite/kernels/internal/common.h"
#include "tensorflow/contrib/lite/kernels/internal/compatibility.h"
#include "tensorflow/contrib/lite/kernels/internal/types.h"
@@ -35,7 +34,6 @@ inline void DepthwiseConv(
const uint8* filter_data, const RuntimeShape& bias_shape,
const int32* bias_data, const RuntimeShape& output_shape,
uint8* output_data) {
- gemmlowp::ScopedProfilingLabel label("DepthwiseConv/8bit");
const int stride_width = params.stride_width;
const int stride_height = params.stride_height;
const int dilation_width_factor = params.dilation_width_factor;
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/fully_connected.h b/tensorflow/contrib/lite/kernels/internal/reference/fully_connected.h
new file mode 100644
index 0000000000..23325e8c4c
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/internal/reference/fully_connected.h
@@ -0,0 +1,460 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_FULLY_CONNECTED_H_
+#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_FULLY_CONNECTED_H_
+
+#include "fixedpoint/fixedpoint.h"
+#include "tensorflow/contrib/lite/kernels/internal/common.h"
+#include "tensorflow/contrib/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/contrib/lite/kernels/internal/round.h"
+#include "tensorflow/contrib/lite/kernels/internal/types.h"
+
+namespace tflite {
+namespace reference_ops {
+
+const int kReverseShift = -1;
+
+inline void FullyConnected(
+ const FullyConnectedParams& params, const RuntimeShape& input_shape,
+ const float* input_data, const RuntimeShape& weights_shape,
+ const float* weights_data, const RuntimeShape& bias_shape,
+ const float* bias_data, const RuntimeShape& output_shape,
+ float* output_data) {
+ const float output_activation_min = params.float_activation_min;
+ const float output_activation_max = params.float_activation_max;
+ // TODO(benoitjacob): This really should be:
+ // const int batches = ArraySize(output_dims, 1);
+ // but the current --variable_batch hack consists in overwriting the 3rd
+ // dimension with the runtime batch size, as we don't keep track for each
+ // array of which dimension is the batch dimension in it.
+ const int output_dims_count = output_shape.DimensionsCount();
+ const int weights_dims_count = weights_shape.DimensionsCount();
+ const int batches = FlatSizeSkipDim(output_shape, output_dims_count - 1);
+ const int output_depth = MatchingDim(weights_shape, weights_dims_count - 2,
+ output_shape, output_dims_count - 1);
+ const int accum_depth = weights_shape.Dims(weights_dims_count - 1);
+ for (int b = 0; b < batches; ++b) {
+ for (int out_c = 0; out_c < output_depth; ++out_c) {
+ float total = 0.f;
+ for (int d = 0; d < accum_depth; ++d) {
+ total += input_data[b * accum_depth + d] *
+ weights_data[out_c * accum_depth + d];
+ }
+ float bias_value = 0.0f;
+ if (bias_data) {
+ bias_value = bias_data[out_c];
+ }
+ output_data[out_c + output_depth * b] = ActivationFunctionWithMinMax(
+ total + bias_value, output_activation_min, output_activation_max);
+ }
+ }
+}
+
+// TODO(b/80418076): Move to legacy ops file, update invocations.
+// Legacy.
+inline void FullyConnected(const float* input_data, const Dims<4>& input_dims,
+ const float* weights_data,
+ const Dims<4>& weights_dims, const float* bias_data,
+ const Dims<4>& bias_dims,
+ float output_activation_min,
+ float output_activation_max, float* output_data,
+ const Dims<4>& output_dims) {
+ tflite::FullyConnectedParams op_params;
+ op_params.float_activation_min = output_activation_min;
+ op_params.float_activation_max = output_activation_max;
+
+ FullyConnected(op_params, DimsToShape(input_dims), input_data,
+ DimsToShape(weights_dims), weights_data,
+ DimsToShape(bias_dims), bias_data, DimsToShape(output_dims),
+ output_data);
+}
+
+// TODO(b/80418076): Move to legacy ops file, update invocations.
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void FullyConnected(const float* input_data, const Dims<4>& input_dims,
+ const float* weights_data, const Dims<4>& weights_dims,
+ const float* bias_data, const Dims<4>& bias_dims,
+ float* output_data, const Dims<4>& output_dims) {
+ float output_activation_min, output_activation_max;
+ GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
+ FullyConnected(input_data, input_dims, weights_data, weights_dims, bias_data,
+ bias_dims, output_activation_min, output_activation_max,
+ output_data, output_dims);
+}
+
+inline void FullyConnected(
+ const FullyConnectedParams& params, const RuntimeShape& input_shape,
+ const uint8* input_data, const RuntimeShape& filter_shape,
+ const uint8* filter_data, const RuntimeShape& bias_shape,
+ const int32* bias_data, const RuntimeShape& output_shape,
+ uint8* output_data, void* gemm_context) {
+ (void)gemm_context; // only used in optimized code.
+ const int32 input_offset = params.input_offset;
+ const int32 filter_offset = params.weights_offset;
+ const int32 output_offset = params.output_offset;
+ const int32 output_multiplier = params.output_multiplier;
+ const int output_shift = params.output_shift;
+ const int32 output_activation_min = params.quantized_activation_min;
+ const int32 output_activation_max = params.quantized_activation_max;
+ TFLITE_DCHECK_GE(filter_shape.DimensionsCount(), 2);
+ TFLITE_DCHECK_GE(output_shape.DimensionsCount(), 1);
+
+ TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+ // TODO(benoitjacob): This really should be:
+ // const int batches = ArraySize(output_dims, 1);
+ // but the current --variable_batch hack consists in overwriting the 3rd
+ // dimension with the runtime batch size, as we don't keep track for each
+ // array of which dimension is the batch dimension in it.
+ const int output_dim_count = output_shape.DimensionsCount();
+ const int filter_dim_count = filter_shape.DimensionsCount();
+ const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1);
+ const int output_depth = MatchingDim(filter_shape, filter_dim_count - 2,
+ output_shape, output_dim_count - 1);
+ const int accum_depth = filter_shape.Dims(filter_dim_count - 1);
+ for (int b = 0; b < batches; ++b) {
+ for (int out_c = 0; out_c < output_depth; ++out_c) {
+ int32 acc = 0;
+ for (int d = 0; d < accum_depth; ++d) {
+ int32 input_val = input_data[b * accum_depth + d];
+ int32 filter_val = filter_data[out_c * accum_depth + d];
+ acc += (filter_val + filter_offset) * (input_val + input_offset);
+ }
+ if (bias_data) {
+ acc += bias_data[out_c];
+ }
+ acc = MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift);
+ acc += output_offset;
+ acc = std::max(acc, output_activation_min);
+ acc = std::min(acc, output_activation_max);
+ output_data[out_c + output_depth * b] = static_cast<uint8>(acc);
+ }
+ }
+}
+
+// TODO(b/80418076): Move to legacy ops file, update invocations.
+// Legacy.
+inline void FullyConnected(const uint8* input_data, const Dims<4>& input_dims,
+ int32 input_offset, const uint8* filter_data,
+ const Dims<4>& filter_dims, int32 filter_offset,
+ const int32* bias_data, const Dims<4>& bias_dims,
+ int32 output_offset, int32 output_multiplier,
+ int output_shift, int32 output_activation_min,
+ int32 output_activation_max, uint8* output_data,
+ const Dims<4>& output_dims, void* gemm_context) {
+ tflite::FullyConnectedParams op_params;
+ op_params.input_offset = input_offset;
+ op_params.weights_offset = filter_offset;
+ op_params.output_offset = output_offset;
+ op_params.output_multiplier = output_multiplier;
+ // Legacy ops used mixed left and right shifts. Now all are +ve-means-left.
+ op_params.output_shift = kReverseShift * output_shift;
+ op_params.quantized_activation_min = output_activation_min;
+ op_params.quantized_activation_max = output_activation_max;
+
+ FullyConnected(op_params, DimsToShape(input_dims), input_data,
+ DimsToShape(filter_dims), filter_data, DimsToShape(bias_dims),
+ bias_data, DimsToShape(output_dims), output_data,
+ gemm_context);
+}
+
+inline void FullyConnected(
+ const FullyConnectedParams& params, const RuntimeShape& input_shape,
+ const uint8* input_data, const RuntimeShape& filter_shape,
+ const uint8* filter_data, const RuntimeShape& bias_shape,
+ const int32* bias_data, const RuntimeShape& output_shape,
+ int16* output_data, void* gemm_context) {
+ (void)gemm_context; // only used in optimized code.
+ const int32 input_offset = params.input_offset;
+ const int32 filter_offset = params.weights_offset;
+ const int32 output_offset = params.output_offset;
+ const int32 output_multiplier = params.output_multiplier;
+ const int output_shift = params.output_shift;
+ const int32 output_activation_min = params.quantized_activation_min;
+ const int32 output_activation_max = params.quantized_activation_max;
+
+ TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+ TFLITE_DCHECK_EQ(output_offset, 0);
+ // TODO(benoitjacob): This really should be:
+ // const int batches = ArraySize(output_dims, 1);
+ // but the current --variable_batch hack consists in overwriting the 3rd
+ // dimension with the runtime batch size, as we don't keep track for each
+ // array of which dimension is the batch dimension in it.
+ const int output_dim_count = output_shape.DimensionsCount();
+ const int filter_dim_count = filter_shape.DimensionsCount();
+ const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1);
+ const int output_depth = MatchingDim(filter_shape, filter_dim_count - 2,
+ output_shape, output_dim_count - 1);
+ const int accum_depth = filter_shape.Dims(filter_dim_count - 1);
+ for (int b = 0; b < batches; ++b) {
+ for (int out_c = 0; out_c < output_depth; ++out_c) {
+ // Internal accumulation.
+ // Initialize accumulator with the bias-value.
+ int32 accum = bias_data[out_c];
+ // Accumulation loop.
+ for (int d = 0; d < accum_depth; ++d) {
+ int16 input_val = input_data[b * accum_depth + d] + input_offset;
+ int16 filter_val = filter_data[out_c * accum_depth + d] + filter_offset;
+ accum += filter_val * input_val;
+ }
+ // Down-scale the final int32 accumulator to the scale used by our
+ // (16-bit, typically 3 integer bits) fixed-point format. The quantized
+ // multiplier and shift here have been pre-computed offline
+ // (e.g. by toco).
+ accum =
+ MultiplyByQuantizedMultiplier(accum, output_multiplier, output_shift);
+ // Saturate, cast to int16, and store to output array.
+ accum = std::max(accum, output_activation_min - output_offset);
+ accum = std::min(accum, output_activation_max - output_offset);
+ accum += output_offset;
+ output_data[out_c + output_depth * b] = accum;
+ }
+ }
+}
+
+// TODO(b/80418076): Move to legacy ops file, update invocations.
+// Legacy.
+inline void FullyConnected(const uint8* input_data, const Dims<4>& input_dims,
+ int32 input_offset, const uint8* filter_data,
+ const Dims<4>& filter_dims, int32 filter_offset,
+ const int32* bias_data, const Dims<4>& bias_dims,
+ int32 output_offset, int32 output_multiplier,
+ int output_shift, int32 output_activation_min,
+ int32 output_activation_max, int16* output_data,
+ const Dims<4>& output_dims, void* gemm_context) {
+ tflite::FullyConnectedParams op_params;
+ op_params.input_offset = input_offset;
+ op_params.weights_offset = filter_offset;
+ op_params.output_offset = output_offset;
+ op_params.output_multiplier = output_multiplier;
+ // Legacy ops used mixed left and right shifts. Now all are +ve-means-left.
+ op_params.output_shift = kReverseShift * output_shift;
+ op_params.quantized_activation_min = output_activation_min;
+ op_params.quantized_activation_max = output_activation_max;
+
+ FullyConnected(op_params, DimsToShape(input_dims), input_data,
+ DimsToShape(filter_dims), filter_data, DimsToShape(bias_dims),
+ bias_data, DimsToShape(output_dims), output_data,
+ gemm_context);
+}
+
+inline void ShuffledFullyConnected(
+ const FullyConnectedParams& params, const RuntimeShape& input_shape,
+ const uint8* input_data, const RuntimeShape& weights_shape,
+ const uint8* shuffled_weights_data, const RuntimeShape& bias_shape,
+ const int32* bias_data, const RuntimeShape& output_shape,
+ int16* output_data, uint8* shuffled_input_workspace_data,
+ void* gemm_context) {
+ (void)gemm_context; // only used in optimized code.
+ const int32 output_multiplier = params.output_multiplier;
+ const int output_shift = params.output_shift;
+ const int32 output_activation_min = params.quantized_activation_min;
+ const int32 output_activation_max = params.quantized_activation_max;
+ TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+
+ TFLITE_DCHECK_GE(input_shape.DimensionsCount(), 1);
+ TFLITE_DCHECK_GE(weights_shape.DimensionsCount(), 2);
+ TFLITE_DCHECK_GE(output_shape.DimensionsCount(), 1);
+ // TODO(benoitjacob): This really should be:
+ // const int batches = ArraySize(output_dims, 1);
+ // but the current --variable_batch hack consists in overwriting the 3rd
+ // dimension with the runtime batch size, as we don't keep track for each
+ // array of which dimension is the batch dimension in it.
+ const int output_dim_count = output_shape.DimensionsCount();
+ const int weights_dim_count = weights_shape.DimensionsCount();
+ const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1);
+ const int output_depth = MatchingDim(weights_shape, weights_dim_count - 2,
+ output_shape, output_dim_count - 1);
+ const int accum_depth = weights_shape.Dims(weights_dim_count - 1);
+ TFLITE_DCHECK((accum_depth % 16) == 0);
+ TFLITE_DCHECK((output_depth % 4) == 0);
+
+ // Shuffling and xoring of input activations into the workspace buffer
+ uint8* shuffled_input_workspace_ptr = shuffled_input_workspace_data;
+ if (batches == 1) {
+ for (int i = 0; i < accum_depth; i++) {
+ shuffled_input_workspace_data[i] = input_data[i] ^ 0x80;
+ }
+ } else if (batches == 4) {
+ for (int c = 0; c < accum_depth; c += 16) {
+ for (int b = 0; b < 4; b++) {
+ const uint8* src_data_ptr = input_data + b * accum_depth + c;
+ for (int j = 0; j < 16; j++) {
+ uint8 src_val = *src_data_ptr++;
+ // Flip the sign bit, so that the kernel will only need to
+ // reinterpret these uint8 values as int8, getting for free the
+ // subtraction of the zero_point value 128.
+ uint8 dst_val = src_val ^ 0x80;
+ *shuffled_input_workspace_ptr++ = dst_val;
+ }
+ }
+ }
+ } else {
+ TFLITE_DCHECK(false);
+ return;
+ }
+
+ // Actual computation
+ if (batches == 1) {
+ int16* output_ptr = output_data;
+ // Shuffled weights have had their sign bit (0x80) pre-flipped (xor'd)
+ // so that just reinterpreting them as int8 values is equivalent to
+ // subtracting 128 from them, thus implementing for free the subtraction of
+ // the zero_point value 128.
+ const int8* shuffled_weights_ptr =
+ reinterpret_cast<const int8*>(shuffled_weights_data);
+ // Likewise, we preshuffled and pre-xored the input data above.
+ const int8* shuffled_input_data =
+ reinterpret_cast<const int8*>(shuffled_input_workspace_data);
+ for (int c = 0; c < output_depth; c += 4) {
+ // Internal accumulation.
+ // Initialize accumulator with the bias-value.
+ int32 accum[4] = {0};
+ // Accumulation loop.
+ for (int d = 0; d < accum_depth; d += 16) {
+ for (int i = 0; i < 4; i++) {
+ for (int j = 0; j < 16; j++) {
+ int8 input_val = shuffled_input_data[d + j];
+ int8 weights_val = *shuffled_weights_ptr++;
+ accum[i] += weights_val * input_val;
+ }
+ }
+ }
+ for (int i = 0; i < 4; i++) {
+ // Add bias value
+ int32 acc = accum[i] + bias_data[c + i];
+ // Down-scale the final int32 accumulator to the scale used by our
+ // (16-bit, typically 3 integer bits) fixed-point format. The quantized
+ // multiplier and shift here have been pre-computed offline
+ // (e.g. by toco).
+ acc =
+ MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift);
+ // Saturate, cast to int16, and store to output array.
+ acc = std::max(acc, output_activation_min);
+ acc = std::min(acc, output_activation_max);
+ output_ptr[c + i] = acc;
+ }
+ }
+ } else if (batches == 4) {
+ int16* output_ptr = output_data;
+ // Shuffled weights have had their sign bit (0x80) pre-flipped (xor'd)
+ // so that just reinterpreting them as int8 values is equivalent to
+ // subtracting 128 from them, thus implementing for free the subtraction of
+ // the zero_point value 128.
+ const int8* shuffled_weights_ptr =
+ reinterpret_cast<const int8*>(shuffled_weights_data);
+ // Likewise, we preshuffled and pre-xored the input data above.
+ const int8* shuffled_input_data =
+ reinterpret_cast<const int8*>(shuffled_input_workspace_data);
+ for (int c = 0; c < output_depth; c += 4) {
+ const int8* shuffled_input_ptr = shuffled_input_data;
+ // Accumulation loop.
+ // Internal accumulation.
+ // Initialize accumulator with the bias-value.
+ int32 accum[4][4];
+ for (int i = 0; i < 4; i++) {
+ for (int b = 0; b < 4; b++) {
+ accum[i][b] = 0;
+ }
+ }
+ for (int d = 0; d < accum_depth; d += 16) {
+ for (int i = 0; i < 4; i++) {
+ for (int b = 0; b < 4; b++) {
+ for (int j = 0; j < 16; j++) {
+ int8 input_val = shuffled_input_ptr[16 * b + j];
+ int8 weights_val = shuffled_weights_ptr[16 * i + j];
+ accum[i][b] += weights_val * input_val;
+ }
+ }
+ }
+ shuffled_input_ptr += 64;
+ shuffled_weights_ptr += 64;
+ }
+ for (int i = 0; i < 4; i++) {
+ for (int b = 0; b < 4; b++) {
+ // Add bias value
+ int32 acc = accum[i][b] + bias_data[c + i];
+ // Down-scale the final int32 accumulator to the scale used by our
+ // (16-bit, typically 3 integer bits) fixed-point format. The
+ // quantized multiplier and shift here have been pre-computed offline
+ // (e.g. by toco).
+ acc = MultiplyByQuantizedMultiplier(acc, output_multiplier,
+ output_shift);
+ // Saturate, cast to int16, and store to output array.
+ acc = std::max(acc, output_activation_min);
+ acc = std::min(acc, output_activation_max);
+ output_ptr[b * output_depth + c + i] = acc;
+ }
+ }
+ }
+ } else {
+ TFLITE_DCHECK(false);
+ return;
+ }
+}
+
+// TODO(b/80418076): Move to legacy ops file, update invocations.
+// Legacy.
+inline void ShuffledFullyConnected(
+ const uint8* input_data, const Dims<4>& input_dims,
+ const uint8* shuffled_weights_data, const Dims<4>& weights_dims,
+ const int32* bias_data, const Dims<4>& bias_dims, int32 output_multiplier,
+ int output_shift, int32 output_activation_min, int32 output_activation_max,
+ int16* output_data, const Dims<4>& output_dims,
+ uint8* shuffled_input_workspace_data, void* gemm_context) {
+ tflite::FullyConnectedParams op_params;
+ op_params.output_multiplier = output_multiplier;
+ // Legacy ops used mixed left and right shifts. Now all are +ve-means-left.
+ op_params.output_shift = kReverseShift * output_shift;
+ op_params.quantized_activation_min = output_activation_min;
+ op_params.quantized_activation_max = output_activation_max;
+
+ ShuffledFullyConnected(op_params, DimsToShape(input_dims), input_data,
+ DimsToShape(weights_dims), shuffled_weights_data,
+ DimsToShape(bias_dims), bias_data,
+ DimsToShape(output_dims), output_data,
+ shuffled_input_workspace_data, gemm_context);
+}
+
+// TODO(b/80418076): Move to legacy ops file, update invocations.
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void FullyConnected(const uint8* input_data, const Dims<4>& input_dims,
+ int32 input_offset, const uint8* filter_data,
+ const Dims<4>& filter_dims, int32 filter_offset,
+ const int32* bias_data, const Dims<4>& bias_dims,
+ int32 output_offset, int32 output_multiplier,
+ int output_shift, int32 output_activation_min,
+ int32 output_activation_max, uint8* output_data,
+ const Dims<4>& output_dims, void* gemm_context) {
+ static_assert(Ac == FusedActivationFunctionType::kNone ||
+ Ac == FusedActivationFunctionType::kRelu ||
+ Ac == FusedActivationFunctionType::kRelu6 ||
+ Ac == FusedActivationFunctionType::kRelu1,
+ "");
+ if (Ac == FusedActivationFunctionType::kNone) {
+ TFLITE_DCHECK_EQ(output_activation_min, 0);
+ TFLITE_DCHECK_EQ(output_activation_max, 255);
+ }
+ FullyConnected(input_data, input_dims, input_offset, filter_data, filter_dims,
+ filter_offset, bias_data, bias_dims, output_offset,
+ output_multiplier, output_shift, output_activation_min,
+ output_activation_max, output_data, output_dims, gemm_context);
+}
+
+} // namespace reference_ops
+} // namespace tflite
+
+#endif // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_FULLY_CONNECTED_H_
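
These fully-connected reference kernels are behaviourally the same ones removed from reference_ops.h below; the split lets them be included without the gemmlowp GemmContext dependency (the quantized overloads now take a void* gemm_context, which the reference paths ignore). A minimal sketch of calling the float overload on one batch of two inputs with a 3x2 weight matrix, assuming RuntimeShape accepts an initializer list; the shapes and values are illustrative only:

#include <limits>

#include "tensorflow/contrib/lite/kernels/internal/reference/fully_connected.h"

// Sketch only: a 2-input, 3-output fully connected layer, batch size 1.
void FullyConnectedFloatExample() {
  const float input[2] = {1.0f, 2.0f};
  const float weights[3 * 2] = {0.1f, 0.2f,  // row-major [output_depth, accum_depth]
                                0.3f, 0.4f,
                                0.5f, 0.6f};
  const float bias[3] = {0.0f, 0.0f, 0.0f};
  float output[3] = {};

  tflite::FullyConnectedParams params;
  params.float_activation_min = std::numeric_limits<float>::lowest();
  params.float_activation_max = std::numeric_limits<float>::max();

  tflite::reference_ops::FullyConnected(
      params, tflite::RuntimeShape({1, 2}), input,
      tflite::RuntimeShape({3, 2}), weights, tflite::RuntimeShape({3}), bias,
      tflite::RuntimeShape({1, 3}), output);
  // output[c] = bias[c] + sum_d input[d] * weights[c * 2 + d], clamped to the
  // activation range, e.g. output[0] = 0.1*1 + 0.2*2 = 0.5.
}
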
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
index 5bfa3bd084..7a5535489a 100644
--- a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
@@ -28,6 +28,8 @@ limitations under the License.
#include "public/gemmlowp.h"
#include "tensorflow/contrib/lite/kernels/internal/common.h"
#include "tensorflow/contrib/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/contrib/lite/kernels/internal/reference/fully_connected.h"
+#include "tensorflow/contrib/lite/kernels/internal/reference/softmax.h"
#include "tensorflow/contrib/lite/kernels/internal/round.h"
#include "tensorflow/contrib/lite/kernels/internal/strided_slice_logic.h"
#include "tensorflow/contrib/lite/kernels/internal/types.h"
@@ -98,13 +100,6 @@ gemmlowp::FixedPoint<tRawType, tIntegerBits> SaturatingSub(
namespace reference_ops {
-// TODO(b/80247582) Remove this constant.
-// This will be phased out as the shifts are revised with more thought. Use of a
-// constant enables us to track progress on this work.
-//
-// Used mainly to convert from old-style shifts (right) to new-style (left).
-static constexpr int kReverseShift = -1;
-
inline void ShapeFromDims(const tflite::Dims<4>& dims, RuntimeShape* shape) {
shape->BuildFrom(
{dims.sizes[3], dims.sizes[2], dims.sizes[1], dims.sizes[0]});
@@ -181,7 +176,7 @@ inline void Conv(const ConvParams& params, const RuntimeShape& input_shape,
TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
- (void)im2col_data; // only used in optimized code.
+ (void)im2col_data; // only used in optimized code.
(void)im2col_shape; // only used in optimized code.
const int batches = MatchingDim(input_shape, 0, output_shape, 0);
const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3);
@@ -606,437 +601,6 @@ inline void SpaceToDepth(const tflite::SpaceToDepthParams& op_params,
}
}
-inline void FullyConnected(
- const FullyConnectedParams& params, const RuntimeShape& input_shape,
- const float* input_data, const RuntimeShape& weights_shape,
- const float* weights_data, const RuntimeShape& bias_shape,
- const float* bias_data, const RuntimeShape& output_shape,
- float* output_data) {
- const float output_activation_min = params.float_activation_min;
- const float output_activation_max = params.float_activation_max;
- // TODO(benoitjacob): This really should be:
- // const int batches = ArraySize(output_dims, 1);
- // but the current --variable_batch hack consists in overwriting the 3rd
- // dimension with the runtime batch size, as we don't keep track for each
- // array of which dimension is the batch dimension in it.
- const int output_dims_count = output_shape.DimensionsCount();
- const int weights_dims_count = weights_shape.DimensionsCount();
- const int batches = FlatSizeSkipDim(output_shape, output_dims_count - 1);
- const int output_depth = MatchingDim(weights_shape, weights_dims_count - 2,
- output_shape, output_dims_count - 1);
- const int accum_depth = weights_shape.Dims(weights_dims_count - 1);
- for (int b = 0; b < batches; ++b) {
- for (int out_c = 0; out_c < output_depth; ++out_c) {
- float total = 0.f;
- for (int d = 0; d < accum_depth; ++d) {
- total += input_data[b * accum_depth + d] *
- weights_data[out_c * accum_depth + d];
- }
- float bias_value = 0.0f;
- if (bias_data) {
- bias_value = bias_data[out_c];
- }
- output_data[out_c + output_depth * b] = ActivationFunctionWithMinMax(
- total + bias_value, output_activation_min, output_activation_max);
- }
- }
-}
-
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// Legacy.
-inline void FullyConnected(const float* input_data, const Dims<4>& input_dims,
- const float* weights_data,
- const Dims<4>& weights_dims, const float* bias_data,
- const Dims<4>& bias_dims,
- float output_activation_min,
- float output_activation_max, float* output_data,
- const Dims<4>& output_dims) {
- tflite::FullyConnectedParams op_params;
- op_params.float_activation_min = output_activation_min;
- op_params.float_activation_max = output_activation_max;
-
- FullyConnected(op_params, DimsToShape(input_dims), input_data,
- DimsToShape(weights_dims), weights_data,
- DimsToShape(bias_dims), bias_data, DimsToShape(output_dims),
- output_data);
-}
-
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// legacy, for compatibility with old checked-in code
-template <FusedActivationFunctionType Ac>
-void FullyConnected(const float* input_data, const Dims<4>& input_dims,
- const float* weights_data, const Dims<4>& weights_dims,
- const float* bias_data, const Dims<4>& bias_dims,
- float* output_data, const Dims<4>& output_dims) {
- float output_activation_min, output_activation_max;
- GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
- FullyConnected(input_data, input_dims, weights_data, weights_dims, bias_data,
- bias_dims, output_activation_min, output_activation_max,
- output_data, output_dims);
-}
-
-inline void FullyConnected(
- const FullyConnectedParams& params, const RuntimeShape& input_shape,
- const uint8* input_data, const RuntimeShape& filter_shape,
- const uint8* filter_data, const RuntimeShape& bias_shape,
- const int32* bias_data, const RuntimeShape& output_shape,
- uint8* output_data, gemmlowp::GemmContext* gemm_context) {
- (void)gemm_context; // only used in optimized code.
- const int32 input_offset = params.input_offset;
- const int32 filter_offset = params.weights_offset;
- const int32 output_offset = params.output_offset;
- const int32 output_multiplier = params.output_multiplier;
- const int output_shift = params.output_shift;
- const int32 output_activation_min = params.quantized_activation_min;
- const int32 output_activation_max = params.quantized_activation_max;
- TFLITE_DCHECK_GE(filter_shape.DimensionsCount(), 2);
- TFLITE_DCHECK_GE(output_shape.DimensionsCount(), 1);
-
- TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
- // TODO(benoitjacob): This really should be:
- // const int batches = ArraySize(output_dims, 1);
- // but the current --variable_batch hack consists in overwriting the 3rd
- // dimension with the runtime batch size, as we don't keep track for each
- // array of which dimension is the batch dimension in it.
- const int output_dim_count = output_shape.DimensionsCount();
- const int filter_dim_count = filter_shape.DimensionsCount();
- const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1);
- const int output_depth = MatchingDim(filter_shape, filter_dim_count - 2,
- output_shape, output_dim_count - 1);
- const int accum_depth = filter_shape.Dims(filter_dim_count - 1);
- for (int b = 0; b < batches; ++b) {
- for (int out_c = 0; out_c < output_depth; ++out_c) {
- int32 acc = 0;
- for (int d = 0; d < accum_depth; ++d) {
- int32 input_val = input_data[b * accum_depth + d];
- int32 filter_val = filter_data[out_c * accum_depth + d];
- acc += (filter_val + filter_offset) * (input_val + input_offset);
- }
- if (bias_data) {
- acc += bias_data[out_c];
- }
- acc = MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift);
- acc += output_offset;
- acc = std::max(acc, output_activation_min);
- acc = std::min(acc, output_activation_max);
- output_data[out_c + output_depth * b] = static_cast<uint8>(acc);
- }
- }
-}
-
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// Legacy.
-inline void FullyConnected(const uint8* input_data, const Dims<4>& input_dims,
- int32 input_offset, const uint8* filter_data,
- const Dims<4>& filter_dims, int32 filter_offset,
- const int32* bias_data, const Dims<4>& bias_dims,
- int32 output_offset, int32 output_multiplier,
- int output_shift, int32 output_activation_min,
- int32 output_activation_max, uint8* output_data,
- const Dims<4>& output_dims,
- gemmlowp::GemmContext* gemm_context) {
- tflite::FullyConnectedParams op_params;
- op_params.input_offset = input_offset;
- op_params.weights_offset = filter_offset;
- op_params.output_offset = output_offset;
- op_params.output_multiplier = output_multiplier;
- // Legacy ops used mixed left and right shifts. Now all are +ve-means-left.
- op_params.output_shift = kReverseShift * output_shift;
- op_params.quantized_activation_min = output_activation_min;
- op_params.quantized_activation_max = output_activation_max;
-
- FullyConnected(op_params, DimsToShape(input_dims), input_data,
- DimsToShape(filter_dims), filter_data, DimsToShape(bias_dims),
- bias_data, DimsToShape(output_dims), output_data,
- gemm_context);
-}
-
-inline void FullyConnected(
- const FullyConnectedParams& params, const RuntimeShape& input_shape,
- const uint8* input_data, const RuntimeShape& filter_shape,
- const uint8* filter_data, const RuntimeShape& bias_shape,
- const int32* bias_data, const RuntimeShape& output_shape,
- int16* output_data, gemmlowp::GemmContext* gemm_context) {
- (void)gemm_context; // only used in optimized code.
- const int32 input_offset = params.input_offset;
- const int32 filter_offset = params.weights_offset;
- const int32 output_offset = params.output_offset;
- const int32 output_multiplier = params.output_multiplier;
- const int output_shift = params.output_shift;
- const int32 output_activation_min = params.quantized_activation_min;
- const int32 output_activation_max = params.quantized_activation_max;
-
- TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
- TFLITE_DCHECK_EQ(output_offset, 0);
- // TODO(benoitjacob): This really should be:
- // const int batches = ArraySize(output_dims, 1);
- // but the current --variable_batch hack consists in overwriting the 3rd
- // dimension with the runtime batch size, as we don't keep track for each
- // array of which dimension is the batch dimension in it.
- const int output_dim_count = output_shape.DimensionsCount();
- const int filter_dim_count = filter_shape.DimensionsCount();
- const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1);
- const int output_depth = MatchingDim(filter_shape, filter_dim_count - 2,
- output_shape, output_dim_count - 1);
- const int accum_depth = filter_shape.Dims(filter_dim_count - 1);
- for (int b = 0; b < batches; ++b) {
- for (int out_c = 0; out_c < output_depth; ++out_c) {
- // Internal accumulation.
- // Initialize accumulator with the bias-value.
- int32 accum = bias_data[out_c];
- // Accumulation loop.
- for (int d = 0; d < accum_depth; ++d) {
- int16 input_val = input_data[b * accum_depth + d] + input_offset;
- int16 filter_val = filter_data[out_c * accum_depth + d] + filter_offset;
- accum += filter_val * input_val;
- }
- // Down-scale the final int32 accumulator to the scale used by our
- // (16-bit, typically 3 integer bits) fixed-point format. The quantized
- // multiplier and shift here have been pre-computed offline
- // (e.g. by toco).
- accum =
- MultiplyByQuantizedMultiplier(accum, output_multiplier, output_shift);
- // Saturate, cast to int16, and store to output array.
- accum = std::max(accum, output_activation_min - output_offset);
- accum = std::min(accum, output_activation_max - output_offset);
- accum += output_offset;
- output_data[out_c + output_depth * b] = accum;
- }
- }
-}
-
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// Legacy.
-inline void FullyConnected(const uint8* input_data, const Dims<4>& input_dims,
- int32 input_offset, const uint8* filter_data,
- const Dims<4>& filter_dims, int32 filter_offset,
- const int32* bias_data, const Dims<4>& bias_dims,
- int32 output_offset, int32 output_multiplier,
- int output_shift, int32 output_activation_min,
- int32 output_activation_max, int16* output_data,
- const Dims<4>& output_dims,
- gemmlowp::GemmContext* gemm_context) {
- tflite::FullyConnectedParams op_params;
- op_params.input_offset = input_offset;
- op_params.weights_offset = filter_offset;
- op_params.output_offset = output_offset;
- op_params.output_multiplier = output_multiplier;
- // Legacy ops used mixed left and right shifts. Now all are +ve-means-left.
- op_params.output_shift = kReverseShift * output_shift;
- op_params.quantized_activation_min = output_activation_min;
- op_params.quantized_activation_max = output_activation_max;
-
- FullyConnected(op_params, DimsToShape(input_dims), input_data,
- DimsToShape(filter_dims), filter_data, DimsToShape(bias_dims),
- bias_data, DimsToShape(output_dims), output_data,
- gemm_context);
-}
-
-inline void ShuffledFullyConnected(
- const FullyConnectedParams& params, const RuntimeShape& input_shape,
- const uint8* input_data, const RuntimeShape& weights_shape,
- const uint8* shuffled_weights_data, const RuntimeShape& bias_shape,
- const int32* bias_data, const RuntimeShape& output_shape,
- int16* output_data, uint8* shuffled_input_workspace_data,
- gemmlowp::GemmContext* gemm_context) {
- (void)gemm_context; // only used in optimized code.
- const int32 output_multiplier = params.output_multiplier;
- const int output_shift = params.output_shift;
- const int32 output_activation_min = params.quantized_activation_min;
- const int32 output_activation_max = params.quantized_activation_max;
- TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
-
- TFLITE_DCHECK_GE(input_shape.DimensionsCount(), 1);
- TFLITE_DCHECK_GE(weights_shape.DimensionsCount(), 2);
- TFLITE_DCHECK_GE(output_shape.DimensionsCount(), 1);
- // TODO(benoitjacob): This really should be:
- // const int batches = ArraySize(output_dims, 1);
- // but the current --variable_batch hack consists in overwriting the 3rd
- // dimension with the runtime batch size, as we don't keep track for each
- // array of which dimension is the batch dimension in it.
- const int output_dim_count = output_shape.DimensionsCount();
- const int weights_dim_count = weights_shape.DimensionsCount();
- const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1);
- const int output_depth = MatchingDim(weights_shape, weights_dim_count - 2,
- output_shape, output_dim_count - 1);
- const int accum_depth = weights_shape.Dims(weights_dim_count - 1);
- TFLITE_DCHECK((accum_depth % 16) == 0);
- TFLITE_DCHECK((output_depth % 4) == 0);
-
- // Shuffling and xoring of input activations into the workspace buffer
- uint8* shuffled_input_workspace_ptr = shuffled_input_workspace_data;
- if (batches == 1) {
- for (int i = 0; i < accum_depth; i++) {
- shuffled_input_workspace_data[i] = input_data[i] ^ 0x80;
- }
- } else if (batches == 4) {
- for (int c = 0; c < accum_depth; c += 16) {
- for (int b = 0; b < 4; b++) {
- const uint8* src_data_ptr = input_data + b * accum_depth + c;
- for (int j = 0; j < 16; j++) {
- uint8 src_val = *src_data_ptr++;
- // Flip the sign bit, so that the kernel will only need to
- // reinterpret these uint8 values as int8, getting for free the
- // subtraction of the zero_point value 128.
- uint8 dst_val = src_val ^ 0x80;
- *shuffled_input_workspace_ptr++ = dst_val;
- }
- }
- }
- } else {
- TFLITE_DCHECK(false);
- return;
- }
-
- // Actual computation
- if (batches == 1) {
- int16* output_ptr = output_data;
- // Shuffled weights have had their sign bit (0x80) pre-flipped (xor'd)
- // so that just reinterpreting them as int8 values is equivalent to
- // subtracting 128 from them, thus implementing for free the subtraction of
- // the zero_point value 128.
- const int8* shuffled_weights_ptr =
- reinterpret_cast<const int8*>(shuffled_weights_data);
- // Likewise, we preshuffled and pre-xored the input data above.
- const int8* shuffled_input_data =
- reinterpret_cast<const int8*>(shuffled_input_workspace_data);
- for (int c = 0; c < output_depth; c += 4) {
- // Internal accumulation.
- // Initialize accumulator with the bias-value.
- int32 accum[4] = {0};
- // Accumulation loop.
- for (int d = 0; d < accum_depth; d += 16) {
- for (int i = 0; i < 4; i++) {
- for (int j = 0; j < 16; j++) {
- int8 input_val = shuffled_input_data[d + j];
- int8 weights_val = *shuffled_weights_ptr++;
- accum[i] += weights_val * input_val;
- }
- }
- }
- for (int i = 0; i < 4; i++) {
- // Add bias value
- int acc = accum[i] + bias_data[c + i];
- // Down-scale the final int32 accumulator to the scale used by our
- // (16-bit, typically 3 integer bits) fixed-point format. The quantized
- // multiplier and shift here have been pre-computed offline
- // (e.g. by toco).
- acc =
- MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift);
- // Saturate, cast to int16, and store to output array.
- acc = std::max(acc, output_activation_min);
- acc = std::min(acc, output_activation_max);
- output_ptr[c + i] = acc;
- }
- }
- } else if (batches == 4) {
- int16* output_ptr = output_data;
- // Shuffled weights have had their sign bit (0x80) pre-flipped (xor'd)
- // so that just reinterpreting them as int8 values is equivalent to
- // subtracting 128 from them, thus implementing for free the subtraction of
- // the zero_point value 128.
- const int8* shuffled_weights_ptr =
- reinterpret_cast<const int8*>(shuffled_weights_data);
- // Likewise, we preshuffled and pre-xored the input data above.
- const int8* shuffled_input_data =
- reinterpret_cast<const int8*>(shuffled_input_workspace_data);
- for (int c = 0; c < output_depth; c += 4) {
- const int8* shuffled_input_ptr = shuffled_input_data;
- // Accumulation loop.
- // Internal accumulation.
- // Initialize accumulator with the bias-value.
- int32 accum[4][4];
- for (int i = 0; i < 4; i++) {
- for (int b = 0; b < 4; b++) {
- accum[i][b] = 0;
- }
- }
- for (int d = 0; d < accum_depth; d += 16) {
- for (int i = 0; i < 4; i++) {
- for (int b = 0; b < 4; b++) {
- for (int j = 0; j < 16; j++) {
- int8 input_val = shuffled_input_ptr[16 * b + j];
- int8 weights_val = shuffled_weights_ptr[16 * i + j];
- accum[i][b] += weights_val * input_val;
- }
- }
- }
- shuffled_input_ptr += 64;
- shuffled_weights_ptr += 64;
- }
- for (int i = 0; i < 4; i++) {
- for (int b = 0; b < 4; b++) {
- // Add bias value
- int acc = accum[i][b] + bias_data[c + i];
- // Down-scale the final int32 accumulator to the scale used by our
- // (16-bit, typically 3 integer bits) fixed-point format. The
- // quantized multiplier and shift here have been pre-computed offline
- // (e.g. by toco).
- acc = MultiplyByQuantizedMultiplier(acc, output_multiplier,
- output_shift);
- // Saturate, cast to int16, and store to output array.
- acc = std::max(acc, output_activation_min);
- acc = std::min(acc, output_activation_max);
- output_ptr[b * output_depth + c + i] = acc;
- }
- }
- }
- } else {
- TFLITE_DCHECK(false);
- return;
- }
-}
-
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// Legacy.
-inline void ShuffledFullyConnected(
- const uint8* input_data, const Dims<4>& input_dims,
- const uint8* shuffled_weights_data, const Dims<4>& weights_dims,
- const int32* bias_data, const Dims<4>& bias_dims, int32 output_multiplier,
- int output_shift, int32 output_activation_min, int32 output_activation_max,
- int16* output_data, const Dims<4>& output_dims,
- uint8* shuffled_input_workspace_data, gemmlowp::GemmContext* gemm_context) {
- tflite::FullyConnectedParams op_params;
- op_params.output_multiplier = output_multiplier;
- // Legacy ops used mixed left and right shifts. Now all are +ve-means-left.
- op_params.output_shift = kReverseShift * output_shift;
- op_params.quantized_activation_min = output_activation_min;
- op_params.quantized_activation_max = output_activation_max;
-
- ShuffledFullyConnected(op_params, DimsToShape(input_dims), input_data,
- DimsToShape(weights_dims), shuffled_weights_data,
- DimsToShape(bias_dims), bias_data,
- DimsToShape(output_dims), output_data,
- shuffled_input_workspace_data, gemm_context);
-}
-
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// legacy, for compatibility with old checked-in code
-template <FusedActivationFunctionType Ac>
-void FullyConnected(const uint8* input_data, const Dims<4>& input_dims,
- int32 input_offset, const uint8* filter_data,
- const Dims<4>& filter_dims, int32 filter_offset,
- const int32* bias_data, const Dims<4>& bias_dims,
- int32 output_offset, int32 output_multiplier,
- int output_shift, int32 output_activation_min,
- int32 output_activation_max, uint8* output_data,
- const Dims<4>& output_dims,
- gemmlowp::GemmContext* gemm_context) {
- static_assert(Ac == FusedActivationFunctionType::kNone ||
- Ac == FusedActivationFunctionType::kRelu ||
- Ac == FusedActivationFunctionType::kRelu6 ||
- Ac == FusedActivationFunctionType::kRelu1,
- "");
- if (Ac == FusedActivationFunctionType::kNone) {
- TFLITE_DCHECK_EQ(output_activation_min, 0);
- TFLITE_DCHECK_EQ(output_activation_max, 255);
- }
- FullyConnected(input_data, input_dims, input_offset, filter_data, filter_dims,
- filter_offset, bias_data, bias_dims, output_offset,
- output_multiplier, output_shift, output_activation_min,
- output_activation_max, output_data, output_dims, gemm_context);
-}
-
inline void Relu(const RuntimeShape& input_shape, const float* input_data,
const RuntimeShape& output_shape, float* output_data) {
const int flat_size = MatchingFlatSize(input_shape, output_shape);
@@ -3238,144 +2802,6 @@ inline void LocalResponseNormalization(
}
}
-inline void Softmax(const SoftmaxParams& params,
- const RuntimeShape& input_shape, const float* input_data,
- const RuntimeShape& output_shape, float* output_data) {
- const int trailing_dim = input_shape.DimensionsCount() - 1;
- const int outer_size =
- MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
- const int depth =
- MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
-
- for (int i = 0; i < outer_size; ++i) {
- // Find max element value which we'll use to ensure numerical stability
- // taking advantage of the following equality:
- // exp(x[i])/sum(exp(x[i])) == exp(x[i]+C)/sum(exp(x[i]+C))
- float max = std::numeric_limits<float>::lowest();
- for (int c = 0; c < depth; ++c) {
- max = std::max(max, input_data[i * depth + c]);
- }
-
- // Compute sum.
- float sum = 0.f;
- for (int c = 0; c < depth; ++c) {
- sum += std::exp((input_data[i * depth + c] - max) * params.beta);
- }
-
- // Compute result.
- for (int c = 0; c < depth; ++c) {
- output_data[i * depth + c] =
- std::exp((input_data[i * depth + c] - max) * params.beta) / sum;
- }
- }
-}
-
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// Legacy.
-inline void Softmax(const float* input_data, const RuntimeShape& input_shape,
- float beta, float* output_data,
- const RuntimeShape& output_shape) {
- SoftmaxParams params;
- params.beta = beta;
- Softmax(params, input_shape, input_data, output_shape, output_data);
-}
-
-inline void Softmax(const SoftmaxParams& params,
- const RuntimeShape& input_shape, const uint8* input_data,
- const RuntimeShape& output_shape, uint8* output_data) {
- const int32 input_beta_multiplier = params.input_multiplier;
- const int32 input_beta_left_shift = params.input_left_shift;
- const int diff_min = params.diff_min;
- // The representation chosen for the input to the exp() function is Q5.26.
- // We need to leave extra space since values that we skip might be as large as
- // -32 before multiplying by input_beta_multiplier, and therefore as large as
- // -16 afterwards. Note that exp(-8) is definitely not insignificant to
- // accumulation, but exp(-16) definitely is.
- static const int kScaledDiffIntegerBits = 5;
- static const int kAccumulationIntegerBits = 12;
- using FixedPointScaledDiff =
- gemmlowp::FixedPoint<int32, kScaledDiffIntegerBits>;
- using FixedPointAccum = gemmlowp::FixedPoint<int32, kAccumulationIntegerBits>;
- using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;
-
- const int trailing_dim = input_shape.DimensionsCount() - 1;
- const int outer_size =
- MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
- const int depth =
- MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
-
- for (int i = 0; i < outer_size; ++i) {
- uint8 max_in_row = 0;
- for (int c = 0; c < depth; ++c) {
- max_in_row = std::max(max_in_row, input_data[i * depth + c]);
- }
-
- FixedPointAccum sum_of_exps = FixedPointAccum::Zero();
- for (int c = 0; c < depth; ++c) {
- int32 input_diff =
- static_cast<int32>(input_data[i * depth + c]) - max_in_row;
- if (input_diff >= diff_min) {
- const int32 input_diff_rescaled =
- MultiplyByQuantizedMultiplierGreaterThanOne(
- input_diff, input_beta_multiplier, input_beta_left_shift);
- const FixedPointScaledDiff scaled_diff_f8 =
- FixedPointScaledDiff::FromRaw(input_diff_rescaled);
- sum_of_exps = sum_of_exps + gemmlowp::Rescale<kAccumulationIntegerBits>(
- exp_on_negative_values(scaled_diff_f8));
- }
- }
-
- int32 fixed_sum_of_exps = sum_of_exps.raw();
- int headroom_plus_one =
- CountLeadingZeros(static_cast<uint32>(fixed_sum_of_exps));
- // This is the number of bits to the left of the binary point above 1.0.
- // Consider fixed_sum_of_exps=1.25. In that case shifted_scale=0.8 and
- // no later adjustment will be needed.
- int num_bits_over_unit = kAccumulationIntegerBits - headroom_plus_one;
- int32 shifted_sum_minus_one = static_cast<int32>(
- (static_cast<uint32>(fixed_sum_of_exps) << headroom_plus_one) -
- (static_cast<uint32>(1) << 31));
-
- FixedPoint0 shifted_scale = gemmlowp::one_over_one_plus_x_for_x_in_0_1(
- FixedPoint0::FromRaw(shifted_sum_minus_one));
-
- for (int c = 0; c < depth; ++c) {
- int32 input_diff =
- static_cast<int32>(input_data[i * depth + c]) - max_in_row;
- if (input_diff >= diff_min) {
- const int32 input_diff_rescaled =
- MultiplyByQuantizedMultiplierGreaterThanOne(
- input_diff, input_beta_multiplier, input_beta_left_shift);
- const FixedPointScaledDiff scaled_diff_f8 =
- FixedPointScaledDiff::FromRaw(input_diff_rescaled);
-
- FixedPoint0 exp_in_0 = exp_on_negative_values(scaled_diff_f8);
- int32 unsat_output = gemmlowp::RoundingDivideByPOT(
- (shifted_scale * exp_in_0).raw(), num_bits_over_unit + 31 - 8);
-
- output_data[i * depth + c] = static_cast<uint8>(
- std::max(std::min(unsat_output, static_cast<int32>(255)), 0));
-
- } else {
- output_data[i * depth + c] = 0;
- }
- }
- }
-}
-
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-// Legacy
-inline void Softmax(const uint8* input_data, const RuntimeShape& input_shape,
- int32 input_beta_multiplier, int32 input_beta_left_shift,
- int diff_min, uint8* output_data,
- const RuntimeShape& output_shape) {
- SoftmaxParams params;
- params.input_multiplier = input_beta_multiplier;
- params.input_left_shift = input_beta_left_shift;
- params.diff_min = diff_min;
- Softmax(params, input_shape, input_data, output_shape, output_data);
-}
-
inline void LogSoftmax(const SoftmaxParams& params,
const RuntimeShape& input_shape, const float* input_data,
const RuntimeShape& output_shape, float* output_data) {
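
After this change, reference_ops.h simply includes the new fully_connected.h and softmax.h headers, so the implementations deleted here remain available under the same tflite::reference_ops names. A minimal sketch of the relocated float Softmax on a single row of four logits, again assuming RuntimeShape accepts an initializer list; values are illustrative:

#include "tensorflow/contrib/lite/kernels/internal/reference/softmax.h"

// Sketch only: softmax over one row of four logits with beta = 1.
void SoftmaxFloatExample() {
  const float logits[4] = {1.0f, 2.0f, 3.0f, 4.0f};
  float probs[4] = {};

  tflite::SoftmaxParams params;
  params.beta = 1.0f;

  // The kernel subtracts the row max before exponentiating, which keeps exp()
  // in range without changing the result:
  //   exp(x[i] - C) / sum(exp(x[j] - C)) == exp(x[i]) / sum(exp(x[j])).
  tflite::reference_ops::Softmax(params, tflite::RuntimeShape({1, 4}), logits,
                                 tflite::RuntimeShape({1, 4}), probs);
}
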
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/softmax.h b/tensorflow/contrib/lite/kernels/internal/reference/softmax.h
new file mode 100644
index 0000000000..006174e8db
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/internal/reference/softmax.h
@@ -0,0 +1,202 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_SOFTMAX_H_
+#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_SOFTMAX_H_
+
+#include "fixedpoint/fixedpoint.h"
+#include "tensorflow/contrib/lite/kernels/internal/common.h"
+#include "tensorflow/contrib/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/contrib/lite/kernels/internal/round.h"
+#include "tensorflow/contrib/lite/kernels/internal/types.h"
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace reference_ops {
+
+inline void Softmax(const SoftmaxParams& params,
+ const RuntimeShape& input_shape, const float* input_data,
+ const RuntimeShape& output_shape, float* output_data) {
+ const int trailing_dim = input_shape.DimensionsCount() - 1;
+ const int outer_size =
+ MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
+ const int depth =
+ MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
+
+ for (int i = 0; i < outer_size; ++i) {
+ // Find max element value which we'll use to ensure numerical stability
+ // taking advantage of the following equality:
+ // exp(x[i])/sum(exp(x[i])) == exp(x[i]+C)/sum(exp(x[i]+C))
+ float max = std::numeric_limits<float>::lowest();
+ for (int c = 0; c < depth; ++c) {
+ max = std::max(max, input_data[i * depth + c]);
+ }
+
+ // Compute sum.
+ float sum = 0.f;
+ for (int c = 0; c < depth; ++c) {
+ sum += std::exp((input_data[i * depth + c] - max) * params.beta);
+ }
+
+ // Compute result.
+ for (int c = 0; c < depth; ++c) {
+ output_data[i * depth + c] =
+ std::exp((input_data[i * depth + c] - max) * params.beta) / sum;
+ }
+ }
+}
+
+// TODO(b/80418076): Move to legacy ops file, update invocations.
+// Legacy.
+inline void Softmax(const float* input_data, const RuntimeShape& input_shape,
+ float beta, float* output_data,
+ const RuntimeShape& output_shape) {
+ SoftmaxParams params;
+ params.beta = beta;
+ Softmax(params, input_shape, input_data, output_shape, output_data);
+}
+
+inline void Softmax(const SoftmaxParams& params,
+ const RuntimeShape& input_shape, const uint8* input_data,
+ const RuntimeShape& output_shape, uint8* output_data) {
+ const int32 input_beta_multiplier = params.input_multiplier;
+ const int32 input_beta_left_shift = params.input_left_shift;
+ const int diff_min = params.diff_min;
+ // The representation chosen for the input to the exp() function is Q5.26.
+ // We need to leave extra space since values that we skip might be as large as
+ // -32 before multiplying by input_beta_multiplier, and therefore as large as
+ // -16 afterwards. Note that exp(-8) is definitely not insignificant to
+ // accumulation, but exp(-16) definitely is.
+ static const int kScaledDiffIntegerBits = 5;
+ static const int kAccumulationIntegerBits = 12;
+ using FixedPointScaledDiff =
+ gemmlowp::FixedPoint<int32, kScaledDiffIntegerBits>;
+ using FixedPointAccum = gemmlowp::FixedPoint<int32, kAccumulationIntegerBits>;
+ using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;
+
+ const int trailing_dim = input_shape.DimensionsCount() - 1;
+ const int outer_size =
+ MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
+ const int depth =
+ MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
+
+ for (int i = 0; i < outer_size; ++i) {
+ uint8 max_in_row = 0;
+ for (int c = 0; c < depth; ++c) {
+ max_in_row = std::max(max_in_row, input_data[i * depth + c]);
+ }
+
+ FixedPointAccum sum_of_exps = FixedPointAccum::Zero();
+ for (int c = 0; c < depth; ++c) {
+ int32 input_diff =
+ static_cast<int32>(input_data[i * depth + c]) - max_in_row;
+ if (input_diff >= diff_min) {
+ const int32 input_diff_rescaled =
+ MultiplyByQuantizedMultiplierGreaterThanOne(
+ input_diff, input_beta_multiplier, input_beta_left_shift);
+ const FixedPointScaledDiff scaled_diff_f8 =
+ FixedPointScaledDiff::FromRaw(input_diff_rescaled);
+ sum_of_exps = sum_of_exps + gemmlowp::Rescale<kAccumulationIntegerBits>(
+ exp_on_negative_values(scaled_diff_f8));
+ }
+ }
+
+ int32 fixed_sum_of_exps = sum_of_exps.raw();
+ int headroom_plus_one =
+ CountLeadingZeros(static_cast<uint32>(fixed_sum_of_exps));
+ // This is the number of bits to the left of the binary point above 1.0.
+ // Consider fixed_sum_of_exps=1.25. In that case shifted_scale=0.8 and
+ // no later adjustment will be needed.
+ int num_bits_over_unit = kAccumulationIntegerBits - headroom_plus_one;
+ int32 shifted_sum_minus_one = static_cast<int32>(
+ (static_cast<uint32>(fixed_sum_of_exps) << headroom_plus_one) -
+ (static_cast<uint32>(1) << 31));
+
+ FixedPoint0 shifted_scale = gemmlowp::one_over_one_plus_x_for_x_in_0_1(
+ FixedPoint0::FromRaw(shifted_sum_minus_one));
+
+ for (int c = 0; c < depth; ++c) {
+ int32 input_diff =
+ static_cast<int32>(input_data[i * depth + c]) - max_in_row;
+ if (input_diff >= diff_min) {
+ const int32 input_diff_rescaled =
+ MultiplyByQuantizedMultiplierGreaterThanOne(
+ input_diff, input_beta_multiplier, input_beta_left_shift);
+ const FixedPointScaledDiff scaled_diff_f8 =
+ FixedPointScaledDiff::FromRaw(input_diff_rescaled);
+
+ FixedPoint0 exp_in_0 = exp_on_negative_values(scaled_diff_f8);
+ int32 unsat_output = gemmlowp::RoundingDivideByPOT(
+ (shifted_scale * exp_in_0).raw(), num_bits_over_unit + 31 - 8);
+
+ output_data[i * depth + c] = static_cast<uint8>(
+ std::max(std::min(unsat_output, static_cast<int32>(255)),
+ static_cast<int32>(0)));
+
+ } else {
+ output_data[i * depth + c] = 0;
+ }
+ }
+ }
+}
+
+// TODO(b/80418076): Move to legacy ops file, update invocations.
+// Legacy
+inline void Softmax(const uint8* input_data, const RuntimeShape& input_shape,
+ int32 input_beta_multiplier, int32 input_beta_left_shift,
+ int diff_min, uint8* output_data,
+ const RuntimeShape& output_shape) {
+ SoftmaxParams params;
+ params.input_multiplier = input_beta_multiplier;
+ params.input_left_shift = input_beta_left_shift;
+ params.diff_min = diff_min;
+ Softmax(params, input_shape, input_data, output_shape, output_data);
+}
+
+// Performs softmax along the input of size (input_size * batch_size).
+inline void Softmax(const float* in, const int input_size, const int batch_size,
+ const float beta, float* out) {
+ // TF_LITE_ASSERT(input_size > 0);
+
+ // For each batch
+ for (int b = 0; b < batch_size; b++) {
+ // Find the max coeff.
+ float max_coeff = in[0];
+ for (int i = 1; i < input_size; i++) {
+ if (in[i] > max_coeff) max_coeff = in[i];
+ }
+
+ // Compute the normalized sum of exps.
+ float exp_sum = 0.0;
+ for (int i = 0; i < input_size; i++) {
+ out[i] = std::exp((in[i] - max_coeff) * beta);
+ exp_sum += out[i];
+ }
+
+ // Divide by the sum of exps.
+ float reciprocal_sum_exp = 1.f / exp_sum;
+ for (int i = 0; i < input_size; i++) {
+ out[i] *= reciprocal_sum_exp;
+ }
+
+ // Advance in and out pointers for the next batch.
+ in += input_size;
+ out += input_size;
+ }
+}
+
+} // namespace reference_ops
+} // namespace tflite
+
+#endif // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_SOFTMAX_H_
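The new header keeps both softmax reference kernels behind the single SoftmaxParams argument: the float overload reads params.beta, while the uint8 overload reads input_multiplier, input_left_shift and diff_min. A minimal sketch of driving the float path is below; it is an illustration only, and the RuntimeShape(dimensions_count, dims_data) constructor plus the made-up shape and values are assumptions on my part, not part of this change.

// Sketch: one row of four logits through reference_ops::Softmax (float path).
#include <cstdint>
#include <cstdio>
#include "tensorflow/contrib/lite/kernels/internal/reference/softmax.h"

int main() {
  const int32_t dims[2] = {1, 4};        // outer_size = 1, depth = 4
  tflite::RuntimeShape shape(2, dims);   // same shape for input and output

  const float input[4] = {1.0f, 2.0f, 3.0f, 4.0f};
  float output[4] = {};

  tflite::SoftmaxParams params;
  params.beta = 1.0f;                    // beta scales the logits before exp()

  tflite::reference_ops::Softmax(params, shape, input, shape, output);

  // The outputs sum to 1 and the largest logit gets the largest probability.
  for (int i = 0; i < 4; ++i) printf("output[%d] = %f\n", i, output[i]);
  return 0;
}

The quantized overload follows the same call shape; the extra machinery above (Q5.26 rescaling, headroom counting, one_over_one_plus_x_for_x_in_0_1) exists so the whole computation stays in 32-bit fixed point and never materializes a float.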
diff --git a/tensorflow/contrib/lite/kernels/internal/types.h b/tensorflow/contrib/lite/kernels/internal/types.h
index 3e0308721e..a3a5994c9c 100644
--- a/tensorflow/contrib/lite/kernels/internal/types.h
+++ b/tensorflow/contrib/lite/kernels/internal/types.h
@@ -15,8 +15,8 @@ limitations under the License.
#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_TYPES_H_
#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_TYPES_H_
+#include <algorithm>
#include <cstring>
-#include <iterator>
#include "absl/base/macros.h"
#include "tensorflow/contrib/lite/kernels/internal/compatibility.h"
@@ -126,7 +126,11 @@ class RuntimeShape {
explicit RuntimeShape(int dimensions_count) : size_(dimensions_count) {
if (dimensions_count > kMaxSmallSize) {
+#ifdef TF_LITE_STATIC_MEMORY
+ TFLITE_CHECK(false && "No shape resizing supported on this platform");
+#else // TF_LITE_STATIC_MEMORY
dims_pointer_ = new int32[dimensions_count];
+#endif // TF_LITE_STATIC_MEMORY
}
}
@@ -161,7 +165,11 @@ class RuntimeShape {
~RuntimeShape() {
if (size_ > kMaxSmallSize) {
+#ifdef TF_LITE_STATIC_MEMORY
+ TFLITE_CHECK(false && "No shape resizing supported on this platform");
+#else // TF_LITE_STATIC_MEMORY
delete[] dims_pointer_;
+#endif // TF_LITE_STATIC_MEMORY
}
}
@@ -192,11 +200,19 @@ class RuntimeShape {
inline void Resize(int dimensions_count) {
if (size_ > kMaxSmallSize) {
+#ifdef TF_LITE_STATIC_MEMORY
+ TFLITE_CHECK(false && "No shape resizing supported on this platform");
+#else // TF_LITE_STATIC_MEMORY
delete[] dims_pointer_;
+#endif // TF_LITE_STATIC_MEMORY
}
size_ = dimensions_count;
if (dimensions_count > kMaxSmallSize) {
+#ifdef TF_LITE_STATIC_MEMORY
+ TFLITE_CHECK(false && "No shape resizing supported on this platform");
+#else // TF_LITE_STATIC_MEMORY
dims_pointer_ = new int32[dimensions_count];
+#endif // TF_LITE_STATIC_MEMORY
}
}
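Under TF_LITE_STATIC_MEMORY the class above never touches the heap: any shape wider than the inline small-size array is a hard TFLITE_CHECK failure instead of a new[]/delete[] pair. A stripped-down sketch of that pattern follows; MiniShape, kSmallSize and the abort() fallback are illustrative stand-ins, not the real types.h code.

// Sketch of the static-memory pattern: inline storage for small ranks, heap
// growth compiled out entirely when TF_LITE_STATIC_MEMORY is defined.
#include <cstdint>
#include <cstdlib>

class MiniShape {
 public:
  static constexpr int kSmallSize = 4;  // stand-in for kMaxSmallSize

  explicit MiniShape(int dimensions_count) : size_(dimensions_count) {
    if (dimensions_count > kSmallSize) {
#ifdef TF_LITE_STATIC_MEMORY
      abort();  // no dynamically sized shapes on this platform
#else
      dims_pointer_ = new int32_t[dimensions_count];
#endif
    }
  }

  ~MiniShape() {
#ifndef TF_LITE_STATIC_MEMORY
    if (size_ > kSmallSize) delete[] dims_pointer_;
#endif
  }

 private:
  int size_;
  union {
    int32_t dims_[kSmallSize];
    int32_t* dims_pointer_;
  };
};

The practical consequence for embedded builds is that every tensor rank has to fit the inline array; asking for more dimensions fails loudly at construction time rather than allocating behind the caller's back.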
diff --git a/tensorflow/contrib/lite/kernels/kernel_util.cc b/tensorflow/contrib/lite/kernels/kernel_util.cc
index 08f942c933..503ef28459 100644
--- a/tensorflow/contrib/lite/kernels/kernel_util.cc
+++ b/tensorflow/contrib/lite/kernels/kernel_util.cc
@@ -107,6 +107,9 @@ bool HaveSameShapes(const TfLiteTensor* input1, const TfLiteTensor* input2) {
return TfLiteIntArrayEqual(input1->dims, input2->dims);
}
+// TODO(petewarden): Having macros around this is ugly, look at other strategies
+// before replicating this approach elsewhere.
+#ifndef TF_LITE_STATIC_MEMORY
TfLiteStatus CalculateShapeForBroadcast(TfLiteContext* context,
const TfLiteTensor* input1,
const TfLiteTensor* input2,
@@ -125,5 +128,6 @@ TfLiteStatus CalculateShapeForBroadcast(TfLiteContext* context,
*output_shape = shape.release();
return kTfLiteOk;
}
+#endif // TF_LITE_STATIC_MEMORY
} // namespace tflite
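CalculateShapeForBroadcast is fenced off because it builds a fresh TfLiteIntArray for its result, which is exactly the kind of allocation a static-memory build wants to forbid. The rule it enforces is the usual trailing-dimension broadcast: walk both shapes from the last dimension, require equal sizes or a 1, and take the max. A standalone sketch on std::vector is below; the helper name and container choice are illustrative, not the TfLite API.

// Sketch of trailing-dimension broadcasting, e.g. {8, 1, 6} x {7, 6} -> {8, 7, 6}.
#include <algorithm>
#include <cstdio>
#include <vector>

bool BroadcastShape(const std::vector<int>& a, const std::vector<int>& b,
                    std::vector<int>* out) {
  const int out_dims = static_cast<int>(std::max(a.size(), b.size()));
  out->assign(out_dims, 1);
  for (int i = 0; i < out_dims; ++i) {
    // Read from the back; missing leading dimensions act like size 1.
    const int da = i < static_cast<int>(a.size()) ? a[a.size() - 1 - i] : 1;
    const int db = i < static_cast<int>(b.size()) ? b[b.size() - 1 - i] : 1;
    if (da != db && da != 1 && db != 1) return false;  // shapes incompatible
    (*out)[out_dims - 1 - i] = std::max(da, db);
  }
  return true;
}

int main() {
  std::vector<int> out;
  if (BroadcastShape({8, 1, 6}, {7, 6}, &out)) {
    for (int d : out) printf("%d ", d);  // prints: 8 7 6
    printf("\n");
  }
  return 0;
}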
diff --git a/tensorflow/contrib/lite/kernels/op_macros.h b/tensorflow/contrib/lite/kernels/op_macros.h
index d66364c4d8..11e814daee 100644
--- a/tensorflow/contrib/lite/kernels/op_macros.h
+++ b/tensorflow/contrib/lite/kernels/op_macros.h
@@ -15,17 +15,55 @@ limitations under the License.
#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_OP_MACROS_H_
#define TENSORFLOW_CONTRIB_LITE_KERNELS_OP_MACROS_H_
+// If we're on a platform without standard IO functions, fall back to a
+// non-portable function.
+#ifdef TF_LITE_MCU_DEBUG_LOG
+
+// This header is pulled in from the support library at
+// https://github.com/google/stm32_bare_lib
+#include <debug_log.h>
+
+#define DEBUG_LOG(x) \
+ do { \
+ DebugLog(x); \
+ } while (0)
+
+inline void InfiniteLoop() {
+ DEBUG_LOG("HALTED\n");
+ while (1) {
+ }
+}
+#define TFLITE_ASSERT_FALSE InfiniteLoop();
+#define TFLITE_ABORT InfiniteLoop();
+
+#else // TF_LITE_MCU_DEBUG_LOG
+
+#include <cassert>
#include <cstdio>
+#include <cstdlib>
-#define TF_LITE_FATAL(msg) \
- do { \
- fprintf(stderr, "%s\n", (msg)); \
- exit(1); \
+#define DEBUG_LOG(x) \
+ do { \
+ fprintf(stderr, "%s", (x)); \
} while (0)
+
+#define TFLITE_ASSERT_FALSE assert(false)
+#define TFLITE_ABORT abort()
+
+#endif // TF_LITE_MCU_DEBUG_LOG
+
+#define TF_LITE_FATAL(msg) \
+ do { \
+ DEBUG_LOG(msg); \
+ DEBUG_LOG("\nFATAL\n"); \
+ TFLITE_ABORT; \
+ } while (0)
+
#define TF_LITE_ASSERT(x) \
do { \
if (!(x)) TF_LITE_FATAL(#x); \
} while (0)
+
#define TF_LITE_ASSERT_EQ(x, y) \
do { \
if ((x) != (y)) TF_LITE_FATAL(#x " didn't equal " #y); \