author    Andrew Selle <aselle@google.com>     2017-11-10 10:35:35 -0800
committer Andrew Selle <aselle@andyselle.com>  2017-11-10 16:14:42 -0800
commit    0b15439f8f0f2d4755587f4096c3ea04cb199d23 (patch)
tree      9aa4fc8162bf9b4ee50112a7b85703f70ca4df08 /tensorflow/contrib/lite/kernels
parent    7ac140a5845553275427162aabd9d54987144b4a (diff)
Internal Change.
PiperOrigin-RevId: 175307445
Diffstat (limited to 'tensorflow/contrib/lite/kernels')
-rw-r--r--  tensorflow/contrib/lite/kernels/BUILD  408
-rw-r--r--  tensorflow/contrib/lite/kernels/activation_functor.h  58
-rw-r--r--  tensorflow/contrib/lite/kernels/activations.cc  389
-rw-r--r--  tensorflow/contrib/lite/kernels/activations_test.cc  323
-rw-r--r--  tensorflow/contrib/lite/kernels/add.cc  184
-rw-r--r--  tensorflow/contrib/lite/kernels/add_test.cc  171
-rw-r--r--  tensorflow/contrib/lite/kernels/basic_rnn.cc  161
-rw-r--r--  tensorflow/contrib/lite/kernels/basic_rnn_test.cc  267
-rw-r--r--  tensorflow/contrib/lite/kernels/concatenation.cc  200
-rw-r--r--  tensorflow/contrib/lite/kernels/concatenation_test.cc  162
-rw-r--r--  tensorflow/contrib/lite/kernels/conv.cc  425
-rw-r--r--  tensorflow/contrib/lite/kernels/conv_test.cc  440
-rw-r--r--  tensorflow/contrib/lite/kernels/depthwise_conv.cc  289
-rw-r--r--  tensorflow/contrib/lite/kernels/depthwise_conv_test.cc  186
-rw-r--r--  tensorflow/contrib/lite/kernels/embedding_lookup.cc  104
-rw-r--r--  tensorflow/contrib/lite/kernels/embedding_lookup_sparse.cc  248
-rw-r--r--  tensorflow/contrib/lite/kernels/embedding_lookup_sparse_test.cc  166
-rw-r--r--  tensorflow/contrib/lite/kernels/embedding_lookup_test.cc  94
-rw-r--r--  tensorflow/contrib/lite/kernels/fully_connected.cc  307
-rw-r--r--  tensorflow/contrib/lite/kernels/fully_connected_test.cc  377
-rw-r--r--  tensorflow/contrib/lite/kernels/gemm_support.cc  68
-rw-r--r--  tensorflow/contrib/lite/kernels/gemm_support.h  54
-rw-r--r--  tensorflow/contrib/lite/kernels/hashtable_lookup.cc  155
-rw-r--r--  tensorflow/contrib/lite/kernels/hashtable_lookup_test.cc  176
-rw-r--r--  tensorflow/contrib/lite/kernels/internal/BUILD  359
-rw-r--r--  tensorflow/contrib/lite/kernels/internal/common.h  107
-rw-r--r--  tensorflow/contrib/lite/kernels/internal/compatibility.h  78
-rw-r--r--  tensorflow/contrib/lite/kernels/internal/optimized/cpu_check.h  65
-rw-r--r--  tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_float.h  987
-rw-r--r--  tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8.h  1916
-rw-r--r--  tensorflow/contrib/lite/kernels/internal/optimized/eigen_spatial_convolutions.h  231
-rw-r--r--  tensorflow/contrib/lite/kernels/internal/optimized/eigen_tensor_reduced_instantiations_google.h  143
-rw-r--r--  tensorflow/contrib/lite/kernels/internal/optimized/eigen_tensor_reduced_instantiations_oss.h  167
-rw-r--r--  tensorflow/contrib/lite/kernels/internal/optimized/multithreaded_conv.h  195
-rw-r--r--  tensorflow/contrib/lite/kernels/internal/optimized/neon_tensor_utils.cc  337
-rw-r--r--  tensorflow/contrib/lite/kernels/internal/optimized/neon_tensor_utils.h  113
-rw-r--r--  tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h  3715
-rw-r--r--  tensorflow/contrib/lite/kernels/internal/optimized/tensor_utils_impl.h  138
-rw-r--r--  tensorflow/contrib/lite/kernels/internal/quantization_util.cc  95
-rw-r--r--  tensorflow/contrib/lite/kernels/internal/quantization_util.h  55
-rw-r--r--  tensorflow/contrib/lite/kernels/internal/quantization_util_test.cc  108
-rw-r--r--  tensorflow/contrib/lite/kernels/internal/reference/depthwiseconv_float.h  115
-rw-r--r--  tensorflow/contrib/lite/kernels/internal/reference/depthwiseconv_uint8.h  138
-rw-r--r--  tensorflow/contrib/lite/kernels/internal/reference/portable_tensor_utils.cc  165
-rw-r--r--  tensorflow/contrib/lite/kernels/internal/reference/portable_tensor_utils.h  189
-rw-r--r--  tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h  2455
-rw-r--r--  tensorflow/contrib/lite/kernels/internal/round.h  39
-rw-r--r--  tensorflow/contrib/lite/kernels/internal/tensor.h  87
-rw-r--r--  tensorflow/contrib/lite/kernels/internal/tensor_test.cc  55
-rw-r--r--  tensorflow/contrib/lite/kernels/internal/tensor_utils.cc  27
-rw-r--r--  tensorflow/contrib/lite/kernels/internal/tensor_utils.h  116
-rw-r--r--  tensorflow/contrib/lite/kernels/internal/tensor_utils_test.cc  192
-rw-r--r--  tensorflow/contrib/lite/kernels/internal/types.h  81
-rw-r--r--  tensorflow/contrib/lite/kernels/kernel_util.cc  87
-rw-r--r--  tensorflow/contrib/lite/kernels/kernel_util.h  65
-rw-r--r--  tensorflow/contrib/lite/kernels/l2norm.cc  112
-rw-r--r--  tensorflow/contrib/lite/kernels/l2norm_test.cc  63
-rw-r--r--  tensorflow/contrib/lite/kernels/local_response_norm.cc  109
-rw-r--r--  tensorflow/contrib/lite/kernels/local_response_norm_test.cc  101
-rw-r--r--  tensorflow/contrib/lite/kernels/lsh_projection.cc  204
-rw-r--r--  tensorflow/contrib/lite/kernels/lsh_projection_test.cc  123
-rw-r--r--  tensorflow/contrib/lite/kernels/lstm.cc  515
-rw-r--r--  tensorflow/contrib/lite/kernels/lstm_test.cc  1088
-rw-r--r--  tensorflow/contrib/lite/kernels/mul.cc  167
-rw-r--r--  tensorflow/contrib/lite/kernels/mul_test.cc  127
-rw-r--r--  tensorflow/contrib/lite/kernels/op_macros.h  32
-rw-r--r--  tensorflow/contrib/lite/kernels/optional_tensor_test.cc  343
-rw-r--r--  tensorflow/contrib/lite/kernels/padding.h  28
-rw-r--r--  tensorflow/contrib/lite/kernels/pooling.cc  355
-rw-r--r--  tensorflow/contrib/lite/kernels/pooling_test.cc  161
-rw-r--r--  tensorflow/contrib/lite/kernels/register.cc  109
-rw-r--r--  tensorflow/contrib/lite/kernels/register.h  50
-rw-r--r--  tensorflow/contrib/lite/kernels/reshape.cc  91
-rw-r--r--  tensorflow/contrib/lite/kernels/reshape_test.cc  90
-rw-r--r--  tensorflow/contrib/lite/kernels/resize_bilinear.cc  129
-rw-r--r--  tensorflow/contrib/lite/kernels/resize_bilinear_test.cc  117
-rw-r--r--  tensorflow/contrib/lite/kernels/skip_gram.cc  160
-rw-r--r--  tensorflow/contrib/lite/kernels/skip_gram_test.cc  257
-rw-r--r--  tensorflow/contrib/lite/kernels/softmax_test.cc  143
-rw-r--r--  tensorflow/contrib/lite/kernels/space_to_depth.cc  146
-rw-r--r--  tensorflow/contrib/lite/kernels/space_to_depth_test.cc  102
-rw-r--r--  tensorflow/contrib/lite/kernels/svdf.cc  224
-rw-r--r--  tensorflow/contrib/lite/kernels/svdf_test.cc  312
-rw-r--r--  tensorflow/contrib/lite/kernels/test_util.cc  183
-rw-r--r--  tensorflow/contrib/lite/kernels/test_util.h  202
85 files changed, 23875 insertions, 0 deletions
diff --git a/tensorflow/contrib/lite/kernels/BUILD b/tensorflow/contrib/lite/kernels/BUILD
new file mode 100644
index 0000000000..bbbfa3e741
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/BUILD
@@ -0,0 +1,408 @@
+package(default_visibility = [
+ "//visibility:public",
+])
+
+licenses(["notice"]) # Apache 2.0
+
+load("//tensorflow/contrib/lite:build_def.bzl", "tflite_copts")
+load(
+ "//tensorflow:tensorflow.bzl",
+ "tf_cc_test",
+)
+
+tf_cc_test(
+ name = "optional_tensor_test",
+ size = "small",
+ srcs = ["optional_tensor_test.cc"],
+ deps = [
+ ":builtin_ops",
+ "//tensorflow/contrib/lite:framework",
+ "//tensorflow/contrib/lite/kernels:test_util",
+ "@com_google_googletest//:gtest",
+ ],
+)
+
+cc_library(
+ name = "test_util",
+ testonly = 1,
+ srcs = ["test_util.cc"],
+ hdrs = ["test_util.h"],
+ deps = [
+ ":builtin_ops",
+ "//tensorflow/contrib/lite:framework",
+ "//tensorflow/contrib/lite:schema_fbs_version",
+ "//tensorflow/contrib/lite:string_util",
+ "//tensorflow/core:lib",
+ "@com_google_googletest//:gtest",
+ ],
+)
+
+cc_library(
+ name = "gemm_support",
+ srcs = [
+ "gemm_support.cc",
+ ],
+ hdrs = [
+ "gemm_support.h",
+ ],
+ copts = tflite_copts(),
+ deps = [
+ ":op_macros",
+ "//tensorflow/contrib/lite:context",
+ "@gemmlowp//:gemmlowp",
+ ],
+)
+
+cc_library(
+ name = "activation_functor",
+ hdrs = [
+ "activation_functor.h",
+ ],
+ deps = [
+ "//tensorflow/contrib/lite:builtin_op_data",
+ ],
+)
+
+cc_library(
+ name = "op_macros",
+ hdrs = [
+ "op_macros.h",
+ ],
+)
+
+cc_library(
+ name = "builtin_ops",
+ srcs = [
+ "activations.cc",
+ "add.cc",
+ "basic_rnn.cc",
+ "concatenation.cc",
+ "conv.cc",
+ "depthwise_conv.cc",
+ "embedding_lookup.cc",
+ "embedding_lookup_sparse.cc",
+ "fully_connected.cc",
+ "hashtable_lookup.cc",
+ "kernel_util.cc",
+ "l2norm.cc",
+ "local_response_norm.cc",
+ "lsh_projection.cc",
+ "lstm.cc",
+ "mul.cc",
+ "pooling.cc",
+ "register.cc",
+ "reshape.cc",
+ "resize_bilinear.cc",
+ "skip_gram.cc",
+ "space_to_depth.cc",
+ "svdf.cc",
+ ],
+ hdrs = [
+ "kernel_util.h",
+ "padding.h",
+ "register.h",
+ ],
+ # Suppress warnings that are introduced by Eigen Tensor.
+ copts = tflite_copts() + [
+ "-Wno-error=reorder",
+ ] + select({
+ "//tensorflow:ios": ["-Wno-error=invalid-partial-specialization"],
+ "//conditions:default": [
+ ],
+ }),
+ deps = [
+ ":activation_functor",
+ ":op_macros",
+ "//tensorflow/contrib/lite:builtin_op_data",
+ "//tensorflow/contrib/lite:framework",
+ "//tensorflow/contrib/lite:string_util",
+ "//tensorflow/contrib/lite/kernels:gemm_support",
+ "//tensorflow/contrib/lite/kernels/internal:optimized",
+ "//tensorflow/contrib/lite/kernels/internal:optimized_base",
+ "//tensorflow/contrib/lite/kernels/internal:quantization_util",
+ "//tensorflow/contrib/lite/kernels/internal:reference",
+ "//tensorflow/contrib/lite/kernels/internal:reference_base",
+ "//tensorflow/contrib/lite/kernels/internal:round",
+ "//tensorflow/contrib/lite/kernels/internal:tensor_utils",
+ "@farmhash_archive//:farmhash",
+ ],
+)
+
+tf_cc_test(
+ name = "activations_test",
+ size = "small",
+ srcs = ["activations_test.cc"],
+ deps = [
+ ":builtin_ops",
+ "//tensorflow/contrib/lite:framework",
+ "//tensorflow/contrib/lite/kernels:test_util",
+ "@com_google_googletest//:gtest",
+ ],
+)
+
+tf_cc_test(
+ name = "add_test",
+ size = "small",
+ srcs = ["add_test.cc"],
+ deps = [
+ ":builtin_ops",
+ "//tensorflow/contrib/lite:framework",
+ "//tensorflow/contrib/lite/kernels:test_util",
+ "@com_google_googletest//:gtest",
+ ],
+)
+
+tf_cc_test(
+ name = "concatenation_test",
+ size = "small",
+ srcs = ["concatenation_test.cc"],
+ deps = [
+ ":builtin_ops",
+ "//tensorflow/contrib/lite:framework",
+ "//tensorflow/contrib/lite/kernels:test_util",
+ "@com_google_googletest//:gtest",
+ ],
+)
+
+tf_cc_test(
+ name = "conv_test",
+ size = "small",
+ srcs = ["conv_test.cc"],
+ deps = [
+ ":builtin_ops",
+ "//tensorflow/contrib/lite:framework",
+ "//tensorflow/contrib/lite/kernels:test_util",
+ "@com_google_googletest//:gtest",
+ ],
+)
+
+tf_cc_test(
+ name = "depthwise_conv_test",
+ size = "small",
+ srcs = ["depthwise_conv_test.cc"],
+ deps = [
+ ":builtin_ops",
+ "//tensorflow/contrib/lite:framework",
+ "//tensorflow/contrib/lite/kernels:test_util",
+ "@com_google_googletest//:gtest",
+ ],
+)
+
+tf_cc_test(
+ name = "basic_rnn_test",
+ size = "small",
+ srcs = ["basic_rnn_test.cc"],
+ deps = [
+ ":builtin_ops",
+ "//tensorflow/contrib/lite:framework",
+ "//tensorflow/contrib/lite/kernels:test_util",
+ "@com_google_googletest//:gtest",
+ ],
+)
+
+tf_cc_test(
+ name = "l2norm_test",
+ size = "small",
+ srcs = ["l2norm_test.cc"],
+ deps = [
+ ":builtin_ops",
+ "//tensorflow/contrib/lite:framework",
+ "//tensorflow/contrib/lite/kernels:test_util",
+ "@com_google_googletest//:gtest",
+ ],
+)
+
+tf_cc_test(
+ name = "mul_test",
+ size = "small",
+ srcs = ["mul_test.cc"],
+ deps = [
+ ":builtin_ops",
+ "//tensorflow/contrib/lite:framework",
+ "//tensorflow/contrib/lite/kernels:test_util",
+ "@com_google_googletest//:gtest",
+ ],
+)
+
+tf_cc_test(
+ name = "reshape_test",
+ size = "small",
+ srcs = ["reshape_test.cc"],
+ deps = [
+ ":builtin_ops",
+ "//tensorflow/contrib/lite:framework",
+ "//tensorflow/contrib/lite/kernels:test_util",
+ "@com_google_googletest//:gtest",
+ ],
+)
+
+tf_cc_test(
+ name = "resize_bilinear_test",
+ size = "small",
+ srcs = ["resize_bilinear_test.cc"],
+ deps = [
+ ":builtin_ops",
+ "//tensorflow/contrib/lite:framework",
+ "//tensorflow/contrib/lite/kernels:test_util",
+ "@com_google_googletest//:gtest",
+ ],
+)
+
+tf_cc_test(
+ name = "svdf_test",
+ size = "small",
+ srcs = ["svdf_test.cc"],
+ deps = [
+ ":builtin_ops",
+ "//tensorflow/contrib/lite:framework",
+ "//tensorflow/contrib/lite/kernels:test_util",
+ "@com_google_googletest//:gtest",
+ ],
+)
+
+tf_cc_test(
+ name = "embedding_lookup_test",
+ size = "small",
+ srcs = ["embedding_lookup_test.cc"],
+ deps = [
+ ":builtin_ops",
+ "//tensorflow/contrib/lite:framework",
+ "//tensorflow/contrib/lite/kernels:test_util",
+ "@com_google_googletest//:gtest",
+ ],
+)
+
+tf_cc_test(
+ name = "embedding_lookup_sparse_test",
+ size = "small",
+ srcs = ["embedding_lookup_sparse_test.cc"],
+ deps = [
+ ":builtin_ops",
+ "//tensorflow/contrib/lite:framework",
+ "//tensorflow/contrib/lite/kernels:test_util",
+ "@com_google_googletest//:gtest",
+ ],
+)
+
+tf_cc_test(
+ name = "fully_connected_test",
+ size = "small",
+ srcs = ["fully_connected_test.cc"],
+ deps = [
+ ":builtin_ops",
+ "//tensorflow/contrib/lite:framework",
+ "//tensorflow/contrib/lite/kernels:test_util",
+ "@com_google_googletest//:gtest",
+ ],
+)
+
+tf_cc_test(
+ name = "local_response_norm_test",
+ size = "small",
+ srcs = ["local_response_norm_test.cc"],
+ deps = [
+ ":builtin_ops",
+ "//tensorflow/contrib/lite:framework",
+ "//tensorflow/contrib/lite/kernels:test_util",
+ "@com_google_googletest//:gtest",
+ ],
+)
+
+tf_cc_test(
+ name = "pooling_test",
+ size = "small",
+ srcs = ["pooling_test.cc"],
+ deps = [
+ ":builtin_ops",
+ "//tensorflow/contrib/lite:framework",
+ "//tensorflow/contrib/lite/kernels:test_util",
+ "@com_google_googletest//:gtest",
+ ],
+)
+
+tf_cc_test(
+ name = "softmax_test",
+ size = "small",
+ srcs = ["softmax_test.cc"],
+ deps = [
+ ":builtin_ops",
+ "//tensorflow/contrib/lite:framework",
+ "//tensorflow/contrib/lite/kernels:test_util",
+ "//tensorflow/contrib/lite/kernels/internal:reference_base",
+ "@com_google_googletest//:gtest",
+ ],
+)
+
+tf_cc_test(
+ name = "lsh_projection_test",
+ size = "small",
+ srcs = ["lsh_projection_test.cc"],
+ deps = [
+ ":builtin_ops",
+ "//tensorflow/contrib/lite:framework",
+ "//tensorflow/contrib/lite/kernels:test_util",
+ "@com_google_googletest//:gtest",
+ ],
+)
+
+tf_cc_test(
+ name = "hashtable_lookup_test",
+ size = "small",
+ srcs = ["hashtable_lookup_test.cc"],
+ deps = [
+ ":builtin_ops",
+ "//tensorflow/contrib/lite:framework",
+ "//tensorflow/contrib/lite:string_util",
+ "//tensorflow/contrib/lite/kernels:test_util",
+ "@com_google_googletest//:gtest",
+ ],
+)
+
+tf_cc_test(
+ name = "lstm_test",
+ size = "small",
+ srcs = ["lstm_test.cc"],
+ deps = [
+ ":builtin_ops",
+ "//tensorflow/contrib/lite:framework",
+ "//tensorflow/contrib/lite/kernels:test_util",
+ "@com_google_googletest//:gtest",
+ ],
+)
+
+tf_cc_test(
+ name = "skip_gram_test",
+ size = "small",
+ srcs = ["skip_gram_test.cc"],
+ deps = [
+ ":builtin_ops",
+ "//tensorflow/contrib/lite:framework",
+ "//tensorflow/contrib/lite:string_util",
+ "//tensorflow/contrib/lite/kernels:test_util",
+ "@com_google_googletest//:gtest",
+ ],
+)
+
+tf_cc_test(
+ name = "space_to_depth_test",
+ size = "small",
+ srcs = ["space_to_depth_test.cc"],
+ deps = [
+ ":builtin_ops",
+ "//tensorflow/contrib/lite:framework",
+ "//tensorflow/contrib/lite/kernels:test_util",
+ "@com_google_googletest//:gtest",
+ ],
+)
+
+filegroup(
+ name = "all_files",
+ srcs = glob(
+ ["**/*"],
+ exclude = [
+ "**/METADATA",
+ "**/OWNERS",
+ ],
+ ),
+ visibility = ["//tensorflow:__subpackages__"],
+)
diff --git a/tensorflow/contrib/lite/kernels/activation_functor.h b/tensorflow/contrib/lite/kernels/activation_functor.h
new file mode 100644
index 0000000000..cfb3369e99
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/activation_functor.h
@@ -0,0 +1,58 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_ACTIVATION_FUNCTOR_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_ACTIVATION_FUNCTOR_H_
+
+#include <algorithm>
+#include <cmath>
+#include <cstdlib>
+
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+
+namespace tflite {
+
+// Dynamic (non-fused) activation functor. Perhaps it is worth having
+// template instantiation?
+// TODO(aselle): Make this more efficient by pulling the switch to conv_eval
+// using template inlining.
+class ActivationFunctor {
+ public:
+ explicit ActivationFunctor(TfLiteFusedActivation act) : act_(act) {}
+
+ float operator()(float a) const {
+ switch (act_) {
+ case kTfLiteActNone:
+ return a;
+ case kTfLiteActRelu:
+ return a < 0.f ? 0.f : a;
+ case kTfLiteActRelu6:
+ return std::max(0.f, std::min(a, 6.f));
+ case kTfLiteActTanh:
+ return std::tanh(a);
+ case kTfLiteActSigmoid:
+ return 1.0f / (1.0f + std::exp(-a));
+ default:
+ // TODO(aselle): More informative fatal error!
+ exit(1);
+ }
+ }
+
+ private:
+ TfLiteFusedActivation act_;
+};
+
+} // namespace tflite
+
+#endif // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_ACTIVATION_FUNCTOR_H_
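
For reference, a minimal usage sketch of the ActivationFunctor defined above. This is not part of the commit; it assumes a build that can see the TF Lite headers introduced here (activation_functor.h pulls in builtin_op_data.h, which defines the kTfLiteAct* constants).

#include <cstdio>

#include "tensorflow/contrib/lite/kernels/activation_functor.h"

int main() {
  // Wrap the fused RELU6 activation and apply it element by element, the same
  // way basic_rnn.cc applies it inside its Eval loop.
  tflite::ActivationFunctor relu6(kTfLiteActRelu6);
  const float samples[] = {-1.0f, 3.5f, 9.0f};
  for (float v : samples) {
    std::printf("%.1f -> %.1f\n", v, relu6(v));  // -1.0 -> 0.0, 3.5 -> 3.5, 9.0 -> 6.0
  }
  return 0;
}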
diff --git a/tensorflow/contrib/lite/kernels/activations.cc b/tensorflow/contrib/lite/kernels/activations.cc
new file mode 100644
index 0000000000..7ab60a33e5
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/activations.cc
@@ -0,0 +1,389 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <unistd.h>
+#include <cassert>
+#include <cmath>
+#include <cstdlib>
+#include <cstdio>
+#include <iostream>
+#include <limits>
+
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace activations {
+
+struct OpData {
+ int32_t input_multiplier = 0;
+ int input_left_shift = 0;
+ int32_t input_range_radius = 0;
+ int diff_min = 0;
+};
+
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+ // This is a builtin op, so we don't use the contents in 'buffer', if any.
+ // Instead, we allocate a new object to carry information from Prepare() to
+ // Eval().
+ return new OpData;
+}
+
+void Free(TfLiteContext* context, void* buffer) {
+ delete reinterpret_cast<OpData*>(buffer);
+}
+
+TfLiteStatus GenericPrepare(TfLiteContext* context, TfLiteNode* node) {
+ TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
+ TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+ TfLiteTensor* input = GetInput(context, node, 0);
+ TfLiteTensor* output = GetOutput(context, node, 0);
+ TF_LITE_ENSURE_EQ(context, input->type, output->type);
+
+ return context->ResizeTensor(context, output,
+ TfLiteIntArrayCopy(input->dims));
+}
+
+TfLiteStatus SigmoidPrepare(TfLiteContext* context, TfLiteNode* node) {
+ OpData* data = reinterpret_cast<OpData*>(node->user_data);
+
+ TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
+ TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+ TfLiteTensor* input = GetInput(context, node, 0);
+ TfLiteTensor* output = GetOutput(context, node, 0);
+ TF_LITE_ENSURE_EQ(context, input->type, output->type);
+
+ if (input->type == kTfLiteUInt8) {
+ TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0);
+ TF_LITE_ENSURE(context, output->params.scale == 1. / 256);
+
+ static constexpr int kInputIntegerBits = 4;
+
+ const double input_real_multiplier =
+ input->params.scale *
+ static_cast<double>(1 << (31 - kInputIntegerBits));
+
+ QuantizeMultiplierGreaterThanOne(input_real_multiplier,
+ &data->input_multiplier,
+ &data->input_left_shift);
+ data->input_range_radius =
+ CalculateInputRadius(kInputIntegerBits, data->input_left_shift);
+ }
+
+ return context->ResizeTensor(context, output,
+ TfLiteIntArrayCopy(input->dims));
+}
+
+TfLiteStatus SoftmaxPrepare(TfLiteContext* context, TfLiteNode* node) {
+ auto* params = reinterpret_cast<TfLiteSoftmaxParams*>(node->builtin_data);
+ OpData* data = reinterpret_cast<OpData*>(node->user_data);
+
+ TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
+ TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+ TfLiteTensor* input = GetInput(context, node, 0);
+ TfLiteTensor* output = GetOutput(context, node, 0);
+ TF_LITE_ENSURE_EQ(context, input->type, output->type);
+
+ TF_LITE_ENSURE(context,
+ NumDimensions(input) == 2 || NumDimensions(input) == 4);
+
+ if (input->type == kTfLiteUInt8) {
+ TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0);
+ TF_LITE_ENSURE(context, output->params.scale == 1. / 256);
+
+ static const int kScaledDiffIntegerBits = 5;
+
+ tflite::PreprocessSoftmaxScaling(
+ params->beta, input->params.scale, kScaledDiffIntegerBits,
+ &data->input_multiplier, &data->input_left_shift);
+ data->diff_min = -1.0 * tflite::CalculateInputRadius(
+ kScaledDiffIntegerBits, data->input_left_shift);
+ }
+
+ return context->ResizeTensor(context, output,
+ TfLiteIntArrayCopy(input->dims));
+}
+
+TfLiteStatus ReluEval(TfLiteContext* context, TfLiteNode* node) {
+ TfLiteTensor* input = GetInput(context, node, 0);
+ TfLiteTensor* output = GetOutput(context, node, 0);
+ switch (input->type) {
+ case kTfLiteFloat32: {
+ size_t elements = input->bytes / sizeof(float);
+ float* in = input->data.f;
+ float* in_end = in + elements;
+ float* out = output->data.f;
+ for (; in < in_end; in++, out++) *out = std::max(0.f, *in);
+ return kTfLiteOk;
+ }
+ break;
+ default:
+ context->ReportError(context, "Only float32 supported currently.");
+ return kTfLiteError;
+ }
+}
+
+TfLiteStatus Relu1Eval(TfLiteContext* context, TfLiteNode* node) {
+ TfLiteTensor* input = GetInput(context, node, 0);
+ TfLiteTensor* output = GetOutput(context, node, 0);
+ switch (input->type) {
+ case kTfLiteFloat32: {
+ size_t elements = input->bytes / sizeof(float);
+ float* in = input->data.f;
+ float* in_end = in + elements;
+ float* out = output->data.f;
+ for (; in < in_end; in++, out++) {
+ *out = std::min(std::max(-1.f, *in), 1.f);
+ }
+ return kTfLiteOk;
+ } break;
+ default:
+ context->ReportError(context, "Only float32 supported currently.");
+ return kTfLiteError;
+ }
+}
+
+TfLiteStatus Relu6Eval(TfLiteContext* context, TfLiteNode* node) {
+ TfLiteTensor* input = GetInput(context, node, 0);
+ TfLiteTensor* output = GetOutput(context, node, 0);
+ switch (input->type) {
+ case kTfLiteFloat32: {
+ size_t elements = input->bytes / sizeof(float);
+ float* in = input->data.f;
+ float* in_end = in + elements;
+ float* out = output->data.f;
+ for (; in < in_end; in++, out++) *out = std::min(std::max(0.f, *in), 6.f);
+ return kTfLiteOk;
+ }
+ break;
+ default:
+ context->ReportError(context, "Only float32 supported currently.");
+ return kTfLiteError;
+ }
+}
+
+TfLiteStatus TanhEval(TfLiteContext* context, TfLiteNode* node) {
+ TfLiteTensor* input = GetInput(context, node, 0);
+ TfLiteTensor* output = GetOutput(context, node, 0);
+ switch (input->type) {
+ case kTfLiteFloat32: {
+ size_t elements = input->bytes / sizeof(float);
+ float* in = input->data.f;
+ float* in_end = in + elements;
+ float* out = output->data.f;
+ for (; in < in_end; in++, out++) *out = std::tanh(*in);
+ return kTfLiteOk;
+ }
+ break;
+ default:
+ context->ReportError(context, "Only float32 supported currently.");
+ return kTfLiteError;
+ }
+}
+
+// Sigmoid is also known as "Logistic".
+TfLiteStatus SigmoidEval(TfLiteContext* context, TfLiteNode* node) {
+ OpData* data = reinterpret_cast<OpData*>(node->user_data);
+
+ TfLiteTensor* input = GetInput(context, node, 0);
+ TfLiteTensor* output = GetOutput(context, node, 0);
+ switch (input->type) {
+ case kTfLiteFloat32: {
+ size_t elements = input->bytes / sizeof(float);
+ float* in = input->data.f;
+ float* in_end = in + elements;
+ float* out = output->data.f;
+ for (; in < in_end; in++, out++) *out = 1.f / (1.f + std::exp(-*in));
+ break;
+ }
+ case kTfLiteUInt8: {
+ optimized_ops::Logistic(
+ GetTensorData<uint8_t>(input), GetTensorDims(input),
+ input->params.zero_point, data->input_range_radius,
+ data->input_multiplier, data->input_left_shift,
+ GetTensorData<uint8_t>(output), GetTensorDims(output));
+ break;
+ }
+ default:
+      context->ReportError(context, "Only float32 and uint8 supported currently.");
+ return kTfLiteError;
+ }
+ return kTfLiteOk;
+}
+
+// Takes a 2D tensor and performs softmax along the second dimension.
+void Softmax2DFloat(TfLiteTensor* input, TfLiteTensor* output,
+ TfLiteSoftmaxParams* params) {
+ const int batch_size = input->dims->data[0];
+ const int input_size = input->dims->data[1];
+ float* in = input->data.f;
+ float* out = output->data.f;
+ TF_LITE_ASSERT(input_size > 0);
+
+ // For each batch
+ for (int b = 0; b < batch_size; b++) {
+ // Find the max coeff.
+ float max_coeff = in[0];
+ for (int i = 1; i < input_size; i++) {
+ if (in[i] > max_coeff) max_coeff = in[i];
+ }
+
+ // Compute the normalized sum of exps.
+ float exp_sum = 0.0;
+ for (int i = 0; i < input_size; i++) {
+ out[i] = std::exp((in[i] - max_coeff) * params->beta);
+ exp_sum += out[i];
+ }
+
+ // Divide by the sum of exps.
+ float reciprocal_sum_exp = 1.f / exp_sum;
+ for (int i = 0; i < input_size; i++) {
+ out[i] *= reciprocal_sum_exp;
+ }
+
+ // Advance in and out pointers for the next batch.
+ in += input_size;
+ out += input_size;
+ }
+}
+
+void Softmax2DQuantized(TfLiteTensor* input, TfLiteTensor* output,
+ TfLiteSoftmaxParams* params, OpData* data) {
+ // TODO(ahentz): this is arguably a dirty trick. Since the implementation
+ // always traverses the last dimension of a 4D tensor, we will pretend our 2D
+ // tensor is 4D in a special way. We will convert a (X, Y) shape into a (X,
+ // 1, 1, Y) shape.
+ const int batch_size = input->dims->data[0];
+ const int input_size = input->dims->data[1];
+ optimized_ops::Softmax(GetTensorData<uint8_t>(input),
+ GetTensorDims({batch_size, 1, 1, input_size}),
+ data->input_multiplier, data->input_left_shift,
+ data->diff_min, GetTensorData<uint8_t>(output),
+ GetTensorDims({batch_size, 1, 1, input_size}));
+}
+
+// Takes a 4D tensor and performs softmax along the fourth dimension.
+void Softmax4DFloat(TfLiteTensor* input, TfLiteTensor* output,
+ TfLiteSoftmaxParams* params) {
+ optimized_ops::Softmax(GetTensorData<float>(input), GetTensorDims(input),
+ params->beta, GetTensorData<float>(output),
+ GetTensorDims(output));
+}
+
+void Softmax4DQuantized(TfLiteTensor* input, TfLiteTensor* output,
+ TfLiteSoftmaxParams* params, OpData* data) {
+ optimized_ops::Softmax(GetTensorData<uint8_t>(input), GetTensorDims(input),
+ data->input_multiplier, data->input_left_shift,
+ data->diff_min, GetTensorData<uint8_t>(output),
+ GetTensorDims(output));
+}
+
+TfLiteStatus SoftmaxEval(TfLiteContext* context, TfLiteNode* node) {
+ auto* params = reinterpret_cast<TfLiteSoftmaxParams*>(node->builtin_data);
+ OpData* data = reinterpret_cast<OpData*>(node->user_data);
+
+ TfLiteTensor* input = GetInput(context, node, 0);
+ TfLiteTensor* output = GetOutput(context, node, 0);
+
+ // TODO(ahentz): consider an implementation that works for many (all?)
+ // dimensions.
+ switch (input->type) {
+ case kTfLiteFloat32: {
+ if (NumDimensions(input) == 2) {
+ Softmax2DFloat(input, output, params);
+ return kTfLiteOk;
+ }
+ if (NumDimensions(input) == 4) {
+ Softmax4DFloat(input, output, params);
+ return kTfLiteOk;
+ }
+ context->ReportError(context,
+ "Only 2D and 4D tensors supported currently.");
+ return kTfLiteError;
+ }
+ case kTfLiteUInt8: {
+ if (NumDimensions(input) == 2) {
+ Softmax2DQuantized(input, output, params, data);
+ return kTfLiteOk;
+ }
+ if (NumDimensions(input) == 4) {
+ Softmax4DQuantized(input, output, params, data);
+ return kTfLiteOk;
+ }
+ context->ReportError(context,
+ "Only 2D and 4D tensors supported currently.");
+ return kTfLiteError;
+ }
+ default:
+ context->ReportError(context,
+ "Only float32 and uint8_t supported currently.");
+ return kTfLiteError;
+ }
+}
+
+} // namespace activations
+
+TfLiteRegistration* Register_RELU() {
+ static TfLiteRegistration r = {/*init=*/nullptr, /*free=*/nullptr,
+ activations::GenericPrepare,
+ activations::ReluEval};
+ return &r;
+}
+
+TfLiteRegistration* Register_RELU1() {
+ static TfLiteRegistration r = {/*init=*/nullptr, /*free=*/nullptr,
+ activations::GenericPrepare,
+ activations::Relu1Eval};
+ return &r;
+}
+
+TfLiteRegistration* Register_RELU6() {
+ static TfLiteRegistration r = {/*init=*/nullptr, /*free=*/nullptr,
+ activations::GenericPrepare,
+ activations::Relu6Eval};
+ return &r;
+}
+
+TfLiteRegistration* Register_TANH() {
+ static TfLiteRegistration r = {/*init=*/nullptr, /*free=*/nullptr,
+ activations::GenericPrepare,
+ activations::TanhEval};
+ return &r;
+}
+
+TfLiteRegistration* Register_LOGISTIC() {
+ static TfLiteRegistration r = {activations::Init, activations::Free,
+ activations::SigmoidPrepare,
+ activations::SigmoidEval};
+ return &r;
+}
+
+TfLiteRegistration* Register_SOFTMAX() {
+ static TfLiteRegistration r = {activations::Init, activations::Free,
+ activations::SoftmaxPrepare,
+ activations::SoftmaxEval};
+ return &r;
+}
+
+} // namespace builtin
+} // namespace ops
+} // namespace tflite
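
The float softmax path above (Softmax2DFloat) folds the beta parameter into the exponent and subtracts the per-row maximum before exponentiating, which keeps std::exp from overflowing for large inputs. A self-contained sketch of the same arithmetic, independent of the TfLite tensor types; the inputs and beta = 0.1 mirror the Softmax2D test further down.

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

// Computes softmax(in * beta) for one row, as Softmax2DFloat does per batch.
std::vector<float> SoftmaxRow(const std::vector<float>& in, float beta) {
  const float max_coeff = *std::max_element(in.begin(), in.end());
  std::vector<float> out(in.size());
  float exp_sum = 0.f;
  for (size_t i = 0; i < in.size(); ++i) {
    out[i] = std::exp((in[i] - max_coeff) * beta);  // shift by max, scale by beta
    exp_sum += out[i];
  }
  for (float& v : out) v /= exp_sum;  // normalize so the row sums to 1
  return out;
}

int main() {
  for (float v : SoftmaxRow({0, -6, 2, 4}, 0.1f)) std::printf("%.5f ", v);
  std::printf("\n");  // 0.23463 0.12877 0.28658 0.35003
  return 0;
}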
diff --git a/tensorflow/contrib/lite/kernels/activations_test.cc b/tensorflow/contrib/lite/kernels/activations_test.cc
new file mode 100644
index 0000000000..f10aee7017
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/activations_test.cc
@@ -0,0 +1,323 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <cstdarg>
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+class BaseActivationsOpModel : public SingleOpModel {
+ public:
+ // Most activations don't take any options, so this constructor works for
+ // them.
+ BaseActivationsOpModel(BuiltinOperator type, TensorData input) {
+ input_ = AddInput(input);
+ if (input.type == TensorType_UINT8) {
+ output_ = AddOutput({input.type, {}, 0, 0, 1. / 256});
+ } else {
+ output_ = AddOutput({input.type, {}});
+ }
+ SetBuiltinOp(type, BuiltinOptions_NONE, 0);
+ BuildInterpreter({GetShape(input_)});
+ }
+
+  // A dedicated constructor for SOFTMAX, which takes some options.
+ BaseActivationsOpModel(float softmax_beta, TensorData input) {
+ input_ = AddInput(input);
+ if (input.type == TensorType_UINT8) {
+ output_ = AddOutput({input.type, {}, 0, 0, 1. / 256});
+ } else {
+ output_ = AddOutput({input.type, {}});
+ }
+ SetBuiltinOp(BuiltinOperator_SOFTMAX, BuiltinOptions_SoftmaxOptions,
+ CreateSoftmaxOptions(builder_, softmax_beta).Union());
+ BuildInterpreter({GetShape(input_)});
+ }
+
+ protected:
+ int input_;
+ int output_;
+};
+
+class FloatActivationsOpModel : public BaseActivationsOpModel {
+ public:
+ using BaseActivationsOpModel::BaseActivationsOpModel;
+
+ void SetInput(std::initializer_list<float> data) {
+ PopulateTensor(input_, data);
+ }
+ std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+};
+
+// TODO(ahentz): I don't quite understand the tradeoffs in the quantized
+// implementation of sigmoid and softmax, but a tolerance of twice the output
+// scale seems reasonable. We might want to change this if we have a better
+// theoretical bound.
+const float kQuantizedTolerance = 2 * (1. / 256);
+
+class QuantizedActivationsOpModel : public BaseActivationsOpModel {
+ public:
+ using BaseActivationsOpModel::BaseActivationsOpModel;
+
+ void SetInput(std::initializer_list<float> data) {
+ QuantizeAndPopulate<uint8_t>(input_, data);
+ }
+ std::vector<uint8_t> GetOutput() { return ExtractVector<uint8_t>(output_); }
+ std::vector<float> GetDequantizedOutput() {
+ return Dequantize<uint8_t>(ExtractVector<uint8_t>(output_),
+ GetScale(output_), GetZeroPoint(output_));
+ }
+};
+
+TEST(FloatActivationsOpTest, Relu) {
+ FloatActivationsOpModel m(BuiltinOperator_RELU,
+ /*input=*/{TensorType_FLOAT32, {1, 2, 4, 1}});
+ m.SetInput({
+ 0, -6, 2, 4, //
+ 3, -2, 10, 1, //
+ });
+ m.Invoke();
+ EXPECT_THAT(m.GetOutput(), ElementsAreArray({
+ 0, 0, 2, 4, //
+ 3, 0, 10, 1, //
+ }));
+}
+
+TEST(FloatActivationsOpTest, Relu1) {
+ FloatActivationsOpModel m(BuiltinOperator_RELU1,
+ /*input=*/{TensorType_FLOAT32, {1, 2, 4, 1}});
+ m.SetInput({
+ 0.0, -0.6, 0.2, -0.4, //
+ 0.3, -2.0, 1.1, -0.1, //
+ });
+ m.Invoke();
+ EXPECT_THAT(m.GetOutput(), ElementsAreArray({
+ 0.0, -0.6, 0.2, -0.4, //
+ 0.3, -1.0, 1.0, -0.1, //
+ }));
+}
+
+TEST(FloatActivationsOpTest, Relu6) {
+ FloatActivationsOpModel m(BuiltinOperator_RELU6,
+ /*input=*/{TensorType_FLOAT32, {1, 2, 4, 1}});
+ m.SetInput({
+ 0, -6, 2, 4, //
+ 3, -2, 10, 1, //
+ });
+ m.Invoke();
+ EXPECT_THAT(m.GetOutput(), ElementsAreArray({
+ 0, 0, 2, 4, //
+ 3, 0, 6, 1, //
+ }));
+}
+
+TEST(FloatActivationsOpTest, Tanh) {
+ FloatActivationsOpModel m(BuiltinOperator_TANH,
+ /*input=*/{TensorType_FLOAT32, {1, 2, 4, 1}});
+ m.SetInput({
+ 0, -6, 2, 4, //
+ 3, -2, 10, 1, //
+ });
+ m.Invoke();
+ EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear({
+ 0, -0.9999877, 0.9640275, 0.999329, //
+ 0.99505475, -0.9640275, 1, 0.7615941, //
+ })));
+}
+
+TEST(FloatActivationsOpTest, Sigmoid) {
+ FloatActivationsOpModel m(BuiltinOperator_LOGISTIC,
+ /*input=*/{TensorType_FLOAT32, {1, 2, 4, 1}});
+ m.SetInput({
+ 0, -6, 2, 4, //
+ 3, -2, 10, 1, //
+ });
+ m.Invoke();
+ EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear({
+ 0.5, 0.002473, 0.880797, 0.982014, //
+ 0.952574, 0.119203, 0.999955, 0.731059, //
+ })));
+}
+
+TEST(QuantizedActivationsOpTest, Sigmoid) {
+ QuantizedActivationsOpModel m(
+ BuiltinOperator_LOGISTIC,
+ /*input=*/{TensorType_UINT8, {1, 2, 4, 1}, -10, 10});
+ m.SetInput({
+ 0, -6, 2, 4, //
+ 3, -2, 10, 1, //
+ });
+ m.Invoke();
+ EXPECT_THAT(m.GetDequantizedOutput(),
+ ElementsAreArray(ArrayFloatNear(
+ {
+ 0.5, 0.002473, 0.880797, 0.982014, //
+ 0.952574, 0.119203, 0.999955, 0.731059, //
+ },
+ kQuantizedTolerance)));
+ EXPECT_THAT(m.GetOutput(),
+ ElementsAreArray({128, 1, 227, 251, 244, 32, 255, 188}));
+}
+
+TEST(FloatActivationsOpTest, Softmax4D) {
+ FloatActivationsOpModel m(0.1,
+ /*input=*/{TensorType_FLOAT32, {1, 2, 1, 4}});
+ m.SetInput({
+ 0, -6, 2, 4, // depth = 0
+ 3, -2, 10, 1, // depth = 1
+ });
+ m.Invoke();
+ EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear({
+ .23463, .12877, .28658, .35003, //
+ .22528, .13664, .45365, .18443, //
+ })));
+
+ // Same input, but a different shape.
+ FloatActivationsOpModel m2(0.1,
+ /*input=*/{TensorType_FLOAT32, {4, 1, 1, 2}});
+ m2.SetInput({
+ 0, -6, //
+ 2, 4, //
+ 3, -2, //
+ 10, 1, //
+ });
+ m2.Invoke();
+ EXPECT_THAT(m2.GetOutput(), ElementsAreArray(ArrayFloatNear({
+ 0.645656, 0.354344, //
+ 0.450166, 0.549834, //
+ 0.622459, 0.377541, //
+ 0.710949, 0.28905, //
+ })));
+}
+
+TEST(QuantizedActivationsOpTest, Softmax4D) {
+ QuantizedActivationsOpModel m(
+ 0.1,
+ /*input=*/{TensorType_UINT8, {1, 2, 1, 4}, -10, 10});
+ m.SetInput({
+ 0, -6, 2, 4, // depth = 0
+ 3, -2, 10, 1, // depth = 1
+ });
+ m.Invoke();
+ EXPECT_THAT(m.GetDequantizedOutput(),
+ ElementsAreArray(ArrayFloatNear(
+ {
+ .23463, .12877, .28658, .35003, //
+ .22528, .13664, .45365, .18443, //
+ },
+ kQuantizedTolerance)));
+
+ // Same input, but a different shape.
+ QuantizedActivationsOpModel m2(
+ 0.1,
+ /*input=*/{TensorType_UINT8, {4, 1, 1, 2}, -10, 10});
+ m2.SetInput({
+ 0, -6, //
+ 2, 4, //
+ 3, -2, //
+ 10, 1, //
+ });
+ m2.Invoke();
+ EXPECT_THAT(m2.GetDequantizedOutput(), ElementsAreArray(ArrayFloatNear(
+ {
+ 0.645656, 0.354344, //
+ 0.450166, 0.549834, //
+ 0.622459, 0.377541, //
+ 0.710949, 0.28905, //
+ },
+ kQuantizedTolerance)));
+}
+
+TEST(FloatActivationsOpTest, Softmax2D) {
+ FloatActivationsOpModel m(0.1,
+ /*input=*/{TensorType_FLOAT32, {2, 4}});
+ m.SetInput({
+ 0, -6, 2, 4, //
+ 3, -2, 10, 1, //
+ });
+ m.Invoke();
+ EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear({
+ .23463, .12877, .28658, .35003, //
+ .22528, .13664, .45365, .18443, //
+ })));
+
+ // Same input, but a different shape.
+ FloatActivationsOpModel m2(0.1,
+ /*input=*/{TensorType_FLOAT32, {4, 2}});
+ m2.SetInput({
+ 0, -6, //
+ 2, 4, //
+ 3, -2, //
+ 10, 1, //
+ });
+ m2.Invoke();
+ EXPECT_THAT(m2.GetOutput(), ElementsAreArray(ArrayFloatNear({
+ 0.645656, 0.354344, //
+ 0.450166, 0.549834, //
+ 0.622459, 0.377541, //
+ 0.710949, 0.28905, //
+ })));
+}
+
+TEST(QuantizedActivationsOpTest, Softmax2D) {
+ QuantizedActivationsOpModel m(0.1,
+ /*input=*/{TensorType_UINT8, {2, 4}, -10, 10});
+ m.SetInput({
+ 0, -6, 2, 4, //
+ 3, -2, 10, 1, //
+ });
+ m.Invoke();
+ EXPECT_THAT(m.GetDequantizedOutput(),
+ ElementsAreArray(ArrayFloatNear(
+ {
+ .23463, .12877, .28658, .35003, //
+ .22528, .13664, .45365, .18443, //
+ },
+ kQuantizedTolerance)));
+
+ // Same input, but a different shape.
+ QuantizedActivationsOpModel m2(0.1,
+ /*input=*/{TensorType_UINT8, {4, 2}, -10, 10});
+ m2.SetInput({
+ 0, -6, //
+ 2, 4, //
+ 3, -2, //
+ 10, 1, //
+ });
+ m2.Invoke();
+ EXPECT_THAT(m2.GetDequantizedOutput(), ElementsAreArray(ArrayFloatNear(
+ {
+ 0.645656, 0.354344, //
+ 0.450166, 0.549834, //
+ 0.622459, 0.377541, //
+ 0.710949, 0.28905, //
+ },
+ kQuantizedTolerance)));
+}
+
+} // namespace
+} // namespace tflite
+
+int main(int argc, char** argv) {
+ // On Linux, add: tflite::LogToStderr();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
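
The quantized tests above recover real values from uint8 outputs through the affine mapping real = scale * (q - zero_point), which is what the fixture's Dequantize helper is assumed to apply. A small standalone sketch showing why the quantized sigmoid test accepts the byte 128 as 0.5 at output scale 1/256 and zero point 0:

#include <cstdint>
#include <cstdio>
#include <vector>

// Affine dequantization for uint8 tensors: real = scale * (q - zero_point).
std::vector<float> Dequantize(const std::vector<uint8_t>& q, float scale,
                              int zero_point) {
  std::vector<float> real(q.size());
  for (size_t i = 0; i < q.size(); ++i) real[i] = scale * (q[i] - zero_point);
  return real;
}

int main() {
  const float scale = 1.f / 256.f;  // output scale required by SigmoidPrepare
  for (float v : Dequantize({128, 1, 227, 251}, scale, /*zero_point=*/0)) {
    std::printf("%.6f ", v);  // 0.500000 0.003906 0.886719 0.980469
  }
  std::printf("\n");
  // Each value is within kQuantizedTolerance = 2 * (1/256) of the float
  // reference (0.5, 0.002473, 0.880797, 0.982014).
  return 0;
}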
diff --git a/tensorflow/contrib/lite/kernels/add.cc b/tensorflow/contrib/lite/kernels/add.cc
new file mode 100644
index 0000000000..0e10a249ab
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/add.cc
@@ -0,0 +1,184 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace add {
+
+// This file has three implementations of Add.
+enum KernelType {
+ kReference,
+ kGenericOptimized, // Neon-free
+ kNeonOptimized,
+};
+
+constexpr int kInputTensor1 = 0;
+constexpr int kInputTensor2 = 1;
+constexpr int kOutputTensor = 0;
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+ TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
+ TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+ TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
+ TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
+ TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+ TF_LITE_ENSURE_EQ(context, NumDimensions(input1), NumDimensions(input2));
+ for (int i = 0; i < NumDimensions(input1); ++i) {
+ TF_LITE_ENSURE_EQ(context, SizeOfDimension(input1, i),
+ SizeOfDimension(input2, i));
+ }
+
+ TF_LITE_ENSURE_EQ(context, input1->type, output->type);
+ TF_LITE_ENSURE_EQ(context, input2->type, output->type);
+
+ TfLiteIntArray* output_size = TfLiteIntArrayCopy(input1->dims);
+ return context->ResizeTensor(context, output, output_size);
+}
+
+template <KernelType kernel_type>
+void EvalAddFloat(TfLiteContext* context, TfLiteNode* node,
+ TfLiteAddParams* params, TfLiteTensor* input1,
+ TfLiteTensor* input2, TfLiteTensor* output) {
+ float output_activation_min, output_activation_max;
+ CalculateActivationRangeFloat(params->activation, &output_activation_min,
+ &output_activation_max);
+#define TF_LITE_ADD(type) \
+ type::Add(GetTensorData<float>(input1), GetTensorDims(input1), \
+ GetTensorData<float>(input2), GetTensorDims(input2), \
+ output_activation_min, output_activation_max, \
+ GetTensorData<float>(output), GetTensorDims(output))
+ if (kernel_type == kReference) {
+ TF_LITE_ADD(reference_ops);
+ } else {
+ TF_LITE_ADD(optimized_ops);
+ }
+#undef TF_LITE_ADD
+}
+
+template <KernelType kernel_type>
+void EvalAddQuantized(TfLiteContext* context, TfLiteNode* node,
+ TfLiteAddParams* params, TfLiteTensor* input1,
+ TfLiteTensor* input2, TfLiteTensor* output) {
+ auto input1_offset = -input1->params.zero_point;
+ auto input2_offset = -input2->params.zero_point;
+ auto output_offset = output->params.zero_point;
+ const int left_shift = 20;
+ const double twice_max_input_scale =
+ 2 * std::max(input1->params.scale, input2->params.scale);
+ const double real_input1_multiplier =
+ input1->params.scale / twice_max_input_scale;
+ const double real_input2_multiplier =
+ input2->params.scale / twice_max_input_scale;
+ const double real_output_multiplier =
+ twice_max_input_scale / ((1 << left_shift) * output->params.scale);
+
+ int32 input1_multiplier;
+ int input1_shift;
+ QuantizeMultiplierSmallerThanOne(real_input1_multiplier, &input1_multiplier,
+ &input1_shift);
+ int32 input2_multiplier;
+ int input2_shift;
+ QuantizeMultiplierSmallerThanOne(real_input2_multiplier, &input2_multiplier,
+ &input2_shift);
+ int32 output_multiplier;
+ int output_shift;
+ QuantizeMultiplierSmallerThanOne(real_output_multiplier, &output_multiplier,
+ &output_shift);
+
+ int32 output_activation_min, output_activation_max;
+ CalculateActivationRangeUint8(params->activation, output,
+ &output_activation_min, &output_activation_max);
+
+#define TF_LITE_ADD(type) \
+ type::BroadcastAdd( \
+ left_shift, GetTensorData<uint8_t>(input1), GetTensorDims(input1), \
+ input1_offset, input1_multiplier, input1_shift, \
+ GetTensorData<uint8_t>(input2), GetTensorDims(input2), input2_offset, \
+ input2_multiplier, input2_shift, output_offset, output_multiplier, \
+ output_shift, output_activation_min, output_activation_max, \
+ GetTensorData<uint8_t>(output), GetTensorDims(output));
+
+ if (kernel_type == kReference) {
+ TF_LITE_ADD(reference_ops);
+ } else {
+ TF_LITE_ADD(optimized_ops);
+ }
+#undef TF_LITE_ADD
+}
+
+template <KernelType kernel_type>
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+ auto* params = reinterpret_cast<TfLiteAddParams*>(node->builtin_data);
+
+ TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
+ TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
+ TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+ if (output->type == kTfLiteFloat32) {
+ EvalAddFloat<kernel_type>(context, node, params, input1, input2, output);
+ } else if (output->type == kTfLiteUInt8) {
+ EvalAddQuantized<kernel_type>(context, node, params, input1, input2,
+ output);
+ } else {
+ context->ReportError(context,
+                         "Inputs and outputs not all float|uint8 types.");
+ return kTfLiteError;
+ }
+
+ return kTfLiteOk;
+}
+
+} // namespace add
+
+TfLiteRegistration* Register_ADD_REF() {
+ static TfLiteRegistration r = {nullptr, nullptr, add::Prepare,
+ add::Eval<add::kReference>};
+ return &r;
+}
+
+TfLiteRegistration* Register_ADD_GENERIC_OPT() {
+ static TfLiteRegistration r = {nullptr, nullptr, add::Prepare,
+ add::Eval<add::kGenericOptimized>};
+ return &r;
+}
+
+TfLiteRegistration* Register_ADD_NEON_OPT() {
+ static TfLiteRegistration r = {nullptr, nullptr, add::Prepare,
+ add::Eval<add::kNeonOptimized>};
+ return &r;
+}
+
+TfLiteRegistration* Register_ADD() {
+#ifdef USE_NEON
+ return Register_ADD_NEON_OPT();
+#else
+ return Register_ADD_GENERIC_OPT();
+#endif
+}
+
+} // namespace builtin
+} // namespace ops
+} // namespace tflite
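
EvalAddQuantized above rescales both inputs to a shared scale (twice the larger input scale), accumulates with 20 bits of extra headroom from the left shift, then rescales to the output scale and clamps to the activation range. The real-number arithmetic that fixed-point pipeline approximates is sketched below; the scale and zero point are illustrative choices for a symmetric [-1, 1] uint8 tensor, not values taken from this commit.

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

// Real-number view of quantized addition: dequantize both operands, add,
// requantize into the output tensor's scale/zero_point, and clamp.
uint8_t AddQuantized(uint8_t q1, float s1, int z1, uint8_t q2, float s2, int z2,
                     float s_out, int z_out, int act_min, int act_max) {
  const float real_sum = s1 * (q1 - z1) + s2 * (q2 - z2);
  const int q = static_cast<int>(std::round(real_sum / s_out)) + z_out;
  return static_cast<uint8_t>(std::min(std::max(q, act_min), act_max));
}

int main() {
  // Assumed quantization of a [-1, 1] tensor: scale = 2/255, zero_point = 128.
  const float s = 2.f / 255.f;
  const int z = 128;
  // 0.1 + 0.6 should come back as roughly 0.7.
  const uint8_t q1 = static_cast<uint8_t>(std::round(0.1f / s) + z);
  const uint8_t q2 = static_cast<uint8_t>(std::round(0.6f / s) + z);
  const uint8_t q_out = AddQuantized(q1, s, z, q2, s, z, s, z, 0, 255);
  std::printf("%f\n", s * (q_out - z));  // ~0.706, well within 2 * scale of 0.7
  return 0;
}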
diff --git a/tensorflow/contrib/lite/kernels/add_test.cc b/tensorflow/contrib/lite/kernels/add_test.cc
new file mode 100644
index 0000000000..8e12a837c4
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/add_test.cc
@@ -0,0 +1,171 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+class BaseAddOpModel : public SingleOpModel {
+ public:
+ BaseAddOpModel(const TensorData& input, const TensorData& output,
+ ActivationFunctionType activation_type) {
+ input1_ = AddInput(input);
+ input2_ = AddInput(input);
+ output_ = AddOutput(output);
+ SetBuiltinOp(BuiltinOperator_ADD, BuiltinOptions_AddOptions,
+ CreateAddOptions(builder_, activation_type).Union());
+ BuildInterpreter({GetShape(input1_), GetShape(input2_)});
+ }
+
+ int input1() { return input1_; }
+ int input2() { return input2_; }
+
+ protected:
+ int input1_;
+ int input2_;
+ int output_;
+};
+
+class FloatAddOpModel : public BaseAddOpModel {
+ public:
+ using BaseAddOpModel::BaseAddOpModel;
+
+ std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+};
+
+class QuantizedAddOpModel : public BaseAddOpModel {
+ public:
+ using BaseAddOpModel::BaseAddOpModel;
+
+ std::vector<float> GetDequantizedOutput() {
+ return Dequantize<uint8_t>(ExtractVector<uint8_t>(output_),
+ GetScale(output_), GetZeroPoint(output_));
+ }
+};
+
+// For quantized Add, the error shouldn't exceed 2 * step.
+float GetTolerance(int min, int max) {
+ float kQuantizedStep = (max - min) / 255.0;
+ float kQuantizedTolerance = 2.0 * kQuantizedStep;
+ return kQuantizedTolerance;
+}
+
+TEST(FloatAddOpModel, NoActivation) {
+ FloatAddOpModel m({TensorType_FLOAT32, {1, 2, 2, 1}},
+ {TensorType_FLOAT32, {}}, ActivationFunctionType_NONE);
+ m.PopulateTensor<float>(m.input1(), {-2.0, 0.2, 0.7, 0.8});
+ m.PopulateTensor<float>(m.input2(), {0.1, 0.2, 0.3, 0.5});
+ m.Invoke();
+ EXPECT_THAT(m.GetOutput(), ElementsAreArray({-1.9, 0.4, 1.0, 1.3}));
+}
+
+TEST(FloatAddOpModel, ActivationRELU1) {
+ FloatAddOpModel m({TensorType_FLOAT32, {1, 2, 2, 1}},
+ {TensorType_FLOAT32, {}}, ActivationFunctionType_RELU1);
+ m.PopulateTensor<float>(m.input1(), {-2.0, 0.2, 0.7, 0.8});
+ m.PopulateTensor<float>(m.input2(), {0.1, 0.2, 0.3, 0.5});
+ m.Invoke();
+ EXPECT_THAT(m.GetOutput(), ElementsAreArray({-1.0, 0.4, 1.0, 1.0}));
+}
+
+TEST(FloatAddOpModel, VariousInputShapes) {
+ std::vector<std::initializer_list<int>> test_shapes = {
+ {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
+ for (int i = 0; i < test_shapes.size(); ++i) {
+ FloatAddOpModel m({TensorType_FLOAT32, test_shapes[i]},
+ {TensorType_FLOAT32, {}}, ActivationFunctionType_NONE);
+ m.PopulateTensor<float>(m.input1(), {-2.0, 0.2, 0.7, 0.8, 1.1, 2.0});
+ m.PopulateTensor<float>(m.input2(), {0.1, 0.2, 0.3, 0.5, 1.1, 0.1});
+ m.Invoke();
+ EXPECT_THAT(m.GetOutput(),
+ ElementsAreArray({-1.9, 0.4, 1.0, 1.3, 2.2, 2.1}))
+ << "With shape number " << i;
+ }
+}
+
+TEST(QuantizedAddOpModel, QuantizedTestsNoActivation) {
+ float kQuantizedTolerance = GetTolerance(-1.0, 1.0);
+ std::vector<std::initializer_list<float>> inputs1 = {
+ {0.1, 0.2, 0.3, 0.4}, {-0.8, 0.2, 0.4, 0.7}, {-0.8, 0.2, 0.7, 0.3}};
+ std::vector<std::initializer_list<float>> inputs2 = {
+ {0.6, 0.4, 0.3, 0.1}, {0.6, 0.4, 0.5, -0.8}, {0.6, 0.4, -0.8, 0.5}};
+ std::vector<std::initializer_list<float>> results = {
+ {0.7, 0.6, 0.6, 0.5}, {-0.2, 0.6, 0.9, -0.1}, {-0.2, 0.6, -0.1, 0.8}};
+ for (int i = 0; i < inputs1.size(); ++i) {
+ QuantizedAddOpModel m({TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0},
+ {TensorType_UINT8, {}, -1.0, 1.0},
+ ActivationFunctionType_NONE);
+ m.QuantizeAndPopulate<uint8_t>(m.input1(), inputs1[i]);
+ m.QuantizeAndPopulate<uint8_t>(m.input2(), inputs2[i]);
+ m.Invoke();
+ EXPECT_THAT(m.GetDequantizedOutput(), ElementsAreArray(ArrayFloatNear(
+ results[i], kQuantizedTolerance)))
+ << "With test number " << i;
+ }
+}
+
+TEST(QuantizedAddOpModel, QuantizedTestsActivationRELU1) {
+ float kQuantizedTolerance = GetTolerance(-1.0, 1.0);
+ std::vector<std::initializer_list<float>> inputs1 = {{-0.8, 0.2, 0.9, 0.7},
+ {-0.8, 0.2, 0.7, 0.3}};
+ std::vector<std::initializer_list<float>> inputs2 = {{0.6, 0.4, 0.9, -0.8},
+ {0.6, 0.4, -0.8, 0.5}};
+ std::vector<std::initializer_list<float>> results = {{-0.2, 0.6, 1.0, -0.1},
+ {-0.2, 0.6, -0.1, 0.8}};
+ for (int i = 0; i < inputs1.size(); ++i) {
+ QuantizedAddOpModel m({TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0},
+ {TensorType_UINT8, {}, -1.0, 1.0},
+ ActivationFunctionType_RELU1);
+ m.QuantizeAndPopulate<uint8_t>(m.input1(), inputs1[i]);
+ m.QuantizeAndPopulate<uint8_t>(m.input2(), inputs2[i]);
+ m.Invoke();
+ EXPECT_THAT(m.GetDequantizedOutput(), ElementsAreArray(ArrayFloatNear(
+ results[i], kQuantizedTolerance)))
+ << "With test number " << i;
+ }
+}
+
+TEST(QuantizedAddOpModel, QuantizedVariousInputShapes) {
+ float kQuantizedTolerance = GetTolerance(-3.0, 3.0);
+ std::vector<std::initializer_list<int>> test_shapes = {
+ {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
+ for (int i = 0; i < test_shapes.size(); ++i) {
+ QuantizedAddOpModel m({TensorType_UINT8, test_shapes[i], -3.0, 3.0},
+ {TensorType_UINT8, {}, -3.0, 3.0},
+ ActivationFunctionType_NONE);
+ m.QuantizeAndPopulate<uint8_t>(m.input1(), {-2.0, 0.2, 0.7, 0.8, 1.1, 2.0});
+ m.QuantizeAndPopulate<uint8_t>(m.input2(), {0.1, 0.3, 0.3, 0.5, 1.1, 0.1});
+ m.Invoke();
+ EXPECT_THAT(m.GetDequantizedOutput(),
+ ElementsAreArray(ArrayFloatNear({-1.9, 0.5, 1.0, 1.3, 2.2, 2.1},
+ kQuantizedTolerance)))
+ << "With shape number " << i;
+ }
+}
+
+} // namespace
+} // namespace tflite
+int main(int argc, char** argv) {
+ // On Linux, add: tflite::LogToStderr();
+ tflite::LogToStderr();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/basic_rnn.cc b/tensorflow/contrib/lite/kernels/basic_rnn.cc
new file mode 100644
index 0000000000..3cee43c68b
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/basic_rnn.cc
@@ -0,0 +1,161 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <unistd.h>
+#include <cassert>
+#include <cmath>
+#include <cstdlib>
+#include <cstdio>
+#include <iostream>
+#include <limits>
+
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/activation_functor.h"
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace rnn {
+
+constexpr int kInputTensor = 0;
+constexpr int kWeightsTensor = 1;
+constexpr int kRecurrentWeightsTensor = 2;
+constexpr int kBiasTensor = 3;
+constexpr int kHiddenStateTensor = 0;
+constexpr int kOutputTensor = 1;
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+ // Check we have all the inputs and outputs we need.
+ TF_LITE_ENSURE_EQ(context, node->inputs->size, 4);
+ TF_LITE_ENSURE_EQ(context, node->outputs->size, 2);
+
+ TfLiteTensor* input = &context->tensors[node->inputs->data[kInputTensor]];
+ TfLiteTensor* input_weights =
+ &context->tensors[node->inputs->data[kWeightsTensor]];
+ TfLiteTensor* recurrent_weights =
+ &context->tensors[node->inputs->data[kRecurrentWeightsTensor]];
+ TfLiteTensor* bias = &context->tensors[node->inputs->data[kBiasTensor]];
+
+ // Check all the parameters of tensor match within themselves and match the
+ // input configuration.
+ const int batch_size = input->dims->data[0];
+ const int num_units = input_weights->dims->data[0];
+ TF_LITE_ASSERT_EQ(input->dims->data[1], input_weights->dims->data[1]);
+ TF_LITE_ASSERT_EQ(input_weights->dims->data[0], bias->dims->data[0]);
+ TF_LITE_ASSERT_EQ(recurrent_weights->dims->data[0], bias->dims->data[0]);
+ TF_LITE_ASSERT_EQ(recurrent_weights->dims->data[1], bias->dims->data[0]);
+
+ TfLiteTensor* hidden_state =
+      &context->tensors[node->outputs->data[kHiddenStateTensor]];
+ TfLiteTensor* output = &context->tensors[node->outputs->data[kOutputTensor]];
+
+ // Resize state.
+ TfLiteIntArray* hidden_state_size_array = TfLiteIntArrayCreate(2);
+ hidden_state_size_array->data[0] = batch_size;
+ hidden_state_size_array->data[1] = num_units;
+ TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, hidden_state,
+ hidden_state_size_array));
+
+ // Mark hidden state as a persistent tensor.
+ hidden_state->allocation_type = kTfLiteArenaRwPersistent;
+
+ // Resize output.
+ TfLiteIntArray* output_size_array = TfLiteIntArrayCreate(2);
+ output_size_array->data[0] = batch_size;
+ output_size_array->data[1] = num_units;
+ TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, output,
+ output_size_array));
+
+ return kTfLiteOk;
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+ auto* params = reinterpret_cast<TfLiteRNNParams*>(node->builtin_data);
+
+ TfLiteTensor* input = &context->tensors[node->inputs->data[kInputTensor]];
+ TfLiteTensor* input_weights =
+ &context->tensors[node->inputs->data[kWeightsTensor]];
+ TfLiteTensor* recurrent_weights =
+ &context->tensors[node->inputs->data[kRecurrentWeightsTensor]];
+ TfLiteTensor* bias = &context->tensors[node->inputs->data[kBiasTensor]];
+ TfLiteTensor* hidden_state =
+      &context->tensors[node->outputs->data[kHiddenStateTensor]];
+ TfLiteTensor* output = &context->tensors[node->outputs->data[kOutputTensor]];
+
+ // Initialize the pointer bias.
+ const float* bias_ptr = bias->data.f;
+
+ const int batch_size = input->dims->data[0];
+ const int num_units = input_weights->dims->data[0];
+ const int input_size = input->dims->data[1];
+ const int input_weights_stride = input_weights->dims->data[1];
+ const int recurrent_weights_stride = recurrent_weights->dims->data[1];
+
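+  // In matrix form, the loops below compute for each batch:
+  //   output = activation(input_weights * input + recurrent_weights * hidden_state + bias)
+  // and then copy the activated output back into hidden_state for the next
+  // invocation.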
+ // For each batch
+ for (int b = 0; b < batch_size; b++) {
+    // Initialize the input, output and hidden state pointers for this batch.
+ const float* input_ptr_batch = input->data.f + b * input_size;
+ float* output_ptr_batch = output->data.f + b * num_units;
+ float* hidden_state_ptr_batch = hidden_state->data.f + b * num_units;
+
+ // Initialize input_weights and recurrent_weights.
+ const float* input_weights_ptr = input_weights->data.f;
+ const float* recurrent_weights_ptr = recurrent_weights->data.f;
+
+ // Output = bias
+ for (int o = 0; o < num_units; o++) {
+ output_ptr_batch[o] = bias_ptr[o];
+ }
+
+ // Output += input * input_weights
+ for (int o = 0; o < num_units; o++) {
+ for (int i = 0; i < input_size; i++) {
+ output_ptr_batch[o] += input_ptr_batch[i] * input_weights_ptr[i];
+ }
+ input_weights_ptr += input_weights_stride;
+ }
+
+ // Output += recurrent_weights * hidden_state
+ for (int o = 0; o < num_units; o++) {
+ for (int h = 0; h < num_units; h++) {
+ output_ptr_batch[o] +=
+ hidden_state_ptr_batch[h] * recurrent_weights_ptr[h];
+ }
+ recurrent_weights_ptr += recurrent_weights_stride;
+ }
+
+ // Output = activation(Output) and update hidden_state
+ for (int o = 0; o < num_units; o++) {
+ output_ptr_batch[o] =
+ (ActivationFunctor(params->activation))(output_ptr_batch[o]);
+ hidden_state_ptr_batch[o] = output_ptr_batch[o];
+ }
+ }
+
+ return kTfLiteOk;
+}
+
+} // namespace rnn
+
+TfLiteRegistration* Register_RNN() {
+ static TfLiteRegistration r = {/*init=*/nullptr, /*free=*/nullptr,
+ rnn::Prepare, rnn::Eval};
+ return &r;
+}
+
+} // namespace builtin
+} // namespace ops
+} // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/basic_rnn_test.cc b/tensorflow/contrib/lite/kernels/basic_rnn_test.cc
new file mode 100644
index 0000000000..dfa75655bc
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/basic_rnn_test.cc
@@ -0,0 +1,267 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// Unit test for TFLite RNN op.
+
+#include <cstring>
+#include <iomanip>
+#include <memory>
+#include <vector>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+static float rnn_input[] = {
+ 0.23689353, 0.285385, 0.037029743, -0.19858193, -0.27569133,
+ 0.43773448, 0.60379338, 0.35562468, -0.69424844, -0.93421471,
+ -0.87287879, 0.37144363, -0.62476718, 0.23791671, 0.40060222,
+ 0.1356622, -0.99774903, -0.98858172, -0.38952237, -0.47685933,
+ 0.31073618, 0.71511042, -0.63767755, -0.31729108, 0.33468103,
+ 0.75801885, 0.30660987, -0.37354088, 0.77002847, -0.62747043,
+ -0.68572164, 0.0069220066, 0.65791464, 0.35130811, 0.80834007,
+ -0.61777675, -0.21095741, 0.41213346, 0.73784804, 0.094794154,
+ 0.47791874, 0.86496925, -0.53376222, 0.85315156, 0.10288584,
+ 0.86684, -0.011186242, 0.10513687, 0.87825835, 0.59929144,
+ 0.62827742, 0.18899453, 0.31440187, 0.99059987, 0.87170351,
+ -0.35091716, 0.74861872, 0.17831337, 0.2755419, 0.51864719,
+ 0.55084288, 0.58982027, -0.47443086, 0.20875752, -0.058871567,
+ -0.66609079, 0.59098077, 0.73017097, 0.74604273, 0.32882881,
+ -0.17503482, 0.22396147, 0.19379807, 0.29120302, 0.077113032,
+ -0.70331609, 0.15804303, -0.93407321, 0.40182066, 0.036301374,
+ 0.66521823, 0.0300982, -0.7747041, -0.02038002, 0.020698071,
+ -0.90300065, 0.62870288, -0.23068321, 0.27531278, -0.095755219,
+ -0.712036, -0.17384434, -0.50593495, -0.18646687, -0.96508682,
+ 0.43519354, 0.14744234, 0.62589407, 0.1653645, -0.10651493,
+ -0.045277178, 0.99032974, -0.88255352, -0.85147917, 0.28153265,
+ 0.19455957, -0.55479527, -0.56042433, 0.26048636, 0.84702539,
+ 0.47587705, -0.074295521, -0.12287641, 0.70117295, 0.90532446,
+ 0.89782166, 0.79817224, 0.53402734, -0.33286154, 0.073485017,
+ -0.56172788, -0.044897556, 0.89964068, -0.067662835, 0.76863563,
+ 0.93455386, -0.6324693, -0.083922029};
+
+static float rnn_golden_output[] = {
+ 0.496726, 0, 0.965996, 0, 0.0584254, 0,
+ 0, 0.12315, 0, 0, 0.612266, 0.456601,
+ 0, 0.52286, 1.16099, 0.0291232,
+
+ 0, 0, 0.524901, 0, 0, 0,
+ 0, 1.02116, 0, 1.35762, 0, 0.356909,
+ 0.436415, 0.0355727, 0, 0,
+
+ 0, 0, 0, 0.262335, 0, 0,
+ 0, 1.33992, 0, 2.9739, 0, 0,
+ 1.31914, 2.66147, 0, 0,
+
+ 0.942568, 0, 0, 0, 0.025507, 0,
+ 0, 0, 0.321429, 0.569141, 1.25274, 1.57719,
+ 0.8158, 1.21805, 0.586239, 0.25427,
+
+ 1.04436, 0, 0.630725, 0, 0.133801, 0.210693,
+ 0.363026, 0, 0.533426, 0, 1.25926, 0.722707,
+ 0, 1.22031, 1.30117, 0.495867,
+
+ 0.222187, 0, 0.72725, 0, 0.767003, 0,
+ 0, 0.147835, 0, 0, 0, 0.608758,
+ 0.469394, 0.00720298, 0.927537, 0,
+
+ 0.856974, 0.424257, 0, 0, 0.937329, 0,
+ 0, 0, 0.476425, 0, 0.566017, 0.418462,
+ 0.141911, 0.996214, 1.13063, 0,
+
+ 0.967899, 0, 0, 0, 0.0831304, 0,
+ 0, 1.00378, 0, 0, 0, 1.44818,
+ 1.01768, 0.943891, 0.502745, 0,
+
+ 0.940135, 0, 0, 0, 0, 0,
+ 0, 2.13243, 0, 0.71208, 0.123918, 1.53907,
+ 1.30225, 1.59644, 0.70222, 0,
+
+ 0.804329, 0, 0.430576, 0, 0.505872, 0.509603,
+ 0.343448, 0, 0.107756, 0.614544, 1.44549, 1.52311,
+ 0.0454298, 0.300267, 0.562784, 0.395095,
+
+ 0.228154, 0, 0.675323, 0, 1.70536, 0.766217,
+ 0, 0, 0, 0.735363, 0.0759267, 1.91017,
+ 0.941888, 0, 0, 0,
+
+ 0, 0, 1.5909, 0, 0, 0,
+ 0, 0.5755, 0, 0.184687, 0, 1.56296,
+ 0.625285, 0, 0, 0,
+
+ 0, 0, 0.0857888, 0, 0, 0,
+ 0, 0.488383, 0.252786, 0, 0, 0,
+ 1.02817, 1.85665, 0, 0,
+
+ 0.00981836, 0, 1.06371, 0, 0, 0,
+ 0, 0, 0, 0.290445, 0.316406, 0,
+ 0.304161, 1.25079, 0.0707152, 0,
+
+ 0.986264, 0.309201, 0, 0, 0, 0,
+ 0, 1.64896, 0.346248, 0, 0.918175, 0.78884,
+ 0.524981, 1.92076, 2.07013, 0.333244,
+
+ 0.415153, 0.210318, 0, 0, 0, 0,
+ 0, 2.02616, 0, 0.728256, 0.84183, 0.0907453,
+ 0.628881, 3.58099, 1.49974, 0
+};
+
+class RNNOpModel : public SingleOpModel {
+ public:
+ RNNOpModel(int batches, int units, int size)
+ : batches_(batches), units_(units), input_size_(size) {
+ input_ = AddInput(TensorType_FLOAT32);
+ weights_ = AddInput(TensorType_FLOAT32);
+ recurrent_weights_ = AddInput(TensorType_FLOAT32);
+ bias_ = AddInput(TensorType_FLOAT32);
+ hidden_state_ = AddOutput(TensorType_FLOAT32);
+ output_ = AddOutput(TensorType_FLOAT32);
+ SetBuiltinOp(
+ BuiltinOperator_RNN, BuiltinOptions_RNNOptions,
+ CreateRNNOptions(builder_, ActivationFunctionType_RELU).Union());
+ BuildInterpreter({{batches_, input_size_},
+ {units_, input_size_},
+ {units_, units_},
+ {units_}});
+ }
+
+ void SetBias(std::initializer_list<float> f) { PopulateTensor(bias_, f); }
+
+ void SetWeights(std::initializer_list<float> f) {
+ PopulateTensor(weights_, f);
+ }
+
+ void SetRecurrentWeights(std::initializer_list<float> f) {
+ PopulateTensor(recurrent_weights_, f);
+ }
+
+ void SetInput(std::initializer_list<float> data) {
+ PopulateTensor(input_, data);
+ }
+
+ void SetInput(int offset, float* begin, float* end) {
+ PopulateTensor(input_, offset, begin, end);
+ }
+
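+  // The hidden state is exposed as a (persistent) output tensor, so the test
+  // zeroes it explicitly before the first invocation.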
+ void ResetHiddenState() {
+ const int zero_buffer_size = units_ * batches_;
+ std::unique_ptr<float[]> zero_buffer(new float[zero_buffer_size]);
+ memset(zero_buffer.get(), 0, zero_buffer_size * sizeof(float));
+ PopulateTensor(hidden_state_, 0, zero_buffer.get(),
+ zero_buffer.get() + zero_buffer_size);
+ }
+
+ std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+
+ int input_size() { return input_size_; }
+ int num_units() { return units_; }
+ int num_batches() { return batches_; }
+
+ private:
+ int input_;
+ int weights_;
+ int recurrent_weights_;
+ int bias_;
+ int hidden_state_;
+ int output_;
+
+ int batches_;
+ int units_;
+ int input_size_;
+};
+
+TEST(RnnOpTest, BlackBoxTest) {
+ RNNOpModel rnn(2, 16, 8);
+ rnn.SetWeights(
+ {0.461459, 0.153381, 0.529743, -0.00371218, 0.676267, -0.211346,
+ 0.317493, 0.969689, -0.343251, 0.186423, 0.398151, 0.152399,
+ 0.448504, 0.317662, 0.523556, -0.323514, 0.480877, 0.333113,
+ -0.757714, -0.674487, -0.643585, 0.217766, -0.0251462, 0.79512,
+ -0.595574, -0.422444, 0.371572, -0.452178, -0.556069, -0.482188,
+ -0.685456, -0.727851, 0.841829, 0.551535, -0.232336, 0.729158,
+ -0.00294906, -0.69754, 0.766073, -0.178424, 0.369513, -0.423241,
+ 0.548547, -0.0152023, -0.757482, -0.85491, 0.251331, -0.989183,
+ 0.306261, -0.340716, 0.886103, -0.0726757, -0.723523, -0.784303,
+ 0.0354295, 0.566564, -0.485469, -0.620498, 0.832546, 0.697884,
+ -0.279115, 0.294415, -0.584313, 0.548772, 0.0648819, 0.968726,
+ 0.723834, -0.0080452, -0.350386, -0.272803, 0.115121, -0.412644,
+ -0.824713, -0.992843, -0.592904, -0.417893, 0.863791, -0.423461,
+ -0.147601, -0.770664, -0.479006, 0.654782, 0.587314, -0.639158,
+ 0.816969, -0.337228, 0.659878, 0.73107, 0.754768, -0.337042,
+ 0.0960841, 0.368357, 0.244191, -0.817703, -0.211223, 0.442012,
+ 0.37225, -0.623598, -0.405423, 0.455101, 0.673656, -0.145345,
+ -0.511346, -0.901675, -0.81252, -0.127006, 0.809865, -0.721884,
+ 0.636255, 0.868989, -0.347973, -0.10179, -0.777449, 0.917274,
+ 0.819286, 0.206218, -0.00785118, 0.167141, 0.45872, 0.972934,
+ -0.276798, 0.837861, 0.747958, -0.0151566, -0.330057, -0.469077,
+ 0.277308, 0.415818});
+
+ rnn.SetBias({0.065691948, -0.69055247, 0.1107955, -0.97084129, -0.23957068,
+ -0.23566568, -0.389184, 0.47481549, -0.4791103, 0.29931796,
+ 0.10463274, 0.83918178, 0.37197268, 0.61957061, 0.3956964,
+ -0.37609905});
+
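+  // The recurrent weights below are 0.1 times the 16x16 identity matrix: each
+  // 0.1 is followed by sixteen zeros, which lands the next 0.1 on the diagonal
+  // of the following row.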
+ rnn.SetRecurrentWeights({0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0.1});
+
+ rnn.ResetHiddenState();
+ const int input_sequence_size = sizeof(rnn_input) / sizeof(float) /
+ (rnn.input_size() * rnn.num_batches());
+
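+  // Both batches are fed the same input slice below, so the expected output
+  // for each step is the corresponding golden slice repeated twice.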
+ for (int i = 0; i < input_sequence_size; i++) {
+ float* batch_start = rnn_input + i * rnn.input_size();
+ float* batch_end = batch_start + rnn.input_size();
+ rnn.SetInput(0, batch_start, batch_end);
+ rnn.SetInput(rnn.input_size(), batch_start, batch_end);
+
+ rnn.Invoke();
+
+ float* golden_start = rnn_golden_output + i * rnn.num_units();
+ float* golden_end = golden_start + rnn.num_units();
+ std::vector<float> expected;
+ expected.insert(expected.end(), golden_start, golden_end);
+ expected.insert(expected.end(), golden_start, golden_end);
+
+ EXPECT_THAT(rnn.GetOutput(), ElementsAreArray(ArrayFloatNear(expected)));
+ }
+}
+
+} // namespace
+} // namespace tflite
+
+int main(int argc, char** argv) {
+ // On Linux, add: tflite::LogToStderr();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/concatenation.cc b/tensorflow/contrib/lite/kernels/concatenation.cc
new file mode 100644
index 0000000000..9e7a1233da
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/concatenation.cc
@@ -0,0 +1,200 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <unistd.h>
+#include <cassert>
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <iostream>
+#include <limits>
+
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace concatenation {
+
+// This file has two implementations of Concatenation.
+enum KernelType {
+ kReference,
+ kGenericOptimized,
+};
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+ auto* params =
+ reinterpret_cast<TfLiteConcatenationParams*>(node->builtin_data);
+ int axis = params->axis;
+ int num_inputs = node->inputs->size;
+
+ // The number of dimensions of the input tensors must match, and all
+ // dimensions except 'axis' must be equal.
+ TfLiteTensor* t0 = &context->tensors[node->inputs->data[0]];
+ TfLiteType input_type = t0->type;
+ TF_LITE_ENSURE(context, axis >= 0);
+ TF_LITE_ENSURE(context, axis < t0->dims->size);
+
+ // TODO(ahentz): These are limitations of our implementation that could be
+ // removed with a bit of effort.
+ TF_LITE_ENSURE(context, t0->dims->size <= 4);
+ TF_LITE_ENSURE_EQ(context, params->activation, kTfLiteActNone);
+ TF_LITE_ENSURE(context,
+ input_type == kTfLiteFloat32 || input_type == kTfLiteUInt8);
+
+  // Output dimensions will match the input dimensions, except along 'axis',
+  // where the output size is the sum of the input sizes.
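+  // For example, concatenating four [2, 1, 2] tensors along axis 2 yields a
+  // [2, 1, 8] output, as exercised in concatenation_test.cc.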
+ int sum_axis = t0->dims->data[axis];
+ for (int i = 1; i < num_inputs; ++i) {
+ TfLiteTensor* t = &context->tensors[node->inputs->data[i]];
+ TF_LITE_ENSURE_EQ(context, t->dims->size, t0->dims->size);
+ TF_LITE_ENSURE_EQ(context, t->type, input_type);
+ if (input_type == kTfLiteUInt8) {
+ TF_LITE_ENSURE_EQ(context, t->params.zero_point, t0->params.zero_point);
+ TF_LITE_ENSURE_EQ(context, t->params.scale, t0->params.scale);
+ }
+ for (int d = 0; d < t0->dims->size; ++d) {
+ if (d == axis) {
+ sum_axis += t->dims->data[axis];
+ } else {
+ TF_LITE_ENSURE_EQ(context, t->dims->data[d], t0->dims->data[d]);
+ }
+ }
+ }
+
+ TfLiteIntArray* output_size = TfLiteIntArrayCreate(t0->dims->size);
+ for (int d = 0; d < t0->dims->size; ++d) {
+ output_size->data[d] = (d == axis) ? sum_axis : t0->dims->data[d];
+ }
+
+ TfLiteTensor* output = &context->tensors[node->outputs->data[0]];
+ TF_LITE_ENSURE_EQ(context, output->type, input_type);
+ if (input_type == kTfLiteUInt8) {
+ TF_LITE_ENSURE_EQ(context, output->params.zero_point,
+ t0->params.zero_point);
+ TF_LITE_ENSURE_EQ(context, output->params.scale, t0->params.scale);
+ }
+
+ return context->ResizeTensor(context, output, output_size);
+}
+
+template <typename T>
+class VectorOfInputs {
+ public:
+ VectorOfInputs(const TfLiteContext& context, const TfLiteIntArray& inputs) {
+ int num_inputs = inputs.size;
+
+ all_data_.reserve(num_inputs);
+ all_dims_.reserve(num_inputs);
+ all_dims_ptr_.reserve(num_inputs);
+
+ for (int i = 0; i < num_inputs; ++i) {
+ TfLiteTensor* input = &context.tensors[inputs.data[i]];
+ all_data_.push_back(GetTensorData<T>(input));
+ all_dims_.push_back(GetTensorDims(input));
+ }
+
+    // Taking a pointer to an element of a std::vector is only OK if the vector
+    // is never modified afterwards, so we populate all_dims_ in the previous
+    // loop and only then take pointers to its elements here.
+ for (int i = 0; i < num_inputs; ++i) {
+ all_dims_ptr_.push_back(&all_dims_[i]);
+ }
+ }
+ const T* const* data() const { return all_data_.data(); }
+ const Dims<4>* const* dims() const { return all_dims_ptr_.data(); }
+
+ private:
+ std::vector<T*> all_data_;
+ std::vector<Dims<4>> all_dims_;
+ std::vector<Dims<4>*> all_dims_ptr_;
+};
+
+template <KernelType kernel_type>
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+ auto* params =
+ reinterpret_cast<TfLiteConcatenationParams*>(node->builtin_data);
+
+ TfLiteTensor* output = &context->tensors[node->outputs->data[0]];
+
+// TODO(ahentz): Creating 'all_inputs' below is not very efficient. We should
+// allocate and populate these during Prepare().
+// TODO(ycling): The activation function parameter is ignored. For now we don't
+// have a model with a Concatenation that uses a fused activation function.
+#define TF_LITE_CONCATENATION(type, scalar) \
+ VectorOfInputs<scalar> all_inputs(*context, *node->inputs); \
+ type::Concatenation<FusedActivationFunctionType::kNone, scalar>( \
+ RemapDim(NumDimensions(output), params->axis), all_inputs.data(), \
+ all_inputs.dims(), node->inputs->size, GetTensorData<scalar>(output), \
+ GetTensorDims(output))
+
+  switch (output->type) {  // Already know in/out types are the same.
+ case kTfLiteFloat32:
+ if (kernel_type == kReference) {
+ TF_LITE_CONCATENATION(reference_ops, float);
+ } else {
+ TF_LITE_CONCATENATION(optimized_ops, float);
+ }
+ break;
+ case kTfLiteUInt8:
+ if (kernel_type == kReference) {
+ TF_LITE_CONCATENATION(reference_ops, uint8_t);
+ } else {
+ TF_LITE_CONCATENATION(optimized_ops, uint8_t);
+ }
+ break;
+ default:
+ context->ReportError(context,
+ "Only float32 and uint8 are currently supported.");
+ return kTfLiteError;
+ }
+
+#undef TF_LITE_CONCATENATION
+
+ return kTfLiteOk;
+}
+
+#undef TF_LITE_MACRO_DISPATCH
+
+} // namespace concatenation
+
+TfLiteRegistration* Register_CONCATENATION_REF() {
+ static TfLiteRegistration r = {
+ nullptr, nullptr, concatenation::Prepare,
+ concatenation::Eval<concatenation::kReference>};
+ return &r;
+}
+
+TfLiteRegistration* Register_CONCATENATION_GENERIC_OPT() {
+ static TfLiteRegistration r = {
+ nullptr, nullptr, concatenation::Prepare,
+ concatenation::Eval<concatenation::kGenericOptimized>};
+ return &r;
+}
+
+TfLiteRegistration* Register_CONCATENATION() {
+ // TODO(ahentz): It turns out the two versions of Concatenation are almost
+ // identical, so we should consider removing one.
+ return Register_CONCATENATION_GENERIC_OPT();
+}
+
+} // namespace builtin
+} // namespace ops
+} // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/concatenation_test.cc b/tensorflow/contrib/lite/kernels/concatenation_test.cc
new file mode 100644
index 0000000000..94e5b2acdc
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/concatenation_test.cc
@@ -0,0 +1,162 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <cstdarg>
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+class BaseConcatenationOpModel : public SingleOpModel {
+ public:
+ // TODO(ahentz): Also test different activation types, axis, input
+ // dimensions.
+ BaseConcatenationOpModel(const TensorData& input_template, int axis,
+ int num_inputs) {
+ std::vector<std::vector<int>> all_input_shapes;
+ for (int i = 0; i < num_inputs; ++i) {
+ all_input_shapes.push_back(input_template.shape);
+ AddInput(input_template);
+ }
+ output_ = AddOutput({input_template.type, /*shape=*/{}, input_template.min,
+ input_template.max});
+ SetBuiltinOp(
+ BuiltinOperator_CONCATENATION, BuiltinOptions_ConcatenationOptions,
+ CreateConcatenationOptions(builder_, axis, ActivationFunctionType_NONE)
+ .Union());
+ BuildInterpreter(all_input_shapes);
+ }
+
+ protected:
+ int output_;
+};
+
+class ConcatenationOpModel : public BaseConcatenationOpModel {
+ public:
+ using BaseConcatenationOpModel::BaseConcatenationOpModel;
+ void SetInput(int index, std::initializer_list<float> data) {
+ PopulateTensor(index, data);
+ }
+ std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+};
+
+class QuantizedConcatenationOpModel : public BaseConcatenationOpModel {
+ public:
+ using BaseConcatenationOpModel::BaseConcatenationOpModel;
+ void SetInput(int index, std::initializer_list<float> data) {
+ QuantizeAndPopulate<uint8_t>(index, data);
+ }
+ std::vector<uint8_t> GetOutput() { return ExtractVector<uint8_t>(output_); }
+ std::vector<float> GetDequantizedOutput() {
+ return Dequantize<uint8_t>(ExtractVector<uint8_t>(output_),
+ GetScale(output_), GetZeroPoint(output_));
+ }
+};
+
+TEST(ConcatenationOpTest, ThreeDimensionalOneInput) {
+ ConcatenationOpModel m0({TensorType_FLOAT32, {2, 1, 2}}, /*axis=*/1,
+ /*num_inputs=*/1);
+ m0.SetInput(0, {1.0f, 3.0f, 4.0f, 7.0f});
+ m0.Invoke();
+ EXPECT_THAT(m0.GetOutput(), ElementsAreArray({1, 3, 4, 7}));
+}
+
+TEST(ConcatenationOpTest, OneTrivialInput) {
+ ConcatenationOpModel m0({TensorType_FLOAT32, {1}}, /*axis=*/0,
+ /*num_inputs=*/1);
+ m0.SetInput(0, {5.0f});
+ m0.Invoke();
+ EXPECT_THAT(m0.GetOutput(), ::testing::ElementsAre(5));
+}
+
+TEST(ConcatenationOpTest, TwoDimensionalOneInput) {
+ ConcatenationOpModel m0({TensorType_FLOAT32, {2, 3}}, /*axis=*/0,
+ /*num_inputs=*/1);
+ m0.SetInput(0, {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f});
+ m0.Invoke();
+ EXPECT_THAT(m0.GetOutput(), ElementsAreArray({1, 2, 3, 4, 5, 6}));
+}
+
+TEST(ConcatenationOpTest, TwoInputsTwoAxis) {
+ // We will concatenate two tensors along different dimensions.
+ auto tensor0 = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
+ auto tensor1 = {7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f};
+
+ ConcatenationOpModel m0({TensorType_FLOAT32, {2, 3}}, /*axis=*/0,
+ /*num_inputs=*/2);
+ m0.SetInput(0, tensor0);
+ m0.SetInput(1, tensor1);
+ m0.Invoke();
+ EXPECT_THAT(m0.GetOutput(),
+ ElementsAreArray({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}));
+
+ ConcatenationOpModel m1({TensorType_FLOAT32, {2, 3}}, /*axis=*/1,
+ /*num_inputs=*/2);
+ m1.SetInput(0, tensor0);
+ m1.SetInput(1, tensor1);
+ m1.Invoke();
+ EXPECT_THAT(m1.GetOutput(),
+ ElementsAreArray({1, 2, 3, 7, 8, 9, 4, 5, 6, 10, 11, 12}));
+}
+
+TEST(ConcatenationOpTest, FourInputs) {
+ ConcatenationOpModel m0({TensorType_FLOAT32, {2, 1, 2}}, /*axis=*/2,
+ /*num_inputs=*/4);
+ m0.SetInput(0, {1.0f, 3.0f, 4.0f, 7.0f});
+ m0.SetInput(1, {1.1f, 3.1f, 4.1f, 7.1f});
+ m0.SetInput(2, {1.2f, 3.2f, 4.2f, 7.2f});
+ m0.SetInput(3, {1.3f, 3.3f, 4.3f, 7.3f});
+ m0.Invoke();
+ EXPECT_THAT(m0.GetOutput(),
+ ElementsAreArray({
+ 1.0f, 3.0f, 1.1f, 3.1f, 1.2f, 3.2f, 1.3f, 3.3f, //
+ 4.0f, 7.0f, 4.1f, 7.1f, 4.2f, 7.2f, 4.3f, 7.3f, //
+ }));
+}
+
+TEST(ConcatenationOpTest, FourInputsQuantized) {
+ QuantizedConcatenationOpModel m0({TensorType_UINT8, {2, 1, 2}, -12.7, 12.8},
+ /*axis=*/2,
+ /*num_inputs=*/4);
+
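+  // With min=-12.7 and max=12.8 the quantization scale is 25.5 / 255 = 0.1 and
+  // the zero point is 127, so e.g. 1.0f quantizes to 127 + 10 = 137 in the
+  // expected values below.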
+ m0.SetInput(0, {1.0f, 3.0f, 4.0f, 7.0f});
+ m0.SetInput(1, {1.1f, 3.1f, 4.1f, 7.1f});
+ m0.SetInput(2, {1.2f, 3.2f, 4.2f, 7.2f});
+ m0.SetInput(3, {1.3f, 3.3f, 4.3f, 7.3f});
+ m0.Invoke();
+ EXPECT_THAT(m0.GetDequantizedOutput(),
+ ElementsAreArray(ArrayFloatNear({
+ 1.0f, 3.0f, 1.1f, 3.1f, 1.2f, 3.2f, 1.3f, 3.3f, //
+ 4.0f, 7.0f, 4.1f, 7.1f, 4.2f, 7.2f, 4.3f, 7.3f, //
+ })));
+ EXPECT_THAT(m0.GetOutput(), ElementsAreArray({
+ 137, 157, 138, 158, 139, 159, 140, 160, //
+ 167, 197, 168, 198, 169, 199, 170, 200, //
+ }));
+}
+
+} // namespace
+} // namespace tflite
+
+int main(int argc, char** argv) {
+ // On Linux, add: tflite::LogToStderr();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/conv.cc b/tensorflow/contrib/lite/kernels/conv.cc
new file mode 100644
index 0000000000..c75c04baea
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/conv.cc
@@ -0,0 +1,425 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <unistd.h>
+#include <algorithm>
+#include <cassert>
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <iostream>
+#include <limits>
+
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/gemm_support.h"
+#include "tensorflow/contrib/lite/kernels/internal/optimized/multithreaded_conv.h"
+#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
+#include "tensorflow/contrib/lite/kernels/padding.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace conv {
+
+// This file has three implementations of Conv.
+enum KernelType {
+ kReference,
+ kGenericOptimized, // Neon-free
+ kNeonOptimized,
+};
+
+struct OpData {
+ // IDs are the arbitrary identifiers used by TF Lite to identify and access
+ // memory buffers.
+ int im2col_id;
+ int hwcn_weights_id;
+
+ TfLitePaddingValues padding;
+  // The scaling factor from input to output (aka the 'real multiplier') can
+  // be represented as a fixed-point multiplier plus a left shift.
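+  // (Here the real multiplier is input_scale * filter_scale / output_scale,
+  // which is expected to be in [0, 1); see GetQuantizedConvolutionMultipler
+  // and QuantizeMultiplierSmallerThanOne in Prepare().)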
+ int32_t output_multiplier;
+ int output_shift;
+ // The range of the fused activation layer. For example for kNone and
+ // uint8_t these would be 0 and 255.
+ int32_t output_activation_min;
+ int32_t output_activation_max;
+  // Indexes are offsets into the node's temporaries array, which keeps track
+  // of the allocated temporary buffers.
+ int32_t im2col_index;
+ int32_t hwcn_weights_index;
+ bool need_hwcn_weights;
+ bool have_weights_been_transposed;
+ bool need_im2col;
+};
+
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+ // This is a builtin op, so we don't use the contents in 'buffer', if any.
+ // Instead, we allocate a new object to use as scratch space for im2col, and
+ // to carry information from Prepare() to Eval().
+ auto* data = new OpData;
+ context->AddTensors(context, 1, &data->im2col_id);
+ context->AddTensors(context, 1, &data->hwcn_weights_id);
+ gemm_support::IncrementUsageCounter(context);
+ return data;
+}
+
+void Free(TfLiteContext* context, void* buffer) {
+ gemm_support::DecrementUsageCounter(context);
+ delete reinterpret_cast<OpData*>(buffer);
+}
+
+// Naive implementation of transpose for floats. Could be optimized to be more
+// cache friendly, but for now it's a one-time cost on first run, and we would
+// prefer to remove the need to do this at all eventually.
+void TransposeFloatTensor(TfLiteTensor* input, TfLiteTensor* output) {
+ const int rows = output->dims->data[1];
+ const int cols = output->dims->data[0];
+ const float* input_data = GetTensorData<float>(input);
+ float* output_data = GetTensorData<float>(output);
+ for (int i = 0; i < rows; ++i) {
+ for (int j = 0; j < cols; ++j) {
+ const float in_value = input_data[i * cols + j];
+ output_data[j * rows + i] = in_value;
+ }
+ }
+}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+ auto* params = reinterpret_cast<TfLiteConvParams*>(node->builtin_data);
+ OpData* data = reinterpret_cast<OpData*>(node->user_data);
+
+ bool hasBias = node->inputs->size == 3;
+ // Check number of inputs/outputs
+ TF_LITE_ENSURE(context, hasBias || node->inputs->size == 2);
+ TF_LITE_ENSURE_EQ(context, node->outputs->size, 1);
+ TfLiteTensor* output = &context->tensors[node->outputs->data[0]];
+ TfLiteTensor* input = &context->tensors[node->inputs->data[0]];
+ TfLiteTensor* filter = &context->tensors[node->inputs->data[1]];
+ // Check dimensionality of input, filter
+ TF_LITE_ENSURE_EQ(context, input->dims->size, 4);
+ TF_LITE_ENSURE_EQ(context, filter->dims->size, 4);
+ // Check input channels matching filter
+ TF_LITE_ENSURE_EQ(context, input->dims->data[3], filter->dims->data[3]);
+
+ // Check types. (We assume that UINT8 refers to quantized tensors)
+ TfLiteType data_type = input->type;
+ TF_LITE_ENSURE(context,
+ data_type == kTfLiteFloat32 || data_type == kTfLiteUInt8);
+ TF_LITE_ENSURE_EQ(context, output->type, data_type);
+ TF_LITE_ENSURE_EQ(context, filter->type, data_type);
+
+ TfLiteTensor* bias = nullptr;
+
+ // TODO(ahentz): At this point the optimized versions require 'bias'. We can
+ // either change that or document that convolution requires it.
+ TF_LITE_ENSURE(context, hasBias);
+
+ if (hasBias) {
+ bias = &context->tensors[node->inputs->data[2]];
+ if (data_type == kTfLiteUInt8) {
+ TF_LITE_ENSURE_EQ(context, bias->type, kTfLiteInt32);
+ TF_LITE_ENSURE_EQ(context, bias->params.zero_point, 0);
+ } else {
+ TF_LITE_ENSURE_EQ(context, bias->type, data_type);
+ }
+ TF_LITE_ENSURE_EQ(context, bias->dims->size, 1);
+ TF_LITE_ENSURE_EQ(context, bias->dims->data[0], filter->dims->data[0]);
+ }
+
+ int channels_out = filter->dims->data[0];
+ int width = input->dims->data[2];
+ int height = input->dims->data[1];
+ int filter_width = filter->dims->data[2];
+ int filter_height = filter->dims->data[1];
+ int batches = input->dims->data[0];
+
+ // Matching GetWindowedOutputSize in TensorFlow.
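+  // For SAME padding the output size is ceil(image_size / stride); for VALID
+  // padding it is ceil((image_size - filter_size + 1) / stride). Both are
+  // written below as integer divisions that round up.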
+ auto padding = params->padding;
+ auto computeOutSize = [padding](int imageSize, int filterSize,
+ int stride) -> int {
+ return padding == kTfLitePaddingSame
+ ? (imageSize + stride - 1) / stride
+ : padding == kTfLitePaddingValid
+ ? (imageSize - filterSize + stride) / stride
+ : 0;
+ };
+
+ int outWidth = computeOutSize(width, filter_width, params->stride_width);
+ int outHeight = computeOutSize(height, filter_height, params->stride_height);
+
+ data->padding.height =
+ ComputePadding(params->stride_height, height, filter_height, outHeight);
+ data->padding.width =
+ ComputePadding(params->stride_width, width, filter_width, outWidth);
+
+ TF_LITE_ENSURE(context, hasBias);
+
+ // Note that quantized inference requires that all tensors have their
+ // parameters set. This is usually done during quantized training.
+ if (data_type != kTfLiteFloat32) {
+ double real_multiplier = 0.0;
+ TF_LITE_ENSURE_STATUS(GetQuantizedConvolutionMultipler(
+ context, input, filter, bias, output, &real_multiplier));
+ QuantizeMultiplierSmallerThanOne(real_multiplier, &data->output_multiplier,
+ &data->output_shift);
+ CalculateActivationRangeUint8(params->activation, output,
+ &data->output_activation_min,
+ &data->output_activation_max);
+ }
+
+ TfLiteIntArray* output_size = TfLiteIntArrayCreate(4);
+ output_size->data[0] = batches;
+ output_size->data[1] = outHeight;
+ output_size->data[2] = outWidth;
+ output_size->data[3] = channels_out;
+ auto output_status = context->ResizeTensor(context, output, output_size);
+
+ if (output_status != kTfLiteOk) return output_status;
+
+ // We don't always need to allocate im2col. It is only used in some versions
+ // of the optimized Conv. This test just mimics something that happens inside
+ // optimized_ops.h, in order to avoid a DCHECK(!im2col_data).
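+  // (im2col unrolls each filter-sized input patch into a row so that the
+  // convolution can be computed as a single matrix multiplication; the buffer
+  // resized further below has shape
+  // [batch, out_height, out_width, filter_height * filter_width * input_depth].)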
+ data->need_im2col =
+ (params->stride_width != 1 || params->stride_height != 1 ||
+ filter_width != 1 || filter_height != 1);
+ // If we're using the optimized multithreaded EigenTensor implementation of
+ // convolution, it expects the filter weights to be transposed compared to
+ // the normal TF Lite buffer format. Typical TF Lite weights are
+ // [filter_count, filter_height, filter_width, input_depth], but for the float
+ // implementation we need them as [filter_height, filter_width, input_depth,
+ // filter_count]. We get to that format by transposing, and create a temporary
+ // buffer to store the results.
+ // This path is only used for float processing, so only create the buffer if
+ // we're running with that data type.
+ data->need_hwcn_weights = (data_type == kTfLiteFloat32);
+
+ int temporaries_count = 0;
+ if (data->need_im2col) {
+ data->im2col_index = temporaries_count;
+ ++temporaries_count;
+ }
+ if (data->need_hwcn_weights) {
+ data->hwcn_weights_index = temporaries_count;
+ ++temporaries_count;
+ }
+
+ TfLiteIntArrayFree(node->temporaries);
+ node->temporaries = TfLiteIntArrayCreate(temporaries_count);
+
+ if (data->need_im2col) {
+ node->temporaries->data[data->im2col_index] = data->im2col_id;
+
+ TfLiteIntArray* im2col_size = TfLiteIntArrayCreate(4);
+
+ int input_depth = input->dims->data[3];
+ im2col_size->data[0] = output_size->data[0];
+ im2col_size->data[1] = output_size->data[1];
+ im2col_size->data[2] = output_size->data[2];
+ im2col_size->data[3] = input_depth * filter_height * filter_width;
+
+ TfLiteTensor* im2col =
+ &context->tensors[node->temporaries->data[data->im2col_index]];
+ im2col->type = data_type;
+ im2col->allocation_type = kTfLiteArenaRw;
+ auto im2col_status = context->ResizeTensor(context, im2col, im2col_size);
+ if (im2col_status != kTfLiteOk) return im2col_status;
+ }
+
+ if (data->need_hwcn_weights) {
+ node->temporaries->data[data->hwcn_weights_index] = data->hwcn_weights_id;
+ TfLiteIntArray* hwcn_weights_size = TfLiteIntArrayCreate(2);
+
+ // Because we're treating the filter weights as a matrix when we do the
+ // transpose, we allocate the buffer with a two-dimensional shape, where one
+ // dimension is the number of elements in each filter, and the second is the
+ // total number of filters.
+ int input_depth = input->dims->data[3];
+ hwcn_weights_size->data[0] = (filter_height * filter_width * input_depth);
+ hwcn_weights_size->data[1] = channels_out;
+
+ TfLiteTensor* hwcn_weights =
+ &context->tensors[node->temporaries->data[data->hwcn_weights_index]];
+ hwcn_weights->type = data_type;
+ hwcn_weights->allocation_type = kTfLiteDynamic;
+ // Make sure we release any previous allocations before we reallocate.
+ // TODO(petewarden): Persistent arenas would be a better fit for this, but
+ // they aren't fully implemented yet.
+ if (hwcn_weights->data.raw) {
+ free(hwcn_weights->data.raw);
+ hwcn_weights->data.raw = nullptr;
+ }
+ auto hwcn_weights_status =
+ context->ResizeTensor(context, hwcn_weights, hwcn_weights_size);
+ if (hwcn_weights_status != kTfLiteOk) return hwcn_weights_status;
+ hwcn_weights->data.raw = static_cast<char*>(malloc(hwcn_weights->bytes));
+
+ // TODO(petewarden): If Resize() is called when the size hasn't actually
+ // changed, this will do extra redundant work.
+ data->have_weights_been_transposed = false;
+ }
+
+ return kTfLiteOk;
+}
+
+template <KernelType kernel_type>
+void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
+ TfLiteConvParams* params, OpData* data, TfLiteTensor* input,
+ TfLiteTensor* filter, TfLiteTensor* bias,
+ TfLiteTensor* im2col, TfLiteTensor* hwcn_weights,
+ TfLiteTensor* output) {
+ gemmlowp::GemmContext* gemm_context = gemm_support::GetFromContext(context);
+
+ auto input_offset = -input->params.zero_point;
+ auto filter_offset = -filter->params.zero_point;
+ auto output_offset = output->params.zero_point;
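+  // A quantized value q represents the real value (q - zero_point) * scale, so
+  // the negated zero points are passed as input/filter offsets while the
+  // output offset is added back when requantizing the accumulator.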
+
+ if (kernel_type == kReference) {
+ reference_ops::Conv(
+ GetTensorData<uint8_t>(input), GetTensorDims(input), input_offset,
+ GetTensorData<uint8_t>(filter), GetTensorDims(filter), filter_offset,
+ GetTensorData<int32_t>(bias), GetTensorDims(bias), params->stride_width,
+ params->stride_height, data->padding.width, data->padding.height,
+ output_offset, data->output_multiplier, data->output_shift,
+ data->output_activation_min, data->output_activation_max,
+ GetTensorData<uint8_t>(output), GetTensorDims(output),
+ GetTensorData<uint8_t>(im2col), GetTensorDims(im2col), gemm_context);
+ } else {
+ optimized_ops::Conv(
+ GetTensorData<uint8_t>(input), GetTensorDims(input), input_offset,
+ GetTensorData<uint8_t>(filter), GetTensorDims(filter), filter_offset,
+ GetTensorData<int32_t>(bias), GetTensorDims(bias), params->stride_width,
+ params->stride_height, data->padding.width, data->padding.height,
+ output_offset, data->output_multiplier, data->output_shift,
+ data->output_activation_min, data->output_activation_max,
+ GetTensorData<uint8_t>(output), GetTensorDims(output),
+ GetTensorData<uint8_t>(im2col), GetTensorDims(im2col), gemm_context);
+ }
+}
+
+template <KernelType kernel_type>
+void EvalFloat(TfLiteContext* context, TfLiteNode* node,
+ TfLiteConvParams* params, OpData* data, TfLiteTensor* input,
+ TfLiteTensor* filter, TfLiteTensor* bias, TfLiteTensor* im2col,
+ TfLiteTensor* hwcn_weights, TfLiteTensor* output) {
+ float output_activation_min, output_activation_max;
+ CalculateActivationRangeFloat(params->activation, &output_activation_min,
+ &output_activation_max);
+
+ const float* filter_data;
+ if (data->need_hwcn_weights) {
+ filter_data = GetTensorData<float>(hwcn_weights);
+ } else {
+ filter_data = GetTensorData<float>(filter);
+ }
+
+ if (kernel_type == kReference) {
+ reference_ops::Conv(
+ GetTensorData<float>(input), GetTensorDims(input), filter_data,
+ GetTensorDims(filter), GetTensorData<float>(bias), GetTensorDims(bias),
+ params->stride_width, params->stride_height, data->padding.width,
+ data->padding.height, output_activation_min, output_activation_max,
+ GetTensorData<float>(output), GetTensorDims(output),
+ GetTensorData<float>(im2col), GetTensorDims(im2col));
+ } else {
+ multithreaded_ops::Conv(
+ GetTensorData<float>(input), GetTensorDims(input), filter_data,
+ GetTensorDims(filter), GetTensorData<float>(bias), GetTensorDims(bias),
+ params->stride_width, params->stride_height, data->padding.width,
+ data->padding.height, params->padding, output_activation_min,
+ output_activation_max, GetTensorData<float>(output),
+ GetTensorDims(output), GetTensorData<float>(im2col),
+ GetTensorDims(im2col));
+ }
+}
+
+template <KernelType kernel_type>
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+ auto* params = reinterpret_cast<TfLiteConvParams*>(node->builtin_data);
+ OpData* data = reinterpret_cast<OpData*>(node->user_data);
+
+ TfLiteTensor* output = &context->tensors[node->outputs->data[0]];
+ TfLiteTensor* input = &context->tensors[node->inputs->data[0]];
+ TfLiteTensor* filter = &context->tensors[node->inputs->data[1]];
+ bool hasBias = node->inputs->size == 3;
+ TfLiteTensor* bias =
+ hasBias ? &context->tensors[node->inputs->data[2]] : nullptr;
+ TfLiteTensor* im2col =
+ data->need_im2col
+ ? &context->tensors[node->temporaries->data[data->im2col_index]]
+ : nullptr;
+ TfLiteTensor* hwcn_weights =
+ data->need_hwcn_weights
+ ? &context->tensors[node->temporaries->data[data->hwcn_weights_index]]
+ : nullptr;
+
+ if (data->need_hwcn_weights && !data->have_weights_been_transposed) {
+ TransposeFloatTensor(filter, hwcn_weights);
+ data->have_weights_been_transposed = true;
+ }
+
+ // TODO(aselle): Consider whether float conv and quantized conv should be
+ // separate ops to avoid dispatch overhead here.
+  switch (input->type) {  // Already know in/out types are the same.
+ case kTfLiteFloat32:
+ EvalFloat<kernel_type>(context, node, params, data, input, filter, bias,
+ im2col, hwcn_weights, output);
+ break;
+ case kTfLiteUInt8:
+ EvalQuantized<kernel_type>(context, node, params, data, input, filter,
+ bias, im2col, hwcn_weights, output);
+ break;
+ default:
+ context->ReportError(context, "Type not currently supported.");
+ return kTfLiteError;
+ }
+ return kTfLiteOk;
+}
+
+} // namespace conv
+
+TfLiteRegistration* Register_CONVOLUTION_REF() {
+ static TfLiteRegistration r = {conv::Init, conv::Free, conv::Prepare,
+ conv::Eval<conv::kReference>};
+ return &r;
+}
+
+TfLiteRegistration* Register_CONVOLUTION_GENERIC_OPT() {
+ static TfLiteRegistration r = {conv::Init, conv::Free, conv::Prepare,
+ conv::Eval<conv::kGenericOptimized>};
+ return &r;
+}
+
+TfLiteRegistration* Register_CONVOLUTION_NEON_OPT() {
+ static TfLiteRegistration r = {conv::Init, conv::Free, conv::Prepare,
+ conv::Eval<conv::kNeonOptimized>};
+ return &r;
+}
+
+TfLiteRegistration* Register_CONV_2D() {
+#ifdef USE_NEON
+ return Register_CONVOLUTION_NEON_OPT();
+#else
+ return Register_CONVOLUTION_GENERIC_OPT();
+#endif
+}
+
+} // namespace builtin
+} // namespace ops
+} // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/conv_test.cc b/tensorflow/contrib/lite/kernels/conv_test.cc
new file mode 100644
index 0000000000..18d7a31d59
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/conv_test.cc
@@ -0,0 +1,440 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <cstdarg>
+
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+class BaseConvolutionOpModel : public SingleOpModel {
+ public:
+ // TODO(ahentz): Also test different activation types, bias, padding types,
+ // stride values.
+ BaseConvolutionOpModel(
+ const TensorData& input, const TensorData& filter,
+ const TensorData& output, int stride_width = 2, int stride_height = 2,
+ enum Padding padding = Padding_VALID,
+ enum ActivationFunctionType activation = ActivationFunctionType_NONE) {
+ input_ = AddInput(input);
+ filter_ = AddInput(filter);
+
+ int bias_size = GetShape(filter_)[0];
+ if (input.type == TensorType_FLOAT32) {
+ bias_ = AddInput({TensorType_FLOAT32, {bias_size}});
+ } else {
+ // This is a quantized version. The scale of 'bias' depends on the scales
+ // of input and filter. Supposedly this is correctly set during quantized
+ // training.
+ auto bias_scale = GetScale(input_) * GetScale(filter_);
+ TensorData bias{TensorType_INT32, {bias_size}, 0, 0, bias_scale};
+ bias_ = AddInput(bias);
+ }
+
+ output_ = AddOutput(output);
+ if (input.type != TensorType_FLOAT32) {
+      // The following is required by quantized inference. It is the unit
+      // test's responsibility to make sure the output scale falls into the
+      // correct range.
+ CHECK_LT(GetScale(input_) * GetScale(filter_), GetScale(output_));
+ }
+
+ SetBuiltinOp(BuiltinOperator_CONV_2D, BuiltinOptions_Conv2DOptions,
+ CreateConv2DOptions(builder_, padding, stride_width,
+ stride_height, activation)
+ .Union());
+
+ BuildInterpreter({GetShape(input_), GetShape(filter_), GetShape(bias_)});
+ }
+
+ protected:
+ int input_;
+ int filter_;
+ int bias_;
+ int output_;
+};
+
+class ConvolutionOpModel : public BaseConvolutionOpModel {
+ public:
+ using BaseConvolutionOpModel::BaseConvolutionOpModel;
+
+ void SetFilter(std::initializer_list<float> f) { PopulateTensor(filter_, f); }
+
+ void SetBias(std::initializer_list<float> f) { PopulateTensor(bias_, f); }
+
+ void SetInput(std::initializer_list<float> data) {
+ PopulateTensor(input_, data);
+ }
+
+ std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+};
+
+TEST(ConvolutionOpTest, SimpleTestFloat32) {
+ ConvolutionOpModel m({TensorType_FLOAT32, {2, 2, 4, 1}},
+ {TensorType_FLOAT32, {3, 2, 2, 1}},
+ {TensorType_FLOAT32, {}});
+
+ m.SetInput({
+ // First batch
+ 1, 1, 1, 1, // row = 1
+ 2, 2, 2, 2, // row = 2
+ // Second batch
+ 1, 2, 3, 4, // row = 1
+ 1, 2, 3, 4, // row = 2
+ });
+ m.SetFilter({
+ 1, 2, 3, 4, // first 2x2 filter
+ -1, 1, -1, 1, // second 2x2 filter
+ -1, -1, 1, 1, // third 2x2 filter
+ });
+ m.SetBias({1, 2, 3});
+
+ m.Invoke();
+
+ EXPECT_THAT(m.GetOutput(), ElementsAreArray({
+ 18, 2, 5, // first batch, left
+ 18, 2, 5, // first batch, right
+ 17, 4, 3, // second batch, left
+ 37, 4, 3, // second batch, right
+ }));
+}
+
+TEST(ConvolutionOpTest, SimpleTestFloat32WithAnisotropicStrides) {
+ ConvolutionOpModel m({TensorType_FLOAT32, {1, 3, 6, 1}},
+ {TensorType_FLOAT32, {1, 2, 2, 1}},
+ {TensorType_FLOAT32, {}},
+ /*stride_width=*/3, /*stride_height=*/1);
+ m.SetInput({
+ 3, 2, 1, -1, -2, -3, //
+ 4, 3, 2, -2, -3, -4, //
+ 5, 4, 3, -3, -4, -5, //
+ });
+ m.SetFilter({
+ 1, 2, //
+ 3, 4, //
+ });
+ m.SetBias({-1});
+ m.Invoke();
+ EXPECT_THAT(m.GetOutput(), ElementsAreArray({
+ 30, -24, //
+ 40, -34, //
+ }));
+}
+
+TEST(ConvolutionOpTest, HandCalculatedFloat32) {
+ const int depth = 1;
+ const int image_width = 4;
+ const int image_height = 3;
+ const int image_batch_count = 1;
+ const int filter_size = 3;
+ const int filter_count = 1;
+ const int stride_width = 1;
+ const int stride_height = 1;
+ const Padding padding = Padding_SAME;
+ ConvolutionOpModel m(
+ {TensorType_FLOAT32,
+ {image_batch_count, image_height, image_width, depth}},
+ {TensorType_FLOAT32, {depth, filter_size, filter_size, filter_count}},
+ {TensorType_FLOAT32, {}}, stride_width, stride_height, padding);
+
+ // The image matrix is:
+ // | 1 | 2 | 3 | 4 |
+ // | 5 | 6 | 7 | 8 |
+ // | 9 | 10 | 11 | 12 |
+ m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+ // The filter matrix is:
+ // | 1 | 4 | 7 |
+ // | 2 | 5 | 8 |
+ // | 3 | 6 | 9 |
+ m.SetFilter({1, 4, 7, 2, 5, 8, 3, 6, 9});
+ // No bias for this test.
+ m.SetBias({0});
+
+ m.Invoke();
+ // We're sliding the 3x3 filter across the 3x4 image, with accesses outside
+ // the input set to zero because we're using the 'SAME' padding mode.
+ // The calculations behind the expected output are:
+ // (1*0)+(4*0)+(7*0)+(2*0)+(5*1)+(8*2)+(3*0)+(6*5)+(9*6)=105
+ // (1*0)+(4*0)+(7*0)+(2*1)+(5*2)+(8*3)+(3*5)+(6*6)+(9*7)=150
+ // (1*0)+(4*0)+(7*0)+(2*2)+(5*3)+(8*4)+(3*6)+(6*7)+(9*8)=183
+ // (1*0)+(4*0)+(7*0)+(2*3)+(5*4)+(8*0)+(3*7)+(6*8)+(9*0)=95
+ // (1*0)+(4*1)+(7*2)+(2*0)+(5*5)+(8*6)+(3*0)+(6*9)+(9*10)=235
+ // (1*1)+(4*2)+(7*3)+(2*5)+(5*6)+(8*7)+(3*9)+(6*10)+(9*11)=312
+ // (1*2)+(4*3)+(7*4)+(2*6)+(5*7)+(8*8)+(3*10)+(6*11)+(9*12)=357
+ // (1*3)+(4*4)+(7*0)+(2*7)+(5*8)+(8*0)+(3*11)+(6*12)+(9*0)=178
+ // (1*0)+(4*5)+(7*6)+(2*0)+(5*9)+(8*10)+(3*0)+(6*0)+(9*0)=187
+ // (1*5)+(4*6)+(7*7)+(2*9)+(5*10)+(8*11)+(3*0)+(6*0)+(9*0)=234
+ // (1*6)+(4*7)+(7*8)+(2*10)+(5*11)+(8*12)+(3*0)+(6*0)+(9*0)=261
+ // (1*7)+(4*11)+(7*0)+(2*8)+(5*12)+(8*0)+(3*0)+(6*0)+(9*0)=121
+ // This means we should end up with this matrix:
+ // | 105 | 150 | 183 | 95 |
+ // | 235 | 312 | 357 | 178 |
+ // | 187 | 234 | 261 | 121 |
+ EXPECT_THAT(m.GetOutput(), ElementsAreArray({105, 150, 183, 95, 235, 312, 357,
+ 178, 187, 234, 261, 121}));
+}
+
+TEST(ConvolutionOpTest, HandCalculatedWithBiasFloat32) {
+ const int depth = 1;
+ const int image_width = 4;
+ const int image_height = 3;
+ const int image_batch_count = 1;
+ const int filter_size = 3;
+ const int filter_count = 1;
+ const int stride_width = 1;
+ const int stride_height = 1;
+ const Padding padding = Padding_SAME;
+ ConvolutionOpModel m(
+ {TensorType_FLOAT32,
+ {image_batch_count, image_height, image_width, depth}},
+ {TensorType_FLOAT32, {depth, filter_size, filter_size, filter_count}},
+ {TensorType_FLOAT32, {}}, stride_width, stride_height, padding);
+
+ // The image matrix is:
+ // | 1 | 2 | 3 | 4 |
+ // | 5 | 6 | 7 | 8 |
+ // | 9 | 10 | 11 | 12 |
+ m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+ // The filter matrix is:
+ // | 1 | 4 | 7 |
+ // | 2 | 5 | 8 |
+ // | 3 | 6 | 9 |
+ m.SetFilter({1, 4, 7, 2, 5, 8, 3, 6, 9});
+ // Bias is | 10 |.
+ m.SetBias({10});
+
+ m.Invoke();
+ // We're sliding the 3x3 filter across the 3x4 image, with accesses outside
+ // the input set to zero because we're using the 'SAME' padding mode.
+ // The calculations behind the expected output are:
+ // (1*0)+(4*0)+(7*0)+(2*0)+(5*1)+(8*2)+(3*0)+(6*5)+(9*6)+10=115
+ // (1*0)+(4*0)+(7*0)+(2*1)+(5*2)+(8*3)+(3*5)+(6*6)+(9*7)+10=160
+ // (1*0)+(4*0)+(7*0)+(2*2)+(5*3)+(8*4)+(3*6)+(6*7)+(9*8)+10=193
+ // (1*0)+(4*0)+(7*0)+(2*3)+(5*4)+(8*0)+(3*7)+(6*8)+(9*0)+10=105
+ // (1*0)+(4*1)+(7*2)+(2*0)+(5*5)+(8*6)+(3*0)+(6*9)+(9*10)+10=245
+ // (1*1)+(4*2)+(7*3)+(2*5)+(5*6)+(8*7)+(3*9)+(6*10)+(9*11)+10=322
+ // (1*2)+(4*3)+(7*4)+(2*6)+(5*7)+(8*8)+(3*10)+(6*11)+(9*12)+10=367
+ // (1*3)+(4*4)+(7*0)+(2*7)+(5*8)+(8*0)+(3*11)+(6*12)+(9*0)+10=188
+ // (1*0)+(4*5)+(7*6)+(2*0)+(5*9)+(8*10)+(3*0)+(6*0)+(9*0)+10=197
+ // (1*5)+(4*6)+(7*7)+(2*9)+(5*10)+(8*11)+(3*0)+(6*0)+(9*0)+10=244
+ // (1*6)+(4*7)+(7*8)+(2*10)+(5*11)+(8*12)+(3*0)+(6*0)+(9*0)+10=271
+ // (1*7)+(4*11)+(7*0)+(2*8)+(5*12)+(8*0)+(3*0)+(6*0)+(9*0)+10=131
+ // This means we should end up with this matrix:
+ // | 115 | 160 | 193 | 105 |
+ // | 245 | 322 | 367 | 188 |
+ // | 197 | 244 | 271 | 131 |
+ EXPECT_THAT(m.GetOutput(), ElementsAreArray({115, 160, 193, 105, 245, 322,
+ 367, 188, 197, 244, 271, 131}));
+}
+
+TEST(ConvolutionOpTest, HandCalculatedWithReluFloat32) {
+ const int depth = 1;
+ const int image_width = 4;
+ const int image_height = 3;
+ const int image_batch_count = 1;
+ const int filter_size = 3;
+ const int filter_count = 1;
+ const int stride_width = 1;
+ const int stride_height = 1;
+ const Padding padding = Padding_SAME;
+ ConvolutionOpModel m(
+ {TensorType_FLOAT32,
+ {image_batch_count, image_height, image_width, depth}},
+ {TensorType_FLOAT32, {depth, filter_size, filter_size, filter_count}},
+ {TensorType_FLOAT32, {}}, stride_width, stride_height, padding,
+ ActivationFunctionType_RELU);
+
+ // The image matrix is:
+ // | 1 | 2 | 3 | 4 |
+ // | 5 | 6 | 7 | 8 |
+ // | 9 | 10 | 11 | 12 |
+ m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+ // The filter matrix is:
+ // | 1 | 4 | 7 |
+ // | 2 | 5 | 8 |
+ // | 3 | 6 | 9 |
+ m.SetFilter({1, 4, 7, 2, 5, 8, 3, 6, 9});
+ // Bias is | -200 |.
+ m.SetBias({-200});
+
+ m.Invoke();
+ // We're sliding the 3x3 filter across the 3x4 image, with accesses outside
+ // the input set to zero because we're using the 'SAME' padding mode.
+ // The calculations behind the expected output are:
+ // (1*0)+(4*0)+(7*0)+(2*0)+(5*1)+(8*2)+(3*0)+(6*5)+(9*6)-200=-95
+ // (1*0)+(4*0)+(7*0)+(2*1)+(5*2)+(8*3)+(3*5)+(6*6)+(9*7)-200=-50
+ // (1*0)+(4*0)+(7*0)+(2*2)+(5*3)+(8*4)+(3*6)+(6*7)+(9*8)-200=-17
+ // (1*0)+(4*0)+(7*0)+(2*3)+(5*4)+(8*0)+(3*7)+(6*8)+(9*0)-200=-105
+ // (1*0)+(4*1)+(7*2)+(2*0)+(5*5)+(8*6)+(3*0)+(6*9)+(9*10)-200=35
+ // (1*1)+(4*2)+(7*3)+(2*5)+(5*6)+(8*7)+(3*9)+(6*10)+(9*11)-200=112
+ // (1*2)+(4*3)+(7*4)+(2*6)+(5*7)+(8*8)+(3*10)+(6*11)+(9*12)-200=157
+ // (1*3)+(4*4)+(7*0)+(2*7)+(5*8)+(8*0)+(3*11)+(6*12)+(9*0)-200=-22
+ // (1*0)+(4*5)+(7*6)+(2*0)+(5*9)+(8*10)+(3*0)+(6*0)+(9*0)-200=-13
+ // (1*5)+(4*6)+(7*7)+(2*9)+(5*10)+(8*11)+(3*0)+(6*0)+(9*0)-200=34
+ // (1*6)+(4*7)+(7*8)+(2*10)+(5*11)+(8*12)+(3*0)+(6*0)+(9*0)-200=61
+ // (1*7)+(4*11)+(7*0)+(2*8)+(5*12)+(8*0)+(3*0)+(6*0)+(9*0)-200=-79
+ // All negative values are gated to zero by the Relu activation function.
+ // This means we should end up with this matrix:
+ // | 0 | 0 | 0 | 0 |
+ // | 35 | 112 | 157 | 0 |
+ // | 0 | 34 | 61 | 0 |
+ EXPECT_THAT(m.GetOutput(),
+ ElementsAreArray({0, 0, 0, 0, 35, 112, 157, 0, 0, 34, 61, 0}));
+}
+
+TEST(ConvolutionOpTest, HandCalculatedValidFloat32) {
+ const int depth = 1;
+ const int image_width = 4;
+ const int image_height = 3;
+ const int image_batch_count = 1;
+ const int filter_size = 3;
+ const int filter_count = 1;
+ const int stride_width = 1;
+ const int stride_height = 1;
+ const Padding padding = Padding_VALID;
+ ConvolutionOpModel m(
+ {TensorType_FLOAT32,
+ {image_batch_count, image_height, image_width, depth}},
+ {TensorType_FLOAT32, {depth, filter_size, filter_size, filter_count}},
+ {TensorType_FLOAT32, {}}, stride_width, stride_height, padding);
+
+ // The image matrix is:
+ // | 1 | 2 | 3 | 4 |
+ // | 5 | 6 | 7 | 8 |
+ // | 9 | 10 | 11 | 12 |
+ m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+ // The filter matrix is:
+ // | 1 | 4 | 7 |
+ // | 2 | 5 | 8 |
+ // | 3 | 6 | 9 |
+ m.SetFilter({1, 4, 7, 2, 5, 8, 3, 6, 9});
+ // No bias for this test.
+ m.SetBias({0});
+
+ m.Invoke();
+  // We're sliding the 3x3 filter across the 3x4 image, with no accesses outside
+  // the input because we're using the 'VALID' padding mode, giving a 1x2
+  // output.
+ // The calculations behind the expected output are:
+ // (1*1)+(4*2)+(7*3)+(2*5)+(5*6)+(8*7)+(3*9)+(6*10)+(9*11)=312
+ // (1*2)+(4*3)+(7*4)+(2*6)+(5*7)+(8*8)+(3*10)+(6*11)+(9*12)=357
+ // This means we should end up with this matrix:
+ // | 312 | 357 |
+ EXPECT_THAT(m.GetOutput(), ElementsAreArray({312, 357}));
+}
+
+class QuantizedConvolutionOpModel : public BaseConvolutionOpModel {
+ public:
+ using BaseConvolutionOpModel::BaseConvolutionOpModel;
+
+ void SetInput(std::initializer_list<float> data) {
+ QuantizeAndPopulate<uint8_t>(input_, data);
+ }
+
+ void SetFilter(std::initializer_list<float> data) {
+ QuantizeAndPopulate<uint8_t>(filter_, data);
+ }
+
+ void SetBias(std::initializer_list<float> data) {
+ QuantizeAndPopulate<int32_t>(bias_, data);
+ }
+
+ std::vector<uint8_t> GetOutput() { return ExtractVector<uint8_t>(output_); }
+ std::vector<float> GetDequantizedOutput() {
+ return Dequantize<uint8_t>(ExtractVector<uint8_t>(output_),
+ GetScale(output_), GetZeroPoint(output_));
+ }
+};
+
+// In these tests we set the input and output scales so that the results
+// match exactly the 'non-quantized' version.
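+// With the ranges used here, the input and filter scales are
+// (64 + 63.5) / 255 = 0.5, the output scale is (128 + 127) / 255 = 1.0 and the
+// output zero point is 127, so e.g. a float output of 18 quantizes to
+// 127 + 18 = 145 below.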
+TEST(ConvolutionOpTest, SimpleTestQuantized) {
+ QuantizedConvolutionOpModel m({TensorType_UINT8, {2, 2, 4, 1}, -63.5, 64},
+ {TensorType_UINT8, {3, 2, 2, 1}, -63.5, 64},
+ {TensorType_UINT8, {}, -127, 128});
+ m.SetInput({
+ // First batch
+ 1, 1, 1, 1, // row = 1
+ 2, 2, 2, 2, // row = 2
+ // Second batch
+ 1, 2, 3, 4, // row = 1
+ 1, 2, 3, 4, // row = 2
+ });
+ m.SetFilter({
+ 1, 2, 3, 4, // first 2x2 filter
+ -1, 1, -1, 1, // second 2x2 filter
+ -1, -1, 1, 1, // third 2x2 filter
+ });
+ m.SetBias({1, 2, 3});
+
+ m.Invoke();
+
+ EXPECT_THAT(m.GetDequantizedOutput(),
+ ElementsAreArray(ArrayFloatNear(
+ {
+ 18, 2, 5, // first batch, left
+ 18, 2, 5, // first batch, right
+ 17, 4, 3, // second batch, left
+ 37, 4, 3, // second batch, right
+ },
+ 1e-5)));
+ // For good measure, let's also verify the quantized values:
+ EXPECT_THAT(m.GetOutput(), ElementsAreArray({
+ 145, 129, 132, //
+ 145, 129, 132, //
+ 144, 131, 130, //
+ 164, 131, 130, //
+ }));
+}
+
+TEST(ConvolutionOpTest, SimpleTestQuantizedWithAnisotropicStrides) {
+ QuantizedConvolutionOpModel m({TensorType_UINT8, {1, 3, 6, 1}, -63.5, 64},
+ {TensorType_UINT8, {1, 2, 2, 1}, -63.5, 64},
+ {TensorType_UINT8, {}, -127, 128},
+ /*stride_width=*/3, /*stride_height=*/1);
+ m.SetInput({
+ 3, 2, 1, -1, -2, -3, //
+ 4, 3, 2, -2, -3, -4, //
+ 5, 4, 3, -3, -4, -5, //
+ });
+ m.SetFilter({
+ 1, 2, //
+ 3, 4, //
+ });
+ m.SetBias({-1});
+ m.Invoke();
+ EXPECT_THAT(m.GetDequantizedOutput(), ElementsAreArray(ArrayFloatNear({
+ 30, -24, //
+ 40, -34, //
+ })));
+ EXPECT_THAT(m.GetOutput(), ElementsAreArray({
+ 157, 103, //
+ 167, 93, //
+ }));
+}
+} // namespace
+} // namespace tflite
+
+int main(int argc, char** argv) {
+ // On Linux, add: tflite::LogToStderr();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/depthwise_conv.cc b/tensorflow/contrib/lite/kernels/depthwise_conv.cc
new file mode 100644
index 0000000000..15dbfe08c8
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/depthwise_conv.cc
@@ -0,0 +1,289 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <unistd.h>
+#include <cassert>
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <iostream>
+#include <limits>
+
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_float.h"
+#include "tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8.h"
+#include "tensorflow/contrib/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/contrib/lite/kernels/internal/reference/depthwiseconv_float.h"
+#include "tensorflow/contrib/lite/kernels/internal/reference/depthwiseconv_uint8.h"
+#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
+#include "tensorflow/contrib/lite/kernels/padding.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace depthwise_conv {
+
+constexpr int kInputTensor = 0;
+constexpr int kFilterTensor = 1;
+constexpr int kBiasTensor = 2;
+constexpr int kOutputTensor = 0;
+
+// This file has three implementations of DepthwiseConv.
+enum KernelType {
+ kReference,
+ kGenericOptimized, // Neon-free
+ kNeonOptimized,
+};
+
+struct OpData {
+ TfLitePaddingValues padding;
+ // The scaling factor from input to output (aka the 'real multiplier') can
+  // be represented as a fixed point multiplier plus a left shift.
+ int32_t output_multiplier;
+ int output_shift;
+ // The range of the fused activation layer. For example for kNone and
+ // uint8_t these would be 0 and 255.
+ int32_t output_activation_min;
+ int32_t output_activation_max;
+};
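+// As a rough illustration of that representation (the exact encoding is
+// produced by QuantizeMultiplierSmallerThanOne() below): a real multiplier of
+// 0.375 can be split into a normalized mantissa of 0.75, stored in Q31 as
+// round(0.75 * 2^31), plus a power-of-two shift of one.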
+
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+ // This is a builtin op, so we don't use the contents in 'buffer', if any.
+ // Instead, we allocate a new object to carry information from Prepare() to
+ // Eval().
+ return new OpData;
+}
+
+void Free(TfLiteContext* context, void* buffer) {
+ delete reinterpret_cast<OpData*>(buffer);
+}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+ auto* params =
+ reinterpret_cast<TfLiteDepthwiseConvParams*>(node->builtin_data);
+ OpData* data = reinterpret_cast<OpData*>(node->user_data);
+
+  // TODO(ahentz): we could use GetOptionalInputTensor() here, but we need to
+ // decide whether we are OK with optional tensors being completely absent, as
+ // opposed to having -1 as their index.
+ bool hasBias = NumInputs(node) == 3;
+
+ TF_LITE_ENSURE(context, hasBias || NumInputs(node) == 2);
+ TfLiteTensor* input = GetInput(context, node, kInputTensor);
+ TfLiteTensor* filter = GetInput(context, node, kFilterTensor);
+ TfLiteTensor* bias = nullptr;
+
+ TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+ TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+ TF_LITE_ENSURE_EQ(context, NumDimensions(input), 4);
+ TF_LITE_ENSURE_EQ(context, NumDimensions(filter), 4);
+
+ // The parameter 'depth_multiplier' is redundant, so we check here to make
+ // sure it is consistent with the given dimensions.
+ TF_LITE_ENSURE_EQ(context,
+ params->depth_multiplier * SizeOfDimension(input, 3),
+ SizeOfDimension(filter, 3));
+
+ const TfLiteType data_type = input->type;
+ TF_LITE_ENSURE(context,
+ data_type == kTfLiteFloat32 || data_type == kTfLiteUInt8);
+ TF_LITE_ENSURE_EQ(context, output->type, data_type);
+ TF_LITE_ENSURE_EQ(context, filter->type, data_type);
+
+ if (hasBias) {
+ bias = GetInput(context, node, kBiasTensor);
+ if (data_type == kTfLiteUInt8) {
+ TF_LITE_ENSURE_EQ(context, bias->type, kTfLiteInt32);
+ TF_LITE_ENSURE_EQ(context, bias->params.zero_point, 0);
+ } else {
+ TF_LITE_ENSURE_EQ(context, bias->type, data_type);
+ }
+ TF_LITE_ENSURE_EQ(context, NumDimensions(bias), 1);
+ TF_LITE_ENSURE_EQ(context, SizeOfDimension(filter, 3),
+ SizeOfDimension(bias, 0));
+ }
+
+ int channels_out = SizeOfDimension(filter, 3);
+ int width = SizeOfDimension(input, 2);
+ int height = SizeOfDimension(input, 1);
+ int filter_width = SizeOfDimension(filter, 2);
+ int filter_height = SizeOfDimension(filter, 1);
+ int batches = SizeOfDimension(input, 0);
+
+ // Matching GetWindowedOutputSize in TensorFlow.
+ auto padding = params->padding;
+ auto compute_out_size = [padding](int imageSize, int filterSize,
+ int stride) -> int {
+ return padding == kTfLitePaddingSame
+ ? (imageSize + stride - 1) / stride
+ : padding == kTfLitePaddingValid
+ ? (imageSize - filterSize + stride) / stride
+ : 0;
+ };
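+  // For example, with an image width of 4 and a filter width of 3:
+  // at stride 1, VALID gives (4 - 3 + 1) / 1 = 2 and SAME gives
+  // (4 + 1 - 1) / 1 = 4; at stride 2, VALID gives (4 - 3 + 2) / 2 = 1 and
+  // SAME gives (4 + 2 - 1) / 2 = 2.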
+
+ int out_width = compute_out_size(width, filter_width, params->stride_width);
+ int out_height =
+ compute_out_size(height, filter_height, params->stride_height);
+
+ data->padding.height =
+ ComputePadding(params->stride_height, height, filter_height, out_height);
+ data->padding.width =
+ ComputePadding(params->stride_width, width, filter_width, out_width);
+
+ // Note that quantized inference requires that all tensors have their
+ // parameters set. This is usually done during quantized training.
+ if (data_type != kTfLiteFloat32) {
+ double real_multiplier = 0.0;
+ TF_LITE_ENSURE_STATUS(GetQuantizedConvolutionMultipler(
+ context, input, filter, bias, output, &real_multiplier));
+ QuantizeMultiplierSmallerThanOne(real_multiplier, &data->output_multiplier,
+ &data->output_shift);
+ CalculateActivationRangeUint8(params->activation, output,
+ &data->output_activation_min,
+ &data->output_activation_max);
+ }
+
+ TfLiteIntArray* outputSize = TfLiteIntArrayCreate(4);
+ outputSize->data[0] = batches;
+ outputSize->data[1] = out_height;
+ outputSize->data[2] = out_width;
+ outputSize->data[3] = channels_out;
+ return context->ResizeTensor(context, output, outputSize);
+}
+
+template <KernelType kernel_type>
+void EvalFloat(TfLiteContext* context, TfLiteNode* node,
+ TfLiteDepthwiseConvParams* params, OpData* data,
+ TfLiteTensor* input, TfLiteTensor* filter, TfLiteTensor* bias,
+ TfLiteTensor* output) {
+ float output_activation_min, output_activation_max;
+ CalculateActivationRangeFloat(params->activation, &output_activation_min,
+ &output_activation_max);
+
+ void (*depthwise_conv)(const float*, const Dims<4>&, const float*,
+ const Dims<4>&, const float*, const Dims<4>&, int, int,
+ int, int, int, float, float, float*, const Dims<4>&);
+ if (kernel_type == kReference) {
+ depthwise_conv = &reference_ops::DepthwiseConv;
+ } else {
+ depthwise_conv = &optimized_ops::DepthwiseConv;
+ }
+
+ depthwise_conv(
+ GetTensorData<float>(input), GetTensorDims(input),
+ GetTensorData<float>(filter), GetTensorDims(filter),
+ GetTensorData<float>(bias), GetTensorDims(bias), params->stride_width,
+ params->stride_height, data->padding.width, data->padding.height,
+ params->depth_multiplier, output_activation_min, output_activation_max,
+ GetTensorData<float>(output), GetTensorDims(output));
+}
+
+template <KernelType kernel_type>
+void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
+ TfLiteDepthwiseConvParams* params, OpData* data,
+ TfLiteTensor* input, TfLiteTensor* filter,
+ TfLiteTensor* bias, TfLiteTensor* output) {
+ auto input_offset = -input->params.zero_point;
+ auto filter_offset = -filter->params.zero_point;
+ auto output_offset = output->params.zero_point;
+
+ void (*depthwise_conv)(const uint8*, const Dims<4>&, int32, const uint8*,
+ const Dims<4>&, int32, const int32*, const Dims<4>&,
+ int, int, int, int, int, int32, int32, int, int32,
+ int32, uint8*, const Dims<4>&);
+ if (kernel_type == kReference) {
+ depthwise_conv = &reference_ops::DepthwiseConv;
+ } else {
+ depthwise_conv = &optimized_ops::DepthwiseConv;
+ }
+
+ depthwise_conv(
+ GetTensorData<uint8_t>(input), GetTensorDims(input), input_offset,
+ GetTensorData<uint8_t>(filter), GetTensorDims(filter), filter_offset,
+ GetTensorData<int32_t>(bias), GetTensorDims(bias), params->stride_width,
+ params->stride_height, data->padding.width, data->padding.height,
+ params->depth_multiplier, output_offset, data->output_multiplier,
+ data->output_shift, data->output_activation_min,
+ data->output_activation_max, GetTensorData<uint8_t>(output),
+ GetTensorDims(output));
+}
+
+template <KernelType kernel_type>
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+ auto* params =
+ reinterpret_cast<TfLiteDepthwiseConvParams*>(node->builtin_data);
+ OpData* data = reinterpret_cast<OpData*>(node->user_data);
+
+ TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+ TfLiteTensor* input = GetInput(context, node, kInputTensor);
+ TfLiteTensor* filter = GetInput(context, node, kFilterTensor);
+ TfLiteTensor* bias =
+ (NumInputs(node) == 3) ? GetInput(context, node, kBiasTensor) : nullptr;
+
+ // TODO(aselle): Consider whether float conv and quantized conv should be
+ // separate ops to avoid dispatch overhead here.
+ switch (input->type) { // Already know in/out types are same.
+ case kTfLiteFloat32:
+ EvalFloat<kernel_type>(context, node, params, data, input, filter, bias,
+ output);
+ break;
+ case kTfLiteUInt8:
+ EvalQuantized<kernel_type>(context, node, params, data, input, filter,
+ bias, output);
+ break;
+ default:
+ context->ReportError(context, "Type not currently supported.");
+ return kTfLiteError;
+ }
+ return kTfLiteOk;
+}
+
+} // namespace depthwise_conv
+
+TfLiteRegistration* Register_DEPTHWISE_CONVOLUTION_REF() {
+ static TfLiteRegistration r = {
+ depthwise_conv::Init, depthwise_conv::Free, depthwise_conv::Prepare,
+ depthwise_conv::Eval<depthwise_conv::kReference>};
+ return &r;
+}
+
+TfLiteRegistration* Register_DEPTHWISE_CONVOLUTION_GENERIC_OPT() {
+ static TfLiteRegistration r = {
+ depthwise_conv::Init, depthwise_conv::Free, depthwise_conv::Prepare,
+ depthwise_conv::Eval<depthwise_conv::kGenericOptimized>};
+ return &r;
+}
+
+TfLiteRegistration* Register_DEPTHWISE_CONVOLUTION_NEON_OPT() {
+ static TfLiteRegistration r = {
+ depthwise_conv::Init, depthwise_conv::Free, depthwise_conv::Prepare,
+ depthwise_conv::Eval<depthwise_conv::kNeonOptimized>};
+ return &r;
+}
+
+TfLiteRegistration* Register_DEPTHWISE_CONV_2D() {
+#ifdef USE_NEON
+ return Register_DEPTHWISE_CONVOLUTION_NEON_OPT();
+#else
+ return Register_DEPTHWISE_CONVOLUTION_GENERIC_OPT();
+#endif
+}
+
+} // namespace builtin
+} // namespace ops
+} // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/depthwise_conv_test.cc b/tensorflow/contrib/lite/kernels/depthwise_conv_test.cc
new file mode 100644
index 0000000000..39227b2811
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/depthwise_conv_test.cc
@@ -0,0 +1,186 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <cstdarg>
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+class BaseDepthwiseConvolutionOpModel : public SingleOpModel {
+ public:
+ // TODO(ahentz): Also test different activation types, bias, padding types,
+ // stride values.
+ BaseDepthwiseConvolutionOpModel(const TensorData& input,
+ const TensorData& filter,
+ const TensorData& output) {
+ input_ = AddInput(input);
+ filter_ = AddInput(filter);
+
+ int bias_size = GetShape(filter_)[3];
+ if (input.type == TensorType_FLOAT32) {
+ bias_ = AddInput({TensorType_FLOAT32, {bias_size}});
+ } else {
+ // This is a quantized version. The scale of 'bias' depends on the scales
+ // of input and filter. Supposedly this is correctly set during quantized
+ // training.
+ auto bias_scale = GetScale(input_) * GetScale(filter_);
+ TensorData bias{TensorType_INT32, {bias_size}, 0, 0, bias_scale};
+ bias_ = AddInput(bias);
+ }
+
+ output_ = AddOutput(output);
+ if (input.type != TensorType_FLOAT32) {
+ // The following is required by quantized inference. It is the unittest's
+ // responsibility to make sure the output scale falls into the correct
+ // range.
+ CHECK_LT(GetScale(input_) * GetScale(filter_), GetScale(output_));
+ }
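+    // For instance, in the quantized test below both input and filter use the
+    // range [-63.5, 64]; with the usual (max - min) / 255 uint8 mapping that
+    // is a scale of 0.5 each, so their product, 0.25, stays safely below the
+    // output scale of 1.0 implied by the range [-127, 128].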
+
+ int input_depth = GetShape(input_)[3];
+ int output_depth = GetShape(filter_)[3];
+ int depth_mul = output_depth / input_depth;
+
+ SetBuiltinOp(
+ BuiltinOperator_DEPTHWISE_CONV_2D,
+ BuiltinOptions_DepthwiseConv2DOptions,
+ CreateDepthwiseConv2DOptions(builder_, Padding_VALID, 1, 1, depth_mul,
+ ActivationFunctionType_NONE)
+ .Union());
+
+ BuildInterpreter({GetShape(input_), GetShape(filter_), GetShape(bias_)});
+ }
+
+ protected:
+ int input_;
+ int filter_;
+ int bias_;
+ int output_;
+};
+
+class DepthwiseConvolutionOpModel : public BaseDepthwiseConvolutionOpModel {
+ public:
+ using BaseDepthwiseConvolutionOpModel::BaseDepthwiseConvolutionOpModel;
+
+ void SetFilter(std::initializer_list<float> f) { PopulateTensor(filter_, f); }
+
+ void SetBias(std::initializer_list<float> f) { PopulateTensor(bias_, f); }
+
+ void SetInput(std::initializer_list<float> data) {
+ PopulateTensor(input_, data);
+ }
+
+ std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+};
+
+TEST(DepthwiseConvolutionOpTest, SimpleTest) {
+ DepthwiseConvolutionOpModel m({TensorType_FLOAT32, {1, 3, 2, 2}},
+ {TensorType_FLOAT32, {1, 2, 2, 4}},
+ {TensorType_FLOAT32, {}});
+
+ m.SetInput({
+ 1, 2, 7, 8, // column 1
+ 3, 4, 9, 10, // column 2
+ 5, 6, 11, 12, // column 3
+ });
+ m.SetFilter({
+ 1, 2, 3, 4, //
+ -9, 10, -11, 12, //
+ 5, 6, 7, 8, //
+ 13, -14, 15, -16, //
+ });
+ m.SetBias({1, 2, 3, 4});
+
+ m.Invoke();
+
+ EXPECT_THAT(m.GetOutput(), ElementsAreArray({
+ 71, -34, 99, -20, //
+ 91, -26, 127, -4, //
+ }));
+}
+
+class QuantizedDepthwiseConvolutionOpModel
+ : public BaseDepthwiseConvolutionOpModel {
+ public:
+ using BaseDepthwiseConvolutionOpModel::BaseDepthwiseConvolutionOpModel;
+
+ void SetInput(std::initializer_list<float> data) {
+ QuantizeAndPopulate<uint8_t>(input_, data);
+ }
+
+ void SetFilter(std::initializer_list<float> data) {
+ QuantizeAndPopulate<uint8_t>(filter_, data);
+ }
+
+ void SetBias(std::initializer_list<float> data) {
+ QuantizeAndPopulate<int32_t>(bias_, data);
+ }
+
+ std::vector<uint8_t> GetOutput() { return ExtractVector<uint8_t>(output_); }
+ std::vector<float> GetDequantizedOutput() {
+ return Dequantize<uint8_t>(ExtractVector<uint8_t>(output_),
+ GetScale(output_), GetZeroPoint(output_));
+ }
+};
+
+// In this test we set the input and output scales so that the results match
+// exactly the 'non-quantized' version.
+TEST(QuantizedDepthwiseConvolutionOpTest, SimpleTestQuantized) {
+ QuantizedDepthwiseConvolutionOpModel m(
+ {TensorType_UINT8, {1, 3, 2, 2}, -63.5, 64},
+ {TensorType_UINT8, {1, 2, 2, 4}, -63.5, 64},
+ {TensorType_UINT8, {}, -127, 128});
+
+ m.SetInput({
+ 1, 2, 7, 8, // column 1
+ 3, 4, 9, 10, // column 2
+ 5, 6, 11, 12, // column 3
+ });
+ m.SetFilter({
+ 1, 2, 3, 4, //
+ -9, 10, -11, 12, //
+ 5, 6, 7, 8, //
+ 13, -14, 15, -16, //
+ });
+ m.SetBias({1, 2, 3, 4});
+
+ m.Invoke();
+
+ EXPECT_THAT(m.GetDequantizedOutput(), ElementsAreArray(ArrayFloatNear(
+ {
+ 71, -34, 99, -20, //
+ 91, -26, 127, -4, //
+ },
+ 1e-5)));
+ // For good measure, let's also verify the quantized values:
+ EXPECT_THAT(m.GetOutput(), ElementsAreArray({
+ 198, 93, 226, 107, //
+ 218, 101, 254, 123, //
+ }));
+}
+
+} // namespace
+} // namespace tflite
+
+int main(int argc, char** argv) {
+ // On Linux, add: tflite::LogToStderr();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/embedding_lookup.cc b/tensorflow/contrib/lite/kernels/embedding_lookup.cc
new file mode 100644
index 0000000000..4e8cb396d4
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/embedding_lookup.cc
@@ -0,0 +1,104 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Op that looks up items from a matrix.
+//
+// Input:
+// Tensor[0]: Row number to lookup, dim.size == 1, int32
+// Tensor[1]: 2-dimensional matrix of multi-dimensional items
+// dim.size >= 2, any data type.
+// first dimension is row, second dimension is column.
+//
+// Output:
+// Output.dim[0] == Tensor[0].dim[0], num of lookups
+// Output.dim[1] == Tensor[1].dim[1], num of items per row
+//   Each item in the output is a raw byte copy of the corresponding item in
+//   the input. When indices are out of bounds, the op will not succeed.
+//
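+// For example, with lookup = [1, 0] and a value tensor of shape [3, 2] whose
+// rows are r0, r1 and r2, the output has shape [2, 2] and holds r1 followed
+// by r0.
+//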
+
+#include <unistd.h>
+#include <cassert>
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include <limits>
+
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace embedding_lookup {
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+ TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
+ TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+ TfLiteTensor* lookup = GetInput(context, node, 0);
+ TF_LITE_ENSURE_EQ(context, NumDimensions(lookup), 1);
+ TF_LITE_ENSURE_EQ(context, lookup->type, kTfLiteInt32);
+
+ TfLiteTensor* value = GetInput(context, node, 1);
+ TF_LITE_ENSURE(context, NumDimensions(value) >= 2);
+
+ TfLiteTensor* output = GetOutput(context, node, 0);
+ TfLiteIntArray* outputSize = TfLiteIntArrayCreate(NumDimensions(value));
+
+ outputSize->data[0] = SizeOfDimension(lookup, 0);
+ outputSize->data[1] = SizeOfDimension(value, 1);
+ for (int i = 2; i < NumDimensions(value); i++) {
+ outputSize->data[i] = SizeOfDimension(value, i);
+ }
+ return context->ResizeTensor(context, output, outputSize);
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+ TfLiteTensor* output = GetOutput(context, node, 0);
+ TfLiteTensor* lookup = GetInput(context, node, 0);
+ TfLiteTensor* value = GetInput(context, node, 1);
+
+ const int row_size = SizeOfDimension(value, 0);
+ const int row_bytes = value->bytes / row_size;
+
+ for (int i = 0; i < SizeOfDimension(lookup, 0); i++) {
+ int idx = lookup->data.i32[i];
+ if (idx >= row_size || idx < 0) {
+ context->ReportError(context, "Embedding Lookup: index out of bounds.");
+ return kTfLiteError;
+ } else {
+ memcpy(output->data.raw + i * row_bytes,
+ value->data.raw + idx * row_bytes, row_bytes);
+ }
+ }
+
+ return kTfLiteOk;
+}
+
+} // namespace embedding_lookup
+
+TfLiteRegistration* Register_EMBEDDING_LOOKUP() {
+ static TfLiteRegistration r = {nullptr, nullptr, embedding_lookup::Prepare,
+ embedding_lookup::Eval};
+ return &r;
+}
+
+} // namespace builtin
+} // namespace ops
+} // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/embedding_lookup_sparse.cc b/tensorflow/contrib/lite/kernels/embedding_lookup_sparse.cc
new file mode 100644
index 0000000000..6c770e7f71
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/embedding_lookup_sparse.cc
@@ -0,0 +1,248 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Op that looks up items from a sparse tensor in an embedding matrix.
+// The sparse lookup tensor is represented by three individual tensors: lookup,
+// indices, and dense_shape. The representation assumes that the corresponding
+// dense tensor would satisfy:
+// * dense.shape = dense_shape
+// * dense[tuple(indices[i])] = lookup[i]
+//
+// By convention, indices should be sorted.
+//
+// Options:
+// combiner: The reduction op (SUM, MEAN, SQRTN).
+// * SUM computes the weighted sum of the embedding results.
+// * MEAN is the weighted sum divided by the total weight.
+// * SQRTN is the weighted sum divided by the square root of the sum of the
+// squares of the weights.
+//
+// Input:
+// Tensor[0]: Ids to lookup, dim.size == 1, int32.
+// Tensor[1]: Indices, int32.
+// Tensor[2]: Dense shape, int32.
+// Tensor[3]: Weights to use for aggregation, float.
+// Tensor[4]: Params, a matrix of multi-dimensional items,
+// dim.size >= 2, float.
+//
+// Output:
+// A (dense) tensor representing the combined embeddings for the sparse ids.
+// For each row in the sparse tensor represented by (lookup, indices, shape)
+// the op looks up the embeddings for all ids in that row, multiplies them by
+// the corresponding weight, and combines these embeddings as specified in the
+// last dimension.
+//
+// Output.dim = [l0, ... , ln-1, e1, ..., em]
+// Where dense_shape == [l0, ..., ln] and Tensor[4].dim == [e0, e1, ..., em]
+//
+// For instance, if params is a 10x20 matrix and ids, weights are:
+//
+// [0, 0]: id 1, weight 2.0
+// [0, 1]: id 3, weight 0.5
+// [1, 0]: id 0, weight 1.0
+// [2, 3]: id 1, weight 3.0
+//
+// with combiner=MEAN, then the output will be a (3, 20) tensor where:
+//
+// output[0, :] = (params[1, :] * 2.0 + params[3, :] * 0.5) / (2.0 + 0.5)
+// output[1, :] = (params[0, :] * 1.0) / 1.0
+// output[2, :] = (params[1, :] * 3.0) / 3.0
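+//
+// With combiner=SQRTN the first row would instead be divided by
+// sqrt(2.0^2 + 0.5^2), and with combiner=SUM it would not be divided at all.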
+//
+// When indices are out of bounds, the op will not succeed.
+
+#include <algorithm>
+#include <cmath>
+
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/internal/tensor_utils.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+
+namespace {
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+ TF_LITE_ENSURE_EQ(context, NumInputs(node), 5);
+ TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+ TfLiteTensor* ids = GetInput(context, node, 0);
+ TF_LITE_ENSURE_EQ(context, NumDimensions(ids), 1);
+ TF_LITE_ENSURE_EQ(context, ids->type, kTfLiteInt32);
+
+ TfLiteTensor* indices = GetInput(context, node, 1);
+ TF_LITE_ENSURE_EQ(context, NumDimensions(indices), 2);
+ TF_LITE_ENSURE_EQ(context, indices->type, kTfLiteInt32);
+
+ TfLiteTensor* shape = GetInput(context, node, 2);
+ TF_LITE_ENSURE_EQ(context, NumDimensions(shape), 1);
+ TF_LITE_ENSURE_EQ(context, shape->type, kTfLiteInt32);
+
+ TfLiteTensor* weights = GetInput(context, node, 3);
+ TF_LITE_ENSURE_EQ(context, NumDimensions(weights), 1);
+ TF_LITE_ENSURE_EQ(context, weights->type, kTfLiteFloat32);
+
+ TF_LITE_ENSURE_EQ(context, SizeOfDimension(indices, 0),
+ SizeOfDimension(ids, 0));
+ TF_LITE_ENSURE_EQ(context, SizeOfDimension(indices, 0),
+ SizeOfDimension(weights, 0));
+
+ TfLiteTensor* value = GetInput(context, node, 4);
+ TF_LITE_ENSURE(context, NumDimensions(value) >= 2);
+
+ // Mark the output as a dynamic tensor.
+ TfLiteTensor* output = GetOutput(context, node, 0);
+ TF_LITE_ENSURE_EQ(context, output->type, kTfLiteFloat32);
+ output->allocation_type = kTfLiteDynamic;
+
+ return kTfLiteOk;
+}
+
+void FinalizeAggregation(TfLiteCombinerType combiner, int num_elements,
+ float current_total_weight,
+ float current_squares_weight, int embedding_size,
+ float* output) {
+ if (combiner != kTfLiteCombinerTypeSum && num_elements > 0) {
+ float multiplier = 1.0;
+ switch (combiner) {
+ case kTfLiteCombinerTypeMean:
+ multiplier = current_total_weight;
+ break;
+ case kTfLiteCombinerTypeSqrtn:
+ multiplier = std::sqrt(current_squares_weight);
+ break;
+ default:
+ break;
+ }
+ for (int k = 0; k < embedding_size; k++) {
+ output[k] /= multiplier;
+ }
+ }
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+ auto* params =
+ reinterpret_cast<TfLiteEmbeddingLookupSparseParams*>(node->builtin_data);
+ TfLiteTensor* output = GetOutput(context, node, 0);
+ TfLiteTensor* ids = GetInput(context, node, 0);
+ TfLiteTensor* indices = GetInput(context, node, 1);
+ TfLiteTensor* dense_shape = GetInput(context, node, 2);
+ TfLiteTensor* weights = GetInput(context, node, 3);
+ TfLiteTensor* value = GetInput(context, node, 4);
+
+ const int lookup_rank = SizeOfDimension(indices, 1);
+ const int embedding_rank = NumDimensions(value);
+ const int num_lookups = SizeOfDimension(ids, 0);
+ const int num_rows = SizeOfDimension(value, 0);
+
+ // The last dimension gets replaced by the embedding.
+ const int output_rank = (lookup_rank - 1) + (embedding_rank - 1);
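+  // For example, with dense_shape = [3, 2] (lookup_rank 2) and a value tensor
+  // of shape [4, 3, 2] (embedding_rank 3), the output has rank 3 and shape
+  // [3, 3, 2].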
+
+ // Make sure that the actual dense shape of the sparse tensor represented by
+  // (lookup, indices, dense_shape) is consistent.
+ TF_LITE_ENSURE_EQ(context, SizeOfDimension(dense_shape, 0), lookup_rank);
+
+ // Resize output tensor.
+ TfLiteIntArray* output_shape = TfLiteIntArrayCreate(output_rank);
+ int k = 0;
+ int embedding_size = 1;
+ int lookup_size = 1;
+ for (int i = 0; i < lookup_rank - 1; i++, k++) {
+ const int dim = dense_shape->data.i32[i];
+ lookup_size *= dim;
+ output_shape->data[k] = dim;
+ }
+ for (int i = 1; i < embedding_rank; i++, k++) {
+ const int dim = SizeOfDimension(value, i);
+ embedding_size *= dim;
+ output_shape->data[k] = dim;
+ }
+ TF_LITE_ENSURE_STATUS(context->ResizeTensor(context, output, output_shape));
+ const int output_size = lookup_size * embedding_size;
+ TfLiteTensorRealloc(output_size * sizeof(float), output);
+
+ tensor_utils::ZeroVector(output->data.f, output_size);
+
+ // Keep track of the current bucket for aggregation/combination.
+ int current_output_offset = 0;
+ float current_total_weight = 0.0;
+ float current_squares_weight = 0.0;
+ int num_elements = 0;
+
+ for (int i = 0; i < num_lookups; i++) {
+ int idx = ids->data.i32[i];
+ if (idx >= num_rows || idx < 0) {
+ context->ReportError(context,
+ "Embedding Lookup Sparse: index out of bounds.");
+ return kTfLiteError;
+ }
+
+ // Check where we need to aggregate.
+ const int example_indices_offset = i * lookup_rank;
+ int output_bucket = 0;
+ int stride = 1;
+ for (int k = (lookup_rank - 1) - 1; k >= 0; k--) {
+ output_bucket += indices->data.i32[example_indices_offset + k] * stride;
+ stride *= dense_shape->data.i32[k];
+ }
+ const int output_offset = output_bucket * embedding_size;
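+    // For example, with lookup_rank 2 and dense_shape = [3, 2], an index of
+    // [2, 1] falls into output bucket 2: only the leading lookup_rank - 1
+    // components of the index select the bucket.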
+
+ // If we are in a new aggregation bucket and the combiner is not the sum,
+ // go back and finalize the result of the previous bucket.
+ if (output_offset != current_output_offset) {
+ FinalizeAggregation(params->combiner, num_elements, current_total_weight,
+ current_squares_weight, embedding_size,
+ &output->data.f[current_output_offset]);
+
+ // Track next bucket.
+ num_elements = 0;
+ current_total_weight = 0.0;
+ current_squares_weight = 0.0;
+ current_output_offset = output_offset;
+ }
+
+ // Add element to aggregation.
+ ++num_elements;
+ const int example_embedding_offset = idx * embedding_size;
+ const float w = weights->data.f[i];
+ current_squares_weight += w * w;
+ current_total_weight += w;
+ for (int k = 0; k < embedding_size; k++) {
+ output->data.f[current_output_offset + k] +=
+ (value->data.f[example_embedding_offset + k] * w);
+ }
+ }
+
+ // Finalize last bucket.
+ FinalizeAggregation(params->combiner, num_elements, current_total_weight,
+ current_squares_weight, embedding_size,
+ &output->data.f[current_output_offset]);
+
+ return kTfLiteOk;
+}
+
+} // namespace
+
+TfLiteRegistration* Register_EMBEDDING_LOOKUP_SPARSE() {
+ static TfLiteRegistration r = {nullptr, nullptr, Prepare, Eval};
+ return &r;
+}
+
+} // namespace builtin
+} // namespace ops
+} // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/embedding_lookup_sparse_test.cc b/tensorflow/contrib/lite/kernels/embedding_lookup_sparse_test.cc
new file mode 100644
index 0000000000..69d9c5cc7d
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/embedding_lookup_sparse_test.cc
@@ -0,0 +1,166 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// Unit test for TFLite sparse lookup op.
+
+#include <cmath>
+#include <vector>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+class EmbeddingLookupSparseOpModel : public SingleOpModel {
+ public:
+ EmbeddingLookupSparseOpModel(CombinerType type,
+ std::initializer_list<int> lookup_shape,
+ std::initializer_list<int> indices_shape,
+ std::initializer_list<int> dense_shape_shape,
+ std::initializer_list<int> value_shape) {
+ lookup_ = AddInput(TensorType_INT32);
+ indices_ = AddInput(TensorType_INT32);
+ dense_shape_ = AddInput(TensorType_INT32);
+ weights_ = AddInput(TensorType_FLOAT32);
+ value_ = AddInput(TensorType_FLOAT32);
+ output_ = AddOutput(TensorType_FLOAT32);
+ SetBuiltinOp(BuiltinOperator_EMBEDDING_LOOKUP_SPARSE,
+ BuiltinOptions_EmbeddingLookupSparseOptions,
+ CreateEmbeddingLookupSparseOptions(builder_, type).Union());
+ BuildInterpreter({lookup_shape, indices_shape, dense_shape_shape,
+ lookup_shape, value_shape});
+ }
+
+ void SetInput(std::initializer_list<int> lookup_data,
+ std::initializer_list<int> indices_data,
+ std::initializer_list<int> dense_shape_data,
+ std::initializer_list<float> weights_data) {
+ PopulateTensor(lookup_, lookup_data);
+ PopulateTensor(indices_, indices_data);
+ PopulateTensor(dense_shape_, dense_shape_data);
+ PopulateTensor(weights_, weights_data);
+ }
+
+ void Set3DWeightMatrix(const std::function<float(int, int, int)>& function) {
+ TfLiteTensor* tensor = interpreter_->tensor(value_);
+ int rows = tensor->dims->data[0];
+ int columns = tensor->dims->data[1];
+ int features = tensor->dims->data[2];
+ for (int i = 0; i < rows; i++) {
+ for (int j = 0; j < columns; j++) {
+ for (int k = 0; k < features; k++) {
+ tensor->data.f[(i * columns + j) * features + k] = function(i, j, k);
+ }
+ }
+ }
+ }
+
+ std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+
+ private:
+ int lookup_;
+ int weights_;
+ int indices_;
+ int dense_shape_;
+ int value_;
+ int output_;
+};
+
+TEST(EmbeddingLookupOpTest, SimpleTest) {
+ EmbeddingLookupSparseOpModel m(CombinerType_SUM, {3}, {3, 2}, {2}, {4, 3, 2});
+ m.SetInput({1, 3, 0}, {0, 0, 2, 0, 2, 1}, {3, 2}, {1.0, 2.0, 4.0});
+ m.Set3DWeightMatrix(
+ [](int i, int j, int k) { return i + j / 10.0f + k / 100.0f; });
+ m.Invoke();
+
+ EXPECT_THAT(m.GetOutput(),
+ ElementsAreArray(ArrayFloatNear({
+ 1.00, 1.01, 1.10, 1.11, 1.20, 1.21, // Row 1
+ 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, // -
+ 6.00, 6.06, 6.60, 6.66, 7.20, 7.26, // 2 * Row 3 + 4 * Row 0
+ })));
+}
+
+TEST(EmbeddingLookupOpTest, SimpleTestMean) {
+ EmbeddingLookupSparseOpModel m(CombinerType_MEAN, {3}, {3, 2}, {2},
+ {4, 3, 2});
+ m.SetInput({1, 3, 0}, {0, 0, 2, 0, 2, 1}, {3, 2}, {1.0, 2.0, 4.0});
+ m.Set3DWeightMatrix(
+ [](int i, int j, int k) { return i + j / 10.0f + k / 100.0f; });
+ m.Invoke();
+
+ EXPECT_THAT(m.GetOutput(),
+ ElementsAreArray(ArrayFloatNear({
+ 1.00, 1.01, 1.10, 1.11, 1.20, 1.21, // Row 1
+ 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, // -
+ 1.00, 1.01, 1.10, 1.11, 1.20, 1.21, // 2 * Row 3 + 4 * Row 0
+ })));
+}
+
+TEST(EmbeddingLookupOpTest, SimpleTestSqrtn) {
+ EmbeddingLookupSparseOpModel m(CombinerType_SQRTN, {3}, {3, 2}, {2},
+ {4, 3, 2});
+ m.SetInput({1, 3, 0}, {0, 0, 2, 0, 2, 1}, {3, 2}, {1.0, 2.0, 4.0});
+ m.Set3DWeightMatrix(
+ [](int i, int j, int k) { return i + j / 10.0f + k / 100.0f; });
+ m.Invoke();
+
+ EXPECT_THAT(
+ m.GetOutput(),
+ ElementsAreArray(ArrayFloatNear({
+ 1.00, 1.01, 1.10, 1.11, 1.20, 1.21, // Row 1
+ 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, // -
+ 6.00f / std::sqrt(20.0f), 6.06f / std::sqrt(20.0f),
+ 6.60f / std::sqrt(20.0f), 6.66f / std::sqrt(20.0f),
+ 7.20f / std::sqrt(20.0f),
+          7.26f / std::sqrt(20.0f),  // (2 * Row 3 + 4 * Row 0) / sqrt(20)
+ })));
+}
+
+TEST(EmbeddingLookupOpTest, Indices3DTest) {
+ EmbeddingLookupSparseOpModel m(CombinerType_SUM, {3}, {3, 3}, {3}, {4, 3, 2});
+ m.SetInput({1, 3, 0}, {0, 0, 0, 2, 0, 0, 2, 0, 1}, {3, 2, 2},
+ {1.0, 2.0, 4.0});
+ m.Set3DWeightMatrix(
+ [](int i, int j, int k) { return i + j / 10.0f + k / 100.0f; });
+ m.Invoke();
+
+ EXPECT_THAT(m.GetOutput(),
+ ElementsAreArray(ArrayFloatNear({
+ 1.00, 1.01, 1.10, 1.11, 1.20, 1.21, 0.00, 0.00, 0.00,
+ 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00,
+ 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 6.00, 6.06, 6.60,
+ 6.66, 7.20, 7.26, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00,
+ })));
+}
+
+} // namespace
+} // namespace tflite
+
+int main(int argc, char** argv) {
+#ifdef OS_LINUX
+ tflite::LogToStderr();
+#endif
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/embedding_lookup_test.cc b/tensorflow/contrib/lite/kernels/embedding_lookup_test.cc
new file mode 100644
index 0000000000..8c030b0677
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/embedding_lookup_test.cc
@@ -0,0 +1,94 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// Unit test for TFLite Lookup op.
+
+#include <iomanip>
+#include <vector>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+class EmbeddingLookupOpModel : public SingleOpModel {
+ public:
+ EmbeddingLookupOpModel(std::initializer_list<int> index_shape,
+ std::initializer_list<int> weight_shape) {
+ input_ = AddInput(TensorType_INT32);
+ weight_ = AddInput(TensorType_FLOAT32);
+ output_ = AddOutput(TensorType_FLOAT32);
+ SetBuiltinOp(BuiltinOperator_EMBEDDING_LOOKUP, BuiltinOptions_NONE, 0);
+ BuildInterpreter({index_shape, weight_shape});
+ }
+
+ void SetInput(std::initializer_list<int> data) {
+ PopulateTensor(input_, data);
+ }
+
+ void Set3DWeightMatrix(const std::function<float(int, int, int)>& function) {
+ TfLiteTensor* tensor = interpreter_->tensor(weight_);
+ int rows = tensor->dims->data[0];
+ int columns = tensor->dims->data[1];
+ int features = tensor->dims->data[2];
+ for (int i = 0; i < rows; i++) {
+ for (int j = 0; j < columns; j++) {
+ for (int k = 0; k < features; k++) {
+ tensor->data.f[(i * columns + j) * features + k] = function(i, j, k);
+ }
+ }
+ }
+ }
+
+ std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+
+ private:
+ int input_;
+ int weight_;
+ int output_;
+};
+
+// TODO(ahentz): write more tests that exercise the details of the op, such as
+// lookup errors and variable input shapes.
+TEST(EmbeddingLookupOpTest, SimpleTest) {
+ EmbeddingLookupOpModel m({3}, {3, 2, 4});
+ m.PopulateTensor<int>(0, {1, 0, 2});
+ m.Set3DWeightMatrix(
+ [](int i, int j, int k) { return i + j / 10.0f + k / 100.0f; });
+
+ m.Invoke();
+
+ EXPECT_THAT(m.GetOutput(),
+ ElementsAreArray(ArrayFloatNear({
+ 1.00, 1.01, 1.02, 1.03, 1.10, 1.11, 1.12, 1.13, // Row 1
+ 0.00, 0.01, 0.02, 0.03, 0.10, 0.11, 0.12, 0.13, // Row 0
+ 2.00, 2.01, 2.02, 2.03, 2.10, 2.11, 2.12, 2.13, // Row 2
+ })));
+}
+
+} // namespace
+} // namespace tflite
+
+int main(int argc, char** argv) {
+ // On Linux, add: tflite::LogToStderr();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/fully_connected.cc b/tensorflow/contrib/lite/kernels/fully_connected.cc
new file mode 100644
index 0000000000..a77fe94e49
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/fully_connected.cc
@@ -0,0 +1,307 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <unistd.h>
+#include <cassert>
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <iostream>
+#include <limits>
+
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/activation_functor.h"
+#include "tensorflow/contrib/lite/kernels/gemm_support.h"
+#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
+#include "tensorflow/contrib/lite/kernels/internal/tensor_utils.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace fully_connected {
+
+// This file has four implementations of FullyConnected
+enum KernelType {
+ kReference,
+ kGenericOptimized, // Neon-free
+ kNeonOptimized,
+ kPie, // Used by the PIE team
+};
+
+struct OpData {
+ // The scaling factor from input to output (aka the 'real multiplier') can
+  // be represented as a fixed point multiplier plus a left shift.
+ int32_t output_multiplier;
+ int output_shift;
+ // The range of the fused activation layer. For example for kNone and
+ // uint8_t these would be 0 and 255.
+ int32_t output_activation_min;
+ int32_t output_activation_max;
+};
+
+constexpr int kInputTensor = 0;
+constexpr int kWeightsTensor = 1;
+constexpr int kBiasTensor = 2;
+constexpr int kOutputTensor = 0;
+
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+ // This is a builtin op, so we don't use the contents in 'buffer', if any.
+ // Instead, we allocate a new object to carry information from Prepare() to
+ // Eval().
+ gemm_support::IncrementUsageCounter(context);
+ return new OpData;
+}
+
+void Free(TfLiteContext* context, void* buffer) {
+ gemm_support::DecrementUsageCounter(context);
+ delete reinterpret_cast<OpData*>(buffer);
+}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+ auto* params =
+ reinterpret_cast<TfLiteFullyConnectedParams*>(node->builtin_data);
+ OpData* data = reinterpret_cast<OpData*>(node->user_data);
+
+ // Check we have all the inputs and outputs we need.
+ TF_LITE_ENSURE_EQ(context, node->inputs->size, 3);
+ TF_LITE_ENSURE_EQ(context, node->outputs->size, 1);
+
+ TfLiteTensor* input = GetInput(context, node, kInputTensor);
+ TfLiteTensor* filter = GetInput(context, node, kWeightsTensor);
+ TfLiteTensor* bias = GetOptionalInputTensor(context, node, kBiasTensor);
+ TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+ // Check all the parameters of tensor match within themselves and match the
+ // input configuration.
+ int input_size = 1;
+ for (int i = 0; i < input->dims->size; i++) {
+ input_size *= input->dims->data[i];
+ }
+
+ const int batch_size = input_size / filter->dims->data[1];
+ const int num_units = filter->dims->data[0];
+
+ TF_LITE_ASSERT_EQ(input_size, batch_size * filter->dims->data[1]);
+ if (bias) {
+ TF_LITE_ASSERT_EQ(bias->dims->data[0], num_units);
+ }
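+  // For example, an input with 20 elements (e.g. of shape [2, 10] or
+  // [4, 1, 5, 1]) and a weights tensor of shape [3, 10] yield batch_size = 2
+  // and num_units = 3, so the output will be resized to [2, 3] below.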
+
+ TF_LITE_ENSURE_EQ(context, NumDimensions(filter), 2);
+ TF_LITE_ENSURE_EQ(context, NumDimensions(bias), 1);
+
+ // Note that quantized inference requires that all tensors have their
+ // parameters set. This is usually done during quantized training.
+ TfLiteType data_type = input->type;
+ if (data_type != kTfLiteFloat32) {
+ double real_multiplier = 0.0;
+ TF_LITE_ENSURE_STATUS(GetQuantizedConvolutionMultipler(
+ context, input, filter, bias, output, &real_multiplier));
+ QuantizeMultiplierSmallerThanOne(real_multiplier, &data->output_multiplier,
+ &data->output_shift);
+ CalculateActivationRangeUint8(params->activation, output,
+ &data->output_activation_min,
+ &data->output_activation_max);
+ }
+
+ // Resize output.
+ TfLiteIntArray* output_size_array = TfLiteIntArrayCreate(2);
+ output_size_array->data[0] = batch_size;
+ output_size_array->data[1] = num_units;
+ TF_LITE_ENSURE_OK(context,
+ context->ResizeTensor(context, output, output_size_array));
+ return kTfLiteOk;
+}
+
+TfLiteStatus EvalPie(TfLiteContext* context, TfLiteNode* node,
+ TfLiteFullyConnectedParams* params, OpData* data,
+ TfLiteTensor* input, TfLiteTensor* filter,
+ TfLiteTensor* bias, TfLiteTensor* output) {
+ int total_input_size = 1;
+ for (int i = 0; i < input->dims->size; i++) {
+ total_input_size *= input->dims->data[i];
+ }
+
+ int input_size = filter->dims->data[1];
+ const int batch_size = total_input_size / filter->dims->data[1];
+ const int num_units = filter->dims->data[0];
+
+ // Output = bias if bias tensor exists.
+ if (bias) {
+ tensor_utils::VectorBatchVectorAssign(bias->data.f, num_units, batch_size,
+ output->data.f);
+ } else {
+ tensor_utils::ZeroVector(output->data.f, batch_size * num_units);
+ }
+
+ // Compute output += weight * input
+ tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+ filter->data.f, num_units, input_size, input->data.f, batch_size,
+ output->data.f, /*result_stride=*/1);
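+  // That is, for each batch b and unit u (before the activation below):
+  //   output[b, u] = bias[u] + sum_i filter[u, i] * input[b, i]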
+
+ // Apply activation function
+ tensor_utils::ApplyActivationToVector(output->data.f, batch_size * num_units,
+ params->activation, output->data.f);
+
+ return kTfLiteOk;
+}
+
+#define TF_LITE_MACRO_DISPATCH(macro_name, params, target_namespace) \
+ if (params->activation == kTfLiteActNone) { \
+ macro_name(target_namespace, kNone); \
+ } \
+ if (params->activation == kTfLiteActRelu) { \
+ macro_name(target_namespace, kRelu); \
+ } \
+ if (params->activation == kTfLiteActRelu6) { \
+ macro_name(target_namespace, kRelu6); \
+ }
+
+template <KernelType kernel_type>
+TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
+ TfLiteFullyConnectedParams* params, OpData* data,
+ TfLiteTensor* input, TfLiteTensor* filter,
+ TfLiteTensor* bias, TfLiteTensor* output) {
+ gemmlowp::GemmContext* gemm_context = gemm_support::GetFromContext(context);
+
+ int32_t input_offset = -input->params.zero_point;
+ int32_t filter_offset = -filter->params.zero_point;
+ int32_t output_offset = output->params.zero_point;
+#define TF_LITE_FULLY_CONNECTED(type) \
+ type::FullyConnected( \
+ GetTensorData<uint8_t>(input), GetTensorDims(input), input_offset, \
+ GetTensorData<uint8_t>(filter), GetTensorDims(filter), filter_offset, \
+ GetTensorData<int32_t>(bias), GetTensorDims(bias), output_offset, \
+ data->output_multiplier, data->output_shift, \
+ data->output_activation_min, data->output_activation_max, \
+ GetTensorData<uint8_t>(output), GetTensorDims(output), gemm_context)
+ if (kernel_type == kReference) {
+ TF_LITE_FULLY_CONNECTED(reference_ops);
+ } else if (kernel_type == kPie) {
+ // TODO(ahentz): we don't have a quantized version of the PIE kernels, so
+ // we just defer to the MINI ones.
+ TF_LITE_FULLY_CONNECTED(optimized_ops);
+ } else {
+ TF_LITE_FULLY_CONNECTED(optimized_ops);
+ }
+#undef TF_LITE_FULLY_CONNECTED
+
+ return kTfLiteOk;
+}
+
+template <KernelType kernel_type>
+TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node,
+ TfLiteFullyConnectedParams* params, OpData* data,
+ TfLiteTensor* input, TfLiteTensor* filter,
+ TfLiteTensor* bias, TfLiteTensor* output) {
+ float output_activation_min, output_activation_max;
+ CalculateActivationRangeFloat(params->activation, &output_activation_min,
+ &output_activation_max);
+#define TF_LITE_FULLY_CONNECTED(type) \
+ type::FullyConnected(GetTensorData<float>(input), GetTensorDims(input), \
+ GetTensorData<float>(filter), GetTensorDims(filter), \
+ GetTensorData<float>(bias), GetTensorDims(bias), \
+ output_activation_min, output_activation_max, \
+ GetTensorData<float>(output), GetTensorDims(output))
+ if (kernel_type == kReference) {
+ TF_LITE_FULLY_CONNECTED(reference_ops);
+ } else if (kernel_type == kPie) {
+ return EvalPie(context, node, params, data, input, filter, bias, output);
+ } else {
+ TF_LITE_FULLY_CONNECTED(optimized_ops);
+ }
+#undef TF_LITE_FULLY_CONNECTED
+
+ return kTfLiteOk;
+}
+
+#undef TF_LITE_MACRO_DISPATCH
+
+template <KernelType kernel_type>
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+ auto* params =
+ reinterpret_cast<TfLiteFullyConnectedParams*>(node->builtin_data);
+ OpData* data = reinterpret_cast<OpData*>(node->user_data);
+
+ TfLiteTensor* input = GetInput(context, node, kInputTensor);
+ TfLiteTensor* filter = GetInput(context, node, kWeightsTensor);
+ TfLiteTensor* bias = GetOptionalInputTensor(context, node, kBiasTensor);
+ TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+ switch (input->type) { // Already know in/out types are same.
+ case kTfLiteFloat32:
+ return EvalFloat<kernel_type>(context, node, params, data, input, filter,
+ bias, output);
+ case kTfLiteUInt8:
+ return EvalQuantized<kernel_type>(context, node, params, data, input,
+ filter, bias, output);
+ default:
+ context->ReportError(context, "Type not currently supported.");
+ return kTfLiteError;
+ }
+ return kTfLiteOk;
+}
+
+} // namespace fully_connected
+
+TfLiteRegistration* Register_FULLY_CONNECTED_REF() {
+ static TfLiteRegistration r = {
+ fully_connected::Init, fully_connected::Free, fully_connected::Prepare,
+ fully_connected::Eval<fully_connected::kReference>};
+ return &r;
+}
+
+TfLiteRegistration* Register_FULLY_CONNECTED_NEON_OPT() {
+ static TfLiteRegistration r = {
+ fully_connected::Init, fully_connected::Free, fully_connected::Prepare,
+ fully_connected::Eval<fully_connected::kNeonOptimized>};
+ return &r;
+}
+
+TfLiteRegistration* Register_FULLY_CONNECTED_GENERIC_OPT() {
+ static TfLiteRegistration r = {
+ fully_connected::Init, fully_connected::Free, fully_connected::Prepare,
+ fully_connected::Eval<fully_connected::kGenericOptimized>};
+ return &r;
+}
+
+TfLiteRegistration* Register_FULLY_CONNECTED_PIE() {
+ static TfLiteRegistration r = {fully_connected::Init, fully_connected::Free,
+ fully_connected::Prepare,
+ fully_connected::Eval<fully_connected::kPie>};
+ return &r;
+}
+
+TfLiteRegistration* Register_FULLY_CONNECTED() {
+ // TODO(ahentz): We don't have a dedicated quantized version of the PIE
+  // kernel. For now, the quantized version just defers to the corresponding
+ // optimized MINI kernel. At some point we will allow different libraries to
+ // be built with different kernels, but for now we have to pick one here.
+ return Register_FULLY_CONNECTED_PIE();
+#ifdef USE_NEON
+ return Register_FULLY_CONNECTED_NEON_OPT();
+#else
+ return Register_FULLY_CONNECTED_GENERIC_OPT();
+#endif
+}
+
+} // namespace builtin
+} // namespace ops
+} // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/fully_connected_test.cc b/tensorflow/contrib/lite/kernels/fully_connected_test.cc
new file mode 100644
index 0000000000..112e3f1ba0
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/fully_connected_test.cc
@@ -0,0 +1,377 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// Unit test for TFLite FULLY_CONNECTED op.
+
+#include <iomanip>
+#include <vector>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAre;
+using ::testing::ElementsAreArray;
+
+static float fully_connected_input[] = {
+ 0.503691, 0.196961, 0.521017, 0.554248, 0.288678, 0.792476, 0.561653,
+ 0.462230, 0.650736, 0.163132, 0.029658, 0.411544, 0.470539, 0.572390,
+ 0.538755, 0.212030, 0.264309, 0.193908, 0.777480, 0.745661, 0.423314,
+ 0.470804, 0.175501, 0.492225, 0.192743, 0.540183, 0.372514, 0.446550,
+ 0.498173, 0.126472, 0.132706, 0.001864, 0.323433, 0.653723, 0.556112,
+ 0.612111, 0.446199, 0.117765, 0.074341, 0.096935, 0.280897, 0.103999,
+ 0.508479, 0.751437, 0.676389, 0.047234, 0.963467, 0.940698, 0.241142,
+ 0.740947, 0.686359, 0.664456, 0.211751, 0.861860, 0.156681, 0.404494,
+ 0.402043, 0.529195, 0.851044, 0.900216, 0.655667, 0.983750, 0.902081,
+ 0.979100, 0.637473, 0.458193, 0.591211, 0.083671, 0.575958, 0.665552,
+ 0.180606, 0.856856, 0.769551, 0.689086, 0.608293, 0.445940, 0.736320,
+ 0.571760, 0.386637, 0.977461, 0.312707, 0.072996, 0.641918, 0.524458,
+ 0.934856, 0.798598, 0.928951, 0.336899, 0.327793, 0.779995, 0.237115,
+ 0.983460, 0.763746, 0.139196, 0.962560, 0.401218, 0.597389, 0.553771,
+ 0.484890, 0.173347, 0.219322, 0.665496, 0.030203, 0.988873, 0.354582,
+ 0.638496, 0.434813, 0.090902, 0.210256, 0.821450, 0.068363, 0.522962,
+ 0.894446, 0.710280, 0.047420, 0.829302, 0.508879, 0.976371, 0.166202,
+ 0.836672, 0.756367, 0.403317, 0.820132, 0.520112, 0.542513, 0.782691,
+ 0.921330, 0.139902};
+
+static float fully_connected_golden_output[] = {
+ 0, 0.0732134, 0, 0, 0, 0.280859,
+ 0, 0.128927, 0, 0.0777251, 0, 0.270268,
+ 0.271435, 0.0173503, 0.335465, 0.235562,
+
+ 0, 0.0745866, 0, 0.051611, 0, 0.253876,
+ 0, 0.0814873, 0, 0.104104, 0, 0.248529,
+ 0.264194, 0, 0.302973, 0.166252,
+
+ 0, 0.0170409, 0, 0.0509851, 0, 0.212834,
+ 0, 0.0208326, 0, 0.129932, 0.203978, 0.103428,
+ 0.298051, 0, 0.332233, 0.00445903,
+
+ 0, 0.125246, 0, 0.0735336, 0, 0.0910256,
+ 0, 0, 0, 0.18933, 0.378111, 0.0712443,
+ 0.277298, 0.0123414, 0.267454, 0,
+
+ 0, 0.14687, 0, 0.155495, 0.0300215, 0.147256,
+ 0, 0, 0, 0.156412, 0.434914, 0.0461529,
+ 0.246508, 0, 0.363138, 0,
+
+ 0, 0, 0, 0.0212949, 0, 0.301708,
+ 0, 0.35497, 0, 0.406223, 0.0260211, 0.049195,
+ 0.197161, 0, 0.37316, 0,
+
+ 0, 0.221783, 0, 0, 0.0116515, 0.281945,
+ 0, 0, 0, 0, 0.285626, 0.181773,
+ 0.296401, 0.170452, 0.367135, 0.142597,
+
+ 0, 0, 0, 0, 0, 0.418886,
+ 0, 0.291063, 0, 0.227541, 0.0424759, 0.27589,
+ 0.398286, 0.177146, 0.40359, 0.121452,
+
+ 0, 0.0834884, 0, 0, 0, 0.287441,
+ 0, 0.0046838, 0, 0.0122087, 0, 0.217376,
+ 0.140183, 0.0948412, 0.436677, 0.0589876,
+
+ 0, 0.0289969, 0, 0.0921397, 0, 0.396802,
+ 0, 0.0126157, 0, 0.0968433, 0, 0.172271,
+ 0.173295, 0.0664741, 0.53645, 0.00915603,
+
+ 0, 0, 0, 0, 0, 0.147942,
+ 0, 0.263795, 0, 0.39782, 0, 0.382435,
+ 0.561072, 0.0579847, 0.145712, 0.13508,
+
+ 0, 0, 0, 0.16382, 0, 0.322294,
+ 0, 0.163798, 0, 0.405211, 0.367953, 0.076852,
+ 0.342473, 0.0834118, 0.377537, 0,
+
+ 0, 0.206, 0, 0, 0, 0.375769,
+ 0, 0, 0, 0, 0, 0.125165,
+ 0, 0.105591, 0.52055, 0.0536445,
+
+ 0, 0.259261, 0, 0, 0, 0.247707,
+ 0, 0, 0, 0, 0, 0.215862,
+ 0.149153, 0.224678, 0.359519, 0.129419,
+
+ 0, 0.17611, 0, 0.280895, 0, 0.576484,
+ 0, 0.000418848, 0, 0, 0, 0.151112,
+ 0.211902, 0, 0.566341, 0.106305,
+
+ 0, 0.0246284, 0, 0, 0, 0.196267,
+ 0, 0.0248624, 0, 0.265635, 0, 0.436199,
+ 0.408079, 0.134514, 0.328489, 0.411368};
+
+class BaseFullyConnectedOpModel : public SingleOpModel {
+ public:
+ // TODO(ahentz): test different activation types too.
+ BaseFullyConnectedOpModel(int units, int batches, const TensorData& input,
+ const TensorData& output = {TensorType_FLOAT32})
+ : batches_(batches), units_(units) {
+ int total_input_size = 1;
+ for (int i = 0; i < input.shape.size(); ++i) {
+ total_input_size *= input.shape[i];
+ }
+ input_size_ = total_input_size / batches_;
+
+ input_ = AddInput(input);
+ weights_ =
+ AddInput({input.type, {units_, input_size_}, input.min, input.max});
+
+ if (input.type == TensorType_FLOAT32) {
+ bias_ = AddInput({TensorType_FLOAT32, {units_}});
+ } else {
+ // This is a quantized version. The scale of 'bias' depends on the scales
+ // of input and filter. Supposedly this is correctly set during quantized
+ // training.
+ auto bias_scale = GetScale(input_) * GetScale(weights_);
+ TensorData bias{TensorType_INT32, {units_}, 0, 0, bias_scale};
+ bias_ = AddInput(bias);
+ }
+
+ output_ = AddOutput(output);
+
+ SetBuiltinOp(
+ BuiltinOperator_FULLY_CONNECTED, BuiltinOptions_FullyConnectedOptions,
+ CreateFullyConnectedOptions(builder_, ActivationFunctionType_RELU)
+ .Union());
+ BuildInterpreter({GetShape(input_), GetShape(weights_), GetShape(bias_)});
+ }
+
+ int input_size() { return input_size_; }
+ int num_units() { return units_; }
+ int num_batches() { return batches_; }
+
+ protected:
+ int input_;
+ int weights_;
+ int bias_;
+ int output_;
+
+ int batches_;
+ int units_;
+ int input_size_;
+};
+
+class FloatFullyConnectedOpModel : public BaseFullyConnectedOpModel {
+ public:
+ using BaseFullyConnectedOpModel::BaseFullyConnectedOpModel;
+
+ void SetBias(std::initializer_list<float> f) { PopulateTensor(bias_, f); }
+
+ void SetWeights(std::initializer_list<float> f) {
+ PopulateTensor(weights_, f);
+ }
+
+ void SetInput(std::initializer_list<float> data) {
+ PopulateTensor(input_, data);
+ }
+ void SetInput(int offset, float* begin, float* end) {
+ PopulateTensor(input_, offset, begin, end);
+ }
+
+ std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+};
+
+class QuantizedFullyConnectedOpModel : public BaseFullyConnectedOpModel {
+ public:
+ using BaseFullyConnectedOpModel::BaseFullyConnectedOpModel;
+
+ void SetBias(std::initializer_list<float> data) {
+ QuantizeAndPopulate<int32_t>(bias_, data);
+ }
+ void SetWeights(std::initializer_list<float> data) {
+ QuantizeAndPopulate<uint8_t>(weights_, data);
+ }
+ void SetInput(std::initializer_list<float> data) {
+ QuantizeAndPopulate<uint8_t>(input_, data);
+ }
+
+ std::vector<uint8_t> GetOutput() { return ExtractVector<uint8_t>(output_); }
+ std::vector<float> GetDequantizedOutput() {
+ return Dequantize<uint8_t>(ExtractVector<uint8_t>(output_),
+ GetScale(output_), GetZeroPoint(output_));
+ }
+};
+
+// TODO(ahentz): add more small tests like this one, focused on making sure the
+// calculations are correct.
+TEST(FullyConnectedOpTest, SimpleTest) {
+ FloatFullyConnectedOpModel m(3, 2, {TensorType_FLOAT32, {2, 10}});
+ m.SetWeights({
+ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 0
+ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 1
+ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 2
+ });
+ m.SetBias({1, 2, 3});
+
+ m.SetInput({
+ 1, 2, 3, 4, 5, 6, 7, 8, -9, -10, // b = 0
+ 1, 2, 3, 4, 5, 6, 7, -8, 9, -10, // b = 1
+ });
+
+ m.Invoke();
+
+ EXPECT_THAT(m.GetOutput(), ElementsAre(24, 25, 26, 58, 59, 60));
+}
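+
+// For reference, each float output above is ReLU(dot(input, weights) + bias):
+// batch 0's dot product is 1*1 + 2*2 + ... + 8*8 - 9*9 - 10*10 = 23, so the
+// three units produce 24, 25 and 26 once the biases {1, 2, 3} are added, and
+// batch 1's dot product of 57 gives 58, 59 and 60. The quantized tests below
+// expect the same values after dequantization.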
+
+TEST(FullyConnectedOpTest, SimpleTestQuantized) {
+ QuantizedFullyConnectedOpModel m(
+ 3, 2,
+ /*input=*/{TensorType_UINT8, {2, 10}, -63.5, 64},
+ /*output=*/{TensorType_UINT8, {}, -127, 128});
+
+ // The ranges here are chosen so that input_product_scale < output_scale,
+ // which the quantized kernel requires.
+ m.SetWeights({
+ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 0
+ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 1
+ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 2
+ });
+ m.SetBias({1, 2, 3});
+
+ m.SetInput({
+ 1, 2, 3, 4, 5, 6, 7, 8, -9, -10, // b = 0
+ 1, 2, 3, 4, 5, 6, 7, -8, 9, -10, // b = 1
+ });
+
+ m.Invoke();
+
+ EXPECT_THAT(m.GetDequantizedOutput(), ElementsAreArray(ArrayFloatNear({
+ 24, 25, 26, //
+ 58, 59, 60, //
+ })));
+ EXPECT_THAT(m.GetOutput(), ElementsAre(151, 152, 153, 185, 186, 187));
+}
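+
+// A note on the raw uint8 expectations above: with an output range of
+// [-127, 128] the quantization scale is (128 + 127) / 255 = 1 and the zero
+// point is 127, so the float results 24..26 and 58..60 map to 151..153 and
+// 185..187.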
+
+TEST(FullyConnectedOpTest, SimpleTest4DInput) {
+ // Note that the first dimension does not need to be the number of batches.
+ // All that matters is that the input can be evenly split into batches; here
+ // that means the flattened input size must be a multiple of 2.
+ FloatFullyConnectedOpModel m(/*units=*/3,
+ /*batches=*/2,
+ /*input=*/{TensorType_FLOAT32, {4, 1, 5, 1}});
+ m.SetWeights({
+ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 0
+ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 1
+ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 2
+ });
+ m.SetBias({1, 2, 3});
+
+ m.SetInput({
+ 1, 2, 3, 4, 5, 6, 7, 8, -9, -10, // first batch
+ 1, 2, 3, 4, 5, 6, 7, -8, 9, -10, // second batch
+ });
+
+ m.Invoke();
+
+ EXPECT_THAT(m.GetOutput(), ElementsAreArray({
+ 24, 25, 26, // first batch
+ 58, 59, 60, // second batch
+ }));
+}
+
+TEST(FullyConnectedOpTest, SimpleTest4dInputQuantized) {
+ QuantizedFullyConnectedOpModel m(
+ 3, 2,
+ /*input=*/{TensorType_UINT8, {4, 1, 5, 1}, -63.5, 64},
+ /*output=*/{TensorType_UINT8, {}, -127, 128});
+
+ // The ranges here are chosen so that input_product_scale < output_scale,
+ // which the quantized kernel requires.
+ m.SetWeights({
+ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 0
+ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 1
+ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 2
+ });
+ m.SetBias({1, 2, 3});
+
+ m.SetInput({
+ 1, 2, 3, 4, 5, 6, 7, 8, -9, -10, // b = 0
+ 1, 2, 3, 4, 5, 6, 7, -8, 9, -10, // b = 1
+ });
+
+ m.Invoke();
+
+ EXPECT_THAT(m.GetDequantizedOutput(), ElementsAreArray(ArrayFloatNear({
+ 24, 25, 26, //
+ 58, 59, 60, //
+ })));
+ EXPECT_THAT(m.GetOutput(), ElementsAre(151, 152, 153, 185, 186, 187));
+}
+
+// TODO(ahentz): Reconsider this test. Having arbitrary weights makes it hard
+// to debug errors and doesn't necessarily test all the important details.
+TEST(FullyConnectedOpTest, BlackBoxTest) {
+ FloatFullyConnectedOpModel m(16, 2, {TensorType_FLOAT32, {2, 8}});
+ m.SetWeights(
+ {0.091327, 0.103366, -0.316505, -0.083120, 0.149366, -0.196636,
+ -0.123672, 0.062800, 0.063031, 0.191670, -0.062001, -0.061504,
+ -0.275581, 0.059388, -0.118497, -0.079224, 0.109758, 0.008307,
+ -0.062657, -0.060962, -0.049782, -0.106719, -0.319482, -0.103650,
+ 0.266455, 0.051517, -0.123448, 0.322464, 0.043282, -0.173782,
+ -0.190381, 0.002013, 0.096086, 0.131157, 0.031164, 0.100638,
+ -0.312191, -0.080923, -0.101318, -0.116614, 0.142238, 0.086540,
+ -0.139154, 0.174268, -0.073161, 0.080072, 0.006874, 0.229382,
+ -0.104321, -0.176035, -0.208587, -0.001019, -0.162032, 0.080824,
+ -0.025021, 0.074460, -0.252595, -0.161750, -0.136403, 0.008308,
+ 0.005710, 0.096600, 0.289839, 0.218816, -0.304651, -0.070958,
+ 0.054598, 0.147113, -0.139112, -0.072798, -0.163335, -0.167863,
+ -0.128762, -0.035780, 0.117262, 0.017177, 0.263335, -0.176612,
+ 0.262961, -0.093654, -0.339283, 0.333071, 0.180827, 0.287583,
+ 0.066350, -0.197947, -0.114449, -0.236035, 0.103532, -0.034284,
+ 0.093299, -0.145361, 0.054001, 0.250570, 0.157010, -0.143480,
+ -0.139061, -0.048873, 0.067557, 0.139038, 0.324106, 0.227041,
+ 0.037793, -0.225747, -0.241619, 0.357835, 0.135762, -0.306764,
+ -0.125982, 0.091916, 0.266587, 0.030135, 0.265148, 0.141627,
+ 0.020120, 0.083815, -0.124556, -0.100124, -0.048159, 0.181172,
+ 0.302309, -0.041084, 0.146334, -0.061511, -0.232605, 0.281324,
+ 0.145408, -0.221897});
+ m.SetBias({-0.160594, 0.205770, -0.078307, -0.077984, 0.001937, 0.015860,
+ 0.036810, 0.012346, 0.001028, 0.038551, 0.075415, 0.020804,
+ 0.048478, -0.032270, 0.175688, -0.085662});
+
+ const int input_sequence_size = sizeof(fully_connected_input) /
+ sizeof(float) /
+ (m.input_size() * m.num_batches());
+ for (int i = 0; i < input_sequence_size; i++) {
+ // TODO(ahentz): This is what the original test was doing: two equal
+ // batches per invocation. We could instead use two different batches.
+ float* batch_start = fully_connected_input + i * m.input_size();
+ float* batch_end = batch_start + m.input_size();
+ m.SetInput(0, batch_start, batch_end);
+ m.SetInput(m.input_size(), batch_start, batch_end);
+
+ m.Invoke();
+
+ float* golden_start = fully_connected_golden_output + i * m.num_units();
+ float* golden_end = golden_start + m.num_units();
+ std::vector<float> expected;
+ expected.insert(expected.end(), golden_start, golden_end);
+ expected.insert(expected.end(), golden_start, golden_end);
+
+ EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear(expected)));
+ }
+}
+
+} // namespace
+} // namespace tflite
+
+int main(int argc, char** argv) {
+ // Route TF Lite logging to stderr so failures in these tests are visible.
+ tflite::LogToStderr();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/gemm_support.cc b/tensorflow/contrib/lite/kernels/gemm_support.cc
new file mode 100644
index 0000000000..eb2b0aacf7
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/gemm_support.cc
@@ -0,0 +1,68 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/kernels/gemm_support.h"
+
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace gemm_support {
+
+struct RefCountedGemmContext {
+ gemmlowp::GemmContext* gemm_context_ = nullptr;
+ int num_references_ = 0;
+};
+
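+// A single GemmContext is shared by all ops attached to a TfLiteContext. It
+// is created lazily on the first IncrementUsageCounter() call and destroyed
+// once every user has called DecrementUsageCounter().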
+void IncrementUsageCounter(TfLiteContext* context) {
+ auto* ptr = reinterpret_cast<RefCountedGemmContext*>(context->gemm_context);
+ if (ptr == nullptr) {
+ ptr = new RefCountedGemmContext;
+ ptr->gemm_context_ = new gemmlowp::GemmContext();
+ ptr->num_references_ = 0;
+ context->gemm_context = ptr;
+ }
+ ptr->num_references_++;
+}
+
+void DecrementUsageCounter(TfLiteContext* context) {
+ auto* ptr = reinterpret_cast<RefCountedGemmContext*>(context->gemm_context);
+ if (ptr == nullptr) {
+ TF_LITE_FATAL(
+ "Call to DecrementUsageCounter() not preceded by "
+ "IncrementUsageCounter()");
+ }
+ if (--ptr->num_references_ == 0) {
+ delete ptr->gemm_context_;
+ delete ptr;
+ context->gemm_context = nullptr;
+ }
+}
+
+gemmlowp::GemmContext* GetFromContext(TfLiteContext* context) {
+ auto* ptr = reinterpret_cast<RefCountedGemmContext*>(context->gemm_context);
+ if (ptr == nullptr) {
+ TF_LITE_FATAL(
+ "Call to GetFromContext() not preceded by IncrementUsageCounter()");
+ }
+ return ptr->gemm_context_;
+}
+
+void SetMaxNumThreads(TfLiteContext* context, int num_threads) {
+ IncrementUsageCounter(context);
+ GetFromContext(context)->set_max_num_threads(num_threads);
+ DecrementUsageCounter(context);
+}
+
+} // namespace gemm_support
+} // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/gemm_support.h b/tensorflow/contrib/lite/kernels/gemm_support.h
new file mode 100644
index 0000000000..b531959ffb
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/gemm_support.h
@@ -0,0 +1,54 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_GEMM_SUPPORT_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_GEMM_SUPPORT_H_
+
+#include "public/gemmlowp.h"
+#include "tensorflow/contrib/lite/context.h"
+
+namespace tflite {
+namespace gemm_support {
+
+// Returns the GemmContext stored in 'context', allowing multiple ops to
+// share a single object, as long as they share a TfLiteContext. The caller
+// must ensure that this is called between IncrementUsageCounter() and
+// DecrementUsageCounter(). For example, in the implementation of an op:
+// void* Init(TfLiteContext* context, const char*, size_t) {
+// gemm_support::IncrementUsageCounter(context);
+// return nullptr;
+// }
+// void Free(TfLiteContext* context, void*) {
+// gemm_support::DecrementUsageCounter(context);
+// }
+// TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+// auto* gemm_context = gemm_support::GetFromContext(context);
+// }
+gemmlowp::GemmContext* GetFromContext(TfLiteContext* context);
+
+// Let the framework know that the GemmContext stored in 'context' will be used
+// by an op. If necessary a new GemmContext is created and placed in 'context'.
+void IncrementUsageCounter(TfLiteContext* context);
+
+// Let the framework know that the op stopped using the GemmContext stored in
+// 'context'. If there are no more usages the GemmContext will be deleted.
+void DecrementUsageCounter(TfLiteContext* context);
+
+// Sets the maximum number of threads available for gemmlowp operations.
+void SetMaxNumThreads(TfLiteContext* context, int num_threads);
+
+} // namespace gemm_support
+} // namespace tflite
+
+#endif // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_GEMM_SUPPORT_H_
diff --git a/tensorflow/contrib/lite/kernels/hashtable_lookup.cc b/tensorflow/contrib/lite/kernels/hashtable_lookup.cc
new file mode 100644
index 0000000000..3b82601d11
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/hashtable_lookup.cc
@@ -0,0 +1,155 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Op that looks up items from a hashtable.
+//
+// Input:
+// Tensor[0]: Hash key to lookup, dim.size == 1, int32
+// Tensor[1]: Key of hashtable, dim.size == 1, int32
+// *MUST* be sorted in ascending order.
+// Tensor[2]: Value of hashtable, dim.size >= 1
+// Tensor[1].Dim[0] == Tensor[2].Dim[0]
+//
+// Output:
+// Output[0].dim[0] == Tensor[0].dim[0], num of lookups
+// Each item in the output is a raw byte copy of the corresponding row of
+// the value tensor. When a key is not found in the hashtable, the returned
+// bytes are all 0s.
+//
+// Output[1].dim = { Tensor[0].dim[0] }, num of lookups
+// Each item indicates whether the corresponding lookup has a returned value.
+// 0 for missing key, 1 for found key.
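+//
+// Example (mirrored by the unit tests): with lookup = [1234, -292, -11, 0]
+// and key = [-11, 0, 1234], the op copies the value rows for keys 1234, -11
+// and 0 into the output, fills the row for the missing key -292 with zeros,
+// and returns hits = [1, 0, 1, 1].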
+
+#include <unistd.h>
+#include <cassert>
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include <limits>
+
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
+#include "tensorflow/contrib/lite/string_util.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+
+namespace {
+
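+// bsearch() comparator for the int32 key tensor; it relies on the keys being
+// sorted in ascending order, as the op's contract above requires.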
+int greater(const void* a, const void* b) {
+ return *static_cast<const int*>(a) - *static_cast<const int*>(b);
+}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+ TF_LITE_ENSURE_EQ(context, NumInputs(node), 3);
+ TF_LITE_ENSURE_EQ(context, NumOutputs(node), 2);
+
+ TfLiteTensor* lookup = GetInput(context, node, 0);
+ TF_LITE_ENSURE_EQ(context, NumDimensions(lookup), 1);
+ TF_LITE_ENSURE_EQ(context, lookup->type, kTfLiteInt32);
+
+ TfLiteTensor* key = GetInput(context, node, 1);
+ TF_LITE_ENSURE_EQ(context, NumDimensions(key), 1);
+ TF_LITE_ENSURE_EQ(context, key->type, kTfLiteInt32);
+
+ TfLiteTensor* value = GetInput(context, node, 2);
+ TF_LITE_ENSURE(context, NumDimensions(value) >= 1);
+ TF_LITE_ENSURE_EQ(context, SizeOfDimension(key, 0),
+ SizeOfDimension(value, 0));
+ if (value->type == kTfLiteString) {
+ TF_LITE_ENSURE_EQ(context, NumDimensions(value), 1);
+ }
+
+ TfLiteTensor* hits = GetOutput(context, node, 1);
+ TF_LITE_ENSURE_EQ(context, hits->type, kTfLiteUInt8);
+ TfLiteIntArray* hitSize = TfLiteIntArrayCreate(1);
+ hitSize->data[0] = SizeOfDimension(lookup, 0);
+
+ TfLiteTensor* output = GetOutput(context, node, 0);
+ TF_LITE_ENSURE_EQ(context, value->type, output->type);
+
+ TfLiteStatus status = kTfLiteOk;
+ if (output->type != kTfLiteString) {
+ TfLiteIntArray* outputSize = TfLiteIntArrayCreate(NumDimensions(value));
+ outputSize->data[0] = SizeOfDimension(lookup, 0);
+ for (int i = 1; i < NumDimensions(value); i++) {
+ outputSize->data[i] = SizeOfDimension(value, i);
+ }
+ status = context->ResizeTensor(context, output, outputSize);
+ }
+ if (context->ResizeTensor(context, hits, hitSize) == kTfLiteError) {
+ status = kTfLiteError;
+ }
+ return status;
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+ TfLiteTensor* output = GetOutput(context, node, 0);
+ TfLiteTensor* hits = GetOutput(context, node, 1);
+ TfLiteTensor* lookup = GetInput(context, node, 0);
+ TfLiteTensor* key = GetInput(context, node, 1);
+ TfLiteTensor* value = GetInput(context, node, 2);
+
+ const int num_rows = SizeOfDimension(value, 0);
+ const int row_bytes = value->bytes / num_rows;
+ void* pointer = nullptr;
+ DynamicBuffer buf;
+
+ for (int i = 0; i < SizeOfDimension(lookup, 0); i++) {
+ int idx = -1;
+ pointer = bsearch(&(lookup->data.i32[i]), key->data.i32, num_rows,
+ sizeof(int32_t), greater);
+ if (pointer != nullptr) {
+ idx = (reinterpret_cast<char*>(pointer) - (key->data.raw)) /
+ sizeof(int32_t);
+ }
+
+ if (idx >= num_rows || idx < 0) {
+ if (output->type == kTfLiteString) {
+ buf.AddString(nullptr, 0);
+ } else {
+ memset(output->data.raw + i * row_bytes, 0, row_bytes);
+ }
+ hits->data.uint8[i] = 0;
+ } else {
+ if (output->type == kTfLiteString) {
+ buf.AddString(GetString(value, idx));
+ } else {
+ memcpy(output->data.raw + i * row_bytes,
+ value->data.raw + idx * row_bytes, row_bytes);
+ }
+ hits->data.uint8[i] = 1;
+ }
+ }
+ if (output->type == kTfLiteString) {
+ buf.WriteToTensor(output);
+ }
+
+ return kTfLiteOk;
+}
+} // namespace
+
+TfLiteRegistration* Register_HASHTABLE_LOOKUP() {
+ static TfLiteRegistration r = {nullptr, nullptr, Prepare, Eval};
+ return &r;
+}
+
+} // namespace builtin
+} // namespace ops
+} // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/hashtable_lookup_test.cc b/tensorflow/contrib/lite/kernels/hashtable_lookup_test.cc
new file mode 100644
index 0000000000..916a23225e
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/hashtable_lookup_test.cc
@@ -0,0 +1,176 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// Unit test for TFLite Lookup op.
+
+#include <iomanip>
+#include <vector>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/contrib/lite/string_util.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+class HashtableLookupOpModel : public SingleOpModel {
+ public:
+ HashtableLookupOpModel(std::initializer_list<int> lookup_shape,
+ std::initializer_list<int> key_shape,
+ std::initializer_list<int> value_shape,
+ TensorType type) {
+ lookup_ = AddInput(TensorType_INT32);
+ key_ = AddInput(TensorType_INT32);
+ value_ = AddInput(type);
+ output_ = AddOutput(type);
+ hit_ = AddOutput(TensorType_UINT8);
+ SetBuiltinOp(BuiltinOperator_HASHTABLE_LOOKUP, BuiltinOptions_NONE, 0);
+ BuildInterpreter({lookup_shape, key_shape, value_shape});
+ }
+
+ void SetLookup(std::initializer_list<int> data) {
+ PopulateTensor<int>(lookup_, data);
+ }
+
+ void SetHashtableKey(std::initializer_list<int> data) {
+ PopulateTensor<int>(key_, data);
+ }
+
+ void SetHashtableValue(const std::vector<string>& content) {
+ PopulateStringTensor(value_, content);
+ }
+
+ void SetHashtableValue(const std::function<float(int)>& function) {
+ TfLiteTensor* tensor = interpreter_->tensor(value_);
+ int rows = tensor->dims->data[0];
+ for (int i = 0; i < rows; i++) {
+ tensor->data.f[i] = function(i);
+ }
+ }
+
+ void SetHashtableValue(const std::function<float(int, int)>& function) {
+ TfLiteTensor* tensor = interpreter_->tensor(value_);
+ int rows = tensor->dims->data[0];
+ int features = tensor->dims->data[1];
+ for (int i = 0; i < rows; i++) {
+ for (int j = 0; j < features; j++) {
+ tensor->data.f[i * features + j] = function(i, j);
+ }
+ }
+ }
+
+ std::vector<string> GetStringOutput() {
+ TfLiteTensor* output = interpreter_->tensor(output_);
+ int num = GetStringCount(output);
+ std::vector<string> result(num);
+ for (int i = 0; i < num; i++) {
+ auto ref = GetString(output, i);
+ result[i] = string(ref.str, ref.len);
+ }
+ return result;
+ }
+
+ std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+ std::vector<uint8_t> GetHit() { return ExtractVector<uint8_t>(hit_); }
+
+ private:
+ int lookup_;
+ int key_;
+ int value_;
+ int output_;
+ int hit_;
+};
+
+// TODO(yichengfan): write more tests that exercise the details of the op,
+// such as lookup errors and variable input shapes.
+TEST(HashtableLookupOpTest, Test2DInput) {
+ HashtableLookupOpModel m({4}, {3}, {3, 2}, TensorType_FLOAT32);
+
+ m.SetLookup({1234, -292, -11, 0});
+ m.SetHashtableKey({-11, 0, 1234});
+ m.SetHashtableValue([](int i, int j) { return i + j / 10.0f; });
+
+ m.Invoke();
+
+ EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear({
+ 2.0, 2.1, // 2nd item
+ 0, 0, // Not found
+ 0.0, 0.1, // 0th item
+ 1.0, 1.1, // 1st item
+ })));
+ EXPECT_THAT(m.GetHit(), ElementsAreArray({
+ 1, 0, 1, 1,
+ }));
+}
+
+TEST(HashtableLookupOpTest, Test1DInput) {
+ HashtableLookupOpModel m({4}, {3}, {3}, TensorType_FLOAT32);
+
+ m.SetLookup({1234, -292, -11, 0});
+ m.SetHashtableKey({-11, 0, 1234});
+ m.SetHashtableValue([](int i) { return i * i / 10.0f; });
+
+ m.Invoke();
+
+ EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear({
+ 0.4, // 2nd item
+ 0, // Not found
+ 0.0, // 0th item
+ 0.1, // 1st item
+ })));
+ EXPECT_THAT(m.GetHit(), ElementsAreArray({
+ 1,
+ 0,
+ 1,
+ 1,
+ }));
+}
+
+TEST(HashtableLookupOpTest, TestString) {
+ HashtableLookupOpModel m({4}, {3}, {3}, TensorType_STRING);
+
+ m.SetLookup({1234, -292, -11, 0});
+ m.SetHashtableKey({-11, 0, 1234});
+ m.SetHashtableValue({"Hello", "", "Hi"});
+
+ m.Invoke();
+
+ EXPECT_THAT(m.GetStringOutput(), ElementsAreArray({
+ "Hi", // 2-nd item
+ "", // Not found
+ "Hello", // 0-th item
+ "", // 1-st item
+ }));
+ EXPECT_THAT(m.GetHit(), ElementsAreArray({
+ 1,
+ 0,
+ 1,
+ 1,
+ }));
+}
+
+} // namespace
+} // namespace tflite
+
+int main(int argc, char** argv) {
+ // On Linux, add: tflite::LogToStderr();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/internal/BUILD b/tensorflow/contrib/lite/kernels/internal/BUILD
new file mode 100644
index 0000000000..288534099b
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/internal/BUILD
@@ -0,0 +1,359 @@
+package(default_visibility = [
+ "//visibility:public",
+])
+
+licenses(["notice"]) # Apache 2.0
+
+load("//tensorflow/contrib/lite:build_def.bzl", "tflite_copts")
+
+tflite_deps_intel = [
+ "@arm_neon_2_x86_sse",
+]
+
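+# copts that enable NEON code generation on 32-bit ARM builds; every other
+# CPU configuration only gets -O3.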
+NEON_FLAGS_IF_APPLICABLE = select({
+ ":arm": [
+ "-O3",
+ "-mfpu=neon",
+ "-mfloat-abi=softfp",
+ ],
+ ":armeabi-v7a": [
+ "-O3",
+ "-mfpu=neon",
+ "-mfloat-abi=softfp",
+ ],
+ ":armv7a": [
+ "-O3",
+ "-mfpu=neon",
+ "-mfloat-abi=softfp",
+ ],
+ "//conditions:default": [
+ "-O3",
+ ],
+})
+
+cc_library(
+ name = "types",
+ srcs = [],
+ hdrs = [
+ "compatibility.h",
+ "types.h",
+ ],
+)
+
+config_setting(
+ name = "arm",
+ values = {
+ "cpu": "arm",
+ },
+)
+
+config_setting(
+ name = "arm64-v8a",
+ values = {
+ "cpu": "arm64-v8a",
+ },
+)
+
+config_setting(
+ name = "armv7a",
+ values = {
+ "cpu": "armv7a",
+ },
+)
+
+config_setting(
+ name = "armeabi-v7a",
+ values = {
+ "cpu": "armeabi-v7a",
+ },
+)
+
+config_setting(
+ name = "haswell",
+ values = {
+ "cpu": "haswell",
+ },
+)
+
+config_setting(
+ name = "ios_x86_64",
+ values = {
+ "cpu": "ios_x86_64",
+ },
+)
+
+config_setting(
+ name = "ios_armv7",
+ values = {
+ "cpu": "ios_armv7",
+ },
+)
+
+config_setting(
+ name = "ios_arm64",
+ values = {
+ "cpu": "ios_arm64",
+ },
+)
+
+config_setting(
+ name = "k8",
+ values = {
+ "cpu": "k8",
+ },
+)
+
+config_setting(
+ name = "x86",
+ values = {
+ "cpu": "x86",
+ },
+)
+
+config_setting(
+ name = "x86_64",
+ values = {
+ "cpu": "x86_64",
+ },
+)
+
+config_setting(
+ name = "darwin",
+ values = {
+ "cpu": "darwin",
+ },
+)
+
+cc_library(
+ name = "optimized_base",
+ srcs = [],
+ hdrs = [
+ "common.h",
+ "optimized/depthwiseconv_float.h",
+ "optimized/depthwiseconv_uint8.h",
+ "optimized/optimized_ops.h",
+ ],
+ copts = tflite_copts(),
+ deps = [
+ ":types",
+ ":round",
+ "//third_party/eigen3",
+ "@gemmlowp//:gemmlowp",
+ "//tensorflow/contrib/lite:builtin_op_data",
+ ] + select({
+ ":haswell": tflite_deps_intel,
+ ":ios_x86_64": tflite_deps_intel,
+ ":k8": tflite_deps_intel,
+ ":x86": tflite_deps_intel,
+ ":x86_64": tflite_deps_intel,
+ ":darwin": tflite_deps_intel,
+ "//conditions:default": [],
+ }),
+)
+
+cc_library(
+ name = "optimized",
+ hdrs = [
+ "optimized/eigen_spatial_convolutions.h",
+ "optimized/eigen_tensor_reduced_instantiations_oss.h",
+ "optimized/multithreaded_conv.h",
+ "tensor.h",
+ ],
+ deps = [
+ ":optimized_base",
+ ":types",
+ "//tensorflow/contrib/lite:builtin_op_data",
+ "//tensorflow/contrib/lite:context",
+ "//third_party/eigen3",
+ ],
+)
+
+cc_test(
+ name = "tensor_test",
+ srcs = ["tensor_test.cc"],
+ deps = [
+ ":reference",
+ "@com_google_googletest//:gtest",
+ ],
+)
+
+cc_library(
+ name = "round",
+ srcs = [],
+ hdrs = ["round.h"],
+)
+
+cc_library(
+ name = "quantization_util",
+ srcs = ["quantization_util.cc"],
+ hdrs = [
+ "compatibility.h",
+ "quantization_util.h",
+ ],
+ deps = [":round"],
+)
+
+cc_test(
+ name = "quantization_util_test",
+ srcs = ["quantization_util_test.cc"],
+ deps = [
+ ":quantization_util",
+ "@com_google_googletest//:gtest",
+ ],
+)
+
+cc_library(
+ name = "reference_base",
+ srcs = [],
+ hdrs = [
+ "common.h",
+ "reference/depthwiseconv_float.h",
+ "reference/depthwiseconv_uint8.h",
+ "reference/reference_ops.h",
+ ],
+ deps = [
+ ":round",
+ ":types",
+ "//third_party/eigen3",
+ "@gemmlowp//:gemmlowp",
+ "//tensorflow/contrib/lite:builtin_op_data",
+ ] + select({
+ ":haswell": tflite_deps_intel,
+ ":ios_x86_64": tflite_deps_intel,
+ ":k8": tflite_deps_intel,
+ ":x86": tflite_deps_intel,
+ ":x86_64": tflite_deps_intel,
+ ":darwin": tflite_deps_intel,
+ "//conditions:default": [],
+ }),
+)
+
+cc_library(
+ name = "reference",
+ hdrs = ["tensor.h"],
+ deps = [
+ ":types",
+ "//tensorflow/contrib/lite:context",
+ ],
+)
+
+cc_library(
+ name = "portable_tensor_utils",
+ srcs = [
+ "reference/portable_tensor_utils.cc",
+ ],
+ hdrs = [
+ "reference/portable_tensor_utils.h",
+ ],
+ deps = [
+ "//tensorflow/contrib/lite:builtin_op_data",
+ "//tensorflow/contrib/lite/kernels:activation_functor",
+ "//tensorflow/contrib/lite/kernels:op_macros",
+ ],
+)
+
+cc_library(
+ name = "neon_tensor_utils",
+ srcs = [
+ "optimized/neon_tensor_utils.cc",
+ ],
+ hdrs = [
+ "optimized/neon_tensor_utils.h",
+ "optimized/tensor_utils_impl.h",
+ ],
+ copts = NEON_FLAGS_IF_APPLICABLE,
+ deps = [
+ ":cpu_check",
+ ":portable_tensor_utils",
+ "//tensorflow/contrib/lite:builtin_op_data",
+ "//tensorflow/contrib/lite/kernels:activation_functor",
+ ],
+)
+
+cc_library(
+ name = "tensor_utils",
+ srcs = [
+ "tensor_utils.cc",
+ ],
+ hdrs = [
+ "optimized/tensor_utils_impl.h",
+ "reference/portable_tensor_utils.h",
+ "tensor_utils.h",
+ ],
+ copts = NEON_FLAGS_IF_APPLICABLE,
+ deps = [
+ "//tensorflow/contrib/lite/kernels:activation_functor",
+ "//tensorflow/contrib/lite:builtin_op_data",
+ ] + select({
+ ":arm": [
+ ":neon_tensor_utils",
+ ],
+ ":arm64-v8a": [
+ ":neon_tensor_utils",
+ ],
+ ":armeabi-v7a": [
+ ":neon_tensor_utils",
+ ],
+ ":armv7a": [
+ ":neon_tensor_utils",
+ ],
+ ":ios_armv7": [
+ ":neon_tensor_utils",
+ ],
+ ":ios_arm64": [
+ ":neon_tensor_utils",
+ ],
+ "//conditions:default": [
+ ":portable_tensor_utils",
+ ],
+ }),
+)
+
+cc_test(
+ name = "tensor_utils_test",
+ srcs = ["tensor_utils_test.cc"],
+ copts = NEON_FLAGS_IF_APPLICABLE,
+ linkopts = select({
+ "//tensorflow:android": [
+ "-fPIE -pie",
+ ],
+ "//conditions:default": [],
+ }),
+ linkstatic = 1,
+ deps = [
+ ":tensor_utils",
+ "//tensorflow/contrib/lite:builtin_op_data",
+ "//tensorflow/contrib/lite/kernels:test_util",
+ "@com_google_googletest//:gtest_main",
+ ],
+)
+
+cc_library(
+ name = "cpu_check",
+ hdrs = [
+ "optimized/cpu_check.h",
+ ],
+ deps = [
+ ] + select(
+ {
+ "//tensorflow:android": [
+ "@androidndk//:cpufeatures",
+ ],
+ "//conditions:default": [],
+ },
+ ),
+)
+
+exports_files(["optimized/eigen_tensor_reduced_instantiations_oss.h"])
+
+filegroup(
+ name = "all_files",
+ srcs = glob(
+ ["**/*"],
+ exclude = [
+ "**/METADATA",
+ "**/OWNERS",
+ ],
+ ),
+ visibility = ["//tensorflow:__subpackages__"],
+)
diff --git a/tensorflow/contrib/lite/kernels/internal/common.h b/tensorflow/contrib/lite/kernels/internal/common.h
new file mode 100644
index 0000000000..28f19a2506
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/internal/common.h
@@ -0,0 +1,107 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_COMMON_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_COMMON_H_
+
+#ifndef ALLOW_SLOW_GENERIC_DEPTHWISECONV_FALLBACK
+#ifdef GEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK
+#define ALLOW_SLOW_GENERIC_DEPTHWISECONV_FALLBACK
+#endif
+#endif
+
+#ifndef USE_NEON
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+#define USE_NEON
+#include <arm_neon.h>
+#endif
+
+#if defined __GNUC__ && defined __SSE4_1__
+#define USE_NEON
+
+#define OPTIMIZED_OPS_H__IGNORE_DEPRECATED_DECLARATIONS
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+#pragma GCC diagnostic ignored "-Wattributes"
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wnarrowing"
+#pragma GCC diagnostic ignored "-Wsequence-point"
+
+#include "NEON_2_SSE.h"
+
+#pragma GCC diagnostic pop
+#endif
+#endif
+
+#include "public/gemmlowp.h"
+#include "tensorflow/contrib/lite/kernels/internal/types.h"
+
+namespace tflite {
+
+inline void GetActivationMinMax(FusedActivationFunctionType ac,
+ float* output_activation_min,
+ float* output_activation_max) {
+ switch (ac) {
+ case FusedActivationFunctionType::kNone:
+ *output_activation_min = std::numeric_limits<float>::lowest();
+ *output_activation_max = std::numeric_limits<float>::max();
+ break;
+ case FusedActivationFunctionType::kRelu:
+ *output_activation_min = 0.f;
+ *output_activation_max = std::numeric_limits<float>::max();
+ break;
+ case FusedActivationFunctionType::kRelu1:
+ *output_activation_min = -1.f;
+ *output_activation_max = 1.f;
+ break;
+ case FusedActivationFunctionType::kRelu6:
+ *output_activation_min = 0.f;
+ *output_activation_max = 6.f;
+ break;
+ }
+}
+
+inline float ActivationFunctionWithMinMax(float x, float output_activation_min,
+ float output_activation_max) {
+ return std::min(std::max(x, output_activation_min), output_activation_max);
+}
+
+// Legacy function, left for compatibility only.
+template <FusedActivationFunctionType Ac>
+float ActivationFunction(float x) {
+ float output_activation_min, output_activation_max;
+ GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
+ return ActivationFunctionWithMinMax(x, output_activation_min,
+ output_activation_max);
+}
+
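+// Multiplies x by a real-valued multiplier encoded as a fixed-point integer:
+// roughly, the result is x * quantized_multiplier * 2^-31 * 2^-right_shift,
+// with rounding and saturation handled by the gemmlowp helpers.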
+inline int32 MultiplyByQuantizedMultiplierSmallerThanOne(
+ int32 x, int32 quantized_multiplier, int right_shift) {
+ using gemmlowp::RoundingDivideByPOT;
+ using gemmlowp::SaturatingRoundingDoublingHighMul;
+ return RoundingDivideByPOT(
+ SaturatingRoundingDoublingHighMul(x, quantized_multiplier), right_shift);
+}
+
+inline int32 MultiplyByQuantizedMultiplierGreaterThanOne(
+ int32 x, int32 quantized_multiplier, int left_shift) {
+ using gemmlowp::SaturatingRoundingDoublingHighMul;
+ return SaturatingRoundingDoublingHighMul(x * (1 << left_shift),
+ quantized_multiplier);
+}
+
+} // namespace tflite
+
+#endif // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_COMMON_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/compatibility.h b/tensorflow/contrib/lite/kernels/internal/compatibility.h
new file mode 100644
index 0000000000..796a03566a
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/internal/compatibility.h
@@ -0,0 +1,78 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_COMPATIBILITY_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_COMPATIBILITY_H_
+
+#include <cassert>
+#include <cstdint>
+#include <cstdlib>
+
+#ifndef TFLITE_DCHECK
+#define TFLITE_DCHECK(condition) (condition) ? (void)0 : assert(false)
+#endif
+
+#ifndef TFLITE_DCHECK_EQ
+#define TFLITE_DCHECK_EQ(x, y) ((x) == (y)) ? (void)0 : assert(false)
+#endif
+
+#ifndef TFLITE_DCHECK_GE
+#define TFLITE_DCHECK_GE(x, y) ((x) >= (y)) ? (void)0 : assert(false)
+#endif
+
+#ifndef TFLITE_DCHECK_GT
+#define TFLITE_DCHECK_GT(x, y) ((x) > (y)) ? (void)0 : assert(false)
+#endif
+
+#ifndef TFLITE_DCHECK_LE
+#define TFLITE_DCHECK_LE(x, y) ((x) <= (y)) ? (void)0 : assert(false)
+#endif
+
+#ifndef TFLITE_DCHECK_LT
+#define TFLITE_DCHECK_LT(x, y) ((x) < (y)) ? (void)0 : assert(false)
+#endif
+
+// TODO(ahentz): Clean up: We should stick to the DCHECK versions.
+#ifndef TFLITE_CHECK
+#define TFLITE_CHECK(condition) (condition) ? (void)0 : abort()
+#endif
+
+#ifndef TFLITE_CHECK_EQ
+#define TFLITE_CHECK_EQ(x, y) ((x) == (y)) ? (void)0 : abort()
+#endif
+
+#ifndef TFLITE_CHECK_GE
+#define TFLITE_CHECK_GE(x, y) ((x) >= (y)) ? (void)0 : abort()
+#endif
+
+#ifndef TFLITE_CHECK_GT
+#define TFLITE_CHECK_GT(x, y) ((x) > (y)) ? (void)0 : abort()
+#endif
+
+#ifndef TFLITE_CHECK_LE
+#define TFLITE_CHECK_LE(x, y) ((x) <= (y)) ? (void)0 : abort()
+#endif
+
+#ifndef TFLITE_CHECK_LT
+#define TFLITE_CHECK_LT(x, y) ((x) < (y)) ? (void)0 : abort()
+#endif
+
+// TODO(ahentz): Clean up.
+using uint8 = std::uint8_t;
+using int16 = std::int16_t;
+using uint16 = std::uint16_t;
+using int32 = std::int32_t;
+using uint32 = std::uint32_t;
+
+#endif // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_COMPATIBILITY_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/cpu_check.h b/tensorflow/contrib/lite/kernels/internal/optimized/cpu_check.h
new file mode 100644
index 0000000000..dea46cc120
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/cpu_check.h
@@ -0,0 +1,65 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_CPU_CHECK_
+#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_CPU_CHECK_
+
+namespace tflite {
+
+#ifdef __ANDROID__
+#include "ndk/sources/android/cpufeatures/cpu-features.h"
+
+// Runtime check for Neon support on Android.
+inline bool TestCPUFeatureNeon() {
+#ifdef __aarch64__
+ // ARM-64 always has NEON support.
+ return true;
+#else
+ static bool kUseAndroidNeon =
+ (android_getCpuFamily() == ANDROID_CPU_FAMILY_ARM &&
+ android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_ARMv7 &&
+ android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_NEON);
+ return kUseAndroidNeon;
+#endif // __aarch64__
+}
+
+#elif __ARM_NEON
+
+inline bool TestCPUFeatureNeon() {
+ return true;
+}
+
+#else
+
+inline bool TestCPUFeatureNeon() {
+ return false;
+}
+
+#endif
+
+} // namespace tflite
+
+// NEON_OR_PORTABLE(SomeFunc, args) calls NeonSomeFunc(args) if Neon is both
+// enabled at build time and detected at runtime, or PortableSomeFunc(args)
+// otherwise.
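+// For example (the kernel name here is purely illustrative),
+// NEON_OR_PORTABLE(Clip, values, size) resolves to NeonClip(values, size) on
+// a NEON-capable build/device and to PortableClip(values, size) everywhere
+// else, including unconditionally on ARMv5 (see below).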
+#ifdef __ARM_ARCH_5TE__
+// Neon isn't available at all on ARMv5.
+#define NEON_OR_PORTABLE(funcname, ...) Portable##funcname(__VA_ARGS__)
+#else
+#define NEON_OR_PORTABLE(funcname, ...) \
+ TestCPUFeatureNeon() ? Neon##funcname(__VA_ARGS__) \
+ : Portable##funcname(__VA_ARGS__)
+#endif
+
+#endif // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_CPU_CHECK_
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_float.h b/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_float.h
new file mode 100644
index 0000000000..974611f52a
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_float.h
@@ -0,0 +1,987 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_FLOAT_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_FLOAT_H_
+
+#include "public/gemmlowp.h"
+#include "tensorflow/contrib/lite/kernels/internal/common.h"
+#include "tensorflow/contrib/lite/kernels/internal/types.h"
+
+namespace tflite {
+namespace optimized_ops {
+
+// Implementation of float DepthwiseConv
+
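+// Each specialization below handles one (kFixedInputDepth,
+// kFixedDepthMultiplier) pair, with kFixedInputDepth == 0 meaning the input
+// depth is only known at runtime. Kernels specialized with kAllowStrided ==
+// false advance input_ptr by a fixed amount and ignore input_ptr_increment,
+// so they assume contiguous (unit-stride) input pixels.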
+template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier>
+struct FloatDepthwiseConvKernel {};
+
+#ifdef USE_NEON
+
+template <>
+struct FloatDepthwiseConvKernel<false, 8, 1> {
+ static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+ const float* input_ptr, int input_ptr_increment,
+ const float* filter_ptr, float* acc_buffer_ptr) {
+ // Load the filters
+ float32x4_t filter[2];
+ for (int i = 0; i < 2; i++) {
+ filter[i] = vld1q_f32(filter_ptr + 4 * i);
+ }
+ int outp = 0;
+ // Handle 2 output pixels at a time.
+ for (; outp <= num_output_pixels - 2; outp += 2) {
+ // Load the inputs
+ float32x4_t input[4];
+ for (int i = 0; i < 4; i++) {
+ input[i] = vld1q_f32(input_ptr + 4 * i);
+ }
+ input_ptr += 16;
+ // Load the accumulators from acc_buffer
+ float32x4_t acc[4];
+ for (int i = 0; i < 4; i++) {
+ acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
+ }
+ // Multiply-accumulate
+ acc[0] = vmlaq_f32(acc[0], input[0], filter[0]);
+ acc[1] = vmlaq_f32(acc[1], input[1], filter[1]);
+ acc[2] = vmlaq_f32(acc[2], input[2], filter[0]);
+ acc[3] = vmlaq_f32(acc[3], input[3], filter[1]);
+ // Store the accumulators back to acc_buffer
+ for (int i = 0; i < 4; i++) {
+ vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
+ }
+ acc_buffer_ptr += 16;
+ }
+ // Handle one output pixel at a time.
+ for (; outp < num_output_pixels; outp++) {
+ // Load the inputs
+ float32x4_t input[2];
+ for (int i = 0; i < 2; i++) {
+ input[i] = vld1q_f32(input_ptr + 4 * i);
+ }
+ input_ptr += 8;
+ // Load the accumulators from acc_buffer
+ float32x4_t acc[2];
+ for (int i = 0; i < 2; i++) {
+ acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
+ }
+ // Multiply-accumulate
+ for (int i = 0; i < 2; i++) {
+ acc[i] = vmlaq_f32(acc[i], input[i], filter[i]);
+ }
+ // Store the accumulators back to acc_buffer
+ for (int i = 0; i < 2; i++) {
+ vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
+ }
+ acc_buffer_ptr += 8;
+ }
+ }
+};
+
+template <>
+struct FloatDepthwiseConvKernel<false, 2, 1> {
+ static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+ const float* input_ptr, int input_ptr_increment,
+ const float* filter_ptr, float* acc_buffer_ptr) {
+ const float32x2_t filters = vld1_f32(filter_ptr);
+ const float32x4_t filters_dup2 = vcombine_f32(filters, filters);
+ int outp = 0;
+ // Handle 8 output pixels at a time.
+ for (; outp <= num_output_pixels - 8; outp += 8) {
+ // Load the inputs
+ float32x4_t input[4];
+ for (int i = 0; i < 4; i++) {
+ input[i] = vld1q_f32(input_ptr + 4 * i);
+ }
+ input_ptr += 16;
+ // Load the accumulators from acc_buffer
+ float32x4_t acc[4];
+ for (int i = 0; i < 4; i++) {
+ acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
+ }
+ // Multiply-accumulate
+ for (int i = 0; i < 4; i++) {
+ acc[i] = vmlaq_f32(acc[i], input[i], filters_dup2);
+ }
+ // Store the accumulators back to acc_buffer
+ for (int i = 0; i < 4; i++) {
+ vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
+ }
+ acc_buffer_ptr += 16;
+ }
+ // Handle 4 output pixels at a time.
+ for (; outp <= num_output_pixels - 4; outp += 4) {
+ // Load the inputs
+ float32x4_t input[2];
+ for (int i = 0; i < 2; i++) {
+ input[i] = vld1q_f32(input_ptr + 4 * i);
+ }
+ input_ptr += 8;
+ // Load the accumulators from acc_buffer
+ float32x4_t acc[2];
+ for (int i = 0; i < 2; i++) {
+ acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
+ }
+ // Multiply-accumulate
+ for (int i = 0; i < 2; i++) {
+ acc[i] = vmlaq_f32(acc[i], input[i], filters_dup2);
+ }
+ // Store the accumulators back to acc_buffer
+ for (int i = 0; i < 2; i++) {
+ vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
+ }
+ acc_buffer_ptr += 8;
+ }
+ // Handle 2 output pixels at a time.
+ for (; outp <= num_output_pixels - 2; outp += 2) {
+ // Load the inputs
+ const float32x4_t input = vld1q_f32(input_ptr);
+ input_ptr += 4;
+ // Load the accumulators from acc_buffer
+ float32x4_t acc = vld1q_f32(acc_buffer_ptr);
+ // Multiply-accumulate
+ acc = vmlaq_f32(acc, input, filters_dup2);
+ // Store the accumulators back to acc_buffer
+ vst1q_f32(acc_buffer_ptr, acc);
+ acc_buffer_ptr += 4;
+ }
+ // Handle 1 output pixel at a time
+ for (; outp < num_output_pixels; outp++) {
+ // Load the inputs
+ const float32x2_t input = vld1_f32(input_ptr);
+ input_ptr += 2;
+ // Load the accumulators from acc_buffer
+ float32x2_t acc = vld1_f32(acc_buffer_ptr);
+ // Multiply-accumulate
+ acc = vmla_f32(acc, input, filters);
+ // Store the accumulators back to acc_buffer
+ vst1_f32(acc_buffer_ptr, acc);
+ acc_buffer_ptr += 2;
+ }
+ }
+};
+
+template <>
+struct FloatDepthwiseConvKernel<true, 0, 1> {
+ static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+ const float* input_ptr, int input_ptr_increment,
+ const float* filter_ptr, float* acc_buffer_ptr) {
+ // Handle one output pixel at a time.
+ for (int outp = 0; outp < num_output_pixels; outp++) {
+ const float* local_filter_ptr = filter_ptr;
+ const float* local_input_ptr = input_ptr;
+ int ic = 0;
+ // Handle 16 input channels at a time.
+ for (; ic <= input_depth - 16; ic += 16) {
+ // Load the filters
+ float32x4_t filter_0 = vld1q_f32(local_filter_ptr + 4 * 0);
+ float32x4_t filter_1 = vld1q_f32(local_filter_ptr + 4 * 1);
+ float32x4_t filter_2 = vld1q_f32(local_filter_ptr + 4 * 2);
+ float32x4_t filter_3 = vld1q_f32(local_filter_ptr + 4 * 3);
+ local_filter_ptr += 16;
+ // Load the inputs
+ float32x4_t input_0 = vld1q_f32(local_input_ptr + 4 * 0);
+ float32x4_t input_1 = vld1q_f32(local_input_ptr + 4 * 1);
+ float32x4_t input_2 = vld1q_f32(local_input_ptr + 4 * 2);
+ float32x4_t input_3 = vld1q_f32(local_input_ptr + 4 * 3);
+ local_input_ptr += 16;
+ // Load the accumulators from acc_buffer
+ float32x4_t acc_0 = vld1q_f32(acc_buffer_ptr + 4 * 0);
+ float32x4_t acc_1 = vld1q_f32(acc_buffer_ptr + 4 * 1);
+ float32x4_t acc_2 = vld1q_f32(acc_buffer_ptr + 4 * 2);
+ float32x4_t acc_3 = vld1q_f32(acc_buffer_ptr + 4 * 3);
+ // Multiply-accumulate
+ acc_0 = vmlaq_f32(acc_0, input_0, filter_0);
+ acc_1 = vmlaq_f32(acc_1, input_1, filter_1);
+ acc_2 = vmlaq_f32(acc_2, input_2, filter_2);
+ acc_3 = vmlaq_f32(acc_3, input_3, filter_3);
+ // Store the accumulators back to acc_buffer
+ vst1q_f32(acc_buffer_ptr + 4 * 0, acc_0);
+ vst1q_f32(acc_buffer_ptr + 4 * 1, acc_1);
+ vst1q_f32(acc_buffer_ptr + 4 * 2, acc_2);
+ vst1q_f32(acc_buffer_ptr + 4 * 3, acc_3);
+ acc_buffer_ptr += 16;
+ }
+ // Handle 4 input channels at a time.
+ for (; ic <= input_depth - 4; ic += 4) {
+ // Load the filters
+ float32x4_t filter;
+ filter = vld1q_f32(local_filter_ptr);
+ local_filter_ptr += 4;
+ // Load the inputs
+ float32x4_t input;
+ input = vld1q_f32(local_input_ptr);
+ local_input_ptr += 4;
+ // Load the accumulators from acc_buffer
+ float32x4_t acc;
+ acc = vld1q_f32(acc_buffer_ptr);
+ // Multiply-accumulate
+ acc = vmlaq_f32(acc, input, filter);
+ // Store the accumulators back to acc_buffer
+ vst1q_f32(acc_buffer_ptr, acc);
+ acc_buffer_ptr += 4;
+ }
+ // Handle one input channel at a time.
+ for (; ic < input_depth; ic++) {
+ const float input_val = *local_input_ptr++;
+ const float filter_val = *local_filter_ptr++;
+ *acc_buffer_ptr++ += filter_val * input_val;
+ }
+ input_ptr += input_ptr_increment;
+ }
+ }
+};
+
+template <>
+struct FloatDepthwiseConvKernel<true, 0, 8> {
+ static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+ const float* input_ptr, int input_ptr_increment,
+ const float* filter_ptr, float* acc_buffer_ptr) {
+ // Handle one output pixel at a time.
+ for (int outp = 0; outp < num_output_pixels; outp++) {
+ const float* local_filter_ptr = filter_ptr;
+ const float* local_input_ptr = input_ptr;
+ int ic = 0;
+ // Handle 2 input channels at a time.
+ for (; ic <= input_depth - 2; ic += 2) {
+ // Load the filters
+ float32x4_t filter[4];
+ for (int i = 0; i < 4; i++) {
+ filter[i] = vld1q_f32(local_filter_ptr + 4 * i);
+ }
+ local_filter_ptr += 16;
+ // Load the inputs
+ const float32x2_t input = vld1_f32(local_input_ptr);
+ local_input_ptr += 2;
+ // Load the accumulators from acc_buffer
+ float32x4_t acc[4];
+ for (int i = 0; i < 4; i++) {
+ acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
+ }
+ // Multiply-accumulate
+ acc[0] = vmlaq_lane_f32(acc[0], filter[0], input, 0);
+ acc[1] = vmlaq_lane_f32(acc[1], filter[1], input, 0);
+ acc[2] = vmlaq_lane_f32(acc[2], filter[2], input, 1);
+ acc[3] = vmlaq_lane_f32(acc[3], filter[3], input, 1);
+ // Store the accumulators back to acc_buffer
+ for (int i = 0; i < 4; i++) {
+ vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
+ }
+ acc_buffer_ptr += 16;
+ }
+ // Handle one input channel at a time.
+ for (; ic < input_depth; ic++) {
+ // Load the filters
+ float32x4_t filter[2];
+ for (int i = 0; i < 2; i++) {
+ filter[i] = vld1q_f32(local_filter_ptr + 4 * i);
+ }
+ local_filter_ptr += 8;
+ // Load the inputs
+ const float input_val = *local_input_ptr++;
+ // Load the accumulators from acc_buffer
+ float32x4_t acc[2];
+ for (int i = 0; i < 2; i++) {
+ acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
+ }
+ // Multiply-accumulate
+ for (int i = 0; i < 2; i++) {
+ acc[i] = vmlaq_n_f32(acc[i], filter[i], input_val);
+ }
+ // Store the accumulators back to acc_buffer
+ for (int i = 0; i < 2; i++) {
+ vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
+ }
+ acc_buffer_ptr += 8;
+ }
+ input_ptr += input_ptr_increment;
+ }
+ }
+};
+
+template <>
+struct FloatDepthwiseConvKernel<true, 0, 2> {
+ static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+ const float* input_ptr, int input_ptr_increment,
+ const float* filter_ptr, float* acc_buffer_ptr) {
+ // Handle one output pixel at a time.
+ for (int outp = 0; outp < num_output_pixels; outp++) {
+ const float* local_filter_ptr = filter_ptr;
+ const float* local_input_ptr = input_ptr;
+ int ic = 0;
+ // Handle 8 input channels at a time.
+ for (; ic <= input_depth - 8; ic += 8) {
+ // Load the filters
+ float32x4_t filter[4];
+ for (int i = 0; i < 4; i++) {
+ filter[i] = vld1q_f32(local_filter_ptr + 4 * i);
+ }
+ local_filter_ptr += 16;
+ // Load the inputs
+ float32x4x2_t input_dup2[2];
+ for (int i = 0; i < 2; i++) {
+ const float32x4_t input = vld1q_f32(local_input_ptr + 4 * i);
+ input_dup2[i] = vzipq_f32(input, input);
+ }
+ local_input_ptr += 8;
+ // Load the accumulators from acc_buffer
+ float32x4_t acc[4];
+ for (int i = 0; i < 4; i++) {
+ acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
+ }
+ // Multiply-accumulate
+ acc[0] = vmlaq_f32(acc[0], filter[0], input_dup2[0].val[0]);
+ acc[1] = vmlaq_f32(acc[1], filter[1], input_dup2[0].val[1]);
+ acc[2] = vmlaq_f32(acc[2], filter[2], input_dup2[1].val[0]);
+ acc[3] = vmlaq_f32(acc[3], filter[3], input_dup2[1].val[1]);
+ // Store the accumulators back to acc_buffer
+ for (int i = 0; i < 4; i++) {
+ vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
+ }
+ acc_buffer_ptr += 16;
+ }
+ // Handle 4 input channels at a time.
+ for (; ic <= input_depth - 4; ic += 4) {
+ // Load the filters
+ float32x2_t filter[4];
+ for (int i = 0; i < 4; i++) {
+ filter[i] = vld1_f32(local_filter_ptr + 2 * i);
+ }
+ local_filter_ptr += 8;
+ // Load the inputs
+ const float32x4_t input = vld1q_f32(local_input_ptr);
+ local_input_ptr += 4;
+ // Load the accumulators from acc_buffer
+ float32x2_t acc[4];
+ for (int i = 0; i < 4; i++) {
+ acc[i] = vld1_f32(acc_buffer_ptr + 2 * i);
+ }
+ // Multiply-accumulate
+ acc[0] = vmla_lane_f32(acc[0], filter[0], vget_low_f32(input), 0);
+ acc[1] = vmla_lane_f32(acc[1], filter[1], vget_low_f32(input), 1);
+ acc[2] = vmla_lane_f32(acc[2], filter[2], vget_high_f32(input), 0);
+ acc[3] = vmla_lane_f32(acc[3], filter[3], vget_high_f32(input), 1);
+ // Store the accumulators back to acc_buffer
+ for (int i = 0; i < 4; i++) {
+ vst1_f32(acc_buffer_ptr + 2 * i, acc[i]);
+ }
+ acc_buffer_ptr += 8;
+ }
+ // Handle 2 input channels at a time.
+ for (; ic <= input_depth - 2; ic += 2) {
+ // Load the filters
+ const float32x4_t filter = vld1q_f32(local_filter_ptr);
+ local_filter_ptr += 4;
+ // Load the inputs
+ const float32x2_t input = vld1_f32(local_input_ptr);
+ local_input_ptr += 2;
+ // Load the accumulators from acc_buffer
+ float32x2_t acc[2];
+ for (int i = 0; i < 2; i++) {
+ acc[i] = vld1_f32(acc_buffer_ptr + 2 * i);
+ }
+ // Multiply-accumulate
+ acc[0] = vmla_lane_f32(acc[0], vget_low_f32(filter), input, 0);
+ acc[1] = vmla_lane_f32(acc[1], vget_high_f32(filter), input, 1);
+ // Store the accumulators back to acc_buffer
+ for (int i = 0; i < 2; i++) {
+ vst1_f32(acc_buffer_ptr + 2 * i, acc[i]);
+ }
+ acc_buffer_ptr += 4;
+ }
+ // Handle one input channel at a time.
+ for (; ic < input_depth; ic++) {
+ // Load the inputs
+ const float input_val = *local_input_ptr++;
+ // Multiply-accumulate
+ for (int i = 0; i < 2; i++) {
+ acc_buffer_ptr[i] += local_filter_ptr[i] * input_val;
+ }
+ local_filter_ptr += 2;
+ acc_buffer_ptr += 2;
+ }
+ input_ptr += input_ptr_increment;
+ }
+ }
+};
+
+template <>
+struct FloatDepthwiseConvKernel<true, 1, 8> {
+ static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+ const float* input_ptr, int input_ptr_increment,
+ const float* filter_ptr, float* acc_buffer_ptr) {
+ // Load the filters
+ float32x4_t filter[2];
+ for (int i = 0; i < 2; i++) {
+ filter[i] = vld1q_f32(filter_ptr + 4 * i);
+ }
+ // Handle one output pixel at a time.
+ for (int outp = 0; outp < num_output_pixels; outp++) {
+ // Load the inputs
+ const float input_val = *input_ptr;
+ input_ptr += input_ptr_increment;
+ // Load the accumulators from acc_buffer
+ float32x4_t acc[2];
+ for (int i = 0; i < 2; i++) {
+ acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
+ }
+ // Multiply-accumulate
+ for (int i = 0; i < 2; i++) {
+ acc[i] = vmlaq_n_f32(acc[i], filter[i], input_val);
+ }
+ // Store the accumulators back to acc_buffer
+ for (int i = 0; i < 2; i++) {
+ vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
+ }
+ acc_buffer_ptr += 8;
+ }
+ }
+};
+
+template <>
+struct FloatDepthwiseConvKernel<true, 1, 32> {
+ static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+ const float* input_ptr, int input_ptr_increment,
+ const float* filter_ptr, float* acc_buffer_ptr) {
+ // Load the filters
+ float32x4_t filter_0 = vld1q_f32(filter_ptr + 4 * 0);
+ float32x4_t filter_1 = vld1q_f32(filter_ptr + 4 * 1);
+ float32x4_t filter_2 = vld1q_f32(filter_ptr + 4 * 2);
+ float32x4_t filter_3 = vld1q_f32(filter_ptr + 4 * 3);
+ float32x4_t filter_4 = vld1q_f32(filter_ptr + 4 * 4);
+ float32x4_t filter_5 = vld1q_f32(filter_ptr + 4 * 5);
+ float32x4_t filter_6 = vld1q_f32(filter_ptr + 4 * 6);
+ float32x4_t filter_7 = vld1q_f32(filter_ptr + 4 * 7);
+
+ // Handle one output pixel at a time.
+ for (int outp = 0; outp < num_output_pixels; outp++) {
+ // Load the inputs
+ const float input_val = *input_ptr;
+ input_ptr += input_ptr_increment;
+ // Load the accumulators from acc_buffer
+ float32x4_t acc_0 = vld1q_f32(acc_buffer_ptr + 4 * 0);
+ float32x4_t acc_1 = vld1q_f32(acc_buffer_ptr + 4 * 1);
+ float32x4_t acc_2 = vld1q_f32(acc_buffer_ptr + 4 * 2);
+ float32x4_t acc_3 = vld1q_f32(acc_buffer_ptr + 4 * 3);
+ float32x4_t acc_4 = vld1q_f32(acc_buffer_ptr + 4 * 4);
+ float32x4_t acc_5 = vld1q_f32(acc_buffer_ptr + 4 * 5);
+ float32x4_t acc_6 = vld1q_f32(acc_buffer_ptr + 4 * 6);
+ float32x4_t acc_7 = vld1q_f32(acc_buffer_ptr + 4 * 7);
+ // Multiply-accumulate
+ acc_0 = vmlaq_n_f32(acc_0, filter_0, input_val);
+ acc_1 = vmlaq_n_f32(acc_1, filter_1, input_val);
+ acc_2 = vmlaq_n_f32(acc_2, filter_2, input_val);
+ acc_3 = vmlaq_n_f32(acc_3, filter_3, input_val);
+ acc_4 = vmlaq_n_f32(acc_4, filter_4, input_val);
+ acc_5 = vmlaq_n_f32(acc_5, filter_5, input_val);
+ acc_6 = vmlaq_n_f32(acc_6, filter_6, input_val);
+ acc_7 = vmlaq_n_f32(acc_7, filter_7, input_val);
+ // Store the accumulators back to acc_buffer
+ vst1q_f32(acc_buffer_ptr + 4 * 0, acc_0);
+ vst1q_f32(acc_buffer_ptr + 4 * 1, acc_1);
+ vst1q_f32(acc_buffer_ptr + 4 * 2, acc_2);
+ vst1q_f32(acc_buffer_ptr + 4 * 3, acc_3);
+ vst1q_f32(acc_buffer_ptr + 4 * 4, acc_4);
+ vst1q_f32(acc_buffer_ptr + 4 * 5, acc_5);
+ vst1q_f32(acc_buffer_ptr + 4 * 6, acc_6);
+ vst1q_f32(acc_buffer_ptr + 4 * 7, acc_7);
+ acc_buffer_ptr += 32;
+ }
+ }
+};
+
+template <>
+struct FloatDepthwiseConvKernel<true, 0, 16> {
+ static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+ const float* input_ptr, int input_ptr_increment,
+ const float* filter_ptr, float* acc_buffer_ptr) {
+ // Handle one output pixel at a time.
+ for (int outp = 0; outp < num_output_pixels; outp++) {
+ const float* local_filter_ptr = filter_ptr;
+ const float* local_input_ptr = input_ptr;
+ for (int ic = 0; ic < input_depth; ic++) {
+ // Load the filters
+ float32x4_t filter[4];
+ for (int i = 0; i < 4; i++) {
+ filter[i] = vld1q_f32(local_filter_ptr + 4 * i);
+ }
+ local_filter_ptr += 16;
+ // Load the inputs
+ const float input_val = *local_input_ptr++;
+ // Load the accumulators from acc_buffer
+ float32x4_t acc[4];
+ for (int i = 0; i < 4; i++) {
+ acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
+ }
+ // Multiply-accumulate
+ for (int i = 0; i < 4; i++) {
+ acc[i] = vmlaq_n_f32(acc[i], filter[i], input_val);
+ }
+ // Store the accumulators back to acc_buffer
+ for (int i = 0; i < 4; i++) {
+ vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
+ }
+ acc_buffer_ptr += 16;
+ }
+ input_ptr += input_ptr_increment;
+ }
+ }
+};
+
+template <>
+struct FloatDepthwiseConvKernel<true, 8, 1> {
+ static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+ const float* input_ptr, int input_ptr_increment,
+ const float* filter_ptr, float* acc_buffer_ptr) {
+ // Load the filters
+ float32x4_t filter[2];
+ for (int i = 0; i < 2; i++) {
+ filter[i] = vld1q_f32(filter_ptr + 4 * i);
+ }
+ // Handle one output pixel at a time.
+ for (int outp = 0; outp < num_output_pixels; outp++) {
+ // Load the inputs
+ float32x4_t input[2];
+ for (int i = 0; i < 2; i++) {
+ input[i] = vld1q_f32(input_ptr + 4 * i);
+ }
+ // Load the accumulators from acc_buffer
+ float32x4_t acc[2];
+ for (int i = 0; i < 2; i++) {
+ acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
+ }
+ // Multiply-accumulate
+ for (int i = 0; i < 2; i++) {
+ acc[i] = vmlaq_f32(acc[i], input[i], filter[i]);
+ }
+ // Store the accumulators back to acc_buffer
+ for (int i = 0; i < 2; i++) {
+ vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
+ }
+ acc_buffer_ptr += 8;
+ input_ptr += input_ptr_increment;
+ }
+ }
+};
+
+template <>
+struct FloatDepthwiseConvKernel<true, 2, 1> {
+ static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+ const float* input_ptr, int input_ptr_increment,
+ const float* filter_ptr, float* acc_buffer_ptr) {
+ float32x2_t filter = vld1_f32(filter_ptr);
+ float32x4_t filter_x4 = vcombine_f32(filter, filter);
+ int outp = 0;
+
+ // Handle two output pixels at a time.
+ for (; outp <= num_output_pixels - 2; outp += 2) {
+ // Load the inputs
+ float32x2_t input_1 = vld1_f32(input_ptr);
+ input_ptr += input_ptr_increment;
+ float32x2_t input_2 = vld1_f32(input_ptr);
+ input_ptr += input_ptr_increment;
+ float32x4_t input = vcombine_f32(input_1, input_2);
+
+ // Load the accumulators from acc_buffer
+ float32x4_t acc = vld1q_f32(acc_buffer_ptr);
+
+ // Multiply-accumulate
+ acc = vmlaq_f32(acc, input, filter_x4);
+
+ // Store the accumulators back to acc_buffer
+ vst1q_f32(acc_buffer_ptr, acc);
+ acc_buffer_ptr += 4;
+ }
+ // Handle one output pixel at a time.
+ for (; outp < num_output_pixels; outp++) {
+ // Load the inputs
+ float32x2_t input = vld1_f32(input_ptr);
+ input_ptr += input_ptr_increment;
+
+ // Load the accumulators from acc_buffer
+ float32x2_t acc = vld1_f32(acc_buffer_ptr);
+
+ // Multiply-accumulate
+ acc = vmla_f32(acc, input, filter);
+
+ // Store the accumulators back to acc_buffer
+ vst1_f32(acc_buffer_ptr, acc);
+ acc_buffer_ptr += 2;
+ }
+ }
+};
+
+template <>
+struct FloatDepthwiseConvKernel<true, 4, 1> {
+ static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+ const float* input_ptr, int input_ptr_increment,
+ const float* filter_ptr, float* acc_buffer_ptr) {
+ float32x4_t filter = vld1q_f32(filter_ptr);
+
+ // Handle one output pixel at a time.
+ for (int outp = 0; outp < num_output_pixels; outp++) {
+ // Load the inputs
+ float32x4_t input = vld1q_f32(input_ptr);
+ // Load the accumulators from acc_buffer
+ float32x4_t acc = vld1q_f32(acc_buffer_ptr);
+ // Multiply-accumulate
+ acc = vmlaq_f32(acc, input, filter);
+ // Store the accumulators back to acc_buffer
+ vst1q_f32(acc_buffer_ptr, acc);
+ acc_buffer_ptr += 4;
+ input_ptr += input_ptr_increment;
+ }
+ }
+};
+#endif
+
+// Accumulates the effect of one row of the filter on a segment of one row
+// of the output, accessing the corresponding row of the input.
+template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier>
+void FloatDepthwiseConvAccumRow(int stride, int input_depth, int input_width,
+ const float* input_data, int pad_width,
+ int depth_multiplier, int filter_width,
+ const float* filter_data,
+ int out_x_buffer_start, int out_x_buffer_end,
+ int output_depth, float* acc_buffer) {
+#ifdef GEMMLOWP_PROFILING
+ gemmlowp::ScopedProfilingLabel label(__PRETTY_FUNCTION__);
+#endif
+ // Sanity check parameters. This is important in particular to ensure
+ // that we keep the number of template instantiations minimal, so we don't
+ // increase binary size unnecessarily.
+ static_assert(kFixedDepthMultiplier || !kFixedInputDepth, "");
+ static_assert(kFixedInputDepth || kAllowStrided, "");
+ TFLITE_DCHECK(stride == 1 || kAllowStrided);
+ if (kFixedInputDepth) {
+ TFLITE_DCHECK_EQ(input_depth, kFixedInputDepth);
+ }
+ if (kFixedDepthMultiplier) {
+ TFLITE_DCHECK_EQ(depth_multiplier, kFixedDepthMultiplier);
+ }
+ TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier);
+ const int input_ptr_increment = stride * input_depth;
+ const float* filter_base_ptr = filter_data;
+ for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
+ // For the current (filter_x, filter_y) point in the filter,
+ // compute the boundaries of the corresponding output row segment.
+    int out_x_loop_start_unclamped = 0;
+    int out_x_loop_end_unclamped = 0;
+    if (kAllowStrided) {
+      if (stride == 2) {
+        out_x_loop_start_unclamped = (pad_width - filter_x + 1) / 2;
+        out_x_loop_end_unclamped =
+            (pad_width + input_width - filter_x + 1) / 2;
+      } else if (stride == 4) {
+        out_x_loop_start_unclamped = (pad_width - filter_x + 3) / 4;
+        out_x_loop_end_unclamped =
+            (pad_width + input_width - filter_x + 3) / 4;
+      } else {
+        out_x_loop_start_unclamped =
+            (pad_width - filter_x + stride - 1) / stride;
+        out_x_loop_end_unclamped =
+            (pad_width + input_width - filter_x + stride - 1) / stride;
+      }
+    } else {
+      out_x_loop_start_unclamped = pad_width - filter_x;
+      out_x_loop_end_unclamped = pad_width + input_width - filter_x;
+    }
+    // The kernel will have to iterate on the segment of the
+    // output row that runs from out_x_loop_start to out_x_loop_end.
+    const int out_x_loop_start =
+        std::max(out_x_buffer_start, out_x_loop_start_unclamped);
+    const int out_x_loop_end =
+        std::min(out_x_buffer_end, out_x_loop_end_unclamped);
+
+ float* acc_buffer_ptr =
+ acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth;
+ const int in_x_origin = (out_x_loop_start * stride) - pad_width + filter_x;
+ const float* input_ptr = input_data + in_x_origin * input_depth;
+ const int num_output_pixels = out_x_loop_end - out_x_loop_start;
+ FloatDepthwiseConvKernel<kAllowStrided, kFixedInputDepth,
+ kFixedDepthMultiplier>::Run(num_output_pixels,
+ input_depth,
+ depth_multiplier,
+ input_ptr,
+ input_ptr_increment,
+ filter_base_ptr,
+ acc_buffer_ptr);
+ filter_base_ptr += output_depth;
+ }
+}
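
As a reading aid (not part of the patch), here is a minimal self-contained sketch of the loop-bound arithmetic used above: for one filter tap at filter_x, it computes the unclamped range of output x positions that read valid input, then clamps that range to the current accumulation-buffer window. The concrete numbers are invented for illustration.

```cpp
#include <algorithm>
#include <cstdio>

int main() {
  const int stride = 2, pad_width = 1, input_width = 5, filter_x = 0;
  const int out_x_buffer_start = 0, out_x_buffer_end = 3;
  // Same rounding-up division as the generic stride case above.
  const int start_unclamped = (pad_width - filter_x + stride - 1) / stride;
  const int end_unclamped =
      (pad_width + input_width - filter_x + stride - 1) / stride;
  const int out_x_start = std::max(out_x_buffer_start, start_unclamped);
  const int out_x_end = std::min(out_x_buffer_end, end_unclamped);
  std::printf("out_x in [%d, %d)\n", out_x_start, out_x_end);  // prints [1, 3)
  return 0;
}
```

For every out_x in that range, in_x = out_x * stride - pad_width + filter_x falls inside [0, input_width), which is what lets the kernels above run without per-element bounds checks.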
+
+// Generic fallback of FloatDepthwiseConvAccumRow; portable, non-templatized.
+inline void FloatDepthwiseConvAccumRowGeneric(
+ int stride, int input_depth, int input_width, const float* input_data,
+ int pad_width, int depth_multiplier, int filter_width,
+ const float* filter_data, int out_x_buffer_start, int out_x_buffer_end,
+ int output_depth, float* acc_buffer) {
+ gemmlowp::ScopedProfilingLabel label("DepthwiseConvAccumRowGeneric (slow)");
+#ifdef TFLITE_PREVENT_SLOW_GENERIC_DEPTHWISECONV_FALLBACK
+#ifndef ALLOW_SLOW_GENERIC_DEPTHWISECONV_FALLBACK
+ LOG(FATAL)
+ << "\n\n"
+ << "*****************************************************************\n"
+ << "* This tfmini inference code was about to use the slow generic\n"
+ << "* fallback implementation for a DepthwiseConv op, and we want you\n"
+ << "* to be aware of that so that you will know why you get terrible\n"
+ << "* performance.\n"
+ << "*\n"
+ << "* If you would like to carry on with the slow code, compile\n"
+ << "* with this preprocessor token defined:\n"
+ << "* ALLOW_SLOW_GENERIC_DEPTHWISECONV_FALLBACK.\n"
+ << "*\n"
+ << "* The right thing to do, if you care about performance, is to add\n"
+ << "* a new DepthwiseConv kernel to tfmini to cover your case.\n"
+ << "* The relevant parameters defining your case are:\n"
+ << "* stride = " << stride << "\n"
+ << "* input_depth = " << input_depth << "\n"
+ << "* depth_multiplier = " << depth_multiplier << "\n"
+ << "*\n"
+ << "* Please do not hesitate to contact benoitjacob@ with this\n"
+ << "* information.\n"
+ << "*****************************************************************\n";
+#endif // ALLOW_SLOW_GENERIC_DEPTHWISECONV_FALLBACK
+#endif // TFLITE_PREVENT_SLOW_GENERIC_DEPTHWISECONV_FALLBACK
+ const float* filter_base_ptr = filter_data;
+ for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
+ const int out_x_loop_start = std::max(
+ out_x_buffer_start, (pad_width - filter_x + stride - 1) / stride);
+ const int out_x_loop_end =
+ std::min(out_x_buffer_end,
+ (pad_width + input_width - filter_x + stride - 1) / stride);
+
+ float* acc_buffer_ptr =
+ acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth;
+ const int in_x_origin = (out_x_loop_start * stride) - pad_width + filter_x;
+ const float* input_ptr = input_data + in_x_origin * input_depth;
+ const int input_ptr_increment = (stride - 1) * input_depth;
+ for (int out_x = out_x_loop_start; out_x < out_x_loop_end; out_x++) {
+ const float* filter_ptr = filter_base_ptr;
+ for (int ic = 0; ic < input_depth; ++ic) {
+ const float input_val = *input_ptr++;
+ for (int m = 0; m < depth_multiplier; m++) {
+ const float filter_val = *filter_ptr++;
+ *acc_buffer_ptr++ += filter_val * input_val;
+ }
+ }
+ input_ptr += input_ptr_increment;
+ }
+ filter_base_ptr += output_depth;
+ }
+}
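
For readers new to depthwise convolution, the following scalar sketch (illustration only; AccumOnePoint is a made-up name) restates the inner loops of the generic fallback above for a single (output x, filter x) point, making the channel mapping explicit: input channel ic feeds output channels ic * depth_multiplier + m.

```cpp
#include <vector>

// Accumulate one input point against one filter tap, as in the fallback above.
void AccumOnePoint(const std::vector<float>& input,   // input_depth values
                   const std::vector<float>& filter,  // input_depth * depth_multiplier values
                   int depth_multiplier, std::vector<float>* acc) {
  const int input_depth = static_cast<int>(input.size());
  for (int ic = 0; ic < input_depth; ++ic) {
    for (int m = 0; m < depth_multiplier; ++m) {
      const int oc = ic * depth_multiplier + m;  // output channel index
      (*acc)[oc] += filter[oc] * input[ic];
    }
  }
}
```

The specialized NEON kernels earlier in this file are vectorized instances of exactly this loop nest for fixed (input_depth, depth_multiplier) pairs.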
+
+// Initializes the accumulator buffer with bias values.
+inline void DepthwiseConvInitAccBuffer(int num_output_pixels, int output_depth,
+ const float* bias_data,
+ float* acc_buffer) {
+ // TODO(benoitjacob): This might need optimized specializations
+ // for small output_depth values, if that ever becomes an important
+ // case (like it was for some quantized DepthwiseConv cases).
+ for (int i = 0; i < num_output_pixels; i++) {
+ memcpy(acc_buffer + i * output_depth, bias_data,
+ sizeof(acc_buffer[0]) * output_depth);
+ }
+}
+
+inline void DepthwiseConv(const float* input_data, const Dims<4>& input_dims,
+ const float* filter_data, const Dims<4>& filter_dims,
+ const float* bias_data, const Dims<4>& bias_dims,
+ int stride_width, int stride_height, int pad_width,
+ int pad_height, int depth_multiplier,
+ float output_activation_min,
+ float output_activation_max, float* output_data,
+ const Dims<4>& output_dims) {
+ gemmlowp::ScopedProfilingLabel label("DepthwiseConv");
+ const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+ const int output_depth = MatchingArraySize(filter_dims, 0, output_dims, 0);
+ const int input_height = ArraySize(input_dims, 2);
+ const int input_width = ArraySize(input_dims, 1);
+ const int input_depth = ArraySize(input_dims, 0);
+ const int filter_height = ArraySize(filter_dims, 2);
+ const int filter_width = ArraySize(filter_dims, 1);
+ const int output_height = ArraySize(output_dims, 2);
+ const int output_width = ArraySize(output_dims, 1);
+ TFLITE_DCHECK(output_depth == input_depth * depth_multiplier);
+
+ static const int kAccBufferMaxSize = 2048;
+ float acc_buffer[kAccBufferMaxSize];
+ TFLITE_DCHECK_GE(kAccBufferMaxSize, output_depth);
+ const int kOutputPixelsInAccBuffer = kAccBufferMaxSize / output_depth;
+ const int kAccBufferActualSize = kOutputPixelsInAccBuffer * output_depth;
+ TFLITE_DCHECK_LE(kOutputPixelsInAccBuffer * output_depth,
+ kAccBufferActualSize);
+ TFLITE_DCHECK_LE(kAccBufferActualSize, kAccBufferMaxSize);
+ TFLITE_DCHECK_GE(kOutputPixelsInAccBuffer, 1);
+
+ // row_accum_func will point to the core accumulation function to be used
+ // for this DepthwiseConv op.
+ using row_accum_func_t = decltype(&FloatDepthwiseConvAccumRowGeneric);
+ row_accum_func_t row_accum_func = nullptr;
+
+#define TFMINI_USE_DEPTHWISECONV_KERNEL(ALLOW_STRIDED, FIXED_INPUT_DEPTH, \
+ FIXED_DEPTH_MULTIPLIER) \
+ if (!row_accum_func && (stride_width == 1 || ALLOW_STRIDED) && \
+ (input_depth == FIXED_INPUT_DEPTH || FIXED_INPUT_DEPTH == 0) && \
+ depth_multiplier == FIXED_DEPTH_MULTIPLIER) { \
+ row_accum_func = \
+ FloatDepthwiseConvAccumRow<ALLOW_STRIDED, FIXED_INPUT_DEPTH, \
+ FIXED_DEPTH_MULTIPLIER>; \
+ }
+
+#ifdef USE_NEON
+  // We go over our list of kernels in decreasing order of preference
+  // for the cases where multiple kernels could apply.
+
+ // Start with the fastest kernels: AllowStrided=false, fixed input depth.
+
+ TFMINI_USE_DEPTHWISECONV_KERNEL(false, 8, 1)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 1)
+
+ // Next come the strided kernels: AllowStrided=true, fixed input depth.
+ // They are a bit less efficient, but allow stride!=1.
+
+ TFMINI_USE_DEPTHWISECONV_KERNEL(true, 8, 1)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 8)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 32)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(true, 2, 1)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(true, 4, 1)
+
+  // Finally, the kernels allowing a variable input depth;
+  // these are the least efficient but most general kernels.
+
+ TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 1)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 2)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 8)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 16)
+
+#endif // USE_NEON
+
+#undef TFMINI_USE_DEPTHWISECONV_KERNEL
+
+ // No matching fast kernel found, use slow fallback.
+ if (!row_accum_func) {
+ row_accum_func = FloatDepthwiseConvAccumRowGeneric;
+ }
+
+ // Now that we have determined row_accum_func, we can start work.
+ float* output_ptr = output_data;
+ for (int b = 0; b < batches; ++b) {
+ for (int out_y = 0; out_y < output_height; ++out_y) {
+ const int in_y_origin = (out_y * stride_height) - pad_height;
+ const int filter_y_start = std::max(0, -in_y_origin);
+ const int filter_y_end =
+ std::min(filter_height, input_height - in_y_origin);
+ for (int out_x_buffer_start = 0; out_x_buffer_start < output_width;
+ out_x_buffer_start += kOutputPixelsInAccBuffer) {
+ const int out_x_buffer_end = std::min(
+ output_width, out_x_buffer_start + kOutputPixelsInAccBuffer);
+        // We call a 'pixel' a group of activations that share all but the
+ // 'depth'/'channel' coordinate. num_output_pixels is the number of
+ // output pixels that we will accumulate in this loop iteration.
+ const int num_output_pixels = out_x_buffer_end - out_x_buffer_start;
+ // Initialize our local accumulator with the bias values, so we don't
+ // have to add them later.
+ DepthwiseConvInitAccBuffer(num_output_pixels, output_depth, bias_data,
+ acc_buffer);
+ // Accumulation loop. Most of the time should be spent in here.
+ for (int filter_y = filter_y_start; filter_y < filter_y_end;
+ ++filter_y) {
+ const int in_y = in_y_origin + filter_y;
+ row_accum_func(stride_width, input_depth, input_width,
+ input_data + in_y * input_dims.strides[2] +
+ b * input_dims.strides[3],
+ pad_width, depth_multiplier, filter_width,
+ filter_data + filter_y * filter_dims.strides[2],
+ out_x_buffer_start, out_x_buffer_end, output_depth,
+ acc_buffer);
+ }
+ // Finished accumulating. Now store to destination.
+ const int num_output_values = output_depth * num_output_pixels;
+ int i = 0;
+// TODO(benoitjacob) optimized code goes here
+#ifdef USE_NEON
+ // Handle 16 values at a time
+ for (; i <= num_output_values - 16; i += 16) {
+ float32x4_t acc[4];
+ for (int k = 0; k < 4; k++) {
+ acc[k] = vld1q_f32(acc_buffer + i + 4 * k);
+ }
+ for (int k = 0; k < 4; k++) {
+ acc[k] = vmaxq_f32(
+ vdupq_n_f32(output_activation_min),
+ vminq_f32(vdupq_n_f32(output_activation_max), acc[k]));
+ }
+ for (int k = 0; k < 4; k++) {
+ vst1q_f32(output_ptr + 4 * k, acc[k]);
+ }
+ output_ptr += 16;
+ }
+ // Handle 4 values at a time
+ for (; i <= num_output_values - 4; i += 4) {
+ float32x4_t acc = vld1q_f32(acc_buffer + i);
+
+ acc = vmaxq_f32(vdupq_n_f32(output_activation_min),
+ vminq_f32(vdupq_n_f32(output_activation_max), acc));
+
+ vst1q_f32(output_ptr, acc);
+ output_ptr += 4;
+ }
+#endif
+ // Handle leftover values, one by one. This is very slow.
+ for (; i < num_output_values; i++) {
+ float acc = acc_buffer[i];
+ acc = std::max(output_activation_min,
+ std::min(output_activation_max, acc));
+
+ *output_ptr++ = acc;
+ }
+ }
+ }
+ }
+}
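
A small standalone illustration (values invented) of how the stack accumulator above is tiled along an output row: the 2048-float buffer holds floor(2048 / output_depth) output pixels per pass, and the out_x_buffer loop walks the row in chunks of that size.

```cpp
#include <algorithm>
#include <cstdio>

int main() {
  const int kAccBufferMaxSize = 2048;
  const int output_depth = 72;   // example value
  const int output_width = 100;  // example value
  const int pixels_per_pass = kAccBufferMaxSize / output_depth;  // 28
  for (int start = 0; start < output_width; start += pixels_per_pass) {
    const int end = std::min(output_width, start + pixels_per_pass);
    std::printf("accumulate and store output x in [%d, %d)\n", start, end);
  }
  return 0;
}
```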
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void DepthwiseConv(const float* input_data, const Dims<4>& input_dims,
+ const float* filter_data, const Dims<4>& filter_dims,
+ const float* bias_data, const Dims<4>& bias_dims,
+ int stride_width, int stride_height, int pad_width,
+ int pad_height, int depth_multiplier, float* output_data,
+ const Dims<4>& output_dims) {
+ float output_activation_min, output_activation_max;
+ GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
+ DepthwiseConv(input_data, input_dims, filter_data, filter_dims, bias_data,
+ bias_dims, stride_width, stride_height, pad_width, pad_height,
+ depth_multiplier, output_activation_min, output_activation_max,
+ output_data, output_dims);
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void DepthwiseConv(const float* input_data, const Dims<4>& input_dims,
+ const float* filter_data, const Dims<4>& filter_dims,
+ const float* bias_data, const Dims<4>& bias_dims, int stride,
+ int pad_width, int pad_height, int depth_multiplier,
+ float* output_data, const Dims<4>& output_dims) {
+ DepthwiseConv<Ac>(input_data, input_dims, filter_data, filter_dims, bias_data,
+ bias_dims, stride, stride, pad_width, pad_height,
+ depth_multiplier, output_data, output_dims);
+}
+
+} // namespace optimized_ops
+} // namespace tflite
+
+#endif // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_FLOAT_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8.h b/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8.h
new file mode 100644
index 0000000000..051ed2a2c4
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8.h
@@ -0,0 +1,1916 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_UINT8_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_UINT8_H_
+
+#include "fixedpoint/fixedpoint.h"
+#include "public/gemmlowp.h"
+#include "tensorflow/contrib/lite/kernels/internal/common.h"
+#include "tensorflow/contrib/lite/kernels/internal/types.h"
+
+namespace tflite {
+namespace optimized_ops {
+
+// Implementation of quantized DepthwiseConv
+
+template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier>
+struct QuantizedDepthwiseConvKernel {};
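
Before the specialized NEON kernels, a scalar sketch (illustration only; QuantizedMacc is a made-up name) of the arithmetic they all implement: uint8 operands are widened to 16 bits, the zero-point corrections input_offset and filter_offset are added, and the products are accumulated into 32-bit integers, matching the vmovl/vadd/vmlal sequences below. Shown here for the depth_multiplier == 1 case; kernels with a larger multiplier first duplicate the input lanes.

```cpp
#include <cstdint>

// One quantized multiply-accumulate step over n matched input/filter values.
inline void QuantizedMacc(const uint8_t* input, const uint8_t* filter,
                          int16_t input_offset, int16_t filter_offset, int n,
                          int32_t* acc) {
  for (int i = 0; i < n; ++i) {
    const int16_t input_val = static_cast<int16_t>(input[i]) + input_offset;
    const int16_t filter_val = static_cast<int16_t>(filter[i]) + filter_offset;
    acc[i] += static_cast<int32_t>(input_val) * filter_val;
  }
}
```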
+
+#ifdef USE_NEON
+template <>
+struct QuantizedDepthwiseConvKernel<true, 8, 2> {
+ static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+ const uint8* input_ptr, int16 input_offset,
+ int input_ptr_increment, const uint8* filter_ptr,
+ int16 filter_offset, int32* acc_buffer_ptr) {
+ // Load the filters, add filter_offset.
+ uint8x8x2_t filter_u8;
+ filter_u8.val[0] = vld1_u8(filter_ptr);
+ filter_u8.val[1] = vld1_u8(filter_ptr + 8);
+ int16x8_t filter[2];
+ for (int i = 0; i < 2; i++) {
+ filter[i] = vaddq_s16(vreinterpretq_s16_u16(vmovl_u8(filter_u8.val[i])),
+ vdupq_n_s16(filter_offset));
+ }
+ // Handle one output pixel at a time.
+ for (int outp = 0; outp < num_output_pixels; outp++) {
+ // Load the accumulators from acc_buffer
+ int32x4x2_t acc[2];
+ for (int i = 0; i < 2; i++) {
+ acc[i].val[0] = vld1q_s32(acc_buffer_ptr + 4 * i);
+ acc[i].val[1] = vld1q_s32(acc_buffer_ptr + 4 * i + 8);
+ }
+ // Load the inputs, add input_offset.
+ const uint8x8_t input_u8 = vld1_u8(input_ptr);
+ input_ptr += input_ptr_increment;
+ const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
+ const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
+ // Duplicate the input values, 2-fold
+ const int16x8x2_t input_dup2 = vzipq_s16(input, input);
+ // Multiply-accumulate
+ for (int i = 0; i < 2; i++) {
+ acc[0].val[i] = vmlal_s16(acc[0].val[i], vget_low_s16(filter[i]),
+ vget_low_s16(input_dup2.val[i]));
+ acc[1].val[i] = vmlal_s16(acc[1].val[i], vget_high_s16(filter[i]),
+ vget_high_s16(input_dup2.val[i]));
+ }
+ // Store the accumulators back to acc_buffer
+ for (int i = 0; i < 2; i++) {
+ vst1q_s32(acc_buffer_ptr + 4 * i, acc[i].val[0]);
+ vst1q_s32(acc_buffer_ptr + 4 * i + 8, acc[i].val[1]);
+ }
+ acc_buffer_ptr += 16;
+ }
+ }
+};
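
Several kernels in this file "duplicate the input values, 2-fold" with vzipq_s16(input, input). A scalar picture (illustration only) of the resulting lane order, which lines up with the filter layout of a depth_multiplier == 2 kernel:

```cpp
#include <cstdint>
#include <cstdio>

int main() {
  const int16_t input[8] = {0, 1, 2, 3, 4, 5, 6, 7};
  int16_t dup2[16];
  for (int i = 0; i < 8; ++i) {
    // Same lanes as concatenating the two halves of vzipq_s16(input, input).
    dup2[2 * i + 0] = input[i];
    dup2[2 * i + 1] = input[i];
  }
  for (int i = 0; i < 16; ++i) std::printf("%d ", dup2[i]);  // 0 0 1 1 2 2 ...
  std::printf("\n");
  return 0;
}
```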
+
+template <>
+struct QuantizedDepthwiseConvKernel<false, 8, 1> {
+ static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+ const uint8* input_ptr, int16 input_offset,
+ int input_ptr_increment, const uint8* filter_ptr,
+ int16 filter_offset, int32* acc_buffer_ptr) {
+ // Load the filters, add filter_offset.
+ const uint8x8_t filter_u8 = vld1_u8(filter_ptr);
+ const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
+ const int16x8_t filter = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
+
+ int outp = 0;
+ // Handle 2 output pixels at a time.
+ for (; outp <= num_output_pixels - 2; outp += 2) {
+ // Load the accumulators from acc_buffer.
+ int32x4_t acc[4];
+ for (int i = 0; i < 4; i++) {
+ acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+ }
+ // Load the inputs, add input_offset.
+ uint8x8_t input_u8[2];
+ for (int i = 0; i < 2; i++) {
+ input_u8[i] = vld1_u8(input_ptr + 8 * i);
+ }
+ input_ptr += 16;
+ int16x8_t input[2];
+ for (int i = 0; i < 2; i++) {
+ input[i] = vreinterpretq_s16_u16(vmovl_u8(input_u8[i]));
+ }
+ for (int i = 0; i < 2; i++) {
+ input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset));
+ }
+ // Multiply-accumulate.
+ acc[0] = vmlal_s16(acc[0], vget_low_s16(filter), vget_low_s16(input[0]));
+ acc[1] =
+ vmlal_s16(acc[1], vget_high_s16(filter), vget_high_s16(input[0]));
+ acc[2] = vmlal_s16(acc[2], vget_low_s16(filter), vget_low_s16(input[1]));
+ acc[3] =
+ vmlal_s16(acc[3], vget_high_s16(filter), vget_high_s16(input[1]));
+ // Store the accumulators back to acc_buffer
+ for (int i = 0; i < 4; i++) {
+ vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+ }
+ acc_buffer_ptr += 16;
+ }
+ // Handle 1 output pixel at a time.
+ for (; outp < num_output_pixels; outp++) {
+ // Load the accumulators from acc_buffer.
+ int32x4_t acc[2];
+ acc[0] = vld1q_s32(acc_buffer_ptr);
+ acc[1] = vld1q_s32(acc_buffer_ptr + 4);
+
+ // Load the inputs, add input_offset.
+ const uint8x8_t input_u8 = vld1_u8(input_ptr);
+ input_ptr += 8;
+ const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
+ const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
+ // Multiply-accumulate.
+ acc[0] = vmlal_s16(acc[0], vget_low_s16(filter), vget_low_s16(input));
+ acc[1] = vmlal_s16(acc[1], vget_high_s16(filter), vget_high_s16(input));
+ // Store the accumulators back to acc_buffer
+ vst1q_s32(acc_buffer_ptr, acc[0]);
+ vst1q_s32(acc_buffer_ptr + 4, acc[1]);
+ acc_buffer_ptr += 8;
+ }
+ }
+};
+
+template <>
+struct QuantizedDepthwiseConvKernel<false, 4, 2> {
+ static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+ const uint8* input_ptr, int16 input_offset,
+ int input_ptr_increment, const uint8* filter_ptr,
+ int16 filter_offset, int32* acc_buffer_ptr) {
+ // Load the filters, add filter_offset.
+ const uint8x8_t filter_u8 = vld1_u8(filter_ptr);
+ const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
+ const int16x8_t filter = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
+
+ int outp = 0;
+ // Handle 2 output pixels at a time.
+ for (; outp <= num_output_pixels - 2; outp += 2) {
+ // Load the accumulators from acc_buffer
+ int32x4_t acc[4];
+ for (int i = 0; i < 4; i++) {
+ acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+ }
+ // Load the inputs, add input_offset.
+ const uint8x8_t input_u8 = vld1_u8(input_ptr);
+ input_ptr += 8;
+ const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
+ const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
+ // Duplicate the input values, 2-fold
+ const int16x8x2_t input_dup2 = vzipq_s16(input, input);
+ // Multiply-accumulate
+ for (int i = 0; i < 2; i++) {
+ acc[2 * i + 0] = vmlal_s16(acc[2 * i + 0], vget_low_s16(filter),
+ vget_low_s16(input_dup2.val[i]));
+ acc[2 * i + 1] = vmlal_s16(acc[2 * i + 1], vget_high_s16(filter),
+ vget_high_s16(input_dup2.val[i]));
+ }
+ // Store the accumulators back to acc_buffer
+ for (int i = 0; i < 4; i++) {
+ vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+ }
+ acc_buffer_ptr += 16;
+ }
+ // Handle one output pixel at a time.
+ for (; outp < num_output_pixels; outp++) {
+ // Load the accumulators from acc_buffer
+ int32x4_t acc[2];
+ for (int i = 0; i < 2; i++) {
+ acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+ }
+ // Load the inputs, add input_offset.
+ uint8x8_t input_u8 = vdup_n_u8(0);
+ input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
+ input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
+ input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
+ input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
+ input_ptr += 4;
+ const int16x4_t input_s16 =
+ vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
+ const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
+ // Duplicate the input values, 2-fold
+ const int16x4x2_t input_dup2 = vzip_s16(input, input);
+ // Multiply-accumulate
+ acc[0] = vmlal_s16(acc[0], vget_low_s16(filter), input_dup2.val[0]);
+ acc[1] = vmlal_s16(acc[1], vget_high_s16(filter), input_dup2.val[1]);
+ // Store the accumulators back to acc_buffer
+ for (int i = 0; i < 2; i++) {
+ vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+ }
+ acc_buffer_ptr += 8;
+ }
+ }
+};
+
+template <>
+struct QuantizedDepthwiseConvKernel<false, 2, 8> {
+ static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+ const uint8* input_ptr, int16 input_offset,
+ int input_ptr_increment, const uint8* filter_ptr,
+ int16 filter_offset, int32* acc_buffer_ptr) {
+ // Load the filters, add filter_offset.
+ int16x8_t filter[2];
+ for (int i = 0; i < 2; i++) {
+ const uint8x8_t filter_u8 = vld1_u8(filter_ptr + 8 * i);
+ const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
+ filter[i] = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
+ }
+ int outp = 0;
+ // Handle two output pixels at a time.
+ for (; outp <= num_output_pixels - 2; outp += 2) {
+ // Load the accumulators from acc_buffer.
+ int32x4_t acc[8];
+ for (int i = 0; i < 8; i++) {
+ acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+ }
+ // Load the inputs, add input_offset.
+ uint8x8_t input_u8 = vdup_n_u8(0);
+ input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
+ input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
+ input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
+ input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
+ input_ptr += 4;
+ const int16x4_t input_s16 =
+ vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
+ const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
+ // Multiply-accumulate.
+ acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), input, 0);
+ acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), input, 0);
+ acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), input, 1);
+ acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), input, 1);
+ acc[4] = vmlal_lane_s16(acc[4], vget_low_s16(filter[0]), input, 2);
+ acc[5] = vmlal_lane_s16(acc[5], vget_high_s16(filter[0]), input, 2);
+ acc[6] = vmlal_lane_s16(acc[6], vget_low_s16(filter[1]), input, 3);
+ acc[7] = vmlal_lane_s16(acc[7], vget_high_s16(filter[1]), input, 3);
+ // Store the accumulators back to acc_buffer.
+ for (int i = 0; i < 8; i++) {
+ vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+ }
+ acc_buffer_ptr += 32;
+ }
+ // Handle one output pixel at a time.
+ for (; outp < num_output_pixels; outp++) {
+ // Load the accumulators from acc_buffer.
+ int32x4_t acc[4];
+ for (int i = 0; i < 4; i++) {
+ acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+ }
+ // Load the inputs, add input_offset.
+ uint8x8_t input_u8 = vdup_n_u8(0);
+ input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
+ input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
+ input_ptr += 2;
+ const int16x4_t input_s16 =
+ vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
+ const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
+
+ // Multiply-accumulate.
+ acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), input, 0);
+ acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), input, 0);
+ acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), input, 1);
+ acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), input, 1);
+
+ // Store the accumulators back to acc_buffer.
+ for (int i = 0; i < 4; i++) {
+ vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+ }
+ acc_buffer_ptr += 16;
+ }
+ }
+};
+
+template <>
+struct QuantizedDepthwiseConvKernel<false, 2, 2> {
+ static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+ const uint8* input_ptr, int16 input_offset,
+ int input_ptr_increment, const uint8* filter_ptr,
+ int16 filter_offset, int32* acc_buffer_ptr) {
+ // Load the filters, add filter_offset.
+ uint8x8_t filter_u8 = vdup_n_u8(0);
+ filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
+ filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
+ filter_u8 = vset_lane_u8(filter_ptr[2], filter_u8, 2);
+ filter_u8 = vset_lane_u8(filter_ptr[3], filter_u8, 3);
+ const int16x4_t filter_s16 =
+ vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
+ const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));
+
+ int outp = 0;
+ // Handle 4 output pixels at a time.
+ for (; outp <= num_output_pixels - 4; outp += 4) {
+ // Load the accumulators from acc_buffer
+ int32x4_t acc[4];
+ for (int i = 0; i < 4; i++) {
+ acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+ }
+
+ // Load the inputs, add input_offset.
+ const uint8x8_t input_u8 = vld1_u8(input_ptr);
+ input_ptr += 8;
+ const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
+ const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
+ // Duplicate the input values, 2-fold
+ const int16x8x2_t input_dup2 = vzipq_s16(input, input);
+ // Multiply-accumulate
+ acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input_dup2.val[0]));
+ acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input_dup2.val[0]));
+ acc[2] = vmlal_s16(acc[2], filter, vget_low_s16(input_dup2.val[1]));
+ acc[3] = vmlal_s16(acc[3], filter, vget_high_s16(input_dup2.val[1]));
+ // Store the accumulators back to acc_buffer
+ for (int i = 0; i < 4; i++) {
+ vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+ }
+ acc_buffer_ptr += 16;
+ }
+ // Handle one output pixel at a time.
+ for (; outp < num_output_pixels; outp++) {
+ // Load the accumulators from acc_buffer
+ int32x4_t acc = vld1q_s32(acc_buffer_ptr);
+
+ uint8x8_t input_u8 = vdup_n_u8(0);
+ input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
+ input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
+ input_ptr += 2;
+ const int16x4_t input_s16 =
+ vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
+ const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
+ // Duplicate the input values, 2-fold
+ const int16x4_t input_dup2 = vzip_s16(input, input).val[0];
+ // Multiply-accumulate
+ acc = vmlal_s16(acc, filter, input_dup2);
+ // Store the accumulators back to acc_buffer
+ vst1q_s32(acc_buffer_ptr, acc);
+ acc_buffer_ptr += 4;
+ }
+ }
+};
+
+template <>
+struct QuantizedDepthwiseConvKernel<false, 2, 1> {
+ static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+ const uint8* input_ptr, int16 input_offset,
+ int input_ptr_increment, const uint8* filter_ptr,
+ int16 filter_offset, int32* acc_buffer_ptr) {
+ // Load the filters, add filter_offset.
+ uint8x8_t filter_u8 = vdup_n_u8(0);
+ filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
+ filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
+ filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 2);
+ filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 3);
+ const int16x4_t filter_s16 =
+ vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
+ const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));
+
+ int outp = 0;
+ // Handle 8 output pixels at a time.
+ for (; outp <= num_output_pixels - 8; outp += 8) {
+ // Load the accumulators from acc_buffer.
+ int32x4_t acc[4];
+ for (int i = 0; i < 4; i++) {
+ acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+ }
+ // Load the inputs, add input_offset.
+ uint8x8_t input_u8[2];
+ for (int i = 0; i < 2; i++) {
+ input_u8[i] = vld1_u8(input_ptr + 8 * i);
+ }
+ input_ptr += 16;
+ int16x8_t input[2];
+ for (int i = 0; i < 2; i++) {
+ input[i] = vreinterpretq_s16_u16(vmovl_u8(input_u8[i]));
+ }
+ for (int i = 0; i < 2; i++) {
+ input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset));
+ }
+
+ // Multiply-accumulate.
+ acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input[0]));
+ acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input[0]));
+ acc[2] = vmlal_s16(acc[2], filter, vget_low_s16(input[1]));
+ acc[3] = vmlal_s16(acc[3], filter, vget_high_s16(input[1]));
+ // Store the accumulators back to acc_buffer.
+ for (int i = 0; i < 4; i++) {
+ vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+ }
+ acc_buffer_ptr += 16;
+ }
+ // Handle 4 output pixels at a time.
+ for (; outp <= num_output_pixels - 4; outp += 4) {
+ // Load the accumulators from acc_buffer.
+ int32x4_t acc[2];
+ for (int i = 0; i < 2; i++) {
+ acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+ }
+ // Load the inputs, add input_offset.
+ const uint8x8_t input_u8 = vld1_u8(input_ptr);
+ input_ptr += 8;
+ const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
+ const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
+
+ // Multiply-accumulate.
+ acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input));
+ acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input));
+ // Store the accumulators back to acc_buffer.
+ for (int i = 0; i < 2; i++) {
+ vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+ }
+ acc_buffer_ptr += 8;
+ }
+ // Handle 2 output pixels at a time.
+ for (; outp <= num_output_pixels - 2; outp += 2) {
+ // Load the accumulators from acc_buffer.
+ int32x4_t acc = vld1q_s32(acc_buffer_ptr);
+ // Load the inputs, add input_offset.
+ uint8x8_t input_u8 = vdup_n_u8(0);
+ input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
+ input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
+ input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
+ input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
+ input_ptr += 4;
+ const int16x4_t input_s16 =
+ vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
+ const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
+
+ // Multiply-accumulate.
+ acc = vmlal_s16(acc, filter, input);
+ // Store the accumulators back to acc_buffer.
+ vst1q_s32(acc_buffer_ptr, acc);
+ acc_buffer_ptr += 4;
+ }
+ // Handle 1 output pixel at a time.
+ for (; outp < num_output_pixels; outp++) {
+ // Load the accumulators from acc_buffer.
+ int32x2_t acc = vld1_s32(acc_buffer_ptr);
+ // Load the inputs, add input_offset.
+ uint8x8_t input_u8 = vdup_n_u8(0);
+ input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
+ input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
+ input_ptr += 2;
+ const int16x4_t input_s16 =
+ vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
+ const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
+
+ // Multiply-accumulate.
+ acc = vget_low_s32(vmlal_s16(vcombine_s32(acc, acc), filter, input));
+ // Store the accumulators back to acc_buffer.
+ vst1_s32(acc_buffer_ptr, acc);
+ acc_buffer_ptr += 2;
+ }
+ }
+};
+
+template <>
+struct QuantizedDepthwiseConvKernel<false, 1, 2> {
+ static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+ const uint8* input_ptr, int16 input_offset,
+ int input_ptr_increment, const uint8* filter_ptr,
+ int16 filter_offset, int32* acc_buffer_ptr) {
+ // Load the filters, add filter_offset.
+ uint8x8_t filter_u8 = vdup_n_u8(0);
+ filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
+ filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
+ filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 2);
+ filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 3);
+ const int16x4_t filter_s16 =
+ vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
+ const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));
+
+ int outp = 0;
+ // Handle 8 output pixels at a time.
+ for (; outp <= num_output_pixels - 8; outp += 8) {
+ // Load the accumulators from acc_buffer
+ int32x4_t acc[4];
+ for (int i = 0; i < 4; i++) {
+ acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+ }
+
+ // Load the inputs, add input_offset.
+ const uint8x8_t input_u8 = vld1_u8(input_ptr);
+ input_ptr += 8;
+ const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
+ const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
+ // Duplicate the input values, 2-fold
+ const int16x8x2_t input_dup2 = vzipq_s16(input, input);
+ // Multiply-accumulate
+ acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input_dup2.val[0]));
+ acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input_dup2.val[0]));
+ acc[2] = vmlal_s16(acc[2], filter, vget_low_s16(input_dup2.val[1]));
+ acc[3] = vmlal_s16(acc[3], filter, vget_high_s16(input_dup2.val[1]));
+ // Store the accumulators back to acc_buffer
+ for (int i = 0; i < 4; i++) {
+ vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+ }
+ acc_buffer_ptr += 16;
+ }
+ // Handle one output pixel at a time.
+ for (; outp < num_output_pixels; outp++) {
+ // Load the accumulators from acc_buffer
+ int32x2_t acc = vld1_s32(acc_buffer_ptr);
+
+ // Load the inputs, add input_offset.
+ const uint32 input = *input_ptr++ + input_offset;
+
+ // Multiply-accumulate
+ acc = vget_low_s32(vmlal_n_s16(vcombine_s32(acc, acc), filter, input));
+ // Store the accumulators back to acc_buffer
+ vst1_s32(acc_buffer_ptr, acc);
+ acc_buffer_ptr += 2;
+ }
+ }
+};
+
+template <>
+struct QuantizedDepthwiseConvKernel<false, 1, 4> {
+ static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+ const uint8* input_ptr, int16 input_offset,
+ int input_ptr_increment, const uint8* filter_ptr,
+ int16 filter_offset, int32* acc_buffer_ptr) {
+ // Load the filters, add filter_offset.
+ uint8x8_t filter_u8 = vdup_n_u8(0);
+ filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
+ filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
+ filter_u8 = vset_lane_u8(filter_ptr[2], filter_u8, 2);
+ filter_u8 = vset_lane_u8(filter_ptr[3], filter_u8, 3);
+ const int16x4_t filter_s16 =
+ vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
+ const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));
+
+ int outp = 0;
+ // Handle 8 output pixels at a time.
+ for (; outp <= num_output_pixels - 8; outp += 8) {
+ // Load the accumulators from acc_buffer
+ int32x4_t acc[8];
+ for (int i = 0; i < 8; i++) {
+ acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+ }
+
+ // Load the inputs, add input_offset.
+ uint8x8_t input_u8 = vld1_u8(input_ptr);
+ input_ptr += 8;
+ const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
+ const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
+
+ // Multiply-accumulate
+ acc[0] = vmlal_lane_s16(acc[0], filter, vget_low_s16(input), 0);
+ acc[1] = vmlal_lane_s16(acc[1], filter, vget_low_s16(input), 1);
+ acc[2] = vmlal_lane_s16(acc[2], filter, vget_low_s16(input), 2);
+ acc[3] = vmlal_lane_s16(acc[3], filter, vget_low_s16(input), 3);
+ acc[4] = vmlal_lane_s16(acc[4], filter, vget_high_s16(input), 0);
+ acc[5] = vmlal_lane_s16(acc[5], filter, vget_high_s16(input), 1);
+ acc[6] = vmlal_lane_s16(acc[6], filter, vget_high_s16(input), 2);
+ acc[7] = vmlal_lane_s16(acc[7], filter, vget_high_s16(input), 3);
+
+ // Store the accumulators back to acc_buffer
+ for (int i = 0; i < 8; i++) {
+ vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+ }
+ acc_buffer_ptr += 32;
+ }
+ // Handle 4 output pixels at a time.
+ for (; outp <= num_output_pixels - 4; outp += 4) {
+ // Load the accumulators from acc_buffer
+ int32x4_t acc[4];
+ for (int i = 0; i < 4; i++) {
+ acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+ }
+
+ // Load the inputs, add input_offset.
+ uint8x8_t input_u8 = vdup_n_u8(0);
+ input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
+ input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
+ input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
+ input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
+ input_ptr += 4;
+ const int16x4_t input_s16 =
+ vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
+ const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
+
+ // Multiply-accumulate
+ acc[0] = vmlal_lane_s16(acc[0], filter, input, 0);
+ acc[1] = vmlal_lane_s16(acc[1], filter, input, 1);
+ acc[2] = vmlal_lane_s16(acc[2], filter, input, 2);
+ acc[3] = vmlal_lane_s16(acc[3], filter, input, 3);
+
+ // Store the accumulators back to acc_buffer
+ for (int i = 0; i < 4; i++) {
+ vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+ }
+ acc_buffer_ptr += 16;
+ }
+ // Handle one output pixel at a time.
+ for (; outp < num_output_pixels; outp++) {
+ // Load the accumulators from acc_buffer
+ int32x4_t acc = vld1q_s32(acc_buffer_ptr);
+
+ // Load the inputs, add input_offset.
+ const uint32 input = *input_ptr++ + input_offset;
+
+ // Multiply-accumulate
+ acc = vmlal_n_s16(acc, filter, input);
+ // Store the accumulators back to acc_buffer
+ vst1q_s32(acc_buffer_ptr, acc);
+ acc_buffer_ptr += 4;
+ }
+ }
+};
+
+template <>
+struct QuantizedDepthwiseConvKernel<false, 4, 1> {
+ static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+ const uint8* input_ptr, int16 input_offset,
+ int input_ptr_increment, const uint8* filter_ptr,
+ int16 filter_offset, int32* acc_buffer_ptr) {
+ // Load the filters, add filter_offset.
+ uint8x8_t filter_u8 = vdup_n_u8(0);
+ filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
+ filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
+ filter_u8 = vset_lane_u8(filter_ptr[2], filter_u8, 2);
+ filter_u8 = vset_lane_u8(filter_ptr[3], filter_u8, 3);
+ const int16x4_t filter_s16 =
+ vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
+ const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));
+
+ int outp = 0;
+ // Handle 4 output pixels at a time.
+ for (; outp <= num_output_pixels - 4; outp += 4) {
+ // Load the accumulators from acc_buffer
+ int32x4_t acc[4];
+ for (int i = 0; i < 4; i++) {
+ acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+ }
+ // Load the inputs, add input_offset.
+ int16x8_t input[2];
+ for (int i = 0; i < 2; i++) {
+ const uint8x8_t input_u8 = vld1_u8(input_ptr + 8 * i);
+ const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
+ input[i] = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
+ }
+ input_ptr += 16;
+ // Multiply-accumulate
+ for (int i = 0; i < 2; i++) {
+ acc[2 * i + 0] =
+ vmlal_s16(acc[2 * i + 0], filter, vget_low_s16(input[i]));
+ acc[2 * i + 1] =
+ vmlal_s16(acc[2 * i + 1], filter, vget_high_s16(input[i]));
+ }
+ // Store the accumulators back to acc_buffer
+ for (int i = 0; i < 4; i++) {
+ vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+ }
+ acc_buffer_ptr += 16;
+ }
+ // Handle one output pixel at a time.
+ for (; outp < num_output_pixels; outp++) {
+ // Load the accumulators from acc_buffer
+ int32x4_t acc;
+ acc = vld1q_s32(acc_buffer_ptr);
+
+ // Load the inputs, add input_offset.
+ uint8x8_t input_u8 = vdup_n_u8(0);
+ input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
+ input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
+ input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
+ input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
+ input_ptr += 4;
+ const int16x4_t input_s16 =
+ vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
+ const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
+ // Multiply-accumulate
+ acc = vmlal_s16(acc, filter, input);
+ // Store the accumulators back to acc_buffer
+ vst1q_s32(acc_buffer_ptr, acc);
+ acc_buffer_ptr += 4;
+ }
+ }
+};
+
+template <>
+struct QuantizedDepthwiseConvKernel<false, 4, 4> {
+ static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+ const uint8* input_ptr, int16 input_offset,
+ int input_ptr_increment, const uint8* filter_ptr,
+ int16 filter_offset, int32* acc_buffer_ptr) {
+ // Load the filters, add filter_offset.
+ int16x8_t filter[2];
+ for (int i = 0; i < 2; i++) {
+ const uint8x8_t filter_u8 = vld1_u8(filter_ptr + 8 * i);
+ const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
+ filter[i] = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
+ }
+
+ int outp = 0;
+ // Handle 2 output pixels at a time.
+ for (; outp <= num_output_pixels - 2; outp += 2) {
+ // Load the accumulators from acc_buffer
+ int32x4_t acc[8];
+ for (int i = 0; i < 8; i++) {
+ acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+ }
+
+ // Load the inputs, add input_offset.
+ uint8x8_t input_u8 = vld1_u8(input_ptr);
+ input_ptr += 8;
+ const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
+ const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
+
+ // Multiply-accumulate
+ acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]),
+ vget_low_s16(input), 0);
+ acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]),
+ vget_low_s16(input), 1);
+ acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]),
+ vget_low_s16(input), 2);
+ acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]),
+ vget_low_s16(input), 3);
+ acc[4] = vmlal_lane_s16(acc[4], vget_low_s16(filter[0]),
+ vget_high_s16(input), 0);
+ acc[5] = vmlal_lane_s16(acc[5], vget_high_s16(filter[0]),
+ vget_high_s16(input), 1);
+ acc[6] = vmlal_lane_s16(acc[6], vget_low_s16(filter[1]),
+ vget_high_s16(input), 2);
+ acc[7] = vmlal_lane_s16(acc[7], vget_high_s16(filter[1]),
+ vget_high_s16(input), 3);
+ // Store the accumulators back to acc_buffer
+ for (int i = 0; i < 8; i++) {
+ vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+ }
+ acc_buffer_ptr += 32;
+ }
+ // Handle one output pixel at a time.
+ for (; outp < num_output_pixels; outp++) {
+ // Load the accumulators from acc_buffer
+ int32x4_t acc[4];
+ for (int i = 0; i < 4; i++) {
+ acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+ }
+
+ // Load the inputs, add input_offset.
+ uint8x8_t input_u8 = vdup_n_u8(0);
+ input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
+ input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
+ input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
+ input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
+ input_ptr += 4;
+ const int16x4_t input_s16 =
+ vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
+ const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
+
+ // Multiply-accumulate
+ acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), input, 0);
+ acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), input, 1);
+ acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), input, 2);
+ acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), input, 3);
+ // Store the accumulators back to acc_buffer
+ for (int i = 0; i < 4; i++) {
+ vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+ }
+ acc_buffer_ptr += 16;
+ }
+ }
+};
+
+template <>
+struct QuantizedDepthwiseConvKernel<true, 0, 3> {
+ static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+ const uint8* input_ptr, int16 input_offset,
+ int input_ptr_increment, const uint8* filter_ptr,
+ int16 filter_offset, int32* acc_buffer_ptr) {
+ // We will have to duplicate bytes in a NEON register, 3-fold.
+    // We will do that by register-level table lookups using VTBL instructions.
+ // Here we prepare the registers containing the table-lookup indices.
+ static const uint8 dup3_indices_array[3][8] = {{0, 0, 0, 1, 1, 1, 2, 2},
+ {2, 3, 3, 3, 4, 4, 4, 5},
+ {5, 5, 6, 6, 6, 7, 7, 7}};
+ uint8x8_t dup3_indices[3];
+ for (int i = 0; i < 3; i++) {
+ dup3_indices[i] = vld1_u8(dup3_indices_array[i]);
+ }
+
+ // Handle one output pixel at a time.
+ for (int outp = 0; outp < num_output_pixels; outp++) {
+ const uint8* local_filter_ptr = filter_ptr;
+ const uint8* local_input_ptr = input_ptr;
+ int ic = 0;
+ // Handle 8 input channels at a time.
+ for (; ic <= input_depth - 8; ic += 8) {
+ // Load the filters, add filter_offset.
+ int16x8_t filter[3];
+ uint8x8x3_t filter_u8;
+ filter_u8.val[0] = vld1_u8(local_filter_ptr);
+ filter_u8.val[1] = vld1_u8(local_filter_ptr + 8);
+ filter_u8.val[2] = vld1_u8(local_filter_ptr + 16);
+ local_filter_ptr += 24;
+ for (int i = 0; i < 3; i++) {
+ const int16x8_t filter_s16 =
+ vreinterpretq_s16_u16(vmovl_u8(filter_u8.val[i]));
+ filter[i] = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
+ }
+ // Load the inputs, duplicate 3-fold, add input_offset.
+ const uint8x8_t input_u8 = vld1_u8(local_input_ptr);
+ local_input_ptr += 8;
+
+ uint8x8_t input_u8_dup3[3];
+ for (int i = 0; i < 3; i++) {
+ input_u8_dup3[i] = vtbl1_u8(input_u8, dup3_indices[i]);
+ }
+ int16x8_t input_dup3[3];
+ for (int i = 0; i < 3; i++) {
+ const int16x8_t input_s16_dup3 =
+ vreinterpretq_s16_u16(vmovl_u8(input_u8_dup3[i]));
+ input_dup3[i] = vaddq_s16(input_s16_dup3, vdupq_n_s16(input_offset));
+ }
+ // Load the accumulators from acc_buffer
+ int32x4x3_t acc[2];
+ for (int i = 0; i < 2; i++) {
+ acc[i].val[0] = vld1q_s32(acc_buffer_ptr + 4 * i);
+ acc[i].val[1] = vld1q_s32(acc_buffer_ptr + 4 * i + 8);
+ acc[i].val[2] = vld1q_s32(acc_buffer_ptr + 4 * i + 16);
+ }
+ // Multiply-accumulate
+ for (int j = 0; j < 3; j++) {
+ acc[0].val[j] = vmlal_s16(acc[0].val[j], vget_low_s16(input_dup3[j]),
+ vget_low_s16(filter[j]));
+ acc[1].val[j] = vmlal_s16(acc[1].val[j], vget_high_s16(input_dup3[j]),
+ vget_high_s16(filter[j]));
+ }
+ // Store the accumulators back to acc_buffer
+ for (int i = 0; i < 2; i++) {
+ vst1q_s32(acc_buffer_ptr + 4 * i, acc[i].val[0]);
+ vst1q_s32(acc_buffer_ptr + 4 * i + 8, acc[i].val[1]);
+ vst1q_s32(acc_buffer_ptr + 4 * i + 16, acc[i].val[2]);
+ }
+ acc_buffer_ptr += 24;
+ }
+ // Handle one input channel at a time.
+ for (; ic < input_depth; ic++) {
+ const int16 input_val = *local_input_ptr++ + input_offset;
+ for (int i = 0; i < 3; i++) {
+ const int16 filter_val = local_filter_ptr[i] + filter_offset;
+ *acc_buffer_ptr++ += static_cast<int32>(filter_val) * input_val;
+ }
+ local_filter_ptr += 3;
+ }
+ input_ptr += input_ptr_increment;
+ }
+ }
+};
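
The dup3_indices tables above drive vtbl1_u8 to repeat each of 8 input bytes three times, so the duplicated inputs line up with the 3 filter values per input channel. A scalar equivalent (illustration only) that produces the same 24-byte layout:

```cpp
#include <cstdint>
#include <cstdio>

int main() {
  const uint8_t input[8] = {10, 11, 12, 13, 14, 15, 16, 17};
  uint8_t dup3[24];
  for (int i = 0; i < 8; ++i) {
    for (int k = 0; k < 3; ++k) {
      dup3[3 * i + k] = input[i];  // indices 0,0,0,1,1,1,...,7,7,7
    }
  }
  for (int i = 0; i < 24; ++i) std::printf("%d ", dup3[i]);
  std::printf("\n");
  return 0;
}
```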
+
+template <>
+struct QuantizedDepthwiseConvKernel<true, 0, 2> {
+ static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+ const uint8* input_ptr, int16 input_offset,
+ int input_ptr_increment, const uint8* filter_ptr,
+ int16 filter_offset, int32* acc_buffer_ptr) {
+ // Handle one output pixel at a time.
+ for (int outp = 0; outp < num_output_pixels; outp++) {
+ const uint8* local_filter_ptr = filter_ptr;
+ const uint8* local_input_ptr = input_ptr;
+ int ic = 0;
+ // Handle 8 input channels at a time.
+ for (; ic <= input_depth - 8; ic += 8) {
+ // Load the filters, add filter_offset.
+ int16x8_t filter[2];
+ uint8x8x2_t filter_u8;
+ filter_u8.val[0] = vld1_u8(local_filter_ptr);
+ filter_u8.val[1] = vld1_u8(local_filter_ptr + 8);
+ local_filter_ptr += 16;
+ for (int i = 0; i < 2; i++) {
+ const int16x8_t filter_s16 =
+ vreinterpretq_s16_u16(vmovl_u8(filter_u8.val[i]));
+ filter[i] = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
+ }
+ // Load the inputs, add input_offset, duplicate 2-fold.
+ const uint8x8_t input_u8 = vld1_u8(local_input_ptr);
+ local_input_ptr += 8;
+ const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
+ const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
+ const int16x8x2_t input_dup2 = vzipq_s16(input, input);
+ // Load the accumulators from acc_buffer.
+ int32x4x2_t acc[2];
+ for (int i = 0; i < 2; i++) {
+ acc[i].val[0] = vld1q_s32(acc_buffer_ptr + 4 * i);
+ acc[i].val[1] = vld1q_s32(acc_buffer_ptr + 4 * i + 8);
+ }
+ // Multiply-accumulate.
+ for (int j = 0; j < 2; j++) {
+ acc[0].val[j] = vmlal_s16(acc[0].val[j], vget_low_s16(filter[j]),
+ vget_low_s16(input_dup2.val[j]));
+ acc[1].val[j] = vmlal_s16(acc[1].val[j], vget_high_s16(filter[j]),
+ vget_high_s16(input_dup2.val[j]));
+ }
+ // Store the accumulators back to acc_buffer.
+ for (int i = 0; i < 2; i++) {
+ vst1q_s32(acc_buffer_ptr + 4 * i, acc[i].val[0]);
+ vst1q_s32(acc_buffer_ptr + 4 * i + 8, acc[i].val[1]);
+ }
+ acc_buffer_ptr += 16;
+ }
+ // Handle one input channel at a time.
+ for (; ic < input_depth; ic++) {
+ // Load the inputs.
+ const int16 input_val = *local_input_ptr++ + input_offset;
+ for (int i = 0; i < 2; i++) {
+ const int16 filter_val = local_filter_ptr[i] + filter_offset;
+ *acc_buffer_ptr++ += static_cast<int32>(filter_val) * input_val;
+ }
+ local_filter_ptr += 2;
+ }
+ input_ptr += input_ptr_increment;
+ }
+ }
+};
+
+template <>
+struct QuantizedDepthwiseConvKernel<true, 0, 1> {
+ static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+ const uint8* input_ptr, int16 input_offset,
+ int input_ptr_increment, const uint8* filter_ptr,
+ int16 filter_offset, int32* acc_buffer_ptr) {
+ // Handle one output pixel at a time.
+ for (int outp = 0; outp < num_output_pixels; outp++) {
+ const uint8* local_filter_ptr = filter_ptr;
+ const uint8* local_input_ptr = input_ptr;
+ int ic = 0;
+ // Handle 16 input channels at a time.
+ for (; ic <= input_depth - 16; ic += 16) {
+ // Load the filters, add filter_offset.
+ uint8x8_t filter_u8_0 = vld1_u8(local_filter_ptr + 8 * 0);
+ uint8x8_t filter_u8_1 = vld1_u8(local_filter_ptr + 8 * 1);
+ local_filter_ptr += 16;
+ int16x8_t filter_0 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_0));
+ int16x8_t filter_1 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_1));
+ filter_0 = vaddq_s16(filter_0, vdupq_n_s16(filter_offset));
+ filter_1 = vaddq_s16(filter_1, vdupq_n_s16(filter_offset));
+ // Load the inputs, add input_offset.
+ uint8x8_t input_u8_0 = vld1_u8(local_input_ptr + 8 * 0);
+ uint8x8_t input_u8_1 = vld1_u8(local_input_ptr + 8 * 1);
+ local_input_ptr += 16;
+ int16x8_t input_0 = vreinterpretq_s16_u16(vmovl_u8(input_u8_0));
+ int16x8_t input_1 = vreinterpretq_s16_u16(vmovl_u8(input_u8_1));
+ input_0 = vaddq_s16(input_0, vdupq_n_s16(input_offset));
+ input_1 = vaddq_s16(input_1, vdupq_n_s16(input_offset));
+ // Load the accumulators from acc_buffer
+ int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0);
+ int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1);
+ int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2);
+ int32x4_t acc_3 = vld1q_s32(acc_buffer_ptr + 4 * 3);
+ acc_0 = vmlal_s16(acc_0, vget_low_s16(input_0), vget_low_s16(filter_0));
+ acc_1 =
+ vmlal_s16(acc_1, vget_high_s16(input_0), vget_high_s16(filter_0));
+ acc_2 = vmlal_s16(acc_2, vget_low_s16(input_1), vget_low_s16(filter_1));
+ acc_3 =
+ vmlal_s16(acc_3, vget_high_s16(input_1), vget_high_s16(filter_1));
+ // Store the accumulators back to acc_buffer
+ vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0);
+ vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1);
+ vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2);
+ vst1q_s32(acc_buffer_ptr + 4 * 3, acc_3);
+ acc_buffer_ptr += 16;
+ }
+ // Handle 8 input channels at a time.
+ for (; ic <= input_depth - 8; ic += 8) {
+ // Load the filters, add filter_offset.
+ const uint8x8_t filter_u8 = vld1_u8(local_filter_ptr);
+ local_filter_ptr += 8;
+ const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
+ const int16x8_t filter =
+ vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
+ // Load the inputs, add input_offset.
+ const uint8x8_t input_u8 = vld1_u8(local_input_ptr);
+ local_input_ptr += 8;
+ const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
+ const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
+ // Load the accumulators from acc_buffer
+ int32x4_t acc[2];
+ for (int i = 0; i < 2; i++) {
+ acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+ }
+ // Multiply-accumulate
+ acc[0] = vmlal_s16(acc[0], vget_low_s16(input), vget_low_s16(filter));
+ acc[1] = vmlal_s16(acc[1], vget_high_s16(input), vget_high_s16(filter));
+ // Store the accumulators back to acc_buffer
+ for (int i = 0; i < 2; i++) {
+ vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+ }
+ acc_buffer_ptr += 8;
+ }
+ // Handle one input channel at a time.
+ for (; ic < input_depth; ic++) {
+ const int16 input_val = *local_input_ptr++ + input_offset;
+ const int16 filter_val = *local_filter_ptr++ + filter_offset;
+ *acc_buffer_ptr++ += static_cast<int32>(filter_val) * input_val;
+ }
+ input_ptr += input_ptr_increment;
+ }
+ }
+};
+
+template <>
+struct QuantizedDepthwiseConvKernel<true, 16, 1> {
+ static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+ const uint8* input_ptr, int16 input_offset,
+ int input_ptr_increment, const uint8* filter_ptr,
+ int16 filter_offset, int32* acc_buffer_ptr) {
+ // Load the filters, add filter_offset.
+ uint8x8_t filter_u8[2];
+ for (int i = 0; i < 2; i++) {
+ filter_u8[i] = vld1_u8(filter_ptr + 8 * i);
+ }
+ int16x8_t filter[2];
+ for (int i = 0; i < 2; i++) {
+ filter[i] = vreinterpretq_s16_u16(vmovl_u8(filter_u8[i]));
+ }
+ for (int i = 0; i < 2; i++) {
+ filter[i] = vaddq_s16(filter[i], vdupq_n_s16(filter_offset));
+ }
+ // Handle one output pixel at a time.
+ for (int outp = 0; outp < num_output_pixels; outp++) {
+ // Load the inputs, add input_offset.
+ uint8x8_t input_u8[2];
+ for (int i = 0; i < 2; i++) {
+ input_u8[i] = vld1_u8(input_ptr + 8 * i);
+ }
+ input_ptr += input_ptr_increment;
+ int16x8_t input[2];
+ for (int i = 0; i < 2; i++) {
+ input[i] = vreinterpretq_s16_u16(vmovl_u8(input_u8[i]));
+ }
+ for (int i = 0; i < 2; i++) {
+ input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset));
+ }
+ // Load the accumulators from acc_buffer
+ int32x4_t acc[4];
+ for (int i = 0; i < 4; i++) {
+ acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+ }
+ // Multiply-accumulate
+ for (int i = 0; i < 2; i++) {
+ acc[2 * i + 0] = vmlal_s16(acc[2 * i + 0], vget_low_s16(input[i]),
+ vget_low_s16(filter[i]));
+ acc[2 * i + 1] = vmlal_s16(acc[2 * i + 1], vget_high_s16(input[i]),
+ vget_high_s16(filter[i]));
+ }
+ // Store the accumulators back to acc_buffer
+ for (int i = 0; i < 4; i++) {
+ vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+ }
+ acc_buffer_ptr += 16;
+ }
+ }
+};
+
+template <>
+struct QuantizedDepthwiseConvKernel<true, 8, 1> {
+ static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+ const uint8* input_ptr, int16 input_offset,
+ int input_ptr_increment, const uint8* filter_ptr,
+ int16 filter_offset, int32* acc_buffer_ptr) {
+ // Load the filters, add filter_offset.
+ const uint8x8_t filter_u8 = vld1_u8(filter_ptr);
+ const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
+ const int16x8_t filter = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
+ // Handle one output pixel at a time.
+ for (int outp = 0; outp < num_output_pixels; outp++) {
+ // Load the inputs, add input_offset.
+ const uint8x8_t input_u8 = vld1_u8(input_ptr);
+ const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
+ const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
+ // Load the accumulators from acc_buffer
+ int32x4_t acc[2];
+ for (int i = 0; i < 2; i++) {
+ acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+ }
+ // Multiply-accumulate
+ acc[0] = vmlal_s16(acc[0], vget_low_s16(input), vget_low_s16(filter));
+ acc[1] = vmlal_s16(acc[1], vget_high_s16(input), vget_high_s16(filter));
+ // Store the accumulators back to acc_buffer
+ for (int i = 0; i < 2; i++) {
+ vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+ }
+ acc_buffer_ptr += 8;
+ input_ptr += input_ptr_increment;
+ }
+ }
+};
+
+template <>
+struct QuantizedDepthwiseConvKernel<true, 1, 16> {
+ static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+ const uint8* input_ptr, int16 input_offset,
+ int input_ptr_increment, const uint8* filter_ptr,
+ int16 filter_offset, int32* acc_buffer_ptr) {
+ // Load the filters, add filter_offset.
+ uint8x8_t filter_u8[2];
+ for (int i = 0; i < 2; i++) {
+ filter_u8[i] = vld1_u8(filter_ptr + 8 * i);
+ }
+ int16x8_t filter[2];
+ for (int i = 0; i < 2; i++) {
+ filter[i] = vreinterpretq_s16_u16(vmovl_u8(filter_u8[i]));
+ }
+ for (int i = 0; i < 2; i++) {
+ filter[i] = vaddq_s16(filter[i], vdupq_n_s16(filter_offset));
+ }
+ // Handle one output pixel at a time.
+ for (int outp = 0; outp < num_output_pixels; outp++) {
+ uint8 input_u8 = *input_ptr;
+ input_ptr += input_ptr_increment;
+ int16 input = static_cast<int16>(input_u8 + input_offset);
+ // Load the accumulators from acc_buffer
+ int32x4_t acc[4];
+ for (int i = 0; i < 4; i++) {
+ acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+ }
+ // Multiply-accumulate
+ for (int i = 0; i < 2; i++) {
+ acc[2 * i + 0] =
+ vmlal_n_s16(acc[2 * i + 0], vget_low_s16(filter[i]), input);
+ acc[2 * i + 1] =
+ vmlal_n_s16(acc[2 * i + 1], vget_high_s16(filter[i]), input);
+ }
+ // Store the accumulators back to acc_buffer
+ for (int i = 0; i < 4; i++) {
+ vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+ }
+ acc_buffer_ptr += 16;
+ }
+ }
+};
+
+template <>
+struct QuantizedDepthwiseConvKernel<true, 1, 32> {
+ static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+ const uint8* input_ptr, int16 input_offset,
+ int input_ptr_increment, const uint8* filter_ptr,
+ int16 filter_offset, int32* acc_buffer_ptr) {
+ // Load the filters, add filter_offset.
+ uint8x8_t filter_u8_0 = vld1_u8(filter_ptr + 8 * 0);
+ uint8x8_t filter_u8_1 = vld1_u8(filter_ptr + 8 * 1);
+ uint8x8_t filter_u8_2 = vld1_u8(filter_ptr + 8 * 2);
+ uint8x8_t filter_u8_3 = vld1_u8(filter_ptr + 8 * 3);
+ int16x8_t filter_0 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_0));
+ int16x8_t filter_1 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_1));
+ int16x8_t filter_2 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_2));
+ int16x8_t filter_3 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_3));
+ filter_0 = vaddq_s16(filter_0, vdupq_n_s16(filter_offset));
+ filter_1 = vaddq_s16(filter_1, vdupq_n_s16(filter_offset));
+ filter_2 = vaddq_s16(filter_2, vdupq_n_s16(filter_offset));
+ filter_3 = vaddq_s16(filter_3, vdupq_n_s16(filter_offset));
+ // Handle one output pixel at a time.
+ for (int outp = 0; outp < num_output_pixels; outp++) {
+ uint8 input_u8 = *input_ptr;
+ input_ptr += input_ptr_increment;
+ int16 input = static_cast<int16>(input_u8 + input_offset);
+ // Load the accumulators from acc_buffer
+ int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0);
+ int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1);
+ int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2);
+ int32x4_t acc_3 = vld1q_s32(acc_buffer_ptr + 4 * 3);
+ int32x4_t acc_4 = vld1q_s32(acc_buffer_ptr + 4 * 4);
+ int32x4_t acc_5 = vld1q_s32(acc_buffer_ptr + 4 * 5);
+ int32x4_t acc_6 = vld1q_s32(acc_buffer_ptr + 4 * 6);
+ int32x4_t acc_7 = vld1q_s32(acc_buffer_ptr + 4 * 7);
+ // Multiply-accumulate
+ acc_0 = vmlal_n_s16(acc_0, vget_low_s16(filter_0), input);
+ acc_1 = vmlal_n_s16(acc_1, vget_high_s16(filter_0), input);
+ acc_2 = vmlal_n_s16(acc_2, vget_low_s16(filter_1), input);
+ acc_3 = vmlal_n_s16(acc_3, vget_high_s16(filter_1), input);
+ acc_4 = vmlal_n_s16(acc_4, vget_low_s16(filter_2), input);
+ acc_5 = vmlal_n_s16(acc_5, vget_high_s16(filter_2), input);
+ acc_6 = vmlal_n_s16(acc_6, vget_low_s16(filter_3), input);
+ acc_7 = vmlal_n_s16(acc_7, vget_high_s16(filter_3), input);
+ // Store the accumulators back to acc_buffer
+ vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0);
+ vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1);
+ vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2);
+ vst1q_s32(acc_buffer_ptr + 4 * 3, acc_3);
+ vst1q_s32(acc_buffer_ptr + 4 * 4, acc_4);
+ vst1q_s32(acc_buffer_ptr + 4 * 5, acc_5);
+ vst1q_s32(acc_buffer_ptr + 4 * 6, acc_6);
+ vst1q_s32(acc_buffer_ptr + 4 * 7, acc_7);
+ acc_buffer_ptr += 32;
+ }
+ }
+};
+
+template <>
+struct QuantizedDepthwiseConvKernel<true, 1, 8> {
+ static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+ const uint8* input_ptr, int16 input_offset,
+ int input_ptr_increment, const uint8* filter_ptr,
+ int16 filter_offset, int32* acc_buffer_ptr) {
+ // Load the filters, add filter_offset.
+ const uint8x8_t filter_u8 = vld1_u8(filter_ptr);
+ const int16x8_t filter = vaddq_s16(
+ vreinterpretq_s16_u16(vmovl_u8(filter_u8)), vdupq_n_s16(filter_offset));
+ // Handle one output pixel at a time.
+ for (int outp = 0; outp < num_output_pixels; outp++) {
+ uint8 input_u8 = *input_ptr;
+ input_ptr += input_ptr_increment;
+ int16 input = static_cast<int16>(input_u8 + input_offset);
+ // Load the accumulators from acc_buffer
+ int32x4_t acc[2];
+ for (int i = 0; i < 2; i++) {
+ acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+ }
+ // Multiply-accumulate
+ acc[0] = vmlal_n_s16(acc[0], vget_low_s16(filter), input);
+ acc[1] = vmlal_n_s16(acc[1], vget_high_s16(filter), input);
+ // Store the accumulators back to acc_buffer
+ for (int i = 0; i < 2; i++) {
+ vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+ }
+ acc_buffer_ptr += 8;
+ }
+ }
+};
+
+template <>
+struct QuantizedDepthwiseConvKernel<true, 2, 1> {
+ static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+ const uint8* input_ptr, int16 input_offset,
+ int input_ptr_increment, const uint8* filter_ptr,
+ int16 filter_offset, int32* acc_buffer_ptr) {
+ // Load the filters, add filter_offset.
+ uint8x8_t filter_u8 = vdup_n_u8(0);
+ filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
+ filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
+ filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 2);
+ filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 3);
+ const int16x4_t filter_s16 =
+ vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
+ const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));
+
+ int outp = 0;
+
+ // Handle 2 output pixels at a time.
+ for (; outp <= num_output_pixels - 2; outp += 2) {
+ // Load the accumulators from acc_buffer.
+ int32x4_t acc = vld1q_s32(acc_buffer_ptr);
+ // Load the inputs, add input_offset.
+ uint16x4_t input_u16 = vdup_n_u16(0);
+ input_u16 = vset_lane_u16((reinterpret_cast<const uint16*>(input_ptr))[0],
+ input_u16, 0);
+ input_ptr += input_ptr_increment;
+ input_u16 = vset_lane_u16((reinterpret_cast<const uint16*>(input_ptr))[0],
+ input_u16, 1);
+ input_ptr += input_ptr_increment;
+ const int16x4_t input_s16 = vreinterpret_s16_u16(
+ vget_low_u16(vmovl_u8(vreinterpret_u8_u16(input_u16))));
+ const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
+
+ // Multiply-accumulate.
+ acc = vmlal_s16(acc, filter, input);
+ // Store the accumulators back to acc_buffer.
+ vst1q_s32(acc_buffer_ptr, acc);
+ acc_buffer_ptr += 4;
+ }
+
+ // Handle 1 output pixel at a time.
+ for (; outp < num_output_pixels; outp++) {
+ // Load the accumulators from acc_buffer.
+ int32x2_t acc = vld1_s32(acc_buffer_ptr);
+ // Load the inputs, add input_offset.
+ uint8x8_t input_u8 = vdup_n_u8(0);
+ input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
+ input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
+ input_ptr += input_ptr_increment;
+ const int16x4_t input_s16 =
+ vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
+ const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
+
+ // Multiply-accumulate.
+ acc = vget_low_s32(vmlal_s16(vcombine_s32(acc, acc), filter, input));
+ // Store the accumulators back to acc_buffer.
+ vst1_s32(acc_buffer_ptr, acc);
+ acc_buffer_ptr += 2;
+ }
+ }
+};
+
+template <>
+struct QuantizedDepthwiseConvKernel<true, 4, 1> {
+ static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+ const uint8* input_ptr, int16 input_offset,
+ int input_ptr_increment, const uint8* filter_ptr,
+ int16 filter_offset, int32* acc_buffer_ptr) {
+ if (num_output_pixels <= 0) {
+ return;
+ }
+
+ // Load the filters, add filter_offset.
+ uint8x8_t filter_u8 = vdup_n_u8(0);
+ filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
+ filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
+ filter_u8 = vset_lane_u8(filter_ptr[2], filter_u8, 2);
+ filter_u8 = vset_lane_u8(filter_ptr[3], filter_u8, 3);
+ const int16x4_t filter_s16 =
+ vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
+ const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));
+
+ int outp = 0;
+
+    // Handle one output pixel at a time up to the second-to-last pixel. We
+    // stop there because each iteration loads eight input values while only
+    // using four, so the last pixel must not read past the end of the row.
+ for (; outp < num_output_pixels - 1; outp++) {
+ // Load the accumulators from acc_buffer
+ int32x4_t acc;
+ acc = vld1q_s32(acc_buffer_ptr);
+
+ // Load the inputs, add input_offset.
+ uint8x8_t input_u8 = vld1_u8(input_ptr);
+ input_ptr += input_ptr_increment;
+ const int16x4_t input_s16 =
+ vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
+ const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
+ // Multiply-accumulate
+ acc = vmlal_s16(acc, filter, input);
+ // Store the accumulators back to acc_buffer
+ vst1q_s32(acc_buffer_ptr, acc);
+ acc_buffer_ptr += 4;
+ }
+
+ // Handle the last output pixel.
+ // Load the accumulators from acc_buffer
+ int32x4_t acc;
+ acc = vld1q_s32(acc_buffer_ptr);
+
+ // Load the inputs, add input_offset.
+ uint8x8_t input_u8 = vdup_n_u8(0);
+ input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
+ input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
+ input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
+ input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
+ const int16x4_t input_s16 =
+ vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
+ const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
+ // Multiply-accumulate
+ acc = vmlal_s16(acc, filter, input);
+ // Store the accumulators back to acc_buffer
+ vst1q_s32(acc_buffer_ptr, acc);
+ }
+};
+
+template <>
+struct QuantizedDepthwiseConvKernel<false, 12, 1> {
+ static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+ const uint8* input_ptr, int16 input_offset,
+ int input_ptr_increment, const uint8* filter_ptr,
+ int16 filter_offset, int32* acc_buffer_ptr) {
+ // Load the filters, add filter_offset.
+ uint8x8_t filter_u8_0 = vld1_u8(filter_ptr);
+ uint8x8_t filter_u8_1 = vld1_u8(filter_ptr + 4);
+ int16x8_t filter_s16_0 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_0));
+ int16x8_t filter_s16_1 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_1));
+ filter_s16_0 = vaddq_s16(filter_s16_0, vdupq_n_s16(filter_offset));
+ filter_s16_1 = vaddq_s16(filter_s16_1, vdupq_n_s16(filter_offset));
+ int16x4_t filter_0 = vget_low_s16(filter_s16_0);
+ int16x4_t filter_1 = vget_high_s16(filter_s16_0);
+ int16x4_t filter_2 = vget_high_s16(filter_s16_1);
+
+ // Handle one output pixel at a time.
+ for (int outp = 0; outp < num_output_pixels; outp++) {
+ // Load the inputs, add input_offset.
+ uint8x8_t input_u8_0 = vld1_u8(input_ptr);
+ uint8x8_t input_u8_1 = vld1_u8(input_ptr + 4);
+ input_ptr += input_ptr_increment;
+ int16x8_t input_0 = vreinterpretq_s16_u16(vmovl_u8(input_u8_0));
+ int16x8_t input_1 = vreinterpretq_s16_u16(vmovl_u8(input_u8_1));
+ input_0 = vaddq_s16(input_0, vdupq_n_s16(input_offset));
+ input_1 = vaddq_s16(input_1, vdupq_n_s16(input_offset));
+
+ // Load the accumulators from acc_buffer
+ int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0);
+ int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1);
+ int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2);
+
+ // Multiply-accumulate
+ acc_0 = vmlal_s16(acc_0, vget_low_s16(input_0), filter_0);
+ acc_1 = vmlal_s16(acc_1, vget_high_s16(input_0), filter_1);
+ acc_2 = vmlal_s16(acc_2, vget_high_s16(input_1), filter_2);
+
+ // Store the accumulators back to acc_buffer
+ vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0);
+ vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1);
+ vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2);
+
+ acc_buffer_ptr += 12;
+ }
+ }
+};
+#endif
+
+// Accumulates the effect of one row of the filter, on a segment of one row
+// of the output, accessing the corresponding one row of the input.
+template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier>
+void QuantizedDepthwiseConvAccumRow(
+ int stride, int input_depth, int input_width, const uint8* input_data,
+ int16 input_offset, int pad_width, int depth_multiplier, int filter_width,
+ const uint8* filter_data, int16 filter_offset, int out_x_buffer_start,
+ int out_x_buffer_end, int output_depth, int32* acc_buffer) {
+#ifdef GEMMLOWP_PROFILING
+ gemmlowp::ScopedProfilingLabel label(__PRETTY_FUNCTION__);
+#endif
+ // Sanity check parameters. This is important in particular to ensure
+ // that we keep the number of template instantiations minimal, so we don't
+ // increase binary size unnecessarily.
+ static_assert(kFixedDepthMultiplier || !kFixedInputDepth, "");
+ static_assert(kFixedInputDepth || kAllowStrided, "");
+ TFLITE_DCHECK(stride == 1 || kAllowStrided);
+ if (kFixedInputDepth) {
+ TFLITE_DCHECK_EQ(input_depth, kFixedInputDepth);
+ }
+ if (kFixedDepthMultiplier) {
+ TFLITE_DCHECK_EQ(depth_multiplier, kFixedDepthMultiplier);
+ }
+ TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier);
+ const int input_ptr_increment = stride * input_depth;
+ const uint8* filter_base_ptr = filter_data;
+ for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
+ // For the current (filter_x, filter_y) point in the filter,
+ // compute the boundaries of the corresponding output row segment.
+    int out_x_loop_start_unclamped = 0;
+    int out_x_loop_end_unclamped = 0;
+    if (kAllowStrided) {
+      if (stride == 2) {
+        out_x_loop_start_unclamped = (pad_width - filter_x + 1) / 2;
+        out_x_loop_end_unclamped =
+            (pad_width + input_width - filter_x + 1) / 2;
+      } else if (stride == 4) {
+        out_x_loop_start_unclamped = (pad_width - filter_x + 3) / 4;
+        out_x_loop_end_unclamped =
+            (pad_width + input_width - filter_x + 3) / 4;
+      } else {
+        out_x_loop_start_unclamped =
+            (pad_width - filter_x + stride - 1) / stride;
+        out_x_loop_end_unclamped =
+            (pad_width + input_width - filter_x + stride - 1) / stride;
+      }
+    } else {
+      out_x_loop_start_unclamped = pad_width - filter_x;
+      out_x_loop_end_unclamped = pad_width + input_width - filter_x;
+ }
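+    // For example, with stride == 1, input_width == 5, pad_width == 1 and
+    // filter_x == 0, the input sample in_x = out_x - 1 is in bounds only for
+    // out_x in [1, 6), which is exactly the unclamped interval computed
+    // above.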
+ // The kernel will have to iterate on the segment of the
+ // output row that starts at out_x_loop_start and out_x_loop_end.
+    const int out_x_loop_start =
+        std::max(out_x_buffer_start, out_x_loop_start_unclamped);
+    const int out_x_loop_end =
+        std::min(out_x_buffer_end, out_x_loop_end_unclamped);
+
+ int32* acc_buffer_ptr =
+ acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth;
+ const int in_x_origin = (out_x_loop_start * stride) - pad_width + filter_x;
+ const uint8* input_ptr = input_data + in_x_origin * input_depth;
+ const int num_output_pixels = out_x_loop_end - out_x_loop_start;
+ QuantizedDepthwiseConvKernel<
+ kAllowStrided, kFixedInputDepth,
+ kFixedDepthMultiplier>::Run(num_output_pixels, input_depth,
+ depth_multiplier, input_ptr, input_offset,
+ input_ptr_increment, filter_base_ptr,
+ filter_offset, acc_buffer_ptr);
+ filter_base_ptr += output_depth;
+ }
+}
+
+// Generic fallback of QuantizedDepthwiseConvAccumRow; portable,
+// non-templatized.
+inline void QuantizedDepthwiseConvAccumRowGeneric(
+ int stride, int input_depth, int input_width, const uint8* input_data,
+ int16 input_offset, int pad_width, int depth_multiplier, int filter_width,
+ const uint8* filter_data, int16 filter_offset, int out_x_buffer_start,
+ int out_x_buffer_end, int output_depth, int32* acc_buffer) {
+ gemmlowp::ScopedProfilingLabel label("DepthwiseConvAccumRowGeneric (slow)");
+#ifdef TFLITE_PREVENT_SLOW_GENERIC_DEPTHWISECONV_FALLBACK
+#ifndef ALLOW_SLOW_GENERIC_DEPTHWISECONV_FALLBACK
+ LOG(FATAL)
+ << "\n\n"
+ << "*****************************************************************\n"
+ << "* This tfmini inference code was about to use the slow generic\n"
+ << "* fallback implementation for a DepthwiseConv op, and we want you\n"
+ << "* to be aware of that so that you will know why you get terrible\n"
+ << "* performance.\n"
+ << "*\n"
+ << "* If you would like to carry on with the slow code, compile\n"
+ << "* with this preprocessor token defined:\n"
+ << "* TFLITE_ALLOW_SLOW_GENERIC_DEPTHWISECONV_FALLBACK.\n"
+ << "*\n"
+ << "* The right thing to do, if you care about performance, is to add\n"
+ << "* a new DepthwiseConv kernel to tfmini to cover your case.\n"
+ << "* The relevant parameters defining your case are:\n"
+ << "* stride = " << stride << "\n"
+ << "* input_depth = " << input_depth << "\n"
+ << "* depth_multiplier = " << depth_multiplier << "\n"
+ << "*\n"
+ << "* Please do not hesitate to contact benoitjacob@ with this\n"
+ << "* information.\n"
+ << "*****************************************************************\n";
+#endif // ALLOW_SLOW_GENERIC_DEPTHWISECONV_FALLBACK
+#endif // TFLITE_PREVENT_SLOW_GENERIC_DEPTHWISECONV_FALLBACK
+ const uint8* filter_base_ptr = filter_data;
+ for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
+ const int out_x_loop_start = std::max(
+ out_x_buffer_start, (pad_width - filter_x + stride - 1) / stride);
+ const int out_x_loop_end =
+ std::min(out_x_buffer_end,
+ (pad_width + input_width - filter_x + stride - 1) / stride);
+
+ int32* acc_buffer_ptr =
+ acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth;
+ const int in_x_origin = (out_x_loop_start * stride) - pad_width + filter_x;
+ const uint8* input_ptr = input_data + in_x_origin * input_depth;
+ const int input_ptr_increment = (stride - 1) * input_depth;
+ for (int out_x = out_x_loop_start; out_x < out_x_loop_end; out_x++) {
+ const uint8* filter_ptr = filter_base_ptr;
+ for (int ic = 0; ic < input_depth; ++ic) {
+ const int16 input_val = *input_ptr++ + input_offset;
+ for (int m = 0; m < depth_multiplier; m++) {
+ const int16 filter_val = *filter_ptr++ + filter_offset;
+ *acc_buffer_ptr++ += static_cast<int32>(filter_val) * input_val;
+ }
+ }
+ input_ptr += input_ptr_increment;
+ }
+ filter_base_ptr += output_depth;
+ }
+}
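+// Note on the arithmetic above: input_offset and filter_offset are typically
+// the negated quantization zero points, so each accumulated term
+// (input_val + input_offset) * (filter_val + filter_offset) is a product of
+// zero-point-adjusted quantized values; rescaling to the output quantization
+// happens later via output_multiplier and output_shift.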
+
+// Initializes the accumulator buffer with bias values.
+inline void DepthwiseConvInitAccBuffer(int num_output_pixels, int output_depth,
+ const int32* bias_data,
+ int32* acc_buffer) {
+ int i = 0;
+#ifdef USE_NEON
+ if (output_depth == 1) {
+ const int32x4_t b = vdupq_n_s32(bias_data[0]);
+ for (; i <= num_output_pixels - 16; i += 16) {
+ vst1q_s32(acc_buffer + i + 0, b);
+ vst1q_s32(acc_buffer + i + 4, b);
+ vst1q_s32(acc_buffer + i + 8, b);
+ vst1q_s32(acc_buffer + i + 12, b);
+ }
+ for (; i <= num_output_pixels - 4; i += 4) {
+ vst1q_s32(acc_buffer + i, b);
+ }
+ } else if (output_depth == 2) {
+ int32x4_t b = vdupq_n_s32(bias_data[0]);
+ b = vsetq_lane_s32(bias_data[1], b, 1);
+ b = vsetq_lane_s32(bias_data[1], b, 3);
+ for (; i <= num_output_pixels - 8; i += 8) {
+ vst1q_s32(acc_buffer + 2 * i + 0, b);
+ vst1q_s32(acc_buffer + 2 * i + 4, b);
+ vst1q_s32(acc_buffer + 2 * i + 8, b);
+ vst1q_s32(acc_buffer + 2 * i + 12, b);
+ }
+ for (; i <= num_output_pixels - 2; i += 2) {
+ vst1q_s32(acc_buffer + 2 * i, b);
+ }
+ } else if (output_depth == 4) {
+ const int32x4_t b = vld1q_s32(bias_data);
+ for (; i <= num_output_pixels - 4; i += 4) {
+ vst1q_s32(acc_buffer + 4 * i + 0, b);
+ vst1q_s32(acc_buffer + 4 * i + 4, b);
+ vst1q_s32(acc_buffer + 4 * i + 8, b);
+ vst1q_s32(acc_buffer + 4 * i + 12, b);
+ }
+ for (; i < num_output_pixels; i++) {
+ vst1q_s32(acc_buffer + 4 * i, b);
+ }
+ } else if (output_depth == 8) {
+ const int32x4_t b0 = vld1q_s32(bias_data);
+ const int32x4_t b1 = vld1q_s32(bias_data + 4);
+ for (; i <= num_output_pixels - 2; i += 2) {
+ vst1q_s32(acc_buffer + 8 * i + 0, b0);
+ vst1q_s32(acc_buffer + 8 * i + 4, b1);
+ vst1q_s32(acc_buffer + 8 * i + 8, b0);
+ vst1q_s32(acc_buffer + 8 * i + 12, b1);
+ }
+ for (; i < num_output_pixels; i++) {
+ vst1q_s32(acc_buffer + 8 * i + 0, b0);
+ vst1q_s32(acc_buffer + 8 * i + 4, b1);
+ }
+ } else if (output_depth == 16) {
+ const int32x4_t b0 = vld1q_s32(bias_data);
+ const int32x4_t b1 = vld1q_s32(bias_data + 4);
+ const int32x4_t b2 = vld1q_s32(bias_data + 8);
+ const int32x4_t b3 = vld1q_s32(bias_data + 12);
+ for (; i < num_output_pixels; i++) {
+ vst1q_s32(acc_buffer + 16 * i + 0, b0);
+ vst1q_s32(acc_buffer + 16 * i + 4, b1);
+ vst1q_s32(acc_buffer + 16 * i + 8, b2);
+ vst1q_s32(acc_buffer + 16 * i + 12, b3);
+ }
+ }
+#endif
+ for (; i < num_output_pixels; i++) {
+ memcpy(acc_buffer + i * output_depth, bias_data,
+ sizeof(acc_buffer[0]) * output_depth);
+ }
+}
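+// For instance, with output_depth == 8 and bias_data == {b0, ..., b7}, the
+// buffer ends up holding the pattern b0..b7 repeated num_output_pixels times;
+// the NEON fast paths above are just vectorized forms of the memcpy in the
+// final loop.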
+
+inline void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims,
+ int32 input_offset, const uint8* filter_data,
+ const Dims<4>& filter_dims, int32 filter_offset,
+ const int32* bias_data, const Dims<4>& bias_dims,
+ int stride_width, int stride_height, int pad_width,
+ int pad_height, int depth_multiplier,
+ int32 output_offset, int32 output_multiplier,
+ int output_shift, int32 output_activation_min,
+ int32 output_activation_max, uint8* output_data,
+ const Dims<4>& output_dims) {
+ gemmlowp::ScopedProfilingLabel label("DepthwiseConv/8bit");
+ TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+
+ const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+ const int output_depth = MatchingArraySize(filter_dims, 0, output_dims, 0);
+ const int input_height = ArraySize(input_dims, 2);
+ const int input_width = ArraySize(input_dims, 1);
+ const int input_depth = ArraySize(input_dims, 0);
+ const int filter_height = ArraySize(filter_dims, 2);
+ const int filter_width = ArraySize(filter_dims, 1);
+ const int output_height = ArraySize(output_dims, 2);
+ const int output_width = ArraySize(output_dims, 1);
+ TFLITE_DCHECK(output_depth == input_depth * depth_multiplier);
+
+ static const int kAccBufferMaxSize = 2048;
+ int32 acc_buffer[kAccBufferMaxSize];
+ TFLITE_DCHECK_GE(kAccBufferMaxSize, output_depth);
+ const int kOutputPixelsInAccBuffer = kAccBufferMaxSize / output_depth;
+ const int kAccBufferActualSize = kOutputPixelsInAccBuffer * output_depth;
+ TFLITE_DCHECK_LE(kOutputPixelsInAccBuffer * output_depth,
+ kAccBufferActualSize);
+ TFLITE_DCHECK_LE(kAccBufferActualSize, kAccBufferMaxSize);
+ TFLITE_DCHECK_GE(kOutputPixelsInAccBuffer, 1);
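+  // For example, with output_depth == 64 this gives
+  // kOutputPixelsInAccBuffer == 2048 / 64 == 32, so each pass of the
+  // out_x_buffer loop below accumulates 32 output pixels before requantizing.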
+
+ // row_accum_func will point to the core accumulation function to be used
+ // for this DepthwiseConv op.
+ using row_accum_func_t = decltype(&QuantizedDepthwiseConvAccumRowGeneric);
+ row_accum_func_t row_accum_func = nullptr;
+
+#define TFMINI_USE_DEPTHWISECONV_KERNEL(ALLOW_STRIDED, FIXED_INPUT_DEPTH, \
+ FIXED_DEPTH_MULTIPLIER) \
+ if (!row_accum_func && (stride_width == 1 || ALLOW_STRIDED) && \
+ (input_depth == FIXED_INPUT_DEPTH || FIXED_INPUT_DEPTH == 0) && \
+ depth_multiplier == FIXED_DEPTH_MULTIPLIER) { \
+ row_accum_func = \
+ QuantizedDepthwiseConvAccumRow<ALLOW_STRIDED, FIXED_INPUT_DEPTH, \
+ FIXED_DEPTH_MULTIPLIER>; \
+ }
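+
+// For illustration, a single use such as
+// TFMINI_USE_DEPTHWISECONV_KERNEL(false, 8, 1) expands to roughly:
+//
+//   if (!row_accum_func && (stride_width == 1 || false) &&
+//       (input_depth == 8 || 8 == 0) && depth_multiplier == 1) {
+//     row_accum_func = QuantizedDepthwiseConvAccumRow<false, 8, 1>;
+//   }
+//
+// so the first matching entry in the preference-ordered list below wins.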
+
+#ifdef USE_NEON
+ // We go over our list of kernels by decreasing order of preference
+ // for the cases where multiple kernels could apply.
+
+ // Start with the fastest kernels: AllowStrided=false, fixed input depth.
+
+ TFMINI_USE_DEPTHWISECONV_KERNEL(false, 1, 2)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 2)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(false, 4, 2)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(false, 1, 4)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(false, 4, 1)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(false, 4, 4)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(false, 8, 1)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 8)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 1)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(false, 12, 1)
+
+ // Next come the strided kernels: AllowStrided=true, fixed input depth.
+ // They are a bit less efficient, but allow stride!=1.
+
+ TFMINI_USE_DEPTHWISECONV_KERNEL(true, 8, 2)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(true, 16, 1)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 16)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 32)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 8)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(true, 8, 1)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(true, 2, 1)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(true, 4, 1)
+
+  // Finally, the kernels allowing a variable input depth;
+  // these are the least efficient but most general kernels.
+
+ TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 1)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 2)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 3)
+#endif // USE_NEON
+
+ // No matching fast kernel found, use slow fallback.
+ if (!row_accum_func) {
+ row_accum_func = QuantizedDepthwiseConvAccumRowGeneric;
+ }
+
+#undef TFMINI_USE_DEPTHWISECONV_KERNEL
+
+ // Now that we have determined row_accum_func, we can start work.
+ uint8* output_ptr = output_data;
+ for (int b = 0; b < batches; ++b) {
+ for (int out_y = 0; out_y < output_height; ++out_y) {
+ const int in_y_origin = (out_y * stride_height) - pad_height;
+ const int filter_y_start = std::max(0, -in_y_origin);
+ const int filter_y_end =
+ std::min(filter_height, input_height - in_y_origin);
+ for (int out_x_buffer_start = 0; out_x_buffer_start < output_width;
+ out_x_buffer_start += kOutputPixelsInAccBuffer) {
+ const int out_x_buffer_end = std::min(
+ output_width, out_x_buffer_start + kOutputPixelsInAccBuffer);
+        // We call a 'pixel' a group of activations that share all but the
+ // 'depth'/'channel' coordinate. num_output_pixels is the number of
+ // output pixels that we will accumulate in this loop iteration.
+ const int num_output_pixels = out_x_buffer_end - out_x_buffer_start;
+ // Initialize our local accumulator with the bias values, so we don't
+ // have to add them later.
+ DepthwiseConvInitAccBuffer(num_output_pixels, output_depth, bias_data,
+ acc_buffer);
+ // Accumulation loop. Most of the time should be spent in here.
+ for (int filter_y = filter_y_start; filter_y < filter_y_end;
+ ++filter_y) {
+ const int in_y = in_y_origin + filter_y;
+ row_accum_func(
+ stride_width, input_depth, input_width,
+ input_data + in_y * input_dims.strides[2] +
+ b * input_dims.strides[3],
+ input_offset, pad_width, depth_multiplier, filter_width,
+ filter_data + filter_y * filter_dims.strides[2], filter_offset,
+ out_x_buffer_start, out_x_buffer_end, output_depth, acc_buffer);
+ }
+ // Finished accumulating int32 values. Now need to convert them to
+ // the final 8bit form and store them.
+ gemmlowp::ScopedProfilingLabel label("downquantize+store");
+ const int num_output_values = output_depth * num_output_pixels;
+ int i = 0;
+#ifdef USE_NEON
+ using gemmlowp::RoundingDivideByPOT;
+ const int32x4_t output_offset_vec = vdupq_n_s32(output_offset);
+ const int32x4_t output_activation_min_vec =
+ vdupq_n_s32(output_activation_min);
+ const int32x4_t output_activation_max_vec =
+ vdupq_n_s32(output_activation_max);
+ // Handle 16 values at once.
+ // This allows us to issue 4 mutually independent int32
+ // multiplications (vqrdmulh), which should alleviate most of their
+ // high latency.
+ for (; i <= num_output_values - 16; i += 16) {
+ int32x4_t acc[4];
+ for (int j = 0; j < 4; j++) {
+ acc[j] = vld1q_s32(acc_buffer + i + 4 * j);
+ }
+
+ // Fixed-point multiplication.
+ for (int j = 0; j < 4; j++) {
+ acc[j] = vqrdmulhq_n_s32(acc[j], output_multiplier);
+ }
+ for (int j = 0; j < 4; j++) {
+ acc[j] = RoundingDivideByPOT(acc[j], output_shift);
+ }
+ // Add the output offset.
+ for (int j = 0; j < 4; j++) {
+ acc[j] = vaddq_s32(acc[j], output_offset_vec);
+ }
+ // Apply the activation function.
+ for (int j = 0; j < 4; j++) {
+ acc[j] = vmaxq_s32(acc[j], output_activation_min_vec);
+ }
+ for (int j = 0; j < 4; j++) {
+ acc[j] = vminq_s32(acc[j], output_activation_max_vec);
+ }
+ // Saturating cast to uint8 and store to destination.
+ int16x4_t acc_s16[4];
+ for (int j = 0; j < 4; j++) {
+ acc_s16[j] = vqmovn_s32(acc[j]);
+ }
+ const int16x8_t res_s16_0 = vcombine_s16(acc_s16[0], acc_s16[1]);
+ const int16x8_t res_s16_1 = vcombine_s16(acc_s16[2], acc_s16[3]);
+ const uint8x8_t res_u8_0 = vqmovun_s16(res_s16_0);
+ const uint8x8_t res_u8_1 = vqmovun_s16(res_s16_1);
+ vst1q_u8(output_ptr, vcombine_u8(res_u8_0, res_u8_1));
+ output_ptr += 16;
+ }
+ // Handle 8 values at once.
+ // Not as good as 16 (now we're only issuing 2 mutually independent
+ // vqrdmulh instructions, so we're probably paying for their high
+ // latency).
+ for (; i <= num_output_values - 8; i += 8) {
+ int32x4_t acc0 = vld1q_s32(acc_buffer + i);
+ int32x4_t acc1 = vld1q_s32(acc_buffer + i + 4);
+ // Fixed-point multiplication.
+ acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
+ acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
+ // Rounding right shift.
+ acc0 = RoundingDivideByPOT(acc0, output_shift);
+ acc1 = RoundingDivideByPOT(acc1, output_shift);
+ // Add the output offset.
+ acc0 = vaddq_s32(acc0, output_offset_vec);
+ acc1 = vaddq_s32(acc1, output_offset_vec);
+ // Apply the activation function.
+ acc0 = vmaxq_s32(acc0, output_activation_min_vec);
+ acc1 = vmaxq_s32(acc1, output_activation_min_vec);
+ acc0 = vminq_s32(acc0, output_activation_max_vec);
+ acc1 = vminq_s32(acc1, output_activation_max_vec);
+ // Saturating cast to uint8 and store to destination.
+ const int16x4_t acc0_s16 = vqmovn_s32(acc0);
+ const int16x4_t acc1_s16 = vqmovn_s32(acc1);
+ const int16x8_t res_s16 = vcombine_s16(acc0_s16, acc1_s16);
+ const uint8x8_t res_u8 = vqmovun_s16(res_s16);
+ vst1_u8(output_ptr, res_u8);
+ output_ptr += 8;
+ }
+ // Handle 4 values at once. Now we're paying the full price of the
+ // high latency of vqrdmulh. Also, storing only 4 bytes at the end
+ // (without any alignment) can only be done 1 byte at a time.
+ // Yet, that is still worth doing to minimize the amount of leftover
+ // that will have to go through the very slow scalar code.
+ for (; i <= num_output_values - 4; i += 4) {
+ int32x4_t acc = vld1q_s32(acc_buffer + i);
+ // Fixed-point multiplication.
+ acc = vqrdmulhq_n_s32(acc, output_multiplier);
+ // Rounding right shift.
+ acc = RoundingDivideByPOT(acc, output_shift);
+ // Add the output offset.
+ acc = vaddq_s32(acc, output_offset_vec);
+ // Apply the activation function.
+ acc = vmaxq_s32(acc, output_activation_min_vec);
+ acc = vminq_s32(acc, output_activation_max_vec);
+ // Saturating cast to uint8 and store to destination.
+ const int16x4_t acc_s16 = vqmovn_s32(acc);
+ const int16x8_t res_s16 = vcombine_s16(acc_s16, acc_s16);
+ const uint8x8_t res_u8 = vqmovun_s16(res_s16);
+ vst1_lane_u8(output_ptr + 0, res_u8, 0);
+ vst1_lane_u8(output_ptr + 1, res_u8, 1);
+ vst1_lane_u8(output_ptr + 2, res_u8, 2);
+ vst1_lane_u8(output_ptr + 3, res_u8, 3);
+ output_ptr += 4;
+ }
+#endif // USE_NEON
+
+ // Handle leftover values, one by one. This is very slow.
+ for (; i < num_output_values; i++) {
+ int32 acc = acc_buffer[i];
+ acc = MultiplyByQuantizedMultiplierSmallerThanOne(
+ acc, output_multiplier, output_shift);
+ acc += output_offset;
+ acc = std::max(acc, output_activation_min);
+ acc = std::min(acc, output_activation_max);
+ *output_ptr++ = static_cast<uint8>(acc);
+ }
+ }
+ }
+ }
+}
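+
+// Illustrative only: a scalar sketch of the requantization each int32
+// accumulator goes through above. It mirrors
+// MultiplyByQuantizedMultiplierSmallerThanOne (kernels/internal/common.h)
+// followed by the offset-add and clamping, assuming gemmlowp's
+// round-to-nearest semantics and omitting the INT32_MIN saturation corner
+// case. It is not called by anything in this file; the name is purely for
+// exposition.
+inline uint8 DepthwiseConvRequantizeSketch(int32 acc, int32 output_multiplier,
+                                           int output_shift,
+                                           int32 output_offset,
+                                           int32 output_activation_min,
+                                           int32 output_activation_max) {
+  // Saturating rounding doubling high multiply: (acc * multiplier) / 2^31,
+  // rounded to nearest.
+  const int64_t ab = static_cast<int64_t>(acc) * output_multiplier;
+  const int64_t nudge = ab >= 0 ? (1ll << 30) : (1 - (1ll << 30));
+  const int32 scaled = static_cast<int32>((ab + nudge) / (1ll << 31));
+  // Rounding divide by 2^output_shift.
+  const int32 mask = static_cast<int32>((1ll << output_shift) - 1);
+  const int32 remainder = scaled & mask;
+  const int32 threshold = (mask >> 1) + (scaled < 0 ? 1 : 0);
+  int32 result = (scaled >> output_shift) + (remainder > threshold ? 1 : 0);
+  // Add the output offset and clamp to the fused activation range.
+  result += output_offset;
+  result = std::max(result, output_activation_min);
+  result = std::min(result, output_activation_max);
+  return static_cast<uint8>(result);
+}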
+
+// Legacy, for compatibility with old checked-in code.
+template <FusedActivationFunctionType Ac>
+void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims,
+ int32 input_offset, const uint8* filter_data,
+ const Dims<4>& filter_dims, int32 filter_offset,
+ const int32* bias_data, const Dims<4>& bias_dims,
+ int stride_width, int stride_height, int pad_width,
+ int pad_height, int depth_multiplier, int32 output_offset,
+ int32 output_multiplier, int output_shift,
+ int32 output_activation_min, int32 output_activation_max,
+ uint8* output_data, const Dims<4>& output_dims) {
+ if (Ac == FusedActivationFunctionType::kNone) {
+ TFLITE_DCHECK_EQ(output_activation_min, 0);
+ TFLITE_DCHECK_EQ(output_activation_max, 255);
+ }
+ DepthwiseConv(input_data, input_dims, input_offset, filter_data, filter_dims,
+ filter_offset, bias_data, bias_dims, stride_width,
+ stride_height, pad_width, pad_height, depth_multiplier,
+ output_offset, output_multiplier, output_shift,
+ output_activation_min, output_activation_max, output_data,
+ output_dims);
+}
+
+// Legacy, for compatibility with old checked-in code.
+template <FusedActivationFunctionType Ac>
+void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims,
+ int32 input_offset, const uint8* filter_data,
+ const Dims<4>& filter_dims, int32 filter_offset,
+ const int32* bias_data, const Dims<4>& bias_dims, int stride,
+ int pad_width, int pad_height, int depth_multiplier,
+ int32 output_offset, int32 output_multiplier,
+ int output_shift, int32 output_activation_min,
+ int32 output_activation_max, uint8* output_data,
+ const Dims<4>& output_dims) {
+ DepthwiseConv<Ac>(input_data, input_dims, input_offset, filter_data,
+ filter_dims, filter_offset, bias_data, bias_dims, stride,
+ stride, pad_width, pad_height, depth_multiplier,
+ output_offset, output_multiplier, output_shift,
+ output_activation_min, output_activation_max, output_data,
+ output_dims);
+}
+
+} // namespace optimized_ops
+} // namespace tflite
+
+#endif // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_UINT8_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/eigen_spatial_convolutions.h b/tensorflow/contrib/lite/kernels/internal/optimized/eigen_spatial_convolutions.h
new file mode 100644
index 0000000000..8004c24a99
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/eigen_spatial_convolutions.h
@@ -0,0 +1,231 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Copied from tensorflow/core/kernels/eigen_spatial_convolutions.h.
+// TODO(petewarden) - move this to a common location in Eigen itself.
+
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_EIGEN_SPATIAL_CONVOLUTIONS_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_EIGEN_SPATIAL_CONVOLUTIONS_H_
+
+#define EIGEN_USE_CUSTOM_THREAD_POOL
+#define EIGEN_USE_THREADS
+
+// NOTE: Eigen's internal (Google) and external (OSS) versions differ
+// slightly, and we need to hack the unsupported/Eigen/CXX11/Tensor header
+// instantiation macros at specific places, so we need two copies of the
+// hacked file: one for internal and one for external.
+// If you have trouble, simply undef the reducer macro (e.g.
+// TFLITE_REDUCE_INSTANTIATIONS_GOOGLE), but be aware that this will make
+// the binary much bigger!
+#define TFLITE_REDUCE_INSTANTIATIONS_OPEN_SOURCE
+#define Eigen EigenForTFLite
+#if defined(TFLITE_REDUCE_INSTANTIATIONS_GOOGLE)
+#include "tensorflow/contrib/lite/kernels/internal/optimized/eigen_tensor_reduced_instantiations_google.h"
+#elif defined(TFLITE_REDUCE_INSTANTIATIONS_OPEN_SOURCE)
+#include "tensorflow/contrib/lite/kernels/internal/optimized/eigen_tensor_reduced_instantiations_oss.h"
+#else
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#endif
+
+
+namespace Eigen {
+
+/** SpatialConvolution
+ * \ingroup CXX11_NeuralNetworks_Module
+ *
+ * \brief Applies a 2D convolution over a multichannel input image.
+ *
+ * The input parameter is expected to be a tensor with a rank of 3 or more
+ * (channels, height, width, and optionally others)
+ * The kernel parameter is expected to be a 4D tensor (filters, channels,
+ * kernel_height, kernel_width)
+ * The input and the kernel must both be in col-major layout. The result will
+ * also be in col-major layout.
+ *
+ * If col_in_stride, row_in_stride > 1, then applies convolution with holes
+ * (aka atrous convolution), sampling every col_in_stride, row_in_stride input
+ * pixels.
+ *
+ * The result can be assigned to a tensor of rank equal to the rank of the
+ * input. The dimensions of the result will be filters, height, width (and
+ * others if applicable).
+ *
+ * It is possible to swap the order of the width and height dimensions provided
+ * that the same order is used in the input, the kernel, and the output.
+ *
+ */
+template <typename Input, typename Kernel>
+EIGEN_DEVICE_FUNC
+ EIGEN_ALWAYS_INLINE static const typename internal::conditional<
+ internal::traits<Input>::Layout == ColMajor,
+ TensorReshapingOp<
+ const DSizes<typename internal::traits<Input>::Index,
+ internal::traits<Input>::NumDimensions>,
+ const TensorContractionOp<
+ const array<IndexPair<typename internal::traits<Input>::Index>,
+ 1>,
+ const TensorReshapingOp<
+ const DSizes<typename internal::traits<Input>::Index, 2>,
+ const Kernel>,
+ const TensorReshapingOp<
+ const DSizes<typename internal::traits<Input>::Index, 2>,
+ const TensorImagePatchOp<Dynamic, Dynamic,
+ const Input> > > >,
+ TensorReshapingOp<
+ const DSizes<typename internal::traits<Input>::Index,
+ internal::traits<Input>::NumDimensions>,
+ const TensorContractionOp<
+ const array<IndexPair<typename internal::traits<Input>::Index>,
+ 1>,
+ const TensorReshapingOp<
+ const DSizes<typename internal::traits<Input>::Index, 2>,
+ const TensorImagePatchOp<Dynamic, Dynamic, const Input> >,
+ const TensorReshapingOp<
+ const DSizes<typename internal::traits<Input>::Index, 2>,
+ const Kernel> > > >::type
+ SpatialConvolution(const Input& input, const Kernel& kernel,
+ const DenseIndex row_stride = 1,
+ const DenseIndex col_stride = 1,
+ const PaddingType padding_type = PADDING_SAME,
+ const DenseIndex row_in_stride = 1,
+ const DenseIndex col_in_stride = 1) {
+ typedef typename internal::traits<Input>::Index TensorIndex;
+ TensorRef<Tensor<typename internal::traits<Input>::Scalar,
+ internal::traits<Input>::NumDimensions,
+ internal::traits<Input>::Layout, TensorIndex> >
+ in(input);
+ TensorRef<Tensor<typename internal::traits<Kernel>::Scalar,
+ internal::traits<Kernel>::NumDimensions,
+ internal::traits<Kernel>::Layout, TensorIndex> >
+ kern(kernel);
+
+ EIGEN_STATIC_ASSERT(
+ internal::traits<Input>::Layout == internal::traits<Kernel>::Layout,
+ YOU_MADE_A_PROGRAMMING_MISTAKE);
+ const bool isColMajor = (internal::traits<Input>::Layout == ColMajor);
+
+ const int NumDims = internal::traits<Input>::NumDimensions;
+
+ // Number of filters to apply. This is the same as the output depth of the
+ // result
+ const TensorIndex kernelFilters =
+ isColMajor ? kern.dimensions()[0] : kern.dimensions()[3];
+ // Number of channels. This is the same as the input depth.
+ const TensorIndex kernelChannels =
+ isColMajor ? kern.dimensions()[1] : kern.dimensions()[2];
+ const TensorIndex kernelRows =
+ isColMajor ? kern.dimensions()[2] : kern.dimensions()[1];
+ const TensorIndex kernelCols =
+ isColMajor ? kern.dimensions()[3] : kern.dimensions()[0];
+
+ const DenseIndex kernelRowsEff =
+ kernelRows + (kernelRows - 1) * (row_in_stride - 1);
+ const DenseIndex kernelColsEff =
+ kernelCols + (kernelCols - 1) * (col_in_stride - 1);
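+  // For example, a 3x3 kernel with row_in_stride == col_in_stride == 2 has an
+  // effective extent of 5x5, with a one-input-pixel hole between taps.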
+
+ array<IndexPair<TensorIndex>, 1> contract_dims;
+ contract_dims[0] = IndexPair<TensorIndex>(1, 0);
+
+ const TensorIndex InputRows =
+ isColMajor ? in.dimension(1) : in.dimension(NumDims - 2);
+ const TensorIndex InputCols =
+ isColMajor ? in.dimension(2) : in.dimension(NumDims - 3);
+
+ TensorIndex out_height;
+ TensorIndex out_width;
+ switch (padding_type) {
+ case PADDING_VALID:
+ out_height = numext::ceil((InputRows - kernelRowsEff + 1.f) /
+ static_cast<float>(row_stride));
+ out_width = numext::ceil((InputCols - kernelColsEff + 1.f) /
+ static_cast<float>(col_stride));
+ break;
+ case PADDING_SAME:
+ out_height = numext::ceil(InputRows / static_cast<float>(row_stride));
+ out_width = numext::ceil(InputCols / static_cast<float>(col_stride));
+ break;
+ default:
+ // Initialize unused variables to avoid a compiler warning
+ out_height = 0;
+ out_width = 0;
+ eigen_assert(false && "unexpected padding");
+ }
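+  // For example, with InputRows == InputCols == 32, an effective 5x5 kernel
+  // and row_stride == col_stride == 2, PADDING_VALID gives
+  // ceil(28 / 2.f) == 14 outputs per dimension, while PADDING_SAME gives
+  // ceil(32 / 2.f) == 16.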
+
+ // Molds the output of the patch extraction code into a 2d tensor:
+ // - the first dimension (dims[0]): the patch values to be multiplied with the
+ // kernels
+ // - the second dimension (dims[1]): everything else
+ DSizes<TensorIndex, 2> pre_contract_dims;
+ if (isColMajor) {
+ pre_contract_dims[0] = kernelChannels * kernelRows * kernelCols;
+ pre_contract_dims[1] = out_height * out_width;
+ for (int i = 3; i < NumDims; ++i) {
+ pre_contract_dims[1] *= in.dimension(i);
+ }
+ } else {
+ pre_contract_dims[1] = kernelChannels * kernelRows * kernelCols;
+ pre_contract_dims[0] = out_height * out_width;
+ for (int i = 0; i < NumDims - 3; ++i) {
+ pre_contract_dims[0] *= in.dimension(i);
+ }
+ }
+
+  // Molds the output of the contraction into the shape expected by the user
+ // (assuming this is ColMajor):
+ // - 1st dim: kernel filters
+ // - 2nd dim: output height
+ // - 3rd dim: output width
+ // - 4th dim and beyond: everything else including batch size
+ DSizes<TensorIndex, NumDims> post_contract_dims;
+ if (isColMajor) {
+ post_contract_dims[0] = kernelFilters;
+ post_contract_dims[1] = out_height;
+ post_contract_dims[2] = out_width;
+ for (int i = 3; i < NumDims; ++i) {
+ post_contract_dims[i] = in.dimension(i);
+ }
+ } else {
+ post_contract_dims[NumDims - 1] = kernelFilters;
+ post_contract_dims[NumDims - 2] = out_height;
+ post_contract_dims[NumDims - 3] = out_width;
+ for (int i = 0; i < NumDims - 3; ++i) {
+ post_contract_dims[i] = in.dimension(i);
+ }
+ }
+
+ DSizes<TensorIndex, 2> kernel_dims;
+ if (isColMajor) {
+ kernel_dims[0] = kernelFilters;
+ kernel_dims[1] = kernelChannels * kernelRows * kernelCols;
+ } else {
+ kernel_dims[0] = kernelChannels * kernelRows * kernelCols;
+ kernel_dims[1] = kernelFilters;
+ }
+ // TODO(yangke): choose() is defined in TensorContraction.h -- consider
+ // moving it to somewhere more "common".
+ return
+ input
+ .extract_image_patches(kernelRows, kernelCols, row_stride, col_stride,
+ row_in_stride, col_in_stride, padding_type)
+ .reshape(pre_contract_dims)
+ .contract(kernel.reshape(kernel_dims), contract_dims)
+ .reshape(post_contract_dims);
+}
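+
+// Example usage (illustrative only, not part of this header), assuming a
+// col-major float input of shape (channels, rows, cols) and a kernel of
+// shape (filters, channels, kernel_rows, kernel_cols):
+//
+//   Tensor<float, 3> input(3, 32, 32);
+//   Tensor<float, 4> kernel(8, 3, 5, 5);
+//   input.setRandom();
+//   kernel.setRandom();
+//   Tensor<float, 3> output =
+//       SpatialConvolution(input, kernel, /*row_stride=*/1, /*col_stride=*/1,
+//                          PADDING_SAME);
+//   // With PADDING_SAME and unit strides, output has shape (8, 32, 32).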
+
+} // end namespace Eigen
+
+// clang-format on
+
+#endif // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_EIGEN_SPATIAL_CONVOLUTIONS_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/eigen_tensor_reduced_instantiations_google.h b/tensorflow/contrib/lite/kernels/internal/optimized/eigen_tensor_reduced_instantiations_google.h
new file mode 100644
index 0000000000..7f78f69360
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/eigen_tensor_reduced_instantiations_google.h
@@ -0,0 +1,143 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_EIGEN_TENSOR_REDUCED_INSTANTIATIONS_GOOGLE_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_EIGEN_TENSOR_REDUCED_INSTANTIATIONS_GOOGLE_H_
+
+#define EIGEN_USE_CUSTOM_THREAD_POOL
+#define EIGEN_USE_THREADS
+
+// clang-format off
+
+#include <stdint.h>
+
+#include <cstddef>
+#include <cstring>
+#include <cmath>
+#include <random>
+#include <atomic>
+#include <condition_variable> // NOLINT(build/c++11)
+#include <mutex> // NOLINT(build/c++11)
+#include <thread> // NOLINT(build/c++11)
+#include <functional>
+
+#ifdef _WIN32
+#include <winbase.h>
+#elif defined(__APPLE__)
+#include <mach/mach_time.h>
+#else
+#include <time.h>
+#endif
+
+
+// Because some programs may link Eigen in through other frameworks with
+// different flags, we can run into multiple definition issues if we don't have
+// a private namespace for our versions. This is a nasty hack, but a similar
+// approach is used elsewhere to handle the problem, so it should be stable.
+#define Eigen EigenForTFLite
+
+#include "Eigen/src/Core/util/StaticAssert.h"
+#include "unsupported/Eigen/CXX11/Core"
+#include "unsupported/Eigen/SpecialFunctions"
+
+#include "Eigen/src/Core/util/DisableStupidWarnings.h"
+
+#include "Eigen/Core"
+
+// Beware: the order of the includes matters to some compilers. For example,
+// TensorIndexList.h should be included before TensorDimensions.h in order to
+// use index lists to encode tensor dimensions when compiling with llvm.
+// We're defining this ourselves rather than using the Eigen Tensor header file
+// so that we can alter the macro definition of TENSOR_CONTRACTION_DISPATCH to
+// reduce binary size.
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/ThreadPoolInterface.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorNonBlockingThreadPool.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDimensionList.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorGlobalFunctions.h"
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorStats.h"
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h"
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h"
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMappers.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h"
+#undef TENSOR_CONTRACTION_DISPATCH
+#define TENSOR_CONTRACTION_DISPATCH(METHOD, ALIGNMENT, ARGS) \
+ if (this->m_lhs_inner_dim_contiguous && \
+ this->m_rhs_inner_dim_contiguous && \
+ !this->m_rhs_inner_dim_reordered) { \
+ METHOD<true, true, false, ALIGNMENT> ARGS; \
+ } else { \
+ eigen_assert(false && "Unsupported contraction formats"); \
+ }
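+
+// Note: the stock Eigen TENSOR_CONTRACTION_DISPATCH dispatches over the eight
+// combinations of the three booleans above; keeping only the contiguous,
+// non-reordered case means just one contraction kernel gets instantiated,
+// which is where the binary-size saving comes from. The eigen_assert fires if
+// a model ever hits one of the dropped layouts.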
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h"
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h"
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/Tensor.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h"
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h"
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h"
+
+#include "Eigen/src/Core/util/ReenableStupidWarnings.h"
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_EIGEN_TENSOR_REDUCED_INSTANTIATIONS_GOOGLE_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/eigen_tensor_reduced_instantiations_oss.h b/tensorflow/contrib/lite/kernels/internal/optimized/eigen_tensor_reduced_instantiations_oss.h
new file mode 100644
index 0000000000..1d5c316194
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/eigen_tensor_reduced_instantiations_oss.h
@@ -0,0 +1,167 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// This is essentially unsupported/Eigen/CXX11/Tensor (the Eigen Tensor
+// module header).
+// TODO(petewarden) - move this to a common location in Eigen itself.
+
+// clang-format off
+
+
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_EIGEN_TENSOR_REDUCED_INSTANTIATIONS_OSS_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_EIGEN_TENSOR_REDUCED_INSTANTIATIONS_OSS_H_
+
+
+#include "Eigen/Core"
+
+#if defined(EIGEN_USE_SYCL)
+#undef min
+#undef max
+#undef isnan
+#undef isinf
+#undef isfinite
+#include <CL/sycl.hpp>
+#include <iostream>
+#include <map>
+#include <memory>
+#include <utility>
+#endif
+#include <cmath>
+#include <cstddef>
+#include <cstring>
+
+
+
+
+
+#ifdef _WIN32
+typedef __int16 int16_t;
+typedef unsigned __int16 uint16_t;
+typedef __int32 int32_t;
+typedef unsigned __int32 uint32_t;
+typedef __int64 int64_t;
+typedef unsigned __int64 uint64_t;
+#include <windows.h>
+#else
+#include <stdint.h>
+#include <unistd.h>
+#endif
+
+#if __cplusplus > 199711 || EIGEN_COMP_MSVC >= 1900
+#include <random>
+#endif
+
+#ifdef _WIN32
+#include <windows.h>
+#elif defined(__APPLE__)
+#include <mach/mach_time.h>
+#else
+#include <time.h>
+#endif
+
+// #if defined(EIGEN_USE_LIBXSMM)
+// #include "libxsmm.h"
+// #endif
+
+#ifdef EIGEN_USE_THREADS
+#include "unsupported/Eigen/CXX11/ThreadPool"
+#endif
+
+
+#include "Eigen/src/Core/util/DisableStupidWarnings.h"
+
+#include "unsupported/Eigen/SpecialFunctions"
+#include "unsupported/Eigen/CXX11/src/util/CXX11Meta.h"
+#include "unsupported/Eigen/CXX11/src/util/MaxSizeVector.h"
+
+
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h"
+
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorDimensionList.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorGlobalFunctions.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorBase.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h"
+
+#undef TENSOR_CONTRACTION_DISPATCH
+#define TENSOR_CONTRACTION_DISPATCH(METHOD, ALIGNMENT, ARGS) \
+ if (this->m_lhs_inner_dim_contiguous && \
+ this->m_rhs_inner_dim_contiguous && \
+ !this->m_rhs_inner_dim_reordered) { \
+ METHOD<true, true, false, ALIGNMENT> ARGS; \
+ } else { \
+ eigen_assert(false && "Unsupported contraction formats"); \
+ }
+
+
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorScan.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/Tensor.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorMap.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorRef.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorIO.h"
+
+#include "Eigen/src/Core/util/ReenableStupidWarnings.h"
+
+
+#endif // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_EIGEN_TENSOR_REDUCED_INSTANTIATIONS_OSS_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/multithreaded_conv.h b/tensorflow/contrib/lite/kernels/internal/optimized/multithreaded_conv.h
new file mode 100644
index 0000000000..b3615f4658
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/multithreaded_conv.h
@@ -0,0 +1,195 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_MULTITHREAD_CONV
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_MULTITHREAD_CONV
+
+#include <assert.h>
+#include <stdint.h>
+#include <sys/types.h>
+#include <algorithm>
+#include <cmath>
+#include <limits>
+#include <memory>
+#include <tuple>
+#include <type_traits>
+
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/kernels/internal/common.h"
+#include "tensorflow/contrib/lite/kernels/internal/optimized/eigen_spatial_convolutions.h"
+#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/types.h"
+
+namespace tflite {
+namespace multithreaded_ops {
+
+class EigenThreadPoolWrapper : public Eigen::ThreadPoolInterface {
+ public:
+ explicit EigenThreadPoolWrapper(Eigen::ThreadPool* pool) : pool_(pool) {}
+ ~EigenThreadPoolWrapper() override {}
+
+ void Schedule(std::function<void()> fn) override {
+ pool_->Schedule(std::move(fn));
+ }
+ int NumThreads() const override { return pool_->NumThreads(); }
+ int CurrentThreadId() const override { return pool_->CurrentThreadId(); }
+
+ private:
+ Eigen::ThreadPool* pool_ = nullptr;
+};
+
+// We use a single global threadpool for all convolution operations. This
+// means that inferences started from different threads may block each other,
+// but since the operations would contend for the same CPU cores anyway, this
+// shouldn't affect overall performance.
+const Eigen::ThreadPoolDevice& GetThreadPoolDevice() {
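+  // The thread count is currently hard-coded. The static locals below mean
+  // the pool is created once and reused for the lifetime of the process.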
+ const int thread_count = 4;
+ static Eigen::ThreadPool* tp = new Eigen::ThreadPool(thread_count);
+ static EigenThreadPoolWrapper* thread_pool_wrapper =
+ new EigenThreadPoolWrapper(tp);
+ static Eigen::ThreadPoolDevice* device =
+ new Eigen::ThreadPoolDevice(thread_pool_wrapper, thread_count);
+ return *device;
+}
+
+// Shorthands for the types we need when interfacing with the EigenTensor
+// library.
+typedef Eigen::TensorMap<
+ Eigen::Tensor<float, 2, Eigen::RowMajor, Eigen::DenseIndex>, Eigen::Aligned>
+ EigenMatrix;
+typedef Eigen::TensorMap<
+ Eigen::Tensor<const float, 2, Eigen::RowMajor, Eigen::DenseIndex>,
+ Eigen::Aligned>
+ ConstEigenMatrix;
+
+typedef Eigen::TensorMap<
+ Eigen::Tensor<float, 4, Eigen::RowMajor, Eigen::DenseIndex>, Eigen::Aligned>
+ EigenTensor;
+typedef Eigen::TensorMap<
+ Eigen::Tensor<const float, 4, Eigen::RowMajor, Eigen::DenseIndex>,
+ Eigen::Aligned>
+ ConstEigenTensor;
+
+// Utility functions we need for the EigenTensor API.
+template <typename Device, typename T>
+struct MatMulConvFunctor {
+ // Computes on device "d": out = in0 * in1, where * is matrix
+ // multiplication.
+ void operator()(
+ const Device& d, EigenMatrix out, ConstEigenMatrix in0,
+ ConstEigenMatrix in1,
+ const Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1>& dim_pair) {
+ out.device(d) = in0.contract(in1, dim_pair);
+ }
+};
+
+template <class T>
+class EigenTensorConvFunctor {
+ private:
+ Eigen::PaddingType TfLitePadding2EigenPadding(TfLitePadding padding) {
+ switch (padding) {
+ case kTfLitePaddingValid:
+ return Eigen::PADDING_VALID;
+ case kTfLitePaddingSame:
+ return Eigen::PADDING_SAME;
+ case kTfLitePaddingUnknown:
+ assert(false); // should never get here.
+ return Eigen::PADDING_VALID;
+ }
+    // Unreachable; this only silences a compiler warning about a missing
+    // return value.
+    return Eigen::PADDING_SAME;
+ }
+
+ public:
+ void operator()(const T* input_data, T* im2col_buffer, int input_batches,
+ int input_height, int input_width, int input_depth,
+ const T* filter_data, int filter_height, int filter_width,
+ int filter_count, int stride_rows, int stride_cols,
+ int pad_width, int pad_height, TfLitePadding padding,
+ T* output_data, int output_height, int output_width) {
+ const Eigen::ThreadPoolDevice& device = GetThreadPoolDevice();
+
+ const bool is_1x1_kernel = (filter_height == 1 && filter_width == 1 &&
+ stride_rows == 1 && stride_cols == 1);
+ if (is_1x1_kernel) {
+ // For 1x1 kernel, the 2D convolution is reduced to matrix
+ // multiplication.
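+      // Treating the input as a (output_height * output_width) x input_depth
+      // matrix and the filter as an input_depth x filter_count matrix, each
+      // output pixel is one row of the resulting matrix product.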
+ const int conv_width = output_height * output_width;
+ Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1> dim_pair;
+ dim_pair[0] = Eigen::IndexPair<Eigen::DenseIndex>(1, 0);
+ EigenMatrix output(output_data, conv_width, filter_count);
+ ConstEigenMatrix input(input_data, conv_width, input_depth);
+ ConstEigenMatrix filter(filter_data, input_depth, filter_count);
+ MatMulConvFunctor<Eigen::ThreadPoolDevice, T>()(device, output, input,
+ filter, dim_pair);
+ } else if (filter_height == input_height && filter_width == input_width &&
+ pad_width == 0 && pad_height == 0) {
+      // If the filter covers the entire input (same height/width, no
+      // padding), the 2D convolution reduces to a matrix multiplication.
+ const int k = // Length of reduction dimension.
+ filter_width * filter_height * input_depth;
+ Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1> dim_pair;
+ dim_pair[0] = Eigen::IndexPair<Eigen::DenseIndex>(1, 0);
+ EigenMatrix output(output_data, 1, filter_count);
+ ConstEigenMatrix input(input_data, 1, k);
+ ConstEigenMatrix filter(filter_data, k, filter_count);
+ MatMulConvFunctor<Eigen::ThreadPoolDevice, T>()(device, output, input,
+ filter, dim_pair);
+ } else {
+ EigenTensor output(output_data, input_batches, output_height,
+ output_width, filter_count);
+ ConstEigenTensor input(input_data, input_batches, input_height,
+ input_width, input_depth);
+ ConstEigenTensor filter(filter_data, filter_height, filter_width,
+ input_depth, filter_count);
+ output.device(device) =
+ Eigen::SpatialConvolution(input, filter, stride_cols, stride_rows,
+ TfLitePadding2EigenPadding(padding));
+ }
+ }
+};
+
+inline void Conv(const float* input_data, const Dims<4>& input_dims,
+ const float* filter_data, const Dims<4>& filter_dims,
+ const float* bias_data, const Dims<4>& bias_dims,
+ int stride_width, int stride_height, int pad_width,
+ int pad_height, TfLitePadding padding,
+ float output_activation_min, float output_activation_max,
+ float* output_data, const Dims<4>& output_dims,
+ float* im2col_data, const Dims<4>& im2col_dims) {
+ const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+ const int input_depth = MatchingArraySize(input_dims, 0, filter_dims, 0);
+ const int output_depth = MatchingArraySize(filter_dims, 3, output_dims, 0);
+ const int input_height = ArraySize(input_dims, 2);
+ const int input_width = ArraySize(input_dims, 1);
+ const int filter_height = ArraySize(filter_dims, 2);
+ const int filter_width = ArraySize(filter_dims, 1);
+ const int output_height = ArraySize(output_dims, 2);
+ const int output_width = ArraySize(output_dims, 1);
+ EigenTensorConvFunctor<float> conv_functor;
+ conv_functor(input_data, im2col_data, batches, input_height, input_width,
+ input_depth, filter_data, filter_height, filter_width,
+ output_depth, stride_height, stride_width, pad_height, pad_width,
+ padding, output_data, output_height, output_width);
+
+ optimized_ops::AddBiasAndEvalActivationFunction(
+ bias_data, bias_dims, output_data, output_dims, output_activation_min,
+ output_activation_max);
+}
+
+} // namespace multithreaded_ops
+} // namespace tflite
+
+#endif // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_MULTITHREAD_CONV
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/neon_tensor_utils.cc b/tensorflow/contrib/lite/kernels/internal/optimized/neon_tensor_utils.cc
new file mode 100644
index 0000000000..bf0bdfb1fb
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/neon_tensor_utils.cc
@@ -0,0 +1,337 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <string.h>
+
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/kernels/activation_functor.h"
+#include "tensorflow/contrib/lite/kernels/internal/optimized/tensor_utils_impl.h"
+
+#ifdef USE_NEON
+
+#include <arm_neon.h>
+#define kFloatWeightsPerNeonLane 4
+
+namespace tflite {
+namespace tensor_utils {
+
+void NeonMatrixBatchVectorMultiplyAccumulate(const float* matrix, int m_rows,
+ int m_cols, const float* vector,
+ int n_batch, float* result,
+ int result_stride) {
+  // If m_cols is not divisible by kFloatWeightsPerNeonLane, we cannot use the
+  // main vectorized loop, and we need to process the remainder sequentially.
+  // postamble_start is the column index where this should happen.
+ const int postamble_start =
+ m_cols - (m_cols & (kFloatWeightsPerNeonLane - 1));
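+  // For example, with m_cols = 10, postamble_start = 8: columns 0..7 are
+  // handled by the vectorized loop and columns 8..9 by the scalar postamble.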
+
+  // Cache for the vector: one float32x4_t per group of four input values.
+  // Note that operator new[] takes an element count, so no sizeof factor is
+  // needed here.
+  float32x4_t* vector_cache_float32x4 =
+      new float32x4_t[m_cols / kFloatWeightsPerNeonLane];
+ const int kUnrollSize = 2;
+ for (int b = 0; b < n_batch; b++) {
+ float* result_in_batch = result + b * m_rows * result_stride;
+ const float* vector_in_batch = vector + b * m_cols;
+
+ const float* matrix_ptr0 = matrix;
+ // If there is only 1 row, we don't want to assign an illegal pointer.
+ const float* matrix_ptr1 = nullptr;
+ if (m_rows > 1) {
+ matrix_ptr1 = matrix + m_cols;
+ }
+
+    // Cache the vector.
+ for (int c = 0; c < postamble_start; c += kFloatWeightsPerNeonLane) {
+ vector_cache_float32x4[c >> 2] = vld1q_f32(vector_in_batch + c);
+ }
+
+    // Main matrix-by-vector multiplication loop; each iteration handles two
+    // matrix rows.
+ for (int r = 0; r < (m_rows & ~(kUnrollSize - 1)); r += kUnrollSize) {
+ float32x4_t acc0_32x4 = vmovq_n_f32(0.0);
+ float32x4_t acc1_32x4 = vmovq_n_f32(0.0);
+ for (int c = 0; c < postamble_start; c += kFloatWeightsPerNeonLane) {
+ float32x4_t temp = vector_cache_float32x4[c >> 2];
+        // Load 4 float values from each of the two matrix rows.
+ float32x4_t v0_f32x4 = vld1q_f32(matrix_ptr0 + c);
+ float32x4_t v1_f32x4 = vld1q_f32(matrix_ptr1 + c);
+ // Vector multiply-accumulate 4 float
+ acc0_32x4 = vmlaq_f32(acc0_32x4, v0_f32x4, temp);
+ acc1_32x4 = vmlaq_f32(acc1_32x4, v1_f32x4, temp);
+ }
+      // Add the 4 intermediate sum values to get the final dot-product value
+      // for each of the two rows.
+ *result_in_batch +=
+ (vgetq_lane_f32(acc0_32x4, 0) + vgetq_lane_f32(acc0_32x4, 1) +
+ vgetq_lane_f32(acc0_32x4, 2) + vgetq_lane_f32(acc0_32x4, 3));
+ *(result_in_batch + result_stride) +=
+ (vgetq_lane_f32(acc1_32x4, 0) + vgetq_lane_f32(acc1_32x4, 1) +
+ vgetq_lane_f32(acc1_32x4, 2) + vgetq_lane_f32(acc1_32x4, 3));
+ for (int c = postamble_start; c < m_cols; c++) {
+ *result_in_batch += matrix_ptr0[c] * vector_in_batch[c];
+ *(result_in_batch + result_stride) +=
+ matrix_ptr1[c] * vector_in_batch[c];
+ }
+ matrix_ptr0 += kUnrollSize * m_cols;
+ matrix_ptr1 += kUnrollSize * m_cols;
+ result_in_batch += kUnrollSize * result_stride;
+ }
+ for (int r = (m_rows & ~(kUnrollSize - 1)); r < m_rows; r++) {
+ float32x4_t acc0_32x4 = vmovq_n_f32(0.0);
+ for (int c = 0; c < postamble_start; c += kFloatWeightsPerNeonLane) {
+ float32x4_t temp = vector_cache_float32x4[c >> 2];
+        // Load 4 float values from the current matrix row.
+ float32x4_t v0_f32x4 = vld1q_f32(matrix_ptr0 + c);
+ // Vector multiply-accumulate 4 float
+ acc0_32x4 = vmlaq_f32(acc0_32x4, v0_f32x4, temp);
+ }
+      // Add the 4 intermediate sum values to get the final dot-product value
+      // for this row.
+ *result_in_batch +=
+ (vgetq_lane_f32(acc0_32x4, 0) + vgetq_lane_f32(acc0_32x4, 1) +
+ vgetq_lane_f32(acc0_32x4, 2) + vgetq_lane_f32(acc0_32x4, 3));
+ for (int c = postamble_start; c < m_cols; c++) {
+ *result_in_batch += matrix_ptr0[c] * vector_in_batch[c];
+ }
+ matrix_ptr0 += m_cols;
+ result_in_batch += result_stride;
+ }
+ }
+ delete[] vector_cache_float32x4;
+}
+
+void NeonVectorVectorCwiseProduct(const float* vector1, const float* vector2,
+ int v_size, float* result) {
+  // If v_size is not divisible by kFloatWeightsPerNeonLane, we cannot use the
+  // main vectorized loop, and we need to process the remainder sequentially.
+  // postamble_start is the start index where this should happen.
+ const int postamble_start =
+ v_size - (v_size & (kFloatWeightsPerNeonLane - 1));
+ for (int v = 0; v < postamble_start; v += kFloatWeightsPerNeonLane) {
+ // Load 4 float values from vector1 and vector2.
+ float32x4_t v1_f32x4 = vld1q_f32(vector1 + v);
+ float32x4_t v2_f32x4 = vld1q_f32(vector2 + v);
+ // Vector multiply 4 float
+ float32x4_t mul_32x4 = vmulq_f32(v1_f32x4, v2_f32x4);
+ // Save to result array.
+ vst1q_f32(&result[v], mul_32x4);
+ }
+ for (int v = postamble_start; v < v_size; v++) {
+ result[v] = vector1[v] * vector2[v];
+ }
+}
+
+void NeonVectorVectorCwiseProductAccumulate(const float* vector1,
+ const float* vector2, int v_size,
+ float* result) {
+  // If v_size is not divisible by kFloatWeightsPerNeonLane, we cannot use the
+  // main vectorized loop, and we need to process the remainder sequentially.
+  // postamble_start is the start index where this should happen.
+ const int postamble_start =
+ v_size - (v_size & (kFloatWeightsPerNeonLane - 1));
+ for (int v = 0; v < postamble_start; v += kFloatWeightsPerNeonLane) {
+ // Load 4 float values from vector1 and vector2 and accumulator.
+ float32x4_t v1_f32x4 = vld1q_f32(vector1 + v);
+ float32x4_t v2_f32x4 = vld1q_f32(vector2 + v);
+ float32x4_t acc_32x4 = vld1q_f32(result + v);
+ // Vector multiply-accumulate 4 float
+ acc_32x4 = vmlaq_f32(acc_32x4, v1_f32x4, v2_f32x4);
+ // Save to result array.
+ vst1q_f32(&result[v], acc_32x4);
+ }
+ for (int v = postamble_start; v < v_size; v++) {
+ result[v] += vector1[v] * vector2[v];
+ }
+}
+
+void NeonVectorBatchVectorCwiseProductAccumulate(const float* vector,
+ int v_size,
+ const float* batch_vector,
+ int n_batch, float* result) {
+  // If v_size is not divisible by kFloatWeightsPerNeonLane, we cannot use the
+  // main vectorized loop, and we need to process the remainder sequentially.
+  // postamble_start is the start index where this should happen.
+ const int postamble_start =
+ v_size - (v_size & (kFloatWeightsPerNeonLane - 1));
+
+  // Cache for the vector: one float32x4_t per group of four input values.
+  // Note that operator new[] takes an element count, so no sizeof factor is
+  // needed here.
+  float32x4_t* vector_cache_float32x4 =
+      new float32x4_t[v_size / kFloatWeightsPerNeonLane];
+ for (int v = 0; v < postamble_start; v += kFloatWeightsPerNeonLane) {
+ vector_cache_float32x4[v >> 2] = vld1q_f32(vector + v);
+ }
+
+ float* result_ptr = result;
+ const float* batch_vector_ptr = batch_vector;
+ for (int b = 0; b < n_batch; b++) {
+ for (int v = 0; v < postamble_start; v += kFloatWeightsPerNeonLane) {
+ // Load from memory to vectors.
+ float32x4_t result_f32x4 = vld1q_f32(result_ptr + v);
+ float32x4_t batch_vector_f32x4 = vld1q_f32(batch_vector_ptr + v);
+ // Multiply-accumulate.
+ result_f32x4 = vmlaq_f32(result_f32x4, batch_vector_f32x4,
+ vector_cache_float32x4[v >> 2]);
+ // Store.
+ vst1q_f32(result_ptr + v, result_f32x4);
+ }
+ // Postamble loop
+ for (int v = postamble_start; v < v_size; v++) {
+ result_ptr[v] += vector[v] * batch_vector_ptr[v];
+ }
+ // Update the pointers.
+ result_ptr += v_size;
+ batch_vector_ptr += v_size;
+ }
+ delete[] vector_cache_float32x4;
+}
+
+void NeonSub1Vector(const float* vector, int v_size, float* result) {
+  // If v_size is not divisible by kFloatWeightsPerNeonLane, we cannot use the
+  // main vectorized loop, and we need to process the remainder sequentially.
+  // postamble_start is the start index where this should happen.
+ const int postamble_start =
+ v_size - (v_size & (kFloatWeightsPerNeonLane - 1));
+
+ float32x4_t one_f32x4 = vmovq_n_f32(1.0);
+ for (int v = 0; v < postamble_start; v += kFloatWeightsPerNeonLane) {
+    // Load 4 float values from the current position of the input vector and
+    // subtract each of them from 1.
+ float32x4_t v_f32x4 = vld1q_f32(vector + v);
+ float32x4_t result_f32x4 = vsubq_f32(one_f32x4, v_f32x4);
+ // Save to output.
+ vst1q_f32(result + v, result_f32x4);
+ }
+ for (int v = postamble_start; v < v_size; v++) {
+ result[v] = 1.0f - vector[v];
+ }
+}
+
+void NeonClipVector(const float* vector, int v_size, float abs_limit,
+ float* result) {
+  // If v_size is not divisible by kFloatWeightsPerNeonLane, we cannot use the
+  // main vectorized loop, and we need to process the remainder sequentially.
+  // postamble_start is the start index where this should happen.
+ const int postamble_start =
+ v_size - (v_size & (kFloatWeightsPerNeonLane - 1));
+
+ // Replicate abs_limit and -abs_limit in two vectors.
+ const float32x4_t abs_limit_f32x4 = vmovq_n_f32(abs_limit);
+ const float32x4_t neg_abs_limit_f32x4 = vmovq_n_f32(-abs_limit);
+
+ for (int v = 0; v < postamble_start; v += kFloatWeightsPerNeonLane) {
+ // Load from memory to vector.
+ float32x4_t v_f32x4 = vld1q_f32(vector + v);
+ // Clip between abs_limit and -abs_limit.
+ float32x4_t result_f32x4 = vminq_f32(abs_limit_f32x4, v_f32x4);
+ result_f32x4 = vmaxq_f32(neg_abs_limit_f32x4, result_f32x4);
+ // Save to output.
+ vst1q_f32(result + v, result_f32x4);
+ }
+ // Postamble loop.
+ for (int v = postamble_start; v < v_size; v++) {
+ result[v] = (abs_limit < vector[v]) ? abs_limit : vector[v];
+ result[v] = (-abs_limit > result[v]) ? -abs_limit : result[v];
+ }
+}
+
+float NeonVectorVectorDotProduct(const float* vector1, const float* vector2,
+ int v_size) {
+  // If v_size is not divisible by kFloatWeightsPerNeonLane, we cannot use the
+  // main vectorized loop, and we need to process the remainder sequentially.
+  // postamble_start is the start index where this should happen.
+ const int postamble_start =
+ v_size - (v_size & (kFloatWeightsPerNeonLane - 1));
+ float32x4_t acc_32x4 = vmovq_n_f32(0.0);
+ for (int v = 0; v < postamble_start; v += kFloatWeightsPerNeonLane) {
+ // Load 4 float values from vector1 and vector2 and accumulator.
+ float32x4_t v1_f32x4 = vld1q_f32(vector1 + v);
+ float32x4_t v2_f32x4 = vld1q_f32(vector2 + v);
+ // Vector multiply-accumulate 4 float
+ acc_32x4 = vmlaq_f32(acc_32x4, v1_f32x4, v2_f32x4);
+ }
+
+ float result = (vgetq_lane_f32(acc_32x4, 0) + vgetq_lane_f32(acc_32x4, 1) +
+ vgetq_lane_f32(acc_32x4, 2) + vgetq_lane_f32(acc_32x4, 3));
+ // Postamble loop.
+ for (int v = postamble_start; v < v_size; v++) {
+ result += vector1[v] * vector2[v];
+ }
+ return result;
+}
+
+void NeonBatchVectorBatchVectorDotProduct(const float* vector1,
+ const float* vector2, int v_size,
+ int n_batch, float* result,
+ int result_stride) {
+ float* result_ptr = result;
+ const float* vector1_ptr = vector1;
+ const float* vector2_ptr = vector2;
+ for (int b = 0; b < n_batch; b++) {
+ *result_ptr = NeonVectorVectorDotProduct(vector1_ptr, vector2_ptr, v_size);
+ vector1_ptr += v_size;
+ vector2_ptr += v_size;
+ result_ptr += result_stride;
+ }
+}
+
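+// Adds the sum of each consecutive block of |reduction_size| elements of
+// |input_vector| to the corresponding element of |output_vector|.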
+void NeonReductionSumVector(const float* input_vector, float* output_vector,
+ int output_size, int reduction_size) {
+ const float* input_vector_ptr = input_vector;
+ for (int o = 0; o < output_size; o++) {
+    // If reduction_size is not divisible by kFloatWeightsPerNeonLane, we
+    // cannot use the main vectorized loop, and we need to process the
+    // remainder sequentially. postamble_start is the start index where this
+    // should happen.
+ const int postamble_start =
+ reduction_size - (reduction_size & (kFloatWeightsPerNeonLane - 1));
+ float32x4_t sum_f32x4 = vmovq_n_f32(0.0);
+ for (int r = 0; r < postamble_start; r += kFloatWeightsPerNeonLane) {
+ float32x4_t v1_f32x4 = vld1q_f32(input_vector_ptr + r);
+ sum_f32x4 = vaddq_f32(sum_f32x4, v1_f32x4);
+ }
+ output_vector[o] +=
+ (vgetq_lane_f32(sum_f32x4, 0) + vgetq_lane_f32(sum_f32x4, 1) +
+ vgetq_lane_f32(sum_f32x4, 2) + vgetq_lane_f32(sum_f32x4, 3));
+ input_vector_ptr += postamble_start;
+
+ // Postamble loop.
+ for (int r = postamble_start; r < reduction_size; r++) {
+ output_vector[o] += *input_vector_ptr++;
+ }
+ }
+}
+
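+// Shifts the elements of |vector| left by one position (dropping the first
+// element) and stores |shift_value| in the last slot.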
+void NeonVectorShiftLeft(float* vector, int v_size, float shift_value) {
+  // This variable tracks the last index read by the next vectorized copy, so
+  // that we never read past the end of the vector.
+ int last_index_copy = kFloatWeightsPerNeonLane;
+ int current_index_copy = 0;
+ while (last_index_copy < v_size) {
+ float32x4_t v_f32x4 = vld1q_f32(vector + current_index_copy + 1);
+ vst1q_f32(vector + current_index_copy, v_f32x4);
+ current_index_copy += kFloatWeightsPerNeonLane;
+ last_index_copy += kFloatWeightsPerNeonLane;
+ }
+ // Postamble loop.
+ for (int i = current_index_copy; i < v_size - 1; i++) {
+ vector[i] = vector[i + 1];
+ }
+ vector[v_size - 1] = shift_value;
+}
+
+} // namespace tensor_utils
+} // namespace tflite
+
+#endif // USE_NEON
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/neon_tensor_utils.h b/tensorflow/contrib/lite/kernels/internal/optimized/neon_tensor_utils.h
new file mode 100644
index 0000000000..3a4af87304
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/neon_tensor_utils.h
@@ -0,0 +1,113 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_NEON_TENSOR_UTILS_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_NEON_TENSOR_UTILS_H_
+
+// TODO(ghodrat): Remove this header file and the dependency to internal data
+// structure.
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/kernels/internal/optimized/cpu_check.h"
+#include "tensorflow/contrib/lite/kernels/internal/optimized/tensor_utils_impl.h"
+
+namespace tflite {
+namespace tensor_utils {
+
+void MatrixBatchVectorMultiplyAccumulate(const float* matrix, int m_rows,
+ int m_cols, const float* vector,
+ int n_batch, float* result,
+ int result_stride) {
+ NEON_OR_PORTABLE(MatrixBatchVectorMultiplyAccumulate, matrix, m_rows, m_cols,
+ vector, n_batch, result, result_stride);
+}
+
+void VectorVectorCwiseProduct(const float* vector1, const float* vector2,
+ int v_size, float* result) {
+ NEON_OR_PORTABLE(VectorVectorCwiseProduct, vector1, vector2, v_size, result);
+}
+
+void VectorVectorCwiseProductAccumulate(const float* vector1,
+ const float* vector2, int v_size,
+ float* result) {
+ NEON_OR_PORTABLE(VectorVectorCwiseProductAccumulate, vector1, vector2, v_size,
+ result);
+}
+
+void VectorBatchVectorCwiseProductAccumulate(const float* vector, int v_size,
+ const float* batch_vector,
+ int n_batch, float* result) {
+ NEON_OR_PORTABLE(VectorBatchVectorCwiseProductAccumulate, vector, v_size,
+ batch_vector, n_batch, result);
+}
+
+float VectorVectorDotProduct(const float* vector1, const float* vector2,
+ int v_size) {
+ return NEON_OR_PORTABLE(VectorVectorDotProduct, vector1, vector2, v_size);
+}
+
+void BatchVectorBatchVectorDotProduct(const float* vector1,
+ const float* vector2, int v_size,
+ int n_batch, float* result,
+ int result_stride) {
+ NEON_OR_PORTABLE(BatchVectorBatchVectorDotProduct, vector1, vector2, v_size,
+ n_batch, result, result_stride);
+}
+
+void VectorBatchVectorAssign(const float* vector, int v_size, int n_batch,
+ float* batch_vector) {
+ PortableVectorBatchVectorAssign(vector, v_size, n_batch, batch_vector);
+}
+
+void ApplySigmoidToVector(const float* vector, int v_size, float* result) {
+ PortableApplySigmoidToVector(vector, v_size, result);
+}
+
+void ApplyActivationToVector(const float* vector, int v_size,
+ TfLiteFusedActivation activation, float* result) {
+ PortableApplyActivationToVector(vector, v_size, activation, result);
+}
+
+void CopyVector(const float* vector, int v_size, float* result) {
+ PortableCopyVector(vector, v_size, result);
+}
+
+void Sub1Vector(const float* vector, int v_size, float* result) {
+ NEON_OR_PORTABLE(Sub1Vector, vector, v_size, result);
+}
+
+void ZeroVector(float* vector, int v_size) {
+ PortableZeroVector(vector, v_size);
+}
+
+float Clip(float f, float abs_limit) { return PortableClip(f, abs_limit); }
+
+void ClipVector(const float* vector, int v_size, float abs_limit,
+ float* result) {
+ NEON_OR_PORTABLE(ClipVector, vector, v_size, abs_limit, result);
+}
+
+void VectorShiftLeft(float* vector, int v_size, float shift_value) {
+ NEON_OR_PORTABLE(VectorShiftLeft, vector, v_size, shift_value);
+}
+
+void ReductionSumVector(const float* input_vector, float* output_vector,
+ int output_size, int reduction_size) {
+ NEON_OR_PORTABLE(ReductionSumVector, input_vector, output_vector, output_size,
+ reduction_size);
+}
+
+} // namespace tensor_utils
+} // namespace tflite
+
+#endif // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_NEON_TENSOR_UTILS_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
new file mode 100644
index 0000000000..cd565c16a1
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
@@ -0,0 +1,3715 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_OPS_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_OPS_H_
+
+#include <assert.h>
+#include <stdint.h>
+#include <sys/types.h>
+#include <algorithm>
+#include <cmath>
+#include <limits>
+#include <memory>
+#include <tuple>
+#include <type_traits>
+
+#include "third_party/eigen3/Eigen/Core"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "fixedpoint/fixedpoint.h"
+#include "public/gemmlowp.h"
+#include "tensorflow/contrib/lite/kernels/internal/common.h"
+#include "tensorflow/contrib/lite/kernels/internal/round.h"
+#include "tensorflow/contrib/lite/kernels/internal/types.h"
+
+namespace tflite {
+namespace optimized_ops {
+
+// Make a local VectorMap typedef that allows mapping a float array
+// as an Eigen vector expression. The std::conditional here is to
+// construct the suitable Eigen type for the constness of the
+// data. Indeed, for const data, we need to produce
+// Eigen::Map<const Eigen::Matrix<float, ...>>
+// and not the more straightforward
+// Eigen::Map<Eigen::Matrix<const float, ...>>
+template <typename Scalar>
+using VectorMap = typename std::conditional<
+ std::is_const<Scalar>::value,
+ Eigen::Map<const Eigen::Matrix<typename std::remove_const<Scalar>::type,
+ Eigen::Dynamic, 1>>,
+ Eigen::Map<Eigen::Matrix<Scalar, Eigen::Dynamic, 1>>>::type;
+
+template <typename Scalar, int N>
+VectorMap<Scalar> MapAsVector(Scalar* data, const Dims<N>& dims) {
+ const int size = RequiredBufferSizeForDims(dims);
+ return VectorMap<Scalar>(data, size, 1);
+}
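+// For example, a kernel can zero an output buffer through Eigen with
+//   MapAsVector(output_data, output_dims).setZero();
+// (illustrative only; output_data and output_dims stand for whatever buffer
+// and dims the caller has at hand).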
+
+// Make a local MatrixMap typedef that allows mapping a float array
+// as an Eigen matrix expression. The same explanation as for VectorMap
+// above also applies here.
+template <typename Scalar>
+using MatrixMap = typename std::conditional<
+ std::is_const<Scalar>::value,
+ Eigen::Map<const Eigen::Matrix<typename std::remove_const<Scalar>::type,
+ Eigen::Dynamic, Eigen::Dynamic>>,
+ Eigen::Map<Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic>>>::type;
+
+template <typename Scalar, int N>
+MatrixMap<Scalar> MapAsMatrixWithFirstDimAsRows(Scalar* data,
+ const Dims<N>& dims) {
+ const int rows = dims.sizes[0];
+ int cols = 1;
+ for (int d = 1; d < N; d++) {
+ cols *= dims.sizes[d];
+ }
+ return MatrixMap<Scalar>(data, rows, cols);
+}
+
+template <typename Scalar, int N>
+MatrixMap<Scalar> MapAsMatrixWithLastDimAsCols(Scalar* data,
+ const Dims<N>& dims) {
+ const int cols = dims.sizes[N - 1];
+ int rows = 1;
+ for (int d = 0; d < N - 1; d++) {
+ rows *= dims.sizes[d];
+ }
+ return MatrixMap<Scalar>(data, rows, cols);
+}
+
+template <typename Scalar>
+using ArrayMap = typename std::conditional<
+ std::is_const<Scalar>::value,
+ Eigen::Map<const Eigen::Array<typename std::remove_const<Scalar>::type,
+ Eigen::Dynamic, Eigen::Dynamic>>,
+ Eigen::Map<Eigen::Array<Scalar, Eigen::Dynamic, Eigen::Dynamic>>>::type;
+
+template <typename Scalar, int N>
+ArrayMap<Scalar> MapAsArrayWithFirstDimAsRows(Scalar* data,
+ const Dims<N>& dims) {
+ const int rows = dims.sizes[0];
+ int cols = 1;
+ for (int d = 1; d < N; d++) {
+ cols *= dims.sizes[d];
+ }
+ return ArrayMap<Scalar>(data, rows, cols);
+}
+
+// TODO(b/62193649): this function is only needed as long
+// as we have the --variable_batch hack.
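+// For example, with dims.sizes = {10, 2, 3, 4} and rows = 20, the running
+// product of sizes matches 20 after the second dimension, and the data is
+// mapped as a 20 x 12 matrix.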
+template <typename Scalar, int N>
+MatrixMap<Scalar> MapAsMatrixWithGivenNumberOfRows(Scalar* data,
+ const Dims<N>& dims,
+ int rows) {
+ int cols = 1;
+ bool matched_rows = false;
+ for (int d = 0; d < N; d++) {
+ cols *= dims.sizes[d];
+ if (cols == rows) {
+ matched_rows = true;
+ cols = 1;
+ }
+ }
+ TFLITE_DCHECK(matched_rows);
+ return MatrixMap<Scalar>(data, rows, cols);
+}
+
+// DO NOT USE THIS STRUCT FOR NEW FUNCTIONALITY BEYOND IMPLEMENTING ELEMENT-WISE
+// BROADCASTING.
+//
+// NdArrayDesc<N> describes the shape and memory layout of an N-dimensional
+// rectangular array of numbers.
+//
+// NdArrayDesc<N> is basically identical to Dims<N> defined in types.h.
+// However, as Dims<N> is to be deprecated, this class exists as an adaptor
+// to enable simple unoptimized implementations of element-wise broadcasting
+// operations.
+template <int N>
+struct NdArrayDesc {
+ // The "extent" of each dimension. Indices along dimension d must be in the
+ // half-open interval [0, extents[d]).
+ int extents[N];
+
+ // The number of *elements* (not bytes) between consecutive indices of each
+ // dimension.
+ int strides[N];
+};
+
+// DO NOT USE THIS FUNCTION FOR NEW FUNCTIONALITY BEYOND IMPLEMENTING
+// ELEMENT-WISE BROADCASTING.
+//
+// Same as Offset(), except takes as NdArrayDesc<N> instead of Dims<N>.
+inline int SubscriptToIndex(const NdArrayDesc<4>& desc, int i0, int i1, int i2,
+ int i3) {
+ TFLITE_DCHECK(i0 >= 0 && i0 < desc.extents[0]);
+ TFLITE_DCHECK(i1 >= 0 && i1 < desc.extents[1]);
+ TFLITE_DCHECK(i2 >= 0 && i2 < desc.extents[2]);
+ TFLITE_DCHECK(i3 >= 0 && i3 < desc.extents[3]);
+ return i0 * desc.strides[0] + i1 * desc.strides[1] + i2 * desc.strides[2] +
+ i3 * desc.strides[3];
+}
+
+// Given the dimensions of the operands for an element-wise binary broadcast,
+// adjusts them so that they can be directly iterated over with simple loops.
+// Returns the adjusted dims as instances of NdArrayDesc in 'desc0_out' and
+// 'desc1_out'. 'desc0_out' and 'desc1_out' cannot be nullptr.
+//
+// This function assumes that the two input shapes are compatible up to
+// broadcasting and the shorter one has already been prepended with 1s to be the
+// same length. E.g., if shape0 is (1, 16, 16, 64) and shape1 is (1, 64),
+// shape1 must already have been prepended to be (1, 1, 1, 64). Recall that
+// Dims<N> refer to shapes in reverse order. In this case, input0_dims will be
+// (64, 16, 16, 1) and input1_dims will be (64, 1, 1, 1).
+//
+// When two shapes are compatible up to broadcasting, for each dimension d,
+// the input extents are either equal, or one of them is 1.
+//
+// This function performs the following for each dimension d:
+// - If the extents are equal, then do nothing since the loop that walks over
+// both of the input arrays is correct.
+// - Otherwise, one (and only one) of the extents must be 1. Say extent0 is 1
+// and extent1 is e1. Then set extent0 to e1 and stride0 *to 0*. This allows
+// array0 to be referenced *at any index* in dimension d and still access the
+// same slice.
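+//
+// Continuing the example above (with array1 packed), desc1_out ends up with
+// extents (64, 16, 16, 1) and strides (1, 0, 0, 64): any index along the two
+// broadcast dimensions maps back into the same 64-element slice of array1.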
+template <int N>
+inline void NdArrayDescsForElementwiseBroadcast(const Dims<N>& input0_dims,
+ const Dims<N>& input1_dims,
+ NdArrayDesc<N>* desc0_out,
+ NdArrayDesc<N>* desc1_out) {
+ TFLITE_DCHECK(desc0_out != nullptr);
+ TFLITE_DCHECK(desc1_out != nullptr);
+
+ // Copy dims to desc.
+ for (int i = 0; i < N; ++i) {
+ desc0_out->extents[i] = input0_dims.sizes[i];
+ desc0_out->strides[i] = input0_dims.strides[i];
+ desc1_out->extents[i] = input1_dims.sizes[i];
+ desc1_out->strides[i] = input1_dims.strides[i];
+ }
+
+ // Walk over each dimension. If the extents are equal do nothing.
+ // Otherwise, set the desc with extent 1 to have extent equal to the other and
+ // stride 0.
+ for (int i = 0; i < N; ++i) {
+ const int extent0 = ArraySize(input0_dims, i);
+ const int extent1 = ArraySize(input1_dims, i);
+ if (extent0 != extent1) {
+ if (extent0 == 1) {
+ desc0_out->strides[i] = 0;
+ desc0_out->extents[i] = extent1;
+ } else {
+ TFLITE_DCHECK_EQ(extent1, 1);
+ desc1_out->strides[i] = 0;
+ desc1_out->extents[i] = extent0;
+ }
+ }
+ }
+}
+
+inline bool AreSameDims(const Dims<4>& dims1, const Dims<4>& dims2) {
+ for (int i = 0; i < 4; i++) {
+ if (dims1.sizes[i] != dims2.sizes[i]) {
+ return false;
+ }
+ }
+ return true;
+}
+
+inline void AddBiasAndEvalActivationFunction(const float* bias_data,
+ const Dims<4>& bias_dims,
+ float* array_data,
+ const Dims<4>& array_dims,
+ float output_activation_min,
+ float output_activation_max) {
+#ifdef USE_NEON
+ gemmlowp::ScopedProfilingLabel label("AddBiasAndEvalActivationFunction");
+ const int bias_size = bias_dims.sizes[3] * bias_dims.strides[3];
+ const int array_size = array_dims.sizes[3] * array_dims.strides[3];
+ TFLITE_DCHECK_EQ((array_size % bias_size), 0);
+ float* array_ptr = array_data;
+ float* array_end_ptr = array_ptr + array_size;
+ const auto activation_min = vdupq_n_f32(output_activation_min);
+ const auto activation_max = vdupq_n_f32(output_activation_max);
+ for (; array_ptr != array_end_ptr; array_ptr += bias_size) {
+ int i = 0;
+ for (; i <= bias_size - 16; i += 16) {
+ auto b0 = vld1q_f32(bias_data + i);
+ auto b1 = vld1q_f32(bias_data + i + 4);
+ auto b2 = vld1q_f32(bias_data + i + 8);
+ auto b3 = vld1q_f32(bias_data + i + 12);
+ auto a0 = vld1q_f32(array_ptr + i);
+ auto a1 = vld1q_f32(array_ptr + i + 4);
+ auto a2 = vld1q_f32(array_ptr + i + 8);
+ auto a3 = vld1q_f32(array_ptr + i + 12);
+ auto x0 = vaddq_f32(a0, b0);
+ auto x1 = vaddq_f32(a1, b1);
+ auto x2 = vaddq_f32(a2, b2);
+ auto x3 = vaddq_f32(a3, b3);
+ x0 = vmaxq_f32(activation_min, x0);
+ x1 = vmaxq_f32(activation_min, x1);
+ x2 = vmaxq_f32(activation_min, x2);
+ x3 = vmaxq_f32(activation_min, x3);
+ x0 = vminq_f32(activation_max, x0);
+ x1 = vminq_f32(activation_max, x1);
+ x2 = vminq_f32(activation_max, x2);
+ x3 = vminq_f32(activation_max, x3);
+ vst1q_f32(array_ptr + i, x0);
+ vst1q_f32(array_ptr + i + 4, x1);
+ vst1q_f32(array_ptr + i + 8, x2);
+ vst1q_f32(array_ptr + i + 12, x3);
+ }
+ for (; i <= bias_size - 4; i += 4) {
+ auto b = vld1q_f32(bias_data + i);
+ auto a = vld1q_f32(array_ptr + i);
+ auto x = vaddq_f32(a, b);
+ x = vmaxq_f32(activation_min, x);
+ x = vminq_f32(activation_max, x);
+ vst1q_f32(array_ptr + i, x);
+ }
+ for (; i < bias_size; i++) {
+ array_ptr[i] = ActivationFunctionWithMinMax(array_ptr[i] + bias_data[i],
+ output_activation_min,
+ output_activation_max);
+ }
+ }
+#else // not NEON
+ gemmlowp::ScopedProfilingLabel label("AddBiasAndEvalActivationFunction");
+ const int bias_size = bias_dims.sizes[3] * bias_dims.strides[3];
+ const int array_size = array_dims.sizes[3] * array_dims.strides[3];
+ TFLITE_DCHECK_EQ((array_size % bias_size), 0);
+ for (int array_offset = 0; array_offset < array_size;
+ array_offset += bias_size) {
+ for (int i = 0; i < bias_size; i++) {
+ array_data[array_offset + i] = ActivationFunctionWithMinMax(
+ array_data[array_offset + i] + bias_data[i], output_activation_min,
+ output_activation_max);
+ }
+ }
+#endif
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void AddBiasAndEvalActivationFunction(const float* bias_data,
+ const Dims<4>& bias_dims,
+ float* array_data,
+ const Dims<4>& array_dims) {
+ float output_activation_min, output_activation_max;
+ GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
+ AddBiasAndEvalActivationFunction(bias_data, bias_dims, array_data, array_dims,
+ output_activation_min,
+ output_activation_max);
+}
+
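+// Computes *result = lhs * rhs via Eigen. The single-column case is
+// dispatched separately so that profiling distinguishes GEMV from GEMM.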
+template <typename Lhs, typename Rhs, typename Result>
+void Gemm(const Eigen::MatrixBase<Lhs>& lhs, const Eigen::MatrixBase<Rhs>& rhs,
+ Eigen::MatrixBase<Result>* result) {
+ if (rhs.cols() == 1) {
+ gemmlowp::ScopedProfilingLabel label("GEMV");
+ result->col(0).noalias() = lhs * rhs.col(0);
+ } else {
+ gemmlowp::ScopedProfilingLabel label("GEMM");
+ result->noalias() = lhs * rhs;
+ }
+}
+
+inline void FullyConnected(const float* input_data, const Dims<4>& input_dims,
+ const float* weights_data,
+ const Dims<4>& weights_dims, const float* bias_data,
+ const Dims<4>& bias_dims,
+ float output_activation_min,
+ float output_activation_max, float* output_data,
+ const Dims<4>& output_dims) {
+ gemmlowp::ScopedProfilingLabel label("FullyConnected");
+ // TODO(b/62193649): this convoluted shape computation (determining
+ // input_rows from the weights_dims, then MapAsMatrixWithGivenNumberOfRows)
+ // is because the current --variable_batch hack consists in overwriting the
+ // 3rd dimension with the runtime batch size, as we don't keep track for each
+ // array of which dimension is the batch dimension in it.
+ // When that is fixed, this should become:
+ // const auto input_matrix_map =
+ // MapAsMatrixWithFirstDimAsRows(input_data, input_dims);
+ const int input_rows = ArraySize(weights_dims, 0);
+ const auto input_matrix_map =
+ MapAsMatrixWithGivenNumberOfRows(input_data, input_dims, input_rows);
+ const auto filter_matrix_map =
+ MapAsMatrixWithFirstDimAsRows(weights_data, weights_dims);
+ auto output_matrix_map =
+ MapAsMatrixWithFirstDimAsRows(output_data, output_dims);
+
+ Gemm(filter_matrix_map.transpose(), input_matrix_map, &output_matrix_map);
+ AddBiasAndEvalActivationFunction(bias_data, bias_dims, output_data,
+ output_dims, output_activation_min,
+ output_activation_max);
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void FullyConnected(const float* input_data, const Dims<4>& input_dims,
+ const float* weights_data, const Dims<4>& weights_dims,
+ const float* bias_data, const Dims<4>& bias_dims,
+ float* output_data, const Dims<4>& output_dims) {
+ float output_activation_min, output_activation_max;
+ GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
+ FullyConnected(input_data, input_dims, weights_data, weights_dims, bias_data,
+ bias_dims, output_activation_min, output_activation_max,
+ output_data, output_dims);
+}
+
+inline void preload_l1_stream(const uint8* ptr) {
+#ifdef GEMMLOWP_ARM_64
+ asm volatile("prfm pldl1strm, [%[ptr]]\n" ::[ptr] "r"(ptr) :);
+#else
+ gemmlowp::Prefetch(ptr);
+#endif
+}
+
+#ifdef USE_NEON
+inline void FullyConnectedAsGEMV(
+ const uint8* input_data, const Dims<4>& input_dims, int32 input_offset,
+ const uint8* filter_data, const Dims<4>& filter_dims, int32 filter_offset,
+ const int32* bias_data, const Dims<4>& bias_dims, int32 output_offset,
+ int32 output_multiplier, int output_shift, int32 output_activation_min,
+ int32 output_activation_max, uint8* output_data,
+ const Dims<4>& output_dims) {
+ gemmlowp::ScopedProfilingLabel label("FullyConnectedAsGEMV/8bit");
+ TFLITE_DCHECK(IsPackedWithoutStrides(input_dims));
+ TFLITE_DCHECK(IsPackedWithoutStrides(filter_dims));
+ TFLITE_DCHECK(IsPackedWithoutStrides(bias_dims));
+ TFLITE_DCHECK(IsPackedWithoutStrides(output_dims));
+ TFLITE_DCHECK_EQ(ArraySize(output_dims, 1) * ArraySize(output_dims, 2) *
+ ArraySize(output_dims, 3),
+ 1);
+ const int input_size = input_dims.strides[3];
+ const int output_size = MatchingArraySize(filter_dims, 1, output_dims, 0);
+ static constexpr int kPeel = 4;
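+  // kPeel output rows are processed per iteration of the outer loop below;
+  // the horizontal reduction assumes kPeel == 4 (see the static_assert).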
+ for (int k = 0; k < input_size; k += 64) {
+ preload_l1_stream(input_data + k);
+ }
+ for (int k = 0; k < kPeel * input_size; k += 64) {
+ preload_l1_stream(filter_data + k);
+ }
+ TFLITE_DCHECK(!(output_size % kPeel));
+ const int32* bias_ptr = bias_data;
+ uint8* output_ptr = output_data;
+ for (int out = 0; out < output_size; out += kPeel) {
+ int32x4_t acc[kPeel];
+ for (int k = 0; k < kPeel; k++) {
+ acc[k] = vdupq_n_s32(0);
+ }
+ const int16x8_t input_offset_vec = vdupq_n_s16(input_offset);
+ const int16x8_t filter_offset_vec = vdupq_n_s16(filter_offset);
+ int in = 0;
+ for (; in <= input_size - 16; in += 16) {
+ const uint8x16_t input_val_u8 = vld1q_u8(input_data + in);
+ uint8x16_t filter_val_u8[kPeel];
+ for (int k = 0; k < kPeel; k++) {
+ const uint8* filter_ptr = filter_data + in + (out + k) * input_size;
+ filter_val_u8[k] = vld1q_u8(filter_ptr);
+ preload_l1_stream(filter_ptr + 64);
+ }
+ int16x8_t input_val[2];
+ const uint8x8_t low = vget_low_u8(input_val_u8);
+ const uint8x8_t high = vget_high_u8(input_val_u8);
+ input_val[0] = vreinterpretq_s16_u16(vmovl_u8(low));
+ input_val[1] = vreinterpretq_s16_u16(vmovl_u8(high));
+ input_val[0] = vaddq_s16(input_val[0], input_offset_vec);
+ input_val[1] = vaddq_s16(input_val[1], input_offset_vec);
+ int16x8_t filter_val[kPeel][2];
+ for (int k = 0; k < kPeel; k++) {
+ const uint8x8_t low = vget_low_u8(filter_val_u8[k]);
+ const uint8x8_t high = vget_high_u8(filter_val_u8[k]);
+ filter_val[k][0] = vreinterpretq_s16_u16(vmovl_u8(low));
+ filter_val[k][1] = vreinterpretq_s16_u16(vmovl_u8(high));
+ filter_val[k][0] = vaddq_s16(filter_val[k][0], filter_offset_vec);
+ filter_val[k][1] = vaddq_s16(filter_val[k][1], filter_offset_vec);
+ }
+ for (int p = 0; p < 2; p++) {
+ for (int k = 0; k < kPeel; k++) {
+ acc[k] = vmlal_s16(acc[k], vget_low_s16(filter_val[k][p]),
+ vget_low_s16(input_val[p]));
+ }
+ for (int k = 0; k < kPeel; k++) {
+ acc[k] = vmlal_s16(acc[k], vget_high_s16(filter_val[k][p]),
+ vget_high_s16(input_val[p]));
+ }
+ }
+ }
+ for (; in <= input_size - 8; in += 8) {
+ const uint8x8_t input_val_u8 = vld1_u8(input_data + in);
+ uint8x8_t filter_val_u8[kPeel];
+ for (int k = 0; k < kPeel; k++) {
+ const uint8* filter_ptr = filter_data + in + (out + k) * input_size;
+ filter_val_u8[k] = vld1_u8(filter_ptr);
+ }
+ int16x8_t input_val;
+ input_val = vreinterpretq_s16_u16(vmovl_u8(input_val_u8));
+ input_val = vaddq_s16(input_val, input_offset_vec);
+ int16x8_t filter_val[kPeel];
+ for (int k = 0; k < kPeel; k++) {
+ filter_val[k] = vreinterpretq_s16_u16(vmovl_u8(filter_val_u8[k]));
+ filter_val[k] = vaddq_s16(filter_val[k], filter_offset_vec);
+ }
+ for (int k = 0; k < kPeel; k++) {
+ acc[k] = vmlal_s16(acc[k], vget_low_s16(filter_val[k]),
+ vget_low_s16(input_val));
+ }
+ for (int k = 0; k < kPeel; k++) {
+ acc[k] = vmlal_s16(acc[k], vget_high_s16(filter_val[k]),
+ vget_high_s16(input_val));
+ }
+ }
+ if (in < input_size) {
+ int32 buf[4 * kPeel];
+ for (int k = 0; k < 4; k++) {
+ vst1q_s32(buf + 4 * k, acc[k]);
+ }
+ for (; in < input_size; in++) {
+ int lane = (in + 8 - input_size) % 4;
+ const int32 input_val = input_data[in] + input_offset;
+ for (int k = 0; k < kPeel; k++) {
+ int32 filter_val =
+ filter_data[in + (out + k) * input_size] + filter_offset;
+ buf[lane + 4 * k] += filter_val * input_val;
+ }
+ }
+ for (int k = 0; k < 4; k++) {
+ acc[k] = vld1q_s32(buf + 4 * k);
+ }
+ }
+
+ // Horizontally reduce accumulators
+ int32x2_t pairwise_reduced_acc[kPeel];
+ for (int k = 0; k < kPeel; k++) {
+ pairwise_reduced_acc[k] =
+ vpadd_s32(vget_low_s32(acc[k]), vget_high_s32(acc[k]));
+ }
+ static_assert(kPeel == 4, "the code below currently assumes kPeel = 4");
+ const int32x2_t reduced_lo =
+ vpadd_s32(pairwise_reduced_acc[0], pairwise_reduced_acc[1]);
+ const int32x2_t reduced_hi =
+ vpadd_s32(pairwise_reduced_acc[2], pairwise_reduced_acc[3]);
+ int32x4_t reduced = vcombine_s32(reduced_lo, reduced_hi);
+ // Add bias values.
+ int32x4_t bias_vec = vld1q_s32(bias_ptr);
+ bias_ptr += 4;
+ reduced = vaddq_s32(reduced, bias_vec);
+ // Multiply by the fixed-point multiplier.
+ reduced = vqrdmulhq_n_s32(reduced, output_multiplier);
+ // Rounding-shift-right.
+ using gemmlowp::RoundingDivideByPOT;
+ reduced = RoundingDivideByPOT(reduced, output_shift);
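+    // Together, the saturating doubling-high multiply above and this
+    // rounding shift apply the real-valued scale
+    // output_multiplier * 2^(-31 - output_shift).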
+ // Add the output offset.
+ const int32x4_t output_offset_vec = vdupq_n_s32(output_offset);
+ reduced = vaddq_s32(reduced, output_offset_vec);
+ // Narrow values down to 16 bit signed.
+ const int16x4_t res16 = vqmovn_s32(reduced);
+ // Narrow values down to 8 bit unsigned, saturating.
+ uint8x8_t res8 = vqmovun_s16(vcombine_s16(res16, res16));
+ // Apply the clamping from the activation function
+ res8 = vmax_u8(res8, vdup_n_u8(output_activation_min));
+ res8 = vmin_u8(res8, vdup_n_u8(output_activation_max));
+ // Store results to destination. Assumes 32bit alignment.
+ vst1_lane_u32(reinterpret_cast<uint32*>(output_ptr),
+ vreinterpret_u32_u8(res8), 0);
+ output_ptr += kPeel;
+ }
+}
+#endif // USE_NEON
+
+struct GemmlowpOutputPipeline {
+ typedef gemmlowp::VectorMap<const int32, gemmlowp::VectorShape::Col>
+ ColVectorMap;
+ typedef std::tuple<
+ gemmlowp::OutputStageBiasAddition<ColVectorMap>,
+ gemmlowp::OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint,
+ gemmlowp::OutputStageClamp, gemmlowp::OutputStageSaturatingCastToUint8>
+ Pipeline;
+ static Pipeline Make(const int32* bias_data, int output_rows,
+ int32 output_offset, int32 output_multiplier,
+ int output_shift, int32 output_activation_min,
+ int32 output_activation_max) {
+ ColVectorMap bias_vector(bias_data, output_rows);
+ gemmlowp::OutputStageBiasAddition<ColVectorMap> bias_addition_stage;
+ bias_addition_stage.bias_vector = bias_vector;
+ gemmlowp::OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint
+ quantize_down_stage;
+ quantize_down_stage.result_offset_after_shift = output_offset;
+ quantize_down_stage.result_fixedpoint_multiplier = output_multiplier;
+ quantize_down_stage.result_shift = output_shift;
+ gemmlowp::OutputStageClamp clamp_stage;
+ clamp_stage.min = output_activation_min;
+ clamp_stage.max = output_activation_max;
+ gemmlowp::OutputStageSaturatingCastToUint8 saturating_cast_stage;
+ return std::make_tuple(bias_addition_stage, quantize_down_stage,
+ clamp_stage, saturating_cast_stage);
+ }
+};
+
+inline void FullyConnected(const uint8* input_data, const Dims<4>& input_dims,
+ int32 input_offset, const uint8* filter_data,
+ const Dims<4>& filter_dims, int32 filter_offset,
+ const int32* bias_data, const Dims<4>& bias_dims,
+ int32 output_offset, int32 output_multiplier,
+ int output_shift, int32 output_activation_min,
+ int32 output_activation_max, uint8* output_data,
+ const Dims<4>& output_dims,
+ gemmlowp::GemmContext* gemm_context) {
+ gemmlowp::ScopedProfilingLabel label("FullyConnected/8bit");
+ // TODO(benoitjacob): This really should be:
+ // const int batches = ArraySize(output_dims, 1);
+ // but the current --variable_batch hack consists in overwriting the 3rd
+ // dimension with the runtime batch size, as we don't keep track for each
+ // array of which dimension is the batch dimension in it.
+ const int batches = ArraySize(output_dims, 1) * ArraySize(output_dims, 2) *
+ ArraySize(output_dims, 3);
+#ifdef USE_NEON
+ const int output_size = MatchingArraySize(filter_dims, 1, output_dims, 0);
+ if (batches == 1 && !(output_size % 4)) {
+ return FullyConnectedAsGEMV(
+ input_data, input_dims, input_offset, filter_data, filter_dims,
+ filter_offset, bias_data, bias_dims, output_offset, output_multiplier,
+ output_shift, output_activation_min, output_activation_max, output_data,
+ output_dims);
+ }
+#endif // USE_NEON
+ const int filter_rows = filter_dims.sizes[1];
+ const int filter_cols = filter_dims.sizes[0];
+ TFLITE_DCHECK_EQ(filter_dims.sizes[2], 1);
+ TFLITE_DCHECK_EQ(filter_dims.sizes[3], 1);
+ const int output_rows = output_dims.sizes[0];
+ TFLITE_DCHECK_EQ(output_rows, filter_rows);
+ TFLITE_DCHECK_EQ(bias_dims.sizes[0], output_rows);
+ TFLITE_DCHECK_EQ(bias_dims.sizes[1], 1);
+ TFLITE_DCHECK_EQ(bias_dims.sizes[2], 1);
+ TFLITE_DCHECK_EQ(bias_dims.sizes[3], 1);
+
+ gemmlowp::MatrixMap<const uint8, gemmlowp::MapOrder::RowMajor> filter_matrix(
+ filter_data, output_rows, filter_cols, filter_cols);
+ gemmlowp::MatrixMap<const uint8, gemmlowp::MapOrder::ColMajor> input_matrix(
+ input_data, filter_cols, batches, filter_cols);
+ gemmlowp::MatrixMap<uint8, gemmlowp::MapOrder::ColMajor> output_matrix(
+ output_data, output_rows, batches, output_rows);
+ const auto& output_pipeline = GemmlowpOutputPipeline::Make(
+ bias_data, output_rows, output_offset, output_multiplier, output_shift,
+ output_activation_min, output_activation_max);
+ gemmlowp::GemmWithOutputPipeline<uint8, uint8,
+ gemmlowp::L8R8WithLhsNonzeroBitDepthParams>(
+ gemm_context, filter_matrix, input_matrix, &output_matrix, filter_offset,
+ input_offset, output_pipeline);
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void FullyConnected(const uint8* input_data, const Dims<4>& input_dims,
+ int32 input_offset, const uint8* filter_data,
+ const Dims<4>& filter_dims, int32 filter_offset,
+ const int32* bias_data, const Dims<4>& bias_dims,
+ int32 output_offset, int32 output_multiplier,
+ int output_shift, int32 output_activation_min,
+ int32 output_activation_max, uint8* output_data,
+ const Dims<4>& output_dims,
+ gemmlowp::GemmContext* gemm_context) {
+ static_assert(Ac == FusedActivationFunctionType::kNone ||
+ Ac == FusedActivationFunctionType::kRelu ||
+ Ac == FusedActivationFunctionType::kRelu6 ||
+ Ac == FusedActivationFunctionType::kRelu1,
+ "");
+ FullyConnected(input_data, input_dims, input_offset, filter_data, filter_dims,
+ filter_offset, bias_data, bias_dims, output_offset,
+ output_multiplier, output_shift, output_activation_min,
+ output_activation_max, output_data, output_dims, gemm_context);
+}
+
+template <typename T>
+inline void ExtractPatchIntoBufferColumn(
+ const Dims<4>& input_dims, int w, int h, int b, int kheight, int kwidth,
+ int stride_width, int stride_height, int pad_width, int pad_height,
+ int in_width, int in_height, int in_depth, int single_buffer_length,
+ int buffer_id, const T* in_data, T* conv_buffer_data, uint8 byte_zero) {
+ gemmlowp::ScopedProfilingLabel label("ExtractPatchIntoBufferColumn");
+ // This chunk of code reshapes all the inputs corresponding to
+ // output (b, h, w) to a column vector in conv_buffer(:, buffer_id).
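+  // The column holds one value per element of the kheight x kwidth x
+  // in_depth patch; patch locations that fall outside the input image are
+  // filled with byte_zero below.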
+ const int kwidth_times_indepth = kwidth * in_depth;
+ const int inwidth_times_indepth = in_width * in_depth;
+ const int ih_ungated_start = h * stride_height - pad_height;
+ const int ih_ungated_end = (ih_ungated_start + kheight);
+ const int ih_end = std::min(ih_ungated_end, in_height);
+ const int iw_ungated_start = w * stride_width - pad_width;
+ const int iw_ungated_end = (iw_ungated_start + kwidth);
+ const int iw_end = std::min(iw_ungated_end, in_width);
+ // If the patch is off the edge of the input image, skip writing those rows
+ // and columns from the patch into the output array.
+ const int h_offset = std::max(0, -ih_ungated_start);
+ const int w_offset = std::max(0, -iw_ungated_start);
+ const int ih_start = std::max(0, ih_ungated_start);
+ const int iw_start = std::max(0, iw_ungated_start);
+ const int single_row_num =
+ std::min(kwidth - w_offset, in_width - iw_start) * in_depth;
+ const int output_row_offset = (buffer_id * single_buffer_length);
+ int out_offset =
+ output_row_offset + (h_offset * kwidth + w_offset) * in_depth;
+ int in_offset = Offset(input_dims, 0, iw_start, ih_start, b);
+
+ // Express all of the calculations as padding around the input patch.
+ const int top_padding = h_offset;
+ const int bottom_padding = (ih_ungated_end - ih_end);
+ const int left_padding = w_offset;
+ const int right_padding = (iw_ungated_end - iw_end);
+ assert(single_row_num ==
+ ((kwidth - (left_padding + right_padding)) * in_depth));
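+ // The resulting column layout is: top_padding rows of byte_zero, then for
+ // each valid input row [left_padding byte_zero values | copied pixels |
+ // right_padding byte_zero values], then bottom_padding rows of byte_zero;
+ // every row spans kwidth * in_depth elements.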
+
+ // Write out zeroes to the elements representing the top rows of the input
+ // patch that are off the edge of the input image.
+ if (top_padding > 0) {
+ const int top_row_elements = (top_padding * kwidth * in_depth);
+ memset(conv_buffer_data + output_row_offset, byte_zero,
+ (top_row_elements * sizeof(T)));
+ }
+
+ // If the patch is on the interior of the input image horizontally, just copy
+ // over the rows sequentially, otherwise add zero padding at the start or end.
+ if ((left_padding == 0) && (right_padding == 0)) {
+ for (int ih = ih_start; ih < ih_end; ++ih) {
+ memcpy(conv_buffer_data + out_offset, in_data + in_offset,
+ single_row_num * sizeof(T));
+ out_offset += kwidth_times_indepth;
+ in_offset += inwidth_times_indepth;
+ }
+ } else {
+ for (int ih = ih_start; ih < ih_end; ++ih) {
+ if (left_padding > 0) {
+ const int left_start = (out_offset - (left_padding * in_depth));
+ memset(conv_buffer_data + left_start, byte_zero,
+ (left_padding * in_depth * sizeof(T)));
+ }
+ memcpy(conv_buffer_data + out_offset, in_data + in_offset,
+ single_row_num * sizeof(T));
+ if (right_padding > 0) {
+ const int right_start = (out_offset + single_row_num);
+ memset(conv_buffer_data + right_start, byte_zero,
+ (right_padding * in_depth * sizeof(T)));
+ }
+ out_offset += kwidth_times_indepth;
+ in_offset += inwidth_times_indepth;
+ }
+ }
+
+ // If the bottom of the patch falls off the input image, pad the values
+ // representing those input rows with zeroes.
+ if (bottom_padding > 0) {
+ const int bottom_row_elements = (bottom_padding * kwidth * in_depth);
+ const int bottom_start =
+ output_row_offset +
+ ((top_padding + (ih_end - ih_start)) * kwidth * in_depth);
+ memset(conv_buffer_data + bottom_start, byte_zero,
+ (bottom_row_elements * sizeof(T)));
+ }
+}
+
+template <typename T>
+void Im2col(const T* input_data, const Dims<4>& input_dims, int stride_width,
+ int stride_height, int pad_width, int pad_height, int kheight,
+ int kwidth, uint8 byte_zero, T* output_data,
+ const Dims<4>& output_dims) {
+ gemmlowp::ScopedProfilingLabel label("Im2col");
+ TFLITE_DCHECK(IsPackedWithoutStrides(input_dims));
+ TFLITE_DCHECK(IsPackedWithoutStrides(output_dims));
+ const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+ const int input_depth = ArraySize(input_dims, 0);
+ const int input_width = ArraySize(input_dims, 1);
+ const int input_height = ArraySize(input_dims, 2);
+ const int output_depth = ArraySize(output_dims, 0);
+ const int output_width = ArraySize(output_dims, 1);
+ const int output_height = ArraySize(output_dims, 2);
+
+ int buffer_id = 0;
+ // Loop over the output nodes.
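+ // Each output location (b, h, w) produces one buffer column; output_depth of
+ // the im2col buffer is expected to equal kheight * kwidth * input_depth, the
+ // number of input values covered by one filter window.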
+ for (int b = 0; b < batches; ++b) {
+ for (int h = 0; h < output_height; ++h) {
+ for (int w = 0; w < output_width; ++w) {
+ ExtractPatchIntoBufferColumn(
+ input_dims, w, h, b, kheight, kwidth, stride_width, stride_height,
+ pad_width, pad_height, input_width, input_height, input_depth,
+ output_depth, buffer_id, input_data, output_data, byte_zero);
+ ++buffer_id;
+ }
+ }
+ }
+}
+
+// legacy, for compatibility with old checked-in code
+template <typename T>
+void Im2col(const T* input_data, const Dims<4>& input_dims, int stride,
+ int pad_width, int pad_height, int kheight, int kwidth,
+ uint8 byte_zero, T* output_data, const Dims<4>& output_dims) {
+ Im2col(input_data, input_dims, stride, stride, pad_width, pad_height, kheight,
+ kwidth, byte_zero, output_data, output_dims);
+}
+
+inline void Conv(const float* input_data, const Dims<4>& input_dims,
+ const float* filter_data, const Dims<4>& filter_dims,
+ const float* bias_data, const Dims<4>& bias_dims,
+ int stride_width, int stride_height, int pad_width,
+ int pad_height, float output_activation_min,
+ float output_activation_max, float* output_data,
+ const Dims<4>& output_dims, float* im2col_data,
+ const Dims<4>& im2col_dims) {
+ (void)im2col_data;
+ (void)im2col_dims;
+ gemmlowp::ScopedProfilingLabel label("Conv");
+
+ const float* gemm_input_data = nullptr;
+ const Dims<4>* gemm_input_dims = nullptr;
+ const int filter_width = ArraySize(filter_dims, 1);
+ const int filter_height = ArraySize(filter_dims, 2);
+ const bool need_im2col = stride_width != 1 || stride_height != 1 ||
+ filter_width != 1 || filter_height != 1;
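+ // im2col is only needed when a filter window covers more than one input
+ // element or the stride is not 1; a 1x1, stride-1 convolution can read the
+ // input directly in its existing layout.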
+ if (need_im2col) {
+ TFLITE_DCHECK(im2col_data);
+ Im2col(input_data, input_dims, stride_width, stride_height, pad_width,
+ pad_height, filter_height, filter_width, 0, im2col_data,
+ im2col_dims);
+ gemm_input_data = im2col_data;
+ gemm_input_dims = &im2col_dims;
+ } else {
+ // TODO(aselle): We need to make sure to not send im2col if it is not
+ // needed.
+ TFLITE_DCHECK(!im2col_data);
+ gemm_input_data = input_data;
+ gemm_input_dims = &input_dims;
+ }
+
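+ // Shapes of the GEMM below: filter^T is [out_channels x kh*kw*in_depth], the
+ // (possibly im2col'ed) input is [kh*kw*in_depth x batches*out_h*out_w], so
+ // the product is [out_channels x batches*out_h*out_w]; bias and activation
+ // are applied afterwards.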
+ const auto im2col_matrix_map =
+ MapAsMatrixWithFirstDimAsRows(gemm_input_data, *gemm_input_dims);
+ const auto filter_matrix_map =
+ MapAsMatrixWithLastDimAsCols(filter_data, filter_dims);
+ auto output_matrix_map =
+ MapAsMatrixWithFirstDimAsRows(output_data, output_dims);
+
+ Gemm(filter_matrix_map.transpose(), im2col_matrix_map, &output_matrix_map);
+
+ AddBiasAndEvalActivationFunction(bias_data, bias_dims, output_data,
+ output_dims, output_activation_min,
+ output_activation_max);
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void Conv(const float* input_data, const Dims<4>& input_dims,
+ const float* filter_data, const Dims<4>& filter_dims,
+ const float* bias_data, const Dims<4>& bias_dims, int stride_width,
+ int stride_height, int pad_width, int pad_height, float* output_data,
+ const Dims<4>& output_dims, float* im2col_data,
+ const Dims<4>& im2col_dims) {
+ float output_activation_min, output_activation_max;
+ GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
+ Conv(input_data, input_dims, filter_data, filter_dims, bias_data, bias_dims,
+ stride_width, stride_height, pad_width, pad_height,
+ output_activation_min, output_activation_max, output_data, output_dims,
+ im2col_data, im2col_dims);
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void Conv(const float* input_data, const Dims<4>& input_dims,
+ const float* filter_data, const Dims<4>& filter_dims,
+ const float* bias_data, const Dims<4>& bias_dims, int stride,
+ int pad_width, int pad_height, float* output_data,
+ const Dims<4>& output_dims, float* im2col_data,
+ const Dims<4>& im2col_dims) {
+ Conv<Ac>(input_data, input_dims, filter_data, filter_dims, bias_data,
+ bias_dims, stride, stride, pad_width, pad_height, output_data,
+ output_dims, im2col_data, im2col_dims);
+}
+
+inline void Conv(const uint8* input_data, const Dims<4>& input_dims,
+ int32 input_offset, const uint8* filter_data,
+ const Dims<4>& filter_dims, int32 filter_offset,
+ const int32* bias_data, const Dims<4>& bias_dims,
+ int stride_width, int stride_height, int pad_width,
+ int pad_height, int32 output_offset, int32 output_multiplier,
+ int output_shift, int32 output_activation_min,
+ int32 output_activation_max, uint8* output_data,
+ const Dims<4>& output_dims, uint8* im2col_data,
+ const Dims<4>& im2col_dims,
+ gemmlowp::GemmContext* gemm_context) {
+ gemmlowp::ScopedProfilingLabel label("Conv/8bit");
+
+ TFLITE_DCHECK(IsPackedWithoutStrides(input_dims));
+ TFLITE_DCHECK(IsPackedWithoutStrides(filter_dims));
+ TFLITE_DCHECK(IsPackedWithoutStrides(output_dims));
+
+ const uint8* gemm_input_data = nullptr;
+ const Dims<4>* gemm_input_dims = nullptr;
+ const int filter_width = ArraySize(filter_dims, 1);
+ const int filter_height = ArraySize(filter_dims, 2);
+ const bool need_im2col = stride_width != 1 || stride_height != 1 ||
+ filter_width != 1 || filter_height != 1;
+ if (need_im2col) {
+ TFLITE_DCHECK(im2col_data);
+ const int input_zero_point = -input_offset;
+ TFLITE_DCHECK_GE(input_zero_point, 0);
+ TFLITE_DCHECK_LE(input_zero_point, 255);
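+ // Pad the im2col buffer with the quantized zero point (-input_offset), so
+ // that padded regions encode a real value of 0, matching the zero padding
+ // used in the float path.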
+ Im2col(input_data, input_dims, stride_width, stride_height, pad_width,
+ pad_height, filter_height, filter_width, input_zero_point,
+ im2col_data, im2col_dims);
+ gemm_input_data = im2col_data;
+ gemm_input_dims = &im2col_dims;
+ } else {
+ TFLITE_DCHECK(!im2col_data);
+ gemm_input_data = input_data;
+ gemm_input_dims = &input_dims;
+ }
+
+ const int gemm_input_rows = gemm_input_dims->sizes[0];
+ const int gemm_input_cols = gemm_input_dims->sizes[1] *
+ gemm_input_dims->sizes[2] *
+ gemm_input_dims->sizes[3];
+ const int filter_rows = filter_dims.sizes[3];
+ const int filter_cols =
+ filter_dims.sizes[0] * filter_dims.sizes[1] * filter_dims.sizes[2];
+ const int output_rows = output_dims.sizes[0];
+ const int output_cols =
+ output_dims.sizes[1] * output_dims.sizes[2] * output_dims.sizes[3];
+ TFLITE_DCHECK_EQ(output_rows, filter_rows);
+ TFLITE_DCHECK_EQ(output_cols, gemm_input_cols);
+ TFLITE_DCHECK_EQ(filter_cols, gemm_input_rows);
+ TFLITE_DCHECK_EQ(bias_dims.sizes[0], output_rows);
+ TFLITE_DCHECK_EQ(bias_dims.sizes[1], 1);
+ TFLITE_DCHECK_EQ(bias_dims.sizes[2], 1);
+ TFLITE_DCHECK_EQ(bias_dims.sizes[3], 1);
+ gemmlowp::MatrixMap<const uint8, gemmlowp::MapOrder::RowMajor> filter_matrix(
+ filter_data, filter_rows, filter_cols);
+ gemmlowp::MatrixMap<const uint8, gemmlowp::MapOrder::ColMajor> input_matrix(
+ gemm_input_data, gemm_input_rows, gemm_input_cols);
+ gemmlowp::MatrixMap<uint8, gemmlowp::MapOrder::ColMajor> output_matrix(
+ output_data, output_rows, output_cols);
+ const auto& output_pipeline = GemmlowpOutputPipeline::Make(
+ bias_data, output_rows, output_offset, output_multiplier, output_shift,
+ output_activation_min, output_activation_max);
+ gemmlowp::GemmWithOutputPipeline<uint8, uint8,
+ gemmlowp::L8R8WithLhsNonzeroBitDepthParams>(
+ gemm_context, filter_matrix, input_matrix, &output_matrix, filter_offset,
+ input_offset, output_pipeline);
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+inline void Conv(const uint8* input_data, const Dims<4>& input_dims,
+ int32 input_offset, const uint8* filter_data,
+ const Dims<4>& filter_dims, int32 filter_offset,
+ const int32* bias_data, const Dims<4>& bias_dims,
+ int stride_width, int stride_height, int pad_width,
+ int pad_height, int32 output_offset, int32 output_multiplier,
+ int output_shift, int32 output_activation_min,
+ int32 output_activation_max, uint8* output_data,
+ const Dims<4>& output_dims, uint8* im2col_data,
+ const Dims<4>& im2col_dims,
+ gemmlowp::GemmContext* gemm_context) {
+ static_assert(Ac == FusedActivationFunctionType::kNone ||
+ Ac == FusedActivationFunctionType::kRelu ||
+ Ac == FusedActivationFunctionType::kRelu6 ||
+ Ac == FusedActivationFunctionType::kRelu1,
+ "");
+ if (Ac == FusedActivationFunctionType::kNone) {
+ TFLITE_DCHECK_EQ(output_activation_min, 0);
+ TFLITE_DCHECK_EQ(output_activation_max, 255);
+ }
+ Conv(input_data, input_dims, input_offset, filter_data, filter_dims,
+ filter_offset, bias_data, bias_dims, stride_width, stride_height,
+ pad_width, pad_height, output_offset, output_multiplier, output_shift,
+ output_activation_min, output_activation_max, output_data, output_dims,
+ im2col_data, im2col_dims, gemm_context);
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void Conv(const uint8* input_data, const Dims<4>& input_dims,
+ int32 input_offset, const uint8* filter_data,
+ const Dims<4>& filter_dims, int32 filter_offset,
+ const int32* bias_data, const Dims<4>& bias_dims, int stride,
+ int pad_width, int pad_height, int32 output_offset,
+ int32 output_multiplier, int output_shift,
+ int32 output_activation_min, int32 output_activation_max,
+ uint8* output_data, const Dims<4>& output_dims, uint8* im2col_data,
+ const Dims<4>& im2col_dims, gemmlowp::GemmContext* gemm_context) {
+ static_assert(Ac == FusedActivationFunctionType::kNone ||
+ Ac == FusedActivationFunctionType::kRelu ||
+ Ac == FusedActivationFunctionType::kRelu6 ||
+ Ac == FusedActivationFunctionType::kRelu1,
+ "");
+ Conv(input_data, input_dims, input_offset, filter_data, filter_dims,
+ filter_offset, bias_data, bias_dims, stride, stride, pad_width,
+ pad_height, output_offset, output_multiplier, output_shift,
+ output_activation_min, output_activation_max, output_data, output_dims,
+ im2col_data, im2col_dims, gemm_context);
+}
+
+template <typename T>
+inline void DepthToSpace(const T* input_data, const Dims<4>& input_dims,
+ int block_size, T* output_data,
+ const Dims<4>& output_dims) {
+ gemmlowp::ScopedProfilingLabel label("DepthToSpace");
+
+ const int input_depth = ArraySize(input_dims, 0);
+ const int input_width = ArraySize(input_dims, 1);
+ const int input_height = ArraySize(input_dims, 2);
+
+ const int output_depth = ArraySize(output_dims, 0);
+ const int batch_size = ArraySize(output_dims, 3);
+
+ // Number of contiguous values that we can copy in one iteration.
+ const int stride = block_size * output_depth;
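+ // For example, with block_size = 2 and output_depth = 4, each memcpy below
+ // moves 8 values: one row of a 2x2 block, i.e. two horizontally adjacent
+ // output pixels.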
+
+ for (int batch = 0; batch < batch_size; ++batch) {
+ for (int in_h = 0; in_h < input_height; ++in_h) {
+ const T* input_ptr = input_data + Offset(input_dims, 0, 0, in_h, batch);
+ for (int offset_h = 0; offset_h < block_size; ++offset_h) {
+ const T* src = input_ptr;
+ for (int in_w = 0; in_w < input_width; ++in_w) {
+ memcpy(output_data, src, stride * sizeof(T));
+ output_data += stride;
+ src += input_depth;
+ }
+ input_ptr += stride;
+ }
+ }
+ }
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac, typename T>
+void Im2col(const T* input_data, const Dims<4>& input_dims, int stride,
+ int pad_width, int pad_height, int kheight, int kwidth,
+ uint8 byte_zero, T* output_data, const Dims<4>& output_dims) {
+ Im2col(input_data, input_dims, stride, stride, pad_width, pad_height, kheight,
+ kwidth, byte_zero, output_data, output_dims);
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void ConvAsGemm(const float* input_data, const Dims<4>& input_dims,
+ const float* filter_data, const Dims<4>& filter_dims,
+ const float* bias_data, const Dims<4>& bias_dims,
+ float* output_data, const Dims<4>& output_dims) {
+ gemmlowp::ScopedProfilingLabel label("ConvAsGemm");
+
+ const auto input_matrix_map =
+ MapAsMatrixWithFirstDimAsRows(input_data, input_dims);
+ const auto filter_matrix_map =
+ MapAsMatrixWithLastDimAsCols(filter_data, filter_dims);
+ auto output_matrix_map =
+ MapAsMatrixWithFirstDimAsRows(output_data, output_dims);
+
+ Gemm(filter_matrix_map.transpose(), input_matrix_map, &output_matrix_map);
+
+ AddBiasAndEvalActivationFunction<Ac>(bias_data, bias_dims, output_data,
+ output_dims);
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void ConvAsGemm(const uint8* input_data, const Dims<4>& input_dims,
+ int32 input_offset, const uint8* filter_data,
+ const Dims<4>& filter_dims, int32 filter_offset,
+ const int32* bias_data, const Dims<4>& bias_dims,
+ int32 output_offset, int32 output_multiplier, int output_shift,
+ int32 output_activation_min, int32 output_activation_max,
+ uint8* output_data, const Dims<4>& output_dims,
+ gemmlowp::GemmContext* gemm_context) {
+ gemmlowp::ScopedProfilingLabel label("ConvAsGemm/8bit");
+ static_assert(Ac == FusedActivationFunctionType::kNone ||
+ Ac == FusedActivationFunctionType::kRelu ||
+ Ac == FusedActivationFunctionType::kRelu6 ||
+ Ac == FusedActivationFunctionType::kRelu1,
+ "");
+ const int input_rows = input_dims.sizes[0];
+ const int input_cols =
+ input_dims.sizes[1] * input_dims.sizes[2] * input_dims.sizes[3];
+ const int filter_rows = filter_dims.sizes[3];
+ const int filter_cols =
+ filter_dims.sizes[0] * filter_dims.sizes[1] * filter_dims.sizes[2];
+ const int output_rows = output_dims.sizes[0];
+ const int output_cols =
+ output_dims.sizes[1] * output_dims.sizes[2] * output_dims.sizes[3];
+ TFLITE_DCHECK_EQ(output_rows, filter_rows);
+ TFLITE_DCHECK_EQ(output_cols, input_cols);
+ TFLITE_DCHECK_EQ(filter_cols, input_rows);
+ TFLITE_DCHECK_EQ(bias_dims.sizes[0], output_rows);
+ TFLITE_DCHECK_EQ(bias_dims.sizes[1], 1);
+ TFLITE_DCHECK_EQ(bias_dims.sizes[2], 1);
+ TFLITE_DCHECK_EQ(bias_dims.sizes[3], 1);
+ gemmlowp::MatrixMap<const uint8, gemmlowp::MapOrder::RowMajor> filter_matrix(
+ filter_data, output_rows, filter_cols, filter_cols);
+ gemmlowp::MatrixMap<const uint8, gemmlowp::MapOrder::ColMajor> input_matrix(
+ input_data, filter_cols, output_cols, filter_cols);
+ gemmlowp::MatrixMap<uint8, gemmlowp::MapOrder::ColMajor> output_matrix(
+ output_data, output_rows, output_cols, output_rows);
+ const auto& output_pipeline = GemmlowpOutputPipeline::Make(
+ bias_data, output_rows, output_offset, output_multiplier, output_shift,
+ output_activation_min, output_activation_max);
+ gemmlowp::GemmWithOutputPipeline<uint8, uint8,
+ gemmlowp::L8R8WithLhsNonzeroBitDepthParams>(
+ gemm_context, filter_matrix, input_matrix, &output_matrix, filter_offset,
+ input_offset, output_pipeline);
+}
+
+template <typename T>
+inline void SpaceToDepth(const T* input_data, const Dims<4>& input_dims,
+ int block_size, T* output_data,
+ const Dims<4>& output_dims) {
+ gemmlowp::ScopedProfilingLabel label("SpaceToDepth");
+
+ const int output_depth = ArraySize(output_dims, 0);
+ const int output_width = ArraySize(output_dims, 1);
+ const int output_height = ArraySize(output_dims, 2);
+
+ const int input_depth = ArraySize(input_dims, 0);
+ const int batch_size = ArraySize(input_dims, 3);
+
+ // Number of contiguous values that we can copy in one iteration.
+ const int stride = block_size * input_depth;
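+ // Each memcpy below packs block_size horizontally adjacent input pixels
+ // (block_size * input_depth values) into one contiguous slice of a single
+ // output pixel's depth.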
+
+ for (int batch = 0; batch < batch_size; ++batch) {
+ for (int out_h = 0; out_h < output_height; ++out_h) {
+ T* output_ptr = output_data + Offset(output_dims, 0, 0, out_h, batch);
+ for (int offset_h = 0; offset_h < block_size; ++offset_h) {
+ T* dst = output_ptr;
+ for (int out_w = 0; out_w < output_width; ++out_w) {
+ memcpy(dst, input_data, stride * sizeof(T));
+ input_data += stride;
+ dst += output_depth;
+ }
+ output_ptr += stride;
+ }
+ }
+ }
+}
+
+template <FusedActivationFunctionType Ac>
+void NonGlobalBatchNormalization(
+ const float* input_data, const Dims<4>& input_dims, const float* mean_data,
+ const Dims<4>& mean_dims, const float* multiplier_data,
+ const Dims<4>& multiplier_dims, const float* offset_data,
+ const Dims<4>& offset_dims, float* output_data,
+ const Dims<4>& output_dims) {
+ gemmlowp::ScopedProfilingLabel label("NonGlobalBatchNormalization");
+ const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+ const int height =
+ MatchingArraySize(input_dims, 2, mean_dims, 2, multiplier_dims, 2,
+ offset_dims, 2, output_dims, 2);
+ const int width =
+ MatchingArraySize(input_dims, 1, mean_dims, 1, multiplier_dims, 1,
+ offset_dims, 1, output_dims, 1);
+ const int depth =
+ MatchingArraySize(input_dims, 0, mean_dims, 0, multiplier_dims, 0,
+ offset_dims, 0, output_dims, 0);
+
+ for (int b = 0; b < batches; ++b) {
+ for (int y = 0; y < height; ++y) {
+ for (int x = 0; x < width; ++x) {
+ for (int c = 0; c < depth; ++c) {
+ output_data[Offset(output_dims, c, x, y, b)] = ActivationFunction<Ac>(
+ (input_data[Offset(input_dims, c, x, y, b)] -
+ mean_data[Offset(mean_dims, c, x, y, 0)]) *
+ multiplier_data[Offset(multiplier_dims, c, x, y, 0)] +
+ offset_data[Offset(offset_dims, c, x, y, 0)]);
+ }
+ }
+ }
+ }
+}
+
+template <FusedActivationFunctionType Ac>
+void GlobalBatchNormalization(const float* input_data,
+ const Dims<4>& input_dims, const float* mean_data,
+ const Dims<4>& mean_dims,
+ const float* multiplier_data,
+ const Dims<4>& multiplier_dims,
+ const float* offset_data,
+ const Dims<4>& offset_dims, float* output_data,
+ const Dims<4>& output_dims) {
+ gemmlowp::ScopedProfilingLabel label("GlobalBatchNormalization");
+ const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+ const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
+ const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
+ const int depth =
+ MatchingArraySize(input_dims, 0, mean_dims, 0, multiplier_dims, 0,
+ offset_dims, 0, output_dims, 0);
+
+ for (int b = 0; b < batches; ++b) {
+ for (int y = 0; y < height; ++y) {
+ for (int x = 0; x < width; ++x) {
+ for (int c = 0; c < depth; ++c) {
+ output_data[Offset(output_dims, c, x, y, b)] = ActivationFunction<Ac>(
+ (input_data[Offset(input_dims, c, x, y, b)] -
+ mean_data[Offset(mean_dims, c, 0, 0, 0)]) *
+ multiplier_data[Offset(multiplier_dims, c, 0, 0, 0)] +
+ offset_data[Offset(offset_dims, c, 0, 0, 0)]);
+ }
+ }
+ }
+ }
+}
+
+inline void Relu(const float* input_data, const Dims<4>& input_dims,
+ float* output_data, const Dims<4>& output_dims) {
+ gemmlowp::ScopedProfilingLabel label("Relu (not fused)");
+
+ const auto input = MapAsVector(input_data, input_dims);
+ auto output = MapAsVector(output_data, output_dims);
+ output = input.cwiseMax(0.0f);
+}
+
+inline void Relu1(const float* input_data, const Dims<4>& input_dims,
+ float* output_data, const Dims<4>& output_dims) {
+ gemmlowp::ScopedProfilingLabel label("Relu1 (not fused)");
+ const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+ const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
+ const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
+ const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
+ for (int b = 0; b < batches; ++b) {
+ for (int y = 0; y < height; ++y) {
+ for (int x = 0; x < width; ++x) {
+ for (int c = 0; c < depth; ++c) {
+ float val = input_data[Offset(input_dims, c, x, y, b)];
+ const float upper = 1;
+ const float lower = -1;
+ float clamped = val > upper ? upper : val < lower ? lower : val;
+ output_data[Offset(output_dims, c, x, y, b)] = clamped;
+ }
+ }
+ }
+ }
+}
+
+inline void Relu6(const float* input_data, const Dims<4>& input_dims,
+ float* output_data, const Dims<4>& output_dims) {
+ gemmlowp::ScopedProfilingLabel label("Relu6 (not fused)");
+ const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+ const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
+ const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
+ const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
+ for (int b = 0; b < batches; ++b) {
+ for (int y = 0; y < height; ++y) {
+ for (int x = 0; x < width; ++x) {
+ for (int c = 0; c < depth; ++c) {
+ float val = input_data[Offset(input_dims, c, x, y, b)];
+ const float upper = 6;
+ const float lower = 0;
+ float clamped = val > upper ? upper : val < lower ? lower : val;
+ output_data[Offset(output_dims, c, x, y, b)] = clamped;
+ }
+ }
+ }
+ }
+}
+
+template <FusedActivationFunctionType Ac>
+void L2Normalization(const float* input_data, const Dims<4>& input_dims,
+ float* output_data, const Dims<4>& output_dims) {
+ gemmlowp::ScopedProfilingLabel label("L2Normalization");
+ static_assert(Ac == FusedActivationFunctionType::kNone, "");
+ const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+ const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
+ const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
+ const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
+ for (int b = 0; b < batches; ++b) {
+ for (int y = 0; y < height; ++y) {
+ for (int x = 0; x < width; ++x) {
+ float squared_l2_norm = 0;
+ for (int c = 0; c < depth; ++c) {
+ float val = input_data[Offset(input_dims, c, x, y, b)];
+ squared_l2_norm += val * val;
+ }
+ float inverse_l2_norm = 1.0f / std::sqrt(squared_l2_norm);
+ for (int c = 0; c < depth; ++c) {
+ output_data[Offset(output_dims, c, x, y, b)] =
+ input_data[Offset(input_dims, c, x, y, b)] * inverse_l2_norm;
+ }
+ }
+ }
+ }
+}
+
+inline void GetInvSqrtQuantizedMultiplier(int32 input, int32* output_inv_sqrt,
+ int* output_shift) {
+ *output_shift = 11;
+ while (input >= (1 << 29)) {
+ input /= 4;
+ ++*output_shift;
+ }
+ TFLITE_DCHECK_GT(input, 0);
+ const unsigned max_left_shift_bits = __builtin_clz(input) - 1;
+ const unsigned max_left_shift_bit_pairs = max_left_shift_bits / 2;
+ const unsigned left_shift_bit_pairs = max_left_shift_bit_pairs - 1;
+ *output_shift -= left_shift_bit_pairs;
+ input <<= 2 * left_shift_bit_pairs;
+ TFLITE_DCHECK_GE(input, (1 << 27));
+ TFLITE_DCHECK_LT(input, (1 << 29));
+ using gemmlowp::FixedPoint;
+ using gemmlowp::Rescale;
+ using gemmlowp::SaturatingRoundingMultiplyByPOT;
+ // Using 3 integer bits gives us enough room for the internal arithmetic in
+ // this Newton-Raphson iteration.
+ using F3 = FixedPoint<int32, 3>;
+ using F0 = FixedPoint<int32, 0>;
+ const F3 fixedpoint_input = F3::FromRaw(input >> 1);
+ const F3 fixedpoint_half_input =
+ SaturatingRoundingMultiplyByPOT<-1>(fixedpoint_input);
+ const F3 fixedpoint_half_three =
+ GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F3, (1 << 28) + (1 << 27), 1.5);
+ // Newton-Raphson iteration
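+ // For 1/sqrt(a), the update is x <- x * (3 - a * x^2) / 2, written below as
+ // x = (3/2) * x - (a/2) * x^3 using the precomputed half constants.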
+ // Naive unoptimized starting guess: x = 1
+ F3 x = F3::One();
+ // Naive unoptimized number of iterations: 5
+ for (int i = 0; i < 5; i++) {
+ const F3 x3 = Rescale<3>(x * x * x);
+ x = Rescale<3>(fixedpoint_half_three * x - fixedpoint_half_input * x3);
+ }
+ const F0 fixedpoint_half_sqrt_2 =
+ GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F0, 1518500250, std::sqrt(2.) / 2.);
+ x = x * fixedpoint_half_sqrt_2;
+ *output_inv_sqrt = x.raw();
+ if (*output_shift < 0) {
+ *output_inv_sqrt <<= -*output_shift;
+ *output_shift = 0;
+ }
+}
+
+inline void L2Normalization(const uint8* input_data, const Dims<4>& input_dims,
+ int32 input_zero_point, uint8* output_data,
+ const Dims<4>& output_dims) {
+ gemmlowp::ScopedProfilingLabel label("L2Normalization/8bit");
+ const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+ const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
+ const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
+ const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
+ TFLITE_DCHECK(IsPackedWithoutStrides(input_dims));
+ TFLITE_DCHECK(IsPackedWithoutStrides(output_dims));
+ TFLITE_DCHECK_EQ(batches, 1);
+ TFLITE_DCHECK_EQ(height, 1);
+ TFLITE_DCHECK_EQ(width, 1);
+ int32 square_l2_norm = 0;
+ for (int i = 0; i < depth; i++) {
+ int32 diff = input_data[i] - input_zero_point;
+ square_l2_norm += diff * diff;
+ }
+ int32 inv_l2norm_multiplier;
+ int inv_l2norm_shift;
+ GetInvSqrtQuantizedMultiplier(square_l2_norm, &inv_l2norm_multiplier,
+ &inv_l2norm_shift);
+
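+ // Each output is roughly 128 + 128 * diff / l2_norm, clamped to [0, 255],
+ // i.e. the normalized value in [-1, 1] encoded with scale 1/128 and zero
+ // point 128.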
+ for (int i = 0; i < depth; i++) {
+ int32 diff = input_data[i] - input_zero_point;
+ int32 rescaled_diff = MultiplyByQuantizedMultiplierSmallerThanOne(
+ 128 * diff, inv_l2norm_multiplier, inv_l2norm_shift);
+ int32 unclamped_output_val = 128 + rescaled_diff;
+ int32 output_val = std::min(255, std::max(0, unclamped_output_val));
+ output_data[i] = static_cast<uint8>(output_val);
+ }
+}
+
+inline void Add(const float* input1_data, const Dims<4>& input1_dims,
+ const float* input2_data, const Dims<4>& input2_dims,
+ float output_activation_min, float output_activation_max,
+ float* output_data, const Dims<4>& output_dims) {
+ gemmlowp::ScopedProfilingLabel label("Add");
+ /* const int batches = */ MatchingArraySize(input1_dims, 3, input2_dims, 3,
+ output_dims, 3);
+ /* const int height = */ MatchingArraySize(input1_dims, 2, input2_dims, 2,
+ output_dims, 2);
+ /* const int width = */ MatchingArraySize(input1_dims, 1, input2_dims, 1,
+ output_dims, 1);
+ /* const int depth = */ MatchingArraySize(input1_dims, 0, input2_dims, 0,
+ output_dims, 0);
+ TFLITE_DCHECK(IsPackedWithoutStrides(input1_dims));
+ TFLITE_DCHECK(IsPackedWithoutStrides(input2_dims));
+ TFLITE_DCHECK(IsPackedWithoutStrides(output_dims));
+
+ int i = 0;
+ const int size = input1_dims.sizes[3] * input1_dims.strides[3];
+#ifdef USE_NEON
+ const auto activation_min = vdupq_n_f32(output_activation_min);
+ const auto activation_max = vdupq_n_f32(output_activation_max);
+ for (; i <= size - 16; i += 16) {
+ auto a10 = vld1q_f32(input1_data + i);
+ auto a11 = vld1q_f32(input1_data + i + 4);
+ auto a12 = vld1q_f32(input1_data + i + 8);
+ auto a13 = vld1q_f32(input1_data + i + 12);
+ auto a20 = vld1q_f32(input2_data + i);
+ auto a21 = vld1q_f32(input2_data + i + 4);
+ auto a22 = vld1q_f32(input2_data + i + 8);
+ auto a23 = vld1q_f32(input2_data + i + 12);
+ auto x0 = vaddq_f32(a10, a20);
+ auto x1 = vaddq_f32(a11, a21);
+ auto x2 = vaddq_f32(a12, a22);
+ auto x3 = vaddq_f32(a13, a23);
+ x0 = vmaxq_f32(activation_min, x0);
+ x1 = vmaxq_f32(activation_min, x1);
+ x2 = vmaxq_f32(activation_min, x2);
+ x3 = vmaxq_f32(activation_min, x3);
+ x0 = vminq_f32(activation_max, x0);
+ x1 = vminq_f32(activation_max, x1);
+ x2 = vminq_f32(activation_max, x2);
+ x3 = vminq_f32(activation_max, x3);
+ vst1q_f32(output_data + i, x0);
+ vst1q_f32(output_data + i + 4, x1);
+ vst1q_f32(output_data + i + 8, x2);
+ vst1q_f32(output_data + i + 12, x3);
+ }
+ for (; i <= size - 4; i += 4) {
+ auto a1 = vld1q_f32(input1_data + i);
+ auto a2 = vld1q_f32(input2_data + i);
+ auto x = vaddq_f32(a1, a2);
+ x = vmaxq_f32(activation_min, x);
+ x = vminq_f32(activation_max, x);
+ vst1q_f32(output_data + i, x);
+ }
+#endif  // USE_NEON
+
+ for (; i < size; i++) {
+ auto x = input1_data[i] + input2_data[i];
+ output_data[i] = ActivationFunctionWithMinMax(x, output_activation_min,
+ output_activation_max);
+ }
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void Add(const float* input1_data, const Dims<4>& input1_dims,
+ const float* input2_data, const Dims<4>& input2_dims,
+ float* output_data, const Dims<4>& output_dims) {
+ float output_activation_min, output_activation_max;
+ GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
+
+ Add(input1_data, input1_dims, input2_data, input2_dims, output_activation_min,
+ output_activation_max, output_data, output_dims);
+}
+
+template <FusedActivationFunctionType Ac>
+inline void Add(int left_shift, const uint8* input1_data,
+ const Dims<4>& input1_dims, int32 input1_offset,
+ int32 input1_multiplier, int input1_shift,
+ const uint8* input2_data, const Dims<4>& input2_dims,
+ int32 input2_offset, int32 input2_multiplier, int input2_shift,
+ int32 output_offset, int32 output_multiplier, int output_shift,
+ int32 output_activation_min, int32 output_activation_max,
+ uint8* output_data, const Dims<4>& output_dims) {
+ static_assert(Ac == FusedActivationFunctionType::kNone ||
+ Ac == FusedActivationFunctionType::kRelu ||
+ Ac == FusedActivationFunctionType::kRelu6 ||
+ Ac == FusedActivationFunctionType::kRelu1,
+ "");
+ TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+ if (Ac == FusedActivationFunctionType::kNone) {
+ TFLITE_DCHECK_EQ(output_activation_min, 0);
+ TFLITE_DCHECK_EQ(output_activation_max, 255);
+ }
+ gemmlowp::ScopedProfilingLabel label("Add/8bit");
+ /* const int batches = */ MatchingArraySize(input1_dims, 3, input2_dims, 3,
+ output_dims, 3);
+ /* const int height = */ MatchingArraySize(input1_dims, 2, input2_dims, 2,
+ output_dims, 2);
+ /* const int width = */ MatchingArraySize(input1_dims, 1, input2_dims, 1,
+ output_dims, 1);
+ /* const int depth = */ MatchingArraySize(input1_dims, 0, input2_dims, 0,
+ output_dims, 0);
+ TFLITE_DCHECK(IsPackedWithoutStrides(input1_dims));
+ TFLITE_DCHECK(IsPackedWithoutStrides(input2_dims));
+ TFLITE_DCHECK(IsPackedWithoutStrides(output_dims));
+
+ int i = 0;
+ const int size = input1_dims.sizes[3] * input1_dims.strides[3];
+ TFLITE_DCHECK_GT(input1_offset, -256);
+ TFLITE_DCHECK_GT(input2_offset, -256);
+ TFLITE_DCHECK_LT(input1_offset, 256);
+ TFLITE_DCHECK_LT(input2_offset, 256);
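+ // Rescaling scheme: each input is shifted left by left_shift for headroom,
+ // scaled to a common intermediate scale with its own multiplier/shift, the
+ // two are added, and the sum is scaled to the output scale and offset by
+ // output_offset.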
+#ifdef USE_NEON
+ for (; i <= size - 8; i += 8) {
+ const auto input1_val_original = vld1_u8(input1_data + i);
+ const auto input2_val_original = vld1_u8(input2_data + i);
+ const auto input1_val_s16 =
+ vreinterpretq_s16_u16(vmovl_u8(input1_val_original));
+ const auto input2_val_s16 =
+ vreinterpretq_s16_u16(vmovl_u8(input2_val_original));
+ const auto input1_val =
+ vaddq_s16(input1_val_s16, vdupq_n_s16(input1_offset));
+ const auto input2_val =
+ vaddq_s16(input2_val_s16, vdupq_n_s16(input2_offset));
+ const auto input1_val_high = vget_high_s16(input1_val);
+ const auto input1_val_low = vget_low_s16(input1_val);
+ const auto input2_val_high = vget_high_s16(input2_val);
+ const auto input2_val_low = vget_low_s16(input2_val);
+ auto x11 = vmovl_s16(input1_val_low);
+ auto x12 = vmovl_s16(input1_val_high);
+ auto x21 = vmovl_s16(input2_val_low);
+ auto x22 = vmovl_s16(input2_val_high);
+ const auto left_shift_dup = vdupq_n_s32(left_shift);
+ x11 = vshlq_s32(x11, left_shift_dup);
+ x12 = vshlq_s32(x12, left_shift_dup);
+ x21 = vshlq_s32(x21, left_shift_dup);
+ x22 = vshlq_s32(x22, left_shift_dup);
+ x11 = vqrdmulhq_n_s32(x11, input1_multiplier);
+ x12 = vqrdmulhq_n_s32(x12, input1_multiplier);
+ x21 = vqrdmulhq_n_s32(x21, input2_multiplier);
+ x22 = vqrdmulhq_n_s32(x22, input2_multiplier);
+ const auto input1_shift_dup = vdupq_n_s32(-input1_shift);
+ const auto input2_shift_dup = vdupq_n_s32(-input2_shift);
+ x11 = vshlq_s32(x11, input1_shift_dup);
+ x12 = vshlq_s32(x12, input1_shift_dup);
+ x21 = vshlq_s32(x21, input2_shift_dup);
+ x22 = vshlq_s32(x22, input2_shift_dup);
+ auto s1 = vaddq_s32(x11, x21);
+ auto s2 = vaddq_s32(x12, x22);
+ s1 = vqrdmulhq_n_s32(s1, output_multiplier);
+ s2 = vqrdmulhq_n_s32(s2, output_multiplier);
+ using gemmlowp::RoundingDivideByPOT;
+ s1 = RoundingDivideByPOT(s1, output_shift);
+ s2 = RoundingDivideByPOT(s2, output_shift);
+ const auto s1_narrowed = vmovn_s32(s1);
+ const auto s2_narrowed = vmovn_s32(s2);
+ const auto s = vaddq_s16(vcombine_s16(s1_narrowed, s2_narrowed),
+ vdupq_n_s16(output_offset));
+ vst1_u8(output_data + i, vqmovun_s16(s));
+ }
+#endif  // USE_NEON
+
+ for (; i < size; i++) {
+ const int32 input1_val = input1_offset + input1_data[i];
+ const int32 input2_val = input2_offset + input2_data[i];
+ const int32 shifted_input1_val = input1_val * (1 << left_shift);
+ const int32 shifted_input2_val = input2_val * (1 << left_shift);
+ const int32 scaled_input1_val = MultiplyByQuantizedMultiplierSmallerThanOne(
+ shifted_input1_val, input1_multiplier, input1_shift);
+ const int32 scaled_input2_val = MultiplyByQuantizedMultiplierSmallerThanOne(
+ shifted_input2_val, input2_multiplier, input2_shift);
+ const int32 raw_sum = scaled_input1_val + scaled_input2_val;
+ const int32 raw_output = MultiplyByQuantizedMultiplierSmallerThanOne(
+ raw_sum, output_multiplier, output_shift) +
+ output_offset;
+ const int32 clamped_output = std::min(
+ output_activation_max, std::max(output_activation_min, raw_output));
+ output_data[i] = static_cast<uint8>(clamped_output);
+ }
+}
+
+template <FusedActivationFunctionType Ac>
+void Add(const int32* input1_data, const Dims<4>& input1_dims,
+ const int32* input2_data, const Dims<4>& input2_dims,
+ int32* output_data, const Dims<4>& output_dims) {
+ gemmlowp::ScopedProfilingLabel label("Add/int32");
+ TFLITE_DCHECK(Ac == FusedActivationFunctionType::kNone);
+
+ auto input1_map = MapAsVector(input1_data, input1_dims);
+ auto input2_map = MapAsVector(input2_data, input2_dims);
+ auto output_map = MapAsVector(output_data, output_dims);
+ if (AreSameDims(input1_dims, input2_dims)) {
+ output_map.array() = input1_map.array() + input2_map.array();
+ } else if (RequiredBufferSizeForDims(input2_dims) == 1) {
+ auto scalar = input2_data[0];
+ output_map.array() = input1_map.array() + scalar;
+ } else if (RequiredBufferSizeForDims(input1_dims) == 1) {
+ auto scalar = input1_data[0];
+ output_map.array() = scalar + input2_map.array();
+ } else {
+ // Should never get here.
+ TFLITE_DCHECK(false);
+ }
+}
+
+// TODO(jiawen): We can implement BroadcastAdd on buffers of arbitrary
+// dimensionality if the runtime code does a single loop over one dimension
+// that handles broadcasting as the base case. The code generator would then
+// generate max(D1, D2) nested for loops.
+// TODO(benoitjacob): BroadcastAdd is intentionally duplicated from
+// reference_ops.h. Once an optimized version is implemented and NdArrayDesc<T>
+// is no longer referenced in this file, move NdArrayDesc<T> from types.h to
+// reference_ops.h.
+template <FusedActivationFunctionType Ac, typename T>
+void BroadcastAdd(const T* input1_data, const Dims<4>& input1_dims,
+ const T* input2_data, const Dims<4>& input2_dims,
+ T* output_data, const Dims<4>& output_dims) {
+ gemmlowp::ScopedProfilingLabel label("BroadcastAdd");
+
+ NdArrayDesc<4> desc1;
+ NdArrayDesc<4> desc2;
+ NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
+
+ // In TensorFlow, the dimensions are canonically named (batch_number, row,
+ // col, channel), with extents (batches, height, width, depth), with the
+ // trailing dimension changing most rapidly (channels has the smallest stride,
+ // typically 1 element).
+ //
+ // In generated C code, we store arrays with the dimensions reversed. The
+ // first dimension has smallest stride.
+ //
+ // We name our variables by their TensorFlow convention, but generate C code
+ // nesting loops such that the innermost loop has the smallest stride for the
+ // best cache behavior.
+ for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
+ for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
+ for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
+ for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
+ output_data[Offset(output_dims, c, x, y, b)] = ActivationFunction<Ac>(
+ input1_data[SubscriptToIndex(desc1, c, x, y, b)] +
+ input2_data[SubscriptToIndex(desc2, c, x, y, b)]);
+ }
+ }
+ }
+ }
+}
+
+inline void BroadcastAdd(int left_shift, const uint8* input1_data,
+ const Dims<4>& input1_dims, int32 input1_offset,
+ int32 input1_multiplier, int input1_shift,
+ const uint8* input2_data, const Dims<4>& input2_dims,
+ int32 input2_offset, int32 input2_multiplier,
+ int input2_shift, int32 output_offset,
+ int32 output_multiplier, int output_shift,
+ int32 output_activation_min,
+ int32 output_activation_max, uint8* output_data,
+ const Dims<4>& output_dims) {
+ gemmlowp::ScopedProfilingLabel label("BroadcastAdd/8bit");
+
+ NdArrayDesc<4> desc1;
+ NdArrayDesc<4> desc2;
+ NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
+
+ // In TensorFlow, the dimensions are canonically named (batch_number, row,
+ // col, channel), with extents (batches, height, width, depth), with the
+ // trailing dimension changing most rapidly (channels has the smallest stride,
+ // typically 1 element).
+ //
+ // In generated C code, we store arrays with the dimensions reversed. The
+ // first dimension has smallest stride.
+ //
+ // We name our variables by their TensorFlow convention, but generate C code
+ // nesting loops such that the innermost loop has the smallest stride for the
+ // best cache behavior.
+ for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
+ for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
+ for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
+ for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
+ const int32 input1_val =
+ input1_offset + input1_data[SubscriptToIndex(desc1, c, x, y, b)];
+ const int32 input2_val =
+ input2_offset + input2_data[SubscriptToIndex(desc2, c, x, y, b)];
+ const int32 shifted_input1_val = input1_val * (1 << left_shift);
+ const int32 shifted_input2_val = input2_val * (1 << left_shift);
+ const int32 scaled_input1_val =
+ MultiplyByQuantizedMultiplierSmallerThanOne(
+ shifted_input1_val, input1_multiplier, input1_shift);
+ const int32 scaled_input2_val =
+ MultiplyByQuantizedMultiplierSmallerThanOne(
+ shifted_input2_val, input2_multiplier, input2_shift);
+ const int32 raw_sum = scaled_input1_val + scaled_input2_val;
+ const int32 raw_output =
+ MultiplyByQuantizedMultiplierSmallerThanOne(
+ raw_sum, output_multiplier, output_shift) +
+ output_offset;
+ const int32 clamped_output =
+ std::min(output_activation_max,
+ std::max(output_activation_min, raw_output));
+ output_data[Offset(output_dims, c, x, y, b)] =
+ static_cast<uint8>(clamped_output);
+ }
+ }
+ }
+ }
+}
+
+template <FusedActivationFunctionType Ac>
+inline void BroadcastAdd(int left_shift, const uint8* input1_data,
+ const Dims<4>& input1_dims, int32 input1_offset,
+ int32 input1_multiplier, int input1_shift,
+ const uint8* input2_data, const Dims<4>& input2_dims,
+ int32 input2_offset, int32 input2_multiplier,
+ int input2_shift, int32 output_offset,
+ int32 output_multiplier, int output_shift,
+ int32 output_activation_min,
+ int32 output_activation_max, uint8* output_data,
+ const Dims<4>& output_dims) {
+ static_assert(Ac == FusedActivationFunctionType::kNone ||
+ Ac == FusedActivationFunctionType::kRelu ||
+ Ac == FusedActivationFunctionType::kRelu6 ||
+ Ac == FusedActivationFunctionType::kRelu1,
+ "");
+ TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+ if (Ac == FusedActivationFunctionType::kNone) {
+ TFLITE_DCHECK_EQ(output_activation_min, 0);
+ TFLITE_DCHECK_EQ(output_activation_max, 255);
+ }
+ BroadcastAdd(left_shift, input1_data, input1_dims, input1_offset,
+ input1_multiplier, input1_shift, input2_data, input2_dims,
+ input2_offset, input2_multiplier, input2_shift, output_offset,
+ output_multiplier, output_shift, output_activation_min,
+ output_activation_max, output_data, output_dims);
+}
+
+inline void Mul(const float* input1_data, const Dims<4>& input1_dims,
+ const float* input2_data, const Dims<4>& input2_dims,
+ float output_activation_min, float output_activation_max,
+ float* output_data, const Dims<4>& output_dims) {
+ gemmlowp::ScopedProfilingLabel label("Mul");
+ /* const int batches = */ MatchingArraySize(input1_dims, 3, input2_dims, 3,
+ output_dims, 3);
+ /* const int height = */ MatchingArraySize(input1_dims, 2, input2_dims, 2,
+ output_dims, 2);
+ /* const int width = */ MatchingArraySize(input1_dims, 1, input2_dims, 1,
+ output_dims, 1);
+ /* const int depth = */ MatchingArraySize(input1_dims, 0, input2_dims, 0,
+ output_dims, 0);
+ TFLITE_DCHECK(IsPackedWithoutStrides(input1_dims));
+ TFLITE_DCHECK(IsPackedWithoutStrides(input2_dims));
+ TFLITE_DCHECK(IsPackedWithoutStrides(output_dims));
+
+ int i = 0;
+ const int size = input1_dims.sizes[3] * input1_dims.strides[3];
+#ifdef USE_NEON
+ const auto activation_min = vdupq_n_f32(output_activation_min);
+ const auto activation_max = vdupq_n_f32(output_activation_max);
+ for (; i <= size - 16; i += 16) {
+ auto a10 = vld1q_f32(input1_data + i);
+ auto a11 = vld1q_f32(input1_data + i + 4);
+ auto a12 = vld1q_f32(input1_data + i + 8);
+ auto a13 = vld1q_f32(input1_data + i + 12);
+ auto a20 = vld1q_f32(input2_data + i);
+ auto a21 = vld1q_f32(input2_data + i + 4);
+ auto a22 = vld1q_f32(input2_data + i + 8);
+ auto a23 = vld1q_f32(input2_data + i + 12);
+ auto x0 = vmulq_f32(a10, a20);
+ auto x1 = vmulq_f32(a11, a21);
+ auto x2 = vmulq_f32(a12, a22);
+ auto x3 = vmulq_f32(a13, a23);
+
+ x0 = vmaxq_f32(activation_min, x0);
+ x1 = vmaxq_f32(activation_min, x1);
+ x2 = vmaxq_f32(activation_min, x2);
+ x3 = vmaxq_f32(activation_min, x3);
+ x0 = vminq_f32(activation_max, x0);
+ x1 = vminq_f32(activation_max, x1);
+ x2 = vminq_f32(activation_max, x2);
+ x3 = vminq_f32(activation_max, x3);
+
+ vst1q_f32(output_data + i, x0);
+ vst1q_f32(output_data + i + 4, x1);
+ vst1q_f32(output_data + i + 8, x2);
+ vst1q_f32(output_data + i + 12, x3);
+ }
+ for (; i <= size - 4; i += 4) {
+ auto a1 = vld1q_f32(input1_data + i);
+ auto a2 = vld1q_f32(input2_data + i);
+ auto x = vmulq_f32(a1, a2);
+
+ x = vmaxq_f32(activation_min, x);
+ x = vminq_f32(activation_max, x);
+
+ vst1q_f32(output_data + i, x);
+ }
+#endif  // USE_NEON
+
+ for (; i < size; i++) {
+ auto x = input1_data[i] * input2_data[i];
+ output_data[i] = ActivationFunctionWithMinMax(x, output_activation_min,
+ output_activation_max);
+ }
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void Mul(const float* input1_data, const Dims<4>& input1_dims,
+ const float* input2_data, const Dims<4>& input2_dims,
+ float* output_data, const Dims<4>& output_dims) {
+ float output_activation_min, output_activation_max;
+ GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
+
+ Mul(input1_data, input1_dims, input2_data, input2_dims, output_activation_min,
+ output_activation_max, output_data, output_dims);
+}
+
+template <FusedActivationFunctionType Ac>
+void Mul(const int32* input1_data, const Dims<4>& input1_dims,
+ const int32* input2_data, const Dims<4>& input2_dims,
+ int32* output_data, const Dims<4>& output_dims) {
+ gemmlowp::ScopedProfilingLabel label("Mul/int32");
+ TFLITE_DCHECK(Ac == FusedActivationFunctionType::kNone);
+
+ auto input1_map = MapAsVector(input1_data, input1_dims);
+ auto input2_map = MapAsVector(input2_data, input2_dims);
+ auto output_map = MapAsVector(output_data, output_dims);
+ if (AreSameDims(input1_dims, input2_dims)) {
+ output_map.array() = input1_map.array() * input2_map.array();
+ } else if (RequiredBufferSizeForDims(input2_dims) == 1) {
+ auto scalar = input2_data[0];
+ output_map.array() = input1_map.array() * scalar;
+ } else if (RequiredBufferSizeForDims(input1_dims) == 1) {
+ auto scalar = input1_data[0];
+ output_map.array() = scalar * input2_map.array();
+ } else {
+ // Should never get here.
+ TFLITE_DCHECK(false);
+ }
+}
+
+// TODO(jiawen): We can implement BroadcastMul on buffers of arbitrary
+// dimensionality if the runtime code does a single loop over one dimension
+// that handles broadcasting as the base case. The code generator would then
+// generate max(D1, D2) nested for loops.
+// TODO(benoitjacob): BroadcastMul is intentionally duplicated from
+// reference_ops.h. Once an optimized version is implemented and NdArrayDesc<T>
+// is no longer referenced in this file, move NdArrayDesc<T> from types.h to
+// reference_ops.h.
+template <FusedActivationFunctionType Ac, typename T>
+void BroadcastMul(const T* input1_data, const Dims<4>& input1_dims,
+ const T* input2_data, const Dims<4>& input2_dims,
+ T* output_data, const Dims<4>& output_dims) {
+ gemmlowp::ScopedProfilingLabel label("BroadcastMul");
+
+ NdArrayDesc<4> desc1;
+ NdArrayDesc<4> desc2;
+ NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
+
+ // In TensorFlow, the dimensions are canonically named (batch_number, row,
+ // col, channel), with extents (batches, height, width, depth), with the
+ // trailing dimension changing most rapidly (channels has the smallest stride,
+ // typically 1 element).
+ //
+ // In generated C code, we store arrays with the dimensions reversed. The
+ // first dimension has smallest stride.
+ //
+ // We name our variables by their TensorFlow convention, but generate C code
+ // nesting loops such that the innermost loop has the smallest stride for the
+ // best cache behavior.
+ for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
+ for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
+ for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
+ for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
+ output_data[Offset(output_dims, c, x, y, b)] = ActivationFunction<Ac>(
+ input1_data[SubscriptToIndex(desc1, c, x, y, b)] *
+ input2_data[SubscriptToIndex(desc2, c, x, y, b)]);
+ }
+ }
+ }
+ }
+}
+
+inline void BroadcastMul(const uint8* input1_data, const Dims<4>& input1_dims,
+ int32 input1_offset, const uint8* input2_data,
+ const Dims<4>& input2_dims, int32 input2_offset,
+ int32 output_offset, int32 output_multiplier,
+ int output_shift, int32 output_activation_min,
+ int32 output_activation_max, uint8* output_data,
+ const Dims<4>& output_dims) {
+ gemmlowp::ScopedProfilingLabel label("BroadcastMul/8bit");
+
+ NdArrayDesc<4> desc1;
+ NdArrayDesc<4> desc2;
+ NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
+
+ // In TensorFlow, the dimensions are canonically named (batch_number, row,
+ // col, channel), with extents (batches, height, width, depth), with the
+ // trailing dimension changing most rapidly (channels has the smallest stride,
+ // typically 1 element).
+ //
+ // In generated C code, we store arrays with the dimensions reversed. The
+ // first dimension has smallest stride.
+ //
+ // We name our variables by their TensorFlow convention, but generate C code
+ // nesting loops such that the innermost loop has the smallest stride for the
+ // best cache behavior.
+ for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
+ for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
+ for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
+ for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
+ const int32 input1_val =
+ input1_offset + input1_data[SubscriptToIndex(desc1, c, x, y, b)];
+ const int32 input2_val =
+ input2_offset + input2_data[SubscriptToIndex(desc2, c, x, y, b)];
+ const int32 unclamped_result =
+ output_offset +
+ MultiplyByQuantizedMultiplierSmallerThanOne(
+ input1_val * input2_val, output_multiplier, output_shift);
+ const int32 clamped_output =
+ std::min(output_activation_max,
+ std::max(output_activation_min, unclamped_result));
+ output_data[Offset(output_dims, c, x, y, b)] =
+ static_cast<uint8>(clamped_output);
+ }
+ }
+ }
+ }
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+inline void BroadcastMul(const uint8* input1_data, const Dims<4>& input1_dims,
+ int32 input1_offset, const uint8* input2_data,
+ const Dims<4>& input2_dims, int32 input2_offset,
+ int32 output_offset, int32 output_multiplier,
+ int output_shift, int32 output_activation_min,
+ int32 output_activation_max, uint8* output_data,
+ const Dims<4>& output_dims) {
+ BroadcastMul(input1_data, input1_dims, input1_offset, input2_data,
+ input2_dims, input2_offset, output_offset, output_multiplier,
+ output_shift, output_activation_min, output_activation_max,
+ output_data, output_dims);
+}
+
+template <FusedActivationFunctionType Ac, typename Scalar>
+void Concatenation(int concat_dim, const Scalar* const* input_data,
+ const Dims<4>* const* input_dims, int inputs_count,
+ Scalar* output_data, const Dims<4>& output_dims) {
+ gemmlowp::ScopedProfilingLabel label("Concatenation");
+ int concat_size = 0;
+ for (int i = 0; i < inputs_count; i++) {
+ for (int j = 0; j < 4; j++) {
+ if (j != concat_dim) {
+ MatchingArraySize(*input_dims[i], j, output_dims, j);
+ }
+ }
+ concat_size += ArraySize(*input_dims[i], concat_dim);
+ }
+ TFLITE_DCHECK_EQ(concat_size, ArraySize(output_dims, concat_dim));
+ TFLITE_DCHECK(IsPackedWithoutStrides(output_dims));
+ // For now we don't have a model with a Concatenation that has a fused
+ // activation function.
+ TFLITE_DCHECK(Ac == FusedActivationFunctionType::kNone);
+ int outer_size = 1;
+ for (int i = concat_dim + 1; i < 4; i++) {
+ outer_size *= output_dims.sizes[i];
+ }
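+ // For each index over the dimensions outside concat_dim, copy one contiguous
+ // chunk per input; each chunk is sizes[concat_dim] * strides[concat_dim]
+ // elements of that input.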
+ Scalar* output_ptr = output_data;
+ for (int k = 0; k < outer_size; k++) {
+ for (int i = 0; i < inputs_count; ++i) {
+ const int copy_size =
+ input_dims[i]->sizes[concat_dim] * input_dims[i]->strides[concat_dim];
+ memcpy(output_ptr, input_data[i] + k * copy_size,
+ copy_size * sizeof(Scalar));
+ output_ptr += copy_size;
+ }
+ }
+}
+
+template <FusedActivationFunctionType Ac, typename Scalar>
+void DepthConcatenation(const Scalar* const* input_data,
+ const Dims<4>* const* input_dims, int inputs_count,
+ Scalar* output_data, const Dims<4>& output_dims) {
+ Concatenation<Ac, Scalar>(0, input_data, input_dims, inputs_count,
+ output_data, output_dims);
+}
+
+inline void LstmCell(const float* input_data, const Dims<4>& input_dims,
+ const float* prev_activ_data,
+ const Dims<4>& prev_activ_dims, const float* weights_data,
+ const Dims<4>& weights_dims, const float* bias_data,
+ const Dims<4>& bias_dims, const float* prev_state_data,
+ const Dims<4>& prev_state_dims, float* output_state_data,
+ const Dims<4>& output_state_dims, float* output_activ_data,
+ const Dims<4>& output_activ_dims, float* concat_temp_data,
+ const Dims<4>& concat_temp_dims, float* activ_temp_data,
+ const Dims<4>& activ_temp_dims) {
+ gemmlowp::ScopedProfilingLabel label("LstmCell");
+ MatchingArraySize( // batches
+ input_dims, 3, prev_activ_dims, 3, prev_state_dims, 3, output_state_dims,
+ 3, output_activ_dims, 3);
+ MatchingArraySize( // height
+ input_dims, 2, prev_activ_dims, 2, prev_state_dims, 2, output_state_dims,
+ 2, output_activ_dims, 2);
+ MatchingArraySize( // width
+ input_dims, 1, prev_activ_dims, 1, prev_state_dims, 1, output_state_dims,
+ 1, output_activ_dims, 1);
+ TFLITE_CHECK_EQ(ArraySize(weights_dims, 2), 1);
+ TFLITE_CHECK_EQ(ArraySize(weights_dims, 3), 1);
+ const int input_depth = ArraySize(input_dims, 0);
+ const int prev_activ_depth = ArraySize(prev_activ_dims, 0);
+ const int total_input_depth = prev_activ_depth + input_depth;
+ TFLITE_CHECK_EQ(ArraySize(weights_dims, 0), total_input_depth);
+ TFLITE_CHECK_EQ(MatchingArraySize(bias_dims, 1, bias_dims, 2, bias_dims, 3),
+ 1);
+ const int intern_activ_depth =
+ MatchingArraySize(weights_dims, 1, bias_dims, 0);
+ TFLITE_CHECK_EQ(intern_activ_depth % 4, 0);
+ const int output_depth =
+ MatchingArraySize(prev_state_dims, 0, prev_activ_dims, 0,
+ output_state_dims, 0, output_activ_dims, 0);
+ TFLITE_CHECK_EQ(output_depth, intern_activ_depth / 4);
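+ // The fully connected output below stacks the four gate pre-activations
+ // (input gate, new input, forget gate, output gate) along the depth
+ // dimension, each block being output_depth rows; the block() calls further
+ // down slice them apart again.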
+
+ // Concatenate prev_activ and input data together
+ std::vector<float const*> concat_input_arrays_data;
+ std::vector<Dims<4> const*> concat_input_arrays_dims;
+ concat_input_arrays_data.push_back(input_data);
+ concat_input_arrays_data.push_back(prev_activ_data);
+ concat_input_arrays_dims.push_back(&input_dims);
+ concat_input_arrays_dims.push_back(&prev_activ_dims);
+ Concatenation<FusedActivationFunctionType::kNone, float>(
+ 0, &(concat_input_arrays_data[0]), &(concat_input_arrays_dims[0]),
+ concat_input_arrays_data.size(), concat_temp_data, concat_temp_dims);
+
+ // Fully connected
+ FullyConnected<FusedActivationFunctionType::kNone>(
+ concat_temp_data, concat_temp_dims, weights_data, weights_dims, bias_data,
+ bias_dims, activ_temp_data, activ_temp_dims);
+
+ // Map raw arrays to Eigen arrays so we can use Eigen's optimized array
+ // operations.
+ ArrayMap<float> activ_temp_map =
+ MapAsArrayWithFirstDimAsRows(activ_temp_data, activ_temp_dims);
+ auto input_gate_sm = activ_temp_map.block(0 * output_depth, 0, output_depth,
+ activ_temp_map.cols());
+ auto new_input_sm = activ_temp_map.block(1 * output_depth, 0, output_depth,
+ activ_temp_map.cols());
+ auto forget_gate_sm = activ_temp_map.block(2 * output_depth, 0, output_depth,
+ activ_temp_map.cols());
+ auto output_gate_sm = activ_temp_map.block(3 * output_depth, 0, output_depth,
+ activ_temp_map.cols());
+ ArrayMap<const float> prev_state_map =
+ MapAsArrayWithFirstDimAsRows(prev_state_data, prev_state_dims);
+ ArrayMap<float> output_state_map =
+ MapAsArrayWithFirstDimAsRows(output_state_data, output_state_dims);
+ ArrayMap<float> output_activ_map =
+ MapAsArrayWithFirstDimAsRows(output_activ_data, output_activ_dims);
+
+ // Combined memory state and final output calculation
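+ //   output_state = sigmoid(input_gate) * tanh(new_input)
+ //                  + sigmoid(forget_gate) * prev_state
+ //   output_activ = sigmoid(output_gate) * tanh(output_state)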
+ gemmlowp::ScopedProfilingLabel label2("MemoryStateAndFinalOutput");
+ output_state_map =
+ input_gate_sm.unaryExpr(Eigen::internal::scalar_sigmoid_op<float>()) *
+ new_input_sm.tanh() +
+ forget_gate_sm.unaryExpr(Eigen::internal::scalar_sigmoid_op<float>()) *
+ prev_state_map;
+ output_activ_map =
+ output_gate_sm.unaryExpr(Eigen::internal::scalar_sigmoid_op<float>()) *
+ output_state_map.tanh();
+}
+
+template <FusedActivationFunctionType Ac, typename Scalar>
+void TensorFlowSplit(const Scalar* input_data, const Dims<4>& input_dims,
+ int outputs_count, Scalar* const* output_data,
+ const Dims<4>* const* output_dims) {
+ gemmlowp::ScopedProfilingLabel label("TensorFlowSplit");
+ TFLITE_DCHECK_GE(outputs_count, 1);
+ for (int i = 0; i < outputs_count; i++) {
+ /* batches = */ MatchingArraySize(*output_dims[i], 3, input_dims, 3);
+ /* height = */ MatchingArraySize(*output_dims[i], 2, input_dims, 2);
+ /* width = */ MatchingArraySize(*output_dims[i], 1, input_dims, 1);
+ }
+ const int batches = MatchingArraySize(*output_dims[0], 3, input_dims, 3);
+ const int height = MatchingArraySize(*output_dims[0], 2, input_dims, 2);
+ const int width = MatchingArraySize(*output_dims[0], 1, input_dims, 1);
+ TFLITE_DCHECK(IsPackedWithoutStrides(input_dims));
+ // For now we don't have a model with a TensorFlowSplit
+ // with a fused activation function.
+ TFLITE_DCHECK(Ac == FusedActivationFunctionType::kNone);
+ const int whb = width * height * batches;
+ const Scalar* input_ptr = input_data;
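+ // Walk the packed input once: at each (batch, height, width) position the
+ // input holds the outputs' depth slices back to back, so copy each output's
+ // slice of output_dims[i]->sizes[0] values in turn.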
+ for (int k = 0; k < whb; k++) {
+ for (int i = 0; i < outputs_count; ++i) {
+ memcpy(output_data[i] + k * output_dims[i]->sizes[0], input_ptr,
+ output_dims[i]->sizes[0] * sizeof(Scalar));
+ input_ptr += output_dims[i]->sizes[0];
+ }
+ }
+}
+
+inline int NodeOffset(int b, int h, int w, int height, int width) {
+ return (b * height + h) * width + w;
+}
+
+inline void AveragePool(const float* input_data, const Dims<4>& input_dims,
+ int stride_width, int stride_height, int pad_width,
+ int pad_height, int kwidth, int kheight,
+ float output_activation_min,
+ float output_activation_max, float* output_data,
+ const Dims<4>& output_dims) {
+ gemmlowp::ScopedProfilingLabel label("AveragePool");
+ const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+ const int input_height = ArraySize(input_dims, 2);
+ const int input_width = ArraySize(input_dims, 1);
+ const int output_height = ArraySize(output_dims, 2);
+ const int output_width = ArraySize(output_dims, 1);
+ const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
+
+ // TODO(benoitjacob) make this a proper reference impl without Eigen!
+ const auto in_mat = MapAsMatrixWithFirstDimAsRows(input_data, input_dims);
+ auto out_mat = MapAsMatrixWithFirstDimAsRows(output_data, output_dims);
+ // TODO(benoitjacob) get rid of the dynamic memory allocation here!
+ Eigen::VectorXf out_count(out_mat.cols());
+ out_count.setZero();
+ // Prefill the output to 0.
+ out_mat.setZero();
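+ // Forward pass: visit each input position once, add its column to every
+ // output window it falls into, and count contributions for the division
+ // below.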
+ for (int b = 0; b < batches; ++b) {
+ for (int h = 0; h < input_height; ++h) {
+ for (int w = 0; w < input_width; ++w) {
+ // (h_start, h_end) * (w_start, w_end) is the range that the input
+ // vector projects to.
+ int hpad = h + pad_height;
+ int wpad = w + pad_width;
+ int h_start =
+ (hpad < kheight) ? 0 : (hpad - kheight) / stride_height + 1;
+ int h_end = std::min(hpad / stride_height + 1, output_height);
+ int w_start = (wpad < kwidth) ? 0 : (wpad - kwidth) / stride_width + 1;
+ int w_end = std::min(wpad / stride_width + 1, output_width);
+ // compute elementwise sum
+ for (int ph = h_start; ph < h_end; ++ph) {
+ for (int pw = w_start; pw < w_end; ++pw) {
+ int out_offset = NodeOffset(b, ph, pw, output_height, output_width);
+ out_mat.col(out_offset) +=
+ in_mat.col(NodeOffset(b, h, w, input_height, input_width));
+ out_count(out_offset)++;
+ }
+ }
+ }
+ }
+ }
+ // Divide the output by the actual number of elements being averaged over
+ TFLITE_DCHECK_GT(out_count.minCoeff(), 0);
+ out_mat.array().rowwise() /= out_count.transpose().array();
+
+ for (int b = 0; b < batches; ++b) {
+ for (int y = 0; y < output_height; ++y) {
+ for (int x = 0; x < output_width; ++x) {
+ for (int c = 0; c < depth; ++c) {
+ output_data[Offset(output_dims, c, x, y, b)] =
+ ActivationFunctionWithMinMax(
+ output_data[Offset(output_dims, c, x, y, b)],
+ output_activation_min, output_activation_max);
+ }
+ }
+ }
+ }
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void AveragePool(const float* input_data, const Dims<4>& input_dims,
+ int stride_width, int stride_height, int pad_width,
+ int pad_height, int kwidth, int kheight, float* output_data,
+ const Dims<4>& output_dims) {
+ float output_activation_min, output_activation_max;
+ GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
+
+ AveragePool(input_data, input_dims, stride_width, stride_height, pad_width,
+ pad_height, kwidth, kheight, output_activation_min,
+ output_activation_max, output_data, output_dims);
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void AveragePool(const float* input_data, const Dims<4>& input_dims, int stride,
+ int pad_width, int pad_height, int filter_width,
+ int filter_height, float* output_data,
+ const Dims<4>& output_dims) {
+ AveragePool<Ac>(input_data, input_dims, stride, stride, pad_width, pad_height,
+ filter_width, filter_height, output_data, output_dims);
+}
+
+inline void AveragePool(const uint8* input_data, const Dims<4>& input_dims,
+ int stride_width, int stride_height, int pad_width,
+ int pad_height, int filter_width, int filter_height,
+ int32 output_activation_min,
+ int32 output_activation_max, uint8* output_data,
+ const Dims<4>& output_dims) {
+ gemmlowp::ScopedProfilingLabel label("AveragePool/8bit");
+ TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+ const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+ const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
+ const int input_height = ArraySize(input_dims, 2);
+ const int input_width = ArraySize(input_dims, 1);
+ const int output_height = ArraySize(output_dims, 2);
+ const int output_width = ArraySize(output_dims, 1);
+ for (int batch = 0; batch < batches; ++batch) {
+ for (int out_y = 0; out_y < output_height; ++out_y) {
+ for (int out_x = 0; out_x < output_width; ++out_x) {
+ const int in_x_origin = (out_x * stride_width) - pad_width;
+ const int in_y_origin = (out_y * stride_height) - pad_height;
+ const int filter_x_start = std::max(0, -in_x_origin);
+ const int filter_x_end =
+ std::min(filter_width, input_width - in_x_origin);
+ const int filter_y_start = std::max(0, -in_y_origin);
+ const int filter_y_end =
+ std::min(filter_height, input_height - in_y_origin);
+ const int filter_count =
+ (filter_x_end - filter_x_start) * (filter_y_end - filter_y_start);
+ // 1280 required by Inception v3
+ static constexpr int kAccBufferMaxSize = 2048;
+ TFLITE_DCHECK_LE(depth, kAccBufferMaxSize);
+ uint16 acc[kAccBufferMaxSize];
+ memset(acc, 0, depth * sizeof(acc[0]));
+ const uint8* input_ptr =
+ input_data + input_dims.strides[1] * in_x_origin +
+ input_dims.strides[2] * in_y_origin + input_dims.strides[3] * batch;
+ for (int fy = filter_y_start; fy < filter_y_end; fy++) {
+ const uint8* input_row_ptr = input_ptr + fy * input_dims.strides[2] +
+ filter_x_start * input_dims.strides[1];
+ for (int fx = filter_x_start; fx < filter_x_end; fx++) {
+ int channel = 0;
+#ifdef USE_NEON
+ for (; channel <= depth - 16; channel += 16) {
+ uint16x8_t acc_reg[2];
+ for (int i = 0; i < 2; i++) {
+ acc_reg[i] = vld1q_u16(acc + channel + 8 * i);
+ }
+ uint8x16_t input_reg = vld1q_u8(input_row_ptr);
+ input_row_ptr += 16;
+ acc_reg[0] = vaddw_u8(acc_reg[0], vget_low_u8(input_reg));
+ acc_reg[1] = vaddw_u8(acc_reg[1], vget_high_u8(input_reg));
+ for (int i = 0; i < 2; i++) {
+ vst1q_u16(acc + channel + 8 * i, acc_reg[i]);
+ }
+ }
+ for (; channel <= depth - 8; channel += 8) {
+ uint16x8_t acc_reg = vld1q_u16(acc + channel);
+ uint8x8_t input_reg = vld1_u8(input_row_ptr);
+ input_row_ptr += 8;
+ acc_reg = vaddw_u8(acc_reg, input_reg);
+ vst1q_u16(acc + channel, acc_reg);
+ }
+#endif
+ for (; channel < depth; ++channel) {
+ acc[channel] += *input_row_ptr++;
+ }
+ }
+ }
+ uint8* output_ptr =
+ output_data + Offset(output_dims, 0, out_x, out_y, batch);
+ int channel = 0;
+#ifdef USE_NEON
+#define AVGPOOL_DIVIDING_BY(FILTER_COUNT) \
+ if (filter_count == FILTER_COUNT) { \
+ for (; channel <= depth - 8; channel += 8) { \
+ uint16 buf[8]; \
+ for (int i = 0; i < 8; i++) { \
+ buf[i] = (acc[channel + i] + FILTER_COUNT / 2) / FILTER_COUNT; \
+ } \
+ uint8x8_t buf8 = vqmovn_u16(vld1q_u16(buf)); \
+ buf8 = vmin_u8(buf8, vdup_n_u8(output_activation_max)); \
+ buf8 = vmax_u8(buf8, vdup_n_u8(output_activation_min)); \
+ vst1_u8(output_ptr + channel, buf8); \
+ } \
+ }
+ AVGPOOL_DIVIDING_BY(9)
+ AVGPOOL_DIVIDING_BY(15)
+#undef AVGPOOL_DIVIDING_BY
+ for (; channel <= depth - 8; channel += 8) {
+ uint16 buf[8];
+ for (int i = 0; i < 8; i++) {
+ buf[i] = (acc[channel + i] + filter_count / 2) / filter_count;
+ }
+ uint8x8_t buf8 = vqmovn_u16(vld1q_u16(buf));
+ buf8 = vmin_u8(buf8, vdup_n_u8(output_activation_max));
+ buf8 = vmax_u8(buf8, vdup_n_u8(output_activation_min));
+ vst1_u8(output_ptr + channel, buf8);
+ }
+#endif
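+ // Scalar tail: (acc + filter_count / 2) / filter_count is round-to-nearest
+ // integer division by the pool size.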
+ for (; channel < depth; ++channel) {
+ uint16 a = (acc[channel] + filter_count / 2) / filter_count;
+ a = std::max<uint16>(a, output_activation_min);
+ a = std::min<uint16>(a, output_activation_max);
+ output_ptr[channel] = static_cast<uint8>(a);
+ }
+ }
+ }
+ }
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void AveragePool(const uint8* input_data, const Dims<4>& input_dims,
+ int stride_width, int stride_height, int pad_width,
+ int pad_height, int filter_width, int filter_height,
+ int32 output_activation_min, int32 output_activation_max,
+ uint8* output_data, const Dims<4>& output_dims) {
+ static_assert(Ac == FusedActivationFunctionType::kNone ||
+ Ac == FusedActivationFunctionType::kRelu ||
+ Ac == FusedActivationFunctionType::kRelu6 ||
+ Ac == FusedActivationFunctionType::kRelu1,
+ "");
+ if (Ac == FusedActivationFunctionType::kNone) {
+ TFLITE_DCHECK_EQ(output_activation_min, 0);
+ TFLITE_DCHECK_EQ(output_activation_max, 255);
+ }
+ AveragePool(input_data, input_dims, stride_width, stride_height, pad_width,
+ pad_height, filter_width, filter_height, output_activation_min,
+ output_activation_max, output_data, output_dims);
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void AveragePool(const uint8* input_data, const Dims<4>& input_dims, int stride,
+ int pad_width, int pad_height, int filter_width,
+ int filter_height, int32 output_activation_min,
+ int32 output_activation_max, uint8* output_data,
+ const Dims<4>& output_dims) {
+ AveragePool<Ac>(input_data, input_dims, stride, stride, pad_width, pad_height,
+ filter_width, filter_height, output_activation_min,
+ output_activation_max, output_data, output_dims);
+}
+
+inline void MaxPool(const float* input_data, const Dims<4>& input_dims,
+ int stride_width, int stride_height, int pad_width,
+ int pad_height, int kwidth, int kheight,
+ float output_activation_min, float output_activation_max,
+ float* output_data, const Dims<4>& output_dims) {
+ gemmlowp::ScopedProfilingLabel label("MaxPool");
+ const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+ const int input_height = ArraySize(input_dims, 2);
+ const int input_width = ArraySize(input_dims, 1);
+ const int output_height = ArraySize(output_dims, 2);
+ const int output_width = ArraySize(output_dims, 1);
+ const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
+
+ const auto in_mat = MapAsMatrixWithFirstDimAsRows(input_data, input_dims);
+ auto out_mat = MapAsMatrixWithFirstDimAsRows(output_data, output_dims);
+ // Prefill the output to minimum representable float value
+ out_mat.setConstant(std::numeric_limits<float>::lowest());
+ for (int b = 0; b < batches; ++b) {
+ for (int h = 0; h < input_height; ++h) {
+ for (int w = 0; w < input_width; ++w) {
+ // (h_start, h_end) * (w_start, w_end) is the range that the input
+ // vector projects to.
+ int hpad = h + pad_height;
+ int wpad = w + pad_width;
+ int h_start =
+ (hpad < kheight) ? 0 : (hpad - kheight) / stride_height + 1;
+ int h_end = std::min(hpad / stride_height + 1, output_height);
+ int w_start = (wpad < kwidth) ? 0 : (wpad - kwidth) / stride_width + 1;
+ int w_end = std::min(wpad / stride_width + 1, output_width);
+ // compute elementwise max
+ for (int ph = h_start; ph < h_end; ++ph) {
+ for (int pw = w_start; pw < w_end; ++pw) {
+ int out_offset = NodeOffset(b, ph, pw, output_height, output_width);
+ out_mat.col(out_offset) =
+ out_mat.col(out_offset)
+ .cwiseMax(in_mat.col(
+ NodeOffset(b, h, w, input_height, input_width)));
+ }
+ }
+ }
+ }
+ }
+
+ for (int b = 0; b < batches; ++b) {
+ for (int y = 0; y < output_height; ++y) {
+ for (int x = 0; x < output_width; ++x) {
+ for (int c = 0; c < depth; ++c) {
+ output_data[Offset(output_dims, c, x, y, b)] =
+ ActivationFunctionWithMinMax(
+ output_data[Offset(output_dims, c, x, y, b)],
+ output_activation_min, output_activation_max);
+ }
+ }
+ }
+ }
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void MaxPool(const float* input_data, const Dims<4>& input_dims,
+ int stride_width, int stride_height, int pad_width, int pad_height,
+ int kwidth, int kheight, float* output_data,
+ const Dims<4>& output_dims) {
+ float output_activation_min, output_activation_max;
+ GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
+ MaxPool(input_data, input_dims, stride_width, stride_height, pad_width,
+ pad_height, kwidth, kheight, output_activation_min,
+ output_activation_max, output_data, output_dims);
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void MaxPool(const float* input_data, const Dims<4>& input_dims, int stride,
+ int pad_width, int pad_height, int filter_width, int filter_height,
+ float* output_data, const Dims<4>& output_dims) {
+ MaxPool<Ac>(input_data, input_dims, stride, stride, pad_width, pad_height,
+ filter_width, filter_height, output_data, output_dims);
+}
+
+inline void MaxPool(const uint8* input_data, const Dims<4>& input_dims,
+ int stride_width, int stride_height, int pad_width,
+ int pad_height, int filter_width, int filter_height,
+ int32 output_activation_min, int32 output_activation_max,
+ uint8* output_data, const Dims<4>& output_dims) {
+ gemmlowp::ScopedProfilingLabel label("MaxPool/8bit");
+ TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+ const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+ const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
+ const int input_height = ArraySize(input_dims, 2);
+ const int input_width = ArraySize(input_dims, 1);
+ const int output_height = ArraySize(output_dims, 2);
+ const int output_width = ArraySize(output_dims, 1);
+ for (int batch = 0; batch < batches; ++batch) {
+ for (int out_y = 0; out_y < output_height; ++out_y) {
+ for (int out_x = 0; out_x < output_width; ++out_x) {
+ const int in_x_origin = (out_x * stride_width) - pad_width;
+ const int in_y_origin = (out_y * stride_height) - pad_height;
+ const int filter_x_start = std::max(0, -in_x_origin);
+ const int filter_x_end =
+ std::min(filter_width, input_width - in_x_origin);
+ const int filter_y_start = std::max(0, -in_y_origin);
+ const int filter_y_end =
+ std::min(filter_height, input_height - in_y_origin);
+ // 2048 required by Inception v3
+ static constexpr int kAccBufferMaxSize = 2048;
+ TFLITE_DCHECK_LE(depth, kAccBufferMaxSize);
+ uint8 acc[kAccBufferMaxSize];
+ memset(acc, 0, depth * sizeof(acc[0]));
+ const uint8* input_ptr =
+ input_data + input_dims.strides[1] * in_x_origin +
+ input_dims.strides[2] * in_y_origin + input_dims.strides[3] * batch;
+ for (int fy = filter_y_start; fy < filter_y_end; fy++) {
+ const uint8* input_row_ptr = input_ptr + fy * input_dims.strides[2] +
+ filter_x_start * input_dims.strides[1];
+ for (int fx = filter_x_start; fx < filter_x_end; fx++) {
+ int channel = 0;
+#ifdef USE_NEON
+ for (; channel <= depth - 16; channel += 16) {
+ uint8x16_t acc_reg = vld1q_u8(acc + channel);
+ uint8x16_t input_reg = vld1q_u8(input_row_ptr);
+ input_row_ptr += 16;
+ acc_reg = vmaxq_u8(acc_reg, input_reg);
+ vst1q_u8(acc + channel, acc_reg);
+ }
+
+ for (; channel <= depth - 8; channel += 8) {
+ uint8x8_t acc_reg = vld1_u8(acc + channel);
+ uint8x8_t input_reg = vld1_u8(input_row_ptr);
+ input_row_ptr += 8;
+ acc_reg = vmax_u8(acc_reg, input_reg);
+ vst1_u8(acc + channel, acc_reg);
+ }
+#endif
+ for (; channel < depth; ++channel) {
+ acc[channel] = std::max(acc[channel], *input_row_ptr++);
+ }
+ }
+ }
+ uint8* output_ptr =
+ output_data + Offset(output_dims, 0, out_x, out_y, batch);
+ int channel = 0;
+#ifdef USE_NEON
+ for (; channel <= depth - 16; channel += 16) {
+ uint8x16_t a = vld1q_u8(acc + channel);
+ a = vminq_u8(a, vdupq_n_u8(output_activation_max));
+ a = vmaxq_u8(a, vdupq_n_u8(output_activation_min));
+ vst1q_u8(output_ptr + channel, a);
+ }
+ for (; channel <= depth - 8; channel += 8) {
+ uint8x8_t a = vld1_u8(acc + channel);
+ a = vmin_u8(a, vdup_n_u8(output_activation_max));
+ a = vmax_u8(a, vdup_n_u8(output_activation_min));
+ vst1_u8(output_ptr + channel, a);
+ }
+#endif
+ for (; channel < depth; ++channel) {
+ uint8 a = acc[channel];
+ a = std::max<uint8>(a, output_activation_min);
+ a = std::min<uint8>(a, output_activation_max);
+ output_ptr[channel] = static_cast<uint8>(a);
+ }
+ }
+ }
+ }
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void MaxPool(const uint8* input_data, const Dims<4>& input_dims,
+ int stride_width, int stride_height, int pad_width, int pad_height,
+ int filter_width, int filter_height, int32 output_activation_min,
+ int32 output_activation_max, uint8* output_data,
+ const Dims<4>& output_dims) {
+ static_assert(Ac == FusedActivationFunctionType::kNone ||
+ Ac == FusedActivationFunctionType::kRelu ||
+ Ac == FusedActivationFunctionType::kRelu6 ||
+ Ac == FusedActivationFunctionType::kRelu1,
+ "");
+ if (Ac == FusedActivationFunctionType::kNone) {
+ TFLITE_DCHECK_EQ(output_activation_min, 0);
+ TFLITE_DCHECK_EQ(output_activation_max, 255);
+ }
+ MaxPool(input_data, input_dims, stride_width, stride_height, pad_width,
+ pad_height, filter_width, filter_height, output_activation_min,
+ output_activation_max, output_data, output_dims);
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void MaxPool(const uint8* input_data, const Dims<4>& input_dims, int stride,
+ int pad_width, int pad_height, int filter_width, int filter_height,
+ int32 output_activation_min, int32 output_activation_max,
+ uint8* output_data, const Dims<4>& output_dims) {
+ MaxPool<Ac>(input_data, input_dims, stride, stride, pad_width, pad_height,
+ filter_width, filter_height, output_activation_min,
+ output_activation_max, output_data, output_dims);
+}
+
+inline void L2Pool(const float* input_data, const Dims<4>& input_dims,
+ int stride_width, int stride_height, int pad_width,
+ int pad_height, int filter_width, int filter_height,
+ float output_activation_min, float output_activation_max,
+ float* output_data, const Dims<4>& output_dims) {
+ gemmlowp::ScopedProfilingLabel label("L2Pool");
+ const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+ const int input_height = ArraySize(input_dims, 2);
+ const int input_width = ArraySize(input_dims, 1);
+ const int output_height = ArraySize(output_dims, 2);
+ const int output_width = ArraySize(output_dims, 1);
+ // Actually carry out L2 Pool. Code is written in forward mode: we go through
+ // the input values once and write to all the pooled regions that each value
+ // maps to.
+ const auto in_mat = MapAsMatrixWithFirstDimAsRows(input_data, input_dims);
+ auto out_mat = MapAsMatrixWithFirstDimAsRows(output_data, output_dims);
+ Eigen::VectorXf in_square(in_mat.rows());
+ Eigen::VectorXf out_count(out_mat.cols());
+ out_count.setZero();
+ // Prefill the output to 0.
+ out_mat.setZero();
+ for (int b = 0; b < batches; ++b) {
+ for (int h = 0; h < input_height; ++h) {
+ for (int w = 0; w < input_width; ++w) {
+ // (h_start, h_end) * (w_start, w_end) is the range that the input
+ // vector projects to.
+ const int hpad = h + pad_height;
+ const int wpad = w + pad_width;
+ const int h_start = (hpad < filter_height)
+ ? 0
+ : (hpad - filter_height) / stride_height + 1;
+ const int h_end = std::min(hpad / stride_height + 1, output_height);
+ const int w_start = (wpad < filter_width)
+ ? 0
+ : (wpad - filter_width) / stride_width + 1;
+ const int w_end = std::min(wpad / stride_width + 1, output_width);
+ // pre-compute square
+ const int in_offset = w + input_width * (h + input_height * b);
+ in_square =
+ in_mat.col(in_offset).array() * in_mat.col(in_offset).array();
+ // compute elementwise sum of squares
+ for (int ph = h_start; ph < h_end; ++ph) {
+ for (int pw = w_start; pw < w_end; ++pw) {
+ const int out_offset = pw + output_width * (ph + output_height * b);
+ out_mat.col(out_offset) += in_square;
+ out_count(out_offset)++;
+ }
+ }
+ }
+ }
+ }
+
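+ // Finalize: each output is sqrt(sum_of_squares / count), i.e. the RMS over
+ // its pooling window.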
+ out_count = out_count.array().inverse();
+ out_mat =
+ (out_mat.array().rowwise() * out_count.transpose().array()).cwiseSqrt();
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void L2Pool(const float* input_data, const Dims<4>& input_dims,
+ int stride_width, int stride_height, int pad_width, int pad_height,
+ int filter_width, int filter_height, float* output_data,
+ const Dims<4>& output_dims) {
+ float output_activation_min, output_activation_max;
+ GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
+ L2Pool(input_data, input_dims, stride_width, stride_height, pad_width,
+ pad_height, filter_width, filter_height, output_activation_min,
+ output_activation_max, output_data, output_dims);
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void L2Pool(const float* input_data, const Dims<4>& input_dims, int stride,
+ int pad_width, int pad_height, int filter_width, int filter_height,
+ float* output_data, const Dims<4>& output_dims) {
+ L2Pool<Ac>(input_data, input_dims, stride, stride, pad_width, pad_height,
+ filter_width, filter_height, output_data, output_dims);
+}
+
+inline void LocalResponseNormalization(const float* input_data,
+ const Dims<4>& input_dims, int range,
+ float bias, float alpha, float beta,
+ float* output_data,
+ const Dims<4>& output_dims) {
+ gemmlowp::ScopedProfilingLabel label("LocalResponseNormalization");
+ /* const int batches = */ MatchingArraySize(input_dims, 3, output_dims, 3);
+ /* const int height = */ MatchingArraySize(input_dims, 2, output_dims, 2);
+ /* const int width = */ MatchingArraySize(input_dims, 1, output_dims, 1);
+ /* const int depth = */ MatchingArraySize(input_dims, 0, output_dims, 0);
+
+ const auto data_in = MapAsMatrixWithFirstDimAsRows(input_data, input_dims);
+ auto data_out = MapAsMatrixWithFirstDimAsRows(output_data, output_dims);
+
+ // Carry out local response normalization, vector by vector.
+ // Since the data are stored column-major, a row-wise operation would
+ // probably not be memory-efficient anyway, so we use an explicit for loop
+ // over the columns.
+ const int double_range = range * 2;
+ Eigen::VectorXf padded_square(data_in.rows() + double_range);
+ padded_square.setZero();
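+ // Together with the pow step below, this computes, per element,
+ //   out(i, r) = in(i, r) *
+ //       (bias + alpha * sum_{j = i - range}^{i + range} in(j, r)^2)^(-beta)
+ // using a sliding-window sum over the alpha-scaled squares.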
+ for (int r = 0; r < data_in.cols(); ++r) {
+ // Do local response normalization for data_in(:, r)
+ // first, compute the squares and store them in a buffer for repeated use
+ padded_square.block(range, 0, data_in.rows(), 1) =
+ data_in.col(r).cwiseProduct(data_in.col(r)) * alpha;
+ // Then, compute the scale and write it to data_out
+ float accumulated_scale = 0;
+ for (int i = 0; i < double_range; ++i) {
+ accumulated_scale += padded_square(i);
+ }
+ for (int i = 0; i < data_in.rows(); ++i) {
+ accumulated_scale += padded_square(i + double_range);
+ data_out(i, r) = bias + accumulated_scale;
+ accumulated_scale -= padded_square(i);
+ }
+ }
+
+ // In a few cases, the pow computation could benefit from speedups.
+ if (beta == 1) {
+ data_out.array() = data_in.array() * data_out.array().inverse();
+ } else if (beta == 0.5) {
+ data_out.array() = data_in.array() * data_out.array().sqrt().inverse();
+ } else {
+ data_out.array() = data_in.array() * data_out.array().pow(-beta);
+ }
+}
+
+inline void Softmax(const float* input_data, const Dims<4>& input_dims,
+ float beta, float* output_data,
+ const Dims<4>& output_dims) {
+ gemmlowp::ScopedProfilingLabel label("Softmax");
+ /* const int batches = */ MatchingArraySize(input_dims, 3, output_dims, 3);
+ /* const int height = */ MatchingArraySize(input_dims, 2, output_dims, 2);
+ /* const int width = */ MatchingArraySize(input_dims, 1, output_dims, 1);
+ /* const int depth = */ MatchingArraySize(input_dims, 0, output_dims, 0);
+
+ const auto in_mat = MapAsMatrixWithFirstDimAsRows(input_data, input_dims);
+ auto out_mat = MapAsMatrixWithFirstDimAsRows(output_data, output_dims);
+ // Compute the exponential first, removing the max coefficient for numerical
+ // stability.
+ out_mat = (in_mat.rowwise() - in_mat.colwise().maxCoeff()).array() * beta;
+ // We are separating out the exp function so that exp can be vectorized.
+ out_mat = out_mat.array().exp();
+ // Normalize to get the activations.
+ Eigen::Array<float, 1, Eigen::Dynamic> scale =
+ out_mat.array().colwise().sum().inverse();
+ out_mat.array().rowwise() *= scale;
+}
+
+inline void Softmax(const uint8* input_data, const Dims<4>& input_dims,
+ int32 input_beta_multiplier, int32 input_beta_left_shift,
+ int diff_min, uint8* output_data,
+ const Dims<4>& output_dims) {
+ // The representation chosen for the input to the exp() function is Q5.26.
+ // We need to leave extra space since values that we skip might be as large as
+ // -32 before multiplying by input_beta_multiplier, and therefore as large as
+ // -16 afterwards. Note that exp(-8) is definitely not insignificant to
+ // accumulation, but exp(-16) definitely is.
+ static const int kScaledDiffIntegerBits = 5;
+ static const int kAccumulationIntegerBits = 12;
+ using FixedPointScaledDiff =
+ gemmlowp::FixedPoint<int32, kScaledDiffIntegerBits>;
+ using FixedPointAccum = gemmlowp::FixedPoint<int32, kAccumulationIntegerBits>;
+ using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;
+
+ gemmlowp::ScopedProfilingLabel label("Softmax");
+ const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+ const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
+ const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
+ const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
+
+ for (int b = 0; b < batches; ++b) {
+ for (int x = 0; x < width; ++x) {
+ for (int y = 0; y < height; ++y) {
+ uint8 max_in_row = 0;
+ for (int c = 0; c < depth; ++c) {
+ max_in_row =
+ std::max(max_in_row, input_data[Offset(input_dims, c, x, y, b)]);
+ }
+
+ FixedPointAccum sum_of_exps = FixedPointAccum::Zero();
+ for (int c = 0; c < depth; ++c) {
+ int32 input_diff =
+ static_cast<int32>(input_data[Offset(input_dims, c, x, y, b)]) -
+ max_in_row;
+ if (input_diff >= diff_min) {
+ const int32 input_diff_rescaled =
+ MultiplyByQuantizedMultiplierGreaterThanOne(
+ input_diff, input_beta_multiplier, input_beta_left_shift);
+ const FixedPointScaledDiff scaled_diff_f8 =
+ FixedPointScaledDiff::FromRaw(input_diff_rescaled);
+ sum_of_exps =
+ sum_of_exps + gemmlowp::Rescale<kAccumulationIntegerBits>(
+ exp_on_negative_values(scaled_diff_f8));
+ }
+ }
+
+ int32 fixed_sum_of_exps = sum_of_exps.raw();
+ // TODO(starka): Use a NEON intrinsic like vclzq_u32 instead.
+ int headroom_plus_one =
+ __builtin_clz(static_cast<uint32>(fixed_sum_of_exps));
+ // This is the number of bits to the left of the binary point above 1.0.
+ // Consider fixed_sum_of_exps=1.25. In that case shifted_scale=0.8 and
+ // no later adjustment will be needed.
+ int num_bits_over_unit = kAccumulationIntegerBits - headroom_plus_one;
+ int32 shifted_sum_minus_one = static_cast<int32>(
+ (static_cast<uint32>(fixed_sum_of_exps) << headroom_plus_one) -
+ (static_cast<uint32>(1) << 31));
+
+ FixedPoint0 shifted_scale = gemmlowp::one_over_one_plus_x_for_x_in_0_1(
+ FixedPoint0::FromRaw(shifted_sum_minus_one));
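+ // shifted_sum_minus_one re-expresses the sum as (1 + x) with x in [0, 1),
+ // so shifted_scale ~= 1 / (sum / 2^num_bits_over_unit). Multiplying each
+ // exp term by shifted_scale and right-shifting by
+ // (num_bits_over_unit + 31 - 8) below yields 256 * exp / sum_of_exps,
+ // i.e. the softmax probability scaled to the uint8 range.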
+
+ for (int c = 0; c < depth; ++c) {
+ int32 input_diff =
+ static_cast<int32>(input_data[Offset(input_dims, c, x, y, b)]) -
+ max_in_row;
+ if (input_diff >= diff_min) {
+ const int32 input_diff_rescaled =
+ MultiplyByQuantizedMultiplierGreaterThanOne(
+ input_diff, input_beta_multiplier, input_beta_left_shift);
+ const FixedPointScaledDiff scaled_diff_f8 =
+ FixedPointScaledDiff::FromRaw(input_diff_rescaled);
+
+ FixedPoint0 exp_in_0 = exp_on_negative_values(scaled_diff_f8);
+ int32 unsat_output = gemmlowp::RoundingDivideByPOT(
+ (shifted_scale * exp_in_0).raw(), num_bits_over_unit + 31 - 8);
+
+ output_data[Offset(output_dims, c, x, y, b)] =
+ std::max(std::min(unsat_output, 255), 0);
+
+ } else {
+ output_data[Offset(output_dims, c, x, y, b)] = 0;
+ }
+ }
+ }
+ }
+ }
+}
+
+inline void Logistic(const float* input_data, const Dims<4>& input_dims,
+ float* output_data, const Dims<4>& output_dims) {
+ gemmlowp::ScopedProfilingLabel label("Logistic");
+ auto input_map = MapAsVector(input_data, input_dims);
+ auto output_map = MapAsVector(output_data, output_dims);
+ output_map.array() =
+ input_map.array().unaryExpr(Eigen::internal::scalar_sigmoid_op<float>());
+}
+
+inline void Logistic(const uint8* input_data, const Dims<4>& input_dims,
+ int32 input_zero_point, int32 input_range_radius,
+ int32 input_multiplier, int input_left_shift,
+ uint8* output_data, const Dims<4>& output_dims) {
+ gemmlowp::ScopedProfilingLabel label("Logistic");
+ /* batches */ MatchingArraySize(input_dims, 3, output_dims, 3);
+ /* height */ MatchingArraySize(input_dims, 2, output_dims, 2);
+ /* width */ MatchingArraySize(input_dims, 1, output_dims, 1);
+ /* depth */ MatchingArraySize(input_dims, 0, output_dims, 0);
+ const int size = RequiredBufferSizeForDims(input_dims);
+
+ int c = 0;
+#ifdef USE_NEON
+ // Handle 16 values at a time
+ for (; c <= size - 16; c += 16) {
+ // Read input uint8 values, cast to int16 and subtract input_zero_point
+ uint8x16_t input_val_u8 = vld1q_u8(input_data + c);
+ int16x8_t input_val_centered_0 =
+ vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(input_val_u8))),
+ vdupq_n_s16(input_zero_point));
+ int16x8_t input_val_centered_1 =
+ vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(input_val_u8))),
+ vdupq_n_s16(input_zero_point));
+
+ // Prepare the bit masks that we will use at the end to implement the logic
+ // that was expressed in the scalar code with branching:
+ // if (input_val_centered < -input_range_radius) {
+ // output_val = 0;
+ // } else if (input_val_centered > input_range_radius) {
+ // output_val = 255;
+ // } else {
+ // ...
+ uint16x8_t mask_rightclamp_0 =
+ vcgtq_s16(input_val_centered_0, vdupq_n_s16(input_range_radius));
+ uint16x8_t mask_rightclamp_1 =
+ vcgtq_s16(input_val_centered_1, vdupq_n_s16(input_range_radius));
+ uint16x8_t mask_leftclamp_0 =
+ vcgeq_s16(input_val_centered_0, vdupq_n_s16(-input_range_radius));
+ uint16x8_t mask_leftclamp_1 =
+ vcgeq_s16(input_val_centered_1, vdupq_n_s16(-input_range_radius));
+ uint8x16_t mask_rightclamp = vcombine_u8(vshrn_n_u16(mask_rightclamp_0, 8),
+ vshrn_n_u16(mask_rightclamp_1, 8));
+ uint8x16_t mask_leftclamp = vcombine_u8(vshrn_n_u16(mask_leftclamp_0, 8),
+ vshrn_n_u16(mask_leftclamp_1, 8));
+
+ // This performs what is expressed in the scalar code as
+ // const int32 input_val_rescaled =
+ // MultiplyByQuantizedMultiplierGreaterThanOne(
+ // input_val_centered, input_multiplier, input_left_shift);
+ int32x4_t input_val_rescaled_0 =
+ vshlq_s32(vmovl_s16(vget_low_s16(input_val_centered_0)),
+ vdupq_n_s32(input_left_shift));
+ int32x4_t input_val_rescaled_1 =
+ vshlq_s32(vmovl_s16(vget_high_s16(input_val_centered_0)),
+ vdupq_n_s32(input_left_shift));
+ int32x4_t input_val_rescaled_2 =
+ vshlq_s32(vmovl_s16(vget_low_s16(input_val_centered_1)),
+ vdupq_n_s32(input_left_shift));
+ int32x4_t input_val_rescaled_3 =
+ vshlq_s32(vmovl_s16(vget_high_s16(input_val_centered_1)),
+ vdupq_n_s32(input_left_shift));
+ input_val_rescaled_0 =
+ vqrdmulhq_n_s32(input_val_rescaled_0, input_multiplier);
+ input_val_rescaled_1 =
+ vqrdmulhq_n_s32(input_val_rescaled_1, input_multiplier);
+ input_val_rescaled_2 =
+ vqrdmulhq_n_s32(input_val_rescaled_2, input_multiplier);
+ input_val_rescaled_3 =
+ vqrdmulhq_n_s32(input_val_rescaled_3, input_multiplier);
+
+ // Invoke gemmlowp::logistic on FixedPoint wrapping int32x4_t
+ using FixedPoint4 = gemmlowp::FixedPoint<int32x4_t, 4>;
+ using FixedPoint0 = gemmlowp::FixedPoint<int32x4_t, 0>;
+ const FixedPoint4 input_val_f4_0 =
+ FixedPoint4::FromRaw(input_val_rescaled_0);
+ const FixedPoint4 input_val_f4_1 =
+ FixedPoint4::FromRaw(input_val_rescaled_1);
+ const FixedPoint4 input_val_f4_2 =
+ FixedPoint4::FromRaw(input_val_rescaled_2);
+ const FixedPoint4 input_val_f4_3 =
+ FixedPoint4::FromRaw(input_val_rescaled_3);
+ const FixedPoint0 output_val_f0_0 = gemmlowp::logistic(input_val_f4_0);
+ const FixedPoint0 output_val_f0_1 = gemmlowp::logistic(input_val_f4_1);
+ const FixedPoint0 output_val_f0_2 = gemmlowp::logistic(input_val_f4_2);
+ const FixedPoint0 output_val_f0_3 = gemmlowp::logistic(input_val_f4_3);
+
+ // Divide by 2^23 as in the scalar code
+ using gemmlowp::RoundingDivideByPOT;
+ int32x4_t output_val_s32_0 = RoundingDivideByPOT(output_val_f0_0.raw(), 23);
+ int32x4_t output_val_s32_1 = RoundingDivideByPOT(output_val_f0_1.raw(), 23);
+ int32x4_t output_val_s32_2 = RoundingDivideByPOT(output_val_f0_2.raw(), 23);
+ int32x4_t output_val_s32_3 = RoundingDivideByPOT(output_val_f0_3.raw(), 23);
+
+ // Cast output values to uint8, saturating
+ int16x8_t output_val_s16_0 = vcombine_s16(vqmovn_s32(output_val_s32_0),
+ vqmovn_s32(output_val_s32_1));
+ int16x8_t output_val_s16_1 = vcombine_s16(vqmovn_s32(output_val_s32_2),
+ vqmovn_s32(output_val_s32_3));
+ uint8x16_t output_val_u8 = vcombine_u8(vqmovun_s16(output_val_s16_0),
+ vqmovun_s16(output_val_s16_1));
+
+ // Perform the bit-masking with the bit masks computed at the beginning,
+ // see the comment there.
+ output_val_u8 = vorrq_u8(output_val_u8, mask_rightclamp);
+ output_val_u8 = vandq_u8(output_val_u8, mask_leftclamp);
+
+ // Store back to memory
+ vst1q_u8(output_data + c, output_val_u8);
+ }
+#endif
+ // Leftover loop: handle one value at a time with scalar code.
+ for (; c < size; ++c) {
+ const uint8 input_val_u8 = input_data[c];
+ const int32 input_val_centered =
+ static_cast<int32>(input_val_u8) - input_zero_point;
+ uint8 output_val;
+ if (input_val_centered < -input_range_radius) {
+ output_val = 0;
+ } else if (input_val_centered > input_range_radius) {
+ output_val = 255;
+ } else {
+ const int32 input_val_rescaled =
+ MultiplyByQuantizedMultiplierGreaterThanOne(
+ input_val_centered, input_multiplier, input_left_shift);
+ using FixedPoint4 = gemmlowp::FixedPoint<int32, 4>;
+ using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;
+ const FixedPoint4 input_val_f4 = FixedPoint4::FromRaw(input_val_rescaled);
+ const FixedPoint0 output_val_f0 = gemmlowp::logistic(input_val_f4);
+ using gemmlowp::RoundingDivideByPOT;
+ int32 output_val_s32 = RoundingDivideByPOT(output_val_f0.raw(), 23);
+ if (output_val_s32 == 256) {
+ output_val_s32 = 255;
+ }
+ TFLITE_DCHECK_GE(output_val_s32, 0);
+ TFLITE_DCHECK_LE(output_val_s32, 255);
+ output_val = static_cast<uint8>(output_val_s32);
+ }
+ output_data[c] = output_val;
+ }
+}
+
+inline void Tanh(const float* input_data, const Dims<4>& input_dims,
+ float* output_data, const Dims<4>& output_dims) {
+ gemmlowp::ScopedProfilingLabel label("Tanh");
+ auto input_map = MapAsVector(input_data, input_dims);
+ auto output_map = MapAsVector(output_data, output_dims);
+ output_map.array() = input_map.array().tanh();
+}
+
+inline void Dequantize(const uint8* input_data, const Dims<4>& input_dims,
+ int32 zero_point, double scale, float* output_data,
+ const Dims<4>& output_dims) {
+ gemmlowp::ScopedProfilingLabel label("Dequantize");
+ const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+ const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
+ const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
+ const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
+ for (int b = 0; b < batches; ++b) {
+ for (int y = 0; y < height; ++y) {
+ for (int x = 0; x < width; ++x) {
+ for (int c = 0; c < depth; ++c) {
+ int32 val = input_data[Offset(input_dims, c, x, y, b)];
+ float result = static_cast<float>(scale * (val - zero_point));
+ output_data[Offset(output_dims, c, x, y, b)] = result;
+ }
+ }
+ }
+ }
+}
+
+inline void FakeQuant(const float* input_data, const Dims<4>& input_dims,
+ float rmin, float rmax, float* output_data,
+ const Dims<4>& output_dims) {
+ gemmlowp::ScopedProfilingLabel label("FakeQuant");
+
+ // 0 should always be a representable value. Let's assume that the initial
+ // min,max range contains 0.
+ TFLITE_DCHECK_LE(rmin, 0.);
+ TFLITE_DCHECK_GE(rmax, 0.);
+
+ // Determine quantization parameters: zero_point, scale.
+ using Integer = uint8;
+ const Integer qmin = std::numeric_limits<Integer>::min();
+ const Integer qmax = std::numeric_limits<Integer>::max();
+ const float qmin_float = qmin;
+ const float qmax_float = qmax;
+ int32 zero_point = 0;
+ float scale = 0.f;
+ // If rmin==rmax, both must be zero per the above assertion,
+ // so we are done.
+ if (rmin != rmax) {
+ // First determine the scale.
+ scale = (rmax - rmin) / (qmax_float - qmin_float);
+
+ // Zero-point computation.
+ // First the initial floating-point computation. The zero-point can be
+ // determined from solving an affine equation for any known pair
+ // (real value, corresponding quantized value).
+ // We know two such pairs: (rmin, qmin) and (rmax, qmax).
+ // The arithmetic error on the zero point computed from either pair
+ // will be roughly machine_epsilon * (sum of absolute values of terms)
+ // so we want to use the variant that adds the smaller terms.
+ const float zero_point_from_min = qmin_float - rmin / scale;
+ const float zero_point_from_max = qmax_float - rmax / scale;
+ const float zero_point_from_min_error =
+ std::abs(qmin_float) + std::abs(rmin / scale);
+ const float zero_point_from_max_error =
+ std::abs(qmax_float) + std::abs(rmax / scale);
+
+ const float zero_point_float =
+ zero_point_from_min_error < zero_point_from_max_error
+ ? zero_point_from_min
+ : zero_point_from_max;
+
+ // Now we need to nudge the zero point to be an integer
+ // (our zero points are integer, and this is motivated by the requirement
+ // to be able to represent the real value "0" exactly as a quantized value,
+ // which is required in multiple places, for example in Im2col with SAME
+ // padding).
+ if (zero_point_float < qmin_float) {
+ zero_point = qmin;
+ } else if (zero_point_float > qmax_float) {
+ zero_point = qmax;
+ } else {
+ zero_point = static_cast<int32>(TfLiteRound(zero_point_float));
+ }
+ // The zero point should always be in the range of quantized value,
+ // [qmin, qmax].
+ TFLITE_DCHECK_GE(zero_point, qmin);
+ TFLITE_DCHECK_LE(zero_point, qmax);
+ }
+
+ const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+ const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
+ const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
+ const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
+ for (int b = 0; b < batches; ++b) {
+ for (int y = 0; y < height; ++y) {
+ for (int x = 0; x < width; ++x) {
+ for (int c = 0; c < depth; ++c) {
+ const float src_val = input_data[Offset(input_dims, c, x, y, b)];
+ const float unclamped_quantized_val =
+ TfLiteRound(zero_point + src_val / scale);
+ const float quantized_val = std::min(
+ qmax_float, std::max(qmin_float, unclamped_quantized_val));
+ const float dst_val = scale * (quantized_val - zero_point);
+ output_data[Offset(output_dims, c, x, y, b)] = dst_val;
+ }
+ }
+ }
+ }
+}
+
+template <typename SrcT, typename DstT>
+inline void Cast(const SrcT* input_data, const Dims<4>& input_dims,
+ DstT* output_data, const Dims<4>& output_dims) {
+ gemmlowp::ScopedProfilingLabel label("Cast");
+ auto input_map = MapAsVector(input_data, input_dims);
+ auto output_map = MapAsVector(output_data, output_dims);
+ output_map.array() = input_map.array().template cast<DstT>();
+}
+
+inline void Floor(const float* input_data, const Dims<4>& input_dims,
+ float* output_data, const Dims<4>& output_dims) {
+ gemmlowp::ScopedProfilingLabel label("Floor");
+ auto input_map = MapAsVector(input_data, input_dims);
+ auto output_map = MapAsVector(output_data, output_dims);
+ output_map.array() = Eigen::floor(input_map.array());
+}
+
+template <typename T>
+inline void Gather(const T* input_data, const Dims<4>& input_dims,
+ int input_rank, const int32* coords_data,
+ const Dims<4>& coords_dims, T* output_data,
+ const Dims<4>& output_dims) {
+ gemmlowp::ScopedProfilingLabel label("Gather");
+
+ TFLITE_DCHECK(coords_dims.sizes[0] == output_dims.sizes[input_rank - 1]);
+ int stride = input_dims.strides[input_rank - 1];
+ T* out = output_data;
+
+ for (int i = 0; i < coords_dims.sizes[0]; i++) {
+ TFLITE_DCHECK_GE(coords_data[i], 0);
+ TFLITE_DCHECK_LT(coords_data[i], input_dims.sizes[input_rank - 1]);
+ const T* in = input_data + coords_data[i] * stride;
+ memcpy(out, in, sizeof(T) * stride);
+ out += stride;
+ }
+}
+
+#ifdef USE_NEON
+inline void ResizeBilinearKernel(const float* input_ptr, int32 depth,
+ float scale, float* output_ptr) {
+ int ic = 0;
+ // Handle 32 input channels at a time.
+ for (; ic <= depth - 32; ic += 32) {
+ float32x4x2_t input[4];
+ for (int i = 0; i < 4; i++) {
+ input[i].val[0] = vld1q_f32(input_ptr + 8 * i);
+ input[i].val[1] = vld1q_f32(input_ptr + 8 * i + 4);
+ }
+ float32x4x2_t acc[4];
+ for (int i = 0; i < 4; i++) {
+ acc[i].val[0] = vld1q_f32(output_ptr + 8 * i);
+ acc[i].val[1] = vld1q_f32(output_ptr + 8 * i + 4);
+ }
+ for (int i = 0; i < 4; i++) {
+ acc[i].val[0] = vmlaq_n_f32(acc[i].val[0], input[i].val[0], scale);
+ acc[i].val[1] = vmlaq_n_f32(acc[i].val[1], input[i].val[1], scale);
+ }
+ for (int i = 0; i < 4; i++) {
+ vst1q_f32(output_ptr, acc[i].val[0]);
+ vst1q_f32(output_ptr + 4, acc[i].val[1]);
+ output_ptr += 8;
+ }
+ input_ptr += 32;
+ }
+ // Handle 16 input channels at a time.
+ for (; ic <= depth - 16; ic += 16) {
+ float32x4x2_t input[2];
+ for (int i = 0; i < 2; i++) {
+ input[i].val[0] = vld1q_f32(input_ptr + 8 * i);
+ input[i].val[1] = vld1q_f32(input_ptr + 8 * i + 4);
+ }
+ float32x4x2_t acc[2];
+ for (int i = 0; i < 2; i++) {
+ acc[i].val[0] = vld1q_f32(output_ptr + 8 * i);
+ acc[i].val[1] = vld1q_f32(output_ptr + 8 * i + 4);
+ }
+ for (int i = 0; i < 2; i++) {
+ acc[i].val[0] = vmlaq_n_f32(acc[i].val[0], input[i].val[0], scale);
+ acc[i].val[1] = vmlaq_n_f32(acc[i].val[1], input[i].val[1], scale);
+ }
+ for (int i = 0; i < 2; i++) {
+ vst1q_f32(output_ptr, acc[i].val[0]);
+ vst1q_f32(output_ptr + 4, acc[i].val[1]);
+ output_ptr += 8;
+ }
+ input_ptr += 16;
+ }
+ // Handle 8 input channels at a time.
+ for (; ic <= depth - 8; ic += 8) {
+ float32x4x2_t input;
+ input.val[0] = vld1q_f32(input_ptr);
+ input.val[1] = vld1q_f32(input_ptr + 4);
+
+ float32x4x2_t acc;
+ acc.val[0] = vld1q_f32(output_ptr);
+ acc.val[1] = vld1q_f32(output_ptr + 4);
+ acc.val[0] = vmlaq_n_f32(acc.val[0], input.val[0], scale);
+ acc.val[1] = vmlaq_n_f32(acc.val[1], input.val[1], scale);
+
+ vst1q_f32(output_ptr, acc.val[0]);
+ vst1q_f32(output_ptr + 4, acc.val[1]);
+
+ input_ptr += 8;
+ output_ptr += 8;
+ }
+ // Handle 4 input channels at a time.
+ for (; ic <= depth - 4; ic += 4) {
+ float32x4_t input = vld1q_f32(input_ptr);
+ float32x4_t acc = vld1q_f32(output_ptr);
+
+ acc = vmlaq_n_f32(acc, input, scale);
+ vst1q_f32(output_ptr, acc);
+
+ input_ptr += 4;
+ output_ptr += 4;
+ }
+ // Handle 1 input channel at a time.
+ for (; ic < depth; ic++) {
+ *output_ptr += *input_ptr * scale;
+ output_ptr++;
+ input_ptr++;
+ }
+}
+#else
+inline void ResizeBilinearKernel(const float* input_ptr, int32 depth,
+ float scale, float* output_ptr) {
+ for (int32 i = 0; i < depth; i++) {
+ *output_ptr += *input_ptr * scale;
+ output_ptr++;
+ input_ptr++;
+ }
+}
+#endif
+
+inline void ResizeBilinearKernel2x2(int32 x0, int32 x1, int32 y0, int32 y1,
+ int32 x, int32 y, int32 depth, int32 batch,
+ const float* input_data,
+ const Dims<4>& input_dims,
+ float* output_data,
+ const Dims<4>& output_dims) {
+ const int32 input_width = ArraySize(input_dims, 1);
+ const int32 output_width = ArraySize(output_dims, 1);
+
+ const int32 input_x_offset = (x1 - x0) * depth;
+ const int32 input_y_offset = (y1 - y0) * depth * input_width;
+ const int32 output_x_offset = depth;
+ const int32 output_y_offset = depth * output_width;
+
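+ // For an exact 2x upscale the bilinear weights collapse to simple averages:
+ // the top-left output copies x0y0, top-right and bottom-left average two
+ // neighbors, and bottom-right averages all four corners.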
+#ifdef USE_NEON
+ TFLITE_DCHECK(IsPackedWithoutStrides(input_dims));
+ TFLITE_DCHECK(x1 >= x0);
+ TFLITE_DCHECK(y1 >= y0);
+
+ int ic = 0;
+ // Handle 8 input channels at a time.
+ for (; ic <= depth - 8; ic += 8) {
+ const float* input_ptr = nullptr;
+
+ float32x4x2_t x0y0;
+ input_ptr = &input_data[Offset(input_dims, ic, x0, y0, batch)];
+ x0y0.val[0] = vld1q_f32(input_ptr);
+ x0y0.val[1] = vld1q_f32(input_ptr + 4);
+
+ float32x4x2_t x1y0;
+ input_ptr += input_x_offset;
+ x1y0.val[0] = vld1q_f32(input_ptr);
+ x1y0.val[1] = vld1q_f32(input_ptr + 4);
+
+ float32x4x2_t x0y1;
+ input_ptr += -input_x_offset + input_y_offset;
+ x0y1.val[0] = vld1q_f32(input_ptr);
+ x0y1.val[1] = vld1q_f32(input_ptr + 4);
+
+ float32x4x2_t x1y1;
+ input_ptr += input_x_offset;
+ x1y1.val[0] = vld1q_f32(input_ptr);
+ x1y1.val[1] = vld1q_f32(input_ptr + 4);
+
+ // Top left corner.
+ float* output_ptr = &output_data[Offset(output_dims, ic, x, y, batch)];
+ vst1q_f32(output_ptr, x0y0.val[0]);
+ vst1q_f32(output_ptr + 4, x0y0.val[1]);
+
+ // Top right corner.
+ output_ptr += output_x_offset;
+ float32x4x2_t tr;
+ tr.val[0] = vaddq_f32(x0y0.val[0], x1y0.val[0]);
+ tr.val[1] = vaddq_f32(x0y0.val[1], x1y0.val[1]);
+ tr.val[0] = vmulq_n_f32(tr.val[0], 0.5f);
+ tr.val[1] = vmulq_n_f32(tr.val[1], 0.5f);
+
+ vst1q_f32(output_ptr, tr.val[0]);
+ vst1q_f32(output_ptr + 4, tr.val[1]);
+
+ // Bottom left corner.
+ output_ptr += -output_x_offset + output_y_offset;
+ float32x4x2_t bl;
+ bl.val[0] = vaddq_f32(x0y0.val[0], x0y1.val[0]);
+ bl.val[1] = vaddq_f32(x0y0.val[1], x0y1.val[1]);
+ bl.val[0] = vmulq_n_f32(bl.val[0], 0.5f);
+ bl.val[1] = vmulq_n_f32(bl.val[1], 0.5f);
+ vst1q_f32(output_ptr, bl.val[0]);
+ vst1q_f32(output_ptr + 4, bl.val[1]);
+
+ // Bottom right corner.
+ output_ptr += output_x_offset;
+ float32x4x2_t br;
+ br.val[0] = vaddq_f32(x1y0.val[0], x1y1.val[0]);
+ br.val[1] = vaddq_f32(x1y0.val[1], x1y1.val[1]);
+ br.val[0] = vmlaq_n_f32(bl.val[0], br.val[0], 0.5f);
+ br.val[1] = vmlaq_n_f32(bl.val[1], br.val[1], 0.5f);
+ br.val[0] = vmulq_n_f32(br.val[0], 0.5f);
+ br.val[1] = vmulq_n_f32(br.val[1], 0.5f);
+ vst1q_f32(output_ptr, br.val[0]);
+ vst1q_f32(output_ptr + 4, br.val[1]);
+ }
+ // Handle 4 input channels at a time.
+ for (; ic <= depth - 4; ic += 4) {
+ const float* input_ptr = &input_data[Offset(input_dims, ic, x0, y0, batch)];
+ float32x4_t x0y0 = vld1q_f32(input_ptr);
+ float32x4_t x1y0 = vld1q_f32(input_ptr + input_x_offset);
+ float32x4_t x0y1 = vld1q_f32(input_ptr + input_y_offset);
+ float32x4_t x1y1 = vld1q_f32(input_ptr + input_x_offset + input_y_offset);
+
+ // Top left corner.
+ float* output_ptr = &output_data[Offset(output_dims, ic, x, y, batch)];
+ vst1q_f32(output_ptr, x0y0);
+
+ // Top right corner.
+ output_ptr += output_x_offset;
+ float32x4_t tr = vaddq_f32(x0y0, x1y0);
+ tr = vmulq_n_f32(tr, 0.5f);
+ vst1q_f32(output_ptr, tr);
+
+ // Bottom left corner.
+ output_ptr += -output_x_offset + output_y_offset;
+ float32x4_t bl = vaddq_f32(x0y0, x0y1);
+ bl = vmulq_n_f32(bl, 0.5f);
+ vst1q_f32(output_ptr, bl);
+
+ // Bottom right corner.
+ output_ptr += output_x_offset;
+ float32x4_t br = vaddq_f32(x1y0, x1y1);
+ br = vmlaq_n_f32(bl, br, 0.5f);
+ br = vmulq_n_f32(br, 0.5f);
+ vst1q_f32(output_ptr, br);
+ }
+ // Handle one input channel at a time.
+ for (; ic < depth; ic++) {
+ const int32 input_offset = Offset(input_dims, ic, x0, y0, batch);
+
+ float x0y0 = input_data[input_offset];
+ float x1y0 = input_data[input_offset + input_x_offset];
+ float x0y1 = input_data[input_offset + input_y_offset];
+ float x1y1 = input_data[input_offset + input_x_offset + input_y_offset];
+
+ // Top left corner.
+ const int32 output_offset = Offset(output_dims, ic, x, y, batch);
+ output_data[output_offset] = x0y0;
+
+ // Top right corner.
+ output_data[output_offset + output_x_offset] = (x0y0 + x1y0) / 2;
+
+ // Bottom left corner.
+ float output = (x0y0 + x0y1) / 2;
+ output_data[output_offset + output_y_offset] = output;
+
+ // Bottom right corner.
+ output_data[output_offset + output_x_offset + output_y_offset] =
+ (output + ((x1y0 + x1y1) / 2)) / 2;
+ }
+#else
+ for (int ch = 0; ch < depth; ch++) {
+ const int32 input_offset = Offset(input_dims, ch, x0, y0, batch);
+
+ float x0y0 = input_data[input_offset];
+ float x1y0 = input_data[input_offset + input_x_offset];
+ float x0y1 = input_data[input_offset + input_y_offset];
+ float x1y1 = input_data[input_offset + input_x_offset + input_y_offset];
+
+ // Top left corner.
+ const int32 output_offset = Offset(output_dims, ch, x, y, batch);
+ output_data[output_offset] = x0y0;
+
+ // Top right corner.
+ output_data[output_offset + output_x_offset] = (x0y0 + x1y0) / 2;
+
+ // Bottom left corner.
+ float output = (x0y0 + x0y1) / 2;
+ output_data[output_offset + output_y_offset] = output;
+
+ // Bottom right corner.
+ output_data[output_offset + output_x_offset + output_y_offset] =
+ (output + ((x1y0 + x1y1) / 2)) / 2;
+ }
+#endif
+}
+
+inline void ResizeBilinear2x2(const float* input_data,
+ const Dims<4>& input_dims, float* output_data,
+ const Dims<4>& output_dims, int32 batches,
+ int32 input_height, int32 input_width,
+ int32 depth, int32 output_height,
+ int32 output_width) {
+ for (int b = 0; b < batches; b++) {
+ for (int y0 = 0, y = 0; y <= output_height - 2; y += 2, y0++) {
+ for (int x0 = 0, x = 0; x <= output_width - 2; x += 2, x0++) {
+ int32 x1 = std::min(x0 + 1, input_width - 1);
+ int32 y1 = std::min(y0 + 1, input_height - 1);
+ ResizeBilinearKernel2x2(x0, x1, y0, y1, x, y, depth, b, input_data,
+ input_dims, output_data, output_dims);
+ }
+ }
+ }
+}
+
+inline void ResizeBilinearGeneric(const float* input_data,
+ const Dims<4>& input_dims, float* output_data,
+ const Dims<4>& output_dims, int32 batches,
+ int32 input_height, int32 input_width,
+ int32 depth, int32 output_height,
+ int32 output_width, float height_scale,
+ float width_scale) {
+ memset(output_data, 0,
+ batches * output_height * output_width * depth * sizeof(float));
+
+ int32 output_offset = 0;
+ for (int b = 0; b < batches; ++b) {
+ for (int y = 0; y < output_height; ++y) {
+ float input_y = y * height_scale;
+ int32 y0 = static_cast<int32>(std::floor(input_y));
+ int32 y1 = std::min(y0 + 1, input_height - 1);
+ for (int x = 0; x < output_width; ++x) {
+ float input_x = x * width_scale;
+ int32 x0 = static_cast<int32>(input_x);
+ int32 x1 = std::min(x0 + 1, input_width - 1);
+ float* output_ptr = &output_data[output_offset];
+
+ // Run kernel on the 4 corners of the bilinear resize algorithm.
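+ // Each corner's weight is the product of its (1 - frac) or frac distance in
+ // y and in x, where frac is (input_y - y0), respectively (input_x - x0).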
+ int32 input_offset = Offset(input_dims, 0, x0, y0, b);
+ float scale = (1 - (input_y - y0)) * (1 - (input_x - x0));
+ const float* input_ptr = &input_data[input_offset];
+ ResizeBilinearKernel(input_ptr, depth, scale, output_ptr);
+
+ input_offset = Offset(input_dims, 0, x1, y0, b);
+ scale = (1 - (input_y - y0)) * (input_x - x0);
+ input_ptr = &input_data[input_offset];
+ ResizeBilinearKernel(input_ptr, depth, scale, output_ptr);
+
+ input_offset = Offset(input_dims, 0, x0, y1, b);
+ scale = (input_y - y0) * (1 - (input_x - x0));
+ input_ptr = &input_data[input_offset];
+ ResizeBilinearKernel(input_ptr, depth, scale, output_ptr);
+
+ input_offset = Offset(input_dims, 0, x1, y1, b);
+ scale = (input_y - y0) * (input_x - x0);
+ input_ptr = &input_data[input_offset];
+ ResizeBilinearKernel(input_ptr, depth, scale, output_ptr);
+
+ output_offset += depth;
+ }
+ }
+ }
+}
+
+inline void ResizeBilinear(const float* input_data, const Dims<4>& input_dims,
+ const int32* output_size_data,
+ const Dims<4>& output_size_dims, float* output_data,
+ const Dims<4>& output_dims) {
+ gemmlowp::ScopedProfilingLabel label("ResizeBilinear");
+ int32 batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+ int32 input_height = ArraySize(input_dims, 2);
+ int32 input_width = ArraySize(input_dims, 1);
+ int32 depth = MatchingArraySize(input_dims, 0, output_dims, 0);
+
+ TFLITE_DCHECK_EQ(ArraySize(output_size_dims, 3), 1);
+ TFLITE_DCHECK_EQ(ArraySize(output_size_dims, 2), 1);
+ TFLITE_DCHECK_EQ(ArraySize(output_size_dims, 1), 1);
+ TFLITE_DCHECK_EQ(ArraySize(output_size_dims, 0), 2);
+ int32 output_height = output_size_data[Offset(output_size_dims, 0, 0, 0, 0)];
+ int32 output_width = output_size_data[Offset(output_size_dims, 1, 0, 0, 0)];
+
+ // Specialize for 2x2 upsample.
+ if (output_height == 2 * input_height && output_width == 2 * input_width) {
+ ResizeBilinear2x2(input_data, input_dims, output_data, output_dims, batches,
+ input_height, input_width, depth, output_height,
+ output_width);
+ } else {
+ float height_scale = static_cast<float>(input_height) / output_height;
+ float width_scale = static_cast<float>(input_width) / output_width;
+
+ ResizeBilinearGeneric(input_data, input_dims, output_data, output_dims,
+ batches, input_height, input_width, depth,
+ output_height, output_width, height_scale,
+ width_scale);
+ }
+}
+
+template <typename T>
+inline void SpaceToBatchND(const T* input_data, const Dims<4>& input_dims,
+ const int32* block_shape_data,
+ const Dims<4>& block_shape_dims,
+ const int32* paddings_data,
+ const Dims<4>& paddings_dims, T* output_data,
+ const Dims<4>& output_dims) {
+ gemmlowp::ScopedProfilingLabel label("SpaceToBatchND");
+
+ const int output_batch_size = ArraySize(output_dims, 3);
+ const int output_height = ArraySize(output_dims, 2);
+ const int output_width = ArraySize(output_dims, 1);
+ const int input_batch_size = ArraySize(input_dims, 3);
+ const int input_height = ArraySize(input_dims, 2);
+ const int input_width = ArraySize(input_dims, 1);
+ const int depth = ArraySize(input_dims, 0);
+ const int block_shape_height = block_shape_data[0];
+ const int block_shape_width = block_shape_data[1];
+ const int padding_top = paddings_data[0];
+ const int padding_left = paddings_data[2];
+
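+ // Each output batch corresponds to one input batch combined with one
+ // (shift_h, shift_w) offset inside a block_shape_height x block_shape_width
+ // block; output positions that land in the padded border are zero-filled,
+ // the rest copy the matching depth slice from the input.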
+ for (int out_b = 0; out_b < output_batch_size; ++out_b) {
+ int input_batch = out_b % input_batch_size;
+ int shift_w = (out_b / input_batch_size) % block_shape_width;
+ int shift_h = (out_b / input_batch_size) / block_shape_width;
+ for (int out_h = 0; out_h < output_height; ++out_h) {
+ for (int out_w = 0; out_w < output_width; ++out_w) {
+ T* out = output_data + Offset(output_dims, 0, out_w, out_h, out_b);
+ if (out_h * block_shape_height < padding_top ||
+ out_h * block_shape_height >= padding_top + input_height ||
+ out_w * block_shape_width < padding_left ||
+ out_w * block_shape_width >= padding_left + input_width) {
+ memset(out, 0, depth * sizeof(T));
+ } else {
+ const T* in =
+ input_data +
+ Offset(input_dims, 0,
+ (out_w * block_shape_width + shift_w) - padding_left,
+ (out_h * block_shape_height + shift_h) - padding_top,
+ input_batch);
+ memcpy(out, in, depth * sizeof(T));
+ }
+ }
+ }
+ }
+}
+
+template <typename T>
+inline void BatchToSpaceND(const T* input_data, const Dims<4>& input_dims,
+ const int32* block_shape_data,
+ const Dims<4>& block_shape_dims, T* output_data,
+ const Dims<4>& output_dims) {
+ gemmlowp::ScopedProfilingLabel label("BatchToSpaceND");
+
+ const int output_batch_size = ArraySize(output_dims, 3);
+ const int input_batch_size = ArraySize(input_dims, 3);
+ const int input_height = ArraySize(input_dims, 2);
+ const int input_width = ArraySize(input_dims, 1);
+ const int depth = ArraySize(input_dims, 0);
+ const int block_shape_width = block_shape_data[1];
+ const int block_shape_height = block_shape_data[0];
+
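+ // Inverse of the mapping above: decompose each input batch into an output
+ // batch plus a (shift_h, shift_w) block offset, and scatter its spatial grid
+ // into the output at stride block_shape with that offset.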
+ for (int in_batch = 0; in_batch < input_batch_size; ++in_batch) {
+ for (int in_h = 0; in_h < input_height; ++in_h) {
+ for (int in_w = 0; in_w < input_width; ++in_w) {
+ int out_batch = in_batch % output_batch_size;
+ int out_w = in_w * block_shape_width +
+ (in_batch / output_batch_size) % block_shape_width;
+ int out_h = in_h * block_shape_height +
+ (in_batch / output_batch_size) / block_shape_width;
+ T* out = output_data + Offset(output_dims, 0, out_w, out_h, out_batch);
+ const T* in = input_data + Offset(input_dims, 0, in_w, in_h, in_batch);
+ memcpy(out, in, depth * sizeof(T));
+ }
+ }
+ }
+}
+
+template <typename T>
+inline void Pad(const T* input_data, const Dims<4>& input_dims,
+ const std::vector<int>& left_paddings,
+ const std::vector<int>& right_paddings, T* output_data,
+ const Dims<4>& output_dims) {
+ gemmlowp::ScopedProfilingLabel label("Pad");
+ const int output_batch = ArraySize(output_dims, 3);
+ const int output_height = ArraySize(output_dims, 2);
+ const int output_width = ArraySize(output_dims, 1);
+ const int output_depth = ArraySize(output_dims, 0);
+
+ const int left_b_padding = left_paddings[3];
+ const int left_h_padding = left_paddings[2];
+ const int left_w_padding = left_paddings[1];
+ const int left_d_padding = left_paddings[0];
+
+ const int right_b_padding = right_paddings[3];
+ const int right_h_padding = right_paddings[2];
+ const int right_w_padding = right_paddings[1];
+ const int right_d_padding = right_paddings[0];
+
+ const int input_depth = ArraySize(input_dims, 0);
+
+ if (left_b_padding != 0) {
+ memset(output_data, 0,
+ left_b_padding * output_height * output_width * output_depth *
+ sizeof(T));
+ }
+ for (int out_b = left_b_padding; out_b < output_batch - right_b_padding;
+ ++out_b) {
+ if (left_h_padding != 0) {
+ memset(output_data + Offset(output_dims, 0, 0, 0, out_b), 0,
+ left_h_padding * output_width * output_depth * sizeof(T));
+ }
+ for (int out_h = left_h_padding; out_h < output_height - right_h_padding;
+ ++out_h) {
+ if (left_w_padding != 0) {
+ memset(output_data + Offset(output_dims, 0, 0, out_h, out_b), 0,
+ left_w_padding * output_depth * sizeof(T));
+ }
+ for (int out_w = left_w_padding; out_w < output_width - right_w_padding;
+ ++out_w) {
+ if (left_d_padding != 0) {
+ memset(output_data + Offset(output_dims, 0, out_w, out_h, out_b), 0,
+ left_d_padding * sizeof(T));
+ }
+
+ T* out = output_data +
+ Offset(output_dims, left_d_padding, out_w, out_h, out_b);
+ const T* in =
+ input_data + Offset(input_dims, 0, out_w - left_w_padding,
+ out_h - left_h_padding, out_b - left_b_padding);
+ memcpy(out, in, input_depth * sizeof(T));
+
+ if (right_d_padding != 0) {
+ memset(
+ output_data + Offset(output_dims, output_depth - right_d_padding,
+ out_w, out_h, out_b),
+ 0, right_d_padding * sizeof(T));
+ }
+ }
+ if (right_w_padding != 0) {
+ memset(
+ output_data + Offset(output_dims, 0, output_width - right_w_padding,
+ out_h, out_b),
+ 0, right_w_padding * output_depth * sizeof(T));
+ }
+ }
+ if (right_h_padding != 0) {
+ memset(output_data + Offset(output_dims, 0, 0,
+ output_height - right_h_padding, out_b),
+ 0, right_h_padding * output_width * output_depth * sizeof(T));
+ }
+ }
+ if (right_b_padding != 0) {
+ memset(output_data +
+ Offset(output_dims, 0, 0, 0, output_batch - right_b_padding),
+ 0,
+ right_b_padding * output_height * output_width * output_depth *
+ sizeof(T));
+ }
+}
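+
+// Illustrative note (not in the original kernels): the padding vectors are
+// indexed in storage order, i.e. {depth, width, height, batch}. Padding a
+// 1x2x2x1 (batch, height, width, depth) input by one pixel on each spatial
+// side therefore uses left_paddings = right_paddings = {0, 1, 1, 0} and
+// produces a 1x4x4x1 output: the border is zero-filled by the memsets above
+// and the 2x2 interior is copied row by row by the memcpy.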
+
+template <typename T>
+inline void StridedSlice(const T* input_data, const Dims<4>& input_dims,
+ int begin_mask, int end_mask,
+ const std::vector<int>& starts,
+ const std::vector<int>& stops,
+ const std::vector<int>& strides, T* output_data,
+ const Dims<4>& output_dims) {
+ gemmlowp::ScopedProfilingLabel label("StridedSlice");
+ const int start_b = (begin_mask & 8) ? 0 : starts[3];
+ const int stop_b = (end_mask & 8) ? input_dims.sizes[3] : stops[3];
+ const int start_h = (begin_mask & 4) ? 0 : starts[2];
+ const int stop_h = (end_mask & 4) ? input_dims.sizes[2] : stops[2];
+ const int start_w = (begin_mask & 2) ? 0 : starts[1];
+ const int stop_w = (end_mask & 2) ? input_dims.sizes[1] : stops[1];
+ const int start_d = (begin_mask & 1) ? 0 : starts[0];
+ const int stop_d = (end_mask & 1) ? input_dims.sizes[0] : stops[0];
+
+ T* out_ptr = output_data;
+ if (strides[0] == 0) {
+ for (int in_b = start_b; in_b < stop_b; in_b += strides[3]) {
+ for (int in_h = start_h; in_h < stop_h; in_h += strides[2]) {
+ for (int in_w = start_w; in_w < stop_w; in_w += strides[1]) {
+ const int len = stop_d - start_d;
+ memcpy(out_ptr,
+ input_data + Offset(input_dims, start_d, in_w, in_h, in_b),
+ len * sizeof(T));
+ out_ptr += len;
+ }
+ }
+ }
+ } else {
+ for (int in_b = start_b; in_b < stop_b; in_b += strides[3]) {
+ for (int in_h = start_h; in_h < stop_h; in_h += strides[2]) {
+ for (int in_w = start_w; in_w < stop_w; in_w += strides[1]) {
+ for (int in_d = start_d; in_d < stop_d; in_d += strides[0]) {
+ *out_ptr++ = input_data[Offset(input_dims, in_d, in_w, in_h, in_b)];
+ }
+ }
+ }
+ }
+ }
+}
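+
+// Worked example (for illustration) of the mask handling above: the bits
+// follow storage order, so bit 0 = depth, bit 1 = width, bit 2 = height and
+// bit 3 = batch. With begin_mask = 5 (binary 0101) the depth and height
+// starts are forced to 0 regardless of 'starts'; with end_mask = 8 the batch
+// stop is forced to the full batch extent. When strides[0] == 0 the depth
+// range [start_d, stop_d) is copied densely with one memcpy per (b, h, w)
+// position; otherwise elements are gathered one at a time.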
+
+template <typename T>
+inline void Slice(const T* input_data, const Dims<4>& input_dims,
+ const std::vector<int>& begin, const std::vector<int>& size,
+ T* output_data, const Dims<4>& output_dims) {
+ // TODO(dkalenichenko): This op only supports 4D tensors.
+ TFLITE_DCHECK_EQ(begin.size(), 4);
+ TFLITE_DCHECK_EQ(size.size(), 4);
+ const int start_b = begin[3];
+ const int stop_b =
+ size[3] == -1 ? input_dims.sizes[3] - start_b : start_b + size[3];
+ const int start_h = begin[2];
+ const int stop_h =
+ size[2] == -1 ? input_dims.sizes[2] - start_h : start_h + size[2];
+ const int start_w = begin[1];
+ const int stop_w =
+ size[1] == -1 ? input_dims.sizes[1] - start_w : start_w + size[1];
+ const int start_d = begin[0];
+ const int stop_d =
+ size[0] == -1 ? input_dims.sizes[0] - start_d : start_d + size[0];
+
+ T* out_ptr = output_data;
+ for (int in_b = start_b; in_b < stop_b; ++in_b) {
+ for (int in_h = start_h; in_h < stop_h; ++in_h) {
+ for (int in_w = start_w; in_w < stop_w; ++in_w) {
+ const int len = stop_d - start_d;
+ memcpy(out_ptr,
+ input_data + Offset(input_dims, start_d, in_w, in_h, in_b),
+ len * sizeof(T));
+ out_ptr += len;
+ }
+ }
+ }
+}
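+
+// Illustrative example of the begin/size convention above (storage order
+// {depth, width, height, batch}): on a 3x4x4x2 (batch, height, width, depth)
+// input, begin = {0, 1, 1, 0} with size = {-1, 2, 2, 1} selects batch 0,
+// rows 1..2, columns 1..2 and, because the depth begin is 0 and its size is
+// -1, the full depth. A size of -1 means "the rest of that dimension"; any
+// other value is an absolute length.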
+
+template <typename T>
+inline void Mean(const T* input_data, const Dims<4>& input_dims,
+ const std::vector<int>& reduction_indices, T* output_data,
+ const Dims<4>& output_dims) {
+ gemmlowp::ScopedProfilingLabel label("Mean");
+ const int output_batch = ArraySize(output_dims, 3);
+ const int output_height = ArraySize(output_dims, 2);
+ const int output_width = ArraySize(output_dims, 1);
+ const int output_depth = ArraySize(output_dims, 0);
+
+ const int input_height = ArraySize(input_dims, 2);
+ const int input_width = ArraySize(input_dims, 1);
+
+ // The current implementation only supports simultaneous reduction over
+ // width and height.
+ TFLITE_DCHECK_EQ(reduction_indices.size(), 2);
+ TFLITE_DCHECK((reduction_indices[0] == 1 && reduction_indices[1] == 2) ||
+ (reduction_indices[0] == 2 && reduction_indices[1] == 1));
+ TFLITE_DCHECK_EQ(output_height, 1);
+ TFLITE_DCHECK_EQ(output_width, 1);
+
+ for (int out_b = 0; out_b < output_batch; ++out_b) {
+ for (int out_d = 0; out_d < output_depth; ++out_d) {
+ float value = 0;
+ for (int in_h = 0; in_h < input_height; ++in_h) {
+ for (int in_w = 0; in_w < input_width; ++in_w) {
+ value += input_data[Offset(input_dims, out_d, in_w, in_h, out_b)];
+ }
+ }
+ output_data[Offset(output_dims, out_d, 0, 0, out_b)] =
+ value / (input_width * input_height);
+ }
+ }
+}
+
+template <typename T>
+void GenericBroadcastSub(const T* input1_data, const Dims<4>& input1_dims,
+ const T* input2_data, const Dims<4>& input2_dims,
+ T* output_data, const Dims<4>& output_dims) {
+ gemmlowp::ScopedProfilingLabel label("GenericBroadcastSub");
+
+ NdArrayDesc<4> desc1;
+ NdArrayDesc<4> desc2;
+ NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
+
+ // In TensorFlow, the dimensions are canonically named (batch_number, row,
+ // col, channel), with extents (batches, height, width, depth), with the
+ // trailing dimension changing most rapidly (channels has the smallest stride,
+ // typically 1 element).
+ //
+ // In generated C code, we store arrays with the dimensions reversed. The
+ // first dimension has the smallest stride.
+ //
+ // We name our variables by their TensorFlow convention, but generate C code
+ // nesting loops such that the innermost loop has the smallest stride for the
+ // best cache behavior.
+ for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
+ for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
+ for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
+ for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
+ output_data[Offset(output_dims, c, x, y, b)] =
+ input1_data[SubscriptToIndex(desc1, c, x, y, b)] -
+ input2_data[SubscriptToIndex(desc2, c, x, y, b)];
+ }
+ }
+ }
+ }
+}
+
+template <typename T>
+void Sub(const T* input1_data, const Dims<4>& input1_dims, const T* input2_data,
+ const Dims<4>& input2_dims, T* output_data,
+ const Dims<4>& output_dims) {
+ gemmlowp::ScopedProfilingLabel label("Sub");
+
+ auto input1_map = MapAsVector(input1_data, input1_dims);
+ auto input2_map = MapAsVector(input2_data, input2_dims);
+ auto output_map = MapAsVector(output_data, output_dims);
+ if (AreSameDims(input1_dims, input2_dims)) {
+ output_map.array() = input1_map.array() - input2_map.array();
+ } else if (RequiredBufferSizeForDims(input1_dims) == 1) {
+ auto scalar = input1_data[0];
+ output_map.array() = scalar - input2_map.array();
+ } else if (RequiredBufferSizeForDims(input2_dims) == 1) {
+ auto scalar = input2_data[0];
+ output_map.array() = input1_map.array() - scalar;
+ } else {
+ GenericBroadcastSub(input1_data, input1_dims, input2_data, input2_dims,
+ output_data, output_dims);
+ }
+}
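+
+// Note (for illustration): Sub dispatches on the operand shapes. If the dims
+// match it subtracts elementwise through the Eigen maps; if either operand
+// holds a single element it is broadcast as a scalar; anything else falls
+// back to the generic broadcasting loop above.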
+
+template <typename T>
+void TensorFlowMinimum(const T* input1_data, const Dims<4>& input1_dims,
+ const T* input2_data, T* output_data,
+ const Dims<4>& output_dims) {
+ gemmlowp::ScopedProfilingLabel label("TensorFlowMinimum");
+ auto input1_map = MapAsVector(input1_data, input1_dims);
+ auto output_map = MapAsVector(output_data, output_dims);
+ auto min_value = input2_data[0];
+ output_map.array() = input1_map.array().min(min_value);
+}
+
+template <typename T>
+void TensorFlowMaximum(const T* input1_data, const Dims<4>& input1_dims,
+ const T* input2_data, T* output_data,
+ const Dims<4>& output_dims) {
+ gemmlowp::ScopedProfilingLabel label("TensorFlowMaximum");
+ auto input1_map = MapAsVector(input1_data, input1_dims);
+ auto output_map = MapAsVector(output_data, output_dims);
+ auto max_value = input2_data[0];
+ output_map.array() = input1_map.array().max(max_value);
+}
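+
+// Note (for illustration): TensorFlowMinimum and TensorFlowMaximum read only
+// input2_data[0], i.e. the second operand is treated as a scalar bound that
+// is broadcast against every element of input1 (clip-by-constant style).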
+} // namespace optimized_ops
+} // namespace tflite
+
+#if defined OPTIMIZED_OPS_H__IGNORE_DEPRECATED_DECLARATIONS
+#undef OPTIMIZED_OPS_H__IGNORE_DEPRECATED_DECLARATIONS
+#pragma GCC diagnostic pop
+#endif
+
+#endif // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_OPS_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/tensor_utils_impl.h b/tensorflow/contrib/lite/kernels/internal/optimized/tensor_utils_impl.h
new file mode 100644
index 0000000000..f8be99e82f
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/tensor_utils_impl.h
@@ -0,0 +1,138 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TF_LITE_KERNELS_INTERNAL_OPTIMIZED_TENSOR_UTILS_IMPL_H_
+#define TF_LITE_KERNELS_INTERNAL_OPTIMIZED_TENSOR_UTILS_IMPL_H_
+
+// TODO(ghodrat): Remove this header file and the dependency on the internal
+// data structure.
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+
+#ifndef USE_NEON
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+#define USE_NEON
+#endif // defined(__ARM_NEON__) || defined(__ARM_NEON)
+#endif // USE_NEON
+
+namespace tflite {
+namespace tensor_utils {
+
+// Multiply a matrix by a batch vector, and store results in a batch-size
+// vector.
+void PortableMatrixBatchVectorMultiplyAccumulate(const float* matrix,
+ int m_rows, int m_cols,
+ const float* vector,
+ int n_batch, float* result,
+ int result_stride);
+void NeonMatrixBatchVectorMultiplyAccumulate(const float* matrix, int m_rows,
+ int m_cols, const float* vector,
+ int n_batch, float* result,
+ int result_stride);
+
+// Cwise product of two vectors.
+void PortableVectorVectorCwiseProduct(const float* vector1,
+ const float* vector2, int v_size,
+ float* result);
+void NeonVectorVectorCwiseProduct(const float* vector1, const float* vector2,
+ int v_size, float* result);
+
+// Cwise product and accumulate of two vectors. Since it's a MAC operation, the
+// assumption here is that the result array is initialized to valid values.
+void PortableVectorVectorCwiseProductAccumulate(const float* vector1,
+ const float* vector2,
+ int v_size, float* result);
+void NeonVectorVectorCwiseProductAccumulate(const float* vector1,
+ const float* vector2, int v_size,
+ float* result);
+
+// Dot product of two vectors.
+float PortableVectorVectorDotProduct(const float* vector1, const float* vector2,
+ int v_size);
+float NeonVectorVectorDotProduct(const float* vector1, const float* vector2,
+ int v_size);
+
+// Dot product of two batch vectors.
+void PortableBatchVectorBatchVectorDotProduct(const float* vector1,
+ const float* vector2, int v_size,
+ int n_batch, float* result,
+ int result_stride);
+void NeonBatchVectorBatchVectorDotProduct(const float* vector1,
+ const float* vector2, int v_size,
+ int n_batch, float* result,
+ int result_stride);
+
+// Cwise product and accumulate of a vector and a batch-vector. Since it's a MAC
+// operation, the assumption here is that the result array is initialized to
+// valid values.
+void PortableVectorBatchVectorCwiseProductAccumulate(const float* vector,
+ int v_size,
+ const float* batch_vector,
+ int n_batch,
+ float* result);
+void NeonVectorBatchVectorCwiseProductAccumulate(const float* vector,
+ int v_size,
+ const float* batch_vector,
+ int n_batch, float* result);
+
+// Compute "1.0f - elements of vector" (used in CIFG).
+void PortableSub1Vector(const float* vector, int v_size, float* result);
+void NeonSub1Vector(const float* vector, int v_size, float* result);
+
+// Clip elements of a vector using an abs_limit value.
+void PortableClipVector(const float* vector, int v_size, float abs_limit,
+ float* result);
+void NeonClipVector(const float* vector, int v_size, float abs_limit,
+ float* result);
+
+// Batch vector initialization with another vector.
+void PortableVectorBatchVectorAssign(const float* vector, int v_size,
+ int n_batch, float* batch_vector);
+
+// Apply sigmoid to elements of a vector.
+void PortableApplySigmoidToVector(const float* vector, int v_size,
+ float* result);
+
+// Apply activation function to elements of a vector.
+void PortableApplyActivationToVector(const float* vector, int v_size,
+ TfLiteFusedActivation activation,
+ float* result);
+
+// Copy vector to another vector.
+void PortableCopyVector(const float* vector, int v_size, float* result);
+
+// Fill vector with 0.f.
+void PortableZeroVector(float* vector, int v_size);
+
+// Limit a float input f between +abs_limit and -abs_limit.
+float PortableClip(float f, float abs_limit);
+
+// Shift left a vector in place with v_size size.
+void PortableVectorShiftLeft(float* vector, int v_size, float shift_value);
+void NeonVectorShiftLeft(float* vector, int v_size, float shift_value);
+
+// Reduce-sum on a float input vector:
+// input_vector: float pointer to input vector.
+// output_vector: float pointer to output vector.
+// output_size: output vector size.
+// reduction_size: number of consecutive elements from input vector which are
+// added to get one element of output.
+void PortableReductionSumVector(const float* input_vector, float* output_vector,
+ int output_size, int reduction_size);
+void NeonReductionSumVector(const float* input_vector, float* output_vector,
+ int output_size, int reduction_size);
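+
+// A hedged sketch (illustrative only, not the dispatch TF Lite itself uses):
+// one way a caller could pick between the Neon and Portable variants declared
+// above at compile time, based on the USE_NEON macro from the top of this
+// header. The helper name is hypothetical.
+inline void ExampleClipVectorDispatch(const float* vector, int v_size,
+                                      float abs_limit, float* result) {
+#ifdef USE_NEON
+  // NEON path: available when building for an ARM target with NEON support.
+  NeonClipVector(vector, v_size, abs_limit, result);
+#else
+  // Portable fallback for all other targets.
+  PortableClipVector(vector, v_size, abs_limit, result);
+#endif
+}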
+
+} // namespace tensor_utils
+} // namespace tflite
+
+#endif // TF_LITE_KERNELS_INTERNAL_OPTIMIZED_TENSOR_UTILS_IMPL_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/quantization_util.cc b/tensorflow/contrib/lite/kernels/internal/quantization_util.cc
new file mode 100644
index 0000000000..98f2e365c5
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/internal/quantization_util.cc
@@ -0,0 +1,95 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <algorithm>
+#include <cmath>
+#include <limits>
+
+#include "tensorflow/contrib/lite/kernels/internal/compatibility.h"
+#include "tensorflow/contrib/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/contrib/lite/kernels/internal/round.h"
+
+namespace tflite {
+
+void QuantizeMultiplierSmallerThanOne(double double_multiplier,
+ int32_t* quantized_multiplier,
+ int* right_shift) {
+ TFLITE_CHECK(double_multiplier >= 0.);
+ TFLITE_CHECK(double_multiplier < 1.);
+ if (double_multiplier == 0.) {
+ *quantized_multiplier = 0;
+ *right_shift = 0;
+ return;
+ }
+ TFLITE_CHECK(double_multiplier > 0.);
+ const double q = std::frexp(double_multiplier, right_shift);
+ *right_shift *= -1;
+
+ auto q_fixed = static_cast<int64_t>(TfLiteRound(q * (1ll << 31)));
+ TFLITE_CHECK(q_fixed <= (1ll << 31));
+ if (q_fixed == (1ll << 31)) {
+ q_fixed /= 2;
+ --*right_shift;
+ }
+ TFLITE_CHECK_GE(*right_shift, 0);
+ TFLITE_CHECK_LE(q_fixed, std::numeric_limits<int32_t>::max());
+ *quantized_multiplier = static_cast<int32_t>(q_fixed);
+}
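+
+// Worked example (for illustration): std::frexp(0.25) returns q = 0.5 with
+// exponent -1, so right_shift becomes 1 and q_fixed = round(0.5 * 2^31) =
+// 1073741824; 0.25 is therefore represented as (1073741824, 1): multiply by
+// the Q0.31 value 0.5, then shift right by 1. Likewise 0.75 maps to
+// (1610612736, 0). The q_fixed == 2^31 branch catches mantissas that round
+// up to exactly 1.0, which would not fit in an int32.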
+
+void QuantizeMultiplierGreaterThanOne(double double_multiplier,
+ int32_t* quantized_multiplier,
+ int* left_shift) {
+ TFLITE_CHECK(double_multiplier > 1.);
+ const double q = std::frexp(double_multiplier, left_shift);
+ auto q_fixed = static_cast<int64_t>(TfLiteRound(q * (1ll << 31)));
+ TFLITE_CHECK(q_fixed <= (1ll << 31));
+ if (q_fixed == (1ll << 31)) {
+ q_fixed /= 2;
+ ++*left_shift;
+ }
+ TFLITE_CHECK_GE(*left_shift, 0);
+ TFLITE_CHECK_LE(q_fixed, std::numeric_limits<int32_t>::max());
+ *quantized_multiplier = static_cast<int32_t>(q_fixed);
+}
+
+void PreprocessSoftmaxScaling(double beta, double input_scale,
+ int input_integer_bits,
+ int32_t* quantized_multiplier, int* left_shift) {
+ // If the overall multiplier (input and beta) is large, then exp() of an
+ // input difference of 1 scaled by this will be large. In other words, we
+ // can cap the multiplier and know that, when it is used, the output will
+ // round to zero wherever the input is not at the maximum value.
+
+ // If the overall scale is less than one, and input_integer_bits=0, then the
+ // result is double equivalent of Q0.31 (actually with more precision). Thus
+ // this generates a Q(input_integer_bits).(31-input_integer_bits)
+ // representation.
+ const double input_beta_real_multiplier = std::min(
+ beta * input_scale * (1 << (31 - input_integer_bits)), (1ll << 31) - 1.0);
+
+ QuantizeMultiplierGreaterThanOne(input_beta_real_multiplier,
+ quantized_multiplier, left_shift);
+}
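+
+// Worked example (for illustration): with beta = 1.0, input_scale = 8.0 and
+// input_integer_bits = 4, the raw multiplier is 8.0 * 2^27 = 2^30, below the
+// 2^31 - 1 cap, and quantizes to (1073741824, 31). With input_scale = 16.0
+// the raw value 2^31 hits the cap and the result saturates to
+// (2147483647, 31), so inputs below the maximum round to zero after
+// exponentiation, as described above.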
+
+int CalculateInputRadius(int input_integer_bits, int input_left_shift) {
+ const double max_input_rescaled = 1.0 * ((1 << input_integer_bits) - 1) *
+ (1ll << (31 - input_integer_bits)) /
+ (1ll << input_left_shift);
+ // Tighten bound using floor. Suppose that we could use the exact value.
+ // After scaling the difference, the result would be at the maximum. Thus we
+ // must ensure that our value has lower magnitude.
+ return static_cast<int>(std::floor(max_input_rescaled));
+}
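+
+// Worked example (for illustration): CalculateInputRadius(4, 27) evaluates
+// 1.0 * 15 * 2^27 / 2^27 = 15, while CalculateInputRadius(4, 2) evaluates
+// 15 * 2^27 / 2^2 = 503316480. The negative of this radius is the smallest
+// input difference that Softmax can feed into the rescaling without
+// overflowing the intermediate representation.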
+
+} // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/internal/quantization_util.h b/tensorflow/contrib/lite/kernels/internal/quantization_util.h
new file mode 100644
index 0000000000..efb7191c8d
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/internal/quantization_util.h
@@ -0,0 +1,55 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef PHOTOS_VISION_LEARNING_TENSORFLOW_MINI_QUANTIZATION_UTIL_H_
+#define PHOTOS_VISION_LEARNING_TENSORFLOW_MINI_QUANTIZATION_UTIL_H_
+
+#include <cstdint>
+
+namespace tflite {
+
+// Decompose a double multiplier into a Q0.31 int32 representation of its
+// significand, and shift representation of its exponent.
+//
+// Restricted to the case where the multiplier < 1 (and non-negative).
+void QuantizeMultiplierSmallerThanOne(double double_multiplier,
+ int32_t* quantized_multiplier,
+ int* right_shift);
+
+// Decompose a double multiplier into a Q0.31 int32 representation of its
+// significand, and shift representation of its exponent.
+//
+// Restricted to the case where the multiplier > 1.
+void QuantizeMultiplierGreaterThanOne(double double_multiplier,
+ int32_t* quantized_multiplier,
+ int* left_shift);
+
+// This first creates a multiplier in a double equivalent of
+// Q(input_integer_bits).(31-input_integer_bits) representation, with extra
+// precision in the double's fractional bits. It then splits the result into
+// significand and exponent.
+void PreprocessSoftmaxScaling(double beta, double input_scale,
+ int input_integer_bits,
+ int32_t* quantized_multiplier, int* left_shift);
+
+// Calculate the largest input that will result in a within-bounds intermediate
+// result within MultiplyByQuantizedMultiplierGreaterThanOne. In other words,
+// it must not overflow before we reduce the value by multiplication by the
+// input multiplier. The negative radius is used as the minimum difference
+// in Softmax.
+int CalculateInputRadius(int input_integer_bits, int input_left_shift);
+
+} // namespace tflite
+
+#endif // PHOTOS_VISION_LEARNING_TENSORFLOW_MINI_QUANTIZATION_UTIL_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/quantization_util_test.cc b/tensorflow/contrib/lite/kernels/internal/quantization_util_test.cc
new file mode 100644
index 0000000000..d6f306e2cb
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/internal/quantization_util_test.cc
@@ -0,0 +1,108 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/kernels/internal/quantization_util.h"
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+
+namespace tflite {
+namespace {
+
+using ::testing::Pair;
+
+TEST(QuantizationUtilTest, QuantizeMultiplierSmallerThanOne) {
+ auto quantize = [](double d) {
+ int32_t q;
+ int s;
+ QuantizeMultiplierSmallerThanOne(d, &q, &s);
+ return std::pair<int32_t, int>{q, s};
+ };
+
+ EXPECT_DEATH(quantize(-0.1), "");
+ EXPECT_THAT(quantize(0.0), Pair(0, 0));
+ EXPECT_THAT(quantize(0.25), Pair(1073741824, 1));
+
+ // Around 0.5 we can see the change in exponent and how we try hard to
+ // avoid hitting max int32.
+ EXPECT_THAT(quantize(0.50 - 5e-9), Pair(2147483627, 1));
+ EXPECT_THAT(quantize(0.50 - 1e-10), Pair(1073741824, 0));
+ EXPECT_THAT(quantize(0.50), Pair(1073741824, 0));
+
+ EXPECT_THAT(quantize(0.75), Pair(1610612736, 0));
+ EXPECT_THAT(quantize(1 - 1e-9), Pair(2147483646, 0));
+
+ // If we get close enough to 1.0 it crashes and dies in one of two ways:
+ // Either the shift becomes negative or we trigger the 'less-than-one' CHECK.
+ EXPECT_DEATH(quantize(1 - 1e-15), "");
+ EXPECT_DEATH(quantize(1 - 1e-17), "");
+ EXPECT_DEATH(quantize(1.0), "");
+}
+
+TEST(QuantizationUtilTest, QuantizeMultiplierGreaterThanOne) {
+ auto quantize = [](double d) {
+ int32_t q;
+ int s;
+ QuantizeMultiplierGreaterThanOne(d, &q, &s);
+ return std::pair<int32_t, int>{q, s};
+ };
+
+ // If we are close enough to 1.0 it crashes.
+ EXPECT_DEATH(quantize(1 + 1e-16), "");
+
+ EXPECT_THAT(quantize(1 + 1e-11), Pair(1073741824, 1));
+ EXPECT_THAT(quantize(1.25), Pair(1342177280, 1));
+ EXPECT_THAT(quantize(1.50), Pair(1610612736, 1));
+ EXPECT_THAT(quantize(1.75), Pair(1879048192, 1));
+
+ // Around the powers of two we see the change in exponent. Also,
+ // we try hard to avoid hitting max int32.
+ EXPECT_THAT(quantize(2 - 1e-9), Pair(2147483647, 1));
+ EXPECT_THAT(quantize(2 - 1e-11), Pair(1073741824, 2));
+ EXPECT_THAT(quantize(2), Pair(1073741824, 2));
+}
+
+TEST(QuantizationUtilTest, PreprocessSoftmaxScaling) {
+ auto quantize = [](double beta, double scale, int integer_bits) {
+ int32_t q;
+ int s;
+ PreprocessSoftmaxScaling(beta, scale, integer_bits, &q, &s);
+ return std::pair<int32_t, int>{q, s};
+ };
+
+ // If beta * scale is larger than what fits in the number of integer bits,
+ // the result is moved near the maximum. Otherwise the inputs quantize as
+ // expected.
+ // With 4 integer bits we can represent up to 16.0.
+ EXPECT_THAT(quantize(1.0, 16.0, 4), Pair(2147483647, 31));
+ EXPECT_THAT(quantize(1.0, 8.0, 4), Pair(1073741824, 31));
+ // But with 5 bits we can go further.
+ EXPECT_THAT(quantize(2.0, 16.0, 5), Pair(2147483647, 31));
+ EXPECT_THAT(quantize(2.0, 8.0, 5), Pair(1073741824, 31));
+}
+
+TEST(QuantizationUtilTest, CalculateInputRadius) {
+ EXPECT_EQ(CalculateInputRadius(4, 27), 15);
+ EXPECT_EQ(CalculateInputRadius(3, 27), 14);
+ EXPECT_EQ(CalculateInputRadius(3, 28), 7);
+ EXPECT_EQ(CalculateInputRadius(4, 2), 503316480);
+}
+
+} // namespace
+} // namespace tflite
+
+int main(int argc, char** argv) {
+ // On Linux, add: tflite::LogToStderr();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/depthwiseconv_float.h b/tensorflow/contrib/lite/kernels/internal/reference/depthwiseconv_float.h
new file mode 100644
index 0000000000..8e0f234545
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/internal/reference/depthwiseconv_float.h
@@ -0,0 +1,115 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_DEPTHWISECONV_FLOAT_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_DEPTHWISECONV_FLOAT_H_
+
+#include "tensorflow/contrib/lite/kernels/internal/common.h"
+#include "tensorflow/contrib/lite/kernels/internal/compatibility.h"
+#include "tensorflow/contrib/lite/kernels/internal/types.h"
+
+namespace tflite {
+namespace reference_ops {
+
+inline void DepthwiseConv(const float* input_data, const Dims<4>& input_dims,
+ const float* filter_data, const Dims<4>& filter_dims,
+ const float* bias_data, const Dims<4>& bias_dims,
+ int stride_width, int stride_height, int pad_width,
+ int pad_height, int depth_multiplier,
+ float output_activation_min,
+ float output_activation_max, float* output_data,
+ const Dims<4>& output_dims) {
+ const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+ const int output_depth = MatchingArraySize(filter_dims, 0, output_dims, 0);
+ const int input_height = ArraySize(input_dims, 2);
+ const int input_width = ArraySize(input_dims, 1);
+ const int input_depth = ArraySize(input_dims, 0);
+ const int filter_height = ArraySize(filter_dims, 2);
+ const int filter_width = ArraySize(filter_dims, 1);
+ const int output_height = ArraySize(output_dims, 2);
+ const int output_width = ArraySize(output_dims, 1);
+ TFLITE_DCHECK(output_depth == input_depth * depth_multiplier);
+
+ for (int b = 0; b < batches; ++b) {
+ for (int out_y = 0; out_y < output_height; ++out_y) {
+ for (int out_x = 0; out_x < output_width; ++out_x) {
+ for (int ic = 0; ic < input_depth; ++ic) {
+ for (int m = 0; m < depth_multiplier; m++) {
+ const int oc = m + ic * depth_multiplier;
+ const int in_x_origin = (out_x * stride_width) - pad_width;
+ const int in_y_origin = (out_y * stride_height) - pad_height;
+ float total = 0.f;
+ for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
+ for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
+ const int in_x = in_x_origin + filter_x;
+ const int in_y = in_y_origin + filter_y;
+ // If the location is outside the bounds of the input image,
+ // use zero as a default value.
+ if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
+ (in_y < input_height)) {
+ float input_value =
+ input_data[Offset(input_dims, ic, in_x, in_y, b)];
+ float filter_value = filter_data[Offset(
+ filter_dims, oc, filter_x, filter_y, 0)];
+ total += (input_value * filter_value);
+ }
+ }
+ }
+ float bias_value = 0.0f;
+ if (bias_data) {
+ bias_value = bias_data[Offset(bias_dims, oc, 0, 0, 0)];
+ }
+ output_data[Offset(output_dims, oc, out_x, out_y, b)] =
+ ActivationFunctionWithMinMax(total + bias_value,
+ output_activation_min,
+ output_activation_max);
+ }
+ }
+ }
+ }
+ }
+}
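+
+// Illustrative note on the loop structure above: each input channel ic feeds
+// depth_multiplier consecutive output channels, oc = ic * depth_multiplier +
+// m. For input_depth = 2 and depth_multiplier = 3 the output depth is 6;
+// output channels {0, 1, 2} are driven by input channel 0 and {3, 4, 5} by
+// input channel 1, each with its own filter slice
+// filter_data[Offset(filter_dims, oc, filter_x, filter_y, 0)].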
+
+// Legacy, for compatibility with old checked-in code.
+template <FusedActivationFunctionType Ac>
+void DepthwiseConv(const float* input_data, const Dims<4>& input_dims,
+ const float* filter_data, const Dims<4>& filter_dims,
+ const float* bias_data, const Dims<4>& bias_dims,
+ int stride_width, int stride_height, int pad_width,
+ int pad_height, int depth_multiplier, float* output_data,
+ const Dims<4>& output_dims) {
+ float output_activation_min, output_activation_max;
+ GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
+ DepthwiseConv(input_data, input_dims, filter_data, filter_dims, bias_data,
+ bias_dims, stride_width, stride_height, pad_width, pad_height,
+ depth_multiplier, output_activation_min, output_activation_max,
+ output_data, output_dims);
+}
+
+// Legacy, for compatibility with old checked-in code.
+template <FusedActivationFunctionType Ac>
+void DepthwiseConv(const float* input_data, const Dims<4>& input_dims,
+ const float* filter_data, const Dims<4>& filter_dims,
+ const float* bias_data, const Dims<4>& bias_dims, int stride,
+ int pad_width, int pad_height, int depth_multiplier,
+ float* output_data, const Dims<4>& output_dims) {
+ DepthwiseConv<Ac>(input_data, input_dims, filter_data, filter_dims, bias_data,
+ bias_dims, stride, stride, pad_width, pad_height,
+ depth_multiplier, output_data, output_dims);
+}
+
+} // end namespace reference_ops
+} // end namespace tflite
+
+#endif // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_DEPTHWISECONV_FLOAT_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/depthwiseconv_uint8.h b/tensorflow/contrib/lite/kernels/internal/reference/depthwiseconv_uint8.h
new file mode 100644
index 0000000000..8a80558b32
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/internal/reference/depthwiseconv_uint8.h
@@ -0,0 +1,138 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_DEPTHWISECONV_UINT8_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_DEPTHWISECONV_UINT8_H_
+
+#include <algorithm>
+
+#include "fixedpoint/fixedpoint.h"
+#include "public/gemmlowp.h"
+#include "tensorflow/contrib/lite/kernels/internal/common.h"
+#include "tensorflow/contrib/lite/kernels/internal/compatibility.h"
+#include "tensorflow/contrib/lite/kernels/internal/types.h"
+
+namespace tflite {
+namespace reference_ops {
+
+inline void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims,
+ int32 input_offset, const uint8* filter_data,
+ const Dims<4>& filter_dims, int32 filter_offset,
+ const int32* bias_data, const Dims<4>& bias_dims,
+ int stride_width, int stride_height, int pad_width,
+ int pad_height, int depth_multiplier,
+ int32 output_offset, int32 output_multiplier,
+ int output_shift, int32 output_activation_min,
+ int32 output_activation_max, uint8* output_data,
+ const Dims<4>& output_dims) {
+ const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+ const int output_depth = MatchingArraySize(filter_dims, 0, output_dims, 0);
+ const int input_height = ArraySize(input_dims, 2);
+ const int input_width = ArraySize(input_dims, 1);
+ const int input_depth = ArraySize(input_dims, 0);
+ const int filter_height = ArraySize(filter_dims, 2);
+ const int filter_width = ArraySize(filter_dims, 1);
+ const int output_height = ArraySize(output_dims, 2);
+ const int output_width = ArraySize(output_dims, 1);
+ TFLITE_DCHECK(output_depth == input_depth * depth_multiplier);
+
+ for (int b = 0; b < batches; ++b) {
+ for (int out_y = 0; out_y < output_height; ++out_y) {
+ for (int out_x = 0; out_x < output_width; ++out_x) {
+ for (int ic = 0; ic < input_depth; ++ic) {
+ for (int m = 0; m < depth_multiplier; m++) {
+ const int oc = m + ic * depth_multiplier;
+ const int in_x_origin = (out_x * stride_width) - pad_width;
+ const int in_y_origin = (out_y * stride_height) - pad_height;
+ int32 acc = 0;
+ for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
+ for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
+ const int in_x = in_x_origin + filter_x;
+ const int in_y = in_y_origin + filter_y;
+ // If the location is outside the bounds of the input image,
+ // use zero as a default value.
+ if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
+ (in_y < input_height)) {
+ int32 input_val =
+ input_data[Offset(input_dims, ic, in_x, in_y, b)];
+ int32 filter_val = filter_data[Offset(filter_dims, oc,
+ filter_x, filter_y, 0)];
+ acc +=
+ (filter_val + filter_offset) * (input_val + input_offset);
+ }
+ }
+ }
+ if (bias_data) {
+ acc += bias_data[Offset(bias_dims, oc, 0, 0, 0)];
+ }
+ acc = MultiplyByQuantizedMultiplierSmallerThanOne(
+ acc, output_multiplier, output_shift);
+ acc += output_offset;
+ acc = std::max(acc, output_activation_min);
+ acc = std::min(acc, output_activation_max);
+ output_data[Offset(output_dims, oc, out_x, out_y, b)] =
+ static_cast<uint8>(acc);
+ }
+ }
+ }
+ }
+ }
+}
+
+// Legacy, for compatibility with old checked-in code.
+template <FusedActivationFunctionType Ac>
+void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims,
+ int32 input_offset, const uint8* filter_data,
+ const Dims<4>& filter_dims, int32 filter_offset,
+ const int32* bias_data, const Dims<4>& bias_dims,
+ int stride_width, int stride_height, int pad_width,
+ int pad_height, int depth_multiplier, int32 output_offset,
+ int32 output_multiplier, int output_shift,
+ int32 output_activation_min, int32 output_activation_max,
+ uint8* output_data, const Dims<4>& output_dims) {
+ if (Ac == FusedActivationFunctionType::kNone) {
+ TFLITE_DCHECK_EQ(output_activation_min, 0);
+ TFLITE_DCHECK_EQ(output_activation_max, 255);
+ }
+ DepthwiseConv(input_data, input_dims, input_offset, filter_data, filter_dims,
+ filter_offset, bias_data, bias_dims, stride_width,
+ stride_height, pad_width, pad_height, depth_multiplier,
+ output_offset, output_multiplier, output_shift,
+ output_activation_min, output_activation_max, output_data,
+ output_dims);
+}
+
+// Legacy, for compatibility with old checked-in code.
+template <FusedActivationFunctionType Ac>
+void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims,
+ int32 input_offset, const uint8* filter_data,
+ const Dims<4>& filter_dims, int32 filter_offset,
+ const int32* bias_data, const Dims<4>& bias_dims, int stride,
+ int pad_width, int pad_height, int depth_multiplier,
+ int32 output_offset, int32 output_multiplier,
+ int output_shift, int32 output_activation_min,
+ int32 output_activation_max, uint8* output_data,
+ const Dims<4>& output_dims) {
+ DepthwiseConv<Ac>(input_data, input_dims, input_offset, filter_data,
+ filter_dims, filter_offset, bias_data, bias_dims, stride,
+ stride, pad_width, pad_height, depth_multiplier,
+ output_offset, output_multiplier, output_shift,
+ output_activation_min, output_activation_max, output_data,
+ output_dims);
+}
+
+} // end namespace reference_ops
+} // end namespace tflite
+
+#endif // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_DEPTHWISECONV_UINT8_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/portable_tensor_utils.cc b/tensorflow/contrib/lite/kernels/internal/reference/portable_tensor_utils.cc
new file mode 100644
index 0000000000..c5b0bccc9d
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/internal/reference/portable_tensor_utils.cc
@@ -0,0 +1,165 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <string.h>
+
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/kernels/activation_functor.h"
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace tensor_utils {
+
+float PortableClip(float f, float abs_limit) {
+ float result = (abs_limit < f) ? abs_limit : f;
+ result = (-abs_limit > result) ? -abs_limit : result;
+ return result;
+}
+
+void PortableMatrixBatchVectorMultiplyAccumulate(const float* matrix,
+ int m_rows, int m_cols,
+ const float* vector,
+ int n_batch, float* result,
+ int result_stride) {
+ float* result_in_batch = result;
+ for (int b = 0; b < n_batch; b++) {
+ const float* matrix_ptr = matrix;
+ for (int r = 0; r < m_rows; r++) {
+ const float* vector_in_batch = vector + b * m_cols;
+ for (int c = 0; c < m_cols; c++) {
+ *result_in_batch += *matrix_ptr++ * *vector_in_batch++;
+ }
+ result_in_batch += result_stride;
+ }
+ }
+}
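+
+// Hedged usage sketch (illustrative, not part of the original file): with
+// result_stride == 1 the result buffer holds n_batch blocks of m_rows
+// accumulators laid out batch-major, and the caller is expected to
+// initialize it (e.g. with a bias) before accumulating:
+//
+//   const float matrix[6] = {1, 0, 0,  0, 1, 0};   // 2 rows x 3 cols
+//   const float vectors[6] = {1, 2, 3,  4, 5, 6};  // 2 batches x 3 elements
+//   float result[4] = {0, 0, 0, 0};                // 2 batches x 2 rows
+//   PortableMatrixBatchVectorMultiplyAccumulate(
+//       matrix, /*m_rows=*/2, /*m_cols=*/3, vectors, /*n_batch=*/2, result,
+//       /*result_stride=*/1);
+//   // result is now {1, 2, 4, 5}.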
+
+void PortableVectorVectorCwiseProduct(const float* vector1,
+ const float* vector2, int v_size,
+ float* result) {
+ for (int v = 0; v < v_size; v++) {
+ *result++ = *vector1++ * *vector2++;
+ }
+}
+
+float PortableVectorVectorDotProduct(const float* vector1, const float* vector2,
+ int v_size) {
+ float result = 0.0;
+ for (int v = 0; v < v_size; v++) {
+ result += *vector1++ * *vector2++;
+ }
+ return result;
+}
+
+void PortableBatchVectorBatchVectorDotProduct(const float* vector1,
+ const float* vector2, int v_size,
+ int n_batch, float* result,
+ int result_stride) {
+ float* result_ptr = result;
+ const float* vector1_ptr = vector1;
+ const float* vector2_ptr = vector2;
+ for (int b = 0; b < n_batch; b++) {
+ *result_ptr =
+ PortableVectorVectorDotProduct(vector1_ptr, vector2_ptr, v_size);
+ vector1_ptr += v_size;
+ vector2_ptr += v_size;
+ result_ptr += result_stride;
+ }
+}
+
+void PortableVectorVectorCwiseProductAccumulate(const float* vector1,
+ const float* vector2,
+ int v_size, float* result) {
+ for (int v = 0; v < v_size; v++) {
+ *result++ += *vector1++ * *vector2++;
+ }
+}
+
+void PortableVectorBatchVectorCwiseProductAccumulate(const float* vector,
+ int v_size,
+ const float* batch_vector,
+ int n_batch,
+ float* result) {
+ for (int b = 0; b < n_batch; b++) {
+ for (int v = 0; v < v_size; v++) {
+ *result++ += vector[v] * *batch_vector++;
+ }
+ }
+}
+
+void PortableVectorBatchVectorAssign(const float* vector, int v_size,
+ int n_batch, float* batch_vector) {
+ for (int b = 0; b < n_batch; b++) {
+ memcpy(batch_vector + b * v_size, vector, v_size * sizeof(float));
+ }
+}
+
+void PortableApplySigmoidToVector(const float* vector, int v_size,
+ float* result) {
+ auto sigmoid_func = ActivationFunctor(kTfLiteActSigmoid);
+ for (int v = 0; v < v_size; v++) {
+ *result++ = (sigmoid_func)(*vector++);
+ }
+}
+
+void PortableApplyActivationToVector(const float* vector, int v_size,
+ TfLiteFusedActivation activation,
+ float* result) {
+ auto activation_func = ActivationFunctor(activation);
+ for (int v = 0; v < v_size; v++) {
+ *result++ = (activation_func)(*vector++);
+ }
+}
+
+void PortableCopyVector(const float* vector, int v_size, float* result) {
+ memcpy(result, vector, v_size * sizeof(float));
+}
+
+void PortableSub1Vector(const float* vector, int v_size, float* result) {
+ for (int v = 0; v < v_size; v++) {
+ *result++ = 1.0f - *vector++;
+ }
+}
+
+void PortableZeroVector(float* vector, int v_size) {
+ memset(vector, 0, v_size * sizeof(float));
+}
+
+void PortableClipVector(const float* vector, int v_size, float abs_limit,
+ float* result) {
+ for (int v = 0; v < v_size; v++) {
+ *result++ = PortableClip(*vector++, abs_limit);
+ }
+}
+
+void PortableVectorShiftLeft(float* vector, int v_size, float shift_value) {
+ TF_LITE_ASSERT(v_size > 0);
+ for (int i = 0; i < v_size - 1; i++) {
+ vector[i] = vector[i + 1];
+ }
+ vector[v_size - 1] = shift_value;
+}
+
+void PortableReductionSumVector(const float* input_vector, float* output_vector,
+ int output_size, int reduction_size) {
+ const float* input_vector_ptr = input_vector;
+ for (int o = 0; o < output_size; o++) {
+ for (int r = 0; r < reduction_size; r++) {
+ output_vector[o] += *input_vector_ptr++;
+ }
+ }
+}
+
+} // namespace tensor_utils
+} // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/portable_tensor_utils.h b/tensorflow/contrib/lite/kernels/internal/reference/portable_tensor_utils.h
new file mode 100644
index 0000000000..c2ab78000b
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/internal/reference/portable_tensor_utils.h
@@ -0,0 +1,189 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_PORTABLE_TENSOR_UTILS_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_PORTABLE_TENSOR_UTILS_H_
+
+// TODO(ghodrat): Remove this header file and the dependency on the internal
+// data structure.
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+
+namespace tflite {
+namespace tensor_utils {
+
+// Limit a float input f between +abs_limit and -abs_limit.
+float PortableClip(float f, float abs_limit);
+
+// Multiply a matrix by a batch vector, and store results in a batch-size
+// vector.
+void PortableMatrixBatchVectorMultiplyAccumulate(const float* matrix,
+ int m_rows, int m_cols,
+ const float* vector,
+ int n_batch, float* result,
+ int result_stride);
+
+// Cwise product of two vectors.
+void PortableVectorVectorCwiseProduct(const float* vector1,
+ const float* vector2, int v_size,
+ float* result);
+
+// Cwise product and accumulate of two vectors. Since it's a MAC operation, the
+// assumption here is that the result array is initialized to valid values.
+void PortableVectorVectorCwiseProductAccumulate(const float* vector1,
+ const float* vector2,
+ int v_size, float* result);
+
+// Dot product of two vectors.
+float PortableVectorVectorDotProduct(const float* vector1, const float* vector2,
+ int v_size);
+
+// Dot product of two batch vectors.
+void PortableBatchVectorBatchVectorDotProduct(const float* vector1,
+ const float* vector2, int v_size,
+ int n_batch, float* result,
+ int result_stride);
+
+// Cwise product and accumulate of a vector and a batch-vector. Since it's a MAC
+// operation, the assumption here is that the result array is initialized to
+// valid values.
+void PortableVectorBatchVectorCwiseProductAccumulate(const float* vector,
+ int v_size,
+ const float* batch_vector,
+ int n_batch,
+ float* result);
+
+// Batch vector initialization with another vector.
+void PortableVectorBatchVectorAssign(const float* vector, int v_size,
+ int n_batch, float* batch_vector);
+
+// Apply sigmoid to elements of a vector.
+void PortableApplySigmoidToVector(const float* vector, int v_size,
+ float* result);
+
+// Apply activation function to elements of a vector.
+void PortableApplyActivationToVector(const float* vector, int v_size,
+ TfLiteFusedActivation activation,
+ float* result);
+
+// Copy vector to another vector.
+void PortableCopyVector(const float* vector, int v_size, float* result);
+
+// Compute "1.0f - elements of vector" (used in CIFG).
+void PortableSub1Vector(const float* vector, int v_size, float* result);
+
+// Fill vector with 0.f.
+void PortableZeroVector(float* vector, int v_size);
+
+// Clip elements of a vector using an abs_limit value.
+void PortableClipVector(const float* vector, int v_size, float abs_limit,
+ float* result);
+
+// Shift a vector left in place by one element; shift_value fills the vacated
+// last position.
+void PortableVectorShiftLeft(float* vector, int v_size, float shift_value);
+
+// Reduce-sum on a float input vector:
+// input_vector: float pointer to input vector.
+// output_vector: float pointer to output vector.
+// output_size: output vector size.
+// reduction_size: number of consecutive elements from input vector which are
+// added to get one element of output.
+void PortableReductionSumVector(const float* input_vector, float* output_vector,
+ int output_size, int reduction_size);
+
+float Clip(float f, float abs_limit) { return PortableClip(f, abs_limit); }
+
+void MatrixBatchVectorMultiplyAccumulate(const float* matrix, int m_rows,
+ int m_cols, const float* vector,
+ int n_batch, float* result,
+ int result_stride) {
+ PortableMatrixBatchVectorMultiplyAccumulate(matrix, m_rows, m_cols, vector,
+ n_batch, result, result_stride);
+}
+
+void VectorVectorCwiseProduct(const float* vector1, const float* vector2,
+ int v_size, float* result) {
+ PortableVectorVectorCwiseProduct(vector1, vector2, v_size, result);
+}
+
+void VectorVectorCwiseProductAccumulate(const float* vector1,
+ const float* vector2, int v_size,
+ float* result) {
+ PortableVectorVectorCwiseProductAccumulate(vector1, vector2, v_size, result);
+}
+
+void VectorBatchVectorCwiseProductAccumulate(const float* vector, int v_size,
+ const float* batch_vector,
+ int n_batch, float* result) {
+ PortableVectorBatchVectorCwiseProductAccumulate(vector, v_size, batch_vector,
+ n_batch, result);
+}
+
+float VectorVectorDotProduct(const float* vector1, const float* vector2,
+ int v_size) {
+ return PortableVectorVectorDotProduct(vector1, vector2, v_size);
+}
+
+void BatchVectorBatchVectorDotProduct(const float* vector1,
+ const float* vector2, int v_size,
+ int n_batch, float* result,
+ int result_stride) {
+ PortableBatchVectorBatchVectorDotProduct(vector1, vector2, v_size, n_batch,
+ result, result_stride);
+}
+
+void VectorBatchVectorAssign(const float* vector, int v_size, int n_batch,
+ float* batch_vector) {
+ PortableVectorBatchVectorAssign(vector, v_size, n_batch, batch_vector);
+}
+
+void ApplySigmoidToVector(const float* vector, int v_size, float* result) {
+ PortableApplySigmoidToVector(vector, v_size, result);
+}
+
+void ApplyActivationToVector(const float* vector, int v_size,
+ TfLiteFusedActivation activation, float* result) {
+ PortableApplyActivationToVector(vector, v_size, activation, result);
+}
+
+void CopyVector(const float* vector, int v_size, float* result) {
+ PortableCopyVector(vector, v_size, result);
+}
+
+void Sub1Vector(const float* vector, int v_size, float* result) {
+ PortableSub1Vector(vector, v_size, result);
+}
+
+void ZeroVector(float* vector, int v_size) {
+ PortableZeroVector(vector, v_size);
+}
+
+void ClipVector(const float* vector, int v_size, float abs_limit,
+ float* result) {
+ PortableClipVector(vector, v_size, abs_limit, result);
+}
+
+void VectorShiftLeft(float* vector, int v_size, float shift_value) {
+ PortableVectorShiftLeft(vector, v_size, shift_value);
+}
+
+void ReductionSumVector(const float* input_vector, float* output_vector,
+ int output_size, int reduction_size) {
+ PortableReductionSumVector(input_vector, output_vector, output_size,
+ reduction_size);
+}
+
+} // namespace tensor_utils
+} // namespace tflite
+
+#endif // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_PORTABLE_TENSOR_UTILS_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
new file mode 100644
index 0000000000..b9ca3d5c62
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
@@ -0,0 +1,2455 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_REFERENCE_OPS_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_REFERENCE_OPS_H_
+
+#include <stdint.h>
+#include <sys/types.h>
+#include <algorithm>
+#include <cmath>
+#include <limits>
+#include <memory>
+#include <type_traits>
+
+#include "third_party/eigen3/Eigen/Core"
+#include "fixedpoint/fixedpoint.h"
+#include "public/gemmlowp.h"
+#include "tensorflow/contrib/lite/kernels/internal/common.h"
+#include "tensorflow/contrib/lite/kernels/internal/round.h"
+#include "tensorflow/contrib/lite/kernels/internal/types.h"
+
+namespace tflite {
+namespace reference_ops {
+
+inline int32 MultiplyByQuantizedMultiplierSmallerThanOne(
+ int32 x, int32 quantized_multiplier, int right_shift) {
+ using gemmlowp::RoundingDivideByPOT;
+ using gemmlowp::SaturatingRoundingDoublingHighMul;
+ return RoundingDivideByPOT(
+ SaturatingRoundingDoublingHighMul(x, quantized_multiplier), right_shift);
+}
+
+inline int32 MultiplyByQuantizedMultiplierGreaterThanOne(
+ int32 x, int32 quantized_multiplier, int left_shift) {
+ using gemmlowp::SaturatingRoundingDoublingHighMul;
+ return SaturatingRoundingDoublingHighMul(x * (1 << left_shift),
+ quantized_multiplier);
+}
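+
+// Worked example (for illustration): a real multiplier of 0.25 quantizes to
+// (quantized_multiplier = 1073741824, right_shift = 1), so for x = 1000 the
+// doubling high mul gives round(1000 * 2^30 / 2^31) = 500 and the rounding
+// shift yields 250 = 1000 * 0.25. A multiplier of 2.0 quantizes to
+// (1073741824, left_shift = 2): for x = 100, 100 * (1 << 2) = 400 and the
+// high mul gives 200 = 100 * 2.0.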
+
+template <typename T>
+int CountLeadingZeros(T integer_input) {
+ static_assert(std::is_unsigned<T>::value,
+ "Only unsigned integer types handled.");
+ const T one_in_leading_positive = static_cast<T>(1)
+ << (std::numeric_limits<T>::digits - 1);
+ int leading_zeros = 0;
+ while (integer_input < one_in_leading_positive) {
+ integer_input <<= 1;
+ ++leading_zeros;
+ }
+ return leading_zeros;
+}
+
+// DO NOT USE THIS STRUCT FOR NEW FUNCTIONALITY BEYOND IMPLEMENTING ELEMENT-WISE
+// BROADCASTING.
+//
+// NdArrayDesc<N> describes the shape and memory layout of an N-dimensional
+// rectangular array of numbers.
+//
+// NdArrayDesc<N> is basically identical to Dims<N> defined in types.h.
+// However, as Dims<N> is to be deprecated, this class exists as an adaptor
+// to enable simple unoptimized implementations of element-wise broadcasting
+// operations.
+template <int N>
+struct NdArrayDesc {
+ // The "extent" of each dimension. Indices along dimension d must be in the
+ // half-open interval [0, extents[d]).
+ int extents[N];
+
+ // The number of *elements* (not bytes) between consecutive indices of each
+ // dimension.
+ int strides[N];
+};
+
+// DO NOT USE THIS FUNCTION FOR NEW FUNCTIONALITY BEYOND IMPLEMENTING
+// ELEMENT-WISE BROADCASTING.
+//
+// Same as Offset(), except it takes an NdArrayDesc<N> instead of Dims<N>.
+inline int SubscriptToIndex(const NdArrayDesc<4>& desc, int i0, int i1, int i2,
+ int i3) {
+ TFLITE_DCHECK(i0 >= 0 && i0 < desc.extents[0]);
+ TFLITE_DCHECK(i1 >= 0 && i1 < desc.extents[1]);
+ TFLITE_DCHECK(i2 >= 0 && i2 < desc.extents[2]);
+ TFLITE_DCHECK(i3 >= 0 && i3 < desc.extents[3]);
+ return i0 * desc.strides[0] + i1 * desc.strides[1] + i2 * desc.strides[2] +
+ i3 * desc.strides[3];
+}
+
+// Given the dimensions of the operands for an element-wise binary broadcast,
+// adjusts them so that they can be directly iterated over with simple loops.
+// Returns the adjusted dims as instances of NdArrayDesc in 'desc0_out' and
+// 'desc1_out'. 'desc0_out' and 'desc1_out' cannot be nullptr.
+//
+// This function assumes that the two input shapes are compatible up to
+// broadcasting and the shorter one has already been prepended with 1s to be the
+// same length. E.g., if shape0 is (1, 16, 16, 64) and shape1 is (1, 64),
+// shape1 must already have been prepended to be (1, 1, 1, 64). Recall that
+// Dims<N> refer to shapes in reverse order. In this case, input0_dims will be
+// (64, 16, 16, 1) and input1_dims will be (64, 1, 1, 1).
+//
+// When two shapes are compatible up to broadcasting, for each dimension d,
+// the input extents are either equal, or one of them is 1.
+//
+// This function performs the following for each dimension d:
+// - If the extents are equal, then do nothing since the loop that walks over
+// both of the input arrays is correct.
+// - Otherwise, one (and only one) of the extents must be 1. Say extent0 is 1
+// and extent1 is e1. Then set extent0 to e1 and stride0 *to 0*. This allows
+// array0 to be referenced *at any index* in dimension d and still access the
+// same slice.
+template <int N>
+inline void NdArrayDescsForElementwiseBroadcast(const Dims<N>& input0_dims,
+ const Dims<N>& input1_dims,
+ NdArrayDesc<N>* desc0_out,
+ NdArrayDesc<N>* desc1_out) {
+ TFLITE_DCHECK(desc0_out != nullptr);
+ TFLITE_DCHECK(desc1_out != nullptr);
+
+ // Copy dims to desc.
+ for (int i = 0; i < N; ++i) {
+ desc0_out->extents[i] = input0_dims.sizes[i];
+ desc0_out->strides[i] = input0_dims.strides[i];
+ desc1_out->extents[i] = input1_dims.sizes[i];
+ desc1_out->strides[i] = input1_dims.strides[i];
+ }
+
+ // Walk over each dimension. If the extents are equal do nothing.
+ // Otherwise, set the desc with extent 1 to have extent equal to the other and
+ // stride 0.
+ for (int i = 0; i < N; ++i) {
+ const int extent0 = ArraySize(input0_dims, i);
+ const int extent1 = ArraySize(input1_dims, i);
+ if (extent0 != extent1) {
+ if (extent0 == 1) {
+ desc0_out->strides[i] = 0;
+ desc0_out->extents[i] = extent1;
+ } else {
+ TFLITE_DCHECK_EQ(extent1, 1);
+ desc1_out->strides[i] = 0;
+ desc1_out->extents[i] = extent0;
+ }
+ }
+ }
+}
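+
+// Worked example (for illustration), continuing the (1, 16, 16, 64) vs
+// (1, 1, 1, 64) case from the comment above: input1_dims has sizes
+// {64, 1, 1, 1}, so for the width and height dimensions its extent is 1
+// while input0's is 16. The loop sets desc1's strides for those dimensions
+// to 0 and its extents to 16, and SubscriptToIndex(desc1, c, x, y, b) then
+// reduces to just c for this shape: the spatial strides are 0 and the only
+// batch index is 0, so the 64-element operand is re-read at every (x, y)
+// position, which implements the broadcast.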
+
+inline void Conv(const float* input_data, const Dims<4>& input_dims,
+ const float* filter_data, const Dims<4>& filter_dims,
+ const float* bias_data, const Dims<4>& bias_dims,
+ int stride_width, int stride_height, int pad_width,
+ int pad_height, float output_activation_min,
+ float output_activation_max, float* output_data,
+ const Dims<4>& output_dims, float* im2col_data,
+ const Dims<4>& im2col_dims) {
+ (void)im2col_data; // only used in optimized code.
+ (void)im2col_dims; // only used in optimized code.
+ const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+ const int input_depth = MatchingArraySize(input_dims, 0, filter_dims, 0);
+ const int output_depth = MatchingArraySize(filter_dims, 3, output_dims, 0);
+ if (bias_data) {
+ TFLITE_DCHECK_EQ(ArraySize(filter_dims, 3), ArraySize(bias_dims, 0));
+ }
+ const int input_height = ArraySize(input_dims, 2);
+ const int input_width = ArraySize(input_dims, 1);
+ const int filter_height = ArraySize(filter_dims, 2);
+ const int filter_width = ArraySize(filter_dims, 1);
+ const int output_height = ArraySize(output_dims, 2);
+ const int output_width = ArraySize(output_dims, 1);
+ for (int batch = 0; batch < batches; ++batch) {
+ for (int out_y = 0; out_y < output_height; ++out_y) {
+ for (int out_x = 0; out_x < output_width; ++out_x) {
+ for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
+ const int in_x_origin = (out_x * stride_width) - pad_width;
+ const int in_y_origin = (out_y * stride_height) - pad_height;
+ float total = 0.f;
+ for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
+ for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
+ for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
+ const int in_x = in_x_origin + filter_x;
+ const int in_y = in_y_origin + filter_y;
+ // If the location is outside the bounds of the input image,
+ // use zero as a default value.
+ if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
+ (in_y < input_height)) {
+ float input_value = input_data[Offset(input_dims, in_channel,
+ in_x, in_y, batch)];
+ float filter_value =
+ filter_data[Offset(filter_dims, in_channel, filter_x,
+ filter_y, out_channel)];
+ total += (input_value * filter_value);
+ }
+ }
+ }
+ }
+ float bias_value = 0.0f;
+ if (bias_data) {
+ bias_value = bias_data[Offset(bias_dims, out_channel, 0, 0, 0)];
+ }
+ output_data[Offset(output_dims, out_channel, out_x, out_y, batch)] =
+ ActivationFunctionWithMinMax(total + bias_value,
+ output_activation_min,
+ output_activation_max);
+ }
+ }
+ }
+ }
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void Conv(const float* input_data, const Dims<4>& input_dims,
+ const float* filter_data, const Dims<4>& filter_dims,
+ const float* bias_data, const Dims<4>& bias_dims, int stride_width,
+ int stride_height, int pad_width, int pad_height, float* output_data,
+ const Dims<4>& output_dims, float* im2col_data,
+ const Dims<4>& im2col_dims) {
+ float output_activation_min, output_activation_max;
+ GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
+ Conv(input_data, input_dims, filter_data, filter_dims, bias_data, bias_dims,
+ stride_width, stride_height, pad_width, pad_height,
+ output_activation_min, output_activation_max, output_data, output_dims,
+ im2col_data, im2col_dims);
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void Conv(const float* input_data, const Dims<4>& input_dims,
+ const float* filter_data, const Dims<4>& filter_dims,
+ const float* bias_data, const Dims<4>& bias_dims, int stride,
+ int pad_width, int pad_height, float* output_data,
+ const Dims<4>& output_dims, float* im2col_data,
+ const Dims<4>& im2col_dims) {
+ Conv<Ac>(input_data, input_dims, filter_data, filter_dims, bias_data,
+ bias_dims, stride, stride, pad_width, pad_height, output_data,
+ output_dims, im2col_data, im2col_dims);
+}
+
+inline void Conv(const uint8* input_data, const Dims<4>& input_dims,
+ int32 input_offset, const uint8* filter_data,
+ const Dims<4>& filter_dims, int32 filter_offset,
+ const int32* bias_data, const Dims<4>& bias_dims,
+ int stride_width, int stride_height, int pad_width,
+ int pad_height, int32 output_offset, int32 output_multiplier,
+ int output_shift, int32 output_activation_min,
+ int32 output_activation_max, uint8* output_data,
+ const Dims<4>& output_dims, uint8* im2col_data,
+ const Dims<4>& im2col_dims,
+ gemmlowp::GemmContext* gemm_context) {
+ (void)im2col_data; // only used in optimized code.
+ (void)im2col_dims; // only used in optimized code.
+ (void)gemm_context; // only used in optimized code.
+ TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+ const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+ const int input_depth = MatchingArraySize(input_dims, 0, filter_dims, 0);
+ const int output_depth =
+ MatchingArraySize(filter_dims, 3, bias_dims, 0, output_dims, 0);
+ const int input_height = ArraySize(input_dims, 2);
+ const int input_width = ArraySize(input_dims, 1);
+ const int filter_height = ArraySize(filter_dims, 2);
+ const int filter_width = ArraySize(filter_dims, 1);
+ const int output_height = ArraySize(output_dims, 2);
+ const int output_width = ArraySize(output_dims, 1);
+ for (int batch = 0; batch < batches; ++batch) {
+ for (int out_y = 0; out_y < output_height; ++out_y) {
+ for (int out_x = 0; out_x < output_width; ++out_x) {
+ for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
+ const int in_x_origin = (out_x * stride_width) - pad_width;
+ const int in_y_origin = (out_y * stride_height) - pad_height;
+ int32 acc = 0;
+ for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
+ for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
+ for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
+ const int in_x = in_x_origin + filter_x;
+ const int in_y = in_y_origin + filter_y;
+ // If the location is outside the bounds of the input image,
+ // use zero as a default value.
+ if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
+ (in_y < input_height)) {
+ int32 input_val = input_data[Offset(input_dims, in_channel,
+ in_x, in_y, batch)];
+ int32 filter_val =
+ filter_data[Offset(filter_dims, in_channel, filter_x,
+ filter_y, out_channel)];
+ acc +=
+ (filter_val + filter_offset) * (input_val + input_offset);
+ }
+ }
+ }
+ }
+ if (bias_data) {
+ acc += bias_data[Offset(bias_dims, out_channel, 0, 0, 0)];
+ }
+ acc = MultiplyByQuantizedMultiplierSmallerThanOne(
+ acc, output_multiplier, output_shift);
+ acc += output_offset;
+ acc = std::max(acc, output_activation_min);
+ acc = std::min(acc, output_activation_max);
+ output_data[Offset(output_dims, out_channel, out_x, out_y, batch)] =
+ static_cast<uint8>(acc);
+ }
+ }
+ }
+ }
+}
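+
+// A sketch of the quantization arithmetic above, assuming the usual affine
+// convention real = scale * (quantized - zero_point) and offsets passed in as
+// the negated zero points (a caller convention, not enforced here):
+//   real_acc = sum_i (input_scale * (q_in_i - in_zp)) *
+//                    (filter_scale * (q_filt_i - filt_zp))
+//            = input_scale * filter_scale *
+//              sum_i (q_in_i + input_offset) * (q_filt_i + filter_offset)
+// The integer accumulator (plus bias) is then rescaled to the output scale by
+// the (output_multiplier, output_shift) fixed-point pair, re-centered by
+// output_offset, and clamped to the activation range [min, max].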
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+inline void Conv(const uint8* input_data, const Dims<4>& input_dims,
+ int32 input_offset, const uint8* filter_data,
+ const Dims<4>& filter_dims, int32 filter_offset,
+ const int32* bias_data, const Dims<4>& bias_dims,
+ int stride_width, int stride_height, int pad_width,
+ int pad_height, int32 output_offset, int32 output_multiplier,
+ int output_shift, int32 output_activation_min,
+ int32 output_activation_max, uint8* output_data,
+ const Dims<4>& output_dims, uint8* im2col_data,
+ const Dims<4>& im2col_dims,
+ gemmlowp::GemmContext* gemm_context) {
+ static_assert(Ac == FusedActivationFunctionType::kNone ||
+ Ac == FusedActivationFunctionType::kRelu ||
+ Ac == FusedActivationFunctionType::kRelu6 ||
+ Ac == FusedActivationFunctionType::kRelu1,
+ "");
+ if (Ac == FusedActivationFunctionType::kNone) {
+ TFLITE_DCHECK_EQ(output_activation_min, 0);
+ TFLITE_DCHECK_EQ(output_activation_max, 255);
+ }
+ Conv(input_data, input_dims, input_offset, filter_data, filter_dims,
+ filter_offset, bias_data, bias_dims, stride_width, stride_height,
+ pad_width, pad_height, output_offset, output_multiplier, output_shift,
+ output_activation_min, output_activation_max, output_data, output_dims,
+ im2col_data, im2col_dims, gemm_context);
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void Conv(const uint8* input_data, const Dims<4>& input_dims,
+ int32 input_offset, const uint8* filter_data,
+ const Dims<4>& filter_dims, int32 filter_offset,
+ const int32* bias_data, const Dims<4>& bias_dims, int stride,
+ int pad_width, int pad_height, int32 output_offset,
+ int32 output_multiplier, int output_shift,
+ int32 output_activation_min, int32 output_activation_max,
+ uint8* output_data, const Dims<4>& output_dims, uint8* im2col_data,
+ const Dims<4>& im2col_dims, gemmlowp::GemmContext* gemm_context) {
+ Conv<Ac>(input_data, input_dims, input_offset, filter_data, filter_dims,
+ filter_offset, bias_data, bias_dims, stride, stride, pad_width,
+ pad_height, output_offset, output_multiplier, output_shift,
+ output_activation_min, output_activation_max, output_data,
+ output_dims, im2col_data, im2col_dims, gemm_context);
+}
+
+template <typename T>
+inline void DepthToSpace(const T* input_data, const Dims<4>& input_dims,
+ int block_size, T* output_data,
+ const Dims<4>& output_dims) {
+ const int input_depth = ArraySize(input_dims, 0);
+ const int input_width = ArraySize(input_dims, 1);
+ const int input_height = ArraySize(input_dims, 2);
+ const int input_batch = ArraySize(input_dims, 3);
+
+ const int output_depth = ArraySize(output_dims, 0);
+ const int output_width = ArraySize(output_dims, 1);
+ const int output_height = ArraySize(output_dims, 2);
+ const int output_batch = ArraySize(output_dims, 3);
+
+ TFLITE_DCHECK_EQ(input_width * block_size, output_width);
+ TFLITE_DCHECK_EQ(input_height * block_size, output_height);
+ TFLITE_DCHECK_EQ(input_depth, output_depth * block_size * block_size);
+ TFLITE_DCHECK_EQ(input_batch, output_batch);
+
+ for (int out_b = 0; out_b < output_batch; ++out_b) {
+ for (int out_h = 0; out_h < output_height; ++out_h) {
+ for (int out_w = 0; out_w < output_width; ++out_w) {
+ for (int out_d = 0; out_d < output_depth; ++out_d) {
+ const int in_d =
+ out_d + ((out_h % block_size) * block_size + out_w % block_size) *
+ output_depth;
+ const int in_w = out_w / block_size;
+ const int in_h = out_h / block_size;
+ const int in_b = out_b;
+
+ const int output_index =
+ Offset(output_dims, out_d, out_w, out_h, out_b);
+ const int input_index = Offset(input_dims, in_d, in_w, in_h, in_b);
+
+ output_data[output_index] = input_data[input_index];
+ }
+ }
+ }
+ }
+}
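+
+// Worked example of the index arithmetic above (shapes chosen only for
+// illustration): block_size = 2, TensorFlow input shape (1, 2, 2, 8) and
+// output shape (1, 4, 4, 2). For (out_h, out_w, out_d) = (1, 1, 0):
+//   in_d = 0 + ((1 % 2) * 2 + 1 % 2) * 2 = 6,  in_h = 0,  in_w = 0,
+// i.e. the bottom-right pixel of each 2x2 output block reads from the last
+// channel group of the corresponding input pixel. SpaceToDepth below inverts
+// this mapping.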
+
+template <typename T>
+inline void SpaceToDepth(const T* input_data, const Dims<4>& input_dims,
+ int block_size, T* output_data,
+ const Dims<4>& output_dims) {
+ const int input_depth = ArraySize(input_dims, 0);
+ const int input_width = ArraySize(input_dims, 1);
+ const int input_height = ArraySize(input_dims, 2);
+ const int input_batch = ArraySize(input_dims, 3);
+
+ const int output_depth = ArraySize(output_dims, 0);
+ const int output_width = ArraySize(output_dims, 1);
+ const int output_height = ArraySize(output_dims, 2);
+ const int output_batch = ArraySize(output_dims, 3);
+
+ TFLITE_DCHECK_EQ(input_width, output_width * block_size);
+ TFLITE_DCHECK_EQ(input_height, output_height * block_size);
+ TFLITE_DCHECK_EQ(input_depth * block_size * block_size, output_depth);
+ TFLITE_DCHECK_EQ(input_batch, output_batch);
+
+ for (int in_b = 0; in_b < input_batch; ++in_b) {
+ for (int in_h = 0; in_h < input_height; ++in_h) {
+ for (int in_w = 0; in_w < input_width; ++in_w) {
+ for (int in_d = 0; in_d < input_depth; ++in_d) {
+ const int out_d =
+ in_d + ((in_h % block_size) * block_size + in_w % block_size) *
+ input_depth;
+ const int out_w = in_w / block_size;
+ const int out_h = in_h / block_size;
+ const int out_b = in_b;
+
+ const int output_index =
+ Offset(output_dims, out_d, out_w, out_h, out_b);
+ const int input_index = Offset(input_dims, in_d, in_w, in_h, in_b);
+
+ output_data[output_index] = input_data[input_index];
+ }
+ }
+ }
+ }
+}
+
+inline void FullyConnected(const float* input_data, const Dims<4>& input_dims,
+ const float* weights_data,
+ const Dims<4>& weights_dims, const float* bias_data,
+ const Dims<4>& bias_dims,
+ float output_activation_min,
+ float output_activation_max, float* output_data,
+ const Dims<4>& output_dims) {
+ // TODO(benoitjacob): This really should be:
+ // const int batches = ArraySize(output_dims, 1);
+ // but the current --variable_batch hack consists of overwriting the 3rd
+ // dimension with the runtime batch size, as we don't keep track, for each
+ // array, of which dimension is its batch dimension.
+ const int batches = ArraySize(output_dims, 1) * ArraySize(output_dims, 2) *
+ ArraySize(output_dims, 3);
+ const int output_depth = MatchingArraySize(weights_dims, 1, output_dims, 0);
+ const int accum_depth = ArraySize(weights_dims, 0);
+ TFLITE_DCHECK(IsPackedWithoutStrides(input_dims));
+ TFLITE_DCHECK(IsPackedWithoutStrides(weights_dims));
+ for (int b = 0; b < batches; ++b) {
+ for (int out_c = 0; out_c < output_depth; ++out_c) {
+ float total = 0.f;
+ for (int d = 0; d < accum_depth; ++d) {
+ total += input_data[b * accum_depth + d] *
+ weights_data[out_c * accum_depth + d];
+ }
+ float bias_value = 0.0f;
+ if (bias_data) {
+ bias_value = bias_data[Offset(bias_dims, out_c, 0, 0, 0)];
+ }
+ output_data[out_c + output_depth * b] = ActivationFunctionWithMinMax(
+ total + bias_value, output_activation_min, output_activation_max);
+ }
+ }
+}
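+
+// Illustrative note on the batch handling above (example shapes assumed, not
+// required by this header): with weights_dims sizes {accum_depth,
+// output_depth, 1, 1}, input_dims sizes {accum_depth, batches, 1, 1} and
+// output_dims sizes {output_depth, batches, 1, 1}, the product of output
+// dims 1..3 recovers `batches`, which is what the --variable_batch hack
+// described in the TODO relies on.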
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void FullyConnected(const float* input_data, const Dims<4>& input_dims,
+ const float* weights_data, const Dims<4>& weights_dims,
+ const float* bias_data, const Dims<4>& bias_dims,
+ float* output_data, const Dims<4>& output_dims) {
+ float output_activation_min, output_activation_max;
+ GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
+ FullyConnected(input_data, input_dims, weights_data, weights_dims, bias_data,
+ bias_dims, output_activation_min, output_activation_max,
+ output_data, output_dims);
+}
+
+inline void FullyConnected(const uint8* input_data, const Dims<4>& input_dims,
+ int32 input_offset, const uint8* filter_data,
+ const Dims<4>& filter_dims, int32 filter_offset,
+ const int32* bias_data, const Dims<4>& bias_dims,
+ int32 output_offset, int32 output_multiplier,
+ int output_shift, int32 output_activation_min,
+ int32 output_activation_max, uint8* output_data,
+ const Dims<4>& output_dims,
+ gemmlowp::GemmContext* gemm_context) {
+ (void)gemm_context; // only used in optimized code.
+ TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+ // TODO(benoitjacob): This really should be:
+ // const int batches = ArraySize(output_dims, 1);
+ // but the current --variable_batch hack consists of overwriting the 3rd
+ // dimension with the runtime batch size, as we don't keep track, for each
+ // array, of which dimension is its batch dimension.
+ const int batches = ArraySize(output_dims, 1) * ArraySize(output_dims, 2) *
+ ArraySize(output_dims, 3);
+ const int output_depth = MatchingArraySize(filter_dims, 1, output_dims, 0);
+ const int accum_depth = ArraySize(filter_dims, 0);
+ TFLITE_DCHECK(IsPackedWithoutStrides(input_dims));
+ TFLITE_DCHECK(IsPackedWithoutStrides(filter_dims));
+ for (int b = 0; b < batches; ++b) {
+ for (int out_c = 0; out_c < output_depth; ++out_c) {
+ int32 acc = 0;
+ for (int d = 0; d < accum_depth; ++d) {
+ int32 input_val = input_data[b * accum_depth + d];
+ int32 filter_val = filter_data[out_c * accum_depth + d];
+ acc += (filter_val + filter_offset) * (input_val + input_offset);
+ }
+ if (bias_data) {
+ acc += bias_data[Offset(bias_dims, out_c, 0, 0, 0)];
+ }
+ acc = MultiplyByQuantizedMultiplierSmallerThanOne(acc, output_multiplier,
+ output_shift);
+ acc += output_offset;
+ acc = std::max(acc, output_activation_min);
+ acc = std::min(acc, output_activation_max);
+ output_data[out_c + output_depth * b] = static_cast<uint8>(acc);
+ }
+ }
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void FullyConnected(const uint8* input_data, const Dims<4>& input_dims,
+ int32 input_offset, const uint8* filter_data,
+ const Dims<4>& filter_dims, int32 filter_offset,
+ const int32* bias_data, const Dims<4>& bias_dims,
+ int32 output_offset, int32 output_multiplier,
+ int output_shift, int32 output_activation_min,
+ int32 output_activation_max, uint8* output_data,
+ const Dims<4>& output_dims,
+ gemmlowp::GemmContext* gemm_context) {
+ static_assert(Ac == FusedActivationFunctionType::kNone ||
+ Ac == FusedActivationFunctionType::kRelu ||
+ Ac == FusedActivationFunctionType::kRelu6 ||
+ Ac == FusedActivationFunctionType::kRelu1,
+ "");
+ if (Ac == FusedActivationFunctionType::kNone) {
+ TFLITE_DCHECK_EQ(output_activation_min, 0);
+ TFLITE_DCHECK_EQ(output_activation_max, 255);
+ }
+ FullyConnected(input_data, input_dims, input_offset, filter_data, filter_dims,
+ filter_offset, bias_data, bias_dims, output_offset,
+ output_multiplier, output_shift, output_activation_min,
+ output_activation_max, output_data, output_dims, gemm_context);
+}
+
+template <FusedActivationFunctionType Ac>
+void NonGlobalBatchNormalization(
+ const float* input_data, const Dims<4>& input_dims, const float* mean_data,
+ const Dims<4>& mean_dims, const float* multiplier_data,
+ const Dims<4>& multiplier_dims, const float* offset_data,
+ const Dims<4>& offset_dims, float* output_data,
+ const Dims<4>& output_dims) {
+ const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+ const int height =
+ MatchingArraySize(input_dims, 2, mean_dims, 2, multiplier_dims, 2,
+ offset_dims, 2, output_dims, 2);
+ const int width =
+ MatchingArraySize(input_dims, 1, mean_dims, 1, multiplier_dims, 1,
+ offset_dims, 1, output_dims, 1);
+ const int depth =
+ MatchingArraySize(input_dims, 0, mean_dims, 0, multiplier_dims, 0,
+ offset_dims, 0, output_dims, 0);
+
+ for (int b = 0; b < batches; ++b) {
+ for (int y = 0; y < height; ++y) {
+ for (int x = 0; x < width; ++x) {
+ for (int c = 0; c < depth; ++c) {
+ output_data[Offset(output_dims, c, x, y, b)] = ActivationFunction<Ac>(
+ (input_data[Offset(input_dims, c, x, y, b)] -
+ mean_data[Offset(mean_dims, c, x, y, 0)]) *
+ multiplier_data[Offset(multiplier_dims, c, x, y, 0)] +
+ offset_data[Offset(offset_dims, c, x, y, 0)]);
+ }
+ }
+ }
+ }
+}
+
+template <FusedActivationFunctionType Ac>
+void GlobalBatchNormalization(const float* input_data,
+ const Dims<4>& input_dims, const float* mean_data,
+ const Dims<4>& mean_dims,
+ const float* multiplier_data,
+ const Dims<4>& multiplier_dims,
+ const float* offset_data,
+ const Dims<4>& offset_dims, float* output_data,
+ const Dims<4>& output_dims) {
+ const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+ const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
+ const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
+ const int depth =
+ MatchingArraySize(input_dims, 0, mean_dims, 0, multiplier_dims, 0,
+ offset_dims, 0, output_dims, 0);
+
+ for (int b = 0; b < batches; ++b) {
+ for (int y = 0; y < height; ++y) {
+ for (int x = 0; x < width; ++x) {
+ for (int c = 0; c < depth; ++c) {
+ output_data[Offset(output_dims, c, x, y, b)] = ActivationFunction<Ac>(
+ (input_data[Offset(input_dims, c, x, y, b)] -
+ mean_data[Offset(mean_dims, c, 0, 0, 0)]) *
+ multiplier_data[Offset(multiplier_dims, c, 0, 0, 0)] +
+ offset_data[Offset(offset_dims, c, 0, 0, 0)]);
+ }
+ }
+ }
+ }
+}
+
+inline void Relu(const float* input_data, const Dims<4>& input_dims,
+ float* output_data, const Dims<4>& output_dims) {
+ const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+ const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
+ const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
+ const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
+ for (int b = 0; b < batches; ++b) {
+ for (int y = 0; y < height; ++y) {
+ for (int x = 0; x < width; ++x) {
+ for (int c = 0; c < depth; ++c) {
+ float val = input_data[Offset(input_dims, c, x, y, b)];
+ const float lower = 0;
+ float clamped = val < lower ? lower : val;
+ output_data[Offset(output_dims, c, x, y, b)] = clamped;
+ }
+ }
+ }
+ }
+}
+
+inline void Relu1(const float* input_data, const Dims<4>& input_dims,
+ float* output_data, const Dims<4>& output_dims) {
+ const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+ const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
+ const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
+ const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
+ for (int b = 0; b < batches; ++b) {
+ for (int y = 0; y < height; ++y) {
+ for (int x = 0; x < width; ++x) {
+ for (int c = 0; c < depth; ++c) {
+ float val = input_data[Offset(input_dims, c, x, y, b)];
+ const float upper = 1;
+ const float lower = -1;
+ float clamped = val > upper ? upper : val < lower ? lower : val;
+ output_data[Offset(output_dims, c, x, y, b)] = clamped;
+ }
+ }
+ }
+ }
+}
+
+inline void Relu6(const float* input_data, const Dims<4>& input_dims,
+ float* output_data, const Dims<4>& output_dims) {
+ const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+ const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
+ const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
+ const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
+ for (int b = 0; b < batches; ++b) {
+ for (int y = 0; y < height; ++y) {
+ for (int x = 0; x < width; ++x) {
+ for (int c = 0; c < depth; ++c) {
+ float val = input_data[Offset(input_dims, c, x, y, b)];
+ const float upper = 6;
+ const float lower = 0;
+ float clamped = val > upper ? upper : val < lower ? lower : val;
+ output_data[Offset(output_dims, c, x, y, b)] = clamped;
+ }
+ }
+ }
+ }
+}
+
+template <FusedActivationFunctionType Ac>
+void L2Normalization(const float* input_data, const Dims<4>& input_dims,
+ float* output_data, const Dims<4>& output_dims) {
+ static_assert(Ac == FusedActivationFunctionType::kNone, "");
+ const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+ const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
+ const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
+ const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
+ for (int b = 0; b < batches; ++b) {
+ for (int y = 0; y < height; ++y) {
+ for (int x = 0; x < width; ++x) {
+ float squared_l2_norm = 0;
+ for (int c = 0; c < depth; ++c) {
+ float val = input_data[Offset(input_dims, c, x, y, b)];
+ squared_l2_norm += val * val;
+ }
+ float l2_norm = std::sqrt(squared_l2_norm);
+ for (int c = 0; c < depth; ++c) {
+ output_data[Offset(output_dims, c, x, y, b)] =
+ input_data[Offset(input_dims, c, x, y, b)] / l2_norm;
+ }
+ }
+ }
+ }
+}
+
+inline void GetInvSqrtQuantizedMultiplier(int32 input, int32* output_inv_sqrt,
+ int* output_shift) {
+ *output_shift = 11;
+ while (input >= (1 << 29)) {
+ input /= 4;
+ ++*output_shift;
+ }
+ TFLITE_DCHECK_GT(input, 0);
+ const unsigned max_left_shift_bits = __builtin_clz(input) - 1;
+ const unsigned max_left_shift_bit_pairs = max_left_shift_bits / 2;
+ const unsigned left_shift_bit_pairs = max_left_shift_bit_pairs - 1;
+ *output_shift -= left_shift_bit_pairs;
+ input <<= 2 * left_shift_bit_pairs;
+ TFLITE_DCHECK_GE(input, (1 << 27));
+ TFLITE_DCHECK_LT(input, (1 << 29));
+ using gemmlowp::FixedPoint;
+ using gemmlowp::Rescale;
+ using gemmlowp::SaturatingRoundingMultiplyByPOT;
+ // Using 3 integer bits gives us enough room for the internal arithmetic in
+ // this Newton-Raphson iteration.
+ using F3 = FixedPoint<int32, 3>;
+ using F0 = FixedPoint<int32, 0>;
+ const F3 fixedpoint_input = F3::FromRaw(input >> 1);
+ const F3 fixedpoint_half_input =
+ SaturatingRoundingMultiplyByPOT<-1>(fixedpoint_input);
+ const F3 fixedpoint_half_three =
+ GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F3, (1 << 28) + (1 << 27), 1.5);
+ // Newton-Raphson iteration
+ // Naive unoptimized starting guess: x = 1
+ F3 x = F3::One();
+ // Naive unoptimized number of iterations: 5
+ for (int i = 0; i < 5; i++) {
+ const F3 x3 = Rescale<3>(x * x * x);
+ x = Rescale<3>(fixedpoint_half_three * x - fixedpoint_half_input * x3);
+ }
+ const F0 fixedpoint_half_sqrt_2 =
+ GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F0, 1518500250, std::sqrt(2.) / 2.);
+ x = x * fixedpoint_half_sqrt_2;
+ *output_inv_sqrt = x.raw();
+ if (*output_shift < 0) {
+ *output_inv_sqrt <<= -*output_shift;
+ *output_shift = 0;
+ }
+}
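+
+// A reading of the loop above, offered only as an aid: with h3 = 3/2 and
+// h_in = input/2 (both as F3 fixed-point values, after `input` has been
+// normalized into [2^27, 2^29)), the update
+//   x <- h3 * x - h_in * x^3 = x * (3 - input * x^2) / 2
+// is Newton's method for f(x) = 1/x^2 - input, whose positive root is
+// 1/sqrt(input); the final multiplication by sqrt(2)/2 and the adjustments to
+// *output_shift account for the earlier normalization.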
+
+inline void L2Normalization(const uint8* input_data, const Dims<4>& input_dims,
+ int32 input_zero_point, uint8* output_data,
+ const Dims<4>& output_dims) {
+ const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+ const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
+ const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
+ const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
+ TFLITE_DCHECK_EQ(batches, 1);
+ TFLITE_DCHECK_EQ(height, 1);
+ TFLITE_DCHECK_EQ(width, 1);
+ int32 square_l2_norm = 0;
+ for (int i = 0; i < depth; i++) {
+ int32 diff = input_data[Offset(input_dims, i, 0, 0, 0)] - input_zero_point;
+ square_l2_norm += diff * diff;
+ }
+ int32 inv_l2norm_multiplier;
+ int inv_l2norm_shift;
+ GetInvSqrtQuantizedMultiplier(square_l2_norm, &inv_l2norm_multiplier,
+ &inv_l2norm_shift);
+
+ for (int i = 0; i < depth; i++) {
+ int32 diff = input_data[Offset(input_dims, i, 0, 0, 0)] - input_zero_point;
+ int32 rescaled_diff = MultiplyByQuantizedMultiplierSmallerThanOne(
+ 128 * diff, inv_l2norm_multiplier, inv_l2norm_shift);
+ int32 unclamped_output_val = 128 + rescaled_diff;
+ int32 output_val = std::min(255, std::max(0, unclamped_output_val));
+ output_data[Offset(output_dims, i, 0, 0, 0)] =
+ static_cast<uint8>(output_val);
+ }
+}
+
+inline void Add(const float* input1_data, const Dims<4>& input1_dims,
+ const float* input2_data, const Dims<4>& input2_dims,
+ float output_activation_min, float output_activation_max,
+ float* output_data, const Dims<4>& output_dims) {
+ const int batches =
+ MatchingArraySize(input1_dims, 3, input2_dims, 3, output_dims, 3);
+ const int height =
+ MatchingArraySize(input1_dims, 2, input2_dims, 2, output_dims, 2);
+ const int width =
+ MatchingArraySize(input1_dims, 1, input2_dims, 1, output_dims, 1);
+ const int depth =
+ MatchingArraySize(input1_dims, 0, input2_dims, 0, output_dims, 0);
+ for (int b = 0; b < batches; ++b) {
+ for (int y = 0; y < height; ++y) {
+ for (int x = 0; x < width; ++x) {
+ for (int c = 0; c < depth; ++c) {
+ output_data[Offset(output_dims, c, x, y, b)] =
+ ActivationFunctionWithMinMax(
+ input1_data[Offset(input1_dims, c, x, y, b)] +
+ input2_data[Offset(input2_dims, c, x, y, b)],
+ output_activation_min, output_activation_max);
+ }
+ }
+ }
+ }
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void Add(const float* input1_data, const Dims<4>& input1_dims,
+ const float* input2_data, const Dims<4>& input2_dims,
+ float* output_data, const Dims<4>& output_dims) {
+ float output_activation_min, output_activation_max;
+ GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
+
+ Add(input1_data, input1_dims, input2_data, input2_dims, output_activation_min,
+ output_activation_max, output_data, output_dims);
+}
+
+template <FusedActivationFunctionType Ac>
+inline void Add(int left_shift, const uint8* input1_data,
+ const Dims<4>& input1_dims, int32 input1_offset,
+ int32 input1_multiplier, int input1_shift,
+ const uint8* input2_data, const Dims<4>& input2_dims,
+ int32 input2_offset, int32 input2_multiplier, int input2_shift,
+ int32 output_offset, int32 output_multiplier, int output_shift,
+ int32 output_activation_min, int32 output_activation_max,
+ uint8* output_data, const Dims<4>& output_dims) {
+ static_assert(Ac == FusedActivationFunctionType::kNone ||
+ Ac == FusedActivationFunctionType::kRelu ||
+ Ac == FusedActivationFunctionType::kRelu6 ||
+ Ac == FusedActivationFunctionType::kRelu1,
+ "");
+ TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+ if (Ac == FusedActivationFunctionType::kNone) {
+ TFLITE_DCHECK_EQ(output_activation_min, 0);
+ TFLITE_DCHECK_EQ(output_activation_max, 255);
+ }
+ const int batches =
+ MatchingArraySize(input1_dims, 3, input2_dims, 3, output_dims, 3);
+ const int height =
+ MatchingArraySize(input1_dims, 2, input2_dims, 2, output_dims, 2);
+ const int width =
+ MatchingArraySize(input1_dims, 1, input2_dims, 1, output_dims, 1);
+ const int depth =
+ MatchingArraySize(input1_dims, 0, input2_dims, 0, output_dims, 0);
+ for (int b = 0; b < batches; ++b) {
+ for (int y = 0; y < height; ++y) {
+ for (int x = 0; x < width; ++x) {
+ for (int c = 0; c < depth; ++c) {
+ const int32 input1_val =
+ input1_offset + input1_data[Offset(input1_dims, c, x, y, b)];
+ const int32 input2_val =
+ input2_offset + input2_data[Offset(input2_dims, c, x, y, b)];
+ const int32 shifted_input1_val = input1_val * (1 << left_shift);
+ const int32 shifted_input2_val = input2_val * (1 << left_shift);
+ const int32 scaled_input1_val =
+ MultiplyByQuantizedMultiplierSmallerThanOne(
+ shifted_input1_val, input1_multiplier, input1_shift);
+ const int32 scaled_input2_val =
+ MultiplyByQuantizedMultiplierSmallerThanOne(
+ shifted_input2_val, input2_multiplier, input2_shift);
+ const int32 raw_sum = scaled_input1_val + scaled_input2_val;
+ const int32 raw_output =
+ MultiplyByQuantizedMultiplierSmallerThanOne(
+ raw_sum, output_multiplier, output_shift) +
+ output_offset;
+ const int32 clamped_output =
+ std::min(output_activation_max,
+ std::max(output_activation_min, raw_output));
+ output_data[Offset(output_dims, c, x, y, b)] =
+ static_cast<uint8>(clamped_output);
+ }
+ }
+ }
+ }
+}
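+
+// A sketch of the rescaling scheme above (reading multiplier values as real
+// numbers in [0, 1), per the helper's name): each input is first widened by
+// `left_shift` bits for headroom, then brought to a common accumulation scale:
+//   scaled_i = ((q_i + offset_i) << left_shift) * multiplier_i * 2^-shift_i
+// The sum of the two scaled values is mapped to the output scale with
+// (output_multiplier, output_shift), re-centered by output_offset and clamped.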
+
+// TODO(jiawen): We can implement BroadcastAdd on buffers of arbitrary
+// dimensionality if the runtime code does a single loop over one dimension
+// that handles broadcasting as the base case. The code generator would then
+// generate max(D1, D2) nested for loops.
+template <FusedActivationFunctionType Ac>
+void BroadcastAdd(const float* input1_data, const Dims<4>& input1_dims,
+ const float* input2_data, const Dims<4>& input2_dims,
+ float* output_data, const Dims<4>& output_dims) {
+ gemmlowp::ScopedProfilingLabel label("BroadcastAdd");
+
+ NdArrayDesc<4> desc1;
+ NdArrayDesc<4> desc2;
+ NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
+
+ // In TensorFlow, the dimensions are canonically named (batch_number, row,
+ // col, channel), with extents (batches, height, width, depth), and with the
+ // trailing dimension changing most rapidly (channels has the smallest stride,
+ // typically 1 element).
+ //
+ // In generated C code, we store arrays with the dimensions reversed. The
+ // first dimension has the smallest stride.
+ //
+ // We name our variables by the TensorFlow convention, but generate C code
+ // nesting loops such that the innermost loop has the smallest stride for the
+ // best cache behavior.
+ for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
+ for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
+ for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
+ for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
+ output_data[Offset(output_dims, c, x, y, b)] = ActivationFunction<Ac>(
+ input1_data[SubscriptToIndex(desc1, c, x, y, b)] +
+ input2_data[SubscriptToIndex(desc2, c, x, y, b)]);
+ }
+ }
+ }
+ }
+}
+
+inline void BroadcastAdd(int left_shift, const uint8* input1_data,
+ const Dims<4>& input1_dims, int32 input1_offset,
+ int32 input1_multiplier, int input1_shift,
+ const uint8* input2_data, const Dims<4>& input2_dims,
+ int32 input2_offset, int32 input2_multiplier,
+ int input2_shift, int32 output_offset,
+ int32 output_multiplier, int output_shift,
+ int32 output_activation_min,
+ int32 output_activation_max, uint8* output_data,
+ const Dims<4>& output_dims) {
+ gemmlowp::ScopedProfilingLabel label("BroadcastAdd/8bit");
+
+ NdArrayDesc<4> desc1;
+ NdArrayDesc<4> desc2;
+ NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
+
+ // In TensorFlow, the dimensions are canonically named (batch_number, row,
+ // col, channel), with extents (batches, height, width, depth), and with the
+ // trailing dimension changing most rapidly (channels has the smallest stride,
+ // typically 1 element).
+ //
+ // In generated C code, we store arrays with the dimensions reversed. The
+ // first dimension has the smallest stride.
+ //
+ // We name our variables by the TensorFlow convention, but generate C code
+ // nesting loops such that the innermost loop has the smallest stride for the
+ // best cache behavior.
+ for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
+ for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
+ for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
+ for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
+ const int32 input1_val =
+ input1_offset + input1_data[SubscriptToIndex(desc1, c, x, y, b)];
+ const int32 input2_val =
+ input2_offset + input2_data[SubscriptToIndex(desc2, c, x, y, b)];
+ const int32 shifted_input1_val = input1_val * (1 << left_shift);
+ const int32 shifted_input2_val = input2_val * (1 << left_shift);
+ const int32 scaled_input1_val =
+ MultiplyByQuantizedMultiplierSmallerThanOne(
+ shifted_input1_val, input1_multiplier, input1_shift);
+ const int32 scaled_input2_val =
+ MultiplyByQuantizedMultiplierSmallerThanOne(
+ shifted_input2_val, input2_multiplier, input2_shift);
+ const int32 raw_sum = scaled_input1_val + scaled_input2_val;
+ const int32 raw_output =
+ MultiplyByQuantizedMultiplierSmallerThanOne(
+ raw_sum, output_multiplier, output_shift) +
+ output_offset;
+ const int32 clamped_output =
+ std::min(output_activation_max,
+ std::max(output_activation_min, raw_output));
+ output_data[Offset(output_dims, c, x, y, b)] =
+ static_cast<uint8>(clamped_output);
+ }
+ }
+ }
+ }
+}
+
+template <FusedActivationFunctionType Ac>
+inline void BroadcastAdd(int left_shift, const uint8* input1_data,
+ const Dims<4>& input1_dims, int32 input1_offset,
+ int32 input1_multiplier, int input1_shift,
+ const uint8* input2_data, const Dims<4>& input2_dims,
+ int32 input2_offset, int32 input2_multiplier,
+ int input2_shift, int32 output_offset,
+ int32 output_multiplier, int output_shift,
+ int32 output_activation_min,
+ int32 output_activation_max, uint8* output_data,
+ const Dims<4>& output_dims) {
+ static_assert(Ac == FusedActivationFunctionType::kNone ||
+ Ac == FusedActivationFunctionType::kRelu ||
+ Ac == FusedActivationFunctionType::kRelu6 ||
+ Ac == FusedActivationFunctionType::kRelu1,
+ "");
+ TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+ if (Ac == FusedActivationFunctionType::kNone) {
+ TFLITE_DCHECK_EQ(output_activation_min, 0);
+ TFLITE_DCHECK_EQ(output_activation_max, 255);
+ }
+ BroadcastAdd(left_shift, input1_data, input1_dims, input1_offset,
+ input1_multiplier, input1_shift, input2_data, input2_dims,
+ input2_offset, input2_multiplier, input2_shift, output_offset,
+ output_multiplier, output_shift, output_activation_min,
+ output_activation_max, output_data, output_dims);
+}
+
+inline void Mul(const float* input1_data, const Dims<4>& input1_dims,
+ const float* input2_data, const Dims<4>& input2_dims,
+ float output_activation_min, float output_activation_max,
+ float* output_data, const Dims<4>& output_dims) {
+ const int batches =
+ MatchingArraySize(input1_dims, 3, input2_dims, 3, output_dims, 3);
+ const int height =
+ MatchingArraySize(input1_dims, 2, input2_dims, 2, output_dims, 2);
+ const int width =
+ MatchingArraySize(input1_dims, 1, input2_dims, 1, output_dims, 1);
+ const int depth =
+ MatchingArraySize(input1_dims, 0, input2_dims, 0, output_dims, 0);
+ for (int b = 0; b < batches; ++b) {
+ for (int y = 0; y < height; ++y) {
+ for (int x = 0; x < width; ++x) {
+ for (int c = 0; c < depth; ++c) {
+ output_data[Offset(output_dims, c, x, y, b)] =
+ ActivationFunctionWithMinMax(
+ input1_data[Offset(input1_dims, c, x, y, b)] *
+ input2_data[Offset(input2_dims, c, x, y, b)],
+ output_activation_min, output_activation_max);
+ }
+ }
+ }
+ }
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void Mul(const float* input1_data, const Dims<4>& input1_dims,
+ const float* input2_data, const Dims<4>& input2_dims,
+ float* output_data, const Dims<4>& output_dims) {
+ float output_activation_min, output_activation_max;
+ GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
+
+ Mul(input1_data, input1_dims, input2_data, input2_dims, output_activation_min,
+ output_activation_max, output_data, output_dims);
+}
+
+// TODO(jiawen): We can implement BroadcastMul on buffers of arbitrary
+// dimensionality if the runtime code does a single loop over one dimension
+// that handles broadcasting as the base case. The code generator would then
+// generate max(D1, D2) nested for loops.
+template <FusedActivationFunctionType Ac>
+void BroadcastMul(const float* input1_data, const Dims<4>& input1_dims,
+ const float* input2_data, const Dims<4>& input2_dims,
+ float* output_data, const Dims<4>& output_dims) {
+ gemmlowp::ScopedProfilingLabel label("BroadcastMul");
+
+ NdArrayDesc<4> desc1;
+ NdArrayDesc<4> desc2;
+ NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
+
+ // In TensorFlow, the dimensions are canonically named (batch_number, row,
+ // col, channel), with extents (batches, height, width, depth), and with the
+ // trailing dimension changing most rapidly (channels has the smallest
+ // stride, typically 1 element).
+ //
+ // In generated C code, we store arrays with the dimensions reversed. The
+ // first dimension has the smallest stride.
+ //
+ // We name our variables by the TensorFlow convention, but generate C code
+ // nesting loops such that the innermost loop has the smallest stride for
+ // the best cache behavior.
+ for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
+ for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
+ for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
+ for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
+ output_data[Offset(output_dims, c, x, y, b)] = ActivationFunction<Ac>(
+ input1_data[SubscriptToIndex(desc1, c, x, y, b)] *
+ input2_data[SubscriptToIndex(desc2, c, x, y, b)]);
+ }
+ }
+ }
+ }
+}
+
+inline void BroadcastMul(const uint8* input1_data, const Dims<4>& input1_dims,
+ int32 input1_offset, const uint8* input2_data,
+ const Dims<4>& input2_dims, int32 input2_offset,
+ int32 output_offset, int32 output_multiplier,
+ int output_shift, int32 output_activation_min,
+ int32 output_activation_max, uint8* output_data,
+ const Dims<4>& output_dims) {
+ gemmlowp::ScopedProfilingLabel label("BroadcastMul/8bit");
+
+ NdArrayDesc<4> desc1;
+ NdArrayDesc<4> desc2;
+ NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
+
+ // In TensorFlow, the dimensions are canonically named (batch_number, row,
+ // col, channel), with extents (batches, height, width, depth), and with the
+ // trailing dimension changing most rapidly (channels has the smallest
+ // stride, typically 1 element).
+ //
+ // In generated C code, we store arrays with the dimensions reversed. The
+ // first dimension has the smallest stride.
+ //
+ // We name our variables by the TensorFlow convention, but generate C code
+ // nesting loops such that the innermost loop has the smallest stride for
+ // the best cache behavior.
+ for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
+ for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
+ for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
+ for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
+ const int32 input1_val =
+ input1_offset + input1_data[SubscriptToIndex(desc1, c, x, y, b)];
+ const int32 input2_val =
+ input2_offset + input2_data[SubscriptToIndex(desc2, c, x, y, b)];
+ const int32 unclamped_result =
+ output_offset +
+ MultiplyByQuantizedMultiplierSmallerThanOne(
+ input1_val * input2_val, output_multiplier, output_shift);
+ const int32 clamped_output =
+ std::min(output_activation_max,
+ std::max(output_activation_min, unclamped_result));
+ output_data[Offset(output_dims, c, x, y, b)] =
+ static_cast<uint8>(clamped_output);
+ }
+ }
+ }
+ }
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+inline void BroadcastMul(const uint8* input1_data, const Dims<4>& input1_dims,
+ int32 input1_offset, const uint8* input2_data,
+ const Dims<4>& input2_dims, int32 input2_offset,
+ int32 output_offset, int32 output_multiplier,
+ int output_shift, int32 output_activation_min,
+ int32 output_activation_max, uint8* output_data,
+ const Dims<4>& output_dims) {
+ BroadcastMul(input1_data, input1_dims, input1_offset, input2_data,
+ input2_dims, input2_offset, output_offset, output_multiplier,
+ output_shift, output_activation_min, output_activation_max,
+ output_data, output_dims);
+}
+
+template <FusedActivationFunctionType Ac, typename Scalar>
+void Concatenation(int concat_dim, const Scalar* const* input_data,
+ const Dims<4>* const* input_dims, int inputs_count,
+ Scalar* output_data, const Dims<4>& output_dims) {
+ TFLITE_DCHECK_GT(inputs_count, 1);
+ int concat_size = 0;
+ for (int i = 0; i < inputs_count; i++) {
+ for (int j = 0; j < 4; j++) {
+ if (j != concat_dim) {
+ MatchingArraySize(*input_dims[i], j, output_dims, j);
+ }
+ }
+ concat_size += ArraySize(*input_dims[i], concat_dim);
+ }
+ TFLITE_DCHECK_EQ(concat_size, ArraySize(output_dims, concat_dim));
+ TFLITE_DCHECK(Ac == FusedActivationFunctionType::kNone);
+ int outer_size = 1;
+ for (int i = concat_dim + 1; i < 4; i++) {
+ outer_size *= output_dims.sizes[i];
+ }
+ Scalar* output_ptr = output_data;
+ for (int k = 0; k < outer_size; k++) {
+ for (int i = 0; i < inputs_count; ++i) {
+ const int copy_size =
+ input_dims[i]->sizes[concat_dim] * input_dims[i]->strides[concat_dim];
+ memcpy(output_ptr, input_data[i] + k * copy_size,
+ copy_size * sizeof(Scalar));
+ output_ptr += copy_size;
+ }
+ }
+}
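+
+// Illustrative example (shapes assumed for exposition only): concatenating
+// two packed inputs with Dims<4> sizes {3, 4, 4, 1} and {5, 4, 4, 1} along
+// concat_dim = 0 gives copy_size values 3 * 1 = 3 and 5 * 1 = 5 and
+// outer_size = 4 * 4 * 1 = 16, so each of the 16 spatial positions receives
+// 3 channels from the first input followed by 5 from the second.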
+
+template <FusedActivationFunctionType Ac, typename Scalar>
+void DepthConcatenation(const Scalar* const* input_data,
+ const Dims<4>* const* input_dims, int inputs_count,
+ Scalar* output_data, const Dims<4>& output_dims) {
+ Concatenation<Ac, Scalar>(0, input_data, input_dims, inputs_count,
+ output_data, output_dims);
+}
+
+inline void LstmCell(const float* input_data, const Dims<4>& input_dims,
+ const float* prev_activ_data,
+ const Dims<4>& prev_activ_dims, const float* weights_data,
+ const Dims<4>& weights_dims, const float* bias_data,
+ const Dims<4>& bias_dims, const float* prev_state_data,
+ const Dims<4>& prev_state_dims, float* output_state_data,
+ const Dims<4>& output_state_dims, float* output_activ_data,
+ const Dims<4>& output_activ_dims, float* concat_temp_data,
+ const Dims<4>& concat_temp_dims, float* activ_temp_data,
+ const Dims<4>& activ_temp_dims) {
+ const int batches =
+ MatchingArraySize(input_dims, 3, prev_activ_dims, 3, prev_state_dims, 3,
+ output_state_dims, 3, output_activ_dims, 3);
+ const int height =
+ MatchingArraySize(input_dims, 2, prev_activ_dims, 2, prev_state_dims, 2,
+ output_state_dims, 2, output_activ_dims, 2);
+ const int width =
+ MatchingArraySize(input_dims, 1, prev_activ_dims, 1, prev_state_dims, 1,
+ output_state_dims, 1, output_activ_dims, 1);
+ TFLITE_CHECK_EQ(ArraySize(weights_dims, 2), 1);
+ TFLITE_CHECK_EQ(ArraySize(weights_dims, 3), 1);
+ const int input_depth = ArraySize(input_dims, 0);
+ const int prev_activ_depth = ArraySize(prev_activ_dims, 0);
+ const int total_input_depth = prev_activ_depth + input_depth;
+ TFLITE_CHECK_EQ(ArraySize(weights_dims, 0), total_input_depth);
+ TFLITE_CHECK_EQ(MatchingArraySize(bias_dims, 1, bias_dims, 2, bias_dims, 3),
+ 1);
+ const int intern_activ_depth =
+ MatchingArraySize(weights_dims, 1, bias_dims, 0);
+ TFLITE_CHECK_EQ(intern_activ_depth % 4, 0);
+ const int output_depth =
+ MatchingArraySize(prev_state_dims, 0, prev_activ_dims, 0,
+ output_state_dims, 0, output_activ_dims, 0);
+ TFLITE_CHECK_EQ(output_depth, intern_activ_depth / 4);
+
+ // Concatenate prev_activ and input data together
+ std::vector<float const*> concat_input_arrays_data;
+ std::vector<Dims<4> const*> concat_input_arrays_dims;
+ concat_input_arrays_data.push_back(input_data);
+ concat_input_arrays_data.push_back(prev_activ_data);
+ concat_input_arrays_dims.push_back(&input_dims);
+ concat_input_arrays_dims.push_back(&prev_activ_dims);
+ Concatenation<FusedActivationFunctionType::kNone, float>(
+ 0, &(concat_input_arrays_data[0]), &(concat_input_arrays_dims[0]),
+ concat_input_arrays_data.size(), concat_temp_data, concat_temp_dims);
+
+ // Fully connected
+ FullyConnected<FusedActivationFunctionType::kNone>(
+ concat_temp_data, concat_temp_dims, weights_data, weights_dims, bias_data,
+ bias_dims, activ_temp_data, activ_temp_dims);
+
+ // Memory state update (the LSTM "guts")
+ for (int b = 0; b < batches; ++b) {
+ for (int w = 0; w < width; ++w) {
+ for (int h = 0; h < height; ++h) {
+ for (int c = 0; c < output_depth; ++c) {
+ const float input_gate =
+ 1.f /
+ (1.f + std::exp(-activ_temp_data[Offset(
+ activ_temp_dims, 0 * output_depth + c, w, h, b)]));
+ const float new_input = std::tanh(activ_temp_data[Offset(
+ activ_temp_dims, 1 * output_depth + c, w, h, b)]);
+ const float forget_gate =
+ 1.f /
+ (1.f + std::exp(-activ_temp_data[Offset(
+ activ_temp_dims, 2 * output_depth + c, w, h, b)]));
+ const float output_gate =
+ 1.f /
+ (1.f + std::exp(-activ_temp_data[Offset(
+ activ_temp_dims, 3 * output_depth + c, w, h, b)]));
+ const float new_state =
+ input_gate * new_input +
+ forget_gate *
+ prev_state_data[Offset(prev_state_dims, c, w, h, b)];
+ output_state_data[Offset(output_state_dims, c, w, h, b)] = new_state;
+ output_activ_data[Offset(output_activ_dims, c, w, h, b)] =
+ output_gate * std::tanh(new_state);
+ }
+ }
+ }
+ }
+}
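+
+// The "guts" loop above spells out the standard LSTM cell equations; writing
+// them in conventional notation (an annotation, not additional behavior),
+// with a_i, a_g, a_f, a_o the four quarters of activ_temp along depth:
+//   i = sigmoid(a_i), g = tanh(a_g), f = sigmoid(a_f), o = sigmoid(a_o)
+//   c_t = i * g + f * c_{t-1}    // written to output_state_data
+//   h_t = o * tanh(c_t)          // written to output_activ_data
+// where c_{t-1} is prev_state_data and sigmoid(x) = 1 / (1 + exp(-x)).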
+
+template <FusedActivationFunctionType Ac, typename Scalar>
+void TensorFlowSplit(const Scalar* input_data, const Dims<4>& input_dims,
+ int outputs_count, Scalar* const* output_data,
+ const Dims<4>* const* output_dims) {
+ TFLITE_DCHECK_GE(outputs_count, 1);
+ for (int i = 0; i < outputs_count; i++) {
+ /* batches = */ MatchingArraySize(*output_dims[i], 3, input_dims, 3);
+ /* height = */ MatchingArraySize(*output_dims[i], 2, input_dims, 2);
+ /* width = */ MatchingArraySize(*output_dims[i], 1, input_dims, 1);
+ }
+ const int batches = MatchingArraySize(*output_dims[0], 3, input_dims, 3);
+ const int height = MatchingArraySize(*output_dims[0], 2, input_dims, 2);
+ const int width = MatchingArraySize(*output_dims[0], 1, input_dims, 1);
+ // For now we don't have a model with a TensorFlowSplit with a fused
+ // activation function.
+ TFLITE_DCHECK(Ac == FusedActivationFunctionType::kNone);
+ for (int b = 0; b < batches; ++b) {
+ for (int y = 0; y < height; ++y) {
+ for (int x = 0; x < width; ++x) {
+ int in_c = 0;
+ for (int i = 0; i < outputs_count; ++i) {
+ const int depth = ArraySize(*output_dims[i], 0);
+ for (int c = 0; c < depth; ++c) {
+ output_data[i][Offset(*output_dims[i], c, x, y, b)] =
+ input_data[Offset(input_dims, in_c, x, y, b)];
+ in_c++;
+ }
+ }
+ TFLITE_DCHECK(in_c == ArraySize(input_dims, 0));
+ }
+ }
+ }
+}
+
+// TODO(benoitjacob) make this a proper reference impl without Eigen!
+template <typename Scalar>
+using MatrixMap = typename std::conditional<
+ std::is_const<Scalar>::value,
+ Eigen::Map<const Eigen::Matrix<typename std::remove_const<Scalar>::type,
+ Eigen::Dynamic, Eigen::Dynamic>>,
+ Eigen::Map<Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic>>>::type;
+
+template <typename Scalar, int N>
+MatrixMap<Scalar> MapAsMatrixWithFirstDimAsRows(Scalar* data,
+ const Dims<N>& dims) {
+ const int rows = dims.sizes[0];
+ int cols = 1;
+ for (int d = 1; d < N; d++) {
+ cols *= dims.sizes[d];
+ }
+ return MatrixMap<Scalar>(data, rows, cols);
+}
+
+template <typename Scalar, int N>
+MatrixMap<Scalar> MapAsMatrixWithLastDimAsCols(Scalar* data,
+ const Dims<N>& dims) {
+ const int cols = dims.sizes[N - 1];
+ int rows = 1;
+ for (int d = 0; d < N - 1; d++) {
+ rows *= dims.sizes[d];
+ }
+ return MatrixMap<Scalar>(data, rows, cols);
+}
+
+inline int NodeOffset(int b, int h, int w, int height, int width) {
+ return (b * height + h) * width + w;
+}
+
+inline void AveragePool(const float* input_data, const Dims<4>& input_dims,
+ int stride_width, int stride_height, int pad_width,
+ int pad_height, int filter_width, int filter_height,
+ float output_activation_min,
+ float output_activation_max, float* output_data,
+ const Dims<4>& output_dims) {
+ const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+ const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
+ const int input_height = ArraySize(input_dims, 2);
+ const int input_width = ArraySize(input_dims, 1);
+ const int output_height = ArraySize(output_dims, 2);
+ const int output_width = ArraySize(output_dims, 1);
+ for (int batch = 0; batch < batches; ++batch) {
+ for (int out_y = 0; out_y < output_height; ++out_y) {
+ for (int out_x = 0; out_x < output_width; ++out_x) {
+ for (int channel = 0; channel < depth; ++channel) {
+ const int in_x_origin = (out_x * stride_width) - pad_width;
+ const int in_y_origin = (out_y * stride_height) - pad_height;
+ // Compute the boundaries of the filter region clamped so as to
+ // ensure that the filter window fits in the input array.
+ const int filter_x_start = std::max(0, -in_x_origin);
+ const int filter_x_end =
+ std::min(filter_width, input_width - in_x_origin);
+ const int filter_y_start = std::max(0, -in_y_origin);
+ const int filter_y_end =
+ std::min(filter_height, input_height - in_y_origin);
+ float total = 0.f;
+ float filter_count = 0;
+ for (int filter_y = filter_y_start; filter_y < filter_y_end;
+ ++filter_y) {
+ for (int filter_x = filter_x_start; filter_x < filter_x_end;
+ ++filter_x) {
+ const int in_x = in_x_origin + filter_x;
+ const int in_y = in_y_origin + filter_y;
+ total +=
+ input_data[Offset(input_dims, channel, in_x, in_y, batch)];
+ filter_count++;
+ }
+ }
+ const float average = total / filter_count;
+ output_data[Offset(output_dims, channel, out_x, out_y, batch)] =
+ ActivationFunctionWithMinMax(average, output_activation_min,
+ output_activation_max);
+ }
+ }
+ }
+ }
+}
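+
+// Note on the border behavior above (a property of the code as written):
+// because filter_count counts only the in-bounds taps, a 3x3 window centered
+// on a corner pixel with pad 1 averages the 4 valid inputs rather than
+// dividing by 9.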
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void AveragePool(const float* input_data, const Dims<4>& input_dims,
+ int stride_width, int stride_height, int pad_width,
+ int pad_height, int filter_width, int filter_height,
+ float* output_data, const Dims<4>& output_dims) {
+ float output_activation_min, output_activation_max;
+ GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
+ AveragePool(input_data, input_dims, stride_width, stride_height, pad_width,
+ pad_height, filter_width, filter_height, output_activation_min,
+ output_activation_max, output_data, output_dims);
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void AveragePool(const float* input_data, const Dims<4>& input_dims, int stride,
+ int pad_width, int pad_height, int filter_width,
+ int filter_height, float* output_data,
+ const Dims<4>& output_dims) {
+ AveragePool<Ac>(input_data, input_dims, stride, stride, pad_width, pad_height,
+ filter_width, filter_height, output_data, output_dims);
+}
+
+inline void AveragePool(const uint8* input_data, const Dims<4>& input_dims,
+ int stride_width, int stride_height, int pad_width,
+ int pad_height, int filter_width, int filter_height,
+ int32 output_activation_min,
+ int32 output_activation_max, uint8* output_data,
+ const Dims<4>& output_dims) {
+ TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+ const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+ const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
+ const int input_height = ArraySize(input_dims, 2);
+ const int input_width = ArraySize(input_dims, 1);
+ const int output_height = ArraySize(output_dims, 2);
+ const int output_width = ArraySize(output_dims, 1);
+ for (int batch = 0; batch < batches; ++batch) {
+ for (int out_y = 0; out_y < output_height; ++out_y) {
+ for (int out_x = 0; out_x < output_width; ++out_x) {
+ for (int channel = 0; channel < depth; ++channel) {
+ const int in_x_origin = (out_x * stride_width) - pad_width;
+ const int in_y_origin = (out_y * stride_height) - pad_height;
+ // Compute the boundaries of the filter region clamped so as to
+ // ensure that the filter window fits in the input array.
+ const int filter_x_start = std::max(0, -in_x_origin);
+ const int filter_x_end =
+ std::min(filter_width, input_width - in_x_origin);
+ const int filter_y_start = std::max(0, -in_y_origin);
+ const int filter_y_end =
+ std::min(filter_height, input_height - in_y_origin);
+ int32 acc = 0;
+ int filter_count = 0;
+ for (int filter_y = filter_y_start; filter_y < filter_y_end;
+ ++filter_y) {
+ for (int filter_x = filter_x_start; filter_x < filter_x_end;
+ ++filter_x) {
+ const int in_x = in_x_origin + filter_x;
+ const int in_y = in_y_origin + filter_y;
+ acc += input_data[Offset(input_dims, channel, in_x, in_y, batch)];
+ filter_count++;
+ }
+ }
+ acc = (acc + filter_count / 2) / filter_count;
+ acc = std::max(acc, output_activation_min);
+ acc = std::min(acc, output_activation_max);
+ output_data[Offset(output_dims, channel, out_x, out_y, batch)] =
+ static_cast<uint8>(acc);
+ }
+ }
+ }
+ }
+}
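+
+// Note on the rounding above (a property of the code as written): the uint8
+// inputs keep `acc` non-negative, so (acc + filter_count / 2) / filter_count
+// computes the average rounded to nearest, with ties rounded up.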
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void AveragePool(const uint8* input_data, const Dims<4>& input_dims,
+ int stride_width, int stride_height, int pad_width,
+ int pad_height, int filter_width, int filter_height,
+ int32 output_activation_min, int32 output_activation_max,
+ uint8* output_data, const Dims<4>& output_dims) {
+ static_assert(Ac == FusedActivationFunctionType::kNone ||
+ Ac == FusedActivationFunctionType::kRelu ||
+ Ac == FusedActivationFunctionType::kRelu6 ||
+ Ac == FusedActivationFunctionType::kRelu1,
+ "");
+ if (Ac == FusedActivationFunctionType::kNone) {
+ TFLITE_DCHECK_EQ(output_activation_min, 0);
+ TFLITE_DCHECK_EQ(output_activation_max, 255);
+ }
+ AveragePool(input_data, input_dims, stride_width, stride_height, pad_width,
+ pad_height, filter_width, filter_height, output_activation_min,
+ output_activation_max, output_data, output_dims);
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void AveragePool(const uint8* input_data, const Dims<4>& input_dims, int stride,
+ int pad_width, int pad_height, int filter_width,
+ int filter_height, int32 output_activation_min,
+ int32 output_activation_max, uint8* output_data,
+ const Dims<4>& output_dims) {
+ AveragePool<Ac>(input_data, input_dims, stride, stride, pad_width, pad_height,
+ filter_width, filter_height, output_activation_min,
+ output_activation_max, output_data, output_dims);
+}
+
+inline void L2Pool(const float* input_data, const Dims<4>& input_dims,
+ int stride_width, int stride_height, int pad_width,
+ int pad_height, int filter_width, int filter_height,
+ float output_activation_min, float output_activation_max,
+ float* output_data, const Dims<4>& output_dims) {
+ const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+ const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
+ const int input_height = ArraySize(input_dims, 2);
+ const int input_width = ArraySize(input_dims, 1);
+ const int output_height = ArraySize(output_dims, 2);
+ const int output_width = ArraySize(output_dims, 1);
+ for (int batch = 0; batch < batches; ++batch) {
+ for (int out_y = 0; out_y < output_height; ++out_y) {
+ for (int out_x = 0; out_x < output_width; ++out_x) {
+ for (int channel = 0; channel < depth; ++channel) {
+ const int in_x_origin = (out_x * stride_width) - pad_width;
+ const int in_y_origin = (out_y * stride_height) - pad_height;
+ // Compute the boundaries of the filter region clamped so as to
+ // ensure that the filter window fits in the input array.
+ const int filter_x_start = std::max(0, -in_x_origin);
+ const int filter_x_end =
+ std::min(filter_width, input_width - in_x_origin);
+ const int filter_y_start = std::max(0, -in_y_origin);
+ const int filter_y_end =
+ std::min(filter_height, input_height - in_y_origin);
+ float sum_squares = 0.f;
+ int filter_count = 0;
+ for (int filter_y = filter_y_start; filter_y < filter_y_end;
+ ++filter_y) {
+ for (int filter_x = filter_x_start; filter_x < filter_x_end;
+ ++filter_x) {
+ const int in_x = in_x_origin + filter_x;
+ const int in_y = in_y_origin + filter_y;
+ const float val =
+ input_data[Offset(input_dims, channel, in_x, in_y, batch)];
+ sum_squares += val * val;
+ filter_count++;
+ }
+ }
+ const float l2pool_result = std::sqrt(sum_squares / filter_count);
+ output_data[Offset(output_dims, channel, out_x, out_y, batch)] =
+ ActivationFunctionWithMinMax(l2pool_result, output_activation_min,
+ output_activation_max);
+ }
+ }
+ }
+ }
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void L2Pool(const float* input_data, const Dims<4>& input_dims,
+ int stride_width, int stride_height, int pad_width, int pad_height,
+ int filter_width, int filter_height, float* output_data,
+ const Dims<4>& output_dims) {
+ float output_activation_min, output_activation_max;
+ GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
+
+ L2Pool(input_data, input_dims, stride_width, stride_height, pad_width,
+ pad_height, filter_width, filter_height, output_activation_min,
+ output_activation_max, output_data, output_dims);
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void L2Pool(const float* input_data, const Dims<4>& input_dims, int stride,
+ int pad_width, int pad_height, int filter_width, int filter_height,
+ float* output_data, const Dims<4>& output_dims) {
+ L2Pool<Ac>(input_data, input_dims, stride, stride, pad_width, pad_height,
+ filter_width, filter_height, output_data, output_dims);
+}
+
+inline void MaxPool(const float* input_data, const Dims<4>& input_dims,
+ int stride_width, int stride_height, int pad_width,
+ int pad_height, int filter_width, int filter_height,
+ float output_activation_min, float output_activation_max,
+ float* output_data, const Dims<4>& output_dims) {
+ const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+ const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
+ const int input_height = ArraySize(input_dims, 2);
+ const int input_width = ArraySize(input_dims, 1);
+ const int output_height = ArraySize(output_dims, 2);
+ const int output_width = ArraySize(output_dims, 1);
+ for (int batch = 0; batch < batches; ++batch) {
+ for (int out_y = 0; out_y < output_height; ++out_y) {
+ for (int out_x = 0; out_x < output_width; ++out_x) {
+ for (int channel = 0; channel < depth; ++channel) {
+ const int in_x_origin = (out_x * stride_width) - pad_width;
+ const int in_y_origin = (out_y * stride_height) - pad_height;
+ // Compute the boundaries of the filter region clamped so as to
+ // ensure that the filter window fits in the input array.
+ const int filter_x_start = std::max(0, -in_x_origin);
+ const int filter_x_end =
+ std::min(filter_width, input_width - in_x_origin);
+ const int filter_y_start = std::max(0, -in_y_origin);
+ const int filter_y_end =
+ std::min(filter_height, input_height - in_y_origin);
+ float max = std::numeric_limits<float>::lowest();
+ for (int filter_y = filter_y_start; filter_y < filter_y_end;
+ ++filter_y) {
+ for (int filter_x = filter_x_start; filter_x < filter_x_end;
+ ++filter_x) {
+ const int in_x = in_x_origin + filter_x;
+ const int in_y = in_y_origin + filter_y;
+ max = std::max(
+ max,
+ input_data[Offset(input_dims, channel, in_x, in_y, batch)]);
+ }
+ }
+ output_data[Offset(output_dims, channel, out_x, out_y, batch)] =
+ ActivationFunctionWithMinMax(max, output_activation_min,
+ output_activation_max);
+ }
+ }
+ }
+ }
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void MaxPool(const float* input_data, const Dims<4>& input_dims,
+ int stride_width, int stride_height, int pad_width, int pad_height,
+ int filter_width, int filter_height, float* output_data,
+ const Dims<4>& output_dims) {
+ float output_activation_min, output_activation_max;
+ GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
+ MaxPool(input_data, input_dims, stride_width, stride_height, pad_width,
+ pad_height, filter_width, filter_height, output_activation_min,
+ output_activation_max, output_data, output_dims);
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void MaxPool(const float* input_data, const Dims<4>& input_dims, int stride,
+ int pad_width, int pad_height, int filter_width, int filter_height,
+ float* output_data, const Dims<4>& output_dims) {
+ MaxPool<Ac>(input_data, input_dims, stride, stride, pad_width, pad_height,
+ filter_width, filter_height, output_data, output_dims);
+}
+
+inline void MaxPool(const uint8* input_data, const Dims<4>& input_dims,
+ int stride_width, int stride_height, int pad_width,
+ int pad_height, int filter_width, int filter_height,
+ int32 output_activation_min, int32 output_activation_max,
+ uint8* output_data, const Dims<4>& output_dims) {
+ TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+ TFLITE_DCHECK_GE(output_activation_min, 0);
+ TFLITE_DCHECK_LE(output_activation_max, 255);
+ const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+ const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
+ const int input_height = ArraySize(input_dims, 2);
+ const int input_width = ArraySize(input_dims, 1);
+ const int output_height = ArraySize(output_dims, 2);
+ const int output_width = ArraySize(output_dims, 1);
+ for (int batch = 0; batch < batches; ++batch) {
+ for (int out_y = 0; out_y < output_height; ++out_y) {
+ for (int out_x = 0; out_x < output_width; ++out_x) {
+ for (int channel = 0; channel < depth; ++channel) {
+ const int in_x_origin = (out_x * stride_width) - pad_width;
+ const int in_y_origin = (out_y * stride_height) - pad_height;
+ // Compute the boundaries of the filter region clamped so as to
+ // ensure that the filter window fits in the input array.
+ const int filter_x_start = std::max(0, -in_x_origin);
+ const int filter_x_end =
+ std::min(filter_width, input_width - in_x_origin);
+ const int filter_y_start = std::max(0, -in_y_origin);
+ const int filter_y_end =
+ std::min(filter_height, input_height - in_y_origin);
+ uint8 max = 0;
+ for (int filter_y = filter_y_start; filter_y < filter_y_end;
+ ++filter_y) {
+ for (int filter_x = filter_x_start; filter_x < filter_x_end;
+ ++filter_x) {
+ const int in_x = in_x_origin + filter_x;
+ const int in_y = in_y_origin + filter_y;
+ max = std::max(
+ max,
+ input_data[Offset(input_dims, channel, in_x, in_y, batch)]);
+ }
+ }
+ max = std::max<uint8>(max, output_activation_min);
+ max = std::min<uint8>(max, output_activation_max);
+ output_data[Offset(output_dims, channel, out_x, out_y, batch)] =
+ static_cast<uint8>(max);
+ }
+ }
+ }
+ }
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void MaxPool(const uint8* input_data, const Dims<4>& input_dims,
+ int stride_width, int stride_height, int pad_width, int pad_height,
+ int filter_width, int filter_height, int32 output_activation_min,
+ int32 output_activation_max, uint8* output_data,
+ const Dims<4>& output_dims) {
+ static_assert(Ac == FusedActivationFunctionType::kNone ||
+ Ac == FusedActivationFunctionType::kRelu ||
+ Ac == FusedActivationFunctionType::kRelu6 ||
+ Ac == FusedActivationFunctionType::kRelu1,
+ "");
+ if (Ac == FusedActivationFunctionType::kNone) {
+ TFLITE_DCHECK_EQ(output_activation_min, 0);
+ TFLITE_DCHECK_EQ(output_activation_max, 255);
+ }
+ MaxPool(input_data, input_dims, stride_width, stride_height, pad_width,
+ pad_height, filter_width, filter_height, output_activation_min,
+ output_activation_max, output_data, output_dims);
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void MaxPool(const uint8* input_data, const Dims<4>& input_dims, int stride,
+ int pad_width, int pad_height, int filter_width, int filter_height,
+ int32 output_activation_min, int32 output_activation_max,
+ uint8* output_data, const Dims<4>& output_dims) {
+ MaxPool<Ac>(input_data, input_dims, stride, stride, pad_width, pad_height,
+ filter_width, filter_height, output_activation_min,
+ output_activation_max, output_data, output_dims);
+}
+
+inline void LocalResponseNormalization(const float* input_data,
+ const Dims<4>& input_dims, int range,
+ float bias, float alpha, float beta,
+ float* output_data,
+ const Dims<4>& output_dims) {
+ const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+ const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
+ const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
+ const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
+
+ for (int b = 0; b < batches; ++b) {
+ for (int y = 0; y < height; ++y) {
+ for (int x = 0; x < width; ++x) {
+ for (int c = 0; c < depth; ++c) {
+ const int begin_input_c = std::max(0, c - range);
+ const int end_input_c = std::min(depth, c + range);
+ float accum = 0.f;
+ for (int input_c = begin_input_c; input_c < end_input_c; ++input_c) {
+ const float input_val =
+ input_data[Offset(input_dims, input_c, x, y, b)];
+ accum += input_val * input_val;
+ }
+ const float multiplier = std::pow(bias + alpha * accum, -beta);
+ output_data[Offset(output_dims, c, x, y, b)] =
+ input_data[Offset(input_dims, c, x, y, b)] * multiplier;
+ }
+ }
+ }
+ }
+}
+
+inline void Softmax(const float* input_data, const Dims<4>& input_dims,
+ float beta, float* output_data,
+ const Dims<4>& output_dims) {
+ const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+ const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
+ const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
+ const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
+
+ for (int b = 0; b < batches; ++b) {
+ for (int y = 0; y < height; ++y) {
+ for (int x = 0; x < width; ++x) {
+ // Find max element value which we'll use to ensure numerical stability
+ // taking advantage of the following equality:
+ // exp(x[i])/sum(exp(x[i])) == exp(x[i]+C)/sum(exp(x[i]+C))
+ float max = std::numeric_limits<float>::lowest();
+ for (int c = 0; c < depth; ++c) {
+ max = std::max(max, input_data[Offset(input_dims, c, x, y, b)]);
+ }
+
+ // Compute sum.
+ float sum = 0.f;
+ for (int c = 0; c < depth; ++c) {
+ sum += std::exp((input_data[Offset(input_dims, c, x, y, b)] - max) *
+ beta);
+ }
+
+ // Compute result.
+ for (int c = 0; c < depth; ++c) {
+ output_data[Offset(output_dims, c, x, y, b)] =
+ std::exp((input_data[Offset(input_dims, c, x, y, b)] - max) *
+ beta) /
+ sum;
+ }
+ }
+ }
+ }
+}
+
+inline void Softmax(const uint8* input_data, const Dims<4>& input_dims,
+ int32 input_beta_multiplier, int32 input_beta_left_shift,
+ int diff_min, uint8* output_data,
+ const Dims<4>& output_dims) {
+ // The representation chosen for the input to the exp() function is Q5.26.
+ // We need to leave extra space since values that we skip might be as large as
+ // -32 before multiplying by input_beta_multiplier, and therefore as large as
+ // -16 afterwards. Note that exp(-8) is definitely not insignificant to
+ // accumulation, but exp(-16) definitely is.
+ static const int kScaledDiffIntegerBits = 5;
+ static const int kAccumulationIntegerBits = 12;
+ using FixedPointScaledDiff =
+ gemmlowp::FixedPoint<int32, kScaledDiffIntegerBits>;
+ using FixedPointAccum = gemmlowp::FixedPoint<int32, kAccumulationIntegerBits>;
+ using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;
+
+ const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+ const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
+ const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
+ const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
+
+ for (int b = 0; b < batches; ++b) {
+ for (int x = 0; x < width; ++x) {
+ for (int y = 0; y < height; ++y) {
+ uint8 max_in_row = 0;
+ for (int c = 0; c < depth; ++c) {
+ max_in_row =
+ std::max(max_in_row, input_data[Offset(input_dims, c, x, y, b)]);
+ }
+
+ FixedPointAccum sum_of_exps = FixedPointAccum::Zero();
+ for (int c = 0; c < depth; ++c) {
+ int32 input_diff =
+ static_cast<int32>(input_data[Offset(input_dims, c, x, y, b)]) -
+ max_in_row;
+ if (input_diff >= diff_min) {
+ const int32 input_diff_rescaled =
+ MultiplyByQuantizedMultiplierGreaterThanOne(
+ input_diff, input_beta_multiplier, input_beta_left_shift);
+ const FixedPointScaledDiff scaled_diff_f8 =
+ FixedPointScaledDiff::FromRaw(input_diff_rescaled);
+ sum_of_exps =
+ sum_of_exps + gemmlowp::Rescale<kAccumulationIntegerBits>(
+ exp_on_negative_values(scaled_diff_f8));
+ }
+ }
+
+ int32 fixed_sum_of_exps = sum_of_exps.raw();
+ int headroom_plus_one =
+ CountLeadingZeros(static_cast<uint32>(fixed_sum_of_exps));
+ // This is the number of bits to the left of the binary point above 1.0.
+ // Consider fixed_sum_of_exps=1.25. In that case shifted_scale=0.8 and
+ // no later adjustment will be needed.
+ int num_bits_over_unit = kAccumulationIntegerBits - headroom_plus_one;
+ int32 shifted_sum_minus_one = static_cast<int32>(
+ (static_cast<uint32>(fixed_sum_of_exps) << headroom_plus_one) -
+ (static_cast<uint32>(1) << 31));
+
+ FixedPoint0 shifted_scale = gemmlowp::one_over_one_plus_x_for_x_in_0_1(
+ FixedPoint0::FromRaw(shifted_sum_minus_one));
+
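+        // shifted_scale is 1 / sum_of_exps up to a power of two, which the
+        // RoundingDivideByPOT below compensates for; each uint8 output ends
+        // up approximating round(256 * probability), clamped to [0, 255].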
+ for (int c = 0; c < depth; ++c) {
+ int32 input_diff =
+ static_cast<int32>(input_data[Offset(input_dims, c, x, y, b)]) -
+ max_in_row;
+ if (input_diff >= diff_min) {
+ const int32 input_diff_rescaled =
+ MultiplyByQuantizedMultiplierGreaterThanOne(
+ input_diff, input_beta_multiplier, input_beta_left_shift);
+ const FixedPointScaledDiff scaled_diff_f8 =
+ FixedPointScaledDiff::FromRaw(input_diff_rescaled);
+
+ FixedPoint0 exp_in_0 = exp_on_negative_values(scaled_diff_f8);
+ int32 unsat_output = gemmlowp::RoundingDivideByPOT(
+ (shifted_scale * exp_in_0).raw(), num_bits_over_unit + 31 - 8);
+
+ output_data[Offset(output_dims, c, x, y, b)] = static_cast<uint8>(
+ std::max(std::min(unsat_output, static_cast<int32>(255)), 0));
+
+ } else {
+ output_data[Offset(output_dims, c, x, y, b)] = 0;
+ }
+ }
+ }
+ }
+ }
+}
+
+inline void Logistic(const float* input_data, const Dims<4>& input_dims,
+ float* output_data, const Dims<4>& output_dims) {
+ const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+ const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
+ const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
+ const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
+ for (int b = 0; b < batches; ++b) {
+ for (int y = 0; y < height; ++y) {
+ for (int x = 0; x < width; ++x) {
+ for (int c = 0; c < depth; ++c) {
+ float val = input_data[Offset(input_dims, c, x, y, b)];
+ float result = 1.f / (1.f + std::exp(-val));
+ output_data[Offset(output_dims, c, x, y, b)] = result;
+ }
+ }
+ }
+ }
+}
+
+inline void Logistic(const uint8* input_data, const Dims<4>& input_dims,
+ int32 input_zero_point, int32 input_range_radius,
+ int32 input_multiplier, int input_left_shift,
+ uint8* output_data, const Dims<4>& output_dims) {
+ const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+ const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
+ const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
+ const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
+ for (int b = 0; b < batches; ++b) {
+ for (int y = 0; y < height; ++y) {
+ for (int x = 0; x < width; ++x) {
+ for (int c = 0; c < depth; ++c) {
+ const uint8 input_val_u8 = input_data[Offset(input_dims, c, x, y, b)];
+ const int32 input_val_centered =
+ static_cast<int32>(input_val_u8) - input_zero_point;
+ uint8 output_val;
+ if (input_val_centered <= -input_range_radius) {
+ output_val = 0;
+ } else if (input_val_centered >= input_range_radius) {
+ output_val = 255;
+ } else {
+ const int32 input_val_rescaled =
+ MultiplyByQuantizedMultiplierGreaterThanOne(
+ input_val_centered, input_multiplier, input_left_shift);
+ using FixedPoint4 = gemmlowp::FixedPoint<int32, 4>;
+ using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;
+ const FixedPoint4 input_val_f4 =
+ FixedPoint4::FromRaw(input_val_rescaled);
+ const FixedPoint0 output_val_f0 = gemmlowp::logistic(input_val_f4);
+ using gemmlowp::RoundingDivideByPOT;
+ int32 output_val_s32 = RoundingDivideByPOT(output_val_f0.raw(), 23);
+ if (output_val_s32 == 256) {
+ output_val_s32 = 255;
+ }
+ TFLITE_DCHECK_GE(output_val_s32, 0);
+ TFLITE_DCHECK_LE(output_val_s32, 255);
+ output_val = static_cast<uint8>(output_val_s32);
+ }
+ output_data[Offset(output_dims, c, x, y, b)] = output_val;
+ }
+ }
+ }
+ }
+}
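+// The quantized Logistic above produces a uint8 output that approximates
+// round(256 * sigmoid(x)) of the real-valued input: inputs at or beyond
+// +/-input_range_radius saturate to 0 and 255 respectively, and an input equal
+// to input_zero_point maps to 128.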
+
+inline void Tanh(const float* input_data, const Dims<4>& input_dims,
+ float* output_data, const Dims<4>& output_dims) {
+ const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+ const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
+ const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
+ const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
+ for (int b = 0; b < batches; ++b) {
+ for (int y = 0; y < height; ++y) {
+ for (int x = 0; x < width; ++x) {
+ for (int c = 0; c < depth; ++c) {
+ float val = input_data[Offset(input_dims, c, x, y, b)];
+ float result = std::tanh(val);
+ output_data[Offset(output_dims, c, x, y, b)] = result;
+ }
+ }
+ }
+ }
+}
+
+inline void Dequantize(const uint8* input_data, const Dims<4>& input_dims,
+ int32 zero_point, double scale, float* output_data,
+ const Dims<4>& output_dims) {
+ const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+ const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
+ const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
+ const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
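+  // For example (illustrative values): with zero_point = 128 and scale = 0.5,
+  // a stored uint8 value of 130 dequantizes to 0.5f * (130 - 128) = 1.0f.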
+ for (int b = 0; b < batches; ++b) {
+ for (int y = 0; y < height; ++y) {
+ for (int x = 0; x < width; ++x) {
+ for (int c = 0; c < depth; ++c) {
+ int32 val = input_data[Offset(input_dims, c, x, y, b)];
+ float result = static_cast<float>(scale * (val - zero_point));
+ output_data[Offset(output_dims, c, x, y, b)] = result;
+ }
+ }
+ }
+ }
+}
+
+inline void FakeQuant(const float* input_data, const Dims<4>& input_dims,
+ float rmin, float rmax, float* output_data,
+ const Dims<4>& output_dims) {
+ // 0 should always be a representable value. Let's assume that the initial
+ // min,max range contains 0.
+ TFLITE_DCHECK_LE(rmin, 0.);
+ TFLITE_DCHECK_GE(rmax, 0.);
+
+ // Determine quantization parameters: zero_point, scale.
+ using Integer = uint8;
+ const Integer qmin = std::numeric_limits<Integer>::min();
+ const Integer qmax = std::numeric_limits<Integer>::max();
+ const float qmin_float = qmin;
+ const float qmax_float = qmax;
+ int32 zero_point = 0;
+ float scale = 0.f;
+ // If rmin==rmax, both must be zero per the above assertion,
+ // so we are done.
+ if (rmin != rmax) {
+ // First determine the scale.
+ scale = (rmax - rmin) / (qmax_float - qmin_float);
+
+ // Zero-point computation.
+ // First the initial floating-point computation. The zero-point can be
+ // determined from solving an affine equation for any known pair
+ // (real value, corresponding quantized value).
+ // We know two such pairs: (rmin, qmin) and (rmax, qmax).
+ // The arithmetic error on the zero point computed from either pair
+ // will be roughly machine_epsilon * (sum of absolute values of terms)
+ // so we want to use the variant that adds the smaller terms.
+ const float zero_point_from_min = qmin_float - rmin / scale;
+ const float zero_point_from_max = qmax_float - rmax / scale;
+ const float zero_point_from_min_error =
+ std::abs(qmin_float) + std::abs(rmin / scale);
+ const float zero_point_from_max_error =
+ std::abs(qmax_float) + std::abs(rmax / scale);
+
+ const float zero_point_float =
+ zero_point_from_min_error < zero_point_from_max_error
+ ? zero_point_from_min
+ : zero_point_from_max;
+
+ // Now we need to nudge the zero point to be an integer
+ // (our zero points are integer, and this is motivated by the requirement
+ // to be able to represent the real value "0" exactly as a quantized value,
+ // which is required in multiple places, for example in Im2col with SAME
+ // padding).
+ if (zero_point_float < qmin_float) {
+ zero_point = qmin;
+ } else if (zero_point_float > qmax_float) {
+ zero_point = qmax;
+ } else {
+ zero_point = static_cast<int32>(TfLiteRound(zero_point_float));
+ }
+ // The zero point should always be in the range of quantized value,
+ // [qmin, qmax].
+ TFLITE_DCHECK_GE(zero_point, qmin);
+ TFLITE_DCHECK_LE(zero_point, qmax);
+ }
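+  // For example (illustrative values): rmin = -1.0f and rmax = 2.0f give
+  // scale = 3.0f / 255 and zero_point = 85, so the real value 0.0f is exactly
+  // representable as the quantized value 85.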
+
+ const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+ const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
+ const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
+ const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
+ for (int b = 0; b < batches; ++b) {
+ for (int y = 0; y < height; ++y) {
+ for (int x = 0; x < width; ++x) {
+ for (int c = 0; c < depth; ++c) {
+ const float src_val = input_data[Offset(input_dims, c, x, y, b)];
+ const float unclamped_quantized_val =
+ TfLiteRound(zero_point + src_val / scale);
+ const float quantized_val = std::min(
+ qmax_float, std::max(qmin_float, unclamped_quantized_val));
+ const float dst_val = scale * (quantized_val - zero_point);
+ output_data[Offset(output_dims, c, x, y, b)] = dst_val;
+ }
+ }
+ }
+ }
+}
+
+template <typename SrcT, typename DstT>
+inline void Cast(const SrcT* input_data, const Dims<4>& input_dims,
+ DstT* output_data, const Dims<4>& output_dims) {
+ const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+ const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
+ const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
+ const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
+ for (int b = 0; b < batches; ++b) {
+ for (int y = 0; y < height; ++y) {
+ for (int x = 0; x < width; ++x) {
+ for (int c = 0; c < depth; ++c) {
+ int offset = Offset(input_dims, c, x, y, b);
+ output_data[offset] = static_cast<DstT>(input_data[offset]);
+ }
+ }
+ }
+ }
+}
+
+inline void Floor(const float* input_data, const Dims<4>& input_dims,
+ float* output_data, const Dims<4>& output_dims) {
+ const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+ const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
+ const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
+ const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
+ for (int b = 0; b < batches; ++b) {
+ for (int y = 0; y < height; ++y) {
+ for (int x = 0; x < width; ++x) {
+ for (int c = 0; c < depth; ++c) {
+ int offset = Offset(input_dims, c, x, y, b);
+ output_data[offset] = std::floor(input_data[offset]);
+ }
+ }
+ }
+ }
+}
+
+template <typename T>
+inline void Gather(const T* input_data, const Dims<4>& input_dims,
+ int input_rank, const int32* coords_data,
+ const Dims<4>& coords_dims, T* output_data,
+ const Dims<4>& output_dims) {
+ TFLITE_DCHECK(coords_dims.sizes[0] == output_dims.sizes[input_rank - 1]);
+ int stride = input_dims.strides[input_rank - 1];
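+  // Each coords_data[i] selects one contiguous slice of `stride` elements from
+  // the input (e.g. one row of a rank-2 input) and copies it to the output.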
+ T* out = output_data;
+
+ for (int i = 0; i < coords_dims.sizes[0]; i++) {
+ TFLITE_DCHECK_GE(coords_data[i], 0);
+ TFLITE_DCHECK_LT(coords_data[i], input_dims.sizes[input_rank - 1]);
+ const T* in = input_data + coords_data[i] * stride;
+ memcpy(out, in, sizeof(T) * stride);
+ out += stride;
+ }
+}
+
+inline void ResizeBilinear(const float* input_data, const Dims<4>& input_dims,
+ const int32* output_size_data,
+ const Dims<4>& output_size_dims, float* output_data,
+ const Dims<4>& output_dims) {
+ int32 batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+ int32 input_height = ArraySize(input_dims, 2);
+ int32 input_width = ArraySize(input_dims, 1);
+ int32 depth = MatchingArraySize(input_dims, 0, output_dims, 0);
+
+ TFLITE_DCHECK_EQ(ArraySize(output_size_dims, 3), 1);
+ TFLITE_DCHECK_EQ(ArraySize(output_size_dims, 2), 1);
+ TFLITE_DCHECK_EQ(ArraySize(output_size_dims, 1), 1);
+ TFLITE_DCHECK_EQ(ArraySize(output_size_dims, 0), 2);
+ int32 output_height = output_size_data[Offset(output_size_dims, 0, 0, 0, 0)];
+ int32 output_width = output_size_data[Offset(output_size_dims, 1, 0, 0, 0)];
+ float height_scale = static_cast<float>(input_height) / output_height;
+ float width_scale = static_cast<float>(input_width) / output_width;
+
+ for (int b = 0; b < batches; ++b) {
+ for (int y = 0; y < output_height; ++y) {
+ float input_y = y * height_scale;
+ int32 y0 = static_cast<int32>(std::floor(input_y));
+ int32 y1 = std::min(y0 + 1, input_height - 1);
+ for (int x = 0; x < output_width; ++x) {
+ float input_x = x * width_scale;
+ int32 x0 = static_cast<int32>(std::floor(input_x));
+ int32 x1 = std::min(x0 + 1, input_width - 1);
+ for (int c = 0; c < depth; ++c) {
+ float interpolation = input_data[Offset(input_dims, c, x0, y0, b)] *
+ (1 - (input_y - y0)) *
+ (1 - (input_x - x0)) +
+ input_data[Offset(input_dims, c, x0, y1, b)] *
+ (input_y - y0) * (1 - (input_x - x0)) +
+ input_data[Offset(input_dims, c, x1, y0, b)] *
+ (1 - (input_y - y0)) * (input_x - x0) +
+ input_data[Offset(input_dims, c, x1, y1, b)] *
+ (input_y - y0) * (input_x - x0);
+ output_data[Offset(output_dims, c, x, y, b)] = interpolation;
+ }
+ }
+ }
+ }
+}
+
+template <typename T>
+inline void SpaceToBatchND(const T* input_data, const Dims<4>& input_dims,
+ const int32* block_shape_data,
+ const Dims<4>& block_shape_dims,
+ const int32* paddings_data,
+ const Dims<4>& paddings_dims, T* output_data,
+ const Dims<4>& output_dims) {
+ const int output_batch_size = ArraySize(output_dims, 3);
+ const int output_height = ArraySize(output_dims, 2);
+ const int output_width = ArraySize(output_dims, 1);
+ const int input_batch_size = ArraySize(input_dims, 3);
+ const int input_height = ArraySize(input_dims, 2);
+ const int input_width = ArraySize(input_dims, 1);
+ const int depth = ArraySize(input_dims, 0);
+ const int block_shape_height = block_shape_data[0];
+ const int block_shape_width = block_shape_data[1];
+ const int padding_top = paddings_data[0];
+ const int padding_left = paddings_data[2];
+
+ for (int out_b = 0; out_b < output_batch_size; ++out_b) {
+ int input_batch = out_b % input_batch_size;
+ int shift_w = (out_b / input_batch_size) % block_shape_width;
+ int shift_h = (out_b / input_batch_size) / block_shape_width;
+ for (int out_h = 0; out_h < output_height; ++out_h) {
+ for (int out_w = 0; out_w < output_width; ++out_w) {
+ T* out = output_data + Offset(output_dims, 0, out_w, out_h, out_b);
+ if (out_h * block_shape_height < padding_top ||
+ out_h * block_shape_height >= padding_top + input_height ||
+ out_w * block_shape_width < padding_left ||
+ out_w * block_shape_width >= padding_left + input_width) {
+ memset(out, 0, depth * sizeof(T));
+ } else {
+ const T* in =
+ input_data +
+ Offset(input_dims, 0,
+ (out_w * block_shape_width + shift_w) - padding_left,
+ (out_h * block_shape_height + shift_h) - padding_top,
+ input_batch);
+ memcpy(out, in, depth * sizeof(T));
+ }
+ }
+ }
+ }
+}
+
+template <typename T>
+inline void BatchToSpaceND(const T* input_data, const Dims<4>& input_dims,
+ const int32* block_shape_data,
+ const Dims<4>& block_shape_dims, T* output_data,
+ const Dims<4>& output_dims) {
+ const int output_batch_size = ArraySize(output_dims, 3);
+ const int input_batch_size = ArraySize(input_dims, 3);
+ const int input_height = ArraySize(input_dims, 2);
+ const int input_width = ArraySize(input_dims, 1);
+ const int depth = ArraySize(input_dims, 0);
+ const int block_shape_width = block_shape_data[1];
+ const int block_shape_height = block_shape_data[0];
+
+ for (int in_batch = 0; in_batch < input_batch_size; ++in_batch) {
+ for (int in_h = 0; in_h < input_height; ++in_h) {
+ for (int in_w = 0; in_w < input_width; ++in_w) {
+ int out_batch = in_batch % output_batch_size;
+ int out_w = in_w * block_shape_width +
+ (in_batch / output_batch_size) % block_shape_width;
+ int out_h = in_h * block_shape_height +
+ (in_batch / output_batch_size) / block_shape_width;
+ T* out = output_data + Offset(output_dims, 0, out_w, out_h, out_batch);
+ const T* in = input_data + Offset(input_dims, 0, in_w, in_h, in_batch);
+ memcpy(out, in, depth * sizeof(T));
+ }
+ }
+ }
+}
+
+template <typename T>
+inline void Pad(const T* input_data, const Dims<4>& input_dims,
+ const std::vector<int>& left_paddings,
+ const std::vector<int>& right_paddings, T* output_data,
+ const Dims<4>& output_dims) {
+ const int output_batch = ArraySize(output_dims, 3);
+ const int output_height = ArraySize(output_dims, 2);
+ const int output_width = ArraySize(output_dims, 1);
+ const int output_depth = ArraySize(output_dims, 0);
+
+ const int left_b_padding = left_paddings[3];
+ const int left_h_padding = left_paddings[2];
+ const int left_w_padding = left_paddings[1];
+ const int left_d_padding = left_paddings[0];
+
+ const int right_b_padding = right_paddings[3];
+ const int right_h_padding = right_paddings[2];
+ const int right_w_padding = right_paddings[1];
+ const int right_d_padding = right_paddings[0];
+
+ const T* in_ptr = input_data;
+ T* out_ptr = output_data;
+ for (int out_b = 0; out_b < output_batch; ++out_b) {
+ for (int out_h = 0; out_h < output_height; ++out_h) {
+ for (int out_w = 0; out_w < output_width; ++out_w) {
+ for (int out_d = 0; out_d < output_depth; ++out_d) {
+ if (out_b < left_b_padding ||
+ out_b >= output_batch - right_b_padding ||
+ out_h < left_h_padding ||
+ out_h >= output_height - right_h_padding ||
+ out_w < left_w_padding ||
+ out_w >= output_width - right_w_padding ||
+ out_d < left_d_padding ||
+ out_d >= output_depth - right_d_padding) {
+ *out_ptr++ = 0;
+ } else {
+ *out_ptr++ = *in_ptr++;
+ }
+ }
+ }
+ }
+ }
+}
+
+template <typename T>
+inline void StridedSlice(const T* input_data, const Dims<4>& input_dims,
+ int begin_mask, int end_mask,
+ const std::vector<int>& starts,
+ const std::vector<int>& stops,
+ const std::vector<int>& strides, T* output_data,
+ const Dims<4>& output_dims) {
+ const int start_b = (begin_mask & 8) ? 0 : starts[3];
+ const int stop_b = (end_mask & 8) ? input_dims.sizes[3] : stops[3];
+ const int start_h = (begin_mask & 4) ? 0 : starts[2];
+ const int stop_h = (end_mask & 4) ? input_dims.sizes[2] : stops[2];
+ const int start_w = (begin_mask & 2) ? 0 : starts[1];
+ const int stop_w = (end_mask & 2) ? input_dims.sizes[1] : stops[1];
+ const int start_d = (begin_mask & 1) ? 0 : starts[0];
+ const int stop_d = (end_mask & 1) ? input_dims.sizes[0] : stops[0];
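+  // In the masks above, bit 1 corresponds to the depth dimension, bit 2 to
+  // width, bit 4 to height and bit 8 to batch; e.g. begin_mask == 0xF starts
+  // every dimension at 0 regardless of `starts`.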
+
+ T* out_ptr = output_data;
+ for (int in_b = start_b; in_b < stop_b; in_b += strides[3]) {
+ for (int in_h = start_h; in_h < stop_h; in_h += strides[2]) {
+ for (int in_w = start_w; in_w < stop_w; in_w += strides[1]) {
+ for (int in_d = start_d; in_d < stop_d; in_d += strides[0]) {
+ *out_ptr++ = input_data[Offset(input_dims, in_d, in_w, in_h, in_b)];
+ }
+ }
+ }
+ }
+}
+
+template <typename T>
+inline void Slice(const T* input_data, const Dims<4>& input_dims,
+ const std::vector<int>& begin, const std::vector<int>& size,
+ T* output_data, const Dims<4>& output_dims) {
+ // TODO(dkalenichenko): This op only supports 4D tensors.
+ TFLITE_DCHECK_EQ(begin.size(), 4);
+ TFLITE_DCHECK_EQ(size.size(), 4);
+ const int start_b = begin[3];
+ const int stop_b =
+ size[3] == -1 ? input_dims.sizes[3] - start_b : start_b + size[3];
+ const int start_h = begin[2];
+ const int stop_h =
+      size[2] == -1 ? input_dims.sizes[2] - start_h : start_h + size[2];
+ const int start_w = begin[1];
+ const int stop_w =
+      size[1] == -1 ? input_dims.sizes[1] - start_w : start_w + size[1];
+ const int start_d = begin[0];
+ const int stop_d =
+ size[0] == -1 ? input_dims.sizes[0] - start_d : start_d + size[0];
+
+ T* out_ptr = output_data;
+ for (int in_b = start_b; in_b < stop_b; ++in_b) {
+ for (int in_h = start_h; in_h < stop_h; ++in_h) {
+ for (int in_w = start_w; in_w < stop_w; ++in_w) {
+ for (int in_d = start_d; in_d < stop_d; ++in_d) {
+ *out_ptr++ = input_data[Offset(input_dims, in_d, in_w, in_h, in_b)];
+ }
+ }
+ }
+ }
+}
+
+template <typename T>
+inline void Mean(const T* input_data, const Dims<4>& input_dims,
+ const std::vector<int>& reduction_indices, T* output_data,
+ const Dims<4>& output_dims) {
+ const int output_batch = ArraySize(output_dims, 3);
+ const int output_height = ArraySize(output_dims, 2);
+ const int output_width = ArraySize(output_dims, 1);
+ const int output_depth = ArraySize(output_dims, 0);
+
+ const int input_height = ArraySize(input_dims, 2);
+ const int input_width = ArraySize(input_dims, 1);
+
+ // The current implementation only supports simultaneous reduction over
+ // width and height.
+ TFLITE_DCHECK_EQ(reduction_indices.size(), 2);
+ TFLITE_DCHECK((reduction_indices[0] == 1 && reduction_indices[1] == 2) ||
+ (reduction_indices[0] == 2 && reduction_indices[1] == 1));
+ TFLITE_DCHECK_EQ(output_height, 1);
+ TFLITE_DCHECK_EQ(output_width, 1);
+
+ for (int out_b = 0; out_b < output_batch; ++out_b) {
+ for (int out_d = 0; out_d < output_depth; ++out_d) {
+ float value = 0;
+ for (int in_h = 0; in_h < input_height; ++in_h) {
+ for (int in_w = 0; in_w < input_width; ++in_w) {
+ value += input_data[Offset(input_dims, out_d, in_w, in_h, out_b)];
+ }
+ }
+ output_data[Offset(output_dims, out_d, 0, 0, out_b)] =
+ value / (input_width * input_height);
+ }
+ }
+}
+
+template <typename T>
+void Sub(const T* input1_data, const Dims<4>& input1_dims, const T* input2_data,
+ const Dims<4>& input2_dims, T* output_data,
+ const Dims<4>& output_dims) {
+ NdArrayDesc<4> desc1;
+ NdArrayDesc<4> desc2;
+ NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
+
+  // In TensorFlow, the dimensions are canonically named (batch_number, row,
+  // col, channel), with extents (batches, height, width, depth), and with the
+  // trailing dimension changing most rapidly (the channel dimension has the
+  // smallest stride, typically 1 element).
+  //
+  // In generated C code, we store arrays with the dimensions reversed: the
+  // first dimension has the smallest stride.
+  //
+  // We name our variables by the TensorFlow convention, but generate C code
+  // nesting loops such that the innermost loop has the smallest stride, for
+  // the best cache behavior.
+ for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
+ for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
+ for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
+ for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
+ output_data[Offset(output_dims, c, x, y, b)] =
+ input1_data[SubscriptToIndex(desc1, c, x, y, b)] -
+ input2_data[SubscriptToIndex(desc2, c, x, y, b)];
+ }
+ }
+ }
+ }
+}
+
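+// Elementwise minimum of input1 and a scalar threshold taken from
+// input2_data[0] (the second input is treated as a scalar).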
+template <typename T>
+void TensorFlowMinimum(const T* input1_data, const Dims<4>& input1_dims,
+ const T* input2_data, T* output_data,
+ const Dims<4>& output_dims) {
+ int batches = MatchingArraySize(input1_dims, 3, output_dims, 3);
+ int input_height = MatchingArraySize(input1_dims, 2, output_dims, 2);
+ int input_width = MatchingArraySize(input1_dims, 1, output_dims, 1);
+ int depth = MatchingArraySize(input1_dims, 0, output_dims, 0);
+
+ auto min_value = input2_data[0];
+
+ for (int b = 0; b < batches; b++) {
+ for (int y = 0; y < input_height; y++) {
+ for (int x = 0; x < input_width; x++) {
+ for (int c = 0; c < depth; c++) {
+ int offset = Offset(input1_dims, c, x, y, b);
+ output_data[offset] =
+ input1_data[offset] > min_value ? min_value : input1_data[offset];
+ }
+ }
+ }
+ }
+}
+
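+// Elementwise maximum of input1 and a scalar floor value taken from
+// input2_data[0] (the second input is treated as a scalar).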
+template <typename T>
+void TensorFlowMaximum(const T* input1_data, const Dims<4>& input1_dims,
+ const T* input2_data, T* output_data,
+ const Dims<4>& output_dims) {
+ int batches = MatchingArraySize(input1_dims, 3, output_dims, 3);
+ int input_height = MatchingArraySize(input1_dims, 2, output_dims, 2);
+ int input_width = MatchingArraySize(input1_dims, 1, output_dims, 1);
+ int depth = MatchingArraySize(input1_dims, 0, output_dims, 0);
+
+ auto max_value = input2_data[0];
+
+ for (int b = 0; b < batches; b++) {
+ for (int y = 0; y < input_height; y++) {
+ for (int x = 0; x < input_width; x++) {
+ for (int c = 0; c < depth; c++) {
+ int offset = Offset(input1_dims, c, x, y, b);
+ output_data[offset] =
+ input1_data[offset] < max_value ? max_value : input1_data[offset];
+ }
+ }
+ }
+ }
+}
+
+} // namespace reference_ops
+} // namespace tflite
+
+#endif // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_REFERENCE_OPS_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/round.h b/tensorflow/contrib/lite/kernels/internal/round.h
new file mode 100644
index 0000000000..38525b0e20
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/internal/round.h
@@ -0,0 +1,39 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_ROUND_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_ROUND_H_
+
+#include <cmath>
+
+namespace tflite {
+
+// TODO(aselle): See if we can do this only on jdk. Also mikecase, check
+// if you need this for java host build.
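+// Both branches round halfway cases away from zero, e.g.
+// TfLiteRound(2.5f) == 3.0f and TfLiteRound(-2.5f) == -3.0f.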
+#if defined(__ANDROID__) && !defined(__NDK_MAJOR__)
+inline float TfLiteRound(const float x) {
+  return ::round(x);
+}
+inline double TfLiteRound(const double x) { return ::round(x); }
+#else
+template <class T>
+inline T TfLiteRound(const T x) {
+ return std::round(x);
+}
+#endif
+
+} // namespace tflite
+
+#endif // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_ROUND_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/tensor.h b/tensorflow/contrib/lite/kernels/internal/tensor.h
new file mode 100644
index 0000000000..ee4111e041
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/internal/tensor.h
@@ -0,0 +1,87 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_TENSOR_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_TENSOR_H_
+
+#include <vector>
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/internal/types.h"
+
+namespace tflite {
+
+template <typename T>
+inline T* GetTensorData(TfLiteTensor* tensor);
+
+template <>
+inline float* GetTensorData(TfLiteTensor* tensor) {
+ return tensor != nullptr ? tensor->data.f : nullptr;
+}
+
+template <>
+inline uint8_t* GetTensorData(TfLiteTensor* tensor) {
+ return tensor != nullptr ? tensor->data.uint8 : nullptr;
+}
+
+template <>
+inline int32_t* GetTensorData(TfLiteTensor* tensor) {
+ return tensor != nullptr ? tensor->data.i32 : nullptr;
+}
+
+template <>
+inline int64_t* GetTensorData(TfLiteTensor* tensor) {
+ return tensor != nullptr ? reinterpret_cast<int64_t*>(tensor->data.raw)
+ : nullptr;
+}
+
+inline int RemapDim(int max_dimensions, int d) {
+ return max_dimensions - d - 1;
+}
+
+// TODO(ahentz): the implementations in kernels/internal/ take a Dims<4> object
+// even if the original tensors were not 4D. We should consider rewriting them
+// to take a more generic 'shape' object.
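+// For example, a shape of {2, 3, 4, 5} yields sizes = {5, 4, 3, 2} and
+// strides = {1, 5, 20, 60}: dimensions are reversed and strides are built
+// innermost-first.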
+inline Dims<4> GetTensorDims(const int data[], const int size) {
+ Dims<4> d;
+ for (int i = 0; i < 4; ++i) {
+ int src = size - i - 1;
+ if (src >= 0) {
+ d.sizes[i] = data[src];
+ } else {
+ d.sizes[i] = 1;
+ }
+ }
+ d.strides[0] = 1;
+ for (int i = 1; i < 4; i++) {
+ d.strides[i] = d.strides[i - 1] * d.sizes[i - 1];
+ }
+ return d;
+}
+
+inline Dims<4> GetTensorDims(std::vector<int32_t> data) {
+ return GetTensorDims(data.data(), data.size());
+}
+
+inline Dims<4> GetTensorDims(const TfLiteTensor* tensor) {
+ if (tensor == nullptr) {
+ return Dims<4>();
+ }
+
+ auto* dims = tensor->dims;
+ return GetTensorDims(dims->data, dims->size);
+}
+
+} // namespace tflite
+
+#endif // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_TENSOR_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/tensor_test.cc b/tensorflow/contrib/lite/kernels/internal/tensor_test.cc
new file mode 100644
index 0000000000..bf2068d320
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/internal/tensor_test.cc
@@ -0,0 +1,55 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAre;
+
+TEST(TensorTest, GetTensorDims4D) {
+ Dims<4> d = GetTensorDims({2, 3, 4, 5});
+ EXPECT_THAT(d.sizes, ElementsAre(5, 4, 3, 2));
+ EXPECT_THAT(d.strides, ElementsAre(1, 5, 20, 60));
+}
+
+TEST(TensorTest, GetTensorDims3D) {
+ Dims<4> d = GetTensorDims({3, 4, 5});
+ EXPECT_THAT(d.sizes, ElementsAre(5, 4, 3, 1));
+ EXPECT_THAT(d.strides, ElementsAre(1, 5, 20, 60));
+}
+
+TEST(TensorTest, GetTensorDims2D) {
+ Dims<4> d = GetTensorDims({4, 5});
+ EXPECT_THAT(d.sizes, ElementsAre(5, 4, 1, 1));
+ EXPECT_THAT(d.strides, ElementsAre(1, 5, 20, 20));
+}
+
+TEST(TensorTest, GetTensorDims1D) {
+ Dims<4> d = GetTensorDims({5});
+ EXPECT_THAT(d.sizes, ElementsAre(5, 1, 1, 1));
+ EXPECT_THAT(d.strides, ElementsAre(1, 5, 5, 5));
+}
+
+} // namespace
+} // namespace tflite
+
+int main(int argc, char** argv) {
+ // On Linux, add: tflite::LogToStderr();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/internal/tensor_utils.cc b/tensorflow/contrib/lite/kernels/internal/tensor_utils.cc
new file mode 100644
index 0000000000..904a97803a
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/internal/tensor_utils.cc
@@ -0,0 +1,27 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/kernels/internal/tensor_utils.h"
+
+#ifndef USE_NEON
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+#define USE_NEON
+#endif // defined(__ARM_NEON__) || defined(__ARM_NEON)
+#endif // USE_NEON
+
+#ifdef USE_NEON
+#include "tensorflow/contrib/lite/kernels/internal/optimized/neon_tensor_utils.h"
+#else
+#include "tensorflow/contrib/lite/kernels/internal/reference/portable_tensor_utils.h"
+#endif // USE_NEON
diff --git a/tensorflow/contrib/lite/kernels/internal/tensor_utils.h b/tensorflow/contrib/lite/kernels/internal/tensor_utils.h
new file mode 100644
index 0000000000..0e69ef5982
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/internal/tensor_utils.h
@@ -0,0 +1,116 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_TENSOR_UTILS_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_TENSOR_UTILS_H_
+
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+
+namespace tflite {
+namespace tensor_utils {
+
+// Limit a float input f to the range [-abs_limit, +abs_limit].
+float Clip(float f, float abs_limit);
+
+// Multiply a matrix by a batch vector, and store the results in a batch-size
+// vector, using the stride value provided in result_stride. 'result_stride'
+// gives the number of elements between consecutive result values. For example,
+// result_stride = 1 lays the output out contiguously as
+// [O_1, O_2, ..., O_rows] in memory, while result_stride = 3 arranges it as
+// [O_1, x, x, O_2, x, x, ..., O_rows].
+void MatrixBatchVectorMultiplyAccumulate(const float* matrix, int m_rows,
+ int m_cols, const float* vector,
+ int n_batch, float* result,
+ int result_stride);
+
+// Cwise product of two vectors.
+void VectorVectorCwiseProduct(const float* vector1, const float* vector2,
+ int v_size, float* result);
+
+// Cwise product and accumulate of two vectors. Since it's a MAC operation, the
+// assumption here is that the result array is initialized to valid values.
+void VectorVectorCwiseProductAccumulate(const float* vector1,
+ const float* vector2, int v_size,
+ float* result);
+
+// Dot product of two vectors.
+float VectorVectorDotProduct(const float* vector1, const float* vector2,
+ int v_size);
+
+// Dot product of two batch vectors of size n_batch * v_size:
+// vector1 = [x_1_1, x_1_2, ..., x_1_vsize,
+// x_2_1, x_2_2, ..., x_2_vsize,
+// ...
+// x_nbatch_1,..., x_nbatch_vsize]
+// vector2 = [y_1_1, y_1_2, ..., y_1_vsize,
+// y_2_1, y_2_2, ..., y_2_vsize,
+// ...
+// y_nbatch_1,..., y_nbatch_vsize]
+// The result will then be a vector of size n_batch, stored with a stride of
+// result_stride in memory starting at 'result':
+// [x_1_1 * y_1_1 + x_1_2 * y_1_2 + ... + x_1_vsize * y_1_vsize,
+// x_2_1 * y_2_1 + x_2_2 * y_2_2 + ... + x_2_vsize * y_2_vsize,
+// ...
+// x_nbatch_1 * y_nbatch_1 + ... + x_nbatch_vsize * y_nbatch_vsize]
+void BatchVectorBatchVectorDotProduct(const float* vector1,
+ const float* vector2, int v_size,
+ int n_batch, float* result,
+ int result_stride);
+
+// Cwise product and accumulate of a vector and a batch-vector. Since it's a
+// MAC operation, the assumption here is that the result array is initialized
+// to valid values.
+void VectorBatchVectorCwiseProductAccumulate(const float* vector, int v_size,
+ const float* batch_vector,
+ int n_batch, float* result);
+
+// Batch vector initialization with another vector.
+void VectorBatchVectorAssign(const float* vector, int v_size, int n_batch,
+ float* batch_vector);
+
+// Apply sigmoid to elements of a vector.
+void ApplySigmoidToVector(const float* vector, int v_size, float* result);
+
+// Apply activation function to elements of a vector.
+void ApplyActivationToVector(const float* vector, int v_size,
+ TfLiteFusedActivation activation, float* result);
+
+// Copy vector to another vector.
+void CopyVector(const float* vector, int v_size, float* result);
+
+// Compute "1.0f - elements of vector" (used in CIFG).
+void Sub1Vector(const float* vector, int v_size, float* result);
+
+// Fill vector with 0.f.
+void ZeroVector(float* vector, int v_size);
+
+// Clip elements of a vector using an abs_limit value.
+void ClipVector(const float* vector, int v_size, float abs_limit,
+ float* result);
+
+// Shift a vector of size v_size left by one element in place, storing
+// shift_value into the last element.
+void VectorShiftLeft(float* vector, int v_size, float shift_value);
+
+// Reduce-sum on a float input vector:
+// input_vector: float pointer to input vector.
+// output_vector: float pointer to output vector.
+// output_size: output vector size.
+// reduction_size: number of consecutive elements from input vector which are
+// added to get one element of output.
+void ReductionSumVector(const float* input_vector, float* output_vector,
+ int output_size, int reduction_size);
+} // namespace tensor_utils
+} // namespace tflite
+
+#endif // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_TENSOR_UTILS_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/tensor_utils_test.cc b/tensorflow/contrib/lite/kernels/internal/tensor_utils_test.cc
new file mode 100644
index 0000000000..588f1a428b
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/internal/tensor_utils_test.cc
@@ -0,0 +1,192 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/kernels/internal/tensor_utils.h"
+#include <gmock/gmock.h>
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+
+namespace tflite {
+namespace tensor_utils {
+
+TEST(uKernels, ClipTest) {
+ constexpr int kVectorSize = 10;
+ constexpr float kAbsLimit = 2.0;
+ static float input[kVectorSize] = {0.0, -0.5, 1.0, -1.5, 2.0,
+ -2.5, 3.0, -3.5, 4.0, -4.5};
+ std::vector<float> output(kVectorSize);
+ ClipVector(input, kVectorSize, kAbsLimit, output.data());
+ EXPECT_THAT(output,
+ ElementsAreArray(ArrayFloatNear(
+ {0.0, -0.5, 1.0, -1.5, 2.0, -2.0, 2.0, -2.0, 2.0, -2.0})));
+}
+
+TEST(uKernels, MatrixBatchVectorMultiplyAccumulateTest) {
+ constexpr int kRow = 3;
+ constexpr int kCol = 4;
+ constexpr int kBatch = 2;
+ static float matrix[kRow * kCol] = {1.0, 2.0, 3.0, 4.0, //
+ -1.0, -2.0, -3.0, -4.0, //
+ 1.0, -2.0, 3.0, -4.0};
+ static float vector[kCol * kBatch] = {1.0, -1.0, 1.0, -1.0, //
+ 2.0, -2.0, 2.0, -2.0};
+ std::vector<float> output(kRow * kBatch);
+ std::fill(output.begin(), output.end(), 3.0);
+ MatrixBatchVectorMultiplyAccumulate(matrix, kRow, kCol, vector, kBatch,
+ output.data(), /*result_stride=*/1);
+ EXPECT_THAT(output, ElementsAreArray(ArrayFloatNear({1., 5., 13., //
+ -1., 7., 23.})));
+
+ std::vector<float> output_with_stride2(kRow * kBatch * 2);
+ std::fill(output_with_stride2.begin(), output_with_stride2.end(), 3.0);
+ MatrixBatchVectorMultiplyAccumulate(matrix, kRow, kCol, vector, kBatch,
+ output_with_stride2.data(),
+ /*result_stride=*/2);
+ EXPECT_THAT(output_with_stride2,
+ ElementsAreArray(ArrayFloatNear({1., 3., 5., 3., 13., 3., //
+ -1., 3., 7., 3., 23., 3.})));
+}
+
+TEST(uKernels, VectorVectorCwiseProductTest) {
+ constexpr int kVectorSize = 10;
+ static float input1[kVectorSize] = {0.0, -0.5, 1.0, -1.5, 2.0,
+ -2.5, 3.0, -3.5, 4.0, -4.5};
+ static float input2[kVectorSize] = {0.1, -0.1, 0.1, -0.1, 0.1,
+ -0.1, 0.1, -0.1, 0.1, -0.1};
+ std::vector<float> output(kVectorSize);
+ VectorVectorCwiseProduct(input1, input2, kVectorSize, output.data());
+ EXPECT_THAT(output,
+ ElementsAreArray(ArrayFloatNear(
+ {0.0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45})));
+}
+
+TEST(uKernels, VectorVectorCwiseProductAccumulateTest) {
+ constexpr int kVectorSize = 10;
+ static float input1[kVectorSize] = {0.0, -0.5, 1.0, -1.5, 2.0,
+ -2.5, 3.0, -3.5, 4.0, -4.5};
+ static float input2[kVectorSize] = {0.1, -0.1, 0.1, -0.1, 0.1,
+ -0.1, 0.1, -0.1, 0.1, -0.1};
+ std::vector<float> output(kVectorSize);
+ std::fill(output.begin(), output.end(), 1.0);
+ VectorVectorCwiseProductAccumulate(input1, input2, kVectorSize,
+ output.data());
+ EXPECT_THAT(output,
+ ElementsAreArray(ArrayFloatNear(
+ {1.0, 1.05, 1.1, 1.15, 1.2, 1.25, 1.3, 1.35, 1.4, 1.45})));
+}
+
+TEST(uKernels, VectorBatchVectorAssignTest) {
+ constexpr int kVectorSize = 5;
+ constexpr int kBatchSize = 3;
+ static float input[kVectorSize] = {0.0, -0.5, 1.0, -1.5, 2.0};
+ std::vector<float> output(kVectorSize * kBatchSize);
+ VectorBatchVectorAssign(input, kVectorSize, kBatchSize, output.data());
+ EXPECT_THAT(output, ElementsAreArray(ArrayFloatNear(
+ {0.0, -0.5, 1.0, -1.5, 2.0, 0.0, -0.5, 1.0, -1.5, 2.0,
+ 0.0, -0.5, 1.0, -1.5, 2.0})));
+}
+
+TEST(uKernels, ApplySigmoidToVectorTest) {
+ constexpr int kVectorSize = 5;
+ static float input[kVectorSize] = {0.0, -0.5, 1.0, -1.5, 2.0};
+ std::vector<float> output(kVectorSize);
+ ApplySigmoidToVector(input, kVectorSize, output.data());
+ EXPECT_THAT(output, ElementsAreArray(ArrayFloatNear(
+ {0.5, 0.377541, 0.731059, 0.182426, 0.880797})));
+}
+
+TEST(uKernels, ApplyActivationToVectorTest) {
+ constexpr int kVectorSize = 5;
+ static float input[kVectorSize] = {0.0, -0.5, 1.0, -1.5, 2.0};
+ std::vector<float> output(kVectorSize);
+ ApplyActivationToVector(input, kVectorSize, kTfLiteActRelu, output.data());
+ EXPECT_THAT(output,
+ ElementsAreArray(ArrayFloatNear({0.0, 0.0, 1.0, 0.0, 2.0})));
+
+ ApplyActivationToVector(input, kVectorSize, kTfLiteActTanh, output.data());
+ EXPECT_THAT(output, ElementsAreArray(ArrayFloatNear(
+ {0.0, -0.462117, 0.761594, -0.905148, 0.964028})));
+}
+
+TEST(uKernels, CopyVectorTest) {
+ constexpr int kVectorSize = 5;
+ static float input[kVectorSize] = {0.0, -0.5, 1.0, -1.5, 2.0};
+ std::vector<float> output(kVectorSize);
+ CopyVector(input, kVectorSize, output.data());
+ EXPECT_THAT(output,
+ ElementsAreArray(ArrayFloatNear({0.0, -0.5, 1.0, -1.5, 2.0})));
+}
+
+TEST(uKernels, Sub1VectorTest) {
+ constexpr int kVectorSize = 5;
+ static float input[kVectorSize] = {0.0, -0.5, 1.0, -1.5, 2.0};
+ std::vector<float> output(kVectorSize);
+ Sub1Vector(input, kVectorSize, output.data());
+ EXPECT_THAT(output,
+ ElementsAreArray(ArrayFloatNear({1.0, 1.5, 0.0, 2.5, -1.0})));
+}
+
+TEST(uKernels, ZeroVectorTest) {
+ constexpr int kVectorSize = 5;
+ std::vector<float> output(kVectorSize);
+ ZeroVector(output.data(), kVectorSize);
+ EXPECT_THAT(output,
+ ElementsAreArray(ArrayFloatNear({0.0, 0.0, 0.0, 0.0, 0.0})));
+}
+
+TEST(uKernels, BatchVectorBatchVectorDotProductTest) {
+ constexpr int kVectorSize = 5;
+ constexpr int kBatch = 2;
+ static float input1[kVectorSize * kBatch] = {0.0, -0.5, 1.0, -1.5, 2.0,
+ -2.5, 3.0, -3.5, 4.0, -4.5};
+ static float input2[kVectorSize * kBatch] = {0.1, -0.1, 0.1, -0.1, 0.1,
+ -0.1, 0.1, -0.1, 0.1, -0.1};
+ std::vector<float> output(kBatch);
+ BatchVectorBatchVectorDotProduct(input1, input2, kVectorSize, kBatch,
+ output.data(), /*result_stride=*/1);
+ EXPECT_THAT(output, ElementsAreArray(ArrayFloatNear({0.5, 1.75})));
+}
+
+TEST(uKernels, VectorShiftLeftTest) {
+ constexpr int kVectorSize = 5;
+ static float input[kVectorSize] = {0.0, -0.5, 1.0, -1.5, 2.0};
+ std::vector<float> result(kVectorSize);
+ VectorShiftLeft(input, kVectorSize, 3.0);
+ result.assign(input, input + kVectorSize);
+ EXPECT_THAT(result,
+ ElementsAreArray(ArrayFloatNear({-0.5, 1.0, -1.5, 2.0, 3.0})));
+}
+
+TEST(uKernels, ReductionSumVectorTest) {
+ constexpr int kInputVectorSize = 10;
+ constexpr int kOutputVectorSize1 = 5;
+ constexpr int kReductionSize1 = 2;
+ static float input[kInputVectorSize] = {0.0, -0.5, 1.0, -1.5, 2.0,
+ 0.0, -0.5, 1.0, 1.0, 2.0};
+ std::vector<float> result1(kOutputVectorSize1);
+ ReductionSumVector(input, result1.data(), kOutputVectorSize1,
+ kReductionSize1);
+ EXPECT_THAT(result1,
+ ElementsAreArray(ArrayFloatNear({-0.5, -0.5, 2.0, 0.5, 3.0})));
+
+ constexpr int kOutputVectorSize2 = 2;
+ constexpr int kReductionSize2 = 5;
+ std::vector<float> result2(kOutputVectorSize2);
+ ReductionSumVector(input, result2.data(), kOutputVectorSize2,
+ kReductionSize2);
+ EXPECT_THAT(result2, ElementsAreArray(ArrayFloatNear({1.0, 3.5})));
+}
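+// A quick sanity check of the expectations above, assuming ReductionSumVector
+// sums kReductionSize consecutive inputs per output element (which the
+// expected values imply): the first call sums pairs, 0.0 + (-0.5) = -0.5,
+// 1.0 + (-1.5) = -0.5, 2.0 + 0.0 = 2.0, -0.5 + 1.0 = 0.5, 1.0 + 2.0 = 3.0;
+// the second call sums the two halves of the input, giving 1.0 and 3.5.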
+
+} // namespace tensor_utils
+} // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/internal/types.h b/tensorflow/contrib/lite/kernels/internal/types.h
new file mode 100644
index 0000000000..07f1cb4004
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/internal/types.h
@@ -0,0 +1,81 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_TYPES_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_TYPES_H_
+
+#include "tensorflow/contrib/lite/kernels/internal/compatibility.h"
+
+namespace tflite {
+
+enum class FusedActivationFunctionType : uint8 { kNone, kRelu6, kRelu1, kRelu };
+
+template <int N>
+struct Dims {
+ int sizes[N];
+ int strides[N];
+};
+
+inline int Offset(const Dims<4>& dims, int i0, int i1, int i2, int i3) {
+ TFLITE_DCHECK(i0 >= 0 && i0 < dims.sizes[0]);
+ TFLITE_DCHECK(i1 >= 0 && i1 < dims.sizes[1]);
+ TFLITE_DCHECK(i2 >= 0 && i2 < dims.sizes[2]);
+ TFLITE_DCHECK(i3 >= 0 && i3 < dims.sizes[3]);
+ return i0 * dims.strides[0] + i1 * dims.strides[1] + i2 * dims.strides[2] +
+ i3 * dims.strides[3];
+}
+
+// Get array size, DCHECKing that the dim index is in range.
+template <int N>
+int ArraySize(const Dims<N>& array, int index) {
+ TFLITE_DCHECK(index >= 0 && index < N);
+ return array.sizes[index];
+}
+
+// Get common array size, DCHECKing that they all agree.
+template <typename ArrayType1, typename ArrayType2>
+int MatchingArraySize(const ArrayType1& array1, int index1,
+ const ArrayType2& array2, int index2) {
+ TFLITE_DCHECK_EQ(ArraySize(array1, index1), ArraySize(array2, index2));
+ return ArraySize(array1, index1);
+}
+
+template <typename ArrayType1, typename ArrayType2, typename... Args>
+int MatchingArraySize(const ArrayType1& array1, int index1,
+ const ArrayType2& array2, int index2, Args... args) {
+ TFLITE_DCHECK_EQ(ArraySize(array1, index1), ArraySize(array2, index2));
+ return MatchingArraySize(array1, index1, args...);
+}
+
+inline int RequiredBufferSizeForDims(const Dims<4>& dims) {
+ int max_offset = 0;
+ for (int i = 0; i < 4; i++) {
+ max_offset += (dims.sizes[i] - 1) * dims.strides[i];
+ }
+ return max_offset + 1;
+}
+
+template <int N>
+bool IsPackedWithoutStrides(const Dims<N>& dims) {
+ int expected_stride = 1;
+ for (int d = 0; d < N; d++) {
+ if (dims.strides[d] != expected_stride) return false;
+ expected_stride *= dims.sizes[d];
+ }
+ return true;
+}
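+// Illustrative example (not part of the library): a packed tensor with
+// sizes {2, 3, 4, 5} would carry strides {1, 2, 6, 24}. Then
+//   Offset(dims, 1, 2, 3, 4) == 1*1 + 2*2 + 3*6 + 4*24 == 119,
+//   RequiredBufferSizeForDims(dims) == 119 + 1 == 120 == 2*3*4*5,
+// and IsPackedWithoutStrides(dims) returns true.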
+
+} // namespace tflite
+
+#endif // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_TYPES_H_
diff --git a/tensorflow/contrib/lite/kernels/kernel_util.cc b/tensorflow/contrib/lite/kernels/kernel_util.cc
new file mode 100644
index 0000000000..b0546c00cf
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/kernel_util.cc
@@ -0,0 +1,87 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include <algorithm>
+#include <cmath>
+#include "tensorflow/contrib/lite/kernels/internal/round.h"
+
+namespace tflite {
+
+TfLiteStatus GetQuantizedConvolutionMultipler(
+ TfLiteContext* context, TfLiteTensor* input, TfLiteTensor* filter,
+ TfLiteTensor* bias, TfLiteTensor* output, double* multiplier) {
+ const double input_product_scale = input->params.scale * filter->params.scale;
+ const double bias_scale = bias->params.scale;
+ const double output_scale = output->params.scale;
+
+ // TODO(ahentz): The following conditions must be guaranteed by the training
+ // pipeline.
+ TF_LITE_ENSURE(context, std::abs(input_product_scale - bias_scale) <=
+ 1e-6 * std::min(input_product_scale, bias_scale));
+ TF_LITE_ENSURE(context, input_product_scale >= 0);
+ TF_LITE_ENSURE(context, input_product_scale < output_scale);
+
+ *multiplier = input_product_scale / output_scale;
+
+ return kTfLiteOk;
+}
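+// Illustrative numbers (not tied to any real model): with input scale 0.5,
+// filter scale 0.25 and output scale 1.0, input_product_scale is 0.125, the
+// bias scale must match it to within the 1e-6 relative tolerance above, and
+// the resulting multiplier is 0.125 / 1.0 = 0.125.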
+
+void CalculateActivationRangeUint8(TfLiteFusedActivation activation,
+ TfLiteTensor* output, int32_t* act_min,
+ int32_t* act_max) {
+ const int32_t qmin = std::numeric_limits<uint8_t>::min();
+ const int32_t qmax = std::numeric_limits<uint8_t>::max();
+
+ const auto scale = output->params.scale;
+ const auto zero_point = output->params.zero_point;
+
+ auto quantize = [scale, zero_point](float f) {
+ return zero_point + static_cast<int32_t>(TfLiteRound(f / scale));
+ };
+
+ if (activation == kTfLiteActRelu) {
+ *act_min = std::max(qmin, quantize(0.0));
+ *act_max = qmax;
+ } else if (activation == kTfLiteActRelu6) {
+ *act_min = std::max(qmin, quantize(0.0));
+ *act_max = std::min(qmax, quantize(6.0));
+ } else if (activation == kTfLiteActRelu1) {
+ *act_min = std::max(qmin, quantize(-1.0));
+ *act_max = std::min(qmax, quantize(1.0));
+ } else {
+ *act_min = qmin;
+ *act_max = qmax;
+ }
+}
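+// Illustrative numbers: for a uint8 output with scale 0.1 and zero_point 0,
+// kTfLiteActRelu6 gives act_min = max(0, quantize(0.0)) = 0 and
+// act_max = min(255, quantize(6.0)) = min(255, 60) = 60; with kTfLiteActNone
+// the full [0, 255] range is kept.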
+
+void CalculateActivationRangeFloat(TfLiteFusedActivation activation,
+ float* activation_min,
+ float* activation_max) {
+ if (activation == kTfLiteActRelu) {
+ *activation_min = 0.f;
+ *activation_max = std::numeric_limits<float>::max();
+ } else if (activation == kTfLiteActRelu6) {
+ *activation_min = 0.f;
+ *activation_max = 6.f;
+ } else if (activation == kTfLiteActRelu1) {
+ *activation_min = -1.f;
+ *activation_max = 1.f;
+ } else {
+ *activation_min = std::numeric_limits<float>::lowest();
+ *activation_max = std::numeric_limits<float>::max();
+ }
+}
+
+} // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/kernel_util.h b/tensorflow/contrib/lite/kernels/kernel_util.h
new file mode 100644
index 0000000000..25556ae456
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/kernel_util.h
@@ -0,0 +1,65 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_KERNEL_UTIL_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_KERNEL_UTIL_H_
+
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/context.h"
+
+namespace tflite {
+
+inline int NumDimensions(const TfLiteTensor* t) { return t->dims->size; }
+inline int SizeOfDimension(const TfLiteTensor* t, int dim) {
+ return t->dims->data[dim];
+}
+inline TfLiteTensor* GetInput(TfLiteContext* context, TfLiteNode* node,
+ int index) {
+ return &context->tensors[node->inputs->data[index]];
+}
+inline TfLiteTensor* GetOutput(TfLiteContext* context, TfLiteNode* node,
+ int index) {
+ return &context->tensors[node->outputs->data[index]];
+}
+inline int NumInputs(const TfLiteNode* node) { return node->inputs->size; }
+inline int NumOutputs(const TfLiteNode* node) { return node->outputs->size; }
+
+inline TfLiteTensor* GetOptionalInputTensor(TfLiteContext* context,
+ const TfLiteNode* node, int index) {
+ const bool use_tensor = node->inputs->data[index] != kOptionalTensor;
+ if (use_tensor) {
+ return &context->tensors[node->inputs->data[index]];
+ }
+ return nullptr;
+}
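+// A sketch of how a hypothetical kernel's Prepare function might use these
+// helpers (illustration only; the kernel and its checks are made up):
+//
+//   TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+//     TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
+//     TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+//     TfLiteTensor* input = GetInput(context, node, 0);
+//     TfLiteTensor* output = GetOutput(context, node, 0);
+//     TF_LITE_ENSURE_EQ(context, input->type, output->type);
+//     TF_LITE_ENSURE_EQ(context, NumDimensions(input), 4);
+//     return kTfLiteOk;
+//   }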
+
+// Calculates the multiplication factor for a quantized convolution (or
+// quantized depthwise convolution) involving the given tensors. Returns an
+// error if the scales of the tensors are not compatible.
+TfLiteStatus GetQuantizedConvolutionMultipler(
+ TfLiteContext* context, TfLiteTensor* input, TfLiteTensor* filter,
+ TfLiteTensor* bias, TfLiteTensor* output, double* multiplier);
+
+// Calculates the useful range of an activation layer given its activation
+// tensor.
+void CalculateActivationRangeUint8(TfLiteFusedActivation activation,
+ TfLiteTensor* output, int32_t* act_min,
+ int32_t* act_max);
+void CalculateActivationRangeFloat(TfLiteFusedActivation activation,
+ float* activation_min,
+ float* activation_max);
+
+} // namespace tflite
+
+#endif // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_KERNEL_UTIL_H_
diff --git a/tensorflow/contrib/lite/kernels/l2norm.cc b/tensorflow/contrib/lite/kernels/l2norm.cc
new file mode 100644
index 0000000000..f43aa372b6
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/l2norm.cc
@@ -0,0 +1,112 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace l2norm {
+
+// This file has two implementations of L2Norm.
+enum KernelType {
+ kReference,
+ kGenericOptimized,
+};
+
+constexpr int kInputTensor = 0;
+constexpr int kOutputTensor = 0;
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+ auto* params = reinterpret_cast<TfLiteL2NormParams*>(node->builtin_data);
+
+ TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
+ TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+ TfLiteTensor* input = GetInput(context, node, kInputTensor);
+ TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+ // TODO(ahentz): Our current implementations rely on the inputs being 4D.
+ TF_LITE_ENSURE_EQ(context, NumDimensions(input), 4);
+
+ // TODO(ahentz): Our current implementations only support float32.
+ TF_LITE_ENSURE_EQ(context, output->type, kTfLiteFloat32);
+ TF_LITE_ENSURE_EQ(context, input->type, output->type);
+
+ // TODO(ahentz): For some reason our implementations don't support
+ // activations.
+ TF_LITE_ENSURE_EQ(context, params->activation, kTfLiteActNone);
+
+ TfLiteIntArray* output_size = TfLiteIntArrayCreate(4);
+ output_size->data[0] = input->dims->data[0];
+ output_size->data[1] = input->dims->data[1];
+ output_size->data[2] = input->dims->data[2];
+ output_size->data[3] = input->dims->data[3];
+
+ return context->ResizeTensor(context, output, output_size);
+}
+
+template <KernelType kernel_type>
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+ TfLiteTensor* input = GetInput(context, node, kInputTensor);
+ TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+ if (output->type == kTfLiteFloat32) {
+#define TF_LITE_L2NORM(type) \
+ type::L2Normalization<FusedActivationFunctionType::kNone>( \
+ GetTensorData<float>(input), GetTensorDims(input), \
+ GetTensorData<float>(output), GetTensorDims(output))
+
+ if (kernel_type == kReference) {
+ TF_LITE_L2NORM(reference_ops);
+ }
+ if (kernel_type == kGenericOptimized) {
+ TF_LITE_L2NORM(optimized_ops);
+ }
+#undef TF_LITE_L2NORM
+ } else {
+ context->ReportError(context, "Inputs and outputs not all float types.");
+ return kTfLiteError;
+ }
+
+ return kTfLiteOk;
+}
+
+} // namespace l2norm
+
+TfLiteRegistration* Register_L2NORM_REF() {
+ static TfLiteRegistration r = {nullptr, nullptr, l2norm::Prepare,
+ l2norm::Eval<l2norm::kReference>};
+ return &r;
+}
+
+TfLiteRegistration* Register_L2NORM_GENERIC_OPT() {
+ static TfLiteRegistration r = {nullptr, nullptr, l2norm::Prepare,
+ l2norm::Eval<l2norm::kGenericOptimized>};
+ return &r;
+}
+
+TfLiteRegistration* Register_L2_NORMALIZATION() {
+ return Register_L2NORM_GENERIC_OPT();
+}
+
+} // namespace builtin
+} // namespace ops
+} // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/l2norm_test.cc b/tensorflow/contrib/lite/kernels/l2norm_test.cc
new file mode 100644
index 0000000000..b1db89b8bd
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/l2norm_test.cc
@@ -0,0 +1,63 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+class L2NormOpModel : public SingleOpModel {
+ public:
+ L2NormOpModel(std::initializer_list<int> input_shape,
+ ActivationFunctionType activation_type) {
+ input_ = AddInput(TensorType_FLOAT32);
+ output_ = AddOutput(TensorType_FLOAT32);
+ SetBuiltinOp(BuiltinOperator_L2_NORMALIZATION, BuiltinOptions_L2NormOptions,
+ CreateL2NormOptions(builder_, activation_type).Union());
+ BuildInterpreter({input_shape});
+ }
+
+ void SetInput(std::initializer_list<float> data) {
+ PopulateTensor(input_, data);
+ }
+
+ std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+
+ private:
+ int input_;
+ int output_;
+};
+
+TEST(L2NormOpTest, SimpleTest) {
+ L2NormOpModel m({1, 1, 1, 6}, ActivationFunctionType_NONE);
+ m.SetInput({-1.1, 0.6, 0.7, 1.2, -0.7, 0.1});
+ m.Invoke();
+ EXPECT_THAT(m.GetOutput(),
+ ElementsAreArray({-0.55, 0.3, 0.35, 0.6, -0.35, 0.05}));
+}
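+// The expected values follow from plain L2 normalization: the input's sum of
+// squares is 1.21 + 0.36 + 0.49 + 1.44 + 0.49 + 0.01 = 4.0, so the L2 norm is
+// 2.0 and every element is divided by 2.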
+
+} // namespace
+} // namespace tflite
+
+int main(int argc, char** argv) {
+ // On Linux, add: tflite::LogToStderr();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/local_response_norm.cc b/tensorflow/contrib/lite/kernels/local_response_norm.cc
new file mode 100644
index 0000000000..c1c70d0dfa
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/local_response_norm.cc
@@ -0,0 +1,109 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace local_response_norm {
+
+// This file has two implementations of LocalResponseNorm.
+enum KernelType {
+ kReference,
+ kGenericOptimized,
+};
+
+constexpr int kInputTensor = 0;
+constexpr int kOutputTensor = 0;
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+ TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
+ TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+ TfLiteTensor* input = GetInput(context, node, kInputTensor);
+ TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+ TF_LITE_ENSURE_EQ(context, NumDimensions(input), 4);
+
+ TF_LITE_ENSURE_EQ(context, output->type, kTfLiteFloat32);
+ TF_LITE_ENSURE_EQ(context, input->type, output->type);
+
+ TfLiteIntArray* output_size = TfLiteIntArrayCreate(4);
+ output_size->data[0] = input->dims->data[0];
+ output_size->data[1] = input->dims->data[1];
+ output_size->data[2] = input->dims->data[2];
+ output_size->data[3] = input->dims->data[3];
+
+ return context->ResizeTensor(context, output, output_size);
+}
+
+template <KernelType kernel_type>
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+ auto* params =
+ reinterpret_cast<TfLiteLocalResponseNormParams*>(node->builtin_data);
+
+ TfLiteTensor* input = GetInput(context, node, kInputTensor);
+ TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+ if (output->type == kTfLiteFloat32) {
+#define TF_LITE_LOCAL_RESPONSE_NORM(type) \
+ type::LocalResponseNormalization( \
+ GetTensorData<float>(input), GetTensorDims(input), params->radius, \
+ params->bias, params->alpha, params->beta, GetTensorData<float>(output), \
+ GetTensorDims(output))
+ if (kernel_type == kReference) {
+ TF_LITE_LOCAL_RESPONSE_NORM(reference_ops);
+ }
+ if (kernel_type == kGenericOptimized) {
+ TF_LITE_LOCAL_RESPONSE_NORM(optimized_ops);
+ }
+#undef TF_LITE_LOCAL_RESPONSE_NORM
+ } else {
+ context->ReportError(context, "Inputs and outputs not all float types.");
+ return kTfLiteError;
+ }
+
+ return kTfLiteOk;
+}
+
+} // namespace local_response_norm
+
+TfLiteRegistration* Register_LOCAL_RESPONSE_NORM_REF() {
+ static TfLiteRegistration r = {
+ nullptr, nullptr, local_response_norm::Prepare,
+ local_response_norm::Eval<local_response_norm::kReference>};
+ return &r;
+}
+
+TfLiteRegistration* Register_LOCAL_RESPONSE_NORM_GENERIC_OPT() {
+ static TfLiteRegistration r = {
+ nullptr, nullptr, local_response_norm::Prepare,
+ local_response_norm::Eval<local_response_norm::kGenericOptimized>};
+ return &r;
+}
+
+TfLiteRegistration* Register_LOCAL_RESPONSE_NORMALIZATION() {
+ return Register_LOCAL_RESPONSE_NORM_GENERIC_OPT();
+}
+
+} // namespace builtin
+} // namespace ops
+} // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/local_response_norm_test.cc b/tensorflow/contrib/lite/kernels/local_response_norm_test.cc
new file mode 100644
index 0000000000..63a8b0a3d0
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/local_response_norm_test.cc
@@ -0,0 +1,101 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+class LocalResponseNormOpModel : public SingleOpModel {
+ public:
+ LocalResponseNormOpModel(std::initializer_list<int> input_shape, int radius,
+ float bias, float alpha, float beta) {
+ input_ = AddInput(TensorType_FLOAT32);
+ output_ = AddOutput(TensorType_FLOAT32);
+ SetBuiltinOp(BuiltinOperator_LOCAL_RESPONSE_NORMALIZATION,
+ BuiltinOptions_LocalResponseNormalizationOptions,
+ CreateLocalResponseNormalizationOptions(builder_, radius, bias,
+ alpha, beta)
+ .Union());
+ BuildInterpreter({input_shape});
+ }
+
+ void SetInput(std::initializer_list<float> data) {
+ PopulateTensor(input_, data);
+ }
+
+ std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+
+ private:
+ int input_;
+ int output_;
+};
+
+TEST(LocalResponseNormOpTest, SameAsL2Norm) {
+ LocalResponseNormOpModel m({1, 1, 1, 6}, /*radius=*/20, /*bias=*/0.0,
+ /*alpha=*/1.0, /*beta=*/0.5);
+ m.SetInput({-1.1, 0.6, 0.7, 1.2, -0.7, 0.1});
+ m.Invoke();
+ // The result is every input divided by 2.
+ EXPECT_THAT(
+ m.GetOutput(),
+ ElementsAreArray(ArrayFloatNear({-0.55, 0.3, 0.35, 0.6, -0.35, 0.05})));
+}
+
+TEST(LocalResponseNormOpTest, WithAlpha) {
+ LocalResponseNormOpModel m({1, 1, 1, 6}, /*radius=*/20, /*bias=*/0.0,
+ /*alpha=*/4.0, /*beta=*/0.5);
+ m.SetInput({-1.1, 0.6, 0.7, 1.2, -0.7, 0.1});
+ m.Invoke();
+  // The result is every input divided by 4.
+ EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear(
+ {-0.275, 0.15, 0.175, 0.3, -0.175, 0.025})));
+}
+
+TEST(LocalResponseNormOpTest, WithBias) {
+ LocalResponseNormOpModel m({1, 1, 1, 6}, /*radius=*/20, /*bias=*/9.0,
+ /*alpha=*/4.0, /*beta=*/0.5);
+ m.SetInput({-1.1, 0.6, 0.7, 1.2, -0.7, 0.1});
+ m.Invoke();
+ // The result is every input divided by 5.
+ EXPECT_THAT(
+ m.GetOutput(),
+ ElementsAreArray(ArrayFloatNear({-0.22, 0.12, 0.14, 0.24, -0.14, 0.02})));
+}
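+// A note on the denominators in the tests above, assuming the usual LRN
+// formula out = in / (bias + alpha * sum_of_squares)^beta with the radius-20
+// window covering all six elements: the input's sum of squares is 4.0, so
+// SameAsL2Norm divides by (0 + 1*4)^0.5 = 2, WithAlpha by (0 + 4*4)^0.5 = 4,
+// and WithBias by (9 + 4*4)^0.5 = 5.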
+
+TEST(LocalResponseNormOpTest, SmallRadius) {
+ LocalResponseNormOpModel m({1, 1, 1, 6}, /*radius=*/2, /*bias=*/9.0,
+ /*alpha=*/4.0, /*beta=*/0.5);
+ m.SetInput({-1.1, 0.6, 0.7, 1.2, -0.7, 0.1});
+ m.Invoke();
+ EXPECT_THAT(
+ m.GetOutput(),
+ ElementsAreArray(ArrayFloatNear(
+ {-0.264926, 0.125109, 0.140112, 0.267261, -0.161788, 0.0244266})));
+}
+
+} // namespace
+} // namespace tflite
+
+int main(int argc, char** argv) {
+ // On Linux, add: tflite::LogToStderr();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/lsh_projection.cc b/tensorflow/contrib/lite/kernels/lsh_projection.cc
new file mode 100644
index 0000000000..5f73b56ed9
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/lsh_projection.cc
@@ -0,0 +1,204 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// LSH Projection projects an input to a bit vector via locality sensitive
+// hashing.
+//
+// Options:
+// Sparse:
+// Computed bit vector is considered to be sparse.
+//     Each output element is an int32 made up of multiple bits computed from
+//     hash functions.
+//
+// Dense:
+//     Computed bit vector is considered to be dense. Each output element is
+//     either 0 or 1, representing a single bit.
+//
+// Input:
+// Tensor[0]: Hash functions. Dim.size == 2, DataType: Float.
+// Tensor[0].Dim[0]: Num of hash functions.
+// Tensor[0].Dim[1]: Num of projected output bits generated by
+// each hash function.
+// In the sparse case, Tensor[0].Dim[1] + ceil(log2(Tensor[0].Dim[0])) <= 32.
+//
+// Tensor[1]: Input. Dim.size >= 1, No restriction on DataType.
+// Tensor[2]: Optional, Weight. Dim.size == 1, DataType: Float.
+//     If not set, each element of the input is considered to have the same
+//     weight of 1.0. Tensor[1].Dim[0] == Tensor[2].Dim[0].
+//
+// Output:
+// Sparse:
+// Output.Dim == { Tensor[0].Dim[0] }
+//     A tensor of int32 that represents hash signatures.
+//
+// NOTE: To avoid collisions across hash functions, an offset value of
+//       k * (1 << Tensor[0].Dim[1]) will be added to each signature, where
+//       k is the index of the hash function.
+// Dense:
+// Output.Dim == { Tensor[0].Dim[0] * Tensor[0].Dim[1] }
+//     A flattened tensor that represents the projected bit vectors.
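+//
+// Worked example (illustrative only): with a hash tensor of shape {3, 2}
+// (3 hash functions, 2 bits each), the sparse output has 3 elements, each a
+// 2-bit signature in [0, 4) plus the k * (1 << 2) offset for hash function k,
+// while the dense output is a flat vector of 3 * 2 = 6 individual bits.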
+
+#include <unistd.h>
+#include <cassert>
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include <limits>
+#include <memory>
+
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
+#include <farmhash.h>
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace lsh_projection {
+
+TfLiteStatus Resize(TfLiteContext* context, TfLiteNode* node) {
+ auto* params =
+ reinterpret_cast<TfLiteLSHProjectionParams*>(node->builtin_data);
+ TF_LITE_ENSURE(context, NumInputs(node) == 2 || NumInputs(node) == 3);
+ TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+ TfLiteTensor* hash = GetInput(context, node, 0);
+ TF_LITE_ENSURE_EQ(context, NumDimensions(hash), 2);
+ // Support up to 32 bits.
+ TF_LITE_ENSURE(context, SizeOfDimension(hash, 1) <= 32);
+
+ TfLiteTensor* input = GetInput(context, node, 1);
+ TF_LITE_ENSURE(context, NumDimensions(input) >= 1);
+
+ if (NumInputs(node) == 3) {
+ TfLiteTensor* weight = GetInput(context, node, 2);
+ TF_LITE_ENSURE_EQ(context, NumDimensions(weight), 1);
+ TF_LITE_ENSURE_EQ(context, SizeOfDimension(weight, 0),
+ SizeOfDimension(input, 0));
+ }
+
+ TfLiteTensor* output = GetOutput(context, node, 0);
+ TfLiteIntArray* outputSize = TfLiteIntArrayCreate(1);
+ switch (params->type) {
+ case kTfLiteLshProjectionSparse:
+ outputSize->data[0] = SizeOfDimension(hash, 0);
+ break;
+ case kTfLiteLshProjectionDense:
+ outputSize->data[0] = SizeOfDimension(hash, 0) * SizeOfDimension(hash, 1);
+ break;
+ default:
+ return kTfLiteError;
+ }
+ return context->ResizeTensor(context, output, outputSize);
+}
+
+// Compute the sign bit of the dot product of hash(seed, input) and weight.
+// NOTE: a float is used as the seed and converted to double as a temporary
+// solution to match the trained model. This will be changed once the new
+// model is trained with an optimized method.
+//
+int RunningSignBit(const TfLiteTensor* input, const TfLiteTensor* weight,
+ float seed) {
+ double score = 0.0;
+ int input_item_bytes = input->bytes / SizeOfDimension(input, 0);
+ char* input_ptr = input->data.raw;
+
+ const size_t seed_size = sizeof(float);
+ const size_t key_bytes = sizeof(float) + input_item_bytes;
+ std::unique_ptr<char[]> key(new char[key_bytes]);
+
+ for (int i = 0; i < SizeOfDimension(input, 0); ++i) {
+ // Create running hash id and value for current dimension.
+ memcpy(key.get(), &seed, seed_size);
+ memcpy(key.get() + seed_size, input_ptr, input_item_bytes);
+
+ int64_t hash_signature = ::util::Fingerprint64(key.get(), key_bytes);
+ double running_value = static_cast<double>(hash_signature);
+ input_ptr += input_item_bytes;
+ if (weight == nullptr) {
+ score += running_value;
+ } else {
+ score += weight->data.f[i] * running_value;
+ }
+ }
+
+ return (score > 0) ? 1 : 0;
+}
+
+void SparseLshProjection(const TfLiteTensor* hash, const TfLiteTensor* input,
+ const TfLiteTensor* weight, int32_t* out_buf) {
+ int num_hash = SizeOfDimension(hash, 0);
+ int num_bits = SizeOfDimension(hash, 1);
+ for (int i = 0; i < num_hash; i++) {
+ int32_t hash_signature = 0;
+ for (int j = 0; j < num_bits; j++) {
+ float seed = hash->data.f[i * num_bits + j];
+ int bit = RunningSignBit(input, weight, seed);
+ hash_signature = (hash_signature << 1) | bit;
+ }
+ *out_buf++ = hash_signature + i * (1 << num_bits);
+ }
+}
+
+void DenseLshProjection(const TfLiteTensor* hash, const TfLiteTensor* input,
+ const TfLiteTensor* weight, int32_t* out_buf) {
+ int num_hash = SizeOfDimension(hash, 0);
+ int num_bits = SizeOfDimension(hash, 1);
+ for (int i = 0; i < num_hash; i++) {
+ for (int j = 0; j < num_bits; j++) {
+ float seed = hash->data.f[i * num_bits + j];
+ int bit = RunningSignBit(input, weight, seed);
+ *out_buf++ = bit;
+ }
+ }
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+ auto* params =
+ reinterpret_cast<TfLiteLSHProjectionParams*>(node->builtin_data);
+
+ int32_t* out_buf = GetOutput(context, node, 0)->data.i32;
+ TfLiteTensor* hash = GetInput(context, node, 0);
+ TfLiteTensor* input = GetInput(context, node, 1);
+ TfLiteTensor* weight =
+ NumInputs(node) == 2 ? nullptr : GetInput(context, node, 2);
+
+ switch (params->type) {
+ case kTfLiteLshProjectionDense:
+ DenseLshProjection(hash, input, weight, out_buf);
+ break;
+ case kTfLiteLshProjectionSparse:
+ SparseLshProjection(hash, input, weight, out_buf);
+ break;
+ default:
+ return kTfLiteError;
+ }
+
+ return kTfLiteOk;
+}
+} // namespace lsh_projection
+
+TfLiteRegistration* Register_LSH_PROJECTION() {
+ static TfLiteRegistration r = {nullptr, nullptr, lsh_projection::Resize,
+ lsh_projection::Eval};
+ return &r;
+}
+
+} // namespace builtin
+} // namespace ops
+} // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/lsh_projection_test.cc b/tensorflow/contrib/lite/kernels/lsh_projection_test.cc
new file mode 100644
index 0000000000..1011927848
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/lsh_projection_test.cc
@@ -0,0 +1,123 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <vector>
+
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAre;
+
+class LSHProjectionOpModel : public SingleOpModel {
+ public:
+ LSHProjectionOpModel(LSHProjectionType type,
+ std::initializer_list<int> hash_shape,
+ std::initializer_list<int> input_shape,
+ std::initializer_list<int> weight_shape) {
+ hash_ = AddInput(TensorType_FLOAT32);
+ input_ = AddInput(TensorType_INT32);
+ if (weight_shape.size() > 0) {
+ weight_ = AddInput(TensorType_FLOAT32);
+ }
+ output_ = AddOutput(TensorType_INT32);
+
+ SetBuiltinOp(BuiltinOperator_LSH_PROJECTION,
+ BuiltinOptions_LSHProjectionOptions,
+ CreateLSHProjectionOptions(builder_, type).Union());
+ if (weight_shape.size() > 0) {
+ BuildInterpreter({hash_shape, input_shape, weight_shape});
+ } else {
+ BuildInterpreter({hash_shape, input_shape});
+ }
+
+ output_size_ = 1;
+ for (int i : hash_shape) {
+ output_size_ *= i;
+ if (type == LSHProjectionType_SPARSE) {
+ break;
+ }
+ }
+ }
+ void SetInput(std::initializer_list<int> data) {
+ PopulateTensor(input_, data);
+ }
+
+ void SetHash(std::initializer_list<float> data) {
+ PopulateTensor(hash_, data);
+ }
+
+ void SetWeight(std::initializer_list<float> f) { PopulateTensor(weight_, f); }
+
+ std::vector<int> GetOutput() { return ExtractVector<int>(output_); }
+
+ private:
+ int input_;
+ int hash_;
+ int weight_;
+ int output_;
+
+ int output_size_;
+};
+
+TEST(LSHProjectionOpTest2, Dense1DInputs) {
+ LSHProjectionOpModel m(LSHProjectionType_DENSE, {3, 2}, {5}, {5});
+
+ m.SetInput({12345, 54321, 67890, 9876, -12345678});
+ m.SetHash({0.123, 0.456, -0.321, 1.234, 5.678, -4.321});
+ m.SetWeight({1.0, 1.0, 1.0, 1.0, 1.0});
+
+ m.Invoke();
+
+ EXPECT_THAT(m.GetOutput(), ElementsAre(0, 0, 0, 1, 0, 0));
+}
+
+TEST(LSHProjectionOpTest2, Sparse1DInputs) {
+ LSHProjectionOpModel m(LSHProjectionType_SPARSE, {3, 2}, {5}, {});
+
+ m.SetInput({12345, 54321, 67890, 9876, -12345678});
+ m.SetHash({0.123, 0.456, -0.321, 1.234, 5.678, -4.321});
+
+ m.Invoke();
+
+ EXPECT_THAT(m.GetOutput(), ElementsAre(0 + 0, 4 + 1, 8 + 0));
+}
+
+TEST(LSHProjectionOpTest2, Sparse3DInputs) {
+ LSHProjectionOpModel m(LSHProjectionType_SPARSE, {3, 2}, {5, 2, 2}, {5});
+
+ m.SetInput({1234, 2345, 3456, 1234, 4567, 5678, 6789, 4567, 7891, 8912,
+ 9123, 7890, -987, -876, -765, -987, -543, -432, -321, -543});
+ m.SetHash({0.123, 0.456, -0.321, 1.234, 5.678, -4.321});
+ m.SetWeight({0.12, 0.34, 0.56, 0.67, 0.78});
+
+ m.Invoke();
+
+ EXPECT_THAT(m.GetOutput(), ElementsAre(0 + 2, 4 + 1, 8 + 1));
+}
+
+} // namespace
+} // namespace tflite
+
+int main(int argc, char** argv) {
+ // On Linux, add: tflite::LogToStderr();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/lstm.cc b/tensorflow/contrib/lite/kernels/lstm.cc
new file mode 100644
index 0000000000..6c06264d84
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/lstm.cc
@@ -0,0 +1,515 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <unistd.h>
+#include <cassert>
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <iostream>
+#include <limits>
+
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/activation_functor.h"
+#include "tensorflow/contrib/lite/kernels/internal/tensor_utils.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace lstm {
+
+// Input Tensors of size {n_batch, n_input}
+constexpr int kInputTensor = 0;
+
+// Input weight tensors of size: {n_cell, n_input}
+constexpr int kInputToInputWeightsTensor = 1; // Optional
+constexpr int kInputToForgetWeightsTensor = 2;
+constexpr int kInputToCellWeightsTensor = 3;
+constexpr int kInputToOutputWeightsTensor = 4;
+
+// Recurrent weight tensors of size {n_cell, n_output}
+constexpr int kRecurrentToInputWeightsTensor = 5; // Optional
+constexpr int kRecurrentToForgetWeightsTensor = 6;
+constexpr int kRecurrentToCellWeightsTensor = 7;
+constexpr int kRecurrentToOutputWeightsTensor = 8;
+
+// Peephole weights tensors of size {n_cell}, representing a diagonal matrix.
+constexpr int kCellToInputWeightsTensor = 9; // Optional
+constexpr int kCellToForgetWeightsTensor = 10; // Optional
+constexpr int kCellToOutputWeightsTensor = 11; // Optional
+
+// Gates bias tensors of size {n_cell}
+constexpr int kInputGateBiasTensor = 12; // Optional
+constexpr int kForgetGateBiasTensor = 13;
+constexpr int kCellGateBiasTensor = 14;
+constexpr int kOutputGateBiasTensor = 15;
+
+// Projection weight tensor of size {n_output, n_cell}
+constexpr int kProjectionWeightsTensor = 16; // Optional
+// Projection bias tensor of size {n_output}
+constexpr int kProjectionBiasTensor = 17; // Optional
+
+// Output tensors.
+constexpr int kScratchBufferTensor = 0;
+constexpr int kOutputStateTensor = 1;
+constexpr int kCellStateTensor = 2;
+constexpr int kOutputTensor = 3;
+
+// Check that the input tensor dimensions match each other.
+TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context,
+ TfLiteNode* node, int n_input,
+ int n_output, int n_cell) {
+ auto* params = reinterpret_cast<TfLiteLSTMParams*>(node->builtin_data);
+
+ // Making sure clipping parameters have valid values.
+ // == 0 means no clipping
+ // > 0 means clipping
+ TF_LITE_ENSURE(context, params->cell_clip >= 0);
+ TF_LITE_ENSURE(context, params->proj_clip >= 0);
+
+ TfLiteTensor* input_to_input_weights =
+ GetOptionalInputTensor(context, node, kInputToInputWeightsTensor);
+ if (input_to_input_weights) {
+ TF_LITE_ENSURE_EQ(context, input_to_input_weights->dims->size, 2);
+ TF_LITE_ENSURE_EQ(context, input_to_input_weights->dims->data[0], n_cell);
+ TF_LITE_ENSURE_EQ(context, input_to_input_weights->dims->data[1], n_input);
+ }
+
+ TfLiteTensor* input_to_forget_weights =
+ GetInput(context, node, kInputToForgetWeightsTensor);
+ TF_LITE_ENSURE_EQ(context, input_to_forget_weights->dims->size, 2);
+ TF_LITE_ENSURE_EQ(context, input_to_forget_weights->dims->data[0], n_cell);
+ TF_LITE_ENSURE_EQ(context, input_to_forget_weights->dims->data[1], n_input);
+
+ TfLiteTensor* input_to_cell_weights =
+ GetInput(context, node, kInputToCellWeightsTensor);
+ TF_LITE_ENSURE_EQ(context, input_to_cell_weights->dims->size, 2);
+ TF_LITE_ENSURE_EQ(context, input_to_cell_weights->dims->data[0], n_cell);
+ TF_LITE_ENSURE_EQ(context, input_to_cell_weights->dims->data[1], n_input);
+
+ TfLiteTensor* recurrent_to_input_weights =
+ GetOptionalInputTensor(context, node, kRecurrentToInputWeightsTensor);
+ if (recurrent_to_input_weights) {
+ TF_LITE_ENSURE_EQ(context, recurrent_to_input_weights->dims->size, 2);
+ TF_LITE_ENSURE_EQ(context, recurrent_to_input_weights->dims->data[0],
+ n_cell);
+ TF_LITE_ENSURE_EQ(context, recurrent_to_input_weights->dims->data[1],
+ n_output);
+ }
+
+ TfLiteTensor* recurrent_to_forget_weights =
+ GetInput(context, node, kRecurrentToForgetWeightsTensor);
+ TF_LITE_ENSURE_EQ(context, recurrent_to_forget_weights->dims->size, 2);
+ TF_LITE_ENSURE_EQ(context, recurrent_to_forget_weights->dims->data[0],
+ n_cell);
+ TF_LITE_ENSURE_EQ(context, recurrent_to_forget_weights->dims->data[1],
+ n_output);
+
+ TfLiteTensor* recurrent_to_cell_weights =
+ GetInput(context, node, kRecurrentToCellWeightsTensor);
+ TF_LITE_ENSURE_EQ(context, recurrent_to_cell_weights->dims->size, 2);
+ TF_LITE_ENSURE_EQ(context, recurrent_to_cell_weights->dims->data[0], n_cell);
+ TF_LITE_ENSURE_EQ(context, recurrent_to_cell_weights->dims->data[1],
+ n_output);
+
+  // We make sure the input gate's parameters are either both present (regular
+  // LSTM) or both absent (CIFG-LSTM).
+ const bool cifg_weights_all_or_none =
+ ((input_to_input_weights != nullptr) &&
+ (recurrent_to_input_weights != nullptr)) ||
+ ((input_to_input_weights == nullptr) &&
+ (recurrent_to_input_weights == nullptr));
+ TF_LITE_ENSURE(context, cifg_weights_all_or_none == true);
+
+ TfLiteTensor* cell_to_input_weights =
+ GetOptionalInputTensor(context, node, kCellToInputWeightsTensor);
+ if (cell_to_input_weights) {
+ TF_LITE_ENSURE_EQ(context, cell_to_input_weights->dims->size, 1);
+ TF_LITE_ENSURE_EQ(context, cell_to_input_weights->dims->data[0], n_cell);
+ }
+
+ TfLiteTensor* cell_to_forget_weights =
+ GetOptionalInputTensor(context, node, kCellToForgetWeightsTensor);
+ if (cell_to_forget_weights) {
+ TF_LITE_ENSURE_EQ(context, cell_to_forget_weights->dims->size, 1);
+ TF_LITE_ENSURE_EQ(context, cell_to_forget_weights->dims->data[0], n_cell);
+ }
+
+ TfLiteTensor* cell_to_output_weights =
+ GetOptionalInputTensor(context, node, kCellToOutputWeightsTensor);
+ if (cell_to_output_weights) {
+ TF_LITE_ENSURE_EQ(context, cell_to_output_weights->dims->size, 1);
+ TF_LITE_ENSURE_EQ(context, cell_to_output_weights->dims->data[0], n_cell);
+ }
+
+  // Making sure the peephole weights are either all present or all absent.
+ const bool use_cifg = (input_to_input_weights == nullptr);
+ const bool peephole_weights_all_or_none =
+ ((cell_to_input_weights != nullptr || use_cifg) &&
+ (cell_to_forget_weights != nullptr) &&
+ (cell_to_output_weights != nullptr)) ||
+ ((cell_to_input_weights == nullptr) &&
+ (cell_to_forget_weights == nullptr) &&
+ (cell_to_output_weights == nullptr));
+ TF_LITE_ENSURE(context, peephole_weights_all_or_none == true);
+
+ // Make sure the input gate bias is present only when not a CIFG-LSTM.
+ TfLiteTensor* input_gate_bias =
+ GetOptionalInputTensor(context, node, kInputGateBiasTensor);
+ if (use_cifg) {
+ TF_LITE_ENSURE_EQ(context, input_gate_bias, nullptr);
+ } else {
+ TF_LITE_ENSURE_EQ(context, input_gate_bias->dims->size, 1);
+ TF_LITE_ENSURE_EQ(context, input_gate_bias->dims->data[0], n_cell);
+ }
+
+ TfLiteTensor* forget_gate_bias =
+ GetInput(context, node, kForgetGateBiasTensor);
+ TF_LITE_ENSURE_EQ(context, forget_gate_bias->dims->size, 1);
+ TF_LITE_ENSURE_EQ(context, forget_gate_bias->dims->data[0], n_cell);
+
+ TfLiteTensor* cell_bias = GetInput(context, node, kCellGateBiasTensor);
+ TF_LITE_ENSURE_EQ(context, cell_bias->dims->size, 1);
+ TF_LITE_ENSURE_EQ(context, cell_bias->dims->data[0], n_cell);
+
+ TfLiteTensor* output_gate_bias =
+ GetInput(context, node, kOutputGateBiasTensor);
+ TF_LITE_ENSURE_EQ(context, output_gate_bias->dims->size, 1);
+ TF_LITE_ENSURE_EQ(context, output_gate_bias->dims->data[0], n_cell);
+
+ TfLiteTensor* projection_weights =
+ GetOptionalInputTensor(context, node, kProjectionWeightsTensor);
+ if (projection_weights) {
+ TF_LITE_ENSURE_EQ(context, projection_weights->dims->size, 2);
+ TF_LITE_ENSURE_EQ(context, projection_weights->dims->data[0], n_output);
+ TF_LITE_ENSURE_EQ(context, projection_weights->dims->data[1], n_cell);
+ }
+
+ TfLiteTensor* projection_bias =
+ GetOptionalInputTensor(context, node, kProjectionBiasTensor);
+ if (projection_bias) {
+ TF_LITE_ENSURE_EQ(context, projection_bias->dims->size, 1);
+ TF_LITE_ENSURE_EQ(context, projection_bias->dims->data[0], n_output);
+ }
+
+ // Making sure the projection tensors are consistent:
+ // 1) If projection weight is not present, then projection bias should not be
+ // present.
+ // 2) If projection weight is present, then projection bias is optional.
+ // TODO(ghodrat): make sure this is correct.
+ const bool projecton_tensors_consistent =
+ ((projection_weights != nullptr) || (projection_bias == nullptr));
+ TF_LITE_ENSURE(context, projecton_tensors_consistent == true);
+
+ return kTfLiteOk;
+}
+
+// Resize the output, state and scratch tensors based on the sizes of the input
+// tensors. Also check that the sizes of the input tensors match each other.
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+ // Check we have all the inputs and outputs we need.
+ TF_LITE_ENSURE_EQ(context, node->inputs->size, 18);
+ TF_LITE_ENSURE_EQ(context, node->outputs->size, 4);
+
+ // Inferring batch size, number of outputs and number of cells from the
+ // input tensors.
+ TfLiteTensor* input = GetInput(context, node, kInputTensor);
+ TF_LITE_ENSURE(context, input->dims->size > 1);
+ const int n_batch = input->dims->data[0];
+ const int n_input = input->dims->data[1];
+
+ TfLiteTensor* input_to_output_weights =
+ GetInput(context, node, kInputToOutputWeightsTensor);
+ const int n_cell = input_to_output_weights->dims->data[0];
+ TF_LITE_ENSURE_EQ(context, input_to_output_weights->dims->size, 2);
+ TF_LITE_ENSURE_EQ(context, input_to_output_weights->dims->data[1], n_input);
+
+ TfLiteTensor* recurrent_to_output_weights =
+ GetInput(context, node, kRecurrentToOutputWeightsTensor);
+ TF_LITE_ENSURE_EQ(context, recurrent_to_output_weights->dims->size, 2);
+ TF_LITE_ENSURE_EQ(context, recurrent_to_output_weights->dims->data[0],
+ n_cell);
+ const int n_output = recurrent_to_output_weights->dims->data[1];
+
+  // Check that the input tensor dimensions match each other.
+ CheckInputTensorDimensions(context, node, n_input, n_output, n_cell);
+
+ // Get the pointer to output, state and scratch buffer tensors.
+ TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+ TfLiteTensor* output_state = GetOutput(context, node, kOutputStateTensor);
+ TfLiteTensor* cell_state = GetOutput(context, node, kCellStateTensor);
+ // TODO(ghodrat): Modify this as soon as we have a finalized method for
+ // scratch buffers.
+ TfLiteTensor* scratch_buffer = GetOutput(context, node, kScratchBufferTensor);
+
+ // Resize the output and output_state tensors.
+ TfLiteIntArray* output_size = TfLiteIntArrayCreate(2);
+ output_size->data[0] = n_batch;
+ output_size->data[1] = n_output;
+ TF_LITE_ENSURE_OK(context,
+ context->ResizeTensor(context, output, output_size));
+
+ TfLiteIntArray* output_state_size = TfLiteIntArrayCreate(2);
+ output_state_size->data[0] = n_batch;
+ output_state_size->data[1] = n_output;
+ TF_LITE_ENSURE_OK(
+ context, context->ResizeTensor(context, output_state, output_state_size));
+
+  // Resize the cell state tensor.
+ TfLiteIntArray* cell_size = TfLiteIntArrayCreate(2);
+ cell_size->data[0] = n_batch;
+ cell_size->data[1] = n_cell;
+ TF_LITE_ENSURE_OK(context,
+ context->ResizeTensor(context, cell_state, cell_size));
+
+ // Mark state tensors as persistent tensors.
+ output_state->allocation_type = kTfLiteArenaRwPersistent;
+ cell_state->allocation_type = kTfLiteArenaRwPersistent;
+
+ TfLiteTensor* input_to_input_weights =
+ GetOptionalInputTensor(context, node, kInputToInputWeightsTensor);
+ const bool use_cifg = (input_to_input_weights == nullptr);
+ if (use_cifg) {
+ TfLiteIntArray* scratch_buffer_size = TfLiteIntArrayCreate(2);
+ scratch_buffer_size->data[0] = n_batch;
+ // Reserving space for Cell, Forget, Output gates
+ scratch_buffer_size->data[1] = n_cell * 3;
+ TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, scratch_buffer,
+ scratch_buffer_size));
+ } else {
+ TfLiteIntArray* scratch_buffer_size = TfLiteIntArrayCreate(2);
+ scratch_buffer_size->data[0] = n_batch;
+ // Reserving space for Input, Cell, Forget, Output gates
+ scratch_buffer_size->data[1] = n_cell * 4;
+ TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, scratch_buffer,
+ scratch_buffer_size));
+ }
+ return kTfLiteOk;
+}
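+// Scratch sizing example (illustrative numbers): with n_batch = 1 and
+// n_cell = 4, a CIFG LSTM gets a {1, 12} scratch buffer (cell, forget and
+// output gates) while a non-CIFG LSTM gets {1, 16} (input gate included).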
+
+// The LSTM Op engine.
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+ auto* params = reinterpret_cast<TfLiteLSTMParams*>(node->builtin_data);
+ TfLiteTensor* input = GetInput(context, node, kInputTensor);
+
+ TfLiteTensor* input_to_input_weights =
+ GetOptionalInputTensor(context, node, kInputToInputWeightsTensor);
+ TfLiteTensor* input_to_forget_weights =
+ GetInput(context, node, kInputToForgetWeightsTensor);
+ TfLiteTensor* input_to_cell_weights =
+ GetInput(context, node, kInputToCellWeightsTensor);
+ TfLiteTensor* input_to_output_weights =
+ GetInput(context, node, kInputToOutputWeightsTensor);
+
+ TfLiteTensor* recurrent_to_input_weights =
+ GetOptionalInputTensor(context, node, kRecurrentToInputWeightsTensor);
+ TfLiteTensor* recurrent_to_forget_weights =
+ GetInput(context, node, kRecurrentToForgetWeightsTensor);
+ TfLiteTensor* recurrent_to_cell_weights =
+ GetInput(context, node, kRecurrentToCellWeightsTensor);
+ TfLiteTensor* recurrent_to_output_weights =
+ GetInput(context, node, kRecurrentToOutputWeightsTensor);
+
+ TfLiteTensor* cell_to_input_weights =
+ GetOptionalInputTensor(context, node, kCellToInputWeightsTensor);
+ TfLiteTensor* cell_to_forget_weights =
+ GetOptionalInputTensor(context, node, kCellToForgetWeightsTensor);
+ TfLiteTensor* cell_to_output_weights =
+ GetOptionalInputTensor(context, node, kCellToOutputWeightsTensor);
+
+ TfLiteTensor* input_gate_bias =
+ GetOptionalInputTensor(context, node, kInputGateBiasTensor);
+ TfLiteTensor* forget_gate_bias =
+ GetInput(context, node, kForgetGateBiasTensor);
+ TfLiteTensor* cell_bias = GetInput(context, node, kCellGateBiasTensor);
+ TfLiteTensor* output_gate_bias =
+ GetInput(context, node, kOutputGateBiasTensor);
+
+ TfLiteTensor* projection_weights =
+ GetOptionalInputTensor(context, node, kProjectionWeightsTensor);
+ TfLiteTensor* projection_bias =
+ GetOptionalInputTensor(context, node, kProjectionBiasTensor);
+
+ TfLiteTensor* output_state = GetOutput(context, node, kOutputStateTensor);
+ TfLiteTensor* cell_state = GetOutput(context, node, kCellStateTensor);
+ TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+ const int n_batch = input->dims->data[0];
+ const int n_input = input->dims->data[1];
+ // n_cell and n_output will be the same size when there is no projection.
+ const int n_cell = input_to_output_weights->dims->data[0];
+ const int n_output = recurrent_to_output_weights->dims->data[1];
+
+  // Since we have already checked that the weights are either all present or
+  // all absent, we can check the existence of only one to get the condition.
+ const bool use_cifg = (input_to_input_weights == nullptr);
+ const bool use_peephole = (cell_to_output_weights != nullptr);
+
+  // Index the scratch buffer pointers into the global scratch buffer.
+ TfLiteTensor* scratch_buffer = GetOutput(context, node, kScratchBufferTensor);
+ float* input_gate_scratch = nullptr;
+ float* cell_scratch = nullptr;
+ float* forget_gate_scratch = nullptr;
+ float* output_gate_scratch = nullptr;
+ if (use_cifg) {
+ cell_scratch = scratch_buffer->data.f;
+ forget_gate_scratch = scratch_buffer->data.f + n_cell * n_batch;
+ output_gate_scratch = scratch_buffer->data.f + 2 * n_cell * n_batch;
+ } else {
+ input_gate_scratch = scratch_buffer->data.f;
+ cell_scratch = scratch_buffer->data.f + n_cell * n_batch;
+ forget_gate_scratch = scratch_buffer->data.f + 2 * n_cell * n_batch;
+ output_gate_scratch = scratch_buffer->data.f + 3 * n_cell * n_batch;
+ }
+
+ // Initialize scratch buffers with bias.
+ if (!use_cifg) {
+ tensor_utils::VectorBatchVectorAssign(input_gate_bias->data.f, n_cell,
+ n_batch, input_gate_scratch);
+ }
+ tensor_utils::VectorBatchVectorAssign(forget_gate_bias->data.f, n_cell,
+ n_batch, forget_gate_scratch);
+ tensor_utils::VectorBatchVectorAssign(cell_bias->data.f, n_cell, n_batch,
+ cell_scratch);
+ tensor_utils::VectorBatchVectorAssign(output_gate_bias->data.f, n_cell,
+ n_batch, output_gate_scratch);
+
+ // For each batch and cell: compute input_weight * input.
+ if (!use_cifg) {
+ tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+ input_to_input_weights->data.f, n_cell, n_input, input->data.f, n_batch,
+ input_gate_scratch, /*result_stride=*/1);
+ }
+ tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+ input_to_forget_weights->data.f, n_cell, n_input, input->data.f, n_batch,
+ forget_gate_scratch, /*result_stride=*/1);
+ tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+ input_to_cell_weights->data.f, n_cell, n_input, input->data.f, n_batch,
+ cell_scratch, /*result_stride=*/1);
+ tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+ input_to_output_weights->data.f, n_cell, n_input, input->data.f, n_batch,
+ output_gate_scratch, /*result_stride=*/1);
+
+ // For each batch and cell: compute recurrent_weight * output_state.
+ if (!use_cifg) {
+ tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+ recurrent_to_input_weights->data.f, n_cell, n_output,
+ output_state->data.f, n_batch, input_gate_scratch, /*result_stride=*/1);
+ }
+ tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+ recurrent_to_forget_weights->data.f, n_cell, n_output,
+ output_state->data.f, n_batch, forget_gate_scratch, /*result_stride=*/1);
+ tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+ recurrent_to_cell_weights->data.f, n_cell, n_output, output_state->data.f,
+ n_batch, cell_scratch, /*result_stride=*/1);
+ tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+ recurrent_to_output_weights->data.f, n_cell, n_output,
+ output_state->data.f, n_batch, output_gate_scratch, /*result_stride=*/1);
+
+ // For each batch and cell: update input gate.
+ if (!use_cifg) {
+ if (use_peephole) {
+ tensor_utils::VectorBatchVectorCwiseProductAccumulate(
+ cell_to_input_weights->data.f, n_cell, cell_state->data.f, n_batch,
+ input_gate_scratch);
+ }
+ tensor_utils::ApplySigmoidToVector(input_gate_scratch, n_cell * n_batch,
+ input_gate_scratch);
+ }
+
+ // For each batch and cell: update forget gate.
+ if (use_peephole) {
+ tensor_utils::VectorBatchVectorCwiseProductAccumulate(
+ cell_to_forget_weights->data.f, n_cell, cell_state->data.f, n_batch,
+ forget_gate_scratch);
+ }
+ tensor_utils::ApplySigmoidToVector(forget_gate_scratch, n_cell * n_batch,
+ forget_gate_scratch);
+
+ // For each batch and cell: update the cell.
+ tensor_utils::VectorVectorCwiseProduct(forget_gate_scratch,
+ cell_state->data.f, n_batch * n_cell,
+ cell_state->data.f);
+ tensor_utils::ApplyActivationToVector(cell_scratch, n_batch * n_cell,
+ params->activation, cell_scratch);
+ if (use_cifg) {
+ tensor_utils::Sub1Vector(forget_gate_scratch, n_batch * n_cell,
+ forget_gate_scratch);
+ tensor_utils::VectorVectorCwiseProductAccumulate(
+ cell_scratch, forget_gate_scratch, n_batch * n_cell,
+ cell_state->data.f);
+ } else {
+ tensor_utils::VectorVectorCwiseProductAccumulate(
+ cell_scratch, input_gate_scratch, n_batch * n_cell, cell_state->data.f);
+ }
+ if (params->cell_clip > 0.0) {
+ tensor_utils::ClipVector(cell_state->data.f, n_batch * n_cell,
+ params->cell_clip, cell_state->data.f);
+ }
+
+ // For each batch and cell: update the output gate.
+ if (use_peephole) {
+ tensor_utils::VectorBatchVectorCwiseProductAccumulate(
+ cell_to_output_weights->data.f, n_cell, cell_state->data.f, n_batch,
+ output_gate_scratch);
+ }
+ tensor_utils::ApplySigmoidToVector(output_gate_scratch, n_batch * n_cell,
+ output_gate_scratch);
+ tensor_utils::ApplyActivationToVector(cell_state->data.f, n_batch * n_cell,
+ params->activation, cell_scratch);
+ tensor_utils::VectorVectorCwiseProduct(output_gate_scratch, cell_scratch,
+ n_batch * n_cell, output_gate_scratch);
+
+ // For each batch: update the projection and output_state.
+ const bool use_projection_weight = (projection_weights != nullptr);
+ const bool use_projection_bias = (projection_bias != nullptr);
+ if (use_projection_weight) {
+ if (use_projection_bias) {
+ tensor_utils::VectorBatchVectorAssign(projection_bias->data.f, n_output,
+ n_batch, output->data.f);
+ } else {
+ tensor_utils::ZeroVector(output->data.f, n_batch * n_output);
+ }
+ tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+ projection_weights->data.f, n_output, n_cell, output_gate_scratch,
+ n_batch, output->data.f, /*result_stride=*/1);
+ if (params->proj_clip > 0.0) {
+ tensor_utils::ClipVector(output->data.f, n_batch * n_output,
+ params->proj_clip, output->data.f);
+ }
+ } else {
+ tensor_utils::CopyVector(output_gate_scratch, n_batch * n_output,
+ output->data.f);
+ }
+ tensor_utils::CopyVector(output->data.f, n_batch * n_output,
+ output_state->data.f);
+
+ return kTfLiteOk;
+}
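+// For reference, ignoring the CIFG, peephole and projection variants handled
+// above, the computation corresponds to the standard LSTM cell equations:
+//   i_t = sigmoid(W_i x_t + R_i h_{t-1} + b_i)
+//   f_t = sigmoid(W_f x_t + R_f h_{t-1} + b_f)
+//   g_t = act(W_c x_t + R_c h_{t-1} + b_c)
+//   o_t = sigmoid(W_o x_t + R_o h_{t-1} + b_o)
+//   c_t = f_t . c_{t-1} + i_t . g_t
+//   h_t = o_t . act(c_t)
+// where act is params->activation and "." denotes elementwise multiplication.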
+
+} // namespace lstm
+
+TfLiteRegistration* Register_LSTM() {
+ static TfLiteRegistration r = {/*init=*/nullptr, /*free=*/nullptr,
+ lstm::Prepare, lstm::Eval};
+ return &r;
+}
+
+} // namespace builtin
+} // namespace ops
+} // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/lstm_test.cc b/tensorflow/contrib/lite/kernels/lstm_test.cc
new file mode 100644
index 0000000000..be4c7ddbf8
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/lstm_test.cc
@@ -0,0 +1,1088 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// Unit test for TFLite LSTM op.
+
+#include <iomanip>
+#include <memory>
+#include <vector>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+class LSTMOpModel : public SingleOpModel {
+ public:
+ LSTMOpModel(int n_batch, int n_input, int n_cell, int n_output, bool use_cifg,
+ bool use_peephole, bool use_projection_weights,
+ bool use_projection_bias, float cell_clip, float proj_clip,
+ const std::vector<std::vector<int>>& input_shapes)
+ : n_batch_(n_batch),
+ n_input_(n_input),
+ n_cell_(n_cell),
+ n_output_(n_output) {
+ input_ = AddInput(TensorType_FLOAT32);
+
+ if (use_cifg) {
+ input_to_input_weights_ = AddNullInput();
+ } else {
+ input_to_input_weights_ = AddInput(TensorType_FLOAT32);
+ }
+
+ input_to_forget_weights_ = AddInput(TensorType_FLOAT32);
+ input_to_cell_weights_ = AddInput(TensorType_FLOAT32);
+ input_to_output_weights_ = AddInput(TensorType_FLOAT32);
+
+ if (use_cifg) {
+ recurrent_to_input_weights_ = AddNullInput();
+ } else {
+ recurrent_to_input_weights_ = AddInput(TensorType_FLOAT32);
+ }
+
+ recurrent_to_forget_weights_ = AddInput(TensorType_FLOAT32);
+ recurrent_to_cell_weights_ = AddInput(TensorType_FLOAT32);
+ recurrent_to_output_weights_ = AddInput(TensorType_FLOAT32);
+
+ if (use_peephole) {
+ if (use_cifg) {
+ cell_to_input_weights_ = AddNullInput();
+ } else {
+ cell_to_input_weights_ = AddInput(TensorType_FLOAT32);
+ }
+ cell_to_forget_weights_ = AddInput(TensorType_FLOAT32);
+ cell_to_output_weights_ = AddInput(TensorType_FLOAT32);
+ } else {
+ cell_to_input_weights_ = AddNullInput();
+ cell_to_forget_weights_ = AddNullInput();
+ cell_to_output_weights_ = AddNullInput();
+ }
+
+ if (use_cifg) {
+ input_gate_bias_ = AddNullInput();
+ } else {
+ input_gate_bias_ = AddInput(TensorType_FLOAT32);
+ }
+ forget_gate_bias_ = AddInput(TensorType_FLOAT32);
+ cell_bias_ = AddInput(TensorType_FLOAT32);
+ output_gate_bias_ = AddInput(TensorType_FLOAT32);
+
+ if (use_projection_weights) {
+ projection_weights_ = AddInput(TensorType_FLOAT32);
+ if (use_projection_bias) {
+ projection_bias_ = AddInput(TensorType_FLOAT32);
+ } else {
+ projection_bias_ = AddNullInput();
+ }
+ } else {
+ projection_weights_ = AddNullInput();
+ projection_bias_ = AddNullInput();
+ }
+
+ scratch_buffer_ = AddOutput(TensorType_FLOAT32);
+ // TODO(ghodrat): Modify these states when we have a permanent solution for
+ // persistent buffer.
+ output_state_ = AddOutput(TensorType_FLOAT32);
+ cell_state_ = AddOutput(TensorType_FLOAT32);
+ output_ = AddOutput(TensorType_FLOAT32);
+
+ SetBuiltinOp(BuiltinOperator_LSTM, BuiltinOptions_LSTMOptions,
+ CreateLSTMOptions(builder_, ActivationFunctionType_TANH,
+ cell_clip, proj_clip)
+ .Union());
+ BuildInterpreter(input_shapes);
+ }
+
+ void SetInputToInputWeights(std::initializer_list<float> f) {
+ PopulateTensor(input_to_input_weights_, f);
+ }
+
+ void SetInputToForgetWeights(std::initializer_list<float> f) {
+ PopulateTensor(input_to_forget_weights_, f);
+ }
+
+ void SetInputToCellWeights(std::initializer_list<float> f) {
+ PopulateTensor(input_to_cell_weights_, f);
+ }
+
+ void SetInputToOutputWeights(std::initializer_list<float> f) {
+ PopulateTensor(input_to_output_weights_, f);
+ }
+
+ void SetRecurrentToInputWeights(std::initializer_list<float> f) {
+ PopulateTensor(recurrent_to_input_weights_, f);
+ }
+
+ void SetRecurrentToForgetWeights(std::initializer_list<float> f) {
+ PopulateTensor(recurrent_to_forget_weights_, f);
+ }
+
+ void SetRecurrentToCellWeights(std::initializer_list<float> f) {
+ PopulateTensor(recurrent_to_cell_weights_, f);
+ }
+
+ void SetRecurrentToOutputWeights(std::initializer_list<float> f) {
+ PopulateTensor(recurrent_to_output_weights_, f);
+ }
+
+ void SetCellToInputWeights(std::initializer_list<float> f) {
+ PopulateTensor(cell_to_input_weights_, f);
+ }
+
+ void SetCellToForgetWeights(std::initializer_list<float> f) {
+ PopulateTensor(cell_to_forget_weights_, f);
+ }
+
+ void SetCellToOutputWeights(std::initializer_list<float> f) {
+ PopulateTensor(cell_to_output_weights_, f);
+ }
+
+ void SetInputGateBias(std::initializer_list<float> f) {
+ PopulateTensor(input_gate_bias_, f);
+ }
+
+ void SetForgetGateBias(std::initializer_list<float> f) {
+ PopulateTensor(forget_gate_bias_, f);
+ }
+
+ void SetCellBias(std::initializer_list<float> f) {
+ PopulateTensor(cell_bias_, f);
+ }
+
+ void SetOutputGateBias(std::initializer_list<float> f) {
+ PopulateTensor(output_gate_bias_, f);
+ }
+
+ void SetProjectionWeights(std::initializer_list<float> f) {
+ PopulateTensor(projection_weights_, f);
+ }
+
+ void SetProjectionBias(std::initializer_list<float> f) {
+ PopulateTensor(projection_bias_, f);
+ }
+
+ void ResetOutputState() {
+ const int zero_buffer_size = n_cell_ * n_batch_;
+ std::unique_ptr<float[]> zero_buffer(new float[zero_buffer_size]);
+ memset(zero_buffer.get(), 0, zero_buffer_size * sizeof(float));
+ PopulateTensor(output_state_, 0, zero_buffer.get(),
+ zero_buffer.get() + zero_buffer_size);
+ }
+
+ void ResetCellState() {
+ const int zero_buffer_size = n_cell_ * n_batch_;
+ std::unique_ptr<float[]> zero_buffer(new float[zero_buffer_size]);
+ memset(zero_buffer.get(), 0, zero_buffer_size * sizeof(float));
+ PopulateTensor(cell_state_, 0, zero_buffer.get(),
+ zero_buffer.get() + zero_buffer_size);
+ }
+
+ void SetInput(int offset, float* begin, float* end) {
+ PopulateTensor(input_, offset, begin, end);
+ }
+
+ std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+
+ int num_inputs() { return n_input_; }
+ int num_outputs() { return n_output_; }
+ int num_cells() { return n_cell_; }
+ int num_batches() { return n_batch_; }
+
+ private:
+ int input_;
+ int input_to_input_weights_;
+ int input_to_forget_weights_;
+ int input_to_cell_weights_;
+ int input_to_output_weights_;
+
+ int recurrent_to_input_weights_;
+ int recurrent_to_forget_weights_;
+ int recurrent_to_cell_weights_;
+ int recurrent_to_output_weights_;
+
+ int cell_to_input_weights_;
+ int cell_to_forget_weights_;
+ int cell_to_output_weights_;
+
+ int input_gate_bias_;
+ int forget_gate_bias_;
+ int cell_bias_;
+ int output_gate_bias_;
+
+ int projection_weights_;
+ int projection_bias_;
+
+ int output_;
+ int output_state_;
+ int cell_state_;
+ int scratch_buffer_;
+
+ int n_batch_;
+ int n_input_;
+ int n_cell_;
+ int n_output_;
+};
+
+TEST(LSTMOpTest, BlackBoxTestNoCifgNoPeepholeNoProjectionNoClipping) {
+ const int n_batch = 1;
+ const int n_input = 2;
+ // n_cell and n_output have the same size when there is no projection.
+ const int n_cell = 4;
+ const int n_output = 4;
+
+ LSTMOpModel lstm(n_batch, n_input, n_cell, n_output,
+ /*use_cifg=*/false, /*use_peephole=*/false,
+ /*use_projection_weights=*/false,
+ /*use_projection_bias=*/false,
+ /*cell_clip=*/0.0, /*proj_clip=*/0.0,
+ {
+ {n_batch, n_input}, // input tensor
+
+ {n_cell, n_input}, // input_to_input_weight tensor
+ {n_cell, n_input}, // input_to_forget_weight tensor
+ {n_cell, n_input}, // input_to_cell_weight tensor
+ {n_cell, n_input}, // input_to_output_weight tensor
+
+ {n_cell, n_output}, // recurrent_to_input_weight tensor
+ {n_cell, n_output}, // recurrent_to_forget_weight tensor
+ {n_cell, n_output}, // recurrent_to_cell_weight tensor
+ {n_cell, n_output}, // recurrent_to_output_weight tensor
+
+ {0}, // cell_to_input_weight tensor
+ {0}, // cell_to_forget_weight tensor
+ {0}, // cell_to_output_weight tensor
+
+ {n_cell}, // input_gate_bias tensor
+ {n_cell}, // forget_gate_bias tensor
+ {n_cell}, // cell_bias tensor
+ {n_cell}, // output_gate_bias tensor
+
+ {0, 0}, // projection_weight tensor
+ {0}, // projection_bias tensor
+ });
+
+ lstm.SetInputToInputWeights({-0.45018822, -0.02338299, -0.0870589,
+ -0.34550029, 0.04266912, -0.15680569,
+ -0.34856534, 0.43890524});
+
+ lstm.SetInputToCellWeights({-0.50013041, 0.1370284, 0.11810488, 0.2013163,
+ -0.20583314, 0.44344562, 0.22077113,
+ -0.29909778});
+
+ lstm.SetInputToForgetWeights({0.09701663, 0.20334584, -0.50592935,
+ -0.31343272, -0.40032279, 0.44781327,
+ 0.01387155, -0.35593212});
+
+ lstm.SetInputToOutputWeights({-0.25065863, -0.28290087, 0.04613829,
+ 0.40525138, 0.44272184, 0.03897077, -0.1556896,
+ 0.19487578});
+
+ lstm.SetInputGateBias({0., 0., 0., 0.});
+
+ lstm.SetCellBias({0., 0., 0., 0.});
+
+ lstm.SetForgetGateBias({1., 1., 1., 1.});
+
+ lstm.SetOutputGateBias({0., 0., 0., 0.});
+
+ lstm.SetRecurrentToInputWeights(
+ {-0.0063535, -0.2042388, 0.31454784, -0.35746509, 0.28902304, 0.08183324,
+ -0.16555229, 0.02286911, -0.13566875, 0.03034258, 0.48091322,
+ -0.12528998, 0.24077177, -0.51332325, -0.33502164, 0.10629296});
+
+ lstm.SetRecurrentToCellWeights(
+ {-0.3407414, 0.24443203, -0.2078532, 0.26320225, 0.05695659, -0.00123841,
+ -0.4744786, -0.35869038, -0.06418842, -0.13502428, -0.501764, 0.22830659,
+ -0.46367589, 0.26016325, -0.03894562, -0.16368064});
+
+ lstm.SetRecurrentToForgetWeights(
+ {-0.48684245, -0.06655136, 0.42224967, 0.2112639, 0.27654213, 0.20864892,
+ -0.07646349, 0.45877004, 0.00141793, -0.14609534, 0.36447752, 0.09196436,
+ 0.28053468, 0.01560611, -0.20127171, -0.01140004});
+
+ lstm.SetRecurrentToOutputWeights(
+ {0.43385774, -0.17194885, 0.2718237, 0.09215671, 0.24107647, -0.39835793,
+ 0.18212086, 0.01301402, 0.48572797, -0.50656658, 0.20047462, -0.20607421,
+ -0.51818722, -0.15390486, 0.0468148, 0.39922136});
+
+ static float lstm_input[] = {2., 3., 3., 4., 1., 1.};
+ static float lstm_golden_output[] = {-0.02973187, 0.1229473, 0.20885126,
+ -0.15358765, -0.03716109, 0.12507336,
+ 0.41193449, -0.20860538, -0.15053082,
+ 0.09120187, 0.24278517, -0.12222792};
+
+  // Reset cell_state and output_state to all zeros.

+ lstm.ResetCellState();
+ lstm.ResetOutputState();
+
+ const int input_sequence_size =
+ sizeof(lstm_input) / sizeof(float) / (lstm.num_inputs());
+ for (int i = 0; i < input_sequence_size; i++) {
+ float* batch0_start = lstm_input + i * lstm.num_inputs();
+ float* batch0_end = batch0_start + lstm.num_inputs();
+
+ lstm.SetInput(0, batch0_start, batch0_end);
+
+ lstm.Invoke();
+
+ float* golden_start = lstm_golden_output + i * lstm.num_outputs();
+ float* golden_end = golden_start + lstm.num_outputs();
+ std::vector<float> expected;
+ expected.insert(expected.end(), golden_start, golden_end);
+ EXPECT_THAT(lstm.GetOutput(), ElementsAreArray(ArrayFloatNear(expected)));
+ }
+}
+
+TEST(LSTMOpTest, BlackBoxTestWithCifgWithPeepholeNoProjectionNoClipping) {
+ const int n_batch = 1;
+ const int n_input = 2;
+ // n_cell and n_output have the same size when there is no projection.
+ const int n_cell = 4;
+ const int n_output = 4;
+
+ LSTMOpModel lstm(n_batch, n_input, n_cell, n_output,
+ /*use_cifg=*/true, /*use_peephole=*/true,
+ /*use_projection_weights=*/false,
+ /*use_projection_bias=*/false,
+ /*cell_clip=*/0.0, /*proj_clip=*/0.0,
+ {
+ {n_batch, n_input}, // input tensor
+
+ {0, 0}, // input_to_input_weight tensor
+ {n_cell, n_input}, // input_to_forget_weight tensor
+ {n_cell, n_input}, // input_to_cell_weight tensor
+ {n_cell, n_input}, // input_to_output_weight tensor
+
+ {0, 0}, // recurrent_to_input_weight tensor
+ {n_cell, n_output}, // recurrent_to_forget_weight tensor
+ {n_cell, n_output}, // recurrent_to_cell_weight tensor
+ {n_cell, n_output}, // recurrent_to_output_weight tensor
+
+ {0}, // cell_to_input_weight tensor
+ {n_cell}, // cell_to_forget_weight tensor
+ {n_cell}, // cell_to_output_weight tensor
+
+ {0}, // input_gate_bias tensor
+ {n_cell}, // forget_gate_bias tensor
+ {n_cell}, // cell_bias tensor
+ {n_cell}, // output_gate_bias tensor
+
+ {0, 0}, // projection_weight tensor
+ {0}, // projection_bias tensor
+ });
+
+ lstm.SetInputToCellWeights({-0.49770179, -0.27711356, -0.09624726, 0.05100781,
+ 0.04717243, 0.48944736, -0.38535351,
+ -0.17212132});
+
+ lstm.SetInputToForgetWeights({-0.55291498, -0.42866567, 0.13056988,
+ -0.3633365, -0.22755712, 0.28253698, 0.24407166,
+ 0.33826375});
+
+ lstm.SetInputToOutputWeights({0.10725588, -0.02335852, -0.55932593,
+ -0.09426838, -0.44257352, 0.54939759,
+ 0.01533556, 0.42751634});
+
+ lstm.SetCellBias({0., 0., 0., 0.});
+
+ lstm.SetForgetGateBias({1., 1., 1., 1.});
+
+ lstm.SetOutputGateBias({0., 0., 0., 0.});
+
+ lstm.SetRecurrentToCellWeights(
+ {0.54066205, -0.32668582, -0.43562764, -0.56094903, 0.42957711,
+ 0.01841056, -0.32764608, -0.33027974, -0.10826075, 0.20675004,
+ 0.19069612, -0.03026325, -0.54532051, 0.33003211, 0.44901288,
+ 0.21193194});
+
+ lstm.SetRecurrentToForgetWeights(
+ {-0.13832897, -0.0515101, -0.2359007, -0.16661474, -0.14340827,
+ 0.36986142, 0.23414481, 0.55899, 0.10798943, -0.41174671, 0.17751795,
+ -0.34484994, -0.35874045, -0.11352962, 0.27268326, 0.54058349});
+
+ lstm.SetRecurrentToOutputWeights(
+ {0.41613156, 0.42610586, -0.16495961, -0.5663873, 0.30579174, -0.05115908,
+ -0.33941799, 0.23364776, 0.11178309, 0.09481031, -0.26424935, 0.46261835,
+ 0.50248802, 0.26114327, -0.43736315, 0.33149987});
+
+ lstm.SetCellToForgetWeights(
+ {0.47485286, -0.51955009, -0.24458408, 0.31544167});
+ lstm.SetCellToOutputWeights(
+ {-0.17135078, 0.82760304, 0.85573703, -0.77109635});
+
+ static float lstm_input[] = {2., 3., 3., 4., 1., 1.};
+ static float lstm_golden_output[] = {-0.36444446, -0.00352185, 0.12886585,
+ -0.05163646, -0.42312205, -0.01218222,
+ 0.24201041, -0.08124574, -0.358325,
+ -0.04621704, 0.21641694, -0.06471302};
+
+  // Reset cell_state and output_state to all zeros.
+ lstm.ResetCellState();
+ lstm.ResetOutputState();
+
+ const int input_sequence_size =
+ sizeof(lstm_input) / sizeof(float) / (lstm.num_inputs());
+ for (int i = 0; i < input_sequence_size; i++) {
+ float* batch0_start = lstm_input + i * lstm.num_inputs();
+ float* batch0_end = batch0_start + lstm.num_inputs();
+
+ lstm.SetInput(0, batch0_start, batch0_end);
+
+ lstm.Invoke();
+
+ float* golden_start = lstm_golden_output + i * lstm.num_outputs();
+ float* golden_end = golden_start + lstm.num_outputs();
+ std::vector<float> expected;
+ expected.insert(expected.end(), golden_start, golden_end);
+ EXPECT_THAT(lstm.GetOutput(), ElementsAreArray(ArrayFloatNear(expected)));
+ }
+}
+
+TEST(LSTMOpTest, BlackBoxTestWithPeepholeWithProjectionNoClipping) {
+ const int n_batch = 2;
+ const int n_input = 5;
+ const int n_cell = 20;
+ const int n_output = 16;
+
+ LSTMOpModel lstm(n_batch, n_input, n_cell, n_output,
+ /*use_cifg=*/false, /*use_peephole=*/true,
+ /*use_projection_weights=*/true,
+ /*use_projection_bias=*/false,
+ /*cell_clip=*/0.0, /*proj_clip=*/0.0,
+ {
+ {n_batch, n_input}, // input tensor
+
+ {n_cell, n_input}, // input_to_input_weight tensor
+ {n_cell, n_input}, // input_to_forget_weight tensor
+ {n_cell, n_input}, // input_to_cell_weight tensor
+ {n_cell, n_input}, // input_to_output_weight tensor
+
+ {n_cell, n_output}, // recurrent_to_input_weight tensor
+ {n_cell, n_output}, // recurrent_to_forget_weight tensor
+ {n_cell, n_output}, // recurrent_to_cell_weight tensor
+ {n_cell, n_output}, // recurrent_to_output_weight tensor
+
+ {n_cell}, // cell_to_input_weight tensor
+ {n_cell}, // cell_to_forget_weight tensor
+ {n_cell}, // cell_to_output_weight tensor
+
+ {n_cell}, // input_gate_bias tensor
+ {n_cell}, // forget_gate_bias tensor
+ {n_cell}, // cell_bias tensor
+ {n_cell}, // output_gate_bias tensor
+
+ {n_output, n_cell}, // projection_weight tensor
+ {0}, // projection_bias tensor
+ });
+
+ lstm.SetInputToInputWeights(
+ {0.021393683, 0.06124551, 0.046905167, -0.014657677, -0.03149463,
+ 0.09171803, 0.14647801, 0.10797193, -0.0057968358, 0.0019193048,
+ -0.2726754, 0.10154029, -0.018539885, 0.080349885, -0.10262385,
+ -0.022599787, -0.09121155, -0.008675967, -0.045206103, -0.0821282,
+ -0.008045952, 0.015478081, 0.055217247, 0.038719587, 0.044153627,
+ -0.06453243, 0.05031825, -0.046935108, -0.008164439, 0.014574226,
+ -0.1671009, -0.15519552, -0.16819797, -0.13971269, -0.11953059,
+ 0.25005487, -0.22790983, 0.009855087, -0.028140958, -0.11200698,
+ 0.11295408, -0.0035217577, 0.054485075, 0.05184695, 0.064711206,
+ 0.10989193, 0.11674786, 0.03490607, 0.07727357, 0.11390585,
+ -0.1863375, -0.1034451, -0.13945189, -0.049401227, -0.18767063,
+ 0.042483903, 0.14233552, 0.13832581, 0.18350165, 0.14545603,
+ -0.028545704, 0.024939531, 0.050929718, 0.0076203286, -0.0029723682,
+ -0.042484224, -0.11827596, -0.09171104, -0.10808628, -0.16327988,
+ -0.2273378, -0.0993647, -0.017155107, 0.0023917493, 0.049272764,
+ 0.0038534778, 0.054764505, 0.089753784, 0.06947234, 0.08014476,
+ -0.04544234, -0.0497073, -0.07135631, -0.048929106, -0.004042012,
+ -0.009284026, 0.018042054, 0.0036860977, -0.07427302, -0.11434604,
+ -0.018995456, 0.031487543, 0.012834908, 0.019977754, 0.044256654,
+ -0.39292613, -0.18519334, -0.11651281, -0.06809892, 0.011373677});
+
+ lstm.SetInputToForgetWeights(
+ {-0.0018401089, -0.004852237, 0.03698424, 0.014181704, 0.028273236,
+ -0.016726194, -0.05249759, -0.10204261, 0.00861066, -0.040979505,
+ -0.009899187, 0.01923892, -0.028177269, -0.08535103, -0.14585495,
+ 0.10662567, -0.01909731, -0.017883534, -0.0047269356, -0.045103323,
+ 0.0030784295, 0.076784775, 0.07463696, 0.094531395, 0.0814421,
+ -0.12257899, -0.033945758, -0.031303465, 0.045630626, 0.06843887,
+ -0.13492945, -0.012480007, -0.0811829, -0.07224499, -0.09628791,
+ 0.045100946, 0.0012300825, 0.013964662, 0.099372394, 0.02543059,
+ 0.06958324, 0.034257296, 0.0482646, 0.06267997, 0.052625068,
+ 0.12784666, 0.07077897, 0.025725935, 0.04165009, 0.07241905,
+ 0.018668644, -0.037377294, -0.06277783, -0.08833636, -0.040120605,
+ -0.011405586, -0.007808335, -0.010301386, -0.005102167, 0.027717464,
+ 0.05483423, 0.11449111, 0.11289652, 0.10939839, 0.13396506,
+ -0.08402166, -0.01901462, -0.044678304, -0.07720565, 0.014350063,
+ -0.11757958, -0.0652038, -0.08185733, -0.076754324, -0.092614375,
+ 0.10405491, 0.052960336, 0.035755895, 0.035839386, -0.012540553,
+ 0.036881298, 0.02913376, 0.03420159, 0.05448447, -0.054523353,
+ 0.02582715, 0.02327355, -0.011857179, -0.0011980024, -0.034641717,
+ -0.026125094, -0.17582615, -0.15923657, -0.27486774, -0.0006143371,
+ 0.0001771948, -8.470171e-05, 0.02651807, 0.045790765, 0.06956496});
+
+ lstm.SetInputToCellWeights(
+ {-0.04580283, -0.09549462, -0.032418985, -0.06454633,
+ -0.043528453, 0.043018587, -0.049152344, -0.12418144,
+ -0.078985475, -0.07596889, 0.019484362, -0.11434962,
+ -0.0074034138, -0.06314844, -0.092981495, 0.0062155537,
+ -0.025034338, -0.0028890965, 0.048929527, 0.06235075,
+ 0.10665918, -0.032036792, -0.08505916, -0.10843358,
+ -0.13002433, -0.036816437, -0.02130134, -0.016518239,
+ 0.0047691227, -0.0025825808, 0.066017866, 0.029991534,
+ -0.10652836, -0.1037554, -0.13056071, -0.03266643,
+ -0.033702414, -0.006473424, -0.04611692, 0.014419339,
+ -0.025174323, 0.0396852, 0.081777506, 0.06157468,
+ 0.10210095, -0.009658194, 0.046511717, 0.03603906,
+ 0.0069369148, 0.015960095, -0.06507666, 0.09551598,
+ 0.053568836, 0.06408714, 0.12835667, -0.008714329,
+ -0.20211966, -0.12093674, 0.029450472, 0.2849013,
+ -0.029227901, 0.1164364, -0.08560263, 0.09941786,
+ -0.036999565, -0.028842626, -0.0033637602, -0.017012902,
+ -0.09720865, -0.11193351, -0.029155117, -0.017936034,
+ -0.009768936, -0.04223324, -0.036159635, 0.06505112,
+ -0.021742892, -0.023377212, -0.07221364, -0.06430552,
+ 0.05453865, 0.091149814, 0.06387331, 0.007518393,
+ 0.055960953, 0.069779344, 0.046411168, 0.10509911,
+ 0.07463894, 0.0075130584, 0.012850982, 0.04555431,
+ 0.056955688, 0.06555285, 0.050801456, -0.009862683,
+ 0.00826772, -0.026555609, -0.0073611983, -0.0014897042});
+
+ lstm.SetInputToOutputWeights(
+ {-0.0998932, -0.07201956, -0.052803773, -0.15629593, -0.15001918,
+ -0.07650751, 0.02359855, -0.075155355, -0.08037709, -0.15093534,
+ 0.029517552, -0.04751393, 0.010350531, -0.02664851, -0.016839722,
+ -0.023121163, 0.0077019283, 0.012851257, -0.05040649, -0.0129761,
+ -0.021737747, -0.038305793, -0.06870586, -0.01481247, -0.001285394,
+ 0.10124236, 0.083122835, 0.053313006, -0.062235646, -0.075637154,
+ -0.027833903, 0.029774971, 0.1130802, 0.09218906, 0.09506135,
+ -0.086665764, -0.037162706, -0.038880914, -0.035832845, -0.014481564,
+ -0.09825003, -0.12048569, -0.097665586, -0.05287633, -0.0964047,
+ -0.11366429, 0.035777505, 0.13568819, 0.052451383, 0.050649304,
+ 0.05798951, -0.021852335, -0.099848844, 0.014740475, -0.078897946,
+ 0.04974699, 0.014160473, 0.06973932, 0.04964942, 0.033364646,
+ 0.08190124, 0.025535367, 0.050893165, 0.048514254, 0.06945813,
+ -0.078907564, -0.06707616, -0.11844508, -0.09986688, -0.07509403,
+ 0.06263226, 0.14925587, 0.20188436, 0.12098451, 0.14639415,
+ 0.0015017595, -0.014267382, -0.03417257, 0.012711468, 0.0028300495,
+ -0.024758482, -0.05098548, -0.0821182, 0.014225672, 0.021544158,
+ 0.08949725, 0.07505268, -0.0020780868, 0.04908258, 0.06476295,
+ -0.022907063, 0.027562456, 0.040185735, 0.019567577, -0.015598739,
+ -0.049097303, -0.017121866, -0.083368234, -0.02332002, -0.0840956});
+
+ lstm.SetInputGateBias(
+ {0.02234832, 0.14757581, 0.18176508, 0.10380666, 0.053110216,
+ -0.06928846, -0.13942584, -0.11816189, 0.19483899, 0.03652339,
+ -0.10250295, 0.036714908, -0.18426876, 0.036065217, 0.21810818,
+ 0.02383196, -0.043370757, 0.08690144, -0.04444982, 0.00030581196});
+
+ lstm.SetForgetGateBias({0.035185695, -0.042891346, -0.03032477, 0.23027696,
+ 0.11098921, 0.15378423, 0.09263801, 0.09790885,
+ 0.09508917, 0.061199076, 0.07665568, -0.015443159,
+ -0.03499149, 0.046190713, 0.08895977, 0.10899629,
+ 0.40694186, 0.06030037, 0.012413437, -0.06108739});
+
+ lstm.SetCellBias({-0.024379363, 0.0055531194, 0.23377132, 0.033463873,
+ -0.1483596, -0.10639995, -0.091433935, 0.058573797,
+ -0.06809782, -0.07889636, -0.043246906, -0.09829136,
+ -0.4279842, 0.034901652, 0.18797937, 0.0075234566,
+ 0.016178843, 0.1749513, 0.13975595, 0.92058027});
+
+ lstm.SetOutputGateBias(
+ {0.046159424, -0.0012809046, 0.03563469, 0.12648113, 0.027195795,
+ 0.35373217, -0.018957434, 0.008907322, -0.0762701, 0.12018895,
+ 0.04216877, 0.0022856654, 0.040952638, 0.3147856, 0.08225149,
+ -0.057416286, -0.14995944, -0.008040261, 0.13208859, 0.029760877});
+
+ lstm.SetRecurrentToInputWeights(
+ {-0.001374326, -0.078856036, 0.10672688, 0.029162422,
+ -0.11585556, 0.02557986, -0.13446963, -0.035785314,
+ -0.01244275, 0.025961924, -0.02337298, -0.044228926,
+ -0.055839065, -0.046598054, -0.010546039, -0.06900766,
+ 0.027239809, 0.022582639, -0.013296484, -0.05459212,
+ 0.08981, -0.045407712, 0.08682226, -0.06867011,
+ -0.14390695, -0.02916037, 0.000996957, 0.091420636,
+ 0.14283475, -0.07390571, -0.06402044, 0.062524505,
+ -0.093129106, 0.04860203, -0.08364217, -0.08119002,
+ 0.009352075, 0.22920375, 0.0016303885, 0.11583097,
+ -0.13732095, 0.012405723, -0.07551853, 0.06343048,
+ 0.12162708, -0.031923793, -0.014335606, 0.01790974,
+ -0.10650317, -0.0724401, 0.08554849, -0.05727212,
+ 0.06556731, -0.042729504, -0.043227166, 0.011683251,
+ -0.013082158, -0.029302018, -0.010899579, -0.062036745,
+ -0.022509435, -0.00964907, -0.01567329, 0.04260106,
+ -0.07787477, -0.11576462, 0.017356863, 0.048673786,
+ -0.017577527, -0.05527947, -0.082487635, -0.040137455,
+ -0.10820036, -0.04666372, 0.022746278, -0.07851417,
+ 0.01068115, 0.032956902, 0.022433773, 0.0026891115,
+ 0.08944216, -0.0685835, 0.010513544, 0.07228705,
+ 0.02032331, -0.059686817, -0.0005566496, -0.086984694,
+ 0.040414046, -0.1380399, 0.094208956, -0.05722982,
+ 0.012092817, -0.04989123, -0.086576, -0.003399834,
+ -0.04696032, -0.045747425, 0.10091314, 0.048676282,
+ -0.029037097, 0.031399418, -0.0040285117, 0.047237843,
+ 0.09504992, 0.041799378, -0.049185462, -0.031518843,
+ -0.10516937, 0.026374253, 0.10058866, -0.0033195973,
+ -0.041975245, 0.0073591834, 0.0033782164, -0.004325073,
+ -0.10167381, 0.042500053, -0.01447153, 0.06464186,
+ -0.017142897, 0.03312627, 0.009205989, 0.024138335,
+ -0.011337001, 0.035530265, -0.010912711, 0.0706555,
+ -0.005894094, 0.051841937, -0.1401738, -0.02351249,
+ 0.0365468, 0.07590991, 0.08838724, 0.021681072,
+ -0.10086113, 0.019608743, -0.06195883, 0.077335775,
+ 0.023646897, -0.095322326, 0.02233014, 0.09756986,
+ -0.048691444, -0.009579111, 0.07595467, 0.11480546,
+ -0.09801813, 0.019894179, 0.08502348, 0.004032281,
+ 0.037211012, 0.068537936, -0.048005626, -0.091520436,
+ -0.028379958, -0.01556313, 0.06554592, -0.045599163,
+ -0.01672207, -0.020169014, -0.011877351, -0.20212261,
+ 0.010889619, 0.0047078193, 0.038385306, 0.08540671,
+ -0.017140968, -0.0035865551, 0.016678626, 0.005633034,
+ 0.015963363, 0.00871737, 0.060130805, 0.028611384,
+ 0.10109069, -0.015060172, -0.07894427, 0.06401885,
+ 0.011584063, -0.024466386, 0.0047652307, -0.09041358,
+ 0.030737216, -0.0046374933, 0.14215417, -0.11823516,
+ 0.019899689, 0.006106124, -0.027092824, 0.0786356,
+ 0.05052217, -0.058925, -0.011402121, -0.024987547,
+ -0.0013661642, -0.06832946, -0.015667673, -0.1083353,
+ -0.00096863037, -0.06988685, -0.053350925, -0.027275559,
+ -0.033664223, -0.07978348, -0.025200296, -0.017207067,
+ -0.058403496, -0.055697463, 0.005798788, 0.12965427,
+ -0.062582195, 0.0013350133, -0.10482091, 0.0379771,
+ 0.072521195, -0.0029455067, -0.13797039, -0.03628521,
+ 0.013806405, -0.017858358, -0.01008298, -0.07700066,
+ -0.017081132, 0.019358726, 0.0027079724, 0.004635139,
+ 0.062634714, -0.02338735, -0.039547626, -0.02050681,
+ 0.03385117, -0.083611414, 0.002862572, -0.09421313,
+ 0.058618143, -0.08598433, 0.00972939, 0.023867095,
+ -0.053934585, -0.023203006, 0.07452513, -0.048767887,
+ -0.07314807, -0.056307215, -0.10433547, -0.06440842,
+ 0.04328182, 0.04389765, -0.020006588, -0.09076438,
+ -0.11652589, -0.021705797, 0.03345259, -0.010329105,
+ -0.025767034, 0.013057034, -0.07316461, -0.10145612,
+ 0.06358255, 0.18531723, 0.07759293, 0.12006465,
+ 0.1305557, 0.058638252, -0.03393652, 0.09622831,
+ -0.16253184, -2.4580743e-06, 0.079869635, -0.070196845,
+ -0.005644518, 0.06857898, -0.12598175, -0.035084512,
+ 0.03156317, -0.12794146, -0.031963028, 0.04692781,
+ 0.030070418, 0.0071660685, -0.095516115, -0.004643372,
+ 0.040170413, -0.062104587, -0.0037324072, 0.0554317,
+ 0.08184801, -0.019164372, 0.06791302, 0.034257166,
+ -0.10307039, 0.021943003, 0.046745934, 0.0790918,
+ -0.0265588, -0.007824208, 0.042546265, -0.00977924,
+ -0.0002440307, -0.017384544, -0.017990116, 0.12252321,
+ -0.014512694, -0.08251313, 0.08861942, 0.13589665,
+ 0.026351685, 0.012641483, 0.07466548, 0.044301085,
+ -0.045414884, -0.051112458, 0.03444247, -0.08502782,
+ -0.04106223, -0.028126027, 0.028473156, 0.10467447});
+
+ lstm.SetRecurrentToForgetWeights(
+ {-0.057784554, -0.026057621, -0.068447545, -0.022581743,
+ 0.14811787, 0.10826372, 0.09471067, 0.03987225,
+ -0.0039523416, 0.00030638507, 0.053185795, 0.10572994,
+ 0.08414449, -0.022036452, -0.00066928595, -0.09203576,
+ 0.032950465, -0.10985798, -0.023809856, 0.0021431844,
+ -0.02196096, -0.00326074, 0.00058621005, -0.074678116,
+ -0.06193199, 0.055729095, 0.03736828, 0.020123724,
+ 0.061878487, -0.04729229, 0.034919553, -0.07585433,
+ -0.04421272, -0.044019096, 0.085488975, 0.04058006,
+ -0.06890133, -0.030951202, -0.024628663, -0.07672815,
+ 0.034293607, 0.08556707, -0.05293577, -0.033561368,
+ -0.04899627, 0.0241671, 0.015736353, -0.095442444,
+ -0.029564252, 0.016493602, -0.035026584, 0.022337519,
+ -0.026871363, 0.004780428, 0.0077918363, -0.03601621,
+ 0.016435321, -0.03263031, -0.09543275, -0.047392778,
+ 0.013454138, 0.028934088, 0.01685226, -0.086110644,
+ -0.046250615, -0.01847454, 0.047608484, 0.07339695,
+ 0.034546845, -0.04881143, 0.009128804, -0.08802852,
+ 0.03761666, 0.008096139, -0.014454086, 0.014361001,
+ -0.023502491, -0.0011840804, -0.07607001, 0.001856849,
+ -0.06509276, -0.006021153, -0.08570962, -0.1451793,
+ 0.060212336, 0.055259194, 0.06974018, 0.049454916,
+ -0.027794661, -0.08077226, -0.016179763, 0.1169753,
+ 0.17213494, -0.0056326236, -0.053934924, -0.0124349,
+ -0.11520337, 0.05409887, 0.088759385, 0.0019655675,
+ 0.0042065294, 0.03881498, 0.019844765, 0.041858196,
+ -0.05695512, 0.047233116, 0.038937137, -0.06542224,
+ 0.014429736, -0.09719407, 0.13908425, -0.05379757,
+ 0.012321099, 0.082840554, -0.029899208, 0.044217527,
+ 0.059855383, 0.07711018, -0.045319796, 0.0948846,
+ -0.011724666, -0.0033288454, -0.033542685, -0.04764985,
+ -0.13873616, 0.040668588, 0.034832682, -0.015319203,
+ -0.018715994, 0.046002675, 0.0599172, -0.043107376,
+ 0.0294216, -0.002314414, -0.022424703, 0.0030315618,
+ 0.0014641669, 0.0029166266, -0.11878115, 0.013738511,
+ 0.12375372, -0.0006038222, 0.029104086, 0.087442465,
+ 0.052958444, 0.07558703, 0.04817258, 0.044462286,
+ -0.015213451, -0.08783778, -0.0561384, -0.003008196,
+ 0.047060397, -0.002058388, 0.03429439, -0.018839769,
+ 0.024734668, 0.024614193, -0.042046934, 0.09597743,
+ -0.0043254104, 0.04320769, 0.0064070094, -0.0019131786,
+ -0.02558259, -0.022822596, -0.023273505, -0.02464396,
+ -0.10991725, -0.006240552, 0.0074488563, 0.024044557,
+ 0.04383914, -0.046476185, 0.028658995, 0.060410924,
+ 0.050786525, 0.009452605, -0.0073054377, -0.024810238,
+ 0.0052906186, 0.0066939713, -0.0020913032, 0.014515517,
+ 0.015898481, 0.021362653, -0.030262267, 0.016587038,
+ -0.011442813, 0.041154444, -0.007631438, -0.03423484,
+ -0.010977775, 0.036152758, 0.0066366293, 0.11915515,
+ 0.02318443, -0.041350313, 0.021485701, -0.10906167,
+ -0.028218046, -0.00954771, 0.020531068, -0.11995105,
+ -0.03672871, 0.024019798, 0.014255957, -0.05221243,
+ -0.00661567, -0.04630967, 0.033188973, 0.10107534,
+ -0.014027541, 0.030796422, -0.10270911, -0.035999842,
+ 0.15443139, 0.07684145, 0.036571592, -0.035900835,
+ -0.0034699554, 0.06209149, 0.015920248, -0.031122351,
+ -0.03858649, 0.01849943, 0.13872518, 0.01503974,
+ 0.069941424, -0.06948533, -0.0088794185, 0.061282158,
+ -0.047401894, 0.03100163, -0.041533746, -0.10430945,
+ 0.044574402, -0.01425562, -0.024290353, 0.034563623,
+ 0.05866852, 0.023947537, -0.09445152, 0.035450947,
+ 0.02247216, -0.0042998926, 0.061146557, -0.10250651,
+ 0.020881841, -0.06747029, 0.10062043, -0.0023941975,
+ 0.03532124, -0.016341697, 0.09685456, -0.016764693,
+ 0.051808182, 0.05875331, -0.04536488, 0.001626336,
+ -0.028892258, -0.01048663, -0.009793449, -0.017093895,
+ 0.010987891, 0.02357273, -0.00010856845, 0.0099760275,
+ -0.001845119, -0.03551521, 0.0018358806, 0.05763657,
+ -0.01769146, 0.040995963, 0.02235177, -0.060430344,
+ 0.11475477, -0.023854522, 0.10071741, 0.0686208,
+ -0.014250481, 0.034261297, 0.047418304, 0.08562733,
+ -0.030519066, 0.0060542435, 0.014653856, -0.038836084,
+ 0.04096551, 0.032249358, -0.08355519, -0.026823482,
+ 0.056386515, -0.010401743, -0.028396193, 0.08507674,
+ 0.014410365, 0.020995233, 0.17040324, 0.11511526,
+ 0.02459721, 0.0066619175, 0.025853224, -0.023133837,
+ -0.081302024, 0.017264642, -0.009585969, 0.09491168,
+ -0.051313367, 0.054532815, -0.014298593, 0.10657464,
+ 0.007076659, 0.10964551, 0.0409152, 0.008275321,
+ -0.07283536, 0.07937492, 0.04192024, -0.1075027});
+
+ lstm.SetRecurrentToCellWeights(
+ {-0.037322544, 0.018592842, 0.0056175636, -0.06253426,
+ 0.055647098, -0.05713207, -0.05626563, 0.005559383,
+ 0.03375411, -0.025757805, -0.088049285, 0.06017052,
+ -0.06570978, 0.007384076, 0.035123326, -0.07920549,
+ 0.053676967, 0.044480428, -0.07663568, 0.0071805613,
+ 0.08089997, 0.05143358, 0.038261272, 0.03339287,
+ -0.027673481, 0.044746667, 0.028349208, 0.020090483,
+ -0.019443132, -0.030755889, -0.0040000007, 0.04465846,
+ -0.021585021, 0.0031670958, 0.0053199246, -0.056117613,
+ -0.10893326, 0.076739706, -0.08509834, -0.027997585,
+ 0.037871376, 0.01449768, -0.09002357, -0.06111149,
+ -0.046195522, 0.0422062, -0.005683705, -0.1253618,
+ -0.012925729, -0.04890792, 0.06985068, 0.037654128,
+ 0.03398274, -0.004781977, 0.007032333, -0.031787455,
+ 0.010868644, -0.031489216, 0.09525667, 0.013939797,
+ 0.0058680447, 0.0167067, 0.02668468, -0.04797466,
+ -0.048885044, -0.12722108, 0.035304096, 0.06554885,
+ 0.00972396, -0.039238118, -0.05159735, -0.11329045,
+ 0.1613692, -0.03750952, 0.06529313, -0.071974665,
+ -0.11769596, 0.015524369, -0.0013754242, -0.12446318,
+ 0.02786344, -0.014179351, 0.005264273, 0.14376344,
+ 0.015983658, 0.03406988, -0.06939408, 0.040699873,
+ 0.02111075, 0.09669095, 0.041345075, -0.08316494,
+ -0.07684199, -0.045768797, 0.032298047, -0.041805092,
+ 0.0119405, 0.0061010392, 0.12652606, 0.0064572375,
+ -0.024950314, 0.11574242, 0.04508852, -0.04335324,
+ 0.06760663, -0.027437469, 0.07216407, 0.06977076,
+ -0.05438599, 0.034033038, -0.028602652, 0.05346137,
+ 0.043184172, -0.037189785, 0.10420091, 0.00882477,
+ -0.054019816, -0.074273005, -0.030617684, -0.0028467078,
+ 0.024302477, -0.0038869337, 0.005332455, 0.0013399826,
+ 0.04361412, -0.007001822, 0.09631092, -0.06702025,
+ -0.042049985, -0.035070654, -0.04103342, -0.10273396,
+ 0.0544271, 0.037184782, -0.13150354, -0.0058036847,
+ -0.008264958, 0.042035464, 0.05891794, 0.029673764,
+ 0.0063542654, 0.044788733, 0.054816857, 0.062257513,
+ -0.00093483756, 0.048938446, -0.004952862, -0.007730018,
+ -0.04043371, -0.017094059, 0.07229206, -0.023670016,
+ -0.052195564, -0.025616996, -0.01520939, 0.045104615,
+ -0.007376126, 0.003533447, 0.006570588, 0.056037236,
+ 0.12436656, 0.051817212, 0.028532185, -0.08686856,
+ 0.11868599, 0.07663395, -0.07323171, 0.03463402,
+ -0.050708205, -0.04458982, -0.11590894, 0.021273347,
+ 0.1251325, -0.15313013, -0.12224372, 0.17228661,
+ 0.023029093, 0.086124025, 0.006445803, -0.03496501,
+ 0.028332196, 0.04449512, -0.042436164, -0.026587414,
+ -0.006041347, -0.09292539, -0.05678812, 0.03897832,
+ 0.09465633, 0.008115513, -0.02171956, 0.08304309,
+ 0.071401566, 0.019622514, 0.032163795, -0.004167056,
+ 0.02295182, 0.030739572, 0.056506045, 0.004612461,
+ 0.06524936, 0.059999723, 0.046395954, -0.0045512207,
+ -0.1335546, -0.030136576, 0.11584653, -0.014678886,
+ 0.0020118146, -0.09688814, -0.0790206, 0.039770417,
+ -0.0329582, 0.07922767, 0.029322514, 0.026405897,
+ 0.04207835, -0.07073373, 0.063781224, 0.0859677,
+ -0.10925287, -0.07011058, 0.048005477, 0.03438226,
+ -0.09606514, -0.006669445, -0.043381985, 0.04240257,
+ -0.06955775, -0.06769346, 0.043903265, -0.026784198,
+ -0.017840602, 0.024307009, -0.040079936, -0.019946516,
+ 0.045318738, -0.12233574, 0.026170589, 0.0074471775,
+ 0.15978073, 0.10185836, 0.10298046, -0.015476589,
+ -0.039390966, -0.072174534, 0.0739445, -0.1211869,
+ -0.0347889, -0.07943156, 0.014809798, -0.12412325,
+ -0.0030663363, 0.039695457, 0.0647603, -0.08291318,
+ -0.018529687, -0.004423833, 0.0037507233, 0.084633216,
+ -0.01514876, -0.056505352, -0.012800942, -0.06994386,
+ 0.012962922, -0.031234352, 0.07029052, 0.016418684,
+ 0.03618972, 0.055686004, -0.08663945, -0.017404709,
+ -0.054761406, 0.029065743, 0.052404847, 0.020238016,
+ 0.0048197987, -0.0214882, 0.07078733, 0.013016777,
+ 0.06262858, 0.009184685, 0.020785125, -0.043904778,
+ -0.0270329, -0.03299152, -0.060088247, -0.015162964,
+ -0.001828936, 0.12642565, -0.056757294, 0.013586685,
+ 0.09232601, -0.035886683, 0.06000002, 0.05229691,
+ -0.052580316, -0.082029596, -0.010794592, 0.012947712,
+ -0.036429964, -0.085508935, -0.13127148, -0.017744139,
+ 0.031502828, 0.036232427, -0.031581745, 0.023051167,
+ -0.05325106, -0.03421577, 0.028793324, -0.034633752,
+ -0.009881397, -0.043551125, -0.018609839, 0.0019097115,
+ -0.008799762, 0.056595087, 0.0022273948, 0.055752404});
+
+ lstm.SetRecurrentToOutputWeights({
+ 0.025825322, -0.05813119, 0.09495884, -0.045984812, -0.01255415,
+ -0.0026479573, -0.08196161, -0.054914974, -0.0046604523, -0.029587349,
+ -0.044576716, -0.07480124, -0.082868785, 0.023254942, 0.027502948,
+ -0.0039728214, -0.08683098, -0.08116779, -0.014675607, -0.037924774,
+ -0.023314456, -0.007401714, -0.09255757, 0.029460307, -0.08829125,
+ -0.005139627, -0.08989442, -0.0555066, 0.13596267, -0.025062224,
+ -0.048351806, -0.03850004, 0.07266485, -0.022414139, 0.05940088,
+ 0.075114764, 0.09597592, -0.010211725, -0.0049794707, -0.011523867,
+ -0.025980417, 0.072999895, 0.11091378, -0.081685916, 0.014416728,
+ 0.043229222, 0.034178585, -0.07530371, 0.035837382, -0.085607,
+ -0.007721233, -0.03287832, -0.043848954, -0.06404588, -0.06632928,
+ -0.073643476, 0.008214239, -0.045984086, 0.039764922, 0.03474462,
+ 0.060612556, -0.080590084, 0.049127717, 0.04151091, -0.030063879,
+ 0.008801774, -0.023021035, -0.019558564, 0.05158114, -0.010947698,
+ -0.011825728, 0.0075720972, 0.0699727, -0.0039981045, 0.069350146,
+ 0.08799282, 0.016156472, 0.035502106, 0.11695009, 0.006217345,
+ 0.13392477, -0.037875112, 0.025745004, 0.08940699, -0.00924166,
+ 0.0046702605, -0.036598757, -0.08811812, 0.10522024, -0.032441203,
+ 0.008176899, -0.04454919, 0.07058152, 0.0067963637, 0.039206743,
+ 0.03259838, 0.03725492, -0.09515802, 0.013326398, -0.052055415,
+ -0.025676316, 0.03198509, -0.015951829, -0.058556724, 0.036879618,
+ 0.043357447, 0.028362012, -0.05908629, 0.0059240665, -0.04995891,
+ -0.019187413, 0.0276265, -0.01628143, 0.0025863599, 0.08800015,
+ 0.035250366, -0.022165963, -0.07328642, -0.009415526, -0.07455109,
+ 0.11690406, 0.0363299, 0.07411125, 0.042103454, -0.009660886,
+ 0.019076364, 0.018299393, -0.046004917, 0.08891175, 0.0431396,
+ -0.026327137, -0.051502608, 0.08979574, -0.051670972, 0.04940282,
+ -0.07491107, -0.021240504, 0.022596184, -0.034280192, 0.060163025,
+ -0.058211457, -0.051837247, -0.01349775, -0.04639988, -0.035936575,
+ -0.011681591, 0.064818054, 0.0073146066, -0.021745546, -0.043124277,
+ -0.06471268, -0.07053354, -0.029321948, -0.05330136, 0.016933719,
+ -0.053782392, 0.13747959, -0.1361751, -0.11569455, 0.0033329215,
+ 0.05693899, -0.053219706, 0.063698, 0.07977434, -0.07924483,
+ 0.06936997, 0.0034815092, -0.007305279, -0.037325785, -0.07251102,
+ -0.033633437, -0.08677009, 0.091591336, -0.14165086, 0.021752775,
+ 0.019683983, 0.0011612234, -0.058154266, 0.049996935, 0.0288841,
+ -0.0024567875, -0.14345716, 0.010955264, -0.10234828, 0.1183656,
+ -0.0010731248, -0.023590032, -0.072285876, -0.0724771, -0.026382286,
+ -0.0014920527, 0.042667855, 0.0018776858, 0.02986552, 0.009814309,
+ 0.0733756, 0.12289186, 0.018043943, -0.0458958, 0.049412545,
+ 0.033632483, 0.05495232, 0.036686596, -0.013781798, -0.010036754,
+ 0.02576849, -0.08307328, 0.010112348, 0.042521734, -0.05869831,
+ -0.071689695, 0.03876447, -0.13275425, -0.0352966, -0.023077697,
+ 0.10285965, 0.084736146, 0.15568255, -0.00040734606, 0.027835453,
+ -0.10292561, -0.032401145, 0.10053256, -0.026142767, -0.08271222,
+ -0.0030240538, -0.016368777, 0.1070414, 0.042672627, 0.013456989,
+ -0.0437609, -0.022309763, 0.11576483, 0.04108048, 0.061026827,
+ -0.0190714, -0.0869359, 0.037901703, 0.0610107, 0.07202949,
+ 0.01675338, 0.086139716, -0.08795751, -0.014898893, -0.023771819,
+ -0.01965048, 0.007955471, -0.043740474, 0.03346837, -0.10549954,
+ 0.090567775, 0.042013682, -0.03176985, 0.12569028, -0.02421228,
+ -0.029526481, 0.023851605, 0.031539805, 0.05292009, -0.02344001,
+ -0.07811758, -0.08834428, 0.10094801, 0.16594367, -0.06861939,
+ -0.021256343, -0.041093912, -0.06669611, 0.035498552, 0.021757556,
+ -0.09302526, -0.015403468, -0.06614931, -0.051798206, -0.013874718,
+ 0.03630673, 0.010412845, -0.08077351, 0.046185967, 0.0035662893,
+ 0.03541868, -0.094149634, -0.034814864, 0.003128424, -0.020674974,
+ -0.03944324, -0.008110165, -0.11113267, 0.08484226, 0.043586485,
+ 0.040582247, 0.0968012, -0.065249965, -0.028036479, 0.0050708856,
+ 0.0017462453, 0.0326779, 0.041296225, 0.09164146, -0.047743853,
+ -0.015952192, -0.034451712, 0.084197424, -0.05347844, -0.11768019,
+ 0.085926116, -0.08251791, -0.045081906, 0.0948852, 0.068401024,
+ 0.024856757, 0.06978981, -0.057309967, -0.012775832, -0.0032452994,
+ 0.01977615, -0.041040014, -0.024264973, 0.063464895, 0.05431621,
+ });
+
+ lstm.SetCellToInputWeights(
+ {0.040369894, 0.030746894, 0.24704495, 0.018586371, -0.037586458,
+ -0.15312155, -0.11812848, -0.11465643, 0.20259799, 0.11418174,
+ -0.10116027, -0.011334949, 0.12411352, -0.076769054, -0.052169047,
+ 0.21198851, -0.38871562, -0.09061183, -0.09683246, -0.21929175});
+
+ lstm.SetCellToForgetWeights(
+ {-0.01998659, -0.15568835, -0.24248174, -0.012770197, 0.041331276,
+ -0.072311886, -0.052123554, -0.0066330447, -0.043891653, 0.036225766,
+ -0.047248036, 0.021479502, 0.033189066, 0.11952997, -0.020432774,
+ 0.64658105, -0.06650122, -0.03467612, 0.095340036, 0.23647355});
+
+ lstm.SetCellToOutputWeights(
+ {0.08286371, -0.08261836, -0.51210177, 0.002913762, 0.17764764,
+ -0.5495371, -0.08460716, -0.24552552, 0.030037103, 0.04123544,
+ -0.11940523, 0.007358328, 0.1890978, 0.4833202, -0.34441817,
+ 0.36312827, -0.26375428, 0.1457655, -0.19724406, 0.15548733});
+
+ lstm.SetProjectionWeights(
+ {-0.009802181, 0.09401916, 0.0717386, -0.13895074, 0.09641832,
+ 0.060420845, 0.08539281, 0.054285463, 0.061395317, 0.034448683,
+ -0.042991187, 0.019801661, -0.16840284, -0.015726732, -0.23041931,
+ -0.024478018, -0.10959692, -0.013875541, 0.18600968, -0.061274476,
+ 0.0138165, -0.08160894, -0.07661644, 0.032372914, 0.16169067,
+ 0.22465782, -0.03993472, -0.004017731, 0.08633481, -0.28869787,
+ 0.08682067, 0.17240396, 0.014975425, 0.056431185, 0.031037588,
+ 0.16702051, 0.0077946745, 0.15140012, 0.29405436, 0.120285,
+ -0.188994, -0.027265169, 0.043389652, -0.022061434, 0.014777949,
+ -0.20203483, 0.094781205, 0.19100232, 0.13987629, -0.036132768,
+ -0.06426278, -0.05108664, 0.13221376, 0.009441198, -0.16715929,
+ 0.15859416, -0.040437475, 0.050779544, -0.022187516, 0.012166504,
+ 0.027685808, -0.07675938, -0.0055694645, -0.09444123, 0.0046453946,
+ 0.050794356, 0.10770313, -0.20790008, -0.07149004, -0.11425117,
+ 0.008225835, -0.035802525, 0.14374903, 0.15262283, 0.048710253,
+ 0.1847461, -0.007487823, 0.11000021, -0.09542012, 0.22619456,
+ -0.029149994, 0.08527916, 0.009043713, 0.0042746216, 0.016261552,
+ 0.022461696, 0.12689082, -0.043589946, -0.12035478, -0.08361797,
+ -0.050666027, -0.1248618, -0.1275799, -0.071875185, 0.07377272,
+ 0.09944291, -0.18897448, -0.1593054, -0.06526116, -0.040107165,
+ -0.004618631, -0.067624845, -0.007576253, 0.10727444, 0.041546922,
+ -0.20424393, 0.06907816, 0.050412357, 0.00724631, 0.039827548,
+ 0.12449835, 0.10747581, 0.13708383, 0.09134148, -0.12617786,
+ -0.06428341, 0.09956831, 0.1208086, -0.14676677, -0.0727722,
+ 0.1126304, 0.010139365, 0.015571211, -0.038128063, 0.022913318,
+ -0.042050496, 0.16842307, -0.060597885, 0.10531834, -0.06411776,
+ -0.07451711, -0.03410368, -0.13393489, 0.06534304, 0.003620307,
+ 0.04490757, 0.05970546, 0.05197996, 0.02839995, 0.10434969,
+ -0.013699693, -0.028353551, -0.07260381, 0.047201227, -0.024575593,
+ -0.036445823, 0.07155557, 0.009672501, -0.02328883, 0.009533515,
+ -0.03606021, -0.07421458, -0.028082801, -0.2678904, -0.13221288,
+ 0.18419984, -0.13012612, -0.014588381, -0.035059117, -0.04824723,
+ 0.07830115, -0.056184657, 0.03277091, 0.025466874, 0.14494097,
+ -0.12522776, -0.098633975, -0.10766018, -0.08317623, 0.08594209,
+ 0.07749552, 0.039474737, 0.1776665, -0.07409566, -0.0477268,
+ 0.29323658, 0.10801441, 0.1154011, 0.013952499, 0.10739139,
+ 0.10708251, -0.051456142, 0.0074137426, -0.10430189, 0.10034707,
+ 0.045594677, 0.0635285, -0.0715442, -0.089667566, -0.10811871,
+ 0.00026344223, 0.08298446, -0.009525053, 0.006585689, -0.24567553,
+ -0.09450807, 0.09648481, 0.026996298, -0.06419476, -0.04752702,
+ -0.11063944, -0.23441927, -0.17608605, -0.052156363, 0.067035615,
+ 0.19271925, -0.0032889997, -0.043264326, 0.09663576, -0.057112187,
+ -0.10100678, 0.0628376, 0.04447668, 0.017961001, -0.10094388,
+ -0.10190601, 0.18335468, 0.10494553, -0.052095775, -0.0026118709,
+ 0.10539724, -0.04383912, -0.042349473, 0.08438151, -0.1947263,
+ 0.02251204, 0.11216432, -0.10307853, 0.17351969, -0.039091777,
+ 0.08066188, -0.00561982, 0.12633002, 0.11335965, -0.0088127935,
+ -0.019777594, 0.06864014, -0.059751723, 0.016233567, -0.06894641,
+ -0.28651384, -0.004228674, 0.019708522, -0.16305895, -0.07468996,
+ -0.0855457, 0.099339016, -0.07580735, -0.13775392, 0.08434318,
+ 0.08330512, -0.12131499, 0.031935584, 0.09180414, -0.08876437,
+ -0.08049874, 0.008753825, 0.03498998, 0.030215185, 0.03907079,
+ 0.089751154, 0.029194152, -0.03337423, -0.019092513, 0.04331237,
+ 0.04299654, -0.036394123, -0.12915532, 0.09793732, 0.07512415,
+ -0.11319543, -0.032502122, 0.15661901, 0.07671967, -0.005491124,
+ -0.19379048, -0.218606, 0.21448623, 0.017840758, 0.1416943,
+ -0.07051762, 0.19488361, 0.02664691, -0.18104725, -0.09334311,
+ 0.15026465, -0.15493552, -0.057762887, -0.11604192, -0.262013,
+ -0.01391798, 0.012185008, 0.11156489, -0.07483202, 0.06693364,
+ -0.26151478, 0.046425626, 0.036540434, -0.16435726, 0.17338543,
+ -0.21401681, -0.11385144, -0.08283257, -0.069031075, 0.030635102,
+ 0.010969227, 0.11109743, 0.010919218, 0.027526086, 0.13519906,
+ 0.01891392, -0.046839405, -0.040167913, 0.017953383, -0.09700955,
+ 0.0061885654, -0.07000971, 0.026893595, -0.038844477, 0.14543656});
+
+ static float lstm_input[][20] = {
+ {// Batch0: 4 (input_sequence_size) * 5 (n_input)
+ 0.787926, 0.151646, 0.071352, 0.118426, 0.458058, 0.596268, 0.998386,
+ 0.568695, 0.864524, 0.571277, 0.073204, 0.296072, 0.743333, 0.069199,
+ 0.045348, 0.867394, 0.291279, 0.013714, 0.482521, 0.626339},
+
+ {// Batch1: 4 (input_sequence_size) * 5 (n_input)
+ 0.295743, 0.544053, 0.690064, 0.858138, 0.497181, 0.642421, 0.524260,
+ 0.134799, 0.003639, 0.162482, 0.640394, 0.930399, 0.050782, 0.432485,
+ 0.988078, 0.082922, 0.563329, 0.865614, 0.333232, 0.259916}};
+
+ static float lstm_golden_output[][64] = {
+ {// Batch0: 4 (input_sequence_size) * 16 (n_output)
+ -0.00396806, 0.029352, -0.00279226, 0.0159977, -0.00835576,
+ -0.0211779, 0.0283512, -0.0114597, 0.00907307, -0.0244004,
+ -0.0152191, -0.0259063, 0.00914318, 0.00415118, 0.017147,
+ 0.0134203, -0.0166936, 0.0381209, 0.000889694, 0.0143363,
+ -0.0328911, -0.0234288, 0.0333051, -0.012229, 0.0110322,
+ -0.0457725, -0.000832209, -0.0202817, 0.0327257, 0.0121308,
+ 0.0155969, 0.0312091, -0.0213783, 0.0350169, 0.000324794,
+ 0.0276012, -0.0263374, -0.0371449, 0.0446149, -0.0205474,
+ 0.0103729, -0.0576349, -0.0150052, -0.0292043, 0.0376827,
+ 0.0136115, 0.0243435, 0.0354492, -0.0189322, 0.0464512,
+ -0.00251373, 0.0225745, -0.0308346, -0.0317124, 0.0460407,
+ -0.0189395, 0.0149363, -0.0530162, -0.0150767, -0.0340193,
+ 0.0286833, 0.00824207, 0.0264887, 0.0305169},
+ {// Batch1: 4 (input_sequence_size) * 16 (n_output)
+ -0.013869, 0.0287268, -0.00334693, 0.00733398, -0.0287926,
+ -0.0186926, 0.0193662, -0.0115437, 0.00422612, -0.0345232,
+ 0.00223253, -0.00957321, 0.0210624, 0.013331, 0.0150954,
+ 0.02168, -0.0141913, 0.0322082, 0.00227024, 0.0260507,
+ -0.0188721, -0.0296489, 0.0399134, -0.0160509, 0.0116039,
+ -0.0447318, -0.0150515, -0.0277406, 0.0316596, 0.0118233,
+ 0.0214762, 0.0293641, -0.0204549, 0.0450315, -0.00117378,
+ 0.0167673, -0.0375007, -0.0238314, 0.038784, -0.0174034,
+ 0.0131743, -0.0506589, -0.0048447, -0.0240239, 0.0325789,
+ 0.00790065, 0.0220157, 0.0333314, -0.0264787, 0.0387855,
+ -0.000764675, 0.0217599, -0.037537, -0.0335206, 0.0431679,
+ -0.0211424, 0.010203, -0.062785, -0.00832363, -0.025181,
+ 0.0412031, 0.0118723, 0.0239643, 0.0394009}};
+
+  // Reset cell_state and output_state to all zeros.
+ lstm.ResetCellState();
+ lstm.ResetOutputState();
+
+ const int input_sequence_size =
+ sizeof(lstm_input[0]) / sizeof(float) / (lstm.num_inputs());
+ for (int i = 0; i < input_sequence_size; i++) {
+ float* batch0_start = lstm_input[0] + i * lstm.num_inputs();
+ float* batch0_end = batch0_start + lstm.num_inputs();
+
+ lstm.SetInput(0, batch0_start, batch0_end);
+
+ float* batch1_start = lstm_input[1] + i * lstm.num_inputs();
+ float* batch1_end = batch1_start + lstm.num_inputs();
+ lstm.SetInput(lstm.num_inputs(), batch1_start, batch1_end);
+
+ lstm.Invoke();
+
+ float* golden_start_batch0 = lstm_golden_output[0] + i * lstm.num_outputs();
+ float* golden_end_batch0 = golden_start_batch0 + lstm.num_outputs();
+ float* golden_start_batch1 = lstm_golden_output[1] + i * lstm.num_outputs();
+ float* golden_end_batch1 = golden_start_batch1 + lstm.num_outputs();
+ std::vector<float> expected;
+ expected.insert(expected.end(), golden_start_batch0, golden_end_batch0);
+ expected.insert(expected.end(), golden_start_batch1, golden_end_batch1);
+ EXPECT_THAT(lstm.GetOutput(), ElementsAreArray(ArrayFloatNear(expected)));
+ }
+}
+
+} // namespace
+} // namespace tflite
+
+int main(int argc, char** argv) {
+  // Send test logging to stderr so failures are easier to inspect.
+  tflite::LogToStderr();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/mul.cc b/tensorflow/contrib/lite/kernels/mul.cc
new file mode 100644
index 0000000000..81c73f2523
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/mul.cc
@@ -0,0 +1,167 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace mul {
+
+// This file has three implementations of Mul.
+enum KernelType {
+ kReference,
+ kGenericOptimized, // Neon-free
+ kNeonOptimized,
+};
+
+constexpr int kInputTensor1 = 0;
+constexpr int kInputTensor2 = 1;
+constexpr int kOutputTensor = 0;
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+ TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
+ TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+ TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
+ TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
+ TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+ TF_LITE_ENSURE_EQ(context, NumDimensions(input1), NumDimensions(input2));
+ for (int i = 0; i < NumDimensions(input1); ++i) {
+ TF_LITE_ENSURE_EQ(context, SizeOfDimension(input1, i),
+ SizeOfDimension(input2, i));
+ }
+
+ TF_LITE_ENSURE_EQ(context, input1->type, output->type);
+ TF_LITE_ENSURE_EQ(context, input2->type, output->type);
+
+ TfLiteIntArray* output_size = TfLiteIntArrayCopy(input1->dims);
+ return context->ResizeTensor(context, output, output_size);
+}
+
+template <KernelType kernel_type>
+void EvalFloat(TfLiteContext* context, TfLiteNode* node,
+ TfLiteMulParams* params, TfLiteTensor* input1,
+ TfLiteTensor* input2, TfLiteTensor* output) {
+ float output_activation_min, output_activation_max;
+ CalculateActivationRangeFloat(params->activation, &output_activation_min,
+ &output_activation_max);
+#define TF_LITE_MUL(type) \
+ type::Mul(GetTensorData<float>(input1), GetTensorDims(input1), \
+ GetTensorData<float>(input2), GetTensorDims(input2), \
+ output_activation_min, output_activation_max, \
+ GetTensorData<float>(output), GetTensorDims(output))
+ if (kernel_type == kReference) {
+ TF_LITE_MUL(reference_ops);
+ } else {
+ TF_LITE_MUL(optimized_ops);
+ }
+#undef TF_LITE_MUL
+}
+
+template <KernelType kernel_type>
+void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
+ TfLiteMulParams* params, TfLiteTensor* input1,
+ TfLiteTensor* input2, TfLiteTensor* output) {
+ auto input1_offset = -input1->params.zero_point;
+ auto input2_offset = -input2->params.zero_point;
+ auto output_offset = output->params.zero_point;
+
+ int32_t output_multiplier;
+ int output_shift;
+
+ double real_multiplier =
+ input1->params.scale * input2->params.scale / output->params.scale;
+ QuantizeMultiplierSmallerThanOne(real_multiplier, &output_multiplier,
+ &output_shift);
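+  // The combined rescale factor real_multiplier = s1 * s2 / s_out is expected
+  // to be below 1 and is encoded as a 32-bit fixed-point multiplier plus a
+  // shift so the kernel can stay in integer arithmetic.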
+
+ int32 output_activation_min, output_activation_max;
+ CalculateActivationRangeUint8(params->activation, output,
+ &output_activation_min, &output_activation_max);
+
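+  // Prepare() requires both inputs to have identical shapes, so BroadcastMul
+  // below reduces to a plain element-wise multiply here.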
+#define TF_LITE_MUL(type) \
+ type::BroadcastMul(GetTensorData<uint8_t>(input1), GetTensorDims(input1), \
+ input1_offset, GetTensorData<uint8_t>(input2), \
+ GetTensorDims(input2), input2_offset, output_offset, \
+ output_multiplier, output_shift, output_activation_min, \
+ output_activation_max, GetTensorData<uint8_t>(output), \
+ GetTensorDims(output));
+ if (kernel_type == kReference) {
+ TF_LITE_MUL(reference_ops);
+ } else {
+ TF_LITE_MUL(optimized_ops);
+ }
+#undef TF_LITE_MUL
+}
+
+template <KernelType kernel_type>
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+ auto* params = reinterpret_cast<TfLiteMulParams*>(node->builtin_data);
+
+ TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
+ TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
+ TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+ if (output->type == kTfLiteFloat32) {
+ EvalFloat<kernel_type>(context, node, params, input1, input2, output);
+ } else if (output->type == kTfLiteUInt8) {
+ EvalQuantized<kernel_type>(context, node, params, input1, input2, output);
+ } else {
+ context->ReportError(context,
+ "Mul only supports FLOAT32 and quantized UINT8 now.");
+ return kTfLiteError;
+ }
+
+ return kTfLiteOk;
+}
+
+} // namespace mul
+
+TfLiteRegistration* Register_MUL_REF() {
+ static TfLiteRegistration r = {nullptr, nullptr, mul::Prepare,
+ mul::Eval<mul::kReference>};
+ return &r;
+}
+
+TfLiteRegistration* Register_MUL_GENERIC_OPT() {
+ static TfLiteRegistration r = {nullptr, nullptr, mul::Prepare,
+ mul::Eval<mul::kGenericOptimized>};
+ return &r;
+}
+
+TfLiteRegistration* Register_MUL_NEON_OPT() {
+ static TfLiteRegistration r = {nullptr, nullptr, mul::Prepare,
+ mul::Eval<mul::kNeonOptimized>};
+ return &r;
+}
+
+TfLiteRegistration* Register_MUL() {
+#ifdef USE_NEON
+ return Register_MUL_NEON_OPT();
+#else
+ return Register_MUL_GENERIC_OPT();
+#endif
+}
+
+} // namespace builtin
+} // namespace ops
+} // namespace tflite
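
For orientation, the quantized path relies on the usual affine interpretation real ~= scale * (q - zero_point); note that input1_offset and input2_offset above are the negated zero points. The float-based sketch below is only illustrative of that arithmetic (the kernel instead folds s1 * s2 / s_out into a fixed-point multiplier); the function name and structure are assumptions for the example.

#include <algorithm>
#include <cmath>
#include <cstdint>

// One element of a quantized multiply, written with float arithmetic for
// clarity. q1 and q2 are the stored uint8 values of the two inputs.
uint8_t QuantizedMulOneElement(uint8_t q1, int32_t zero1, float scale1,
                               uint8_t q2, int32_t zero2, float scale2,
                               int32_t zero_out, float scale_out) {
  const float real1 = scale1 * (static_cast<int32_t>(q1) - zero1);
  const float real2 = scale2 * (static_cast<int32_t>(q2) - zero2);
  const float real_out = real1 * real2;
  // Requantize: divide by the output scale, shift by the output zero point,
  // round and clamp to the uint8 range.
  const int32_t q =
      zero_out + static_cast<int32_t>(std::lround(real_out / scale_out));
  return static_cast<uint8_t>(std::min<int32_t>(255, std::max<int32_t>(0, q)));
}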
diff --git a/tensorflow/contrib/lite/kernels/mul_test.cc b/tensorflow/contrib/lite/kernels/mul_test.cc
new file mode 100644
index 0000000000..4b858e1f39
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/mul_test.cc
@@ -0,0 +1,127 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+class BaseMulOpModel : public SingleOpModel {
+ public:
+ BaseMulOpModel(TensorData input, TensorData output,
+ ActivationFunctionType activation_type) {
+ input1_ = AddInput(input);
+ input2_ = AddInput(input);
+ output_ = AddOutput(output);
+ SetBuiltinOp(BuiltinOperator_MUL, BuiltinOptions_MulOptions,
+ CreateMulOptions(builder_, activation_type).Union());
+ BuildInterpreter({GetShape(input1_), GetShape(input2_)});
+ }
+
+ int input1() { return input1_; }
+ int input2() { return input2_; }
+
+ protected:
+ int input1_;
+ int input2_;
+ int output_;
+};
+
+class FloatMulOpModel : public BaseMulOpModel {
+ public:
+ using BaseMulOpModel::BaseMulOpModel;
+
+ std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+};
+
+// For quantized Mul the error shouldn't exceed (2*step + step^2): with inputs
+// in [-1, 1] and a per-value quantization error of at most one step,
+// (a +/- step)*(b +/- step) differs from a*b by at most 2*step + step^2.
+// The params min=-1.0 & max=1.0 are used in the following tests, so the step
+// is 2/255 and the tolerance value is ~0.0157.
+const float kQuantizedStep = 2.0 / 255.0;
+const float kQuantizedTolerance =
+ 2.0 * kQuantizedStep + kQuantizedStep * kQuantizedStep;
+
+class QuantizedMulOpModel : public BaseMulOpModel {
+ public:
+ using BaseMulOpModel::BaseMulOpModel;
+
+ std::vector<float> GetDequantizedOutput() {
+ return Dequantize<uint8_t>(ExtractVector<uint8_t>(output_),
+ GetScale(output_), GetZeroPoint(output_));
+ }
+};
+
+TEST(FloatMulOpTest, NoActivation) {
+ FloatMulOpModel m({TensorType_FLOAT32, {1, 2, 2, 1}},
+ {TensorType_FLOAT32, {}}, ActivationFunctionType_NONE);
+ m.PopulateTensor<float>(m.input1(), {-2.0, 0.2, 0.7, 0.8});
+ m.PopulateTensor<float>(m.input2(), {0.1, 0.2, 0.3, 0.5});
+ m.Invoke();
+ EXPECT_THAT(m.GetOutput(),
+ ElementsAreArray(ArrayFloatNear({-0.2, 0.04, 0.21, 0.4})));
+}
+
+TEST(FloatMulOpTest, ActivationRELU1) {
+ FloatMulOpModel m({TensorType_FLOAT32, {1, 2, 2, 1}},
+ {TensorType_FLOAT32, {}}, ActivationFunctionType_RELU1);
+ m.PopulateTensor<float>(m.input1(), {-2.0, 0.2, 0.7, 0.8});
+ m.PopulateTensor<float>(m.input2(), {0.1, 0.2, 0.3, 5});
+ m.Invoke();
+ EXPECT_THAT(m.GetOutput(),
+ ElementsAreArray(ArrayFloatNear({-0.2, 0.04, 0.21, 1.0})));
+}
+
+TEST(FloatMulOpTest, VariousInputShapes) {
+ std::vector<std::initializer_list<int>> test_shapes = {
+ {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
+ for (int i = 0; i < test_shapes.size(); ++i) {
+ FloatMulOpModel m({TensorType_FLOAT32, test_shapes[i]},
+ {TensorType_FLOAT32, {}}, ActivationFunctionType_NONE);
+ m.PopulateTensor<float>(m.input1(), {-2.0, 0.2, 0.7, 0.8, 1.1, 2.0});
+ m.PopulateTensor<float>(m.input2(), {0.1, 0.2, 0.3, 0.5, 1.1, 0.1});
+ m.Invoke();
+ EXPECT_THAT(
+ m.GetOutput(),
+ ElementsAreArray(ArrayFloatNear({-0.2, 0.04, 0.21, 0.4, 1.21, 0.2})))
+ << "With shape number " << i;
+ }
+}
+
+TEST(QuantizedMulOpTest, NoActivation) {
+ QuantizedMulOpModel m({TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0},
+ {TensorType_UINT8, {}, -1.0, 1.0},
+ ActivationFunctionType_NONE);
+ m.QuantizeAndPopulate<uint8_t>(m.input1(), {-0.8, 0.2, 0.9, 0.7});
+ m.QuantizeAndPopulate<uint8_t>(m.input2(), {0.6, 0.4, 0.9, 0.8});
+ m.Invoke();
+ EXPECT_THAT(m.GetDequantizedOutput(),
+ ElementsAreArray(ArrayFloatNear({-0.48, 0.08, 0.81, 0.56},
+ kQuantizedTolerance)));
+}
+
+} // namespace
+} // namespace tflite
+
+int main(int argc, char** argv) {
+  // Send log output to stderr so failures are easier to diagnose.
+ tflite::LogToStderr();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/op_macros.h b/tensorflow/contrib/lite/kernels/op_macros.h
new file mode 100644
index 0000000000..7535afaf8e
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/op_macros.h
@@ -0,0 +1,32 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_OP_MACROS_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_OP_MACROS_H_
+
+#define TF_LITE_FATAL(msg) \
+ do { \
+ fprintf(stderr, "%s\n", (msg)); \
+ exit(1); \
+ } while (0)
+#define TF_LITE_ASSERT(x) \
+ do { \
+ if (!(x)) TF_LITE_FATAL(#x); \
+ } while (0)
+#define TF_LITE_ASSERT_EQ(x, y) \
+ do { \
+ if ((x) != (y)) TF_LITE_FATAL(#x " didn't equal " #y); \
+ } while (0)
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_OP_MACROS_H_
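
A brief usage sketch of these macros (hypothetical checks, for illustration only; note that callers need fprintf and exit in scope, e.g. via <cstdio> and <cstdlib>):

    #include <cstdio>
    #include <cstdlib>
    #include "tensorflow/contrib/lite/kernels/op_macros.h"

    // Aborts the process with a stringified message if a precondition fails.
    void CheckWindow(int filter_width, int filter_height, int stride) {
      TF_LITE_ASSERT(stride > 0);                      // aborts with "stride > 0"
      TF_LITE_ASSERT_EQ(filter_width, filter_height);  // "... didn't equal ..."
      if (filter_width <= 0) TF_LITE_FATAL("non-positive filter size");
    }
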
diff --git a/tensorflow/contrib/lite/kernels/optional_tensor_test.cc b/tensorflow/contrib/lite/kernels/optional_tensor_test.cc
new file mode 100644
index 0000000000..8977d27f73
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/optional_tensor_test.cc
@@ -0,0 +1,343 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// Unit test for the TFLite LSTM op with optional (omitted) input tensors.
+
+#include <iomanip>
+#include <memory>
+#include <vector>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+class LSTMOpModel : public SingleOpModel {
+ public:
+ LSTMOpModel(int n_batch, int n_input, int n_cell, int n_output, bool use_cifg,
+ bool use_peephole, bool use_projection_weights,
+ bool use_projection_bias, float cell_clip, float proj_clip,
+ const std::vector<std::vector<int>>& input_shapes)
+ : n_batch_(n_batch),
+ n_input_(n_input),
+ n_cell_(n_cell),
+ n_output_(n_output) {
+ input_ = AddInput(TensorType_FLOAT32);
+
+ if (use_cifg) {
+ input_to_input_weights_ = AddNullInput();
+ } else {
+ input_to_input_weights_ = AddInput(TensorType_FLOAT32);
+ }
+
+ input_to_forget_weights_ = AddInput(TensorType_FLOAT32);
+ input_to_cell_weights_ = AddInput(TensorType_FLOAT32);
+ input_to_output_weights_ = AddInput(TensorType_FLOAT32);
+
+ if (use_cifg) {
+ recurrent_to_input_weights_ = AddNullInput();
+ } else {
+ recurrent_to_input_weights_ = AddInput(TensorType_FLOAT32);
+ }
+
+ recurrent_to_forget_weights_ = AddInput(TensorType_FLOAT32);
+ recurrent_to_cell_weights_ = AddInput(TensorType_FLOAT32);
+ recurrent_to_output_weights_ = AddInput(TensorType_FLOAT32);
+
+ if (use_peephole) {
+ if (use_cifg) {
+ cell_to_input_weights_ = AddNullInput();
+ } else {
+ cell_to_input_weights_ = AddInput(TensorType_FLOAT32);
+ }
+ cell_to_forget_weights_ = AddInput(TensorType_FLOAT32);
+ cell_to_output_weights_ = AddInput(TensorType_FLOAT32);
+ } else {
+ cell_to_input_weights_ = AddNullInput();
+ cell_to_forget_weights_ = AddNullInput();
+ cell_to_output_weights_ = AddNullInput();
+ }
+
+ if (use_cifg) {
+ input_gate_bias_ = AddNullInput();
+ } else {
+ input_gate_bias_ = AddInput(TensorType_FLOAT32);
+ }
+ forget_gate_bias_ = AddInput(TensorType_FLOAT32);
+ cell_bias_ = AddInput(TensorType_FLOAT32);
+ output_gate_bias_ = AddInput(TensorType_FLOAT32);
+
+ if (use_projection_weights) {
+ projection_weights_ = AddInput(TensorType_FLOAT32);
+ if (use_projection_bias) {
+ projection_bias_ = AddInput(TensorType_FLOAT32);
+ } else {
+ projection_bias_ = AddNullInput();
+ }
+ } else {
+ projection_weights_ = AddNullInput();
+ projection_bias_ = AddNullInput();
+ }
+
+ scratch_buffer_ = AddOutput(TensorType_FLOAT32);
+ // TODO(ghodrat): Modify these states when we have a permanent solution for
+ // persistent buffer.
+ output_state_ = AddOutput(TensorType_FLOAT32);
+ cell_state_ = AddOutput(TensorType_FLOAT32);
+ output_ = AddOutput(TensorType_FLOAT32);
+
+ SetBuiltinOp(BuiltinOperator_LSTM, BuiltinOptions_LSTMOptions,
+ CreateLSTMOptions(builder_, ActivationFunctionType_TANH,
+ cell_clip, proj_clip)
+ .Union());
+ BuildInterpreter(input_shapes);
+ }
+
+ void SetInputToInputWeights(std::initializer_list<float> f) {
+ PopulateTensor(input_to_input_weights_, f);
+ }
+
+ void SetInputToForgetWeights(std::initializer_list<float> f) {
+ PopulateTensor(input_to_forget_weights_, f);
+ }
+
+ void SetInputToCellWeights(std::initializer_list<float> f) {
+ PopulateTensor(input_to_cell_weights_, f);
+ }
+
+ void SetInputToOutputWeights(std::initializer_list<float> f) {
+ PopulateTensor(input_to_output_weights_, f);
+ }
+
+ void SetRecurrentToInputWeights(std::initializer_list<float> f) {
+ PopulateTensor(recurrent_to_input_weights_, f);
+ }
+
+ void SetRecurrentToForgetWeights(std::initializer_list<float> f) {
+ PopulateTensor(recurrent_to_forget_weights_, f);
+ }
+
+ void SetRecurrentToCellWeights(std::initializer_list<float> f) {
+ PopulateTensor(recurrent_to_cell_weights_, f);
+ }
+
+ void SetRecurrentToOutputWeights(std::initializer_list<float> f) {
+ PopulateTensor(recurrent_to_output_weights_, f);
+ }
+
+ void SetCellToInputWeights(std::initializer_list<float> f) {
+ PopulateTensor(cell_to_input_weights_, f);
+ }
+
+ void SetCellToForgetWeights(std::initializer_list<float> f) {
+ PopulateTensor(cell_to_forget_weights_, f);
+ }
+
+ void SetCellToOutputWeights(std::initializer_list<float> f) {
+ PopulateTensor(cell_to_output_weights_, f);
+ }
+
+ void SetInputGateBias(std::initializer_list<float> f) {
+ PopulateTensor(input_gate_bias_, f);
+ }
+
+ void SetForgetGateBias(std::initializer_list<float> f) {
+ PopulateTensor(forget_gate_bias_, f);
+ }
+
+ void SetCellBias(std::initializer_list<float> f) {
+ PopulateTensor(cell_bias_, f);
+ }
+
+ void SetOutputGateBias(std::initializer_list<float> f) {
+ PopulateTensor(output_gate_bias_, f);
+ }
+
+ void SetProjectionWeights(std::initializer_list<float> f) {
+ PopulateTensor(projection_weights_, f);
+ }
+
+ void SetProjectionBias(std::initializer_list<float> f) {
+ PopulateTensor(projection_bias_, f);
+ }
+
+ void ResetOutputState() {
+ const int zero_buffer_size = n_cell_ * n_batch_;
+ std::unique_ptr<float[]> zero_buffer(new float[zero_buffer_size]);
+ memset(zero_buffer.get(), 0, zero_buffer_size * sizeof(float));
+ PopulateTensor(output_state_, 0, zero_buffer.get(),
+ zero_buffer.get() + zero_buffer_size);
+ }
+
+ void ResetCellState() {
+ const int zero_buffer_size = n_cell_ * n_batch_;
+ std::unique_ptr<float[]> zero_buffer(new float[zero_buffer_size]);
+ memset(zero_buffer.get(), 0, zero_buffer_size * sizeof(float));
+ PopulateTensor(cell_state_, 0, zero_buffer.get(),
+ zero_buffer.get() + zero_buffer_size);
+ }
+
+ void SetInput(int offset, float* begin, float* end) {
+ PopulateTensor(input_, offset, begin, end);
+ }
+
+ std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+ void Verify() {
+ auto model = tflite::UnPackModel(builder_.GetBufferPointer());
+ EXPECT_NE(model, nullptr);
+ }
+
+ int num_inputs() { return n_input_; }
+ int num_outputs() { return n_output_; }
+ int num_cells() { return n_cell_; }
+ int num_batches() { return n_batch_; }
+
+ private:
+ int input_;
+ int input_to_input_weights_;
+ int input_to_forget_weights_;
+ int input_to_cell_weights_;
+ int input_to_output_weights_;
+
+ int recurrent_to_input_weights_;
+ int recurrent_to_forget_weights_;
+ int recurrent_to_cell_weights_;
+ int recurrent_to_output_weights_;
+
+ int cell_to_input_weights_;
+ int cell_to_forget_weights_;
+ int cell_to_output_weights_;
+
+ int input_gate_bias_;
+ int forget_gate_bias_;
+ int cell_bias_;
+ int output_gate_bias_;
+
+ int projection_weights_;
+ int projection_bias_;
+
+ int output_;
+ int output_state_;
+ int cell_state_;
+ int scratch_buffer_;
+
+ int n_batch_;
+ int n_input_;
+ int n_cell_;
+ int n_output_;
+};
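
For context: AddNullInput() in the model above records an omitted input as tensor index -1 in the serialized graph, so a kernel must check for that sentinel before dereferencing the tensor. A minimal sketch of such a check, assuming the usual kOptionalTensor == -1 convention (the helper name is ours, not necessarily what kernel_util.h provides):

    #include "tensorflow/contrib/lite/context.h"

    // Returns the tensor for an input slot, or nullptr if the model omitted it.
    const TfLiteTensor* GetOptionalInput(const TfLiteContext* context,
                                         const TfLiteNode* node, int index) {
      const int tensor_index = node->inputs->data[index];
      if (tensor_index < 0) return nullptr;  // optional input was not provided
      return &context->tensors[tensor_index];
    }
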
+
+TEST(LSTMOpTest, BlackBoxTestWithCifgWithPeepholeNoProjectionNoClipping) {
+ const int n_batch = 1;
+ const int n_input = 2;
+ // n_cell and n_output have the same size when there is no projection.
+ const int n_cell = 4;
+ const int n_output = 4;
+
+ LSTMOpModel lstm(n_batch, n_input, n_cell, n_output,
+ /*use_cifg=*/true, /*use_peephole=*/true,
+ /*use_projection_weights=*/false,
+ /*use_projection_bias=*/false,
+ /*cell_clip=*/0.0, /*proj_clip=*/0.0,
+ {
+ {n_batch, n_input}, // input tensor
+
+ {0, 0}, // input_to_input_weight tensor
+ {n_cell, n_input}, // input_to_forget_weight tensor
+ {n_cell, n_input}, // input_to_cell_weight tensor
+ {n_cell, n_input}, // input_to_output_weight tensor
+
+ {0, 0}, // recurrent_to_input_weight tensor
+ {n_cell, n_output}, // recurrent_to_forget_weight tensor
+ {n_cell, n_output}, // recurrent_to_cell_weight tensor
+ {n_cell, n_output}, // recurrent_to_output_weight tensor
+
+ {0}, // cell_to_input_weight tensor
+ {n_cell}, // cell_to_forget_weight tensor
+ {n_cell}, // cell_to_output_weight tensor
+
+ {0}, // input_gate_bias tensor
+ {n_cell}, // forget_gate_bias tensor
+ {n_cell}, // cell_bias tensor
+ {n_cell}, // output_gate_bias tensor
+
+ {0, 0}, // projection_weight tensor
+ {0}, // projection_bias tensor
+ });
+
+ lstm.SetInputToCellWeights({-0.49770179, -0.27711356, -0.09624726, 0.05100781,
+ 0.04717243, 0.48944736, -0.38535351,
+ -0.17212132});
+
+ lstm.SetInputToForgetWeights({-0.55291498, -0.42866567, 0.13056988,
+ -0.3633365, -0.22755712, 0.28253698, 0.24407166,
+ 0.33826375});
+
+ lstm.SetInputToOutputWeights({0.10725588, -0.02335852, -0.55932593,
+ -0.09426838, -0.44257352, 0.54939759,
+ 0.01533556, 0.42751634});
+
+ lstm.SetCellBias({0., 0., 0., 0.});
+
+ lstm.SetForgetGateBias({1., 1., 1., 1.});
+
+ lstm.SetOutputGateBias({0., 0., 0., 0.});
+
+ lstm.SetRecurrentToCellWeights(
+ {0.54066205, -0.32668582, -0.43562764, -0.56094903, 0.42957711,
+ 0.01841056, -0.32764608, -0.33027974, -0.10826075, 0.20675004,
+ 0.19069612, -0.03026325, -0.54532051, 0.33003211, 0.44901288,
+ 0.21193194});
+
+ lstm.SetRecurrentToForgetWeights(
+ {-0.13832897, -0.0515101, -0.2359007, -0.16661474, -0.14340827,
+ 0.36986142, 0.23414481, 0.55899, 0.10798943, -0.41174671, 0.17751795,
+ -0.34484994, -0.35874045, -0.11352962, 0.27268326, 0.54058349});
+
+ lstm.SetRecurrentToOutputWeights(
+ {0.41613156, 0.42610586, -0.16495961, -0.5663873, 0.30579174, -0.05115908,
+ -0.33941799, 0.23364776, 0.11178309, 0.09481031, -0.26424935, 0.46261835,
+ 0.50248802, 0.26114327, -0.43736315, 0.33149987});
+
+ lstm.SetCellToForgetWeights(
+ {0.47485286, -0.51955009, -0.24458408, 0.31544167});
+ lstm.SetCellToOutputWeights(
+ {-0.17135078, 0.82760304, 0.85573703, -0.77109635});
+
+ // Resetting cell_state and output_state
+ lstm.ResetCellState();
+ lstm.ResetOutputState();
+
+ // Verify the model by unpacking it.
+ lstm.Verify();
+}
+
+} // namespace
+} // namespace tflite
+
+int main(int argc, char** argv) {
+  // Send log output to stderr so failures are easier to diagnose.
+ tflite::LogToStderr();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/padding.h b/tensorflow/contrib/lite/kernels/padding.h
new file mode 100644
index 0000000000..3a60274524
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/padding.h
@@ -0,0 +1,28 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_PADDING_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_PADDING_H_
+
+namespace tflite {
+
+inline int ComputePadding(int stride, int in_size, int filter_size,
+ int out_size) {
+ int padding = ((out_size - 1) * stride + filter_size - in_size) / 2;
+ return padding > 0 ? padding : 0;
+}
+
+} // namespace tflite
+
+#endif // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_PADDING_H_
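
A worked example of the formula above (the out_size values follow the SAME/VALID output-size rules used by the conv and pooling kernels):

    #include <cassert>
    #include "tensorflow/contrib/lite/kernels/padding.h"

    int main() {
      // SAME padding: in_size = 5, stride = 2, filter = 3
      // -> out_size = ceil(5 / 2) = 3, padding = ((3 - 1) * 2 + 3 - 5) / 2 = 1.
      assert(tflite::ComputePadding(/*stride=*/2, /*in_size=*/5,
                                    /*filter_size=*/3, /*out_size=*/3) == 1);
      // VALID padding: out_size = (5 - 3 + 2) / 2 = 2,
      // padding = ((2 - 1) * 2 + 3 - 5) / 2 = 0, clamped at 0.
      assert(tflite::ComputePadding(2, 5, 3, 2) == 0);
      return 0;
    }
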
diff --git a/tensorflow/contrib/lite/kernels/pooling.cc b/tensorflow/contrib/lite/kernels/pooling.cc
new file mode 100644
index 0000000000..b798801108
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/pooling.cc
@@ -0,0 +1,355 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <unistd.h>
+#include <cassert>
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <iostream>
+#include <limits>
+
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
+#include "tensorflow/contrib/lite/kernels/padding.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace pooling {
+
+// This file has two implementations of each pooling op: reference and optimized.
+enum KernelType {
+ kReference,
+ kGenericOptimized,
+};
+
+enum PoolType {
+ kAverage,
+ kMax,
+ kL2,
+};
+
+struct OpData {
+ TfLitePaddingValues padding;
+};
+
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+ // This is a builtin op, so we don't use the contents in 'buffer', if any.
+ // Instead, we allocate a new object to carry information from Prepare() to
+ // Eval().
+ return new OpData;
+}
+
+void Free(TfLiteContext* context, void* buffer) {
+ delete reinterpret_cast<OpData*>(buffer);
+}
+
+template <PoolType pool_type>
+TfLiteStatus GenericPrepare(TfLiteContext* context, TfLiteNode* node) {
+ auto* params = reinterpret_cast<TfLitePoolParams*>(node->builtin_data);
+ OpData* data = reinterpret_cast<OpData*>(node->user_data);
+
+ TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
+ TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+ TfLiteTensor* output = GetOutput(context, node, 0);
+ TfLiteTensor* input = GetInput(context, node, 0);
+ TF_LITE_ENSURE_EQ(context, NumDimensions(input), 4);
+ TF_LITE_ENSURE_EQ(context, input->type, output->type);
+
+ int batches = input->dims->data[0];
+ int height = input->dims->data[1];
+ int width = input->dims->data[2];
+ int channels_out = input->dims->data[3];
+
+ // Matching GetWindowedOutputSize in TensorFlow.
+ auto padding = params->padding;
+ auto computeOutSize = [padding](int imageSize, int filterSize,
+ int stride) -> int {
+ return padding == kTfLitePaddingSame
+ ? (imageSize + stride - 1) / stride
+ : padding == kTfLitePaddingValid
+ ? (imageSize - filterSize + stride) / stride
+ : 0;
+ };
+
+ int outWidth =
+ computeOutSize(width, params->filter_width, params->stride_width);
+ int outHeight =
+ computeOutSize(height, params->filter_height, params->stride_height);
+
+ data->padding.height = ComputePadding(params->stride_height, height,
+ params->filter_height, outHeight);
+ data->padding.width = ComputePadding(params->stride_width, width,
+ params->filter_width, outWidth);
+
+ if (input->type == kTfLiteUInt8) {
+ if (pool_type == kAverage || pool_type == kMax) {
+ TF_LITE_ENSURE_EQ(context, input->params.scale, output->params.scale);
+ TF_LITE_ENSURE_EQ(context, input->params.zero_point,
+ output->params.zero_point);
+ }
+ if (pool_type == kL2) {
+ // We currently don't have a quantized implementation of L2Pool
+ TF_LITE_ENSURE_EQ(context, input->type, kTfLiteFloat32);
+ }
+ }
+
+ TfLiteIntArray* outputSize = TfLiteIntArrayCreate(4);
+ outputSize->data[0] = batches;
+ outputSize->data[1] = outHeight;
+ outputSize->data[2] = outWidth;
+ outputSize->data[3] = channels_out;
+ return context->ResizeTensor(context, output, outputSize);
+}
+
+template <KernelType kernel_type>
+void AverageEvalFloat(TfLiteContext* context, TfLiteNode* node,
+ TfLitePoolParams* params, OpData* data,
+ TfLiteTensor* input, TfLiteTensor* output) {
+ float activation_min, activation_max;
+ CalculateActivationRangeFloat(params->activation, &activation_min,
+ &activation_max);
+#define TF_LITE_AVERAGE_POOL(type) \
+ type::AveragePool( \
+ GetTensorData<float>(input), GetTensorDims(input), params->stride_width, \
+ params->stride_height, data->padding.width, data->padding.height, \
+ params->filter_width, params->filter_height, activation_min, \
+ activation_max, GetTensorData<float>(output), GetTensorDims(output))
+ if (kernel_type == kReference) {
+ TF_LITE_AVERAGE_POOL(reference_ops);
+ } else {
+ TF_LITE_AVERAGE_POOL(optimized_ops);
+ }
+#undef TF_LITE_AVERAGE_POOL
+}
+
+template <KernelType kernel_type>
+void AverageEvalQuantized(TfLiteContext* context, TfLiteNode* node,
+ TfLitePoolParams* params, OpData* data,
+ TfLiteTensor* input, TfLiteTensor* output) {
+ int32_t activation_min;
+ int32_t activation_max;
+ CalculateActivationRangeUint8(params->activation, output, &activation_min,
+ &activation_max);
+#define TF_LITE_AVERAGE_POOL(type) \
+ type::AveragePool(GetTensorData<uint8_t>(input), GetTensorDims(input), \
+ params->stride_width, params->stride_height, \
+ data->padding.width, data->padding.height, \
+ params->filter_width, params->filter_height, \
+ activation_min, activation_max, \
+ GetTensorData<uint8_t>(output), GetTensorDims(output))
+ if (kernel_type == kReference) {
+ TF_LITE_AVERAGE_POOL(reference_ops);
+ } else {
+ TF_LITE_AVERAGE_POOL(optimized_ops);
+ }
+#undef TF_LITE_AVERAGE_POOL
+}
+
+template <KernelType kernel_type>
+void MaxEvalFloat(TfLiteContext* context, TfLiteNode* node,
+ TfLitePoolParams* params, OpData* data, TfLiteTensor* input,
+ TfLiteTensor* output) {
+ float activation_min, activation_max;
+ CalculateActivationRangeFloat(params->activation, &activation_min,
+ &activation_max);
+#define TF_LITE_MAX_POOL(type) \
+ type::MaxPool( \
+ GetTensorData<float>(input), GetTensorDims(input), params->stride_width, \
+ params->stride_height, data->padding.width, data->padding.height, \
+ params->filter_width, params->filter_height, activation_min, \
+ activation_max, GetTensorData<float>(output), GetTensorDims(output))
+ if (kernel_type == kReference) {
+ TF_LITE_MAX_POOL(reference_ops);
+ } else {
+ TF_LITE_MAX_POOL(optimized_ops);
+ }
+#undef TF_LITE_MAX_POOL
+}
+
+template <KernelType kernel_type>
+void MaxEvalQuantized(TfLiteContext* context, TfLiteNode* node,
+ TfLitePoolParams* params, OpData* data,
+ TfLiteTensor* input, TfLiteTensor* output) {
+ int32_t activation_min;
+ int32_t activation_max;
+ CalculateActivationRangeUint8(params->activation, output, &activation_min,
+ &activation_max);
+#define TF_LITE_MAX_POOL(type) \
+ type::MaxPool(GetTensorData<uint8_t>(input), GetTensorDims(input), \
+ params->stride_width, params->stride_height, \
+ data->padding.width, data->padding.height, \
+ params->filter_width, params->filter_height, activation_min, \
+ activation_max, GetTensorData<uint8_t>(output), \
+ GetTensorDims(output))
+ if (kernel_type == kReference) {
+ TF_LITE_MAX_POOL(reference_ops);
+ } else {
+ TF_LITE_MAX_POOL(optimized_ops);
+ }
+#undef TF_LITE_MAX_POOL
+}
+
+template <KernelType kernel_type>
+void L2EvalFloat(TfLiteContext* context, TfLiteNode* node,
+ TfLitePoolParams* params, OpData* data, TfLiteTensor* input,
+ TfLiteTensor* output) {
+ float activation_min, activation_max;
+ CalculateActivationRangeFloat(params->activation, &activation_min,
+ &activation_max);
+#define TF_LITE_L2_POOL(type) \
+ type::L2Pool( \
+ GetTensorData<float>(input), GetTensorDims(input), params->stride_width, \
+ params->stride_height, data->padding.width, data->padding.height, \
+ params->filter_width, params->filter_height, activation_min, \
+ activation_max, GetTensorData<float>(output), GetTensorDims(output))
+ if (kernel_type == kReference) {
+ TF_LITE_L2_POOL(reference_ops);
+ } else {
+ TF_LITE_L2_POOL(optimized_ops);
+ }
+#undef TF_LITE_L2_POOL
+}
+
+template <KernelType kernel_type>
+TfLiteStatus AverageEval(TfLiteContext* context, TfLiteNode* node) {
+ auto* params = reinterpret_cast<TfLitePoolParams*>(node->builtin_data);
+ OpData* data = reinterpret_cast<OpData*>(node->user_data);
+
+ TfLiteTensor* output = GetOutput(context, node, 0);
+ TfLiteTensor* input = GetInput(context, node, 0);
+ switch (input->type) { // Already know in/out types are same.
+ case kTfLiteFloat32:
+ AverageEvalFloat<kernel_type>(context, node, params, data, input, output);
+ break;
+ case kTfLiteUInt8:
+ AverageEvalQuantized<kernel_type>(context, node, params, data, input,
+ output);
+ break;
+ default:
+ context->ReportError(context, "Type not currently supported.");
+ return kTfLiteError;
+ }
+ return kTfLiteOk;
+}
+
+template <KernelType kernel_type>
+TfLiteStatus MaxEval(TfLiteContext* context, TfLiteNode* node) {
+ auto* params = reinterpret_cast<TfLitePoolParams*>(node->builtin_data);
+ OpData* data = reinterpret_cast<OpData*>(node->user_data);
+
+ TfLiteTensor* output = GetOutput(context, node, 0);
+ TfLiteTensor* input = GetInput(context, node, 0);
+ switch (input->type) { // Already know in/out types are same.
+ case kTfLiteFloat32:
+ MaxEvalFloat<kernel_type>(context, node, params, data, input, output);
+ break;
+ case kTfLiteUInt8:
+ MaxEvalQuantized<kernel_type>(context, node, params, data, input, output);
+ break;
+ default:
+ context->ReportError(context, "Type not currently supported.");
+ return kTfLiteError;
+ }
+ return kTfLiteOk;
+}
+
+template <KernelType kernel_type>
+TfLiteStatus L2Eval(TfLiteContext* context, TfLiteNode* node) {
+ auto* params = reinterpret_cast<TfLitePoolParams*>(node->builtin_data);
+ OpData* data = reinterpret_cast<OpData*>(node->user_data);
+
+ TfLiteTensor* output = GetOutput(context, node, 0);
+ TfLiteTensor* input = GetInput(context, node, 0);
+ switch (input->type) { // Already know in/out types are same.
+ case kTfLiteFloat32:
+ L2EvalFloat<kernel_type>(context, node, params, data, input, output);
+ break;
+ case kTfLiteUInt8:
+ // We don't have a quantized implementation, so just fall through to the
+ // 'default' case.
+ default:
+ context->ReportError(context, "Type not currently supported.");
+ return kTfLiteError;
+ }
+ return kTfLiteOk;
+}
+
+} // namespace pooling
+
+TfLiteRegistration* Register_AVERAGE_POOL_REF() {
+ static TfLiteRegistration r = {pooling::Init, pooling::Free,
+ pooling::GenericPrepare<pooling::kAverage>,
+ pooling::AverageEval<pooling::kReference>};
+ return &r;
+}
+
+TfLiteRegistration* Register_MAX_POOL_REF() {
+ static TfLiteRegistration r = {pooling::Init, pooling::Free,
+ pooling::GenericPrepare<pooling::kMax>,
+ pooling::MaxEval<pooling::kReference>};
+ return &r;
+}
+
+TfLiteRegistration* Register_L2_POOL_REF() {
+ static TfLiteRegistration r = {pooling::Init, pooling::Free,
+ pooling::GenericPrepare<pooling::kL2>,
+ pooling::L2Eval<pooling::kReference>};
+ return &r;
+}
+
+TfLiteRegistration* Register_AVERAGE_POOL_GENERIC_OPT() {
+ static TfLiteRegistration r = {
+ pooling::Init, pooling::Free, pooling::GenericPrepare<pooling::kAverage>,
+ pooling::AverageEval<pooling::kGenericOptimized>};
+ return &r;
+}
+
+TfLiteRegistration* Register_MAX_POOL_GENERIC_OPT() {
+ static TfLiteRegistration r = {pooling::Init, pooling::Free,
+ pooling::GenericPrepare<pooling::kMax>,
+ pooling::MaxEval<pooling::kGenericOptimized>};
+ return &r;
+}
+
+TfLiteRegistration* Register_L2_POOL_GENERIC_OPT() {
+ static TfLiteRegistration r = {pooling::Init, pooling::Free,
+ pooling::GenericPrepare<pooling::kL2>,
+ pooling::L2Eval<pooling::kGenericOptimized>};
+ return &r;
+}
+
+TfLiteRegistration* Register_AVERAGE_POOL_2D() {
+ return Register_AVERAGE_POOL_GENERIC_OPT();
+}
+
+TfLiteRegistration* Register_MAX_POOL_2D() {
+ return Register_MAX_POOL_GENERIC_OPT();
+}
+
+TfLiteRegistration* Register_L2_POOL_2D() {
+ return Register_L2_POOL_GENERIC_OPT();
+}
+
+} // namespace builtin
+} // namespace ops
+} // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/pooling_test.cc b/tensorflow/contrib/lite/kernels/pooling_test.cc
new file mode 100644
index 0000000000..e1b51ec7d5
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/pooling_test.cc
@@ -0,0 +1,161 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <cstdarg>
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+class BasePoolingOpModel : public SingleOpModel {
+ public:
+ // TODO(ahentz): Also test different activation types, bias, padding types,
+ // stride values.
+ BasePoolingOpModel(BuiltinOperator type, const TensorData& input,
+ int filter_width, int filter_height,
+ const TensorData& output) {
+ input_ = AddInput(input);
+ output_ = AddOutput(output);
+
+ SetBuiltinOp(
+ type, BuiltinOptions_Pool2DOptions,
+ CreatePool2DOptions(builder_, Padding_VALID, 2, 2, filter_width,
+ filter_height, ActivationFunctionType_NONE)
+ .Union());
+
+ BuildInterpreter({GetShape(input_)});
+ }
+
+ protected:
+ int input_;
+ int output_;
+};
+
+class FloatPoolingOpModel : public BasePoolingOpModel {
+ public:
+ using BasePoolingOpModel::BasePoolingOpModel;
+
+ void SetInput(std::initializer_list<float> data) {
+ PopulateTensor(input_, data);
+ }
+
+ std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+};
+
+class QuantizedPoolingOpModel : public BasePoolingOpModel {
+ public:
+ using BasePoolingOpModel::BasePoolingOpModel;
+
+ void SetInput(std::initializer_list<float> data) {
+ QuantizeAndPopulate<uint8_t>(input_, data);
+ }
+
+ std::vector<uint8_t> GetOutput() { return ExtractVector<uint8_t>(output_); }
+ std::vector<float> GetDequantizedOutput() {
+ return Dequantize<uint8_t>(ExtractVector<uint8_t>(output_),
+ GetScale(output_), GetZeroPoint(output_));
+ }
+};
+
+TEST(FloatPoolingOpTest, AveragePool) {
+ FloatPoolingOpModel m(BuiltinOperator_AVERAGE_POOL_2D,
+ /*input=*/{TensorType_FLOAT32, {1, 2, 4, 1}},
+ /*filter_width=*/2, /*filter_height=*/2,
+ /*output=*/{TensorType_FLOAT32, {}});
+ m.SetInput({
+ 0, 6, 2, 4, //
+ 3, 2, 10, 7, //
+ });
+ m.Invoke();
+ EXPECT_THAT(m.GetOutput(), ElementsAreArray({2.75, 5.75}));
+}
+
+TEST(QuantizedPoolingOpTest, AveragePool) {
+  // Choose the input range [0, 15.9375] so the scale is exactly 15.9375/255 =
+  // 1/16; the expected averages (2.75 and 5.75) are then exactly representable
+  // and the dequantized output matches the results of the float model above.
+ QuantizedPoolingOpModel m(
+ BuiltinOperator_AVERAGE_POOL_2D,
+ /*input=*/{TensorType_UINT8, {1, 2, 4, 1}, 0, 15.9375},
+ /*filter_width=*/2, /*filter_height=*/2,
+ /*output=*/{TensorType_UINT8, {}, 0, 15.9375});
+ m.SetInput({
+ 0, 6, 2, 4, //
+ 3, 2, 10, 7, //
+ });
+ m.Invoke();
+
+ EXPECT_THAT(m.GetDequantizedOutput(),
+ ElementsAreArray(ArrayFloatNear({2.75, 5.75})));
+ EXPECT_THAT(m.GetOutput(), ElementsAreArray({44, 92}));
+}
+
+TEST(FloatPoolingOpTest, MaxPool) {
+ FloatPoolingOpModel m(BuiltinOperator_MAX_POOL_2D,
+ /*input=*/{TensorType_FLOAT32, {1, 2, 4, 1}},
+ /*filter_width=*/2, /*filter_height=*/2,
+ /*output=*/{TensorType_FLOAT32, {}});
+ m.SetInput({
+ 0, 6, 2, 4, //
+ 3, 2, 10, 7, //
+ });
+ m.Invoke();
+ EXPECT_THAT(m.GetOutput(), ElementsAreArray({6, 10}));
+}
+
+TEST(QuantizedPoolingOpTest, MaxPool) {
+ // Choose the input ranges carefully so that the dequantized output matches
+ // the results of the float model above.
+ QuantizedPoolingOpModel m(
+ BuiltinOperator_MAX_POOL_2D,
+ /*input=*/{TensorType_UINT8, {1, 2, 4, 1}, 0, 15.9375},
+ /*filter_width=*/2, /*filter_height=*/2,
+ /*output=*/{TensorType_UINT8, {}, 0, 15.9375});
+ m.SetInput({
+ 0, 6, 2, 4, //
+ 3, 2, 10, 7, //
+ });
+ m.Invoke();
+
+ EXPECT_THAT(m.GetDequantizedOutput(),
+ ElementsAreArray(ArrayFloatNear({6, 10})));
+ EXPECT_THAT(m.GetOutput(), ElementsAreArray({96, 160}));
+}
+
+TEST(FloatPoolingOpTest, L2Pool) {
+ FloatPoolingOpModel m(BuiltinOperator_L2_POOL_2D,
+ /*input=*/{TensorType_FLOAT32, {1, 2, 4, 1}},
+ /*filter_width=*/2, /*filter_height=*/2,
+ /*output=*/{TensorType_FLOAT32, {}});
+ m.SetInput({
+ 0, 6, 2, 4, //
+ 3, 2, 10, 7, //
+ });
+ m.Invoke();
+ EXPECT_THAT(m.GetOutput(), ElementsAreArray({3.5, 6.5}));
+}
+
+} // namespace
+} // namespace tflite
+
+int main(int argc, char** argv) {
+ // On Linux, add: tflite::LogToStderr();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/register.cc b/tensorflow/contrib/lite/kernels/register.cc
new file mode 100644
index 0000000000..ca7a0dd194
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/register.cc
@@ -0,0 +1,109 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/lite/kernels/register.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+
+TfLiteRegistration* Register_RELU();
+TfLiteRegistration* Register_RELU1();
+TfLiteRegistration* Register_RELU6();
+TfLiteRegistration* Register_TANH();
+TfLiteRegistration* Register_LOGISTIC();
+TfLiteRegistration* Register_AVERAGE_POOL_2D();
+TfLiteRegistration* Register_MAX_POOL_2D();
+TfLiteRegistration* Register_L2_POOL_2D();
+TfLiteRegistration* Register_CONV_2D();
+TfLiteRegistration* Register_DEPTHWISE_CONV_2D();
+TfLiteRegistration* Register_SVDF();
+TfLiteRegistration* Register_RNN();
+TfLiteRegistration* Register_EMBEDDING_LOOKUP();
+TfLiteRegistration* Register_EMBEDDING_LOOKUP_SPARSE();
+TfLiteRegistration* Register_FULLY_CONNECTED();
+TfLiteRegistration* Register_LSH_PROJECTION();
+TfLiteRegistration* Register_HASHTABLE_LOOKUP();
+TfLiteRegistration* Register_SOFTMAX();
+TfLiteRegistration* Register_CONCATENATION();
+TfLiteRegistration* Register_ADD();
+TfLiteRegistration* Register_MUL();
+TfLiteRegistration* Register_L2_NORMALIZATION();
+TfLiteRegistration* Register_LOCAL_RESPONSE_NORMALIZATION();
+TfLiteRegistration* Register_LSTM();
+TfLiteRegistration* Register_RESHAPE();
+TfLiteRegistration* Register_RESIZE_BILINEAR();
+TfLiteRegistration* Register_SKIP_GRAM();
+TfLiteRegistration* Register_SPACE_TO_DEPTH();
+
+BuiltinOpResolver::BuiltinOpResolver() {
+ AddBuiltin(BuiltinOperator_RELU, Register_RELU());
+ AddBuiltin(BuiltinOperator_RELU1, Register_RELU1());
+ AddBuiltin(BuiltinOperator_RELU6, Register_RELU6());
+ AddBuiltin(BuiltinOperator_TANH, Register_TANH());
+ AddBuiltin(BuiltinOperator_LOGISTIC, Register_LOGISTIC());
+ AddBuiltin(BuiltinOperator_AVERAGE_POOL_2D, Register_AVERAGE_POOL_2D());
+ AddBuiltin(BuiltinOperator_MAX_POOL_2D, Register_MAX_POOL_2D());
+ AddBuiltin(BuiltinOperator_L2_POOL_2D, Register_L2_POOL_2D());
+ AddBuiltin(BuiltinOperator_CONV_2D, Register_CONV_2D());
+ AddBuiltin(BuiltinOperator_DEPTHWISE_CONV_2D, Register_DEPTHWISE_CONV_2D());
+ AddBuiltin(BuiltinOperator_SVDF, Register_SVDF());
+ AddBuiltin(BuiltinOperator_RNN, Register_RNN());
+ AddBuiltin(BuiltinOperator_EMBEDDING_LOOKUP, Register_EMBEDDING_LOOKUP());
+ AddBuiltin(BuiltinOperator_EMBEDDING_LOOKUP_SPARSE,
+ Register_EMBEDDING_LOOKUP_SPARSE());
+ AddBuiltin(BuiltinOperator_FULLY_CONNECTED, Register_FULLY_CONNECTED());
+ AddBuiltin(BuiltinOperator_LSH_PROJECTION, Register_LSH_PROJECTION());
+ AddBuiltin(BuiltinOperator_HASHTABLE_LOOKUP, Register_HASHTABLE_LOOKUP());
+ AddBuiltin(BuiltinOperator_SOFTMAX, Register_SOFTMAX());
+ AddBuiltin(BuiltinOperator_CONCATENATION, Register_CONCATENATION());
+ AddBuiltin(BuiltinOperator_ADD, Register_ADD());
+ AddBuiltin(BuiltinOperator_MUL, Register_MUL());
+ AddBuiltin(BuiltinOperator_L2_NORMALIZATION, Register_L2_NORMALIZATION());
+ AddBuiltin(BuiltinOperator_LOCAL_RESPONSE_NORMALIZATION,
+ Register_LOCAL_RESPONSE_NORMALIZATION());
+ AddBuiltin(BuiltinOperator_LSTM, Register_LSTM());
+ AddBuiltin(BuiltinOperator_RESHAPE, Register_RESHAPE());
+ AddBuiltin(BuiltinOperator_RESIZE_BILINEAR, Register_RESIZE_BILINEAR());
+ AddBuiltin(BuiltinOperator_SKIP_GRAM, Register_SKIP_GRAM());
+ AddBuiltin(BuiltinOperator_SPACE_TO_DEPTH, Register_SPACE_TO_DEPTH());
+}
+
+TfLiteRegistration* BuiltinOpResolver::FindOp(
+ tflite::BuiltinOperator op) const {
+ auto it = builtins_.find(op);
+ return it != builtins_.end() ? it->second : nullptr;
+}
+
+TfLiteRegistration* BuiltinOpResolver::FindOp(const char* op) const {
+ auto it = custom_ops_.find(op);
+ return it != custom_ops_.end() ? it->second : nullptr;
+}
+
+void BuiltinOpResolver::AddBuiltin(tflite::BuiltinOperator op,
+ TfLiteRegistration* registration) {
+ registration->builtin_code = op;
+ builtins_.insert(std::make_pair(op, registration));
+}
+
+void BuiltinOpResolver::AddCustom(const char* name,
+ TfLiteRegistration* registration) {
+ registration->builtin_code = BuiltinOperator_CUSTOM;
+ custom_ops_.insert(std::make_pair(std::string(name), registration));
+}
+
+} // namespace builtin
+} // namespace ops
+} // namespace tflite
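
A minimal usage sketch: the resolver is handed to the interpreter builder, and AddCustom lets an application expose its own kernels by name. The custom op name and registration function below are hypothetical, and this is only a sketch of the intended wiring.

    #include <memory>

    #include "tensorflow/contrib/lite/interpreter.h"
    #include "tensorflow/contrib/lite/kernels/register.h"
    #include "tensorflow/contrib/lite/model.h"

    std::unique_ptr<tflite::Interpreter> BuildInterpreter(
        const tflite::FlatBufferModel& model,
        TfLiteRegistration* (*register_my_op)()) {
      tflite::ops::builtin::BuiltinOpResolver resolver;
      // Hypothetical custom op; the model would reference it by this name.
      resolver.AddCustom("MyCustomOp", register_my_op());
      std::unique_ptr<tflite::Interpreter> interpreter;
      tflite::InterpreterBuilder(model, resolver)(&interpreter);
      return interpreter;
    }
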
diff --git a/tensorflow/contrib/lite/kernels/register.h b/tensorflow/contrib/lite/kernels/register.h
new file mode 100644
index 0000000000..28f5e0fcc8
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/register.h
@@ -0,0 +1,50 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_REGISTER_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_REGISTER_H_
+
+#include <unordered_map>
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+
+class BuiltinOpResolver : public OpResolver {
+ public:
+ BuiltinOpResolver();
+ TfLiteRegistration* FindOp(tflite::BuiltinOperator op) const override;
+ TfLiteRegistration* FindOp(const char* op) const override;
+ void AddBuiltin(tflite::BuiltinOperator op, TfLiteRegistration* registration);
+ void AddCustom(const char* name, TfLiteRegistration* registration);
+
+ private:
+ struct BuiltinOperatorHasher {
+ size_t operator()(const tflite::BuiltinOperator& x) const {
+ return std::hash<size_t>()(static_cast<size_t>(x));
+ }
+ };
+ std::unordered_map<tflite::BuiltinOperator, TfLiteRegistration*,
+ BuiltinOperatorHasher>
+ builtins_;
+ std::unordered_map<std::string, TfLiteRegistration*> custom_ops_;
+};
+
+} // namespace builtin
+} // namespace ops
+} // namespace tflite
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_REGISTER_H_
diff --git a/tensorflow/contrib/lite/kernels/reshape.cc b/tensorflow/contrib/lite/kernels/reshape.cc
new file mode 100644
index 0000000000..f3e6ddc9f4
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/reshape.cc
@@ -0,0 +1,91 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <string.h>
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace reshape {
+
+constexpr int kInputTensor = 0;
+constexpr int kOutputTensor = 0;
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+ auto* params = reinterpret_cast<TfLiteReshapeParams*>(node->builtin_data);
+
+  // TODO(ahentz): we are often given a second tensor holding the shape, but we
+  // only pay attention to the shape specified in 'params'.
+ TF_LITE_ENSURE(context, NumInputs(node) == 1 || NumInputs(node) == 2);
+ TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+ TfLiteTensor* input = GetInput(context, node, kInputTensor);
+ TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  // TensorFlow's Reshape allows one of the shape components to have the
+  // special -1 value, meaning it will be calculated automatically based on the
+  // input. Here we calculate what that dimension should be so that the number
+  // of output elements is the same as the number of input elements.
+ int num_input_elements = 1;
+ for (int i = 0; i < NumDimensions(input); ++i) {
+ num_input_elements *= SizeOfDimension(input, i);
+ }
+
+ TfLiteIntArray* output_size = TfLiteIntArrayCreate(params->num_dimensions);
+ int num_output_elements = 1;
+  int stretch_dim = -1;
+ for (int i = 0; i < params->num_dimensions; ++i) {
+ int value = params->shape[i];
+ if (value == -1) {
+      TF_LITE_ENSURE_EQ(context, stretch_dim, -1);
+      stretch_dim = i;
+ } else {
+ num_output_elements *= value;
+ output_size->data[i] = value;
+ }
+ }
+  if (stretch_dim != -1) {
+    output_size->data[stretch_dim] = num_input_elements / num_output_elements;
+    num_output_elements *= output_size->data[stretch_dim];
+ }
+
+ TF_LITE_ENSURE_EQ(context, num_input_elements, num_output_elements);
+ return context->ResizeTensor(context, output, output_size);
+}
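
As a concrete example of the stretch-dimension handling above: with a 1x2x4x1 input (8 elements) and params->shape = {2, 1, -1}, the loop accumulates num_output_elements = 2, marks index 2 as the stretch dimension, and then resolves it to 8 / 2 = 4, producing an output shape of {2, 1, 4}; this is exactly what the WithStretchDimension test in reshape_test.cc below expects.
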
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+ TfLiteTensor* input = GetInput(context, node, kInputTensor);
+ TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+ memcpy(output->data.raw, input->data.raw, input->bytes);
+
+ return kTfLiteOk;
+}
+
+} // namespace reshape
+
+TfLiteRegistration* Register_RESHAPE() {
+ static TfLiteRegistration r = {nullptr, nullptr, reshape::Prepare,
+ reshape::Eval};
+ return &r;
+}
+
+} // namespace builtin
+} // namespace ops
+} // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/reshape_test.cc b/tensorflow/contrib/lite/kernels/reshape_test.cc
new file mode 100644
index 0000000000..59ce7d5648
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/reshape_test.cc
@@ -0,0 +1,90 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+class ReshapeOpModel : public SingleOpModel {
+ public:
+ ReshapeOpModel(std::initializer_list<int> input_shape,
+ std::initializer_list<int> new_shape) {
+ input_ = AddInput(TensorType_FLOAT32);
+ output_ = AddOutput(TensorType_FLOAT32);
+ SetBuiltinOp(
+ BuiltinOperator_RESHAPE, BuiltinOptions_ReshapeOptions,
+ CreateReshapeOptions(builder_, builder_.CreateVector<int>(new_shape))
+ .Union());
+ BuildInterpreter({input_shape});
+ }
+
+ void SetInput(std::initializer_list<float> data) {
+ PopulateTensor<float>(input_, data);
+ }
+ std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+ std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
+
+ private:
+ int input_;
+ int output_;
+};
+
+TEST(ReshapeOpTest, MismatchedDimensions) {
+ EXPECT_DEATH(ReshapeOpModel({1, 2, 4, 1}, {2, 1}),
+ "num_input_elements != num_output_elements");
+}
+
+TEST(ReshapeOpTest, TooManyDimensions) {
+ EXPECT_DEATH(
+ ReshapeOpModel({1, 2, 3, 4, 5, 6, 7, 8, 9}, {1, 2, 3, 4, 5, 6, 7, 8, 9}),
+ "Found too many dimensions");
+}
+
+TEST(ReshapeOpTest, TooManySpecialDimensions) {
+  EXPECT_DEATH(ReshapeOpModel({1, 2, 4, 1}, {-1, -1, 2, 4}),
+               "stretch_dim != -1");
+}
+
+TEST(ReshapeOpTest, SimpleTest) {
+ ReshapeOpModel m({1, 2, 4, 1}, {2, 2, 2});
+ m.SetInput({1, 2, 3, 4, 5, 6, 7, 8});
+ m.Invoke();
+ EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 2, 3, 4, 5, 6, 7, 8}));
+ EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 2, 2}));
+}
+
+TEST(ReshapeOpTest, WithStretchDimension) {
+ ReshapeOpModel m({1, 2, 4, 1}, {2, 1, -1});
+ m.SetInput({1, 2, 3, 4, 5, 6, 7, 8});
+ m.Invoke();
+ EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 2, 3, 4, 5, 6, 7, 8}));
+ EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 1, 4}));
+}
+
+} // namespace
+} // namespace tflite
+
+int main(int argc, char** argv) {
+  // Send log output to stderr so failures are easier to diagnose.
+ tflite::LogToStderr();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/resize_bilinear.cc b/tensorflow/contrib/lite/kernels/resize_bilinear.cc
new file mode 100644
index 0000000000..1613c9a89f
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/resize_bilinear.cc
@@ -0,0 +1,129 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace resize_bilinear {
+
+// This file has three implementations of RESIZE_BILINEAR.
+enum KernelType {
+ kReference,
+ kGenericOptimized, // Neon-free
+ kNeonOptimized,
+};
+
+constexpr int kInputTensor = 0;
+constexpr int kOutputTensor = 0;
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+ auto* params =
+ reinterpret_cast<TfLiteResizeBilinearParams*>(node->builtin_data);
+
+ TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
+ TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+ TfLiteTensor* input = GetInput(context, node, kInputTensor);
+ TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+ // TODO(ahentz): Our current implementations rely on the inputs being 4D.
+ TF_LITE_ENSURE_EQ(context, NumDimensions(input), 4);
+
+ // TODO(ahentz): Our current implementations only support float32.
+ TF_LITE_ENSURE_EQ(context, output->type, kTfLiteFloat32);
+ TF_LITE_ENSURE_EQ(context, input->type, output->type);
+
+ TfLiteIntArray* output_size = TfLiteIntArrayCreate(4);
+ output_size->data[0] = input->dims->data[0];
+ output_size->data[1] = params->new_height;
+ output_size->data[2] = params->new_width;
+ output_size->data[3] = input->dims->data[3];
+
+ return context->ResizeTensor(context, output, output_size);
+}
+
+template <KernelType kernel_type>
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+ auto* params =
+ reinterpret_cast<TfLiteResizeBilinearParams*>(node->builtin_data);
+
+ TfLiteTensor* input = GetInput(context, node, kInputTensor);
+ TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+ // We have to fake a tensor here, to satisfy ResizeBilinear().
+ int32 output_size_data[2] = {params->new_height, params->new_width};
+
+ if (output->type == kTfLiteFloat32) {
+#define TF_LITE_RESIZE_BILINEAR(type) \
+ type::ResizeBilinear(GetTensorData<float>(input), GetTensorDims(input), \
+ output_size_data, GetTensorDims({1, 1, 1, 2}), \
+ GetTensorData<float>(output), GetTensorDims(output))
+
+ if (kernel_type == kReference) {
+ TF_LITE_RESIZE_BILINEAR(reference_ops);
+ }
+ if (kernel_type == kGenericOptimized || kernel_type == kNeonOptimized) {
+ TF_LITE_RESIZE_BILINEAR(optimized_ops);
+ }
+#undef TF_LITE_RESIZE_BILINEAR
+ } else {
+ context->ReportError(context, "Inputs and outputs not all float types.");
+ return kTfLiteError;
+ }
+
+ return kTfLiteOk;
+}
+
+} // namespace resize_bilinear
+
+TfLiteRegistration* Register_RESIZE_BILINEAR_REF() {
+ static TfLiteRegistration r = {
+ nullptr, nullptr, resize_bilinear::Prepare,
+ resize_bilinear::Eval<resize_bilinear::kReference>};
+ return &r;
+}
+
+TfLiteRegistration* Register_RESIZE_BILINEAR_GENERIC_OPT() {
+ static TfLiteRegistration r = {
+ nullptr, nullptr, resize_bilinear::Prepare,
+ resize_bilinear::Eval<resize_bilinear::kGenericOptimized>};
+ return &r;
+}
+
+TfLiteRegistration* Register_RESIZE_BILINEAR_NEON_OPT() {
+ static TfLiteRegistration r = {
+ nullptr, nullptr, resize_bilinear::Prepare,
+ resize_bilinear::Eval<resize_bilinear::kNeonOptimized>};
+ return &r;
+}
+
+TfLiteRegistration* Register_RESIZE_BILINEAR() {
+#ifdef USE_NEON
+ return Register_RESIZE_BILINEAR_NEON_OPT();
+#else
+ return Register_RESIZE_BILINEAR_GENERIC_OPT();
+#endif
+}
+
+} // namespace builtin
+} // namespace ops
+} // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/resize_bilinear_test.cc b/tensorflow/contrib/lite/kernels/resize_bilinear_test.cc
new file mode 100644
index 0000000000..0257c0b557
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/resize_bilinear_test.cc
@@ -0,0 +1,117 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+class ResizeBilinearOpModel : public SingleOpModel {
+ public:
+ ResizeBilinearOpModel(std::initializer_list<int> input_shape, int new_height,
+ int new_width) {
+ input_ = AddInput(TensorType_FLOAT32);
+ output_ = AddOutput(TensorType_FLOAT32);
+ SetBuiltinOp(
+ BuiltinOperator_RESIZE_BILINEAR, BuiltinOptions_ResizeBilinearOptions,
+ CreateResizeBilinearOptions(builder_, new_height, new_width).Union());
+ BuildInterpreter({input_shape});
+ }
+
+ void SetInput(std::initializer_list<float> data) {
+ PopulateTensor(input_, data);
+ }
+
+ std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+
+ private:
+ int input_;
+ int output_;
+};
+
+TEST(ResizeBilinearOpTest, HorizontalResize) {
+ ResizeBilinearOpModel m({1, 1, 2, 1}, 1, 3);
+ m.SetInput({3, 6});
+ m.Invoke();
+ EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear({3, 5, 6})));
+}
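
For reference, the expected values follow the resize convention used by the kernels above: output column x samples input position x * input_width / output_width = x * 2/3, i.e. positions 0.0, 0.667 and 1.333, which interpolate to 3, then 3 + 0.667 * (6 - 3) = 5, and finally 6 (the upper neighbour index is clamped to the last input column).
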
+
+TEST(ResizeBilinearOpTest, VerticalResize) {
+ ResizeBilinearOpModel m({1, 2, 1, 1}, 3, 1);
+ m.SetInput({3, 9});
+ m.Invoke();
+ EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear({3, 7, 9})));
+}
+
+TEST(ResizeBilinearOpTest, TwoDimensionalResize) {
+ ResizeBilinearOpModel m({1, 2, 2, 1}, 3, 3);
+ m.SetInput({
+ 3, 6, //
+ 9, 12 //
+ });
+ m.Invoke();
+ EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear({
+ 3, 5, 6, //
+ 7, 9, 10, //
+ 9, 11, 12, //
+ })));
+}
+
+TEST(ResizeBilinearOpTest, TwoDimensionalResizeWithTwoBatches) {
+ ResizeBilinearOpModel m({2, 2, 2, 1}, 3, 3);
+ m.SetInput({
+ 3, 6, //
+ 9, 12, //
+ 4, 10, //
+ 10, 16 //
+ });
+ m.Invoke();
+ EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear({
+ 3, 5, 6, //
+ 7, 9, 10, //
+ 9, 11, 12, //
+ 4, 8, 10, //
+ 8, 12, 14, //
+ 10, 14, 16, //
+ })));
+}
+
+TEST(ResizeBilinearOpTest, ThreeDimensionalResize) {
+ ResizeBilinearOpModel m({1, 2, 2, 2}, 3, 3);
+ m.SetInput({
+ 3, 4, 6, 10, //
+ 9, 10, 12, 16, //
+ });
+ m.Invoke();
+ EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear({
+ 3, 4, 5, 8, 6, 10, //
+ 7, 8, 9, 12, 10, 14, //
+ 9, 10, 11, 14, 12, 16, //
+ })));
+}
+
+} // namespace
+} // namespace tflite
+
+int main(int argc, char** argv) {
+ // On Linux, add: tflite::LogToStderr();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/skip_gram.cc b/tensorflow/contrib/lite/kernels/skip_gram.cc
new file mode 100644
index 0000000000..c90a15b3a2
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/skip_gram.cc
@@ -0,0 +1,160 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Generate a list of skip grams from an input.
+//
+// Options:
+// ngram_size: number of words in each output item.
+// max_skip_size: max number of words allowed to be skipped between two
+// selected words. The op generates plain n-grams when it is 0.
+// include_all_ngrams: if true, include all n-grams with size up to ngram_size.
+//
+// Input:
+// A string tensor to generate n-grams.
+// Dim = {1}
+//
+// Output:
+// A list of strings, each of which contains ngram_size words.
+// Dim = {num_ngram}
+
+#include <ctype.h>
+#include <string>
+#include <vector>
+
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
+#include "tensorflow/contrib/lite/string_util.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+
+namespace {
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+ TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
+ TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+ TF_LITE_ENSURE_EQ(context, GetInput(context, node, 0)->type, kTfLiteString);
+ TF_LITE_ENSURE_EQ(context, GetOutput(context, node, 0)->type, kTfLiteString);
+ return kTfLiteOk;
+}
+
+bool ShouldIncludeCurrentNgram(const TfLiteSkipGramParams* params, int size) {
+ if (size <= 0) {
+ return false;
+ }
+ if (params->include_all_ngrams) {
+ return size <= params->ngram_size;
+ } else {
+ return size == params->ngram_size;
+ }
+}
+
+bool ShouldStepInRecursion(const TfLiteSkipGramParams* params,
+ const std::vector<int>& stack, int stack_idx,
+ int num_words) {
+ // Step in only if the current stack depth and the next word index are valid.
+ if (stack_idx < params->ngram_size && stack[stack_idx] + 1 < num_words) {
+ // If the stack is empty, step in to enumerate the first word.
+ if (stack_idx == 0) {
+ return true;
+ }
+ // If the next word index is within max_skip_size of the previously chosen word.
+ // NOTE: equivalent to
+ // next_word_idx = stack[stack_idx] + 1
+ // next_word_idx - stack[stack_idx-1] <= max_skip_size + 1
+ if (stack[stack_idx] - stack[stack_idx - 1] <= params->max_skip_size) {
+ return true;
+ }
+ }
+ return false;
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+ auto* params = reinterpret_cast<TfLiteSkipGramParams*>(node->builtin_data);
+
+ // Split sentence to words.
+ std::vector<StringRef> words;
+ tflite::StringRef strref = tflite::GetString(GetInput(context, node, 0), 0);
+ int prev_idx = 0;
+ for (int i = 1; i < strref.len; i++) {
+ if (isspace(*(strref.str + i))) {
+ if (i > prev_idx && !isspace(*(strref.str + prev_idx))) {
+ words.push_back({strref.str + prev_idx, i - prev_idx});
+ }
+ prev_idx = i + 1;
+ }
+ }
+ if (strref.len > prev_idx) {
+ words.push_back({strref.str + prev_idx, strref.len - prev_idx});
+ }
+
+ // Generate n-grams recursively.
+ tflite::DynamicBuffer buf;
+ if (words.size() < params->ngram_size) {
+ buf.WriteToTensor(GetOutput(context, node, 0));
+ return kTfLiteOk;
+ }
+
+ // The stack stores the indices of the words used to generate the current
+ // n-gram; its size equals the n-gram size.
+ std::vector<int> stack(params->ngram_size, 0);
+ // Stack index that indicates which depth the recursion is operating at.
+ int stack_idx = 1;
+ int num_words = words.size();
+
+ while (stack_idx >= 0) {
+ if (ShouldStepInRecursion(params, stack, stack_idx, num_words)) {
+ // When the current depth can be filled with a new word and that word
+ // is within the maximum skip range, push the word onto the stack and
+ // recurse into the next depth.
+ stack[stack_idx]++;
+ stack_idx++;
+ if (stack_idx < params->ngram_size) {
+ stack[stack_idx] = stack[stack_idx - 1];
+ }
+ } else {
+ if (ShouldIncludeCurrentNgram(params, stack_idx)) {
+ // Add the n-gram to the tensor buffer once the stack has been filled
+ // with enough words to generate it.
+ std::vector<StringRef> gram(stack_idx);
+ for (int i = 0; i < stack_idx; i++) {
+ gram[i] = words[stack[i]];
+ }
+ buf.AddJoinedString(gram, ' ');
+ }
+ // When the current depth cannot be filled with a valid new word,
+ // step back to the previous depth and advance to the next
+ // candidate word there.
+ stack_idx--;
+ }
+ }
+
+ buf.WriteToTensor(GetOutput(context, node, 0));
+ return kTfLiteOk;
+}
+} // namespace
+
+TfLiteRegistration* Register_SKIP_GRAM() {
+ static TfLiteRegistration r = {nullptr, nullptr, Prepare, Eval};
+ return &r;
+}
+
+} // namespace builtin
+} // namespace ops
+} // namespace tflite
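
The iterative index-stack walk in Eval is easier to see as a plain recursion over word indices: pick ngram_size indices in increasing order, never letting consecutive picks differ by more than max_skip_size + 1. A standalone sketch of that enumeration, independent of the TFLite tensor types (names are illustrative):

#include <algorithm>
#include <iostream>
#include <string>
#include <vector>

// Emit every ngram_size-word combination whose consecutive picks skip at
// most max_skip words, preserving word order.
void Enumerate(const std::vector<std::string>& words, int ngram_size,
               int max_skip, std::vector<int> picked, int next) {
  if (static_cast<int>(picked.size()) == ngram_size) {
    std::string gram;
    for (int i : picked) gram += (gram.empty() ? "" : " ") + words[i];
    std::cout << gram << "\n";
    return;
  }
  const int limit =
      picked.empty() ? static_cast<int>(words.size())
                     : std::min(static_cast<int>(words.size()),
                                picked.back() + max_skip + 2);
  for (int i = next; i < limit; ++i) {
    picked.push_back(i);
    Enumerate(words, ngram_size, max_skip, picked, i + 1);
    picked.pop_back();
  }
}

int main() {
  // Prints: "The quick", "The brown", "quick brown", "quick fox", "brown fox".
  Enumerate({"The", "quick", "brown", "fox"}, /*ngram_size=*/2, /*max_skip=*/1,
            {}, 0);
  return 0;
}

The kernel's while loop performs the same search, just flattened into an explicit index stack instead of recursion.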
diff --git a/tensorflow/contrib/lite/kernels/skip_gram_test.cc b/tensorflow/contrib/lite/kernels/skip_gram_test.cc
new file mode 100644
index 0000000000..e7f6bc904b
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/skip_gram_test.cc
@@ -0,0 +1,257 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <vector>
+
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/contrib/lite/string_util.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAre;
+
+static char kSentence[] = "The quick\t brown fox\n jumps over\n the lazy dog!";
+
+class SkipGramOp : public SingleOpModel {
+ public:
+ SkipGramOp(int ngram_size, int max_skip_size, bool include_all_ngrams) {
+ input_ = AddInput(TensorType_STRING);
+ output_ = AddOutput(TensorType_STRING);
+
+ SetBuiltinOp(BuiltinOperator_SKIP_GRAM, BuiltinOptions_SkipGramOptions,
+ CreateSkipGramOptions(builder_, ngram_size, max_skip_size,
+ include_all_ngrams)
+ .Union());
+ BuildInterpreter({{1}});
+ }
+ void SetInput(const string& content) {
+ PopulateStringTensor(input_, {content});
+ }
+
+ std::vector<string> GetOutput() {
+ std::vector<string> ans;
+ TfLiteTensor* tensor = interpreter_->tensor(output_);
+
+ int num = GetStringCount(tensor);
+ for (int i = 0; i < num; i++) {
+ StringRef strref = GetString(tensor, i);
+ ans.push_back(string(strref.str, strref.len));
+ }
+ return ans;
+ }
+
+ private:
+ int input_;
+ int output_;
+};
+
+TEST(SkipGramTest, TestUnigram) {
+ SkipGramOp m(1, 0, false);
+
+ m.SetInput(kSentence);
+ m.Invoke();
+ EXPECT_THAT(m.GetOutput(), testing::UnorderedElementsAreArray(
+ {"The", "quick", "brown", "fox", "jumps",
+ "over", "the", "lazy", "dog!"}));
+}
+
+TEST(SkipGramTest, TestBigram) {
+ SkipGramOp m(2, 0, false);
+ m.SetInput(kSentence);
+ m.Invoke();
+ EXPECT_THAT(m.GetOutput(),
+ testing::UnorderedElementsAreArray(
+ {"The quick", "quick brown", "brown fox", "fox jumps",
+ "jumps over", "over the", "the lazy", "lazy dog!"}));
+}
+
+TEST(SkipGramTest, TestAllBigram) {
+ SkipGramOp m(2, 0, true);
+ m.SetInput(kSentence);
+ m.Invoke();
+ EXPECT_THAT(m.GetOutput(),
+ testing::UnorderedElementsAreArray(
+ {// Unigram
+ "The", "quick", "brown", "fox", "jumps", "over", "the",
+ "lazy", "dog!",
+ // Bigram
+ "The quick", "quick brown", "brown fox", "fox jumps",
+ "jumps over", "over the", "the lazy", "lazy dog!"}));
+}
+
+TEST(SkipGramTest, TestAllTrigram) {
+ SkipGramOp m(3, 0, true);
+ m.SetInput(kSentence);
+ m.Invoke();
+ EXPECT_THAT(m.GetOutput(),
+ testing::UnorderedElementsAreArray(
+ {// Unigram
+ "The", "quick", "brown", "fox", "jumps", "over", "the",
+ "lazy", "dog!",
+ // Bigram
+ "The quick", "quick brown", "brown fox", "fox jumps",
+ "jumps over", "over the", "the lazy", "lazy dog!",
+ // Trigram
+ "The quick brown", "quick brown fox", "brown fox jumps",
+ "fox jumps over", "jumps over the", "over the lazy",
+ "the lazy dog!"}));
+}
+
+TEST(SkipGramTest, TestSkip1Bigram) {
+ SkipGramOp m(2, 1, false);
+ m.SetInput(kSentence);
+ m.Invoke();
+ EXPECT_THAT(
+ m.GetOutput(),
+ testing::UnorderedElementsAreArray(
+ {"The quick", "The brown", "quick brown", "quick fox", "brown fox",
+ "brown jumps", "fox jumps", "fox over", "jumps over", "jumps the",
+ "over the", "over lazy", "the lazy", "the dog!", "lazy dog!"}));
+}
+
+TEST(SkipGramTest, TestSkip2Bigram) {
+ SkipGramOp m(2, 2, false);
+ m.SetInput(kSentence);
+ m.Invoke();
+ EXPECT_THAT(m.GetOutput(),
+ testing::UnorderedElementsAreArray(
+ {"The quick", "The brown", "The fox", "quick brown",
+ "quick fox", "quick jumps", "brown fox", "brown jumps",
+ "brown over", "fox jumps", "fox over", "fox the",
+ "jumps over", "jumps the", "jumps lazy", "over the",
+ "over lazy", "over dog!", "the lazy", "the dog!",
+ "lazy dog!"}));
+}
+
+TEST(SkipGramTest, TestSkip1Trigram) {
+ SkipGramOp m(3, 1, false);
+ m.SetInput(kSentence);
+ m.Invoke();
+ EXPECT_THAT(m.GetOutput(),
+ testing::UnorderedElementsAreArray(
+ {"The quick brown", "The quick fox", "The brown fox",
+ "The brown jumps", "quick brown fox", "quick brown jumps",
+ "quick fox jumps", "quick fox over", "brown fox jumps",
+ "brown fox over", "brown jumps over", "brown jumps the",
+ "fox jumps over", "fox jumps the", "fox over the",
+ "fox over lazy", "jumps over the", "jumps over lazy",
+ "jumps the lazy", "jumps the dog!", "over the lazy",
+ "over the dog!", "over lazy dog!", "the lazy dog!"}));
+}
+
+TEST(SkipGramTest, TestSkip2Trigram) {
+ SkipGramOp m(3, 2, false);
+ m.SetInput(kSentence);
+ m.Invoke();
+ EXPECT_THAT(m.GetOutput(),
+ testing::UnorderedElementsAreArray(
+ {"The quick brown", "The quick fox", "The quick jumps",
+ "The brown fox", "The brown jumps", "The brown over",
+ "The fox jumps", "The fox over", "The fox the",
+ "quick brown fox", "quick brown jumps", "quick brown over",
+ "quick fox jumps", "quick fox over", "quick fox the",
+ "quick jumps over", "quick jumps the", "quick jumps lazy",
+ "brown fox jumps", "brown fox over", "brown fox the",
+ "brown jumps over", "brown jumps the", "brown jumps lazy",
+ "brown over the", "brown over lazy", "brown over dog!",
+ "fox jumps over", "fox jumps the", "fox jumps lazy",
+ "fox over the", "fox over lazy", "fox over dog!",
+ "fox the lazy", "fox the dog!", "jumps over the",
+ "jumps over lazy", "jumps over dog!", "jumps the lazy",
+ "jumps the dog!", "jumps lazy dog!", "over the lazy",
+ "over the dog!", "over lazy dog!", "the lazy dog!"}));
+}
+
+TEST(SkipGramTest, TestAllSkip2Trigram) {
+ SkipGramOp m(3, 2, true);
+ m.SetInput(kSentence);
+ m.Invoke();
+ EXPECT_THAT(
+ m.GetOutput(),
+ testing::UnorderedElementsAreArray(
+ {// Unigram
+ "The", "quick", "brown", "fox", "jumps", "over", "the", "lazy",
+ "dog!",
+ // Bigram
+ "The quick", "The brown", "The fox", "quick brown", "quick fox",
+ "quick jumps", "brown fox", "brown jumps", "brown over", "fox jumps",
+ "fox over", "fox the", "jumps over", "jumps the", "jumps lazy",
+ "over the", "over lazy", "over dog!", "the lazy", "the dog!",
+ "lazy dog!",
+ // Trigram
+ "The quick brown", "The quick fox", "The quick jumps",
+ "The brown fox", "The brown jumps", "The brown over",
+ "The fox jumps", "The fox over", "The fox the", "quick brown fox",
+ "quick brown jumps", "quick brown over", "quick fox jumps",
+ "quick fox over", "quick fox the", "quick jumps over",
+ "quick jumps the", "quick jumps lazy", "brown fox jumps",
+ "brown fox over", "brown fox the", "brown jumps over",
+ "brown jumps the", "brown jumps lazy", "brown over the",
+ "brown over lazy", "brown over dog!", "fox jumps over",
+ "fox jumps the", "fox jumps lazy", "fox over the", "fox over lazy",
+ "fox over dog!", "fox the lazy", "fox the dog!", "jumps over the",
+ "jumps over lazy", "jumps over dog!", "jumps the lazy",
+ "jumps the dog!", "jumps lazy dog!", "over the lazy",
+ "over the dog!", "over lazy dog!", "the lazy dog!"}));
+}
+
+TEST(SkipGramTest, TestSingleWord) {
+ SkipGramOp m(1, 1, false);
+ m.SetInput("Hi");
+ m.Invoke();
+ EXPECT_THAT(m.GetOutput(), ElementsAre("Hi"));
+}
+
+TEST(SkipGramTest, TestWordsLessThanGram) {
+ SkipGramOp m(3, 1, false);
+ m.SetInput("Hi hi");
+ m.Invoke();
+ EXPECT_THAT(m.GetOutput(), std::vector<string>());
+}
+
+TEST(SkipGramTest, TestEmptyInput) {
+ SkipGramOp m(1, 1, false);
+ m.SetInput("");
+ m.Invoke();
+ EXPECT_THAT(m.GetOutput(), ElementsAre());
+}
+
+TEST(SkipGramTest, TestWhitespaceInput) {
+ SkipGramOp m(1, 1, false);
+ m.SetInput(" ");
+ m.Invoke();
+ EXPECT_THAT(m.GetOutput(), ElementsAre());
+}
+
+TEST(SkipGramTest, TestInputWithExtraSpace) {
+ SkipGramOp m(1, 1, false);
+ m.SetInput(" Hello world ! ");
+ m.Invoke();
+ EXPECT_THAT(m.GetOutput(), ElementsAre("Hello", "world", "!"));
+}
+
+} // namespace
+} // namespace tflite
+
+int main(int argc, char** argv) {
+ // On Linux, add: tflite::LogToStderr();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
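
A quick count sanity-checks the longer expected lists above: for the 9-word sentence, a skip-k bigram is an ordered index pair (i, j) with 1 <= j - i <= k + 1, which gives 8 + 7 = 15 pairs for k = 1 (TestSkip1Bigram) and 8 + 7 + 6 = 21 pairs for k = 2 (TestSkip2Bigram), matching the 15 and 21 strings listed in those tests.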
diff --git a/tensorflow/contrib/lite/kernels/softmax_test.cc b/tensorflow/contrib/lite/kernels/softmax_test.cc
new file mode 100644
index 0000000000..ec8ec03b0d
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/softmax_test.cc
@@ -0,0 +1,143 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// Unit test for TFLite SOFTMAX op.
+
+#include <iomanip>
+#include <memory>
+#include <vector>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+class SoftmaxOpModel : public SingleOpModel {
+ public:
+ SoftmaxOpModel(int batches, int size, float beta)
+ : batches_(batches), input_size_(size), beta_(beta) {
+ input_ = AddInput(TensorType_FLOAT32);
+ output_ = AddOutput(TensorType_FLOAT32);
+ SetBuiltinOp(BuiltinOperator_SOFTMAX, BuiltinOptions_SoftmaxOptions,
+ CreateSoftmaxOptions(builder_, beta_).Union());
+ BuildInterpreter({{batches_, input_size_}});
+ }
+
+ void SetInput(std::initializer_list<float> data) {
+ PopulateTensor(input_, data);
+ }
+
+ void SetInput(int offset, float* begin, float* end) {
+ PopulateTensor(input_, offset, begin, end);
+ }
+
+ std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+
+ private:
+ int input_;
+ int output_;
+
+ int batches_;
+ int input_size_;
+ float beta_;
+};
+
+TEST(SoftmaxOpTest, SimpleTest) {
+ SoftmaxOpModel m(/*batches=*/2, /*size=*/5, /*beta=*/1.0);
+ m.SetInput({
+ 1.0, 2.0, 3.0, 4.0, 5.0, // b = 0
+ -1.0, -2.0, -3.0, -4.0, -5.0, // b = 1
+ });
+
+ m.Invoke();
+
+ EXPECT_THAT(
+ m.GetOutput(),
+ ElementsAreArray(ArrayFloatNear(
+ {0.011656231, 0.031684921, 0.086128544, 0.234121657, 0.636408647,
+ 0.636408647, 0.234121657, 0.086128544, 0.031684921, 0.011656231},
+ 1e-6)));
+}
+
+TEST(SoftmaxOpTest, CompareWithTFminiBetaEq1) {
+ const int batch_size = 2;
+ const int input_size = 5;
+ const float beta = 1.0;
+ static float input_buffer[] = {
+ 1.0, 2.0, 3.0, 4.0, 5.0, // b = 0
+ -1.0, -2.0, -3.0, -4.0, -5.0, // b = 1
+ };
+
+ SoftmaxOpModel m(batch_size, input_size, beta);
+
+ m.SetInput(0, input_buffer, input_buffer + input_size * batch_size);
+
+ m.Invoke();
+
+ std::unique_ptr<float[]> output_buffer(new float[input_size * batch_size]);
+ static tflite::Dims<4> input_dims = {{input_size, 1, 1, batch_size},
+ {1, 0, 0, input_size}};
+ tflite::reference_ops::Softmax(input_buffer, input_dims, beta,
+ output_buffer.get(), input_dims);
+
+ std::vector<float> expected;
+ expected.insert(expected.end(), output_buffer.get(),
+ output_buffer.get() + input_size * batch_size);
+
+ EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear(expected, 1e-6)));
+}
+
+TEST(SoftmaxOpTest, CompareWithTFminiBetaNotEq1) {
+ const int batch_size = 2;
+ const int input_size = 5;
+ const float beta = 0.5;
+ static float input_buffer[] = {
+ 1.0, 2.0, 3.0, 4.0, 5.0, // b = 0
+ -1.0, -2.0, -3.0, -4.0, -5.0, // b = 1
+ };
+
+ SoftmaxOpModel m(batch_size, input_size, beta);
+
+ m.SetInput(0, input_buffer, input_buffer + input_size * batch_size);
+
+ m.Invoke();
+
+ std::unique_ptr<float[]> output_buffer(new float[input_size * batch_size]);
+ static tflite::Dims<4> input_dims = {{input_size, 1, 1, batch_size},
+ {1, 0, 0, input_size}};
+ tflite::reference_ops::Softmax(input_buffer, input_dims, beta,
+ output_buffer.get(), input_dims);
+
+ std::vector<float> expected;
+ expected.insert(expected.end(), output_buffer.get(),
+ output_buffer.get() + input_size * batch_size);
+
+ EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear(expected, 1e-6)));
+}
+
+} // namespace
+} // namespace tflite
+
+int main(int argc, char** argv) {
+ // Route test logging to stderr.
+ tflite::LogToStderr();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
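
The SimpleTest constants follow directly from softmax with a beta (scaling) parameter, y_i = exp(beta * x_i) / sum_j exp(beta * x_j). A quick standalone check of those values (not the kernel itself, which the other two tests compare against reference_ops::Softmax):

#include <cmath>
#include <cstdio>

int main() {
  const float x[5] = {1, 2, 3, 4, 5};
  const float beta = 1.0f;
  float sum = 0.f;
  for (float v : x) sum += std::exp(beta * v);
  // Prints ~0.011656231 0.031684921 0.086128544 0.234121657 0.636408647,
  // matching the first batch expected by SimpleTest.
  for (float v : x) std::printf("%.9f ", std::exp(beta * v) / sum);
  std::printf("\n");
  return 0;
}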
diff --git a/tensorflow/contrib/lite/kernels/space_to_depth.cc b/tensorflow/contrib/lite/kernels/space_to_depth.cc
new file mode 100644
index 0000000000..cb2e509c98
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/space_to_depth.cc
@@ -0,0 +1,146 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace space_to_depth {
+
+// This file has two implementations of SpaceToDepth. Note that SpaceToDepth
+// only works on 4D tensors.
+enum KernelType {
+ kReference,
+ kGenericOptimized,
+};
+
+constexpr int kInputTensor = 0;
+constexpr int kOutputTensor = 0;
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+ auto* params =
+ reinterpret_cast<TfLiteSpaceToDepthParams*>(node->builtin_data);
+
+ TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
+ TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+ TfLiteTensor* input = GetInput(context, node, kInputTensor);
+ TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+ TF_LITE_ENSURE_EQ(context, NumDimensions(input), 4);
+
+ auto data_type = output->type;
+ TF_LITE_ENSURE(context,
+ data_type == kTfLiteFloat32 || data_type == kTfLiteUInt8 ||
+ data_type == kTfLiteInt32 || data_type == kTfLiteInt64);
+ TF_LITE_ENSURE_EQ(context, input->type, output->type);
+
+ const int block_size = params->block_size;
+ const int input_height = input->dims->data[1];
+ const int input_width = input->dims->data[2];
+ int output_height = input_height / block_size;
+ int output_width = input_width / block_size;
+
+ TF_LITE_ENSURE_EQ(context, input_height, output_height * block_size);
+ TF_LITE_ENSURE_EQ(context, input_width, output_width * block_size);
+
+ TfLiteIntArray* output_size = TfLiteIntArrayCreate(4);
+ output_size->data[0] = input->dims->data[0];
+ output_size->data[1] = output_height;
+ output_size->data[2] = output_width;
+ output_size->data[3] = input->dims->data[3] * block_size * block_size;
+
+ return context->ResizeTensor(context, output, output_size);
+}
+
+template <KernelType kernel_type>
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+ auto* params =
+ reinterpret_cast<TfLiteSpaceToDepthParams*>(node->builtin_data);
+
+ TfLiteTensor* input = GetInput(context, node, kInputTensor);
+ TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+#define TF_LITE_SPACE_TO_DEPTH(type, scalar) \
+ type::SpaceToDepth<scalar>( \
+ GetTensorData<scalar>(input), GetTensorDims(input), params->block_size, \
+ GetTensorData<scalar>(output), GetTensorDims(output))
+ switch (input->type) { // Already know in/out types are same.
+ case kTfLiteFloat32:
+ if (kernel_type == kReference) {
+ TF_LITE_SPACE_TO_DEPTH(reference_ops, float);
+ } else {
+ TF_LITE_SPACE_TO_DEPTH(optimized_ops, float);
+ }
+ break;
+ case kTfLiteUInt8:
+ if (kernel_type == kReference) {
+ TF_LITE_SPACE_TO_DEPTH(reference_ops, uint8_t);
+ } else {
+ TF_LITE_SPACE_TO_DEPTH(optimized_ops, uint8_t);
+ }
+ break;
+ case kTfLiteInt32:
+ if (kernel_type == kReference) {
+ TF_LITE_SPACE_TO_DEPTH(reference_ops, int32_t);
+ } else {
+ TF_LITE_SPACE_TO_DEPTH(optimized_ops, int32_t);
+ }
+ break;
+ case kTfLiteInt64:
+ if (kernel_type == kReference) {
+ TF_LITE_SPACE_TO_DEPTH(reference_ops, int64_t);
+ } else {
+ TF_LITE_SPACE_TO_DEPTH(optimized_ops, int64_t);
+ }
+ break;
+ default:
+ context->ReportError(context, "Type not currently supported.");
+ return kTfLiteError;
+ }
+#undef TF_LITE_SPACE_TO_DEPTH
+
+ return kTfLiteOk;
+}
+
+} // namespace space_to_depth
+
+TfLiteRegistration* Register_SPACE_TO_DEPTH_REF() {
+ static TfLiteRegistration r = {
+ nullptr, nullptr, space_to_depth::Prepare,
+ space_to_depth::Eval<space_to_depth::kReference>};
+ return &r;
+}
+
+TfLiteRegistration* Register_SPACE_TO_DEPTH_GENERIC_OPT() {
+ static TfLiteRegistration r = {
+ nullptr, nullptr, space_to_depth::Prepare,
+ space_to_depth::Eval<space_to_depth::kGenericOptimized>};
+ return &r;
+}
+
+TfLiteRegistration* Register_SPACE_TO_DEPTH() {
+ return Register_SPACE_TO_DEPTH_GENERIC_OPT();
+}
+
+} // namespace builtin
+} // namespace ops
+} // namespace tflite
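
The kernel above dispatches to reference_ops/optimized_ops::SpaceToDepth; as a sketch of the index mapping it computes (plain loops, assuming NHWC row-major layout, not the library code), each block_size x block_size spatial tile is folded into the channel dimension:

#include <vector>

template <typename T>
std::vector<T> SpaceToDepthSketch(const std::vector<T>& in, int batch,
                                  int height, int width, int depth, int block) {
  const int out_h = height / block, out_w = width / block;
  const int out_d = depth * block * block;
  std::vector<T> out(batch * out_h * out_w * out_d);
  for (int b = 0; b < batch; ++b)
    for (int h = 0; h < height; ++h)
      for (int w = 0; w < width; ++w)
        for (int d = 0; d < depth; ++d) {
          // The tile offset (h % block, w % block) becomes part of the
          // output channel index.
          const int oh = h / block, ow = w / block;
          const int od = ((h % block) * block + (w % block)) * depth + d;
          out[((b * out_h + oh) * out_w + ow) * out_d + od] =
              in[((b * height + h) * width + w) * depth + d];
        }
  return out;
}

For example, the 1x4x4x1 input {1, 2, 5, 6, 3, 4, 7, 8, 9, 10, 13, 14, 11, 12, 15, 16} with block 2 becomes the 1x2x2x4 output {1, 2, ..., 16}, as exercised by the Int64 case in the test file that follows.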
diff --git a/tensorflow/contrib/lite/kernels/space_to_depth_test.cc b/tensorflow/contrib/lite/kernels/space_to_depth_test.cc
new file mode 100644
index 0000000000..911f08a92c
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/space_to_depth_test.cc
@@ -0,0 +1,102 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAre;
+using ::testing::ElementsAreArray;
+
+class SpaceToDepthOpModel : public SingleOpModel {
+ public:
+ SpaceToDepthOpModel(const TensorData& tensor_data, int block_size) {
+ input_ = AddInput(tensor_data);
+ output_ = AddOutput(tensor_data);
+ SetBuiltinOp(BuiltinOperator_SPACE_TO_DEPTH,
+ BuiltinOptions_SpaceToDepthOptions,
+ CreateSpaceToDepthOptions(builder_, block_size).Union());
+ BuildInterpreter({GetShape(input_)});
+ }
+
+ template <typename T>
+ void SetInput(std::initializer_list<T> data) {
+ PopulateTensor<T>(input_, data);
+ }
+ template <typename T>
+ std::vector<T> GetOutput() {
+ return ExtractVector<T>(output_);
+ }
+ std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
+
+ private:
+ int input_;
+ int output_;
+};
+
+TEST(SpaceToDepthOpModel, BadBlockSize) {
+ EXPECT_DEATH(SpaceToDepthOpModel({TensorType_FLOAT32, {1, 2, 2, 1}}, 3),
+ "Cannot allocate tensors");
+}
+
+TEST(SpaceToDepthOpModel, Float32) {
+ SpaceToDepthOpModel m({TensorType_FLOAT32, {1, 2, 2, 2}}, 2);
+ m.SetInput<float>({1.4, 2.3, 3.2, 4.1, 5.4, 6.3, 7.2, 8.1});
+ m.Invoke();
+ EXPECT_THAT(m.GetOutput<float>(),
+ ElementsAreArray({1.4, 2.3, 3.2, 4.1, 5.4, 6.3, 7.2, 8.1}));
+ EXPECT_THAT(m.GetOutputShape(), ElementsAre(1, 1, 1, 8));
+}
+
+TEST(SpaceToDepthOpModel, Uint8) {
+ SpaceToDepthOpModel m({TensorType_UINT8, {1, 2, 2, 1}}, 2);
+ m.SetInput<uint8_t>({1, 2, 3, 4});
+ m.Invoke();
+ EXPECT_THAT(m.GetOutput<uint8_t>(), ElementsAreArray({1, 2, 3, 4}));
+ EXPECT_THAT(m.GetOutputShape(), ElementsAre(1, 1, 1, 4));
+}
+
+TEST(SpaceToDepthOpModel, Int32) {
+ SpaceToDepthOpModel m({TensorType_INT32, {1, 2, 2, 3}}, 2);
+ m.SetInput<int32_t>({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+ m.Invoke();
+ EXPECT_THAT(m.GetOutput<int32_t>(),
+ ElementsAreArray({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}));
+ EXPECT_THAT(m.GetOutputShape(), ElementsAre(1, 1, 1, 12));
+}
+
+TEST(SpaceToDepthOpModel, Int64) {
+ SpaceToDepthOpModel m({TensorType_INT64, {1, 4, 4, 1}}, 2);
+ m.SetInput<int64_t>({1, 2, 5, 6, 3, 4, 7, 8, 9, 10, 13, 14, 11, 12, 15, 16});
+ m.Invoke();
+ EXPECT_THAT(m.GetOutput<int64_t>(),
+ ElementsAreArray(
+ {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}));
+ EXPECT_THAT(m.GetOutputShape(), ElementsAre(1, 2, 2, 4));
+}
+
+} // namespace
+} // namespace tflite
+
+int main(int argc, char** argv) {
+ // Route test logging to stderr.
+ tflite::LogToStderr();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/svdf.cc b/tensorflow/contrib/lite/kernels/svdf.cc
new file mode 100644
index 0000000000..dd414d53bd
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/svdf.cc
@@ -0,0 +1,224 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <unistd.h>
+#include <cassert>
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <iostream>
+#include <limits>
+
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/activation_functor.h"
+#include "tensorflow/contrib/lite/kernels/internal/tensor_utils.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace svdf {
+
+constexpr int kInputTensor = 0;
+constexpr int kWeightsFeatureTensor = 1;
+constexpr int kWeightsTimeTensor = 2;
+constexpr int kBiasTensor = 3;
+constexpr int kStateTensor = 0;
+constexpr int kOutputTensor = 1;
+
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+ auto* scratch_tensor_index = new int;
+ context->AddTensors(context, 1, scratch_tensor_index);
+ return scratch_tensor_index;
+}
+
+void Free(TfLiteContext* context, void* buffer) {
+ delete reinterpret_cast<int*>(buffer);
+}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+ auto* params = reinterpret_cast<TfLiteSVDFParams*>(node->builtin_data);
+ int* scratch_tensor_index = reinterpret_cast<int*>(node->user_data);
+
+ // Check we have all the inputs and outputs we need.
+ TF_LITE_ENSURE_EQ(context, node->inputs->size, 4);
+ TF_LITE_ENSURE_EQ(context, node->outputs->size, 2);
+
+ TfLiteTensor* input = &context->tensors[node->inputs->data[kInputTensor]];
+ TfLiteTensor* weights_feature =
+ &context->tensors[node->inputs->data[kWeightsFeatureTensor]];
+ TfLiteTensor* weights_time =
+ &context->tensors[node->inputs->data[kWeightsTimeTensor]];
+
+ // Check all the parameters of tensor match within themselves and match the
+ // input configuration.
+ const int rank = params->rank;
+ const int batch_size = input->dims->data[0];
+ const int num_filters = weights_feature->dims->data[0];
+ TF_LITE_ASSERT_EQ(num_filters % rank, 0);
+ const int num_units = num_filters / rank;
+ const int memory_size = weights_time->dims->data[1];
+ TF_LITE_ASSERT_EQ(input->dims->data[1], weights_feature->dims->data[1]);
+ TF_LITE_ASSERT_EQ(weights_time->dims->data[0], num_filters);
+
+ TfLiteTensor* bias = GetOptionalInputTensor(context, node, kBiasTensor);
+ if (bias) {
+ TF_LITE_ASSERT_EQ(bias->dims->data[0], num_units);
+ }
+
+ TfLiteTensor* state = &context->tensors[node->outputs->data[kStateTensor]];
+ TfLiteTensor* output = &context->tensors[node->outputs->data[kOutputTensor]];
+
+ // Resize state.
+ // For each batch, the state is a 2-D tensor of shape
+ // {num_filters, memory_size}: each row holds one filter's activation
+ // history, with the newest activation stored in the last element of the
+ // row. Temporary per-filter outputs go into a separate scratch tensor.
+ TfLiteIntArray* state_size_array = TfLiteIntArrayCreate(2);
+ state_size_array->data[0] = batch_size;
+ state_size_array->data[1] = memory_size * num_filters;
+ TF_LITE_ENSURE_OK(context,
+ context->ResizeTensor(context, state, state_size_array));
+
+ // Mark state as a persistent tensor.
+ state->allocation_type = kTfLiteArenaRwPersistent;
+
+ // Resize output.
+ TfLiteIntArray* output_size_array = TfLiteIntArrayCreate(2);
+ output_size_array->data[0] = batch_size;
+ output_size_array->data[1] = num_units;
+ TF_LITE_ENSURE_OK(context,
+ context->ResizeTensor(context, output, output_size_array));
+
+ // Resize scratch.
+ TfLiteIntArrayFree(node->temporaries);
+ node->temporaries = TfLiteIntArrayCreate(1);
+ node->temporaries->data[0] = *scratch_tensor_index;
+
+ TfLiteIntArray* scratch_size_array = TfLiteIntArrayCreate(2);
+ scratch_size_array->data[0] = batch_size;
+ scratch_size_array->data[1] = num_filters;
+
+ TfLiteTensor* scratch_tensor = &context->tensors[node->temporaries->data[0]];
+ scratch_tensor->type = input->type;
+ scratch_tensor->allocation_type = kTfLiteArenaRw;
+ TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, scratch_tensor,
+ scratch_size_array));
+
+ return kTfLiteOk;
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+ auto* params = reinterpret_cast<TfLiteSVDFParams*>(node->builtin_data);
+
+ TfLiteTensor* input = &context->tensors[node->inputs->data[kInputTensor]];
+ TfLiteTensor* weights_feature =
+ &context->tensors[node->inputs->data[kWeightsFeatureTensor]];
+ TfLiteTensor* weights_time =
+ &context->tensors[node->inputs->data[kWeightsTimeTensor]];
+
+ TfLiteTensor* state = &context->tensors[node->outputs->data[kStateTensor]];
+ TfLiteTensor* output = &context->tensors[node->outputs->data[kOutputTensor]];
+ TfLiteTensor* scratch = &context->tensors[node->temporaries->data[0]];
+
+ TfLiteTensor* bias = GetOptionalInputTensor(context, node, kBiasTensor);
+
+ const int rank = params->rank;
+ const int batch_size = input->dims->data[0];
+ const int input_size = input->dims->data[1];
+ const int num_filters = weights_feature->dims->data[0];
+ const int num_units = num_filters / rank;
+ const int memory_size = weights_time->dims->data[1];
+
+ // Clear the current cycle's activation slot (last element of each row).
+ // TODO(ghodrat): Add a test that initializes this slot with invalid values
+ // and makes sure the op still passes.
+ for (int b = 0; b < batch_size; b++) {
+ float* state_ptr_batch = state->data.f + b * memory_size * num_filters;
+ for (int c = 0; c < num_filters; c++) {
+ float* state_ptr = state_ptr_batch + c * memory_size;
+ state_ptr[memory_size - 1] = 0.0;
+ }
+ }
+
+ // Compute conv1d(inputs, weights_feature).
+ // The current cycle's activation is written into the last element of each
+ // filter's state row, by starting at state->data.f[memory_size - 1] and
+ // using a stride equal to memory_size.
+ tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+ weights_feature->data.f, num_filters, input_size, input->data.f,
+ batch_size, &state->data.f[memory_size - 1], memory_size);
+
+ // Compute matmul(state, weights_time).
+ // Each filter's state row (memory_size values) is dotted with the
+ // corresponding row of weights_time; the num_filters intermediate results
+ // are written to the scratch tensor before being reduced to num_units.
+ for (int b = 0; b < batch_size; b++) {
+ float* state_ptr_batch = state->data.f + b * memory_size * num_filters;
+ float* scratch_ptr_batch = scratch->data.f + b * num_filters;
+ tensor_utils::BatchVectorBatchVectorDotProduct(
+ weights_time->data.f, state_ptr_batch, memory_size, num_filters,
+ scratch_ptr_batch, /*result_stride=*/1);
+ }
+
+ // Initialize output with bias if provided.
+ if (bias) {
+ tensor_utils::VectorBatchVectorAssign(bias->data.f, num_units, batch_size,
+ output->data.f);
+ } else {
+ tensor_utils::ZeroVector(output->data.f, batch_size * num_units);
+ }
+
+ // Reduction sum.
+ // TODO(ghodrat): Consider not reusing the state for the temporary output;
+ // that way ReductionSum operates on a row vector instead of a column one.
+ for (int b = 0; b < batch_size; b++) {
+ float* output_ptr_batch = output->data.f + b * num_units;
+ float* scratch_ptr_batch = scratch->data.f + b * num_filters;
+ tensor_utils::ReductionSumVector(scratch_ptr_batch, output_ptr_batch,
+ num_units, rank);
+ }
+
+ // Apply activation.
+ for (int b = 0; b < batch_size; b++) {
+ float* output_ptr_batch = output->data.f + b * num_units;
+ tensor_utils::ApplyActivationToVector(output_ptr_batch, num_units,
+ params->activation, output_ptr_batch);
+ }
+
+ // Shift each filter's state row left by one, dropping the oldest activation.
+ for (int b = 0; b < batch_size; b++) {
+ float* state_ptr_batch = state->data.f + b * memory_size * num_filters;
+ for (int f = 0; f < num_filters; f++) {
+ tensor_utils::VectorShiftLeft(state_ptr_batch, memory_size,
+ /*shift_value=*/0.0);
+ state_ptr_batch += memory_size;
+ }
+ }
+ return kTfLiteOk;
+}
+
+} // namespace svdf
+
+TfLiteRegistration* Register_SVDF() {
+ static TfLiteRegistration r = {svdf::Init, svdf::Free, svdf::Prepare,
+ svdf::Eval};
+ return &r;
+}
+
+} // namespace builtin
+} // namespace ops
+} // namespace tflite
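
Putting the pieces of Eval together for one batch item: the feature projection appends a new activation to each filter's memory row, each row is dotted with weights_time, and groups of `rank` filter outputs are summed into num_units outputs (bias and the output activation are applied afterwards). A minimal loop-level sketch of that dataflow, with illustrative names and without the tensor_utils kernels or the bias/activation steps:

#include <vector>

// One SVDF step for a single batch item, written as plain loops.
// state is laid out as [num_filters][memory_size], newest activation last.
std::vector<float> SvdfStep(const std::vector<float>& input,            // [input_size]
                            const std::vector<float>& weights_feature,  // [num_filters * input_size]
                            const std::vector<float>& weights_time,     // [num_filters * memory_size]
                            std::vector<float>& state, int rank,
                            int memory_size) {
  const int input_size = static_cast<int>(input.size());
  const int num_filters = static_cast<int>(weights_feature.size()) / input_size;
  const int num_units = num_filters / rank;
  for (int f = 0; f < num_filters; ++f) {
    // Project the input through this filter's feature weights.
    float activation = 0.f;
    for (int i = 0; i < input_size; ++i)
      activation += weights_feature[f * input_size + i] * input[i];
    // Age the filter's memory and append the new activation at the end.
    float* mem = &state[f * memory_size];
    for (int t = 0; t + 1 < memory_size; ++t) mem[t] = mem[t + 1];
    mem[memory_size - 1] = activation;
  }
  std::vector<float> output(num_units, 0.f);
  for (int f = 0; f < num_filters; ++f) {
    // Dot each filter's memory with its time weights, then reduce groups of
    // `rank` filters into one output unit.
    float dot = 0.f;
    for (int t = 0; t < memory_size; ++t)
      dot += weights_time[f * memory_size + t] * state[f * memory_size + t];
    output[f / rank] += dot;
  }
  return output;
}

The state contents at the dot-product step are the same as in the kernel, which writes the new activation into the last slot first and shifts after producing the output.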
diff --git a/tensorflow/contrib/lite/kernels/svdf_test.cc b/tensorflow/contrib/lite/kernels/svdf_test.cc
new file mode 100644
index 0000000000..d956025e9d
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/svdf_test.cc
@@ -0,0 +1,312 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// Unit test for TFLite SVDF op.
+
+#include <vector>
+#include <iomanip>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+static float svdf_input[] = {
+ 0.12609188, -0.46347019, -0.89598465,
+ 0.35867718, 0.36897406, 0.73463392,
+
+ 0.14278367, -1.64410412, -0.75222826,
+ -0.57290924, 0.12729003, 0.7567004,
+
+ 0.49837467, 0.19278903, 0.26584083,
+ 0.17660543, 0.52949083, -0.77931279,
+
+ -0.11186574, 0.13164264, -0.05349274,
+ -0.72674477, -0.5683046, 0.55900657,
+
+ -0.68892461, 0.37783599, 0.18263303,
+ -0.63690937, 0.44483393, -0.71817774,
+
+ -0.81299269, -0.86831826, 1.43940818,
+ -0.95760226, 1.82078898, 0.71135032,
+
+ -1.45006323, -0.82251364, -1.69082689,
+ -1.65087092, -1.89238167, 1.54172635,
+
+ 0.03966608, -0.24936394, -0.77526885,
+ 2.06740379, -1.51439476, 1.43768692,
+
+ 0.11771342, -0.23761693, -0.65898693,
+ 0.31088525, -1.55601168, -0.87661445,
+
+ -0.89477462, 1.67204106, -0.53235275,
+ -0.6230064, 0.29819036, 1.06939757,
+};
+
+static float svdf_golden_output_rank_1[] = {
+ 0.014899, -0.0517661, -0.143725, -0.00271883,
+ -0.03004015, 0.09565311, 0.1587342, 0.00784263,
+
+ 0.068281, -0.162217, -0.152268, 0.00323521,
+ 0.01582633, 0.03858774, -0.03001583, -0.02671271,
+
+ -0.0317821, -0.0333089, 0.0609602, 0.0333759,
+ -0.01432795, 0.05524484, 0.1101355, -0.02382665,
+
+ -0.00623099, -0.077701, -0.391193, -0.0136691,
+ -0.02333033, 0.02293761, 0.12338032, 0.04326871,
+
+ 0.201551, -0.164607, -0.179462, -0.0592739,
+ 0.01064911, -0.17503069, 0.07821996, -0.00224009,
+
+ 0.0886511, -0.0875401, -0.269283, 0.0281379,
+ -0.02282338, 0.09741908, 0.32973239, 0.12281385,
+
+ -0.201174, -0.586145, -0.628624, -0.0330412,
+ 0.24780814, -0.39304617, -0.22473189, 0.02589256,
+
+ -0.0839096, -0.299329, 0.108746, 0.109808,
+ 0.10084175, -0.06416984, 0.28936723, 0.0026358,
+
+ 0.419114, -0.237824, -0.422627, 0.175115,
+ -0.2314795, -0.18584411, -0.4228974, -0.12928449,
+
+ 0.36726, -0.522303, -0.456502, -0.175475,
+ 0.17012937, -0.34447709, 0.38505614, -0.28158101,
+};
+
+static float svdf_golden_output_rank_2[] = {
+ -0.09623547, -0.10193135, 0.11083051, -0.0347917,
+ 0.1141196, 0.12965347, -0.12652366, 0.01007236,
+
+ -0.16396809, -0.21247184, 0.11259045, -0.04156673,
+ 0.10132131, -0.06143532, -0.00924693, 0.10084561,
+
+ 0.01257364, 0.0506071, -0.19287863, -0.07162561,
+ -0.02033747, 0.22673416, 0.15487903, 0.02525555,
+
+ -0.1411963, -0.37054959, 0.01774767, 0.05867489,
+ 0.09607603, -0.0141301, -0.08995658, 0.12867066,
+
+ -0.27142537, -0.16955489, 0.18521598, -0.12528358,
+ 0.00331409, 0.11167502, 0.02218599, -0.07309391,
+
+ 0.09593632, -0.28361851, -0.0773851, 0.17199151,
+ -0.00075242, 0.33691186, -0.1536046, 0.16572715,
+
+ -0.27916506, -0.27626723, 0.42615682, 0.3225764,
+ -0.37472126, -0.55655634, -0.05013514, 0.289112,
+
+ -0.24418658, 0.07540751, -0.1940318, -0.08911639,
+ 0.00732617, 0.46737891, 0.26449674, 0.24888524,
+
+ -0.17225097, -0.54660404, -0.38795233, 0.08389944,
+ 0.07736043, -0.28260678, 0.15666828, 1.14949894,
+
+ -0.57454878, -0.64704704, 0.73235172, -0.34616736,
+ 0.21120001, -0.22927976, 0.02455296, -0.35906726,
+};
+
+// Derived class of SingleOpModel, which is used to test SVDF TFLite op.
+class SVDFOpModel : public SingleOpModel {
+ public:
+ SVDFOpModel(int batches, int units, int input_size, int memory_size, int rank)
+ : batches_(batches),
+ units_(units),
+ input_size_(input_size),
+ memory_size_(memory_size),
+ rank_(rank) {
+ input_ = AddInput(TensorType_FLOAT32);
+ weights_feature_ = AddInput(TensorType_FLOAT32);
+ weights_time_ = AddInput(TensorType_FLOAT32);
+ bias_ = AddNullInput();
+ state_ = AddOutput(TensorType_FLOAT32);
+ output_ = AddOutput(TensorType_FLOAT32);
+ SetBuiltinOp(
+ BuiltinOperator_SVDF, BuiltinOptions_SVDFOptions,
+ CreateSVDFOptions(builder_, rank, ActivationFunctionType_NONE).Union());
+ BuildInterpreter({
+ {batches_, input_size_}, // Input tensor
+ {units_ * rank, input_size_}, // weights_feature tensor
+ {units_ * rank, memory_size_}, // weights_time tensor
+ {units_} // bias tensor
+ });
+ }
+
+ // Populates the weights_feature tensor.
+ void SetWeightsFeature(std::initializer_list<float> f) {
+ PopulateTensor(weights_feature_, f);
+ }
+
+ // Populates the weights_time tensor.
+ void SetWeightsTime(std::initializer_list<float> f) {
+ PopulateTensor(weights_time_, f);
+ }
+
+ // Populates the input tensor.
+ void SetInput(int offset, float* begin, float* end) {
+ PopulateTensor(input_, offset, begin, end);
+ }
+
+ // Resets the state of SVDF op by filling it with 0's.
+ void ResetState() {
+ const int zero_buffer_size = rank_ * units_ * batches_ * memory_size_;
+ std::unique_ptr<float[]> zero_buffer(new float[zero_buffer_size]);
+ memset(zero_buffer.get(), 0, zero_buffer_size * sizeof(float));
+ PopulateTensor(state_, 0, zero_buffer.get(),
+ zero_buffer.get() + zero_buffer_size);
+ }
+
+ // Extracts the output tensor from the SVDF op.
+ std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+
+ int input_size() { return input_size_; }
+ int num_units() { return units_; }
+ int num_batches() { return batches_; }
+
+ private:
+ int input_;
+ int weights_feature_;
+ int weights_time_;
+ int bias_;
+ int state_;
+ int output_;
+
+ int batches_;
+ int units_;
+ int input_size_;
+ int memory_size_;
+ int rank_;
+};
+
+TEST(SVDFOpTest, BlackBoxTestRank1) {
+ SVDFOpModel svdf(/*batches=*/2, /*units=*/4, /*input_size=*/3,
+ /*memory_size=*/10, /*rank=*/1);
+ svdf.SetWeightsFeature({-0.31930989, -0.36118156, 0.0079667, 0.37613347,
+ 0.22197971, 0.12416199, 0.27901134, 0.27557442,
+ 0.3905206, -0.36137494, -0.06634006, -0.10640851});
+
+ svdf.SetWeightsTime(
+ {-0.31930989, 0.37613347, 0.27901134, -0.36137494, -0.36118156,
+ 0.22197971, 0.27557442, -0.06634006, 0.0079667, 0.12416199,
+
+ 0.3905206, -0.10640851, -0.0976817, 0.15294972, 0.39635518,
+ -0.02702999, 0.39296314, 0.15785322, 0.21931258, 0.31053296,
+
+ -0.36916667, 0.38031587, -0.21580373, 0.27072677, 0.23622236,
+ 0.34936687, 0.18174365, 0.35907319, -0.17493086, 0.324846,
+
+ -0.10781813, 0.27201805, 0.14324132, -0.23681851, -0.27115166,
+ -0.01580888, -0.14943552, 0.15465137, 0.09784451, -0.0337657});
+
+ svdf.ResetState();
+ const int svdf_num_batches = svdf.num_batches();
+ const int svdf_input_size = svdf.input_size();
+ const int svdf_num_units = svdf.num_units();
+ const int input_sequence_size =
+ sizeof(svdf_input) / sizeof(float) / (svdf_input_size * svdf_num_batches);
+ // Going over each input batch, setting the input tensor, invoking the SVDF op
+ // and checking the output with the expected golden values.
+ for (int i = 0; i < input_sequence_size; i++) {
+ float* batch_start = svdf_input + i * svdf_input_size * svdf_num_batches;
+ float* batch_end = batch_start + svdf_input_size * svdf_num_batches;
+ svdf.SetInput(0, batch_start, batch_end);
+
+ svdf.Invoke();
+
+ float* golden_start =
+ svdf_golden_output_rank_1 + i * svdf_num_units * svdf_num_batches;
+ float* golden_end = golden_start + svdf_num_units * svdf_num_batches;
+ std::vector<float> expected;
+ expected.insert(expected.end(), golden_start, golden_end);
+
+ EXPECT_THAT(svdf.GetOutput(), ElementsAreArray(ArrayFloatNear(expected)));
+ }
+}
+
+TEST(SVDFOpTest, BlackBoxTestRank2) {
+ SVDFOpModel svdf(/*batches=*/2, /*units=*/4, /*input_size=*/3,
+ /*memory_size=*/10, /*rank=*/2);
+ svdf.SetWeightsFeature({-0.31930989, 0.0079667, 0.39296314, 0.37613347,
+ 0.12416199, 0.15785322, 0.27901134, 0.3905206,
+ 0.21931258, -0.36137494, -0.10640851, 0.31053296,
+ -0.36118156, -0.0976817, -0.36916667, 0.22197971,
+ 0.15294972, 0.38031587, 0.27557442, 0.39635518,
+ -0.21580373, -0.06634006, -0.02702999, 0.27072677});
+
+ svdf.SetWeightsTime(
+ {-0.31930989, 0.37613347, 0.27901134, -0.36137494, -0.36118156,
+ 0.22197971, 0.27557442, -0.06634006, 0.0079667, 0.12416199,
+
+ 0.3905206, -0.10640851, -0.0976817, 0.15294972, 0.39635518,
+ -0.02702999, 0.39296314, 0.15785322, 0.21931258, 0.31053296,
+
+ -0.36916667, 0.38031587, -0.21580373, 0.27072677, 0.23622236,
+ 0.34936687, 0.18174365, 0.35907319, -0.17493086, 0.324846,
+
+ -0.10781813, 0.27201805, 0.14324132, -0.23681851, -0.27115166,
+ -0.01580888, -0.14943552, 0.15465137, 0.09784451, -0.0337657,
+
+ -0.14884081, 0.19931212, -0.36002168, 0.34663299, -0.11405486,
+ 0.12672701, 0.39463779, -0.07886535, -0.06384811, 0.08249187,
+
+ -0.26816407, -0.19905911, 0.29211238, 0.31264046, -0.28664589,
+ 0.05698794, 0.11613581, 0.14078894, 0.02187902, -0.21781836,
+
+ -0.15567942, 0.08693647, -0.38256618, 0.36580828, -0.22922277,
+ -0.0226903, 0.12878349, -0.28122205, -0.10850525, -0.11955214,
+
+ 0.27179423, -0.04710215, 0.31069002, 0.22672787, 0.09580326,
+ 0.08682203, 0.1258215, 0.1851041, 0.29228821, 0.12366763});
+
+ svdf.ResetState();
+ const int svdf_num_batches = svdf.num_batches();
+ const int svdf_input_size = svdf.input_size();
+ const int svdf_num_units = svdf.num_units();
+ const int input_sequence_size =
+ sizeof(svdf_input) / sizeof(float) / (svdf_input_size * svdf_num_batches);
+ // Going over each input batch, setting the input tensor, invoking the SVDF op
+ // and checking the output with the expected golden values.
+ for (int i = 0; i < input_sequence_size; i++) {
+ float* batch_start = svdf_input + i * svdf_input_size * svdf_num_batches;
+ float* batch_end = batch_start + svdf_input_size * svdf_num_batches;
+ svdf.SetInput(0, batch_start, batch_end);
+
+ svdf.Invoke();
+
+ float* golden_start =
+ svdf_golden_output_rank_2 + i * svdf_num_units * svdf_num_batches;
+ float* golden_end = golden_start + svdf_num_units * svdf_num_batches;
+ std::vector<float> expected;
+ expected.insert(expected.end(), golden_start, golden_end);
+
+ EXPECT_THAT(svdf.GetOutput(), ElementsAreArray(ArrayFloatNear(expected)));
+ }
+}
+
+} // namespace
+} // namespace tflite
+
+int main(int argc, char** argv) {
+ // On Linux, add: tflite::LogToStderr();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/test_util.cc b/tensorflow/contrib/lite/kernels/test_util.cc
new file mode 100644
index 0000000000..f716ba8741
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/test_util.cc
@@ -0,0 +1,183 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+
+#include "tensorflow/contrib/lite/version.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace tflite {
+
+using ::testing::FloatNear;
+using ::testing::Matcher;
+
+namespace {
+template <typename T>
+std::pair<float, int32_t> QuantizationParams(float f_min, float f_max) {
+ // These are required by many quantized operations.
+ CHECK_LE(f_min, 0);
+ CHECK_GE(f_max, 0);
+ T q_min = std::numeric_limits<T>::min();
+ T q_max = std::numeric_limits<T>::max();
+ float range = q_max - q_min;
+ float scale = (f_max - f_min) / range;
+ int32_t zero_point = std::min(
+ q_max,
+ std::max(q_min, static_cast<T>(std::round(q_min - f_min / scale))));
+ return {scale, zero_point};
+}
+} // namespace
+
+std::vector<Matcher<float>> ArrayFloatNear(const std::vector<float>& values,
+ float max_abs_error) {
+ std::vector<Matcher<float>> matchers;
+ matchers.reserve(values.size());
+ for (const float& v : values) {
+ matchers.emplace_back(FloatNear(v, max_abs_error));
+ }
+ return matchers;
+}
+
+int SingleOpModel::AddTensor(TensorData t) {
+ int id = tensors_.size();
+
+ // This is slightly different depending on whether we are adding a
+ // quantized or a regular tensor.
+ bool is_quantized = (t.min != 0 || t.max != 0 || t.scale != 0);
+
+ flatbuffers::Offset<QuantizationParameters> q_params = 0;
+
+ if (is_quantized) {
+ if (t.min != 0 || t.max != 0) {
+ if (t.type == TensorType_UINT8) {
+ std::tie(t.scale, t.zero_point) =
+ QuantizationParams<uint8_t>(t.min, t.max);
+ } else if (t.type == TensorType_INT32) {
+ std::tie(t.scale, t.zero_point) =
+ QuantizationParams<int32_t>(t.min, t.max);
+ } else {
+ LOG(FATAL) << "No support for the requested quantized type";
+ }
+ t.min = 0;
+ t.max = 0;
+ }
+
+ q_params = CreateQuantizationParameters(
+ builder_, /*min=*/0, /*max=*/0, builder_.CreateVector<float>({t.scale}),
+ builder_.CreateVector<int64_t>({t.zero_point}));
+ }
+
+ tensors_.push_back(CreateTensor(builder_, builder_.CreateVector<int>({}),
+ t.type, /*buffer=*/0,
+ /*name=*/0, q_params));
+
+ tensor_data_[id] = t;
+
+ return id;
+}
+
+int SingleOpModel::AddInput(const TensorData& t) {
+ int id = AddTensor(t);
+ inputs_.push_back(id);
+ return id;
+}
+
+int SingleOpModel::AddNullInput() {
+ int id = kOptionalTensor;
+ inputs_.push_back(id);
+ return id;
+}
+
+int SingleOpModel::AddOutput(const TensorData& t) {
+ int id = AddTensor(t);
+ outputs_.push_back(id);
+ return id;
+}
+
+void SingleOpModel::SetBuiltinOp(BuiltinOperator type,
+ BuiltinOptions builtin_options_type,
+ flatbuffers::Offset<void> builtin_options) {
+ opcodes_.push_back(CreateOperatorCode(builder_, type, 0));
+ operators_.push_back(CreateOperator(
+ builder_, /*opcode_index=*/0, builder_.CreateVector<int32_t>(inputs_),
+ builder_.CreateVector<int32_t>(outputs_), builtin_options_type,
+ builtin_options,
+ /*custom_options=*/0, CustomOptionsFormat_FLEXBUFFERS));
+}
+
+void SingleOpModel::SetCustomOp(
+ const string& name, const std::vector<uint8_t>& custom_option,
+ const std::function<TfLiteRegistration*()>& registration) {
+ custom_registrations_[name] = registration;
+ opcodes_.push_back(
+ CreateOperatorCodeDirect(builder_, BuiltinOperator_CUSTOM, name.data()));
+ operators_.push_back(CreateOperator(
+ builder_, /*opcode_index=*/0, builder_.CreateVector<int32_t>(inputs_),
+ builder_.CreateVector<int32_t>(outputs_), BuiltinOptions_NONE, 0,
+ builder_.CreateVector<uint8_t>(custom_option),
+ CustomOptionsFormat_FLEXBUFFERS));
+}
+
+void SingleOpModel::BuildInterpreter(
+ std::vector<std::vector<int>> input_shapes) {
+ auto opcodes = builder_.CreateVector(opcodes_);
+ auto operators = builder_.CreateVector(operators_);
+ auto tensors = builder_.CreateVector(tensors_);
+ auto inputs = builder_.CreateVector<int32_t>(inputs_);
+ auto outputs = builder_.CreateVector<int32_t>(outputs_);
+ // Create a single subgraph
+ std::vector<flatbuffers::Offset<SubGraph>> subgraphs;
+ auto subgraph = CreateSubGraph(builder_, tensors, inputs, outputs, operators);
+ subgraphs.push_back(subgraph);
+ auto subgraphs_flatbuffer = builder_.CreateVector(subgraphs);
+
+ std::vector<flatbuffers::Offset<Buffer>> buffers_vec;
+ auto buffers = builder_.CreateVector(buffers_vec);
+ auto description = builder_.CreateString("programmatic model");
+ builder_.Finish(CreateModel(builder_, TFLITE_SCHEMA_VERSION, opcodes,
+ subgraphs_flatbuffer, description, buffers));
+
+ auto* model = GetModel(builder_.GetBufferPointer());
+
+ ops::builtin::BuiltinOpResolver builtins;
+ for (const auto& reg : custom_registrations_) {
+ builtins.AddCustom(reg.first.data(), reg.second());
+ }
+ InterpreterBuilder(model, builtins)(&interpreter_);
+
+ CHECK(interpreter_ != nullptr);
+
+ int i = 0;
+ for (const auto& shape : input_shapes) {
+ int input_idx = interpreter_->inputs()[i++];
+ if (input_idx == kOptionalTensor) continue;
+ CHECK(interpreter_->ResizeInputTensor(input_idx, shape) == kTfLiteOk);
+ }
+ CHECK(interpreter_->AllocateTensors() == kTfLiteOk)
+ << "Cannot allocate tensors";
+}
+
+void SingleOpModel::Invoke() { CHECK(interpreter_->Invoke() == kTfLiteOk); }
+
+int32_t SingleOpModel::GetTensorSize(int index) const {
+ TfLiteTensor* t = interpreter_->tensor(index);
+ CHECK(t);
+ int total_size = 1;
+ for (int i = 0; i < t->dims->size; ++i) {
+ total_size *= t->dims->data[i];
+ }
+ return total_size;
+}
+
+} // namespace tflite
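
A worked example of the QuantizationParams<uint8_t> path above (the endpoints are just an illustrative choice): for (f_min, f_max) = (-63.5, 64), range = 255 and scale = 127.5 / 255 = 0.5, so zero_point = round(0 - (-63.5) / 0.5) = 127. With those parameters, the Quantize/Dequantize helpers declared in test_util.h map 0.0 -> 127 -> 0.0 and 1.0 -> 129 -> 1.0 exactly.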
diff --git a/tensorflow/contrib/lite/kernels/test_util.h b/tensorflow/contrib/lite/kernels/test_util.h
new file mode 100644
index 0000000000..e68e494661
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/test_util.h
@@ -0,0 +1,202 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_TEST_UTIL_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_TEST_UTIL_H_
+
+#include <vector>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/contrib/lite/string_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace tflite {
+
+inline void LogToStderr() {
+#ifdef PLATFORM_GOOGLE
+ FLAGS_logtostderr = true;
+#endif
+}
+
+// A gmock matcher that checks that the elements of a float vector are within
+// a given tolerance of the expected values.
+std::vector<::testing::Matcher<float>> ArrayFloatNear(
+ const std::vector<float>& values, float max_abs_error = 1e-5);
+
+template <typename T>
+inline std::vector<T> Quantize(const std::vector<float>& data, float scale,
+ int32_t zero_point) {
+ std::vector<T> q;
+ for (float f : data) {
+ q.push_back(std::max(
+ std::numeric_limits<T>::min(),
+ std::min(std::numeric_limits<T>::max(),
+ static_cast<T>(std::round(zero_point + (f / scale))))));
+ }
+ return q;
+}
+
+template <typename T>
+inline std::vector<float> Dequantize(const std::vector<T>& data, float scale,
+ int32_t zero_point) {
+ std::vector<float> f;
+ for (T q : data) {
+ f.push_back(scale * (q - zero_point));
+ }
+ return f;
+}
+
+// A test model that contains a single operator. All operator inputs and
+// output are external to the model, so the tests can directly access them.
+// Typical usage:
+// SingleOpModel m;
+// int a = m.AddInput({TensorType_FLOAT32, a_shape});
+// int b = m.AddInput({TensorType_FLOAT32, b_shape});
+// int c = m.AddOutput({TensorType_FLOAT32, {}});
+// m.SetBuiltinOp(...);
+// m.BuildInterpreter({GetShape(a), GetShape(b)});
+// m.PopulateTensor(a, {...});
+// m.PopulateTensor(b, {...});
+// m.Invoke();
+// EXPECT_THAT(m.ExtractVector<float>(c), ArrayFloatNear({...}));
+//
+
+// A helper struct to construct test tensors. This is particularly useful for
+// quantized tensors, which must have their scale and zero_point defined before
+// the actual data is known. This mimics what happens in practice: quantization
+// parameters are calculated during training.
+struct TensorData {
+ TensorType type;
+ std::vector<int> shape;
+ float min;
+ float max;
+ float scale;
+ int32_t zero_point;
+};
+
+class SingleOpModel {
+ public:
+ SingleOpModel() {}
+ ~SingleOpModel() {}
+
+ // Copying or assignment is disallowed to simplify ownership semantics.
+ SingleOpModel(const SingleOpModel&) = delete;
+ SingleOpModel& operator=(const SingleOpModel&) = delete;
+
+ // Add a TensorType input tensor and return its index.
+ int AddInput(TensorType type) { return AddInput(TensorData{type}); }
+ int AddInput(const TensorData& t);
+
+ // Add a null input tensor (optional input) and return kOptionalTensor.
+ int AddNullInput();
+
+ // Add a TensorType output tensor and return its index.
+ int AddOutput(TensorType type) { return AddOutput(TensorData{type}); }
+ int AddOutput(const TensorData& t);
+
+ template <typename T>
+ void QuantizeAndPopulate(int index, std::initializer_list<float> data) {
+ TfLiteTensor* t = interpreter_->tensor(index);
+ auto q = Quantize<T>(data, t->params.scale, t->params.zero_point);
+ PopulateTensor(index, 0, q.data(), q.data() + q.size());
+ }
+
+ const std::vector<int>& GetShape(int id) { return tensor_data_.at(id).shape; }
+
+ float GetScale(int id) { return tensor_data_.at(id).scale; }
+ int32_t GetZeroPoint(int id) { return tensor_data_.at(id).zero_point; }
+
+ // Define the operator in this model.
+ void SetBuiltinOp(BuiltinOperator type, BuiltinOptions builtin_options_type,
+ flatbuffers::Offset<void> builtin_options);
+ void SetCustomOp(const string& name,
+ const std::vector<uint8_t>& custom_option,
+ const std::function<TfLiteRegistration*()>& registration);
+
+ // Build the interpreter for this model. Also, resize and allocate all
+ // tensors given the shapes of the inputs.
+ void BuildInterpreter(std::vector<std::vector<int>> input_shapes);
+
+ void Invoke();
+
+ void PopulateStringTensor(int index, const std::vector<string>& content) {
+ auto tensor = interpreter_->tensor(index);
+ DynamicBuffer buf;
+ for (const string& s : content) {
+ buf.AddString(s.data(), s.length());
+ }
+ buf.WriteToTensor(tensor);
+ }
+
+ // Populate the tensor given its index.
+ template <typename T>
+ void PopulateTensor(int index, std::initializer_list<T> data) {
+ T* v = interpreter_->typed_tensor<T>(index);
+ CHECK(v) << "No tensor with index '" << index << "'.";
+ for (T f : data) {
+ *v = f;
+ ++v;
+ }
+ }
+
+ // Partially populate the tensor, starting at the given offset.
+ template <typename T>
+ void PopulateTensor(int index, int offset, T* begin, T* end) {
+ T* v = interpreter_->typed_tensor<T>(index);
+ memcpy(v + offset, begin, (end - begin) * sizeof(T));
+ }
+
+ // Return a vector with the flattened contents of a tensor.
+ template <typename T>
+ std::vector<T> ExtractVector(int index) {
+ T* v = interpreter_->typed_tensor<T>(index);
+ CHECK(v);
+ return std::vector<T>(v, v + GetTensorSize(index));
+ }
+
+ std::vector<int> GetTensorShape(int index) {
+ std::vector<int> result;
+ TfLiteTensor* t = interpreter_->tensor(index);
+ for (int i = 0; i < t->dims->size; ++i) {
+ result.push_back(t->dims->data[i]);
+ }
+ return result;
+ }
+
+ protected:
+ int32_t GetTensorSize(int index) const;
+
+ flatbuffers::FlatBufferBuilder builder_;
+ std::unique_ptr<tflite::Interpreter> interpreter_;
+
+ private:
+ int AddTensor(TensorData t);
+
+ std::map<int, TensorData> tensor_data_;
+ std::vector<int32_t> inputs_;
+ std::vector<int32_t> outputs_;
+ std::vector<flatbuffers::Offset<Tensor>> tensors_;
+ std::vector<flatbuffers::Offset<OperatorCode>> opcodes_;
+ std::vector<flatbuffers::Offset<Operator>> operators_;
+ std::map<string, std::function<TfLiteRegistration*()>> custom_registrations_;
+};
+
+} // namespace tflite
+
+#endif // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_TEST_UTIL_H_