Diffstat (limited to 'tensorflow/contrib/lite/kernels')
185 files changed, 37209 insertions, 14456 deletions
diff --git a/tensorflow/contrib/lite/kernels/BUILD b/tensorflow/contrib/lite/kernels/BUILD index b7291dd379..d2d8073abd 100644 --- a/tensorflow/contrib/lite/kernels/BUILD +++ b/tensorflow/contrib/lite/kernels/BUILD @@ -6,13 +6,29 @@ licenses(["notice"]) # Apache 2.0 load("//tensorflow/contrib/lite:build_def.bzl", "tflite_copts") load("//tensorflow/contrib/lite:special_rules.bzl", "tflite_portable_test_suite") -load("//tensorflow:tensorflow.bzl", "tf_cc_test") +load("//tensorflow:tensorflow.bzl", "tf_cc_test", "tf_opts_nortti_if_android") + +# Suppress warnings that are introduced by Eigen Tensor. +EXTRA_EIGEN_COPTS = select({ + "//tensorflow:ios": [ + "-Wno-error=invalid-partial-specialization", + "-Wno-error=reorder", + ], + "//tensorflow:windows": [ + "/DEIGEN_HAS_C99_MATH", + "/DEIGEN_AVOID_STL_ARRAY", + ], + "//conditions:default": ["-Wno-error=reorder"], +}) tf_cc_test( name = "optional_tensor_test", size = "small", srcs = ["optional_tensor_test.cc"], - tags = ["tflite_not_portable_ios"], + tags = [ + "no_oss", + "tflite_not_portable_ios", + ], deps = [ ":builtin_ops", "//tensorflow/contrib/lite:framework", @@ -46,11 +62,12 @@ cc_library( hdrs = [ "eigen_support.h", ], - copts = tflite_copts(), + copts = tflite_copts() + EXTRA_EIGEN_COPTS, deps = [ ":op_macros", - "//tensorflow/contrib/lite:context", - "//third_party/eigen3", + "//tensorflow/contrib/lite:arena_planner", + "//tensorflow/contrib/lite/c:c_api_internal", + "//tensorflow/contrib/lite/kernels/internal:optimized", ], ) @@ -65,7 +82,7 @@ cc_library( copts = tflite_copts(), deps = [ ":op_macros", - "//tensorflow/contrib/lite:context", + "//tensorflow/contrib/lite/c:c_api_internal", "@gemmlowp", ], ) @@ -76,7 +93,7 @@ cc_library( "activation_functor.h", ], deps = [ - "//tensorflow/contrib/lite:builtin_op_data", + "//tensorflow/contrib/lite/c:c_api_internal", ], ) @@ -96,9 +113,9 @@ cc_library( "kernel_util.h", ], deps = [ - "//tensorflow/contrib/lite:builtin_op_data", - "//tensorflow/contrib/lite:context", + "//tensorflow/contrib/lite/c:c_api_internal", "//tensorflow/contrib/lite/kernels/internal:round", + "//tensorflow/contrib/lite/kernels/internal:types", ], ) @@ -106,7 +123,10 @@ tf_cc_test( name = "kernel_util_test", size = "small", srcs = ["kernel_util_test.cc"], - tags = ["tflite_not_portable_ios"], + tags = [ + "no_oss", + "tflite_not_portable_ios", + ], deps = [ ":kernel_util", "//tensorflow/contrib/lite/testing:util", @@ -118,6 +138,7 @@ tf_cc_test( name = "test_util_test", size = "small", srcs = ["test_util_test.cc"], + tags = ["no_oss"], deps = [ ":test_util", "//tensorflow/contrib/lite/testing:util", @@ -126,11 +147,20 @@ tf_cc_test( ) cc_library( - name = "builtin_ops", + name = "padding", + srcs = [], + hdrs = ["padding.h"], + deps = [ + "//tensorflow/contrib/lite/c:c_api_internal", + ], +) + +cc_library( + name = "builtin_op_kernels", srcs = [ "activations.cc", "add.cc", - "arg_max.cc", + "arg_min_max.cc", "audio_spectrogram.cc", "basic_rnn.cc", "batch_to_space_nd.cc", @@ -142,84 +172,120 @@ cc_library( "conv.cc", "depthwise_conv.cc", "dequantize.cc", + "detection_postprocess.cc", "div.cc", "elementwise.cc", "embedding_lookup.cc", "embedding_lookup_sparse.cc", "exp.cc", + "expand_dims.cc", + "fake_quant.cc", "floor.cc", + "floor_div.cc", "fully_connected.cc", "gather.cc", "hashtable_lookup.cc", "l2norm.cc", + "layer_norm_lstm.cc", "local_response_norm.cc", + "logical.cc", "lsh_projection.cc", "lstm.cc", "maximum_minimum.cc", - "mean.cc", "mfcc.cc", "mul.cc", "neg.cc", + "one_hot.cc", + "pack.cc", "pad.cc", 
"pooling.cc", - "register.cc", + "pow.cc", + "reduce.cc", + "relu1.cc", "reshape.cc", "resize_bilinear.cc", "select.cc", + "shape.cc", "skip_gram.cc", "slice.cc", "space_to_batch_nd.cc", "space_to_depth.cc", + "sparse_output_fully_connected.cc", + "sparse_to_dense.cc", "split.cc", "squeeze.cc", "strided_slice.cc", "sub.cc", "svdf.cc", + "tile.cc", "topk_v2.cc", "transpose.cc", "transpose_conv.cc", "unidirectional_sequence_lstm.cc", "unidirectional_sequence_rnn.cc", + "unpack.cc", + "zeros_like.cc", ], hdrs = [ - "padding.h", - "register.h", ], - # Suppress warnings that are introduced by Eigen Tensor. - copts = tflite_copts() + [ - "-Wno-error=reorder", - ] + select({ - "//tensorflow:ios": ["-Wno-error=invalid-partial-specialization"], - "//conditions:default": [ - ], - }), + copts = tflite_copts() + tf_opts_nortti_if_android() + EXTRA_EIGEN_COPTS, + visibility = ["//visibility:private"], deps = [ ":activation_functor", ":eigen_support", ":kernel_util", + ":lstm_eval", ":op_macros", - "//tensorflow/contrib/lite:builtin_op_data", + ":padding", "//tensorflow/contrib/lite:framework", "//tensorflow/contrib/lite:string_util", + "//tensorflow/contrib/lite/c:c_api_internal", "//tensorflow/contrib/lite/kernels:gemm_support", "//tensorflow/contrib/lite/kernels/internal:audio_utils", "//tensorflow/contrib/lite/kernels/internal:kernel_utils", "//tensorflow/contrib/lite/kernels/internal:optimized", "//tensorflow/contrib/lite/kernels/internal:optimized_base", "//tensorflow/contrib/lite/kernels/internal:quantization_util", - "//tensorflow/contrib/lite/kernels/internal:reference", "//tensorflow/contrib/lite/kernels/internal:reference_base", + "//tensorflow/contrib/lite/kernels/internal:tensor", "//tensorflow/contrib/lite/kernels/internal:tensor_utils", "@farmhash_archive//:farmhash", "@flatbuffers", ], ) +cc_library( + name = "lstm_eval", + srcs = ["lstm_eval.cc"], + hdrs = ["lstm_eval.h"], + deps = [ + ":op_macros", + "//tensorflow/contrib/lite/c:c_api_internal", + "//tensorflow/contrib/lite/kernels/internal:kernel_utils", + "//tensorflow/contrib/lite/kernels/internal:tensor_utils", + ], +) + +cc_library( + name = "builtin_ops", + srcs = ["register.cc"], + hdrs = ["register.h"], + deps = [ + ":builtin_op_kernels", + "//tensorflow/contrib/lite:framework", + "//tensorflow/contrib/lite:util", + "//tensorflow/contrib/lite/c:c_api_internal", + ], +) + tf_cc_test( name = "audio_spectrogram_test", size = "small", srcs = ["audio_spectrogram_test.cc"], - tags = ["tflite_not_portable_ios"], + tags = [ + "no_oss", + "tflite_not_portable_ios", + ], deps = [ ":builtin_ops", "//tensorflow/contrib/lite:framework", @@ -233,7 +299,61 @@ tf_cc_test( name = "mfcc_test", size = "small", srcs = ["mfcc_test.cc"], - tags = ["tflite_not_portable_ios"], + tags = [ + "no_oss", + "tflite_not_portable_ios", + ], + deps = [ + ":builtin_ops", + "//tensorflow/contrib/lite:framework", + "//tensorflow/contrib/lite/kernels:test_util", + "@com_google_googletest//:gtest", + "@flatbuffers", + ], +) + +tf_cc_test( + name = "detection_postprocess_test", + size = "small", + srcs = ["detection_postprocess_test.cc"], + tags = [ + "no_oss", + "tflite_not_portable_ios", + ], + deps = [ + ":builtin_ops", + "//tensorflow/contrib/lite:framework", + "//tensorflow/contrib/lite/kernels:test_util", + "@com_google_googletest//:gtest", + "@flatbuffers", + ], +) + +tf_cc_test( + name = "relu1_test", + size = "small", + srcs = ["relu1_test.cc"], + tags = [ + "no_oss", + "tflite_not_portable_ios", + ], + deps = [ + ":builtin_ops", + 
"//tensorflow/contrib/lite:framework", + "//tensorflow/contrib/lite/kernels:test_util", + "@com_google_googletest//:gtest", + "@flatbuffers", + ], +) + +tf_cc_test( + name = "sparse_output_fully_connected_test", + size = "small", + srcs = ["sparse_output_fully_connected_test.cc"], + tags = [ + "no_oss", + "tflite_not_portable_ios", + ], deps = [ ":builtin_ops", "//tensorflow/contrib/lite:framework", @@ -270,10 +390,11 @@ tf_cc_test( ) tf_cc_test( - name = "arg_max_test", + name = "arg_min_max_test", size = "small", - srcs = ["arg_max_test.cc"], + srcs = ["arg_min_max_test.cc"], tags = [ + "no_oss", "tflite_not_portable_ios", ], deps = [ @@ -288,7 +409,10 @@ tf_cc_test( name = "div_test", size = "small", srcs = ["div_test.cc"], - tags = ["tflite_not_portable_ios"], + tags = [ + "no_oss", + "tflite_not_portable_ios", + ], deps = [ ":builtin_ops", "//tensorflow/contrib/lite:framework", @@ -301,7 +425,10 @@ tf_cc_test( name = "sub_test", size = "small", srcs = ["sub_test.cc"], - tags = ["tflite_not_portable_ios"], + tags = [ + "no_oss", + "tflite_not_portable_ios", + ], deps = [ ":builtin_ops", "//tensorflow/contrib/lite:framework", @@ -314,7 +441,10 @@ tf_cc_test( name = "transpose_test", size = "small", srcs = ["transpose_test.cc"], - tags = ["tflite_not_portable_ios"], + tags = [ + "no_oss", + "tflite_not_portable_ios", + ], deps = [ ":builtin_ops", "//tensorflow/contrib/lite:framework", @@ -329,7 +459,10 @@ tf_cc_test( name = "space_to_batch_nd_test", size = "small", srcs = ["space_to_batch_nd_test.cc"], - tags = ["tflite_not_portable_ios"], + tags = [ + "no_oss", + "tflite_not_portable_ios", + ], deps = [ ":builtin_ops", "//tensorflow/contrib/lite:framework", @@ -342,7 +475,10 @@ tf_cc_test( name = "batch_to_space_nd_test", size = "small", srcs = ["batch_to_space_nd_test.cc"], - tags = ["tflite_not_portable_ios"], + tags = [ + "no_oss", + "tflite_not_portable_ios", + ], deps = [ ":builtin_ops", "//tensorflow/contrib/lite:framework", @@ -355,7 +491,10 @@ tf_cc_test( name = "cast_test", size = "small", srcs = ["cast_test.cc"], - tags = ["tflite_not_portable_ios"], + tags = [ + "no_oss", + "tflite_not_portable_ios", + ], deps = [ ":builtin_ops", "//tensorflow/contrib/lite:framework", @@ -400,6 +539,7 @@ tf_cc_test( ":builtin_ops", "//tensorflow/contrib/lite:framework", "//tensorflow/contrib/lite/kernels:test_util", + "@com_google_absl//absl/memory", "@com_google_googletest//:gtest", ], ) @@ -408,7 +548,10 @@ tf_cc_test( name = "dequantize_test", size = "small", srcs = ["dequantize_test.cc"], - tags = ["tflite_not_portable_ios"], + tags = [ + "no_oss", + "tflite_not_portable_ios", + ], deps = [ ":builtin_ops", "//tensorflow/contrib/lite:framework", @@ -435,7 +578,10 @@ tf_cc_test( name = "bidirectional_sequence_lstm_test", size = "small", srcs = ["bidirectional_sequence_lstm_test.cc"], - tags = ["tflite_not_portable_ios"], + tags = [ + "no_oss", + "tflite_not_portable_ios", + ], deps = [ ":builtin_ops", "//tensorflow/contrib/lite:framework", @@ -448,7 +594,10 @@ tf_cc_test( name = "floor_test", size = "small", srcs = ["floor_test.cc"], - tags = ["tflite_not_portable_ios"], + tags = [ + "no_oss", + "tflite_not_portable_ios", + ], deps = [ ":builtin_ops", "//tensorflow/contrib/lite:framework", @@ -461,7 +610,10 @@ tf_cc_test( name = "elementwise_test", size = "small", srcs = ["elementwise_test.cc"], - tags = ["tflite_not_portable_ios"], + tags = [ + "no_oss", + "tflite_not_portable_ios", + ], deps = [ ":builtin_ops", "//tensorflow/contrib/lite:framework", @@ -474,7 +626,10 @@ tf_cc_test( name = 
"unidirectional_sequence_lstm_test", size = "small", srcs = ["unidirectional_sequence_lstm_test.cc"], - tags = ["tflite_not_portable_ios"], + tags = [ + "no_oss", + "tflite_not_portable_ios", + ], deps = [ ":builtin_ops", "//tensorflow/contrib/lite:framework", @@ -488,6 +643,7 @@ tf_cc_test( size = "small", srcs = ["bidirectional_sequence_rnn_test.cc"], tags = [ + "no_oss", "tflite_not_portable", ], deps = [ @@ -502,7 +658,10 @@ tf_cc_test( name = "unidirectional_sequence_rnn_test", size = "small", srcs = ["unidirectional_sequence_rnn_test.cc"], - tags = ["tflite_not_portable_ios"], + tags = [ + "no_oss", + "tflite_not_portable_ios", + ], deps = [ ":builtin_ops", "//tensorflow/contrib/lite:framework", @@ -528,7 +687,26 @@ tf_cc_test( name = "exp_test", size = "small", srcs = ["exp_test.cc"], - tags = ["tflite_not_portable_ios"], + tags = [ + "no_oss", + "tflite_not_portable_ios", + ], + deps = [ + ":builtin_ops", + "//tensorflow/contrib/lite:framework", + "//tensorflow/contrib/lite/kernels:test_util", + "@com_google_googletest//:gtest", + ], +) + +tf_cc_test( + name = "fake_quant_test", + size = "small", + srcs = ["fake_quant_test.cc"], + tags = [ + "no_oss", + "tflite_not_portable_ios", + ], deps = [ ":builtin_ops", "//tensorflow/contrib/lite:framework", @@ -541,7 +719,10 @@ tf_cc_test( name = "maximum_minimum_test", size = "small", srcs = ["maximum_minimum_test.cc"], - tags = ["tflite_not_portable_ios"], + tags = [ + "no_oss", + "tflite_not_portable_ios", + ], deps = [ ":builtin_ops", "//tensorflow/contrib/lite:framework", @@ -551,10 +732,13 @@ tf_cc_test( ) tf_cc_test( - name = "mean_test", + name = "reduce_test", size = "small", - srcs = ["mean_test.cc"], - tags = ["tflite_not_portable_ios"], + srcs = ["reduce_test.cc"], + tags = [ + "no_oss", + "tflite_not_portable_ios", + ], deps = [ ":builtin_ops", "//tensorflow/contrib/lite:framework", @@ -580,7 +764,10 @@ tf_cc_test( name = "pad_test", size = "small", srcs = ["pad_test.cc"], - tags = ["tflite_not_portable_ios"], + tags = [ + "no_oss", + "tflite_not_portable_ios", + ], deps = [ ":builtin_ops", "//tensorflow/contrib/lite:framework", @@ -606,11 +793,14 @@ tf_cc_test( name = "gather_test", size = "small", srcs = ["gather_test.cc"], - tags = ["tflite_not_portable_ios"], + tags = [ + "no_oss", + "tflite_not_portable_ios", + ], deps = [ ":builtin_ops", - "//tensorflow/contrib/lite:builtin_op_data", "//tensorflow/contrib/lite:framework", + "//tensorflow/contrib/lite/c:c_api_internal", "//tensorflow/contrib/lite/kernels:test_util", "@com_google_googletest//:gtest", ], @@ -620,11 +810,14 @@ tf_cc_test( name = "topk_v2_test", size = "small", srcs = ["topk_v2_test.cc"], - tags = ["tflite_not_portable_ios"], + tags = [ + "no_oss", + "tflite_not_portable_ios", + ], deps = [ ":builtin_ops", - "//tensorflow/contrib/lite:builtin_op_data", "//tensorflow/contrib/lite:framework", + "//tensorflow/contrib/lite/c:c_api_internal", "//tensorflow/contrib/lite/kernels:test_util", "@com_google_googletest//:gtest", ], @@ -741,7 +934,10 @@ tf_cc_test( name = "log_softmax_test", size = "small", srcs = ["log_softmax_test.cc"], - tags = ["tflite_not_portable_ios"], + tags = [ + "no_oss", + "tflite_not_portable_ios", + ], deps = [ ":builtin_ops", "//tensorflow/contrib/lite:framework", @@ -779,6 +975,20 @@ tf_cc_test( ) tf_cc_test( + name = "layer_norm_lstm_test", + size = "small", + srcs = ["layer_norm_lstm_test.cc"], + tags = ["tflite_not_portable_ios"], + deps = [ + ":builtin_ops", + "//tensorflow/contrib/lite:framework", + 
"//tensorflow/contrib/lite/kernels:test_util", + "@com_google_googletest//:gtest", + "@flatbuffers", + ], +) + +tf_cc_test( name = "lstm_test", size = "small", srcs = ["lstm_test.cc"], @@ -822,7 +1032,10 @@ tf_cc_test( name = "split_test", size = "small", srcs = ["split_test.cc"], - tags = ["tflite_not_portable_ios"], + tags = [ + "no_oss", + "tflite_not_portable_ios", + ], deps = [ ":builtin_ops", "//tensorflow/contrib/lite:framework", @@ -835,7 +1048,10 @@ tf_cc_test( name = "squeeze_test", size = "small", srcs = ["squeeze_test.cc"], - tags = ["tflite_not_portable_ios"], + tags = [ + "no_oss", + "tflite_not_portable_ios", + ], deps = [ ":builtin_ops", "//tensorflow/contrib/lite:framework", @@ -848,7 +1064,10 @@ tf_cc_test( name = "strided_slice_test", size = "small", srcs = ["strided_slice_test.cc"], - tags = ["tflite_not_portable_ios"], + tags = [ + "no_oss", + "tflite_not_portable_ios", + ], deps = [ ":builtin_ops", "//tensorflow/contrib/lite:framework", @@ -858,12 +1077,30 @@ tf_cc_test( ) tf_cc_test( + name = "tile_test", + size = "small", + srcs = ["tile_test.cc"], + tags = [ + "no_oss", + "tflite_not_portable_ios", + ], + deps = [ + ":builtin_ops", + "//tensorflow/contrib/lite:framework", + "//tensorflow/contrib/lite/c:c_api_internal", + "//tensorflow/contrib/lite/kernels:test_util", + "@com_google_googletest//:gtest", + ], +) + +tf_cc_test( name = "comparisons_test", size = "small", srcs = [ "comparisons_test.cc", ], tags = [ + "no_oss", "tflite_not_portable_ios", ], deps = [ @@ -878,7 +1115,10 @@ tf_cc_test( name = "neg_test", size = "small", srcs = ["neg_test.cc"], - tags = ["tflite_not_portable_ios"], + tags = [ + "no_oss", + "tflite_not_portable_ios", + ], deps = [ ":builtin_ops", "//tensorflow/contrib/lite:framework", @@ -894,6 +1134,7 @@ tf_cc_test( "select_test.cc", ], tags = [ + "no_oss", "tflite_not_portable_ios", ], deps = [ @@ -911,6 +1152,7 @@ tf_cc_test( "slice_test.cc", ], tags = [ + "no_oss", "tflite_not_portable_ios", ], deps = [ @@ -925,9 +1167,163 @@ tf_cc_test( name = "transpose_conv_test", size = "small", srcs = ["transpose_conv_test.cc"], + tags = [ + "no_oss", + "tflite_not_portable_ios", + ], + deps = [ + ":builtin_ops", + "//tensorflow/contrib/lite:framework", + "//tensorflow/contrib/lite/kernels:test_util", + "@com_google_googletest//:gtest", + ], +) + +tf_cc_test( + name = "expand_dims_test", + size = "small", + srcs = ["expand_dims_test.cc"], + tags = [ + "no_oss", + "tflite_not_portable_ios", + ], + deps = [ + ":builtin_ops", + "//tensorflow/contrib/lite:framework", + "//tensorflow/contrib/lite/c:c_api_internal", + "//tensorflow/contrib/lite/kernels:test_util", + "@com_google_googletest//:gtest", + ], +) + +tf_cc_test( + name = "sparse_to_dense_test", + size = "small", + srcs = ["sparse_to_dense_test.cc"], + tags = [ + "no_oss", + "tflite_not_portable_ios", + ], + deps = [ + ":builtin_ops", + "//tensorflow/contrib/lite:framework", + "//tensorflow/contrib/lite/c:c_api_internal", + "//tensorflow/contrib/lite/kernels:test_util", + "@com_google_googletest//:gtest", + ], +) + +tf_cc_test( + name = "shape_test", + size = "small", + srcs = ["shape_test.cc"], + tags = [ + "no_oss", + "tflite_not_portable_ios", + ], + deps = [ + ":builtin_ops", + "//tensorflow/contrib/lite:framework", + "//tensorflow/contrib/lite/c:c_api_internal", + "//tensorflow/contrib/lite/kernels:test_util", + "@com_google_googletest//:gtest", + ], +) + +tf_cc_test( + name = "pow_test", + size = "small", + srcs = ["pow_test.cc"], + tags = [ + "no_oss", + "tflite_not_portable_ios", + ], + 
deps = [ + ":builtin_ops", + "//tensorflow/contrib/lite:framework", + "//tensorflow/contrib/lite/c:c_api_internal", + "//tensorflow/contrib/lite/kernels:test_util", + "@com_google_googletest//:gtest", + ], +) + +tf_cc_test( + name = "pack_test", + size = "small", + srcs = ["pack_test.cc"], + tags = ["tflite_not_portable_ios"], + deps = [ + ":builtin_ops", + "//tensorflow/contrib/lite:framework", + "//tensorflow/contrib/lite/c:c_api_internal", + "//tensorflow/contrib/lite/kernels:test_util", + "@com_google_googletest//:gtest", + ], +) + +tf_cc_test( + name = "one_hot_test", + size = "small", + srcs = ["one_hot_test.cc"], + tags = ["tflite_not_portable_ios"], + deps = [ + ":builtin_ops", + "//tensorflow/contrib/lite:framework", + "//tensorflow/contrib/lite/kernels:test_util", + "@com_google_googletest//:gtest", + ], +) + +tf_cc_test( + name = "logical_test", + size = "small", + srcs = ["logical_test.cc"], + tags = ["tflite_not_portable_ios"], + deps = [ + ":builtin_ops", + "//tensorflow/contrib/lite:framework", + "//tensorflow/contrib/lite/c:c_api_internal", + "//tensorflow/contrib/lite/kernels:test_util", + "@com_google_googletest//:gtest", + ], +) + +tf_cc_test( + name = "unpack_test", + size = "small", + srcs = ["unpack_test.cc"], tags = ["tflite_not_portable_ios"], deps = [ ":builtin_ops", + "//tensorflow/contrib/lite:builtin_op_data", + "//tensorflow/contrib/lite:framework", + "//tensorflow/contrib/lite/kernels:test_util", + "@com_google_googletest//:gtest", + ], +) + +tf_cc_test( + name = "floor_div_test", + size = "small", + srcs = ["floor_div_test.cc"], + tags = ["tflite_not_portable_ios"], + deps = [ + ":builtin_ops", + "//tensorflow/contrib/lite:builtin_op_data", + "//tensorflow/contrib/lite:framework", + "//tensorflow/contrib/lite/kernels:test_util", + "@com_google_googletest//:gtest", + ], +) + +tf_cc_test( + name = "zeros_like_test", + size = "small", + srcs = ["zeros_like_test.cc"], + tags = ["tflite_not_portable_ios"], + deps = [ + ":builtin_ops", + "//tensorflow/contrib/lite:builtin_op_data", "//tensorflow/contrib/lite:framework", "//tensorflow/contrib/lite/kernels:test_util", "@com_google_googletest//:gtest", diff --git a/tensorflow/contrib/lite/kernels/activation_functor.h b/tensorflow/contrib/lite/kernels/activation_functor.h index 41ec3cca33..e075dc7054 100644 --- a/tensorflow/contrib/lite/kernels/activation_functor.h +++ b/tensorflow/contrib/lite/kernels/activation_functor.h @@ -19,7 +19,7 @@ limitations under the License. #include <cmath> #include <cstdlib> -#include "tensorflow/contrib/lite/builtin_op_data.h" +#include "tensorflow/contrib/lite/c/builtin_op_data.h" namespace tflite { diff --git a/tensorflow/contrib/lite/kernels/activations.cc b/tensorflow/contrib/lite/kernels/activations.cc index 4972159a05..9aed4f09b8 100644 --- a/tensorflow/contrib/lite/kernels/activations.cc +++ b/tensorflow/contrib/lite/kernels/activations.cc @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include <unistd.h> #include <cassert> #include <cmath> #include <cstdio> @@ -20,8 +19,8 @@ limitations under the License. 
#include <iostream> #include <limits> -#include "tensorflow/contrib/lite/builtin_op_data.h" -#include "tensorflow/contrib/lite/context.h" +#include "tensorflow/contrib/lite/c/builtin_op_data.h" +#include "tensorflow/contrib/lite/c/c_api_internal.h" #include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h" #include "tensorflow/contrib/lite/kernels/internal/quantization_util.h" #include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h" @@ -41,6 +40,11 @@ struct OpData { int diff_min = 0; }; +struct LogSoftmaxOpData : public OpData { + int32_t reverse_scaling_divisor = 0; + int32_t reverse_scaling_right_shift = 0; +}; + void* Init(TfLiteContext* context, const char* buffer, size_t length) { // This is a builtin op, so we don't use the contents in 'buffer', if any. // Instead, we allocate a new object to carry information from Prepare() to @@ -48,10 +52,19 @@ void* Init(TfLiteContext* context, const char* buffer, size_t length) { return new OpData; } +void* LogSoftmaxInit(TfLiteContext* context, const char* buffer, + size_t length) { + return new LogSoftmaxOpData; +} + void Free(TfLiteContext* context, void* buffer) { delete reinterpret_cast<OpData*>(buffer); } +void LogSoftmaxFree(TfLiteContext* context, void* buffer) { + delete reinterpret_cast<LogSoftmaxOpData*>(buffer); +} + TfLiteStatus GenericPrepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE_EQ(context, NumInputs(node), 1); TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); @@ -84,6 +97,38 @@ TfLiteStatus TanhPrepare(TfLiteContext* context, TfLiteNode* node) { &data->input_left_shift); data->input_range_radius = CalculateInputRadius(kInputIntegerBits, data->input_left_shift); + } else if (input->type == kTfLiteInt16) { + static constexpr int kInputIntegerBits = 3; + static constexpr int kOutputFractionalBits = 15; + + // These operators are implemented in fixed-point arithmetic, + // which intrinsically wants symmetric ranges (zero_point==0) + // and power-of-two scales (power-of-two is abbreviated below as POT). + // While more general support would be possible by means of rescaling, + // that would add some overhead and some loss of accuracy and wouldn't + // be used at the moment as current quantized LSTM applications are + // happy with symmetric, power-of-two-scales quantization. So we just + // implement that narrow case only for now. + + TF_LITE_ENSURE_EQ(context, input->params.zero_point, 0); + TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0); + + int input_scale_log2_rounded; + TF_LITE_ENSURE(context, + CheckedLog2(input->params.scale, &input_scale_log2_rounded)); + + int output_scale_log2_rounded; + TF_LITE_ENSURE( + context, CheckedLog2(output->params.scale, &output_scale_log2_rounded)); + TF_LITE_ENSURE_EQ(context, output_scale_log2_rounded, + -kOutputFractionalBits); + + data->input_left_shift = + (15 - kInputIntegerBits) + input_scale_log2_rounded; + // Support for shifts is limited until we have a parameterized version of + // SaturatingRoundingMultiplyByPOT(). 
+ TF_LITE_ENSURE(context, data->input_left_shift >= 0); + TF_LITE_ENSURE(context, data->input_left_shift <= 1); } return context->ResizeTensor(context, output, @@ -114,6 +159,30 @@ TfLiteStatus SigmoidPrepare(TfLiteContext* context, TfLiteNode* node) { &data->input_left_shift); data->input_range_radius = CalculateInputRadius(kInputIntegerBits, data->input_left_shift); + } else if (input->type == kTfLiteInt16) { + static constexpr int kInputIntegerBits = 3; + static constexpr int kOutputFractionalBits = 15; + + // See comments in TanhPrepare about requiring zero_point==0 + // and a power-of-two ("POT") scale. + + TF_LITE_ENSURE_EQ(context, input->params.zero_point, 0); + TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0); + + int input_scale_log2_rounded; + TF_LITE_ENSURE(context, + CheckedLog2(input->params.scale, &input_scale_log2_rounded)); + + int output_scale_log2_rounded; + TF_LITE_ENSURE( + context, CheckedLog2(output->params.scale, &output_scale_log2_rounded)); + TF_LITE_ENSURE_EQ(context, output_scale_log2_rounded, + -kOutputFractionalBits); + + data->input_left_shift = + (15 - kInputIntegerBits) + input_scale_log2_rounded; + // The int16 logistic implementation does not support shifting of the input. + TF_LITE_ENSURE_EQ(context, data->input_left_shift, 0); } return context->ResizeTensor(context, output, @@ -130,8 +199,8 @@ TfLiteStatus SoftmaxPrepare(TfLiteContext* context, TfLiteNode* node) { TfLiteTensor* output = GetOutput(context, node, 0); TF_LITE_ENSURE_EQ(context, input->type, output->type); - TF_LITE_ENSURE(context, - NumDimensions(input) == 2 || NumDimensions(input) == 4); + const int num_dims = NumDimensions(input); + TF_LITE_ENSURE(context, num_dims >= 1 && num_dims <= 4); if (input->type == kTfLiteUInt8) { TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0); @@ -150,6 +219,34 @@ TfLiteStatus SoftmaxPrepare(TfLiteContext* context, TfLiteNode* node) { TfLiteIntArrayCopy(input->dims)); } +TfLiteStatus LogSoftmaxPrepare(TfLiteContext* context, TfLiteNode* node) { + LogSoftmaxOpData* data = reinterpret_cast<LogSoftmaxOpData*>(node->user_data); + + TF_LITE_ENSURE_EQ(context, NumInputs(node), 1); + TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); + const TfLiteTensor* input = GetInput(context, node, 0); + TfLiteTensor* output = GetOutput(context, node, 0); + TF_LITE_ENSURE_EQ(context, input->type, output->type); + + if (input->type == kTfLiteUInt8) { + TF_LITE_ENSURE_EQ(context, output->params.zero_point, 255); + TF_LITE_ENSURE_EQ(context, output->params.scale, 16.0 / 256); + + static const double kBeta = 1.0; + static const int kScaledDiffIntegerBits = 5; + tflite::PreprocessLogSoftmaxScalingExp( + kBeta, input->params.scale, kScaledDiffIntegerBits, + &data->input_multiplier, &data->input_left_shift, + &data->reverse_scaling_divisor, &data->reverse_scaling_right_shift); + data->reverse_scaling_right_shift *= -1; + data->diff_min = -1.0 * tflite::CalculateInputRadius( + kScaledDiffIntegerBits, data->input_left_shift); + } + + return context->ResizeTensor(context, output, + TfLiteIntArrayCopy(input->dims)); +} + TfLiteStatus PreluPrepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE_EQ(context, NumInputs(node), 2); TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); @@ -157,25 +254,25 @@ TfLiteStatus PreluPrepare(TfLiteContext* context, TfLiteNode* node) { TfLiteTensor* output = GetOutput(context, node, 0); const TfLiteTensor* alpha = GetInput(context, node, 1); - output->type = input->type; - // Currently only Float32 is supported // TODO(ycling): Support 
other data types. TF_LITE_ENSURE_EQ(context, input->type, kTfLiteFloat32); TF_LITE_ENSURE_EQ(context, alpha->type, kTfLiteFloat32); + output->type = input->type; - // Currently, only support 4D `input` and 3D `alpha` with shape - // (1, 1, channels). - // TODO(impjdi): Support other cases where `alpha` is broadcastable - // to `input`. - TF_LITE_ENSURE_EQ(context, input->dims->size, 4); - TF_LITE_ENSURE_EQ(context, alpha->dims->size, 3); - TF_LITE_ENSURE_EQ(context, alpha->dims->data[0], 1); - TF_LITE_ENSURE_EQ(context, alpha->dims->data[1], 1); - TF_LITE_ENSURE_EQ(context, alpha->dims->data[2], input->dims->data[3]); + // PRelu (parameteric Relu) shares the same alpha value on "shared axis". + // This means it's always required to "broadcast" alpha values in PRelu. + TfLiteIntArray* output_size = nullptr; + TF_LITE_ENSURE_OK( + context, CalculateShapeForBroadcast(context, input, alpha, &output_size)); - return context->ResizeTensor(context, output, - TfLiteIntArrayCopy(input->dims)); + TF_LITE_ENSURE_OK(context, + context->ResizeTensor(context, output, output_size)); + // After broadcasting, the output shape should always be the same as the + // input shape. + TF_LITE_ENSURE(context, HaveSameShapes(input, output)); + + return kTfLiteOk; } TfLiteStatus ReluEval(TfLiteContext* context, TfLiteNode* node) { @@ -191,7 +288,8 @@ TfLiteStatus ReluEval(TfLiteContext* context, TfLiteNode* node) { return kTfLiteOk; } break; default: - context->ReportError(context, "Only float32 supported currently."); + context->ReportError(context, "Only float32 supported currently, got %d.", + input->type); return kTfLiteError; } } @@ -211,7 +309,8 @@ TfLiteStatus Relu1Eval(TfLiteContext* context, TfLiteNode* node) { return kTfLiteOk; } break; default: - context->ReportError(context, "Only float32 supported currently."); + context->ReportError(context, "Only float32 supported currently, got %d.", + input->type); return kTfLiteError; } } @@ -229,7 +328,8 @@ TfLiteStatus Relu6Eval(TfLiteContext* context, TfLiteNode* node) { return kTfLiteOk; } break; default: - context->ReportError(context, "Only float32 supported currently."); + context->ReportError(context, "Only float32 supported currently, got %d.", + input->type); return kTfLiteError; } } @@ -247,16 +347,28 @@ TfLiteStatus TanhEval(TfLiteContext* context, TfLiteNode* node) { for (; in < in_end; in++, out++) *out = std::tanh(*in); return kTfLiteOk; } break; + case kTfLiteInt16: { + TanhParams params; + params.input_left_shift = data->input_left_shift; + optimized_ops::Tanh(params, GetTensorShape(input), + GetTensorData<int16_t>(input), GetTensorShape(output), + GetTensorData<int16_t>(output)); + return kTfLiteOk; + } break; case kTfLiteUInt8: { - optimized_ops::Tanh(GetTensorData<uint8_t>(input), GetTensorDims(input), - input->params.zero_point, data->input_range_radius, - data->input_multiplier, data->input_left_shift, - GetTensorData<uint8_t>(output), - GetTensorDims(output)); + TanhParams params; + params.input_zero_point = input->params.zero_point; + params.input_range_radius = data->input_range_radius; + params.input_multiplier = data->input_multiplier; + params.input_left_shift = data->input_left_shift; + optimized_ops::Tanh(params, GetTensorShape(input), + GetTensorData<uint8_t>(input), GetTensorShape(output), + GetTensorData<uint8_t>(output)); return kTfLiteOk; } break; default: - context->ReportError(context, "Only float32 supported currently."); + context->ReportError(context, "Only float32 supported currently, got %d.", + input->type); return 
kTfLiteError; } } @@ -276,28 +388,35 @@ TfLiteStatus SigmoidEval(TfLiteContext* context, TfLiteNode* node) { for (; in < in_end; in++, out++) *out = 1.f / (1.f + std::exp(-*in)); break; } + case kTfLiteInt16: { + LogisticParams params; + optimized_ops::Logistic( + params, GetTensorShape(input), GetTensorData<int16_t>(input), + GetTensorShape(output), GetTensorData<int16_t>(output)); + break; + } case kTfLiteUInt8: { + LogisticParams params; + params.input_zero_point = input->params.zero_point; + params.input_range_radius = data->input_range_radius; + params.input_multiplier = data->input_multiplier; + params.input_left_shift = data->input_left_shift; optimized_ops::Logistic( - GetTensorData<uint8_t>(input), GetTensorDims(input), - input->params.zero_point, data->input_range_radius, - data->input_multiplier, data->input_left_shift, - GetTensorData<uint8_t>(output), GetTensorDims(output)); + params, GetTensorShape(input), GetTensorData<uint8_t>(input), + GetTensorShape(output), GetTensorData<uint8_t>(output)); break; } default: - context->ReportError(context, "Only float32 supported currently."); + context->ReportError(context, "Only float32 supported currently, got %d.", + input->type); return kTfLiteError; } return kTfLiteOk; } -// Takes a 2D tensor and perform softmax along the second dimension. -void Softmax2DFloat(const TfLiteTensor* input, TfLiteTensor* output, - TfLiteSoftmaxParams* params) { - const int batch_size = input->dims->data[0]; - const int input_size = input->dims->data[1]; - float* in = input->data.f; - float* out = output->data.f; +// Performs softmax along the input of size (input_size * batch_size). +void Softmax(const float* in, const int input_size, const int batch_size, + const float beta, float* out) { TF_LITE_ASSERT(input_size > 0); // For each batch @@ -311,7 +430,7 @@ void Softmax2DFloat(const TfLiteTensor* input, TfLiteTensor* output, // Compute the normalized sum of exps. float exp_sum = 0.0; for (int i = 0; i < input_size; i++) { - out[i] = std::exp((in[i] - max_coeff) * params->beta); + out[i] = std::exp((in[i] - max_coeff) * beta); exp_sum += out[i]; } @@ -327,6 +446,52 @@ void Softmax2DFloat(const TfLiteTensor* input, TfLiteTensor* output, } } +// Takes a 1D tensor and performs softmax along it. +void Softmax1DFloat(const TfLiteTensor* input, TfLiteTensor* output, + TfLiteSoftmaxParams* params) { + const int input_size = input->dims->data[0]; + Softmax(input->data.f, input_size, 1, params->beta, output->data.f); +} + +// Takes a 2D tensor and perform softmax along the last dimension. +void Softmax2DFloat(const TfLiteTensor* input, TfLiteTensor* output, + TfLiteSoftmaxParams* params) { + const int batch_size = input->dims->data[0]; + const int input_size = input->dims->data[1]; + Softmax(input->data.f, input_size, batch_size, params->beta, output->data.f); +} + +// Takes a 3D tensor and perform softmax along the last dimension. 
+void Softmax3DFloat(const TfLiteTensor* input, TfLiteTensor* output, + TfLiteSoftmaxParams* params) { + const int batch_size = input->dims->data[0]; + const int intermediate_size = input->dims->data[1]; + const int input_size = input->dims->data[2]; + SoftmaxParams op_params; + op_params.beta = params->beta; + optimized_ops::Softmax( + op_params, GetTensorShape({batch_size, intermediate_size, 1, input_size}), + GetTensorData<float>(input), + GetTensorShape({batch_size, intermediate_size, 1, input_size}), + GetTensorData<float>(output)); +} + +void Softmax1DQuantized(const TfLiteTensor* input, TfLiteTensor* output, + TfLiteSoftmaxParams* params, OpData* data) { + // TODO(ahentz): this is arguably a dirty trick. Since the implementation + // always traverses the last dimension of a 4D tensor, we will pretend our 1D + // tensor is 4D in a special way. We will convert a (Y) shape into a (1, + // 1, 1, Y) shape. + const int input_size = input->dims->data[0]; + SoftmaxParams op_params; + op_params.input_multiplier = data->input_multiplier; + op_params.input_left_shift = data->input_left_shift; + op_params.diff_min = data->diff_min; + optimized_ops::Softmax(op_params, GetTensorShape({1, 1, 1, input_size}), + GetTensorData<uint8_t>(input), + GetTensorShape({1, 1, 1, input_size}), + GetTensorData<uint8_t>(output)); +} void Softmax2DQuantized(const TfLiteTensor* input, TfLiteTensor* output, TfLiteSoftmaxParams* params, OpData* data) { // TODO(ahentz): this is arguably a dirty trick. Since the implementation @@ -335,27 +500,52 @@ void Softmax2DQuantized(const TfLiteTensor* input, TfLiteTensor* output, // 1, 1, Y) shape. const int batch_size = input->dims->data[0]; const int input_size = input->dims->data[1]; - optimized_ops::Softmax(GetTensorData<uint8_t>(input), - GetTensorDims({batch_size, 1, 1, input_size}), - data->input_multiplier, data->input_left_shift, - data->diff_min, GetTensorData<uint8_t>(output), - GetTensorDims({batch_size, 1, 1, input_size})); + SoftmaxParams op_params; + op_params.input_multiplier = data->input_multiplier; + op_params.input_left_shift = data->input_left_shift; + op_params.diff_min = data->diff_min; + optimized_ops::Softmax(op_params, + GetTensorShape({batch_size, 1, 1, input_size}), + GetTensorData<uint8_t>(input), + GetTensorShape({batch_size, 1, 1, input_size}), + GetTensorData<uint8_t>(output)); +} + +void Softmax3DQuantized(const TfLiteTensor* input, TfLiteTensor* output, + TfLiteSoftmaxParams* params, OpData* data) { + const int batch_size = input->dims->data[0]; + const int intermediate_size = input->dims->data[1]; + const int input_size = input->dims->data[2]; + SoftmaxParams op_params; + op_params.input_multiplier = data->input_multiplier; + op_params.input_left_shift = data->input_left_shift; + op_params.diff_min = data->diff_min; + optimized_ops::Softmax( + op_params, GetTensorShape({batch_size, intermediate_size, 1, input_size}), + GetTensorData<uint8_t>(input), + GetTensorShape({batch_size, intermediate_size, 1, input_size}), + GetTensorData<uint8_t>(output)); } // Takes a 4D tensor and perform softmax along the forth dimension. 
void Softmax4DFloat(const TfLiteTensor* input, TfLiteTensor* output, TfLiteSoftmaxParams* params) { - optimized_ops::Softmax(GetTensorData<float>(input), GetTensorDims(input), - params->beta, GetTensorData<float>(output), - GetTensorDims(output)); + SoftmaxParams op_params; + op_params.beta = params->beta; + optimized_ops::Softmax(op_params, GetTensorShape(input), + GetTensorData<float>(input), GetTensorShape(output), + GetTensorData<float>(output)); } void Softmax4DQuantized(const TfLiteTensor* input, TfLiteTensor* output, TfLiteSoftmaxParams* params, OpData* data) { - optimized_ops::Softmax(GetTensorData<uint8_t>(input), GetTensorDims(input), - data->input_multiplier, data->input_left_shift, - data->diff_min, GetTensorData<uint8_t>(output), - GetTensorDims(output)); + SoftmaxParams op_params; + op_params.input_multiplier = data->input_multiplier; + op_params.input_left_shift = data->input_left_shift; + op_params.diff_min = data->diff_min; + optimized_ops::Softmax(op_params, GetTensorShape(input), + GetTensorData<uint8_t>(input), GetTensorShape(output), + GetTensorData<uint8_t>(output)); } TfLiteStatus SoftmaxEval(TfLiteContext* context, TfLiteNode* node) { @@ -369,79 +559,107 @@ TfLiteStatus SoftmaxEval(TfLiteContext* context, TfLiteNode* node) { // dimensions. switch (input->type) { case kTfLiteFloat32: { + if (NumDimensions(input) == 1) { + Softmax1DFloat(input, output, params); + return kTfLiteOk; + } if (NumDimensions(input) == 2) { Softmax2DFloat(input, output, params); return kTfLiteOk; } + if (NumDimensions(input) == 3) { + Softmax3DFloat(input, output, params); + return kTfLiteOk; + } if (NumDimensions(input) == 4) { Softmax4DFloat(input, output, params); return kTfLiteOk; } - context->ReportError(context, - "Only 2D and 4D tensors supported currently."); + context->ReportError( + context, "Only 1D, 2D and 4D tensors supported currently, got %dD.", + NumDimensions(input)); return kTfLiteError; } case kTfLiteUInt8: { + if (NumDimensions(input) == 1) { + Softmax1DQuantized(input, output, params, data); + return kTfLiteOk; + } if (NumDimensions(input) == 2) { Softmax2DQuantized(input, output, params, data); return kTfLiteOk; } + if (NumDimensions(input) == 3) { + Softmax3DQuantized(input, output, params, data); + return kTfLiteOk; + } if (NumDimensions(input) == 4) { Softmax4DQuantized(input, output, params, data); return kTfLiteOk; } - context->ReportError(context, - "Only 2D and 4D tensors supported currently."); + context->ReportError( + context, "Only 2D and 4D tensors supported currently, got %dD.", + NumDimensions(input)); return kTfLiteError; } default: - context->ReportError(context, - "Only float32 and uint8_t supported currently."); + context->ReportError( + context, "Only float32 and uint8_t supported currently, got %d.", + input->type); return kTfLiteError; } } TfLiteStatus LogSoftmaxEval(TfLiteContext* context, TfLiteNode* node) { + const LogSoftmaxOpData* data = + reinterpret_cast<LogSoftmaxOpData*>(node->user_data); const TfLiteTensor* input = GetInput(context, node, 0); TfLiteTensor* output = GetOutput(context, node, 0); switch (input->type) { - case kTfLiteFloat32: + case kTfLiteFloat32: { + SoftmaxParams op_params; + optimized_ops::LogSoftmax( + op_params, GetTensorShape(input), GetTensorData<float>(input), + GetTensorShape(output), GetTensorData<float>(output)); + return kTfLiteOk; + } + case kTfLiteUInt8: { + SoftmaxParams op_params; + op_params.input_multiplier = data->input_multiplier; + op_params.input_left_shift = data->input_left_shift; + 
op_params.reverse_scaling_divisor = data->reverse_scaling_divisor; + op_params.reverse_scaling_right_shift = data->reverse_scaling_right_shift; + op_params.diff_min = data->diff_min; optimized_ops::LogSoftmax( - GetTensorData<float>(input), GetTensorDims(input), - GetTensorData<float>(output), GetTensorDims(output)); + op_params, GetTensorShape(input), GetTensorData<uint8_t>(input), + GetTensorShape(output), GetTensorData<uint8_t>(output)); return kTfLiteOk; + } default: - context->ReportError(context, "Only float32 supported currently."); + context->ReportError(context, "Only float32 supported currently., got %d", + input->type); return kTfLiteError; } } +template <typename T> +T ApplyPrelu(T input, T alpha) { + return input >= 0.0 ? input : input * alpha; +} + TfLiteStatus PreluEval(TfLiteContext* context, TfLiteNode* node) { const TfLiteTensor* input = GetInput(context, node, 0); const TfLiteTensor* alpha = GetInput(context, node, 1); - const TfLiteTensor* output = GetOutput(context, node, 0); - + TfLiteTensor* output = GetOutput(context, node, 0); if (input->type != kTfLiteFloat32) { - context->ReportError(context, "Only float32 supported currently."); + context->ReportError(context, "Only float32 supported currently, got %d.", + input->type); return kTfLiteError; } - TF_LITE_ENSURE_EQ(context, input->dims->size, 4); - const int batches = input->dims->data[0]; - const int height = input->dims->data[1]; - const int width = input->dims->data[2]; - const int channels = input->dims->data[3]; - - TF_LITE_ENSURE_EQ(context, alpha->dims->size, 3); - TF_LITE_ENSURE_EQ(context, alpha->dims->data[0], 1); - TF_LITE_ENSURE_EQ(context, alpha->dims->data[1], 1); - TF_LITE_ENSURE_EQ(context, alpha->dims->data[2], channels); - - const int n = batches * height * width * channels; - for (int i = 0; i < n; ++i) { - const float x = input->data.f[i]; - output->data.f[i] = x >= 0.0f ? x : alpha->data.f[i % channels] * x; - } - + reference_ops::BroadcastBinaryFunction4DSlow<float, float, float>( + GetTensorShape(input), GetTensorData<float>(input), GetTensorShape(alpha), + GetTensorData<float>(alpha), GetTensorShape(output), + GetTensorData<float>(output), ApplyPrelu<float>); return kTfLiteOk; } @@ -490,9 +708,9 @@ TfLiteRegistration* Register_SOFTMAX() { } TfLiteRegistration* Register_LOG_SOFTMAX() { - static TfLiteRegistration r = {activations::Init, activations::Free, - activations::GenericPrepare, - activations::LogSoftmaxEval}; + static TfLiteRegistration r = { + activations::LogSoftmaxInit, activations::LogSoftmaxFree, + activations::LogSoftmaxPrepare, activations::LogSoftmaxEval}; return &r; } diff --git a/tensorflow/contrib/lite/kernels/activations_test.cc b/tensorflow/contrib/lite/kernels/activations_test.cc index 50a84edd47..9fa47e190a 100644 --- a/tensorflow/contrib/lite/kernels/activations_test.cc +++ b/tensorflow/contrib/lite/kernels/activations_test.cc @@ -75,23 +75,42 @@ class FloatActivationsOpModel : public BaseActivationsOpModel { std::vector<float> GetOutput() { return ExtractVector<float>(output_); } }; -// TODO(ahentz): I don't quite understand the tradeoffs in the quantized -// implementation of sigmoid and software, but a tolerance of twice the output -// scale seems reasonable. We might want to change this if we have a better -// theoretical bound. +// Our fixed-point math function implementations have roughly 12 bits of +// accuracy, when specialized to 16-bit fixed-point arithmetic. 
+// That is purely an implementation compromise, it would have been possible +// to get closer to 16 bits of accuracy but that would be more expensive, +// and not needed for our purposes as ultimately the output is either +// immediately down-quantized to 8 bits, or will typically be at the output +// of the surrounding LSTM cell. +// So we can require roughly 2^-12 accuracy when the output is 16-bit, and +// we can more or less expect the full 2^-8 accuracy when the output is 8-bit. +// +// However, the representable output interval is often [-1, 1] (it has to be +// for tanh, and even for logistic, when we implement it in fixed-point, we +// typically have to do so on such a symmetric interval, e.g. ARM NEON only +// has signed fixed-point arithmetic (SQRDMULH)). As the width of [-1, 1] +// is 2, our representable values are often diluted by a factor of 2, whence +// the factor of 2 below. const float kQuantizedTolerance = 2 * (1. / 256); +const float kQuantizedToleranceInt16 = 2 * (1. / 4096); class QuantizedActivationsOpModel : public BaseActivationsOpModel { public: using BaseActivationsOpModel::BaseActivationsOpModel; + template <typename T> void SetInput(std::initializer_list<float> data) { - QuantizeAndPopulate<uint8_t>(input_, data); + QuantizeAndPopulate<T>(input_, data); } - std::vector<uint8_t> GetOutput() { return ExtractVector<uint8_t>(output_); } + template <typename T> + + std::vector<T> GetOutput() { + return ExtractVector<T>(output_); + } + template <typename T> std::vector<float> GetDequantizedOutput() { - return Dequantize<uint8_t>(ExtractVector<uint8_t>(output_), - GetScale(output_), GetZeroPoint(output_)); + return Dequantize<T>(ExtractVector<T>(output_), GetScale(output_), + GetZeroPoint(output_)); } }; @@ -152,24 +171,47 @@ TEST(FloatActivationsOpTest, Tanh) { } TEST(QuantizedActivationsOpTest, Tanh) { + const float kMin = -1; + const float kMax = 127.f / 128.f; QuantizedActivationsOpModel m( BuiltinOperator_TANH, - /*input=*/{TensorType_UINT8, {1, 2, 4, 1}, -8, 8}, - /*output=*/{TensorType_UINT8, {1, 2, 4, 1}, -1, 1}); - m.SetInput({ + /*input=*/{TensorType_UINT8, {1, 2, 4, 1}, 8 * kMin, 8 * kMax}, + /*output=*/{TensorType_UINT8, {1, 2, 4, 1}, kMin, kMax}); + m.SetInput<uint8_t>({ 0, -6, 2, 4, // -4, -2, 8, 1, // }); m.Invoke(); - EXPECT_THAT(m.GetDequantizedOutput(), + EXPECT_THAT(m.GetDequantizedOutput<uint8_t>(), ElementsAreArray(ArrayFloatNear( { 0.0, -0.999987, 0.964027, 0.999329, // - -0.996078, -0.96402, 0.99999, 0.76159, // + -0.999329, -0.96402, 0.99999, 0.76159, // }, - 4 * (1. 
/ 256)))); - EXPECT_THAT(m.GetOutput(), - ElementsAreArray({128, 0, 251, 255, 0, 5, 255, 226})); + kQuantizedTolerance))); + EXPECT_THAT(m.GetOutput<uint8_t>(), + ElementsAreArray({128, 0, 251, 255, 0, 5, 255, 225})); +} + +TEST(QuantizedActivationsOpTest, TanhInt16) { + const float kMin = -1; + const float kMax = 32767.f / 32768.f; + QuantizedActivationsOpModel m( + BuiltinOperator_TANH, + /*input=*/{TensorType_INT16, {1, 2, 4, 1}, 8 * kMin, 8 * kMax}, + /*output=*/{TensorType_INT16, {1, 2, 4, 1}, kMin, kMax}); + m.SetInput<int16_t>({ + 0, -6, 2, 4, // + -4, -2, 8, 1, // + }); + m.Invoke(); + EXPECT_THAT(m.GetDequantizedOutput<int16_t>(), + ElementsAreArray(ArrayFloatNear( + { + 0.0, -0.999987, 0.964027, 0.999329, // + -0.999329, -0.96402, 0.99999, 0.76159, // + }, + kQuantizedToleranceInt16))); } TEST(FloatActivationsOpTest, Sigmoid) { @@ -190,22 +232,43 @@ TEST(QuantizedActivationsOpTest, Sigmoid) { QuantizedActivationsOpModel m( BuiltinOperator_LOGISTIC, /*input=*/{TensorType_UINT8, {1, 2, 4, 1}, -10, 10}); - m.SetInput({ + m.SetInput<uint8_t>({ 0, -6, 2, 4, // 3, -2, 10, 1, // }); m.Invoke(); - EXPECT_THAT(m.GetDequantizedOutput(), + EXPECT_THAT(m.GetDequantizedOutput<uint8_t>(), ElementsAreArray(ArrayFloatNear( { 0.5, 0.002473, 0.880797, 0.982014, // 0.952574, 0.119203, 0.999955, 0.731059, // }, kQuantizedTolerance))); - EXPECT_THAT(m.GetOutput(), + EXPECT_THAT(m.GetOutput<uint8_t>(), ElementsAreArray({128, 1, 227, 251, 244, 32, 255, 188})); } +TEST(QuantizedActivationsOpTest, SigmoidInt16) { + const float kMin = -1; + const float kMax = 32767.f / 32768.f; + QuantizedActivationsOpModel m( + BuiltinOperator_LOGISTIC, + /*input=*/{TensorType_INT16, {1, 2, 4, 1}, 8 * kMin, 8 * kMax}, + /*output=*/{TensorType_INT16, {1, 2, 4, 1}, kMin, kMax}); + m.SetInput<int16_t>({ + 0, -6, 2, 4, // + 3, -2, 10, 1, // + }); + m.Invoke(); + EXPECT_THAT(m.GetDequantizedOutput<int16_t>(), + ElementsAreArray(ArrayFloatNear( + { + 0.5, 0.002473, 0.880797, 0.982014, // + 0.952574, 0.119203, 0.999955, 0.731059, // + }, + kQuantizedToleranceInt16))); +} + TEST(FloatActivationsOpTest, Softmax4D) { FloatActivationsOpModel m(0.1, /*input=*/{TensorType_FLOAT32, {1, 2, 1, 4}}); @@ -241,12 +304,12 @@ TEST(QuantizedActivationsOpTest, Softmax4D) { QuantizedActivationsOpModel m( 0.1, /*input=*/{TensorType_UINT8, {1, 2, 1, 4}, -10, 10}); - m.SetInput({ + m.SetInput<uint8_t>({ 0, -6, 2, 4, // depth = 0 3, -2, 10, 1, // depth = 1 }); m.Invoke(); - EXPECT_THAT(m.GetDequantizedOutput(), + EXPECT_THAT(m.GetDequantizedOutput<uint8_t>(), ElementsAreArray(ArrayFloatNear( { .23463, .12877, .28658, .35003, // @@ -258,6 +321,40 @@ TEST(QuantizedActivationsOpTest, Softmax4D) { QuantizedActivationsOpModel m2( 0.1, /*input=*/{TensorType_UINT8, {4, 1, 1, 2}, -10, 10}); + m2.SetInput<uint8_t>({ + 0, -6, // + 2, 4, // + 3, -2, // + 10, 1, // + }); + m2.Invoke(); + EXPECT_THAT(m2.GetDequantizedOutput<uint8_t>(), + ElementsAreArray(ArrayFloatNear( + { + 0.645656, 0.354344, // + 0.450166, 0.549834, // + 0.622459, 0.377541, // + 0.710949, 0.28905, // + }, + kQuantizedTolerance))); +} + +TEST(FloatActivationsOpTest, Softmax3D) { + FloatActivationsOpModel m(0.1, + /*input=*/{TensorType_FLOAT32, {1, 2, 4}}); + m.SetInput({ + 0, -6, 2, 4, // depth = 0 + 3, -2, 10, 1, // depth = 1 + }); + m.Invoke(); + EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear({ + .23463, .12877, .28658, .35003, // + .22528, .13664, .45365, .18443, // + }))); + + // Same input, but a different shape. 
+ FloatActivationsOpModel m2(0.1, + /*input=*/{TensorType_FLOAT32, {4, 1, 2}}); m2.SetInput({ 0, -6, // 2, 4, // @@ -265,14 +362,74 @@ TEST(QuantizedActivationsOpTest, Softmax4D) { 10, 1, // }); m2.Invoke(); - EXPECT_THAT(m2.GetDequantizedOutput(), ElementsAreArray(ArrayFloatNear( - { - 0.645656, 0.354344, // - 0.450166, 0.549834, // - 0.622459, 0.377541, // - 0.710949, 0.28905, // - }, - kQuantizedTolerance))); + EXPECT_THAT(m2.GetOutput(), ElementsAreArray(ArrayFloatNear({ + 0.645656, 0.354344, // + 0.450166, 0.549834, // + 0.622459, 0.377541, // + 0.710949, 0.28905, // + }))); +} + +TEST(QuantizedActivationsOpTest, Softmax3D) { + QuantizedActivationsOpModel m( + 0.1, + /*input=*/{TensorType_UINT8, {1, 2, 4}, -10, 10}); + m.SetInput<uint8_t>({ + 0, -6, 2, 4, // depth = 0 + 3, -2, 10, 1, // depth = 1 + }); + m.Invoke(); + EXPECT_THAT(m.GetDequantizedOutput<uint8_t>(), + ElementsAreArray(ArrayFloatNear( + { + .23463, .12877, .28658, .35003, // + .22528, .13664, .45365, .18443, // + }, + kQuantizedTolerance))); + + // Same input, but a different shape. + QuantizedActivationsOpModel m2( + 0.1, + /*input=*/{TensorType_UINT8, {4, 1, 2}, -10, 10}); + m2.SetInput<uint8_t>({ + 0, -6, // + 2, 4, // + 3, -2, // + 10, 1, // + }); + m2.Invoke(); + EXPECT_THAT(m2.GetDequantizedOutput<uint8_t>(), + ElementsAreArray(ArrayFloatNear( + { + 0.645656, 0.354344, // + 0.450166, 0.549834, // + 0.622459, 0.377541, // + 0.710949, 0.28905, // + }, + kQuantizedTolerance))); +} + +TEST(FloatActivationsOpTest, Softmax1D) { + FloatActivationsOpModel m(0.1, + /*input=*/{TensorType_FLOAT32, {8}}); + m.SetInput({0, -6, 2, 4, 3, -2, 10, 1}); + m.Invoke(); + EXPECT_THAT( + m.GetOutput(), + ElementsAreArray(ArrayFloatNear( + {.09752, .05352, .11911, .14548, .13164, .07984, .26509, .10778}))); +} + +TEST(QuantizedActivationsOpTest, Softmax1D) { + QuantizedActivationsOpModel m(0.1, + /*input=*/{TensorType_UINT8, {8}, -10, 10}); + m.SetInput<uint8_t>({0, -6, 2, 4, 3, -2, 10, 1}); + m.Invoke(); + EXPECT_THAT( + m.GetDequantizedOutput<uint8_t>(), + ElementsAreArray(ArrayFloatNear({0.09766, 0.05469, 0.12109, 0.14453, + 0.13281, 0.07813, 0.26563, 0.10938}, + kQuantizedTolerance))); } TEST(FloatActivationsOpTest, Softmax2D) { @@ -309,12 +466,12 @@ TEST(FloatActivationsOpTest, Softmax2D) { TEST(QuantizedActivationsOpTest, Softmax2D) { QuantizedActivationsOpModel m(0.1, /*input=*/{TensorType_UINT8, {2, 4}, -10, 10}); - m.SetInput({ + m.SetInput<uint8_t>({ 0, -6, 2, 4, // 3, -2, 10, 1, // }); m.Invoke(); - EXPECT_THAT(m.GetDequantizedOutput(), + EXPECT_THAT(m.GetDequantizedOutput<uint8_t>(), ElementsAreArray(ArrayFloatNear( { .23463, .12877, .28658, .35003, // @@ -325,21 +482,22 @@ TEST(QuantizedActivationsOpTest, Softmax2D) { // Same input, but a different shape. 
QuantizedActivationsOpModel m2(0.1, /*input=*/{TensorType_UINT8, {4, 2}, -10, 10}); - m2.SetInput({ + m2.SetInput<uint8_t>({ 0, -6, // 2, 4, // 3, -2, // 10, 1, // }); m2.Invoke(); - EXPECT_THAT(m2.GetDequantizedOutput(), ElementsAreArray(ArrayFloatNear( - { - 0.645656, 0.354344, // - 0.450166, 0.549834, // - 0.622459, 0.377541, // - 0.710949, 0.28905, // - }, - kQuantizedTolerance))); + EXPECT_THAT(m2.GetDequantizedOutput<uint8_t>(), + ElementsAreArray(ArrayFloatNear( + { + 0.645656, 0.354344, // + 0.450166, 0.549834, // + 0.622459, 0.377541, // + 0.710949, 0.28905, // + }, + kQuantizedTolerance))); } // This contains the same test values as the Softmax test, but reference answer @@ -383,6 +541,28 @@ TEST(FloatActivationsOpTest, LogSoftmax) { }))); } +TEST(QuantizedActivationsOpTest, LogSoftmax) { + const float kLogSoftmaxQuantizedTolerance = 16 / 256.0; + QuantizedActivationsOpModel m( + BuiltinOperator_LOG_SOFTMAX, + /*input=*/{TensorType_UINT8, {2, 4}, -10, 10}, + /*output=*/{TensorType_UINT8, {}, 0, 0, 16. / 256, 255}); + m.SetInput<uint8_t>({ + 0, -6, 2, 4, // + 3, -2, 10, 1, // + }); + m.Invoke(); + EXPECT_THAT(m.GetDequantizedOutput<uint8_t>(), + ElementsAreArray(ArrayFloatNear( + { + -4.14297, -10.14297, -2.14297, -.142971, // + -7.00104, -12.00104, -.00104087, -9.00104, // + }, + kLogSoftmaxQuantizedTolerance))); + EXPECT_THAT(m.GetOutput<uint8_t>(), + ElementsAreArray({189, 93, 221, 253, 142, 63, 255, 111})); +} + class PReluOpModel : public SingleOpModel { public: PReluOpModel(const TensorData& input, const TensorData& alpha) { diff --git a/tensorflow/contrib/lite/kernels/add.cc b/tensorflow/contrib/lite/kernels/add.cc index 7ca1e35489..b4393e8097 100644 --- a/tensorflow/contrib/lite/kernels/add.cc +++ b/tensorflow/contrib/lite/kernels/add.cc @@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#include "tensorflow/contrib/lite/builtin_op_data.h" -#include "tensorflow/contrib/lite/context.h" +#include "tensorflow/contrib/lite/c/builtin_op_data.h" +#include "tensorflow/contrib/lite/c/c_api_internal.h" #include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h" #include "tensorflow/contrib/lite/kernels/internal/quantization_util.h" #include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h" @@ -39,6 +39,23 @@ constexpr int kOutputTensor = 0; struct OpData { bool requires_broadcast; + + // These fields are used in both the general 8-bit -> 8bit quantized path, + // and the special 16-bit -> 16bit quantized path + int input1_shift; + int input2_shift; + int32 output_activation_min; + int32 output_activation_max; + + // These fields are used only in the general 8-bit -> 8bit quantized path + int32 input1_multiplier; + int32 input2_multiplier; + int32 output_multiplier; + int output_shift; + int left_shift; + int32 input1_offset; + int32 input2_offset; + int32 output_offset; }; void* Init(TfLiteContext* context, const char* buffer, size_t length) { @@ -52,6 +69,7 @@ void Free(TfLiteContext* context, void* buffer) { } TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { + auto* params = reinterpret_cast<TfLiteAddParams*>(node->builtin_data); OpData* data = reinterpret_cast<OpData*>(node->user_data); TF_LITE_ENSURE_EQ(context, NumInputs(node), 2); @@ -74,89 +92,182 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { output_size = TfLiteIntArrayCopy(input1->dims); } + if (output->type == kTfLiteUInt8) { + // 8bit -> 8bit general quantized path, with general rescalings + data->input1_offset = -input1->params.zero_point; + data->input2_offset = -input2->params.zero_point; + data->output_offset = output->params.zero_point; + data->left_shift = 20; + const double twice_max_input_scale = + 2 * std::max(input1->params.scale, input2->params.scale); + const double real_input1_multiplier = + input1->params.scale / twice_max_input_scale; + const double real_input2_multiplier = + input2->params.scale / twice_max_input_scale; + const double real_output_multiplier = + twice_max_input_scale / + ((1 << data->left_shift) * output->params.scale); + + QuantizeMultiplierSmallerThanOneExp( + real_input1_multiplier, &data->input1_multiplier, &data->input1_shift); + + QuantizeMultiplierSmallerThanOneExp( + real_input2_multiplier, &data->input2_multiplier, &data->input2_shift); + + QuantizeMultiplierSmallerThanOneExp( + real_output_multiplier, &data->output_multiplier, &data->output_shift); + + CalculateActivationRangeUint8(params->activation, output, + &data->output_activation_min, + &data->output_activation_max); + + } else if (output->type == kTfLiteInt16) { + // 16bit -> 16bit special quantized path, supporting only a rather + // narrow case of quantization parameters: zero_points must all be 0 + // ("symmetric quantization") and scales must be power-of-two (which + // we abbreviate as "POT" below). The intended use case for this path + // is in LSTM cells, where, due to the constraints of implementing + // some of the math in these LSTM cells in fixed-point arithmetic, + // we need to have such symmetric, power-of-two quantization + // (Fixed-point formats are inherently symmetric, power-of-two). 
+ TF_LITE_ENSURE_EQ(context, input1->params.zero_point, 0); + TF_LITE_ENSURE_EQ(context, input2->params.zero_point, 0); + TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0); + + int input1_scale_log2_rounded; + bool input1_scale_is_pot = + CheckedLog2(input1->params.scale, &input1_scale_log2_rounded); + TF_LITE_ENSURE(context, input1_scale_is_pot); + + int input2_scale_log2_rounded; + bool input2_scale_is_pot = + CheckedLog2(input2->params.scale, &input2_scale_log2_rounded); + TF_LITE_ENSURE(context, input2_scale_is_pot); + + int output_scale_log2_rounded; + bool output_scale_is_pot = + CheckedLog2(output->params.scale, &output_scale_log2_rounded); + TF_LITE_ENSURE(context, output_scale_is_pot); + + data->input1_shift = input1_scale_log2_rounded - output_scale_log2_rounded; + data->input2_shift = input2_scale_log2_rounded - output_scale_log2_rounded; + + // Shifting of one input is supported. The graph quantization should ensure + // that the other input matches the output. + TF_LITE_ENSURE(context, data->input1_shift == 0 || data->input2_shift == 0); + TF_LITE_ENSURE(context, data->input1_shift <= 0); + TF_LITE_ENSURE(context, data->input2_shift <= 0); + + CalculateActivationRangeQuantized(context, params->activation, output, + &data->output_activation_min, + &data->output_activation_max); + } + return context->ResizeTensor(context, output, output_size); } template <KernelType kernel_type> -void EvalAddFloat(TfLiteContext* context, TfLiteNode* node, - TfLiteAddParams* params, const OpData* data, - const TfLiteTensor* input1, const TfLiteTensor* input2, - TfLiteTensor* output) { - float output_activation_min, output_activation_max; - CalculateActivationRangeFloat(params->activation, &output_activation_min, - &output_activation_max); -#define TF_LITE_ADD(type, opname) \ - type::opname(GetTensorData<float>(input1), GetTensorDims(input1), \ - GetTensorData<float>(input2), GetTensorDims(input2), \ - output_activation_min, output_activation_max, \ - GetTensorData<float>(output), GetTensorDims(output)) - if (kernel_type == kReference) { - if (data->requires_broadcast) { - TF_LITE_ADD(reference_ops, BroadcastAdd); +void EvalAdd(TfLiteContext* context, TfLiteNode* node, TfLiteAddParams* params, + const OpData* data, const TfLiteTensor* input1, + const TfLiteTensor* input2, TfLiteTensor* output) { +#define TF_LITE_ADD(type, opname, data_type) \ + data_type output_activation_min, output_activation_max; \ + CalculateActivationRange(params->activation, &output_activation_min, \ + &output_activation_max); \ + tflite::ArithmeticParams op_params; \ + SetActivationParams(output_activation_min, output_activation_max, \ + &op_params); \ + type::opname(op_params, GetTensorShape(input1), \ + GetTensorData<data_type>(input1), GetTensorShape(input2), \ + GetTensorData<data_type>(input2), GetTensorShape(output), \ + GetTensorData<data_type>(output)) + if (output->type == kTfLiteInt32) { + if (kernel_type == kReference) { + if (data->requires_broadcast) { + TF_LITE_ADD(reference_ops, BroadcastAdd4DSlow, int32_t); + } else { + TF_LITE_ADD(reference_ops, Add, int32_t); + } } else { - TF_LITE_ADD(reference_ops, Add); + if (data->requires_broadcast) { + TF_LITE_ADD(optimized_ops, BroadcastAdd4DSlow, int32_t); + } else { + TF_LITE_ADD(optimized_ops, Add, int32_t); + } } - } else { - if (data->requires_broadcast) { - TF_LITE_ADD(optimized_ops, BroadcastAdd); + } else if (output->type == kTfLiteFloat32) { + if (kernel_type == kReference) { + if (data->requires_broadcast) { + TF_LITE_ADD(reference_ops, 
BroadcastAdd4DSlow, float); + } else { + TF_LITE_ADD(reference_ops, Add, float); + } } else { - TF_LITE_ADD(optimized_ops, Add); + if (data->requires_broadcast) { + TF_LITE_ADD(optimized_ops, BroadcastAdd4DSlow, float); + } else { + TF_LITE_ADD(optimized_ops, Add, float); + } } } #undef TF_LITE_ADD } template <KernelType kernel_type> -void EvalAddQuantized(TfLiteContext* context, TfLiteNode* node, - TfLiteAddParams* params, const OpData* data, - const TfLiteTensor* input1, const TfLiteTensor* input2, - TfLiteTensor* output) { - auto input1_offset = -input1->params.zero_point; - auto input2_offset = -input2->params.zero_point; - auto output_offset = output->params.zero_point; - const int left_shift = 20; - const double twice_max_input_scale = - 2 * std::max(input1->params.scale, input2->params.scale); - const double real_input1_multiplier = - input1->params.scale / twice_max_input_scale; - const double real_input2_multiplier = - input2->params.scale / twice_max_input_scale; - const double real_output_multiplier = - twice_max_input_scale / ((1 << left_shift) * output->params.scale); - - int32 input1_multiplier; - int input1_shift; - QuantizeMultiplierSmallerThanOne(real_input1_multiplier, &input1_multiplier, - &input1_shift); - int32 input2_multiplier; - int input2_shift; - QuantizeMultiplierSmallerThanOne(real_input2_multiplier, &input2_multiplier, - &input2_shift); - int32 output_multiplier; - int output_shift; - QuantizeMultiplierSmallerThanOne(real_output_multiplier, &output_multiplier, - &output_shift); - - int32 output_activation_min, output_activation_max; - CalculateActivationRangeUint8(params->activation, output, - &output_activation_min, &output_activation_max); - -#define TF_LITE_ADD(type, opname) \ - type::opname(left_shift, GetTensorData<uint8_t>(input1), \ - GetTensorDims(input1), input1_offset, input1_multiplier, \ - input1_shift, GetTensorData<uint8_t>(input2), \ - GetTensorDims(input2), input2_offset, input2_multiplier, \ - input2_shift, output_offset, output_multiplier, output_shift, \ - output_activation_min, output_activation_max, \ - GetTensorData<uint8_t>(output), GetTensorDims(output)); - // The quantized version of Add doesn't support activations, so we - // always use BroadcastAdd. 
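To make the rescaling arithmetic above concrete, here is a small self-contained sketch with made-up example scales; it reproduces the real-valued multipliers that the kernel later converts to fixed point with QuantizeMultiplierSmallerThanOneExp (illustrative only):

#include <algorithm>
#include <cstdio>

int main() {
  const double input1_scale = 0.02, input2_scale = 0.05, output_scale = 0.1;
  const int left_shift = 20;  // the same headroom constant used by the kernel

  const double twice_max_input_scale = 2 * std::max(input1_scale, input2_scale);  // 0.1
  const double real_input1_multiplier = input1_scale / twice_max_input_scale;     // 0.2
  const double real_input2_multiplier = input2_scale / twice_max_input_scale;     // 0.5
  const double real_output_multiplier =
      twice_max_input_scale / ((1 << left_shift) * output_scale);                 // ~9.54e-7

  // Each value is < 1, so it can be stored as a 32-bit fixed-point multiplier
  // plus a right shift, which is what the OpData fields hold after Prepare.
  std::printf("%g %g %g\n", real_input1_multiplier, real_input2_multiplier,
              real_output_multiplier);
  return 0;
}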
- if (kernel_type == kReference) { - TF_LITE_ADD(reference_ops, BroadcastAdd); - } else { - TF_LITE_ADD(optimized_ops, BroadcastAdd); - } +TfLiteStatus EvalAddQuantized(TfLiteContext* context, TfLiteNode* node, + TfLiteAddParams* params, const OpData* data, + const TfLiteTensor* input1, + const TfLiteTensor* input2, + TfLiteTensor* output) { + if (output->type == kTfLiteUInt8) { +#define TF_LITE_ADD(type, opname) \ + tflite::ArithmeticParams op_params; \ + op_params.left_shift = data->left_shift; \ + op_params.input1_offset = data->input1_offset; \ + op_params.input1_multiplier = data->input1_multiplier; \ + op_params.input1_shift = data->input1_shift; \ + op_params.input2_offset = data->input2_offset; \ + op_params.input2_multiplier = data->input2_multiplier; \ + op_params.input2_shift = data->input2_shift; \ + op_params.output_offset = data->output_offset; \ + op_params.output_multiplier = data->output_multiplier; \ + op_params.output_shift = data->output_shift; \ + SetActivationParams(data->output_activation_min, \ + data->output_activation_max, &op_params); \ + type::opname(op_params, GetTensorShape(input1), \ + GetTensorData<uint8_t>(input1), GetTensorShape(input2), \ + GetTensorData<uint8_t>(input2), GetTensorShape(output), \ + GetTensorData<uint8_t>(output)) + // The quantized version of Add doesn't support activations, so we + // always use BroadcastAdd. + if (kernel_type == kReference) { + TF_LITE_ADD(reference_ops, BroadcastAdd4DSlow); + } else { + TF_LITE_ADD(optimized_ops, BroadcastAdd4DSlow); + } #undef TF_LITE_ADD + } else if (output->type == kTfLiteInt16) { +#define TF_LITE_ADD(type, opname) \ + tflite::ArithmeticParams op_params; \ + op_params.input1_shift = data->input1_shift; \ + op_params.input2_shift = data->input2_shift; \ + SetActivationParams(data->output_activation_min, \ + data->output_activation_max, &op_params); \ + type::opname(op_params, GetTensorShape(input1), \ + GetTensorData<int16_t>(input1), GetTensorShape(input2), \ + GetTensorData<int16_t>(input2), GetTensorShape(output), \ + GetTensorData<int16_t>(output)) + // The quantized version of Add doesn't support activations, so we + // always use BroadcastAdd. 
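A minimal sketch of the idea behind the int16 case above, under the simplifying assumption of a plain element-wise loop (the actual reference_ops/optimized_ops kernels use saturating fixed-point primitives; the function below is made up for illustration). With power-of-two scales, converting an operand into output units is just a right shift by the non-positive shift computed in Prepare:

#include <algorithm>
#include <cstdint>

// Illustrative only: add two int16 values whose scales are powers of two.
// input1_shift/input2_shift are the non-positive log2 scale differences from
// Prepare; act_min/act_max are the quantized activation bounds in output units.
int16_t PotAddOneElement(int16_t a, int16_t b, int input1_shift,
                         int input2_shift, int32_t act_min, int32_t act_max) {
  const int32_t a_rescaled = static_cast<int32_t>(a) >> (-input1_shift);
  const int32_t b_rescaled = static_cast<int32_t>(b) >> (-input2_shift);
  const int32_t sum = a_rescaled + b_rescaled;
  return static_cast<int16_t>(std::min(act_max, std::max(act_min, sum)));
}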
+ if (kernel_type == kReference) { + TF_LITE_ADD(reference_ops, Add); + } else { + TF_LITE_ADD(optimized_ops, Add); + } +#undef TF_LITE_ADD + } + + return kTfLiteOk; } template <KernelType kernel_type> @@ -168,15 +279,15 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { const TfLiteTensor* input2 = GetInput(context, node, kInputTensor2); TfLiteTensor* output = GetOutput(context, node, kOutputTensor); - if (output->type == kTfLiteFloat32) { - EvalAddFloat<kernel_type>(context, node, params, data, input1, input2, - output); - } else if (output->type == kTfLiteUInt8) { - EvalAddQuantized<kernel_type>(context, node, params, data, input1, input2, - output); + if (output->type == kTfLiteFloat32 || output->type == kTfLiteInt32) { + EvalAdd<kernel_type>(context, node, params, data, input1, input2, output); + } else if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt16) { + TF_LITE_ENSURE_OK(context, + EvalAddQuantized<kernel_type>(context, node, params, data, + input1, input2, output)); } else { context->ReportError(context, - "Inputs and outputs not all float|uint8 types."); + "Inputs and outputs not all float|uint8|int16 types."); return kTfLiteError; } diff --git a/tensorflow/contrib/lite/kernels/add_test.cc b/tensorflow/contrib/lite/kernels/add_test.cc index 956d05bed5..0b58443211 100644 --- a/tensorflow/contrib/lite/kernels/add_test.cc +++ b/tensorflow/contrib/lite/kernels/add_test.cc @@ -52,6 +52,13 @@ class FloatAddOpModel : public BaseAddOpModel { std::vector<float> GetOutput() { return ExtractVector<float>(output_); } }; +class IntegerAddOpModel : public BaseAddOpModel { + public: + using BaseAddOpModel::BaseAddOpModel; + + std::vector<int32_t> GetOutput() { return ExtractVector<int32_t>(output_); } +}; + class QuantizedAddOpModel : public BaseAddOpModel { public: using BaseAddOpModel::BaseAddOpModel; @@ -60,15 +67,26 @@ class QuantizedAddOpModel : public BaseAddOpModel { return Dequantize<uint8_t>(ExtractVector<uint8_t>(output_), GetScale(output_), GetZeroPoint(output_)); } + + std::vector<float> GetDequantizedOutputInt16() { + return Dequantize<int16_t>(ExtractVector<int16_t>(output_), + GetScale(output_), GetZeroPoint(output_)); + } }; // for quantized Add, the error shouldn't exceed 2*step -float GetTolerance(int min, int max) { +float GetTolerance(float min, float max) { float kQuantizedStep = (max - min) / 255.0; float kQuantizedTolerance = 2.0 * kQuantizedStep; return kQuantizedTolerance; } +float GetToleranceInt16(float min, float max) { + float kQuantizedStep = (max - min) / 32767.f; + float kQuantizedTolerance = 2.0 * kQuantizedStep; + return kQuantizedTolerance; +} + TEST(FloatAddOpModel, NoActivation) { FloatAddOpModel m({TensorType_FLOAT32, {1, 2, 2, 1}}, {TensorType_FLOAT32, {1, 2, 2, 1}}, @@ -122,6 +140,57 @@ TEST(FloatAddOpModel, WithBroadcast) { } } +TEST(IntegerAddOpModel, NoActivation) { + IntegerAddOpModel m({TensorType_INT32, {1, 2, 2, 1}}, + {TensorType_INT32, {1, 2, 2, 1}}, {TensorType_INT32, {}}, + ActivationFunctionType_NONE); + m.PopulateTensor<int32_t>(m.input1(), {-20, 2, 7, 8}); + m.PopulateTensor<int32_t>(m.input2(), {1, 2, 3, 5}); + m.Invoke(); + EXPECT_THAT(m.GetOutput(), ElementsAreArray({-19, 4, 10, 13})); +} + +TEST(IntegerAddOpModel, ActivationRELU_N1_TO_1) { + IntegerAddOpModel m({TensorType_INT32, {1, 2, 2, 1}}, + {TensorType_INT32, {1, 2, 2, 1}}, {TensorType_INT32, {}}, + ActivationFunctionType_RELU_N1_TO_1); + m.PopulateTensor<int32_t>(m.input1(), {-20, 2, 7, 8}); + m.PopulateTensor<int32_t>(m.input2(), {1, 2, 3, 5}); + 
m.Invoke(); + EXPECT_THAT(m.GetOutput(), ElementsAreArray({-1, 1, 1, 1})); +} + +TEST(IntegerAddOpModel, VariousInputShapes) { + std::vector<std::initializer_list<int>> test_shapes = { + {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}}; + for (int i = 0; i < test_shapes.size(); ++i) { + IntegerAddOpModel m({TensorType_INT32, test_shapes[i]}, + {TensorType_INT32, test_shapes[i]}, + {TensorType_INT32, {}}, ActivationFunctionType_NONE); + m.PopulateTensor<int32_t>(m.input1(), {-20, 2, 7, 8, 11, 20}); + m.PopulateTensor<int32_t>(m.input2(), {1, 2, 3, 5, 11, 1}); + m.Invoke(); + EXPECT_THAT(m.GetOutput(), ElementsAreArray({-19, 04, 10, 13, 22, 21})) + << "With shape number " << i; + } +} + +TEST(IntegerAddOpModel, WithBroadcast) { + std::vector<std::initializer_list<int>> test_shapes = { + {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}}; + for (int i = 0; i < test_shapes.size(); ++i) { + IntegerAddOpModel m({TensorType_INT32, test_shapes[i]}, + {TensorType_INT32, {}}, // always a scalar + {TensorType_INT32, {}}, ActivationFunctionType_NONE); + m.PopulateTensor<int32_t>(m.input1(), {-20, 2, 7, 8, 11, 20}); + m.PopulateTensor<int32_t>(m.input2(), {1}); + m.Invoke(); + EXPECT_THAT(m.GetOutput(), + ElementsAreArray(ArrayFloatNear({-19, 3, 8, 9, 12, 21}))) + << "With shape number " << i; + } +} + TEST(QuantizedAddOpModel, QuantizedTestsNoActivation) { float kQuantizedTolerance = GetTolerance(-1.0, 1.0); std::vector<std::initializer_list<float>> inputs1 = { @@ -144,6 +213,31 @@ TEST(QuantizedAddOpModel, QuantizedTestsNoActivation) { } } +TEST(QuantizedAddOpModel, QuantizedTestsNoActivationInt16) { + const float kMin = -1.f; + const float kMax = 32767.f / 32768.f; + float kQuantizedTolerance = GetToleranceInt16(kMin, kMax); + std::vector<std::initializer_list<float>> inputs1 = { + {0.1, 0.2, 0.3, 0.4}, {-0.8, 0.2, 0.4, 0.7}, {-0.8, 0.2, 0.7, 0.3}}; + std::vector<std::initializer_list<float>> inputs2 = { + {0.6, 0.4, 0.3, 0.1}, {0.6, 0.4, 0.5, -0.8}, {0.6, 0.4, -0.8, 0.5}}; + std::vector<std::initializer_list<float>> results = { + {0.7, 0.6, 0.6, 0.5}, {-0.2, 0.6, 0.9, -0.1}, {-0.2, 0.6, -0.1, 0.8}}; + for (int i = 0; i < inputs1.size(); ++i) { + QuantizedAddOpModel m({TensorType_INT16, {1, 2, 2, 1}, kMin, kMax}, + {TensorType_INT16, {1, 2, 2, 1}, kMin, kMax}, + {TensorType_INT16, {}, kMin, kMax}, + ActivationFunctionType_NONE); + m.QuantizeAndPopulate<int16_t>(m.input1(), inputs1[i]); + m.QuantizeAndPopulate<int16_t>(m.input2(), inputs2[i]); + m.Invoke(); + EXPECT_THAT( + m.GetDequantizedOutputInt16(), + ElementsAreArray(ArrayFloatNear(results[i], kQuantizedTolerance))) + << "With test number " << i; + } +} + TEST(QuantizedAddOpModel, QuantizedTestsActivationRELU_N1_TO_1) { float kQuantizedTolerance = GetTolerance(-1.0, 1.0); std::vector<std::initializer_list<float>> inputs1 = {{-0.8, 0.2, 0.9, 0.7}, diff --git a/tensorflow/contrib/lite/kernels/arg_max.cc b/tensorflow/contrib/lite/kernels/arg_min_max.cc index 738d475f60..b91e348c27 100644 --- a/tensorflow/contrib/lite/kernels/arg_max.cc +++ b/tensorflow/contrib/lite/kernels/arg_min_max.cc @@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#include "tensorflow/contrib/lite/builtin_op_data.h" -#include "tensorflow/contrib/lite/context.h" +#include "tensorflow/contrib/lite/c/builtin_op_data.h" +#include "tensorflow/contrib/lite/c/c_api_internal.h" #include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h" #include "tensorflow/contrib/lite/kernels/internal/quantization_util.h" #include "tensorflow/contrib/lite/kernels/internal/tensor.h" @@ -23,7 +23,7 @@ limitations under the License. namespace tflite { namespace ops { namespace builtin { -namespace arg_max { +namespace arg_min_max { constexpr int kInputTensor = 0; constexpr int kAxis = 1; @@ -52,7 +52,8 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { output->type = kTfLiteInt64; break; default: - context->ReportError(context, "Unknown index output data type"); + context->ReportError(context, "Unknown index output data type: %d", + params->output_type); return kTfLiteError; } @@ -64,7 +65,10 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { break; default: - context->ReportError(context, "Only float32 and int types are supported"); + context->ReportError( + context, + "Unknown input type: %d, only float32 and int types are supported", + input->type); return kTfLiteError; } @@ -76,30 +80,40 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { return context->ResizeTensor(context, output, output_size); } +template <typename T> +std::function<bool(T, T)> GetComparefunction(bool is_arg_max) { + if (is_arg_max) { + return std::greater<T>(); + } else { + return std::less<T>(); + } +} + // The current impl actually ignores the axis argument. // Only determine the index of the maximum value in the last dimension.
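A simplified, self-contained sketch of how a comparator returned by GetComparefunction drives the reduction over the last dimension (illustrative only, not the optimized_ops::ArgMinMax implementation): std::greater selects arg-max and std::less selects arg-min.

#include <functional>
#include <vector>

// Scan one row (the last dimension) and return the index preferred by `cmp`.
template <typename T>
int ArgReduceLastDim(const std::vector<T>& row, std::function<bool(T, T)> cmp) {
  int best_index = 0;
  for (int i = 1; i < static_cast<int>(row.size()); ++i) {
    if (cmp(row[i], row[best_index])) best_index = i;
  }
  return best_index;
}

// ArgReduceLastDim<float>({0.1f, 0.9f, 0.7f, 0.3f}, std::greater<float>()) == 1
// ArgReduceLastDim<float>({0.1f, 0.9f, 0.7f, 0.3f}, std::less<float>())    == 0
// which matches the expectations in arg_min_max_test.cc below.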
-TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { +TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node, bool is_arg_max) { const TfLiteTensor* input = GetInput(context, node, kInputTensor); const TfLiteTensor* axis = GetInput(context, node, kAxis); TfLiteTensor* output = GetOutput(context, node, kOutputTensor); -#define TF_LITE_ARG_MAX(data_type, axis_type, output_type) \ - optimized_ops::ArgMax(GetTensorData<axis_type>(axis), \ - GetTensorData<data_type>(input), GetTensorDims(input), \ - GetTensorData<output_type>(output), \ - GetTensorDims(output)) +#define TF_LITE_ARG_MIN_MAX(data_type, axis_type, output_type) \ + optimized_ops::ArgMinMax( \ + GetTensorShape(input), GetTensorData<data_type>(input), \ + GetTensorData<axis_type>(axis), GetTensorShape(output), \ + GetTensorData<output_type>(output), \ + GetComparefunction<data_type>(is_arg_max)) if (axis->type == kTfLiteInt32) { switch (output->type) { case kTfLiteInt32: { switch (input->type) { case kTfLiteFloat32: - TF_LITE_ARG_MAX(float, int32_t, int32_t); + TF_LITE_ARG_MIN_MAX(float, int32_t, int32_t); break; case kTfLiteUInt8: - TF_LITE_ARG_MAX(uint8_t, int32_t, int32_t); + TF_LITE_ARG_MIN_MAX(uint8_t, int32_t, int32_t); break; case kTfLiteInt32: - TF_LITE_ARG_MAX(int32_t, int32_t, int32_t); + TF_LITE_ARG_MIN_MAX(int32_t, int32_t, int32_t); break; default: return kTfLiteError; @@ -108,13 +122,13 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { case kTfLiteInt64: { switch (input->type) { case kTfLiteFloat32: - TF_LITE_ARG_MAX(float, int32_t, int64_t); + TF_LITE_ARG_MIN_MAX(float, int32_t, int64_t); break; case kTfLiteUInt8: - TF_LITE_ARG_MAX(uint8_t, int32_t, int64_t); + TF_LITE_ARG_MIN_MAX(uint8_t, int32_t, int64_t); break; case kTfLiteInt32: - TF_LITE_ARG_MAX(int32_t, int32_t, int64_t); + TF_LITE_ARG_MIN_MAX(int32_t, int32_t, int64_t); break; default: return kTfLiteError; @@ -128,13 +142,13 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { case kTfLiteInt32: { switch (input->type) { case kTfLiteFloat32: - TF_LITE_ARG_MAX(float, int64_t, int32_t); + TF_LITE_ARG_MIN_MAX(float, int64_t, int32_t); break; case kTfLiteUInt8: - TF_LITE_ARG_MAX(uint8_t, int64_t, int32_t); + TF_LITE_ARG_MIN_MAX(uint8_t, int64_t, int32_t); break; case kTfLiteInt32: - TF_LITE_ARG_MAX(int32_t, int64_t, int32_t); + TF_LITE_ARG_MIN_MAX(int32_t, int64_t, int32_t); break; default: return kTfLiteError; @@ -143,13 +157,13 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { case kTfLiteInt64: { switch (input->type) { case kTfLiteFloat32: - TF_LITE_ARG_MAX(float, int64_t, int64_t); + TF_LITE_ARG_MIN_MAX(float, int64_t, int64_t); break; case kTfLiteUInt8: - TF_LITE_ARG_MAX(uint8_t, int64_t, int64_t); + TF_LITE_ARG_MIN_MAX(uint8_t, int64_t, int64_t); break; case kTfLiteInt32: - TF_LITE_ARG_MAX(int32_t, int64_t, int64_t); + TF_LITE_ARG_MIN_MAX(int32_t, int64_t, int64_t); break; default: return kTfLiteError; @@ -159,16 +173,30 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { return kTfLiteError; } } -#undef TF_LITE_ARG_MAX +#undef TF_LITE_ARG_MIN_MAX return kTfLiteOk; } -} // namespace arg_max +TfLiteStatus ArgMinEval(TfLiteContext* context, TfLiteNode* node) { + return Eval(context, node, false); +} + +TfLiteStatus ArgMaxEval(TfLiteContext* context, TfLiteNode* node) { + return Eval(context, node, true); +} + +} // namespace arg_min_max TfLiteRegistration* Register_ARG_MAX() { - static TfLiteRegistration r = {nullptr, nullptr, arg_max::Prepare, - arg_max::Eval}; + static TfLiteRegistration r = 
{nullptr, nullptr, arg_min_max::Prepare, + arg_min_max::ArgMaxEval}; + return &r; +} + +TfLiteRegistration* Register_ARG_MIN() { + static TfLiteRegistration r = {nullptr, nullptr, arg_min_max::Prepare, + arg_min_max::ArgMinEval}; return &r; } diff --git a/tensorflow/contrib/lite/kernels/arg_max_test.cc b/tensorflow/contrib/lite/kernels/arg_min_max_test.cc index 31b15fe19a..90e5fdc532 100644 --- a/tensorflow/contrib/lite/kernels/arg_max_test.cc +++ b/tensorflow/contrib/lite/kernels/arg_min_max_test.cc @@ -24,16 +24,13 @@ namespace { using ::testing::ElementsAreArray; template <typename T> -class ArgMaxOpModel : public SingleOpModel { +class ArgBaseOpModel : public SingleOpModel { public: - ArgMaxOpModel(std::initializer_list<int> input_shape, TensorType input_type, - TensorType output_type, TensorType index_output_type) { + ArgBaseOpModel(std::initializer_list<int> input_shape, TensorType input_type, + TensorType output_type, TensorType index_output_type) { input_ = AddInput(input_type); axis_ = AddInput(TensorType_INT32); output_ = AddOutput(output_type); - SetBuiltinOp(BuiltinOperator_ARG_MAX, BuiltinOptions_ArgMaxOptions, - CreateArgMaxOptions(builder_, index_output_type).Union()); - BuildInterpreter({input_shape, {1, 1, 1, 1}}); } int input() { return input_; } @@ -42,12 +39,42 @@ class ArgMaxOpModel : public SingleOpModel { std::vector<T> GetOutput() { return ExtractVector<T>(output_); } std::vector<int> GetOutputShape() { return GetTensorShape(output_); } - private: + protected: int input_; int axis_; int output_; }; +template <typename T> +class ArgMaxOpModel : public ArgBaseOpModel<T> { + public: + ArgMaxOpModel(std::initializer_list<int> input_shape, TensorType input_type, + TensorType output_type, TensorType index_output_type) + : ArgBaseOpModel<T>(input_shape, input_type, output_type, + index_output_type) { + ArgBaseOpModel<T>::SetBuiltinOp( + BuiltinOperator_ARG_MAX, BuiltinOptions_ArgMaxOptions, + CreateArgMaxOptions(ArgBaseOpModel<T>::builder_, index_output_type) + .Union()); + ArgBaseOpModel<T>::BuildInterpreter({input_shape, {1, 1, 1, 1}}); + } +}; + +template <typename T> +class ArgMinOpModel : public ArgBaseOpModel<T> { + public: + ArgMinOpModel(std::initializer_list<int> input_shape, TensorType input_type, + TensorType output_type, TensorType index_output_type) + : ArgBaseOpModel<T>(input_shape, input_type, output_type, + index_output_type) { + ArgBaseOpModel<T>::SetBuiltinOp( + BuiltinOperator_ARG_MIN, BuiltinOptions_ArgMinOptions, + CreateArgMinOptions(ArgBaseOpModel<T>::builder_, index_output_type) + .Union()); + ArgBaseOpModel<T>::BuildInterpreter({input_shape, {1, 1, 1, 1}}); + } +}; + TEST(ArgMaxOpTest, GetMaxArgFloat) { ArgMaxOpModel<int32_t> model({1, 1, 1, 4}, TensorType_FLOAT32, TensorType_INT32, TensorType_INT32); @@ -96,6 +123,54 @@ TEST(ArgMaxOpTest, GetMaxArgOutput64) { EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 2, 1})); } +TEST(ArgMinOpTest, GetMinArgFloat) { + ArgMinOpModel<int32_t> model({1, 1, 1, 4}, TensorType_FLOAT32, + TensorType_INT32, TensorType_INT32); + model.PopulateTensor<float>(model.input(), {0.1, 0.9, 0.7, 0.3}); + // Currently only support the last dimension. 
+ model.PopulateTensor<int>(model.axis(), {3}); + model.Invoke(); + + EXPECT_THAT(model.GetOutput(), ElementsAreArray({0})); + EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1, 1})); +} + +TEST(ArgMinOpTest, GetMinArgInt) { + ArgMinOpModel<int32_t> model({1, 1, 1, 4}, TensorType_INT32, TensorType_INT32, + TensorType_INT32); + model.PopulateTensor<int>(model.input(), {1, 9, 7, 3}); + // Currently only support the last dimension. + model.PopulateTensor<int>(model.axis(), {3}); + model.Invoke(); + + EXPECT_THAT(model.GetOutput(), ElementsAreArray({0})); + EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1, 1})); +} + +TEST(ArgMinOpTest, GetMinArgMulDimensions) { + ArgMinOpModel<int32_t> model({1, 1, 2, 4}, TensorType_INT32, TensorType_INT32, + TensorType_INT32); + model.PopulateTensor<int>(model.input(), {1, 2, 7, 8, 1, 9, 7, 3}); + // Currently only support the last dimension. + model.PopulateTensor<int>(model.axis(), {3}); + model.Invoke(); + + EXPECT_THAT(model.GetOutput(), ElementsAreArray({0, 0})); + EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 2, 1})); +} + +TEST(ArgMinOpTest, GetMinArgOutput64) { + ArgMinOpModel<int64_t> model({1, 1, 2, 4}, TensorType_INT32, TensorType_INT64, + TensorType_INT64); + model.PopulateTensor<int>(model.input(), {10, 2, 7, 8, 1, 9, 7, 3}); + // Currently only support the last dimension. + model.PopulateTensor<int>(model.axis(), {3}); + model.Invoke(); + + EXPECT_THAT(model.GetOutput(), ElementsAreArray({1, 0})); + EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 2, 1})); +} + } // namespace } // namespace tflite diff --git a/tensorflow/contrib/lite/kernels/audio_spectrogram.cc b/tensorflow/contrib/lite/kernels/audio_spectrogram.cc index 91d8dd3fa7..0d2d5e775f 100644 --- a/tensorflow/contrib/lite/kernels/audio_spectrogram.cc +++ b/tensorflow/contrib/lite/kernels/audio_spectrogram.cc @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/contrib/lite/builtin_op_data.h" -#include "tensorflow/contrib/lite/context.h" +#include "tensorflow/contrib/lite/c/builtin_op_data.h" +#include "tensorflow/contrib/lite/c/c_api_internal.h" #include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h" #include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h" #include "tensorflow/contrib/lite/kernels/internal/spectrogram.h" @@ -22,7 +22,7 @@ limitations under the License. #include "tensorflow/contrib/lite/kernels/kernel_util.h" #include "tensorflow/contrib/lite/kernels/op_macros.h" -#include "flatbuffers/flexbuffers.h" +#include "flatbuffers/flexbuffers.h" // TF:flatbuffers namespace tflite { namespace ops { diff --git a/tensorflow/contrib/lite/kernels/audio_spectrogram_test.cc b/tensorflow/contrib/lite/kernels/audio_spectrogram_test.cc index 8d460fdfc6..7e4ff6fc16 100644 --- a/tensorflow/contrib/lite/kernels/audio_spectrogram_test.cc +++ b/tensorflow/contrib/lite/kernels/audio_spectrogram_test.cc @@ -18,7 +18,7 @@ limitations under the License. 
#include <vector> #include <gtest/gtest.h> -#include "flatbuffers/flexbuffers.h" +#include "flatbuffers/flexbuffers.h" // TF:flatbuffers #include "tensorflow/contrib/lite/interpreter.h" #include "tensorflow/contrib/lite/kernels/register.h" #include "tensorflow/contrib/lite/kernels/test_util.h" diff --git a/tensorflow/contrib/lite/kernels/basic_rnn.cc b/tensorflow/contrib/lite/kernels/basic_rnn.cc index 0907547f9f..1aa27602e5 100644 --- a/tensorflow/contrib/lite/kernels/basic_rnn.cc +++ b/tensorflow/contrib/lite/kernels/basic_rnn.cc @@ -15,8 +15,8 @@ limitations under the License. #include <stddef.h> #include <stdint.h> -#include "tensorflow/contrib/lite/builtin_op_data.h" -#include "tensorflow/contrib/lite/context.h" +#include "tensorflow/contrib/lite/c/builtin_op_data.h" +#include "tensorflow/contrib/lite/c/c_api_internal.h" #include "tensorflow/contrib/lite/kernels/activation_functor.h" #include "tensorflow/contrib/lite/kernels/internal/kernel_utils.h" #include "tensorflow/contrib/lite/kernels/kernel_util.h" @@ -31,12 +31,14 @@ constexpr int kInputTensor = 0; constexpr int kWeightsTensor = 1; constexpr int kRecurrentWeightsTensor = 2; constexpr int kBiasTensor = 3; -constexpr int kHiddenStateTensor = 0; -constexpr int kOutputTensor = 1; +constexpr int kHiddenStateTensor = 4; + +// Output tensor. +constexpr int kOutputTensor = 0; void* Init(TfLiteContext* context, const char* buffer, size_t length) { auto* scratch_tensor_index = new int; - context->AddTensors(context, /*tensors_to_add=*/2, scratch_tensor_index); + context->AddTensors(context, /*tensors_to_add=*/3, scratch_tensor_index); return scratch_tensor_index; } @@ -46,14 +48,16 @@ void Free(TfLiteContext* context, void* buffer) { TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { // Check we have all the inputs and outputs we need. - TF_LITE_ENSURE_EQ(context, node->inputs->size, 4); - TF_LITE_ENSURE_EQ(context, node->outputs->size, 2); + TF_LITE_ENSURE_EQ(context, node->inputs->size, 5); + TF_LITE_ENSURE_EQ(context, node->outputs->size, 1); const TfLiteTensor* input = GetInput(context, node, kInputTensor); const TfLiteTensor* input_weights = GetInput(context, node, kWeightsTensor); const TfLiteTensor* recurrent_weights = GetInput(context, node, kRecurrentWeightsTensor); const TfLiteTensor* bias = GetInput(context, node, kBiasTensor); + const TfLiteTensor* hidden_state = + GetInput(context, node, kHiddenStateTensor); // Check all the parameters of tensor match within themselves and match the // input configuration. @@ -65,20 +69,12 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ASSERT_EQ(recurrent_weights->dims->data[1], bias->dims->data[0]); TF_LITE_ENSURE_EQ(context, input->type, kTfLiteFloat32); TF_LITE_ENSURE_EQ(context, input_weights->type, recurrent_weights->type); + TF_LITE_ENSURE_EQ(context, NumDimensions(hidden_state), 2); + TF_LITE_ENSURE_EQ(context, hidden_state->dims->data[0], batch_size); + TF_LITE_ENSURE_EQ(context, hidden_state->dims->data[1], num_units); - TfLiteTensor* hidden_state = GetOutput(context, node, kHiddenStateTensor); TfLiteTensor* output = GetOutput(context, node, kOutputTensor); - // Resize state. - TfLiteIntArray* hidden_state_size_array = TfLiteIntArrayCreate(2); - hidden_state_size_array->data[0] = batch_size; - hidden_state_size_array->data[1] = num_units; - TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, hidden_state, - hidden_state_size_array)); - - // Mark hidden state as a persistent tensor. 
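The extra scaling_factors temporary added below holds one value per batch row for the hybrid path (float activations with uint8 weights). A rough sketch of the per-row symmetric quantization step that produces such a factor (illustrative only, with a made-up helper name; the real logic lives in the kernel_utils/tensor_utils code this kernel calls):

#include <algorithm>
#include <cmath>
#include <cstdint>

// Quantize one float row symmetrically into int8 and record its scale so the
// integer matmul result can later be rescaled back to float.
void SymmetricQuantizeRow(const float* values, int size, int8_t* quantized,
                          float* scaling_factor) {
  float max_abs = 0.f;
  for (int i = 0; i < size; ++i) max_abs = std::max(max_abs, std::fabs(values[i]));
  if (max_abs == 0.f) {
    std::fill(quantized, quantized + size, 0);
    *scaling_factor = 1.f;
    return;
  }
  *scaling_factor = max_abs / 127.f;
  for (int i = 0; i < size; ++i) {
    quantized[i] = static_cast<int8_t>(std::round(values[i] / *scaling_factor));
  }
}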
- hidden_state->allocation_type = kTfLiteArenaRwPersistent; - // Resize output. TfLiteIntArray* output_size_array = TfLiteIntArrayCreate(2); output_size_array->data[0] = batch_size; @@ -91,7 +87,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { if (input->type == kTfLiteFloat32 && input_weights->type == kTfLiteUInt8) { int* scratch_tensor_index = reinterpret_cast<int*>(node->user_data); TfLiteIntArrayFree(node->temporaries); - node->temporaries = TfLiteIntArrayCreate(2); + node->temporaries = TfLiteIntArrayCreate(3); node->temporaries->data[0] = *scratch_tensor_index; TfLiteTensor* input_quantized = GetTemporary(context, node, /*index=*/0); input_quantized->type = kTfLiteUInt8; @@ -114,6 +110,16 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { context->ResizeTensor(context, hidden_state_quantized, hidden_state_quantized_size)); } + node->temporaries->data[2] = *scratch_tensor_index + 2; + TfLiteTensor* scaling_factors = GetTemporary(context, node, /*index=*/2); + scaling_factors->type = kTfLiteFloat32; + scaling_factors->allocation_type = kTfLiteArenaRw; + TfLiteIntArray* scaling_factors_size = TfLiteIntArrayCreate(1); + scaling_factors_size->data[0] = batch_size; + if (!TfLiteIntArrayEqual(scaling_factors->dims, scaling_factors_size)) { + TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, scaling_factors, + scaling_factors_size)); + } } return kTfLiteOk; @@ -145,14 +151,14 @@ TfLiteStatus EvalFloat(const TfLiteTensor* input, return kTfLiteOk; } -TfLiteStatus EvalQuantized(const TfLiteTensor* input, - const TfLiteTensor* input_weights, - const TfLiteTensor* recurrent_weights, - const TfLiteTensor* bias, - const TfLiteRNNParams* params, - TfLiteTensor* input_scratch, - TfLiteTensor* hidden_state_scratch, - TfLiteTensor* hidden_state, TfLiteTensor* output) { +TfLiteStatus EvalHybrid(const TfLiteTensor* input, + const TfLiteTensor* input_weights, + const TfLiteTensor* recurrent_weights, + const TfLiteTensor* bias, const TfLiteRNNParams* params, + TfLiteTensor* input_scratch, + TfLiteTensor* hidden_state_scratch, + TfLiteTensor* scaling_factors, + TfLiteTensor* hidden_state, TfLiteTensor* output) { const int batch_size = input->dims->data[0]; const int num_units = input_weights->dims->data[0]; const int input_size = input->dims->data[1]; @@ -176,12 +182,14 @@ TfLiteStatus EvalQuantized(const TfLiteTensor* input, reinterpret_cast<int8_t*>(input_scratch->data.uint8); int8_t* quantized_hidden_state_ptr = reinterpret_cast<int8_t*>(hidden_state_scratch->data.uint8); + float* scaling_factors_ptr = scaling_factors->data.f; kernel_utils::RnnBatchStep( input_ptr_batch, input_weights_ptr, input_weights_scale, recurrent_weights_ptr, recurrent_weights_scale, bias_ptr, input_size, num_units, batch_size, params->activation, quantized_input_ptr, - quantized_hidden_state_ptr, hidden_state_ptr_batch, output_ptr_batch); + quantized_hidden_state_ptr, scaling_factors_ptr, hidden_state_ptr_batch, + output_ptr_batch); return kTfLiteOk; } @@ -193,7 +201,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { const TfLiteTensor* recurrent_weights = GetInput(context, node, kRecurrentWeightsTensor); const TfLiteTensor* bias = GetInput(context, node, kBiasTensor); - TfLiteTensor* hidden_state = GetOutput(context, node, kHiddenStateTensor); + TfLiteTensor* hidden_state = + &context->tensors[node->inputs->data[kHiddenStateTensor]]; TfLiteTensor* output = GetOutput(context, node, kOutputTensor); // We already checked that weight types are consistent, so branch on 
one. @@ -205,12 +214,14 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { // TODO(mirkov): implement eval with quantized inputs as well. TfLiteTensor* input_quantized = GetTemporary(context, node, 0); TfLiteTensor* hidden_state_quantized = GetTemporary(context, node, 1); - return EvalQuantized(input, input_weights, recurrent_weights, bias, - params, input_quantized, hidden_state_quantized, - hidden_state, output); + TfLiteTensor* scaling_factors = GetTemporary(context, node, 2); + return EvalHybrid(input, input_weights, recurrent_weights, bias, params, + input_quantized, hidden_state_quantized, + scaling_factors, hidden_state, output); } default: - context->ReportError(context, "Type not currently supported."); + context->ReportError(context, "Type %d not currently supported.", + input_weights->type); return kTfLiteError; } return kTfLiteOk; diff --git a/tensorflow/contrib/lite/kernels/basic_rnn_test.cc b/tensorflow/contrib/lite/kernels/basic_rnn_test.cc index 96465fcaf0..d179735404 100644 --- a/tensorflow/contrib/lite/kernels/basic_rnn_test.cc +++ b/tensorflow/contrib/lite/kernels/basic_rnn_test.cc @@ -181,15 +181,16 @@ class RNNOpModel : public SingleOpModel { weights_ = AddInput(weights); recurrent_weights_ = AddInput(recurrent_weights); bias_ = AddInput(TensorType_FLOAT32); - hidden_state_ = AddOutput(TensorType_FLOAT32); + hidden_state_ = AddInput(TensorType_FLOAT32, true); output_ = AddOutput(TensorType_FLOAT32); SetBuiltinOp( BuiltinOperator_RNN, BuiltinOptions_RNNOptions, CreateRNNOptions(builder_, ActivationFunctionType_RELU).Union()); - BuildInterpreter({{batches_, input_size_}, - {units_, input_size_}, - {units_, units_}, - {units_}}); + BuildInterpreter({{batches_, input_size_}, // input tensor + {units_, input_size_}, // weights tensor + {units_, units_}, // recurrent weights tensor + {units_}, // bias tensor + {batches_, units_}}); // hidden state tensor } void SetBias(std::initializer_list<float> f) { PopulateTensor(bias_, f); } @@ -210,14 +211,6 @@ class RNNOpModel : public SingleOpModel { PopulateTensor(input_, offset, begin, end); } - void ResetHiddenState() { - const int zero_buffer_size = units_ * batches_; - std::unique_ptr<float[]> zero_buffer(new float[zero_buffer_size]); - memset(zero_buffer.get(), 0, zero_buffer_size * sizeof(float)); - PopulateTensor(hidden_state_, 0, zero_buffer.get(), - zero_buffer.get() + zero_buffer_size); - } - std::vector<float> GetOutput() { return ExtractVector<float>(output_); } int input_size() { return input_size_; } @@ -258,7 +251,6 @@ TEST(RnnOpTest, BlackBoxTest) { rnn.SetBias(rnn_bias); rnn.SetRecurrentWeights(rnn_recurrent_weights); - rnn.ResetHiddenState(); const int input_sequence_size = sizeof(rnn_input) / sizeof(float) / (rnn.input_size() * rnn.num_batches()); @@ -286,7 +278,6 @@ TEST(HybridRnnOpTest, BlackBoxTest) { rnn.SetBias(rnn_bias); rnn.SetRecurrentWeights(rnn_recurrent_weights); - rnn.ResetHiddenState(); const int input_sequence_size = sizeof(rnn_input) / sizeof(float) / (rnn.input_size() * rnn.num_batches()); diff --git a/tensorflow/contrib/lite/kernels/batch_to_space_nd.cc b/tensorflow/contrib/lite/kernels/batch_to_space_nd.cc index 262e1aeab1..fe2865dfb9 100644 --- a/tensorflow/contrib/lite/kernels/batch_to_space_nd.cc +++ b/tensorflow/contrib/lite/kernels/batch_to_space_nd.cc @@ -14,8 +14,8 @@ limitations under the License. 
==============================================================================*/ #include <string.h> #include <vector> -#include "tensorflow/contrib/lite/builtin_op_data.h" -#include "tensorflow/contrib/lite/context.h" +#include "tensorflow/contrib/lite/c/builtin_op_data.h" +#include "tensorflow/contrib/lite/c/c_api_internal.h" #include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h" #include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h" #include "tensorflow/contrib/lite/kernels/internal/tensor.h" @@ -125,14 +125,14 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } #define TF_LITE_BATCH_TO_SPACE_ND(type, scalar) \ - type::BatchToSpaceND(GetTensorData<scalar>(op_context.input), \ - GetTensorDims(op_context.input), \ + type::BatchToSpaceND(GetTensorShape(op_context.input), \ + GetTensorData<scalar>(op_context.input), \ + GetTensorShape(op_context.block_shape), \ GetTensorData<int32_t>(op_context.block_shape), \ - GetTensorDims(op_context.block_shape), \ + GetTensorShape(op_context.crops), \ GetTensorData<int32_t>(op_context.crops), \ - GetTensorDims(op_context.crops), \ - GetTensorData<scalar>(op_context.output), \ - GetTensorDims(op_context.output)) + GetTensorShape(op_context.output), \ + GetTensorData<scalar>(op_context.output)) switch (op_context.input->type) { // Already know in/out types are same. case kTfLiteFloat32: if (kernel_type == kReference) { @@ -163,8 +163,9 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } break; default: - context->ReportError(context, - "Type is currently not supported by BatchToSpace."); + context->ReportError( + context, "Type %d is currently not supported by BatchToSpace.", + op_context.input->type); return kTfLiteError; } #undef TF_LITE_BATCH_TO_SPACE_ND diff --git a/tensorflow/contrib/lite/kernels/bidirectional_sequence_lstm.cc b/tensorflow/contrib/lite/kernels/bidirectional_sequence_lstm.cc index 3425288f02..a326827b1e 100644 --- a/tensorflow/contrib/lite/kernels/bidirectional_sequence_lstm.cc +++ b/tensorflow/contrib/lite/kernels/bidirectional_sequence_lstm.cc @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include <unistd.h> #include <cassert> #include <cmath> #include <cstdio> @@ -21,12 +20,13 @@ limitations under the License. #include <iostream> #include <limits> -#include "tensorflow/contrib/lite/builtin_op_data.h" -#include "tensorflow/contrib/lite/context.h" +#include "tensorflow/contrib/lite/c/builtin_op_data.h" +#include "tensorflow/contrib/lite/c/c_api_internal.h" #include "tensorflow/contrib/lite/kernels/activation_functor.h" #include "tensorflow/contrib/lite/kernels/internal/kernel_utils.h" #include "tensorflow/contrib/lite/kernels/internal/tensor_utils.h" #include "tensorflow/contrib/lite/kernels/kernel_util.h" +#include "tensorflow/contrib/lite/kernels/lstm_eval.h" #include "tensorflow/contrib/lite/kernels/op_macros.h" namespace tflite { @@ -95,18 +95,54 @@ constexpr int kBwProjectionWeightsTensor = 33; // Optional // Projection bias tensor of size {n_output} constexpr int kBwProjectionBiasTensor = 34; // Optional -// Output tensors. -constexpr int kFwOutputStateTensor = 0; -constexpr int kFwCellStateTensor = 1; -constexpr int kFwOutputTensor = 2; +// Stateful input tensors that are variables and will be modified by the Op. 
+// Activation state tensors of size {n_batch, n_output} +constexpr int kFwInputActivationStateTensor = 35; +// Cell state tensors of size {n_batch, n_cell} +constexpr int kFwInputCellStateTensor = 36; +// Activation state tensors of size {n_batch, n_output} +constexpr int kBwInputActivationStateTensor = 37; +// Cell state tensors of size {n_batch, n_cell} +constexpr int kBwInputCellStateTensor = 38; + +// Auxiliary input and weights when stacking. +constexpr int kAuxInputTensor = 39; // Optional +// Forward weights. +constexpr int kFwAuxInputToInputWeightsTensor = 40; // Optional +constexpr int kFwAuxInputToForgetWeightsTensor = 41; // Optional +constexpr int kFwAuxInputToCellWeightsTensor = 42; // Optional +constexpr int kFwAuxInputToOutputWeightsTensor = 43; // Optional +// Backward weights. +constexpr int kBwAuxInputToInputWeightsTensor = 44; // Optional +constexpr int kBwAuxInputToForgetWeightsTensor = 45; // Optional +constexpr int kBwAuxInputToCellWeightsTensor = 46; // Optional +constexpr int kBwAuxInputToOutputWeightsTensor = 47; // Optional -constexpr int kBwOutputStateTensor = 3; -constexpr int kBwCellStateTensor = 4; -constexpr int kBwOutputTensor = 5; +// Output tensors. +constexpr int kFwOutputTensor = 0; +constexpr int kBwOutputTensor = 1; // Ignored if merge_outputs is set. + +// Temporary tensors. +enum TemporaryTensor { + // Scratch buffers for input, forget, etc. gates + kFwScratchBuffer = 0, + kBwScratchBuffer = 1, + // Quantized tensors needed for the hybrid kernel. + kInputQuantized = 2, + kAuxInputQuantized = 3, // Quantized tensor needed for auxiliary input. + kFwActivationStateQuantized = 4, + kBwActivationStateQuantized = 5, + kFwCellStateQuantized = 6, + kBwCellStateQuantized = 7, + kScalingFactors = 8, + kProductScalingFactors = 9, + kRecoveredCellWeights = 10, + kNumTemporaryTensors = 11 +}; void* Init(TfLiteContext* context, const char* buffer, size_t length) { auto* scratch_tensor_index = new int; - context->AddTensors(context, 2, scratch_tensor_index); + context->AddTensors(context, kNumTemporaryTensors, scratch_tensor_index); return scratch_tensor_index; } @@ -127,7 +163,8 @@ TfLiteStatus CheckLstmTensorDimensions( int input_gate_bias_tensor, int forget_gate_bias_tensor, int cell_gate_bias_tensor, int output_gate_bias_tensor, int projection_weights_tensor, int projection_bias_tensor) { - auto* params = reinterpret_cast<TfLiteLSTMParams*>(node->builtin_data); + const auto* params = reinterpret_cast<TfLiteBidirectionalSequenceLSTMParams*>( + node->builtin_data); // Making sure clipping parameters have valid values. 
// == 0 means no clipping @@ -276,45 +313,55 @@ TfLiteStatus CheckLstmTensorDimensions( TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context, TfLiteNode* node, int n_input, int n_output, int n_cell) { - CheckLstmTensorDimensions( - context, node, n_input, n_output, n_cell, kFwInputToInputWeightsTensor, - kFwInputToForgetWeightsTensor, kFwInputToCellWeightsTensor, - kFwInputToOutputWeightsTensor, kFwRecurrentToInputWeightsTensor, - kFwRecurrentToForgetWeightsTensor, kFwRecurrentToCellWeightsTensor, - kFwRecurrentToOutputWeightsTensor, kFwCellToInputWeightsTensor, - kFwCellToForgetWeightsTensor, kFwCellToOutputWeightsTensor, - kFwInputGateBiasTensor, kFwForgetGateBiasTensor, kFwCellGateBiasTensor, - kFwOutputGateBiasTensor, kFwProjectionWeightsTensor, - kFwProjectionBiasTensor); - - CheckLstmTensorDimensions( - context, node, n_input, n_output, n_cell, kBwInputToInputWeightsTensor, - kBwInputToForgetWeightsTensor, kBwInputToCellWeightsTensor, - kBwInputToOutputWeightsTensor, kBwRecurrentToInputWeightsTensor, - kBwRecurrentToForgetWeightsTensor, kBwRecurrentToCellWeightsTensor, - kBwRecurrentToOutputWeightsTensor, kBwCellToInputWeightsTensor, - kBwCellToForgetWeightsTensor, kBwCellToOutputWeightsTensor, - kBwInputGateBiasTensor, kBwForgetGateBiasTensor, kBwCellGateBiasTensor, - kBwOutputGateBiasTensor, kBwProjectionWeightsTensor, - kBwProjectionBiasTensor); + TF_LITE_ENSURE_OK( + context, + CheckLstmTensorDimensions( + context, node, n_input, n_output, n_cell, + kFwInputToInputWeightsTensor, kFwInputToForgetWeightsTensor, + kFwInputToCellWeightsTensor, kFwInputToOutputWeightsTensor, + kFwRecurrentToInputWeightsTensor, kFwRecurrentToForgetWeightsTensor, + kFwRecurrentToCellWeightsTensor, kFwRecurrentToOutputWeightsTensor, + kFwCellToInputWeightsTensor, kFwCellToForgetWeightsTensor, + kFwCellToOutputWeightsTensor, kFwInputGateBiasTensor, + kFwForgetGateBiasTensor, kFwCellGateBiasTensor, + kFwOutputGateBiasTensor, kFwProjectionWeightsTensor, + kFwProjectionBiasTensor)); + + TF_LITE_ENSURE_OK( + context, + CheckLstmTensorDimensions( + context, node, n_input, n_output, n_cell, + kBwInputToInputWeightsTensor, kBwInputToForgetWeightsTensor, + kBwInputToCellWeightsTensor, kBwInputToOutputWeightsTensor, + kBwRecurrentToInputWeightsTensor, kBwRecurrentToForgetWeightsTensor, + kBwRecurrentToCellWeightsTensor, kBwRecurrentToOutputWeightsTensor, + kBwCellToInputWeightsTensor, kBwCellToForgetWeightsTensor, + kBwCellToOutputWeightsTensor, kBwInputGateBiasTensor, + kBwForgetGateBiasTensor, kBwCellGateBiasTensor, + kBwOutputGateBiasTensor, kBwProjectionWeightsTensor, + kBwProjectionBiasTensor)); // Check if Forward and Backward tensors match along required dimensions. return kTfLiteOk; } -// Resize the output, state and scratch tensors based on the sizes of the input +// Resize the output and scratch tensors based on the sizes of the input // tensors. Also check that the size of the input tensors match each other. TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { int* scratch_tensor_index = reinterpret_cast<int*>(node->user_data); + const auto* params = reinterpret_cast<TfLiteBidirectionalSequenceLSTMParams*>( + node->builtin_data); // Check we have all the inputs and outputs we need. - TF_LITE_ENSURE_EQ(context, node->inputs->size, 35); - TF_LITE_ENSURE_EQ(context, node->outputs->size, 6); + TF_LITE_ENSURE_EQ(context, node->inputs->size, 48); + TF_LITE_ENSURE_EQ(context, node->outputs->size, + params->merge_outputs ? 
1 : 2); // Inferring batch size, number of outputs and sequence length and // number of cells from the input tensors. const TfLiteTensor* input = GetInput(context, node, kInputTensor); - TF_LITE_ENSURE(context, input->dims->size > 1); + TF_LITE_ENSURE_EQ(context, input->type, kTfLiteFloat32); + TF_LITE_ENSURE_EQ(context, input->dims->size, 3); const int max_time = input->dims->data[0]; const int n_batch = input->dims->data[1]; const int n_input = input->dims->data[2]; @@ -326,6 +373,13 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE_EQ(context, fw_input_to_output_weights->dims->data[1], n_input); + const TfLiteTensor* bw_input_to_output_weights = + GetInput(context, node, kBwInputToOutputWeightsTensor); + const int n_bw_cell = bw_input_to_output_weights->dims->data[0]; + TF_LITE_ENSURE_EQ(context, bw_input_to_output_weights->dims->size, 2); + TF_LITE_ENSURE_EQ(context, bw_input_to_output_weights->dims->data[1], + n_input); + const TfLiteTensor* fw_recurrent_to_output_weights = GetInput(context, node, kFwRecurrentToOutputWeightsTensor); TF_LITE_ENSURE_EQ(context, fw_recurrent_to_output_weights->dims->size, 2); @@ -333,49 +387,105 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { n_fw_cell); const int n_fw_output = fw_recurrent_to_output_weights->dims->data[1]; + const TfLiteTensor* bw_recurrent_to_output_weights = + GetInput(context, node, kBwRecurrentToOutputWeightsTensor); + TF_LITE_ENSURE_EQ(context, bw_recurrent_to_output_weights->dims->size, 2); + TF_LITE_ENSURE_EQ(context, bw_recurrent_to_output_weights->dims->data[0], + n_bw_cell); + const int n_bw_output = bw_recurrent_to_output_weights->dims->data[1]; + // Check that input tensor dimensions matches with each other. - CheckInputTensorDimensions(context, node, n_input, n_fw_output, n_fw_cell); + TF_LITE_ENSURE_OK( + context, CheckInputTensorDimensions(context, node, n_input, n_fw_output, + n_fw_cell)); + + // Get (optional) auxiliary inputs and weights. 
+ const TfLiteTensor* aux_input = + GetOptionalInputTensor(context, node, kAuxInputTensor); + const TfLiteTensor* fw_aux_input_to_input_weights = + GetOptionalInputTensor(context, node, kFwAuxInputToInputWeightsTensor); + const TfLiteTensor* fw_aux_input_to_forget_weights = + GetOptionalInputTensor(context, node, kFwAuxInputToForgetWeightsTensor); + const TfLiteTensor* fw_aux_input_to_cell_weights = + GetOptionalInputTensor(context, node, kFwAuxInputToCellWeightsTensor); + const TfLiteTensor* fw_aux_input_to_output_weights = + GetOptionalInputTensor(context, node, kFwAuxInputToOutputWeightsTensor); + const TfLiteTensor* bw_aux_input_to_input_weights = + GetOptionalInputTensor(context, node, kBwAuxInputToInputWeightsTensor); + const TfLiteTensor* bw_aux_input_to_forget_weights = + GetOptionalInputTensor(context, node, kBwAuxInputToForgetWeightsTensor); + const TfLiteTensor* bw_aux_input_to_cell_weights = + GetOptionalInputTensor(context, node, kBwAuxInputToCellWeightsTensor); + const TfLiteTensor* bw_aux_input_to_output_weights = + GetOptionalInputTensor(context, node, kBwAuxInputToOutputWeightsTensor); + + const bool aux_inputs_all_or_none = + ((aux_input != nullptr) && (fw_aux_input_to_cell_weights != nullptr) && + (fw_aux_input_to_forget_weights != nullptr) && + (fw_aux_input_to_output_weights != nullptr) && + (bw_aux_input_to_cell_weights != nullptr) && + (bw_aux_input_to_forget_weights != nullptr) && + (bw_aux_input_to_output_weights != nullptr)) || + ((fw_aux_input_to_cell_weights == nullptr) && + (fw_aux_input_to_forget_weights == nullptr) && + (fw_aux_input_to_output_weights == nullptr) && + (bw_aux_input_to_cell_weights == nullptr) && + (bw_aux_input_to_forget_weights == nullptr) && + (bw_aux_input_to_output_weights == nullptr)); + TF_LITE_ENSURE(context, aux_inputs_all_or_none); + const bool has_aux_input = (aux_input != nullptr); + + if (has_aux_input) { + // Check that aux_input has the same dimensions (except last) as the input. + TF_LITE_ASSERT_EQ(aux_input->dims->data[0], input->dims->data[0]); + TF_LITE_ASSERT_EQ(aux_input->dims->data[1], input->dims->data[1]); + } - // Get the pointer to output, state and scratch buffer tensors. + // Get the pointer to output, activation_state and cell_state buffer tensors. TfLiteTensor* fw_output = GetOutput(context, node, kFwOutputTensor); - TfLiteTensor* fw_output_state = - GetOutput(context, node, kFwOutputStateTensor); - TfLiteTensor* fw_cell_state = GetOutput(context, node, kFwCellStateTensor); - - // Resize the output, output_state and cell_state tensors. + TfLiteTensor* fw_activation_state = + GetVariableInput(context, node, kFwInputActivationStateTensor); + TfLiteTensor* fw_cell_state = + GetVariableInput(context, node, kFwInputCellStateTensor); + + // Check the shape of input state tensors. + // These tensor may be 1D or 2D. It's fine as long as the total size is + // correct. + TF_LITE_ENSURE_EQ(context, NumElements(fw_activation_state), + n_batch * n_fw_output); + TF_LITE_ENSURE_EQ(context, NumElements(fw_cell_state), n_batch * n_fw_cell); + + // Resize the output tensors. TfLiteIntArray* fw_output_size = TfLiteIntArrayCreate(3); fw_output_size->data[0] = max_time; fw_output_size->data[1] = n_batch; - fw_output_size->data[2] = n_fw_output; + fw_output_size->data[2] = + params->merge_outputs ? 
n_bw_output + n_fw_output : n_fw_output; TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, fw_output, fw_output_size)); - TfLiteIntArray* fw_output_state_size = TfLiteIntArrayCreate(2); - fw_output_state_size->data[0] = n_batch; - fw_output_state_size->data[1] = n_fw_output; - TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, fw_output_state, - fw_output_state_size)); + // The weights are of consistent type, so it suffices to check one. + const bool is_hybrid_op = (fw_input_to_output_weights->type == kTfLiteUInt8); - TfLiteIntArray* fw_cell_size = TfLiteIntArrayCreate(2); - fw_cell_size->data[0] = n_batch; - fw_cell_size->data[1] = n_fw_cell; - TF_LITE_ENSURE_OK( - context, context->ResizeTensor(context, fw_cell_state, fw_cell_size)); - - // Create a scratch buffer tensor. TfLiteIntArrayFree(node->temporaries); - node->temporaries = TfLiteIntArrayCreate(2); - node->temporaries->data[0] = *scratch_tensor_index; - TfLiteTensor* fw_scratch_buffer = GetTemporary(context, node, /*index=*/0); + if (is_hybrid_op) { + node->temporaries = TfLiteIntArrayCreate(kNumTemporaryTensors); + } else { + node->temporaries = TfLiteIntArrayCreate(2); // the two scratch buffers. + } + // Create a scratch buffer tensor. + node->temporaries->data[kFwScratchBuffer] = *scratch_tensor_index; + TfLiteTensor* fw_scratch_buffer = + GetTemporary(context, node, kFwScratchBuffer); fw_scratch_buffer->type = input->type; fw_scratch_buffer->allocation_type = kTfLiteArenaRw; - // Mark state tensors as persistent tensors. - fw_output_state->allocation_type = kTfLiteArenaRwPersistent; - fw_cell_state->allocation_type = kTfLiteArenaRwPersistent; - const TfLiteTensor* fw_input_to_input_weights = GetOptionalInputTensor(context, node, kFwInputToInputWeightsTensor); + if (has_aux_input) { + TF_LITE_ENSURE_EQ(context, fw_aux_input_to_input_weights->dims->data[0], + fw_input_to_input_weights->dims->data[0]); + } const bool fw_use_cifg = (fw_input_to_input_weights == nullptr); TfLiteIntArray* fw_scratch_buffer_size = TfLiteIntArrayCreate(2); fw_scratch_buffer_size->data[0] = n_batch; @@ -389,61 +499,50 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, fw_scratch_buffer, fw_scratch_buffer_size)); // Same for the backward cell. - const TfLiteTensor* bw_input_to_output_weights = - GetInput(context, node, kBwInputToOutputWeightsTensor); - const int n_bw_cell = bw_input_to_output_weights->dims->data[0]; - TF_LITE_ENSURE_EQ(context, bw_input_to_output_weights->dims->size, 2); - TF_LITE_ENSURE_EQ(context, bw_input_to_output_weights->dims->data[1], - n_input); - - const TfLiteTensor* bw_recurrent_to_output_weights = - GetInput(context, node, kBwRecurrentToOutputWeightsTensor); - TF_LITE_ENSURE_EQ(context, bw_recurrent_to_output_weights->dims->size, 2); - TF_LITE_ENSURE_EQ(context, bw_recurrent_to_output_weights->dims->data[0], - n_bw_cell); - const int n_bw_output = bw_recurrent_to_output_weights->dims->data[1]; // Check that input tensor dimensions matches with each other. - CheckInputTensorDimensions(context, node, n_input, n_bw_output, n_bw_cell); - - // Get the pointer to output, output_state and cell_state buffer tensors. - TfLiteTensor* bw_output = GetOutput(context, node, kBwOutputTensor); - TfLiteTensor* bw_output_state = - GetOutput(context, node, kBwOutputStateTensor); - TfLiteTensor* bw_cell_state = GetOutput(context, node, kBwCellStateTensor); - - // Resize the output, output_state and cell_state tensors. 
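When merge_outputs is set, the resize above gives the forward output tensor a last dimension of n_fw_output + n_bw_output and the backward output tensor is ignored. A small illustrative sketch of one layout consistent with that shape; the ordering (forward features first, then backward) is an assumption made here for illustration:

#include <vector>

// Pack per-timestep forward and backward features into one merged buffer of
// shape {max_time, n_batch, n_fw_output + n_bw_output}.
std::vector<float> MergeOutputs(const std::vector<float>& fw,  // {T, B, n_fw_output}
                                const std::vector<float>& bw,  // {T, B, n_bw_output}
                                int max_time, int n_batch, int n_fw_output,
                                int n_bw_output) {
  std::vector<float> merged(max_time * n_batch * (n_fw_output + n_bw_output));
  for (int t = 0; t < max_time; ++t) {
    for (int b = 0; b < n_batch; ++b) {
      float* out = &merged[(t * n_batch + b) * (n_fw_output + n_bw_output)];
      const float* fw_in = &fw[(t * n_batch + b) * n_fw_output];
      const float* bw_in = &bw[(t * n_batch + b) * n_bw_output];
      for (int i = 0; i < n_fw_output; ++i) out[i] = fw_in[i];
      for (int i = 0; i < n_bw_output; ++i) out[n_fw_output + i] = bw_in[i];
    }
  }
  return merged;
}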
- TfLiteIntArray* bw_output_size = TfLiteIntArrayCreate(3); - bw_output_size->data[0] = max_time; - bw_output_size->data[1] = n_batch; - bw_output_size->data[2] = n_bw_output; - TF_LITE_ENSURE_OK(context, - context->ResizeTensor(context, bw_output, bw_output_size)); - - TfLiteIntArray* bw_output_state_size = TfLiteIntArrayCreate(2); - bw_output_state_size->data[0] = n_batch; - bw_output_state_size->data[1] = n_bw_output; - TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, bw_output_state, - bw_output_state_size)); - - TfLiteIntArray* bw_cell_size = TfLiteIntArrayCreate(2); - bw_cell_size->data[0] = n_batch; - bw_cell_size->data[1] = n_bw_cell; TF_LITE_ENSURE_OK( - context, context->ResizeTensor(context, bw_cell_state, bw_cell_size)); + context, CheckInputTensorDimensions(context, node, n_input, n_bw_output, + n_bw_cell)); + + // Get the pointer to activation_state and cell_state buffer tensors. + TfLiteTensor* bw_activation_state = + GetVariableInput(context, node, kBwInputActivationStateTensor); + TfLiteTensor* bw_cell_state = + GetVariableInput(context, node, kBwInputCellStateTensor); + + // Resize the output tensors. + if (!params->merge_outputs) { + TfLiteTensor* bw_output = GetOutput(context, node, kBwOutputTensor); + TfLiteIntArray* bw_output_size = TfLiteIntArrayCreate(3); + bw_output_size->data[0] = max_time; + bw_output_size->data[1] = n_batch; + bw_output_size->data[2] = n_bw_output; + TF_LITE_ENSURE_OK( + context, context->ResizeTensor(context, bw_output, bw_output_size)); + } + + // Check the shape of input state tensors. + // These tensor may be 1D or 2D. It's fine as long as the total size is + // correct. + TF_LITE_ENSURE_EQ(context, NumElements(bw_activation_state), + n_batch * n_bw_output); + TF_LITE_ENSURE_EQ(context, NumElements(bw_cell_state), n_batch * n_bw_cell); // Create a scratch buffer tensor. - node->temporaries->data[1] = *(scratch_tensor_index) + 1; - TfLiteTensor* bw_scratch_buffer = GetTemporary(context, node, /*index=*/1); + node->temporaries->data[kBwScratchBuffer] = + *(scratch_tensor_index) + kBwScratchBuffer; + TfLiteTensor* bw_scratch_buffer = + GetTemporary(context, node, kBwScratchBuffer); bw_scratch_buffer->type = input->type; bw_scratch_buffer->allocation_type = kTfLiteArenaRw; - // Mark state tensors as persistent tensors. - bw_output_state->allocation_type = kTfLiteArenaRwPersistent; - bw_cell_state->allocation_type = kTfLiteArenaRwPersistent; - const TfLiteTensor* bw_input_to_input_weights = GetOptionalInputTensor(context, node, kBwInputToInputWeightsTensor); + if (has_aux_input) { + TF_LITE_ENSURE_EQ(context, bw_aux_input_to_input_weights->dims->data[0], + bw_input_to_input_weights->dims->data[0]); + } const bool bw_use_cifg = (bw_input_to_input_weights == nullptr); TfLiteIntArray* bw_scratch_buffer_size = TfLiteIntArrayCreate(2); bw_scratch_buffer_size->data[0] = n_batch; @@ -456,18 +555,153 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { } TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, bw_scratch_buffer, bw_scratch_buffer_size)); + if (is_hybrid_op) { + // Allocate temporary tensors to store quantized values of input, aux_input + // (if present), activation_state and cell_state tensors. 
+ node->temporaries->data[kInputQuantized] = + *scratch_tensor_index + kInputQuantized; + TfLiteTensor* input_quantized = + GetTemporary(context, node, kInputQuantized); + input_quantized->type = kTfLiteUInt8; + input_quantized->allocation_type = kTfLiteArenaRw; + if (!TfLiteIntArrayEqual(input_quantized->dims, input->dims)) { + TfLiteIntArray* input_quantized_size = TfLiteIntArrayCopy(input->dims); + TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, input_quantized, + input_quantized_size)); + } + + if (has_aux_input) { + node->temporaries->data[kAuxInputQuantized] = + *scratch_tensor_index + kAuxInputQuantized; + TfLiteTensor* aux_input_quantized = + GetTemporary(context, node, kAuxInputQuantized); + aux_input_quantized->type = kTfLiteUInt8; + aux_input_quantized->allocation_type = kTfLiteArenaRw; + if (!TfLiteIntArrayEqual(aux_input_quantized->dims, aux_input->dims)) { + TfLiteIntArray* aux_input_quantized_size = + TfLiteIntArrayCopy(aux_input->dims); + TF_LITE_ENSURE_OK(context, + context->ResizeTensor(context, aux_input_quantized, + aux_input_quantized_size)); + } + } + + node->temporaries->data[kFwActivationStateQuantized] = + *scratch_tensor_index + kFwActivationStateQuantized; + TfLiteTensor* fw_activation_state_quantized = + GetTemporary(context, node, kFwActivationStateQuantized); + fw_activation_state_quantized->type = kTfLiteUInt8; + fw_activation_state_quantized->allocation_type = kTfLiteArenaRw; + if (!TfLiteIntArrayEqual(fw_activation_state_quantized->dims, + fw_activation_state->dims)) { + TfLiteIntArray* fw_activation_state_quantized_size = + TfLiteIntArrayCopy(fw_activation_state->dims); + TF_LITE_ENSURE_OK( + context, context->ResizeTensor(context, fw_activation_state_quantized, + fw_activation_state_quantized_size)); + } + node->temporaries->data[kBwActivationStateQuantized] = + *scratch_tensor_index + kBwActivationStateQuantized; + TfLiteTensor* bw_activation_state_quantized = + GetTemporary(context, node, kBwActivationStateQuantized); + bw_activation_state_quantized->type = kTfLiteUInt8; + bw_activation_state_quantized->allocation_type = kTfLiteArenaRw; + if (!TfLiteIntArrayEqual(bw_activation_state_quantized->dims, + bw_activation_state->dims)) { + TfLiteIntArray* bw_activation_state_quantized_size = + TfLiteIntArrayCopy(bw_activation_state->dims); + TF_LITE_ENSURE_OK( + context, context->ResizeTensor(context, bw_activation_state_quantized, + bw_activation_state_quantized_size)); + } + node->temporaries->data[kFwCellStateQuantized] = + *scratch_tensor_index + kFwCellStateQuantized; + TfLiteTensor* fw_cell_state_quantized = + GetTemporary(context, node, kFwCellStateQuantized); + fw_cell_state_quantized->type = kTfLiteUInt8; + fw_cell_state_quantized->allocation_type = kTfLiteArenaRw; + if (!TfLiteIntArrayEqual(fw_cell_state_quantized->dims, + fw_cell_state->dims)) { + TfLiteIntArray* fw_cell_state_quantized_size = + TfLiteIntArrayCopy(fw_cell_state->dims); + TF_LITE_ENSURE_OK(context, + context->ResizeTensor(context, fw_cell_state_quantized, + fw_cell_state_quantized_size)); + } + node->temporaries->data[kBwCellStateQuantized] = + *scratch_tensor_index + kBwCellStateQuantized; + TfLiteTensor* bw_cell_state_quantized = + GetTemporary(context, node, kBwCellStateQuantized); + bw_cell_state_quantized->type = kTfLiteUInt8; + bw_cell_state_quantized->allocation_type = kTfLiteArenaRw; + if (!TfLiteIntArrayEqual(bw_cell_state_quantized->dims, + bw_cell_state->dims)) { + TfLiteIntArray* bw_cell_state_quantized_size = + TfLiteIntArrayCopy(bw_cell_state->dims); + 
TF_LITE_ENSURE_OK(context, + context->ResizeTensor(context, bw_cell_state_quantized, + bw_cell_state_quantized_size)); + } + + // Allocate temporary tensors to store scaling factors and product scaling + // factors. The latter is a convenience storage which allows to quantize + // a vector once (which produces the scaling factors) and multiply it with + // different matrices (which requires multiplying the scaling factors with + // the scaling factor of the matrix). + node->temporaries->data[kScalingFactors] = + *scratch_tensor_index + kScalingFactors; + TfLiteTensor* scaling_factors = + GetTemporary(context, node, kScalingFactors); + scaling_factors->type = kTfLiteFloat32; + scaling_factors->allocation_type = kTfLiteArenaRw; + TfLiteIntArray* scaling_factors_size = TfLiteIntArrayCreate(1); + scaling_factors_size->data[0] = n_batch; + if (!TfLiteIntArrayEqual(scaling_factors->dims, scaling_factors_size)) { + TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, scaling_factors, + scaling_factors_size)); + } + node->temporaries->data[kProductScalingFactors] = + *scratch_tensor_index + kProductScalingFactors; + TfLiteTensor* prod_scaling_factors = + GetTemporary(context, node, kProductScalingFactors); + prod_scaling_factors->type = kTfLiteFloat32; + prod_scaling_factors->allocation_type = kTfLiteArenaRw; + TfLiteIntArray* prod_scaling_factors_size = TfLiteIntArrayCreate(1); + prod_scaling_factors_size->data[0] = n_batch; + if (!TfLiteIntArrayEqual(prod_scaling_factors->dims, + prod_scaling_factors_size)) { + TF_LITE_ENSURE_OK(context, + context->ResizeTensor(context, prod_scaling_factors, + prod_scaling_factors_size)); + } + + // Allocate a temporary tensor to store the recovered cell weights. Since + // this is used for diagonal matrices, only need to store n_cell values. + node->temporaries->data[kRecoveredCellWeights] = + *scratch_tensor_index + kRecoveredCellWeights; + TfLiteTensor* recovered_cell_weights = + GetTemporary(context, node, kRecoveredCellWeights); + recovered_cell_weights->type = kTfLiteFloat32; + recovered_cell_weights->allocation_type = kTfLiteArenaRw; + TfLiteIntArray* recovered_cell_weights_size = TfLiteIntArrayCreate(1); + recovered_cell_weights_size->data[0] = n_fw_cell; + if (!TfLiteIntArrayEqual(recovered_cell_weights->dims, + recovered_cell_weights_size)) { + TF_LITE_ENSURE_OK(context, + context->ResizeTensor(context, recovered_cell_weights, + recovered_cell_weights_size)); + } + } return kTfLiteOk; } // The LSTM Op engine. TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { - auto* params = reinterpret_cast<TfLiteLSTMParams*>(node->builtin_data); + const auto* params = reinterpret_cast<TfLiteBidirectionalSequenceLSTMParams*>( + node->builtin_data); // Input tensor. const TfLiteTensor* input = GetInput(context, node, kInputTensor); - const int max_time = input->dims->data[0]; - const int n_batch = input->dims->data[1]; - const int n_input = input->dims->data[2]; // Tensors for the forward cell. 
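// The scaling-factor and product-scaling-factor temporaries prepared above exist so that
// each input row is quantized only once per invocation. The following is a minimal,
// self-contained sketch of that idea; the helper names (SymmetricQuantize, HybridMatVec)
// are illustrative assumptions, not the actual TF Lite tensor_utils API.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

// Quantizes a float vector symmetrically to int8 and returns its scale
// (the per-batch "scaling factor").
float SymmetricQuantize(const std::vector<float>& v, std::vector<int8_t>* q) {
  float max_abs = 0.f;
  for (float x : v) max_abs = std::max(max_abs, std::fabs(x));
  const float scale = max_abs / 127.f;
  q->resize(v.size());
  for (size_t i = 0; i < v.size(); ++i) {
    (*q)[i] = static_cast<int8_t>(std::round(scale == 0.f ? 0.f : v[i] / scale));
  }
  return scale;
}

// Multiplies an [rows x cols] int8 weight matrix with the quantized vector and rescales
// the int32 accumulator with the product scaling factor (input_scale * weight_scale),
// so the same quantized vector can be reused against several weight matrices.
std::vector<float> HybridMatVec(const std::vector<int8_t>& weights, int rows, int cols,
                                const std::vector<int8_t>& q_input, float input_scale,
                                float weight_scale) {
  std::vector<float> result(rows, 0.f);
  for (int r = 0; r < rows; ++r) {
    int32_t acc = 0;
    for (int c = 0; c < cols; ++c) acc += weights[r * cols + c] * q_input[c];
    result[r] = acc * input_scale * weight_scale;
  }
  return result;
}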
const TfLiteTensor* fw_input_to_input_weights = @@ -509,9 +743,10 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { const TfLiteTensor* fw_projection_bias = GetOptionalInputTensor(context, node, kFwProjectionBiasTensor); - TfLiteTensor* fw_output_state = - GetOutput(context, node, kFwOutputStateTensor); - TfLiteTensor* fw_cell_state = GetOutput(context, node, kFwCellStateTensor); + TfLiteTensor* fw_activation_state = + GetVariableInput(context, node, kFwInputActivationStateTensor); + TfLiteTensor* fw_cell_state = + GetVariableInput(context, node, kFwInputCellStateTensor); TfLiteTensor* fw_output = GetOutput(context, node, kFwOutputTensor); // Tensors for the backward cell. @@ -554,154 +789,144 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { const TfLiteTensor* bw_projection_bias = GetOptionalInputTensor(context, node, kBwProjectionBiasTensor); - TfLiteTensor* bw_output_state = - GetOutput(context, node, kBwOutputStateTensor); - TfLiteTensor* bw_cell_state = GetOutput(context, node, kBwCellStateTensor); - TfLiteTensor* bw_output = GetOutput(context, node, kBwOutputTensor); - - // n_cell and n_output will be the same size when there is no projection. - const int n_fw_cell = fw_input_to_output_weights->dims->data[0]; - const int n_fw_output = fw_recurrent_to_output_weights->dims->data[1]; - - // Since we have already checked that weights are all there or none, we can - // check the existense of only one to the get the condition. - const bool fw_use_cifg = (fw_input_to_input_weights == nullptr); - const bool fw_use_peephole = (fw_cell_to_output_weights != nullptr); + // State tensors. + TfLiteTensor* bw_activation_state = + GetVariableInput(context, node, kBwInputActivationStateTensor); + TfLiteTensor* bw_cell_state = + GetVariableInput(context, node, kBwInputCellStateTensor); + TfLiteTensor* bw_output = params->merge_outputs + ? nullptr + : GetOutput(context, node, kBwOutputTensor); - // Index the scratch buffers pointers to the global scratch buffer. + // Temporary tensors. TfLiteTensor* fw_scratch_buffer = - &context->tensors[node->temporaries->data[0]]; - float* fw_input_gate_scratch = nullptr; - float* fw_cell_scratch = nullptr; - float* fw_forget_gate_scratch = nullptr; - float* fw_output_gate_scratch = nullptr; - if (fw_use_cifg) { - fw_cell_scratch = fw_scratch_buffer->data.f; - fw_forget_gate_scratch = fw_scratch_buffer->data.f + n_fw_cell * n_batch; - fw_output_gate_scratch = - fw_scratch_buffer->data.f + 2 * n_fw_cell * n_batch; - } else { - fw_input_gate_scratch = fw_scratch_buffer->data.f; - fw_cell_scratch = fw_scratch_buffer->data.f + n_fw_cell * n_batch; - fw_forget_gate_scratch = - fw_scratch_buffer->data.f + 2 * n_fw_cell * n_batch; - fw_output_gate_scratch = - fw_scratch_buffer->data.f + 3 * n_fw_cell * n_batch; - } - - // Check optional tensors, the respective pointers can be null. - const float* fw_input_to_input_weights_ptr = - (fw_use_cifg) ? nullptr : fw_input_to_input_weights->data.f; - const float* fw_recurrent_to_input_weights_ptr = - (fw_use_cifg) ? nullptr : fw_recurrent_to_input_weights->data.f; - const float* fw_input_gate_bias_ptr = - (fw_use_cifg) ? nullptr : fw_input_gate_bias->data.f; - const float* fw_cell_to_input_weights_ptr = - (fw_use_peephole && !fw_use_cifg) ? fw_cell_to_input_weights->data.f - : nullptr; - const float* fw_cell_to_forget_weights_ptr = - (fw_use_peephole) ? fw_cell_to_forget_weights->data.f : nullptr; - const float* fw_cell_to_output_weights_ptr = - (fw_use_peephole) ? 
fw_cell_to_output_weights->data.f : nullptr; - const float* fw_projection_weights_ptr = (fw_projection_weights == nullptr) - ? nullptr - : fw_projection_weights->data.f; - const float* fw_projection_bias_ptr = - (fw_projection_bias == nullptr) ? nullptr : fw_projection_bias->data.f; - - // Loop through the sequence. - for (int t = 0; t < max_time; t++) { - const float* input_ptr_batch = input->data.f + t * n_batch * n_input; - float* output_ptr_time = fw_output->data.f + t * n_batch * n_fw_output; - - kernel_utils::LstmStep( - input_ptr_batch, fw_input_to_input_weights_ptr, - fw_input_to_forget_weights->data.f, fw_input_to_cell_weights->data.f, - fw_input_to_output_weights->data.f, fw_recurrent_to_input_weights_ptr, - fw_recurrent_to_forget_weights->data.f, - fw_recurrent_to_cell_weights->data.f, - fw_recurrent_to_output_weights->data.f, fw_cell_to_input_weights_ptr, - fw_cell_to_forget_weights_ptr, fw_cell_to_output_weights_ptr, - fw_input_gate_bias_ptr, fw_forget_gate_bias->data.f, - fw_cell_bias->data.f, fw_output_gate_bias->data.f, - fw_projection_weights_ptr, fw_projection_bias_ptr, params, n_batch, - n_fw_cell, n_input, n_fw_output, fw_output_state->data.f, - fw_cell_state->data.f, fw_input_gate_scratch, fw_forget_gate_scratch, - fw_cell_scratch, fw_output_gate_scratch, output_ptr_time); - } - - // n_cell and n_output will be the same size when there is no projection. - const int n_bw_cell = bw_input_to_output_weights->dims->data[0]; - const int n_bw_output = bw_recurrent_to_output_weights->dims->data[1]; - - // Since we have already checked that weights are all there or none, we can - // check the existense of only one to the get the condition. - const bool bw_use_cifg = (bw_input_to_input_weights == nullptr); - const bool bw_use_peephole = (bw_cell_to_output_weights != nullptr); - - // Index the scratch buffers pointers to the global scratch buffer. + GetTemporary(context, node, kFwScratchBuffer); TfLiteTensor* bw_scratch_buffer = - &context->tensors[node->temporaries->data[1]]; - float* bw_input_gate_scratch = nullptr; - float* bw_cell_scratch = nullptr; - float* bw_forget_gate_scratch = nullptr; - float* bw_output_gate_scratch = nullptr; - if (bw_use_cifg) { - bw_cell_scratch = bw_scratch_buffer->data.f; - bw_forget_gate_scratch = bw_scratch_buffer->data.f + n_bw_cell * n_batch; - bw_output_gate_scratch = - bw_scratch_buffer->data.f + 2 * n_bw_cell * n_batch; - } else { - bw_input_gate_scratch = bw_scratch_buffer->data.f; - bw_cell_scratch = bw_scratch_buffer->data.f + n_bw_cell * n_batch; - bw_forget_gate_scratch = - bw_scratch_buffer->data.f + 2 * n_bw_cell * n_batch; - bw_output_gate_scratch = - bw_scratch_buffer->data.f + 3 * n_bw_cell * n_batch; - } - - // Check optional tensors, the respective pointers can be null. - const float* bw_input_to_input_weights_ptr = - (bw_use_cifg) ? nullptr : bw_input_to_input_weights->data.f; - const float* bw_recurrent_to_input_weights_ptr = - (bw_use_cifg) ? nullptr : bw_recurrent_to_input_weights->data.f; - const float* bw_input_gate_bias_ptr = - (bw_use_cifg) ? nullptr : bw_input_gate_bias->data.f; - const float* bw_cell_to_input_weights_ptr = - (bw_use_peephole && !bw_use_cifg) ? bw_cell_to_input_weights->data.f - : nullptr; - const float* bw_cell_to_forget_weights_ptr = - (bw_use_peephole) ? bw_cell_to_forget_weights->data.f : nullptr; - const float* bw_cell_to_output_weights_ptr = - (bw_use_peephole) ? bw_cell_to_output_weights->data.f : nullptr; - const float* bw_projection_weights_ptr = (bw_projection_weights == nullptr) - ? 
nullptr - : bw_projection_weights->data.f; - const float* bw_projection_bias_ptr = - (bw_projection_bias == nullptr) ? nullptr : bw_projection_bias->data.f; - - // Loop through the sequence backwards. - for (int t = max_time - 1; t >= 0; t--) { - const float* input_ptr_batch = input->data.f + t * n_batch * n_input; - float* output_ptr_time = bw_output->data.f + t * n_batch * n_bw_output; - - kernel_utils::LstmStep( - input_ptr_batch, bw_input_to_input_weights_ptr, - bw_input_to_forget_weights->data.f, bw_input_to_cell_weights->data.f, - bw_input_to_output_weights->data.f, bw_recurrent_to_input_weights_ptr, - bw_recurrent_to_forget_weights->data.f, - bw_recurrent_to_cell_weights->data.f, - bw_recurrent_to_output_weights->data.f, bw_cell_to_input_weights_ptr, - bw_cell_to_forget_weights_ptr, bw_cell_to_output_weights_ptr, - bw_input_gate_bias_ptr, bw_forget_gate_bias->data.f, - bw_cell_bias->data.f, bw_output_gate_bias->data.f, - bw_projection_weights_ptr, bw_projection_bias_ptr, params, n_batch, - n_bw_cell, n_input, n_bw_output, bw_output_state->data.f, - bw_cell_state->data.f, bw_input_gate_scratch, bw_forget_gate_scratch, - bw_cell_scratch, bw_output_gate_scratch, output_ptr_time); + GetTemporary(context, node, kBwScratchBuffer); + + // (Optional) auxiliary inputs. + const TfLiteTensor* aux_input = + GetOptionalInputTensor(context, node, kAuxInputTensor); + const TfLiteTensor* fw_aux_input_to_input_weights = + GetOptionalInputTensor(context, node, kFwAuxInputToInputWeightsTensor); + const TfLiteTensor* fw_aux_input_to_forget_weights = + GetOptionalInputTensor(context, node, kFwAuxInputToForgetWeightsTensor); + const TfLiteTensor* fw_aux_input_to_cell_weights = + GetOptionalInputTensor(context, node, kFwAuxInputToCellWeightsTensor); + const TfLiteTensor* fw_aux_input_to_output_weights = + GetOptionalInputTensor(context, node, kFwAuxInputToOutputWeightsTensor); + const TfLiteTensor* bw_aux_input_to_input_weights = + GetOptionalInputTensor(context, node, kBwAuxInputToInputWeightsTensor); + const TfLiteTensor* bw_aux_input_to_forget_weights = + GetOptionalInputTensor(context, node, kBwAuxInputToForgetWeightsTensor); + const TfLiteTensor* bw_aux_input_to_cell_weights = + GetOptionalInputTensor(context, node, kBwAuxInputToCellWeightsTensor); + const TfLiteTensor* bw_aux_input_to_output_weights = + GetOptionalInputTensor(context, node, kBwAuxInputToOutputWeightsTensor); + + // Populate a TfLiteLSTMParams struct for the evaluation functions. + TfLiteLSTMParams lstm_params = {params->activation, params->cell_clip, + params->proj_clip, kTfLiteLSTMFullKernel}; + + const int bw_output_offset = + params->merge_outputs ? fw_recurrent_to_output_weights->dims->data[1] : 0; + const auto actual_bw_output = params->merge_outputs ? 
fw_output : bw_output; + + switch (fw_input_to_output_weights->type) { + case kTfLiteFloat32: { + TfLiteStatus fw_pass_status = lstm_eval::EvalFloat( + input, fw_input_to_input_weights, fw_input_to_forget_weights, + fw_input_to_cell_weights, fw_input_to_output_weights, + fw_recurrent_to_input_weights, fw_recurrent_to_forget_weights, + fw_recurrent_to_cell_weights, fw_recurrent_to_output_weights, + fw_cell_to_input_weights, fw_cell_to_forget_weights, + fw_cell_to_output_weights, aux_input, fw_aux_input_to_input_weights, + fw_aux_input_to_forget_weights, fw_aux_input_to_cell_weights, + fw_aux_input_to_output_weights, fw_input_gate_bias, + fw_forget_gate_bias, fw_cell_bias, fw_output_gate_bias, + fw_projection_weights, fw_projection_bias, &lstm_params, + /*forward_sequence=*/true, /*output_offset=*/0, fw_scratch_buffer, + fw_activation_state, fw_cell_state, fw_output); + TF_LITE_ENSURE_OK(context, fw_pass_status); + + TfLiteStatus bw_pass_status = lstm_eval::EvalFloat( + input, bw_input_to_input_weights, bw_input_to_forget_weights, + bw_input_to_cell_weights, bw_input_to_output_weights, + bw_recurrent_to_input_weights, bw_recurrent_to_forget_weights, + bw_recurrent_to_cell_weights, bw_recurrent_to_output_weights, + bw_cell_to_input_weights, bw_cell_to_forget_weights, + bw_cell_to_output_weights, aux_input, bw_aux_input_to_input_weights, + bw_aux_input_to_forget_weights, bw_aux_input_to_cell_weights, + bw_aux_input_to_output_weights, bw_input_gate_bias, + bw_forget_gate_bias, bw_cell_bias, bw_output_gate_bias, + bw_projection_weights, bw_projection_bias, &lstm_params, + /*forward_sequence=*/false, bw_output_offset, bw_scratch_buffer, + bw_activation_state, bw_cell_state, actual_bw_output); + TF_LITE_ENSURE_OK(context, bw_pass_status); + return kTfLiteOk; + } + case kTfLiteUInt8: { + TfLiteTensor* input_quantized = + GetTemporary(context, node, kInputQuantized); + TfLiteTensor* aux_input_quantized = + GetTemporary(context, node, kAuxInputQuantized); + TfLiteTensor* fw_activation_state_quantized = + GetTemporary(context, node, kFwActivationStateQuantized); + TfLiteTensor* bw_activation_state_quantized = + GetTemporary(context, node, kBwActivationStateQuantized); + TfLiteTensor* fw_cell_state_quantized = + GetTemporary(context, node, kFwCellStateQuantized); + TfLiteTensor* bw_cell_state_quantized = + GetTemporary(context, node, kBwCellStateQuantized); + TfLiteTensor* scaling_factors = + GetTemporary(context, node, kScalingFactors); + TfLiteTensor* prod_scaling_factors = + GetTemporary(context, node, kProductScalingFactors); + TfLiteTensor* recovered_cell_weights = + GetTemporary(context, node, kRecoveredCellWeights); + + TfLiteStatus fw_pass_status = lstm_eval::EvalHybrid( + input, fw_input_to_input_weights, fw_input_to_forget_weights, + fw_input_to_cell_weights, fw_input_to_output_weights, + fw_recurrent_to_input_weights, fw_recurrent_to_forget_weights, + fw_recurrent_to_cell_weights, fw_recurrent_to_output_weights, + fw_cell_to_input_weights, fw_cell_to_forget_weights, + fw_cell_to_output_weights, aux_input, fw_aux_input_to_input_weights, + fw_aux_input_to_forget_weights, fw_aux_input_to_cell_weights, + fw_aux_input_to_output_weights, fw_input_gate_bias, + fw_forget_gate_bias, fw_cell_bias, fw_output_gate_bias, + fw_projection_weights, fw_projection_bias, &lstm_params, + /*forward_sequence=*/true, /*output_offset=*/0, fw_scratch_buffer, + scaling_factors, prod_scaling_factors, recovered_cell_weights, + input_quantized, aux_input_quantized, fw_activation_state_quantized, + 
fw_cell_state_quantized, fw_activation_state, fw_cell_state, + fw_output); + TF_LITE_ENSURE_OK(context, fw_pass_status); + + TfLiteStatus bw_pass_status = lstm_eval::EvalHybrid( + input, bw_input_to_input_weights, bw_input_to_forget_weights, + bw_input_to_cell_weights, bw_input_to_output_weights, + bw_recurrent_to_input_weights, bw_recurrent_to_forget_weights, + bw_recurrent_to_cell_weights, bw_recurrent_to_output_weights, + bw_cell_to_input_weights, bw_cell_to_forget_weights, + bw_cell_to_output_weights, aux_input, fw_aux_input_to_input_weights, + fw_aux_input_to_forget_weights, fw_aux_input_to_cell_weights, + fw_aux_input_to_output_weights, bw_input_gate_bias, + bw_forget_gate_bias, bw_cell_bias, bw_output_gate_bias, + bw_projection_weights, bw_projection_bias, &lstm_params, + /*forward_sequence=*/false, bw_output_offset, bw_scratch_buffer, + scaling_factors, prod_scaling_factors, recovered_cell_weights, + input_quantized, aux_input_quantized, bw_activation_state_quantized, + bw_cell_state_quantized, bw_activation_state, bw_cell_state, + actual_bw_output); + TF_LITE_ENSURE_OK(context, bw_pass_status); + return kTfLiteOk; + } + default: + context->ReportError(context, "Type %d is not currently supported.", + fw_input_to_output_weights->type); + return kTfLiteError; } - - // Backward step. return kTfLiteOk; } diff --git a/tensorflow/contrib/lite/kernels/bidirectional_sequence_lstm_test.cc b/tensorflow/contrib/lite/kernels/bidirectional_sequence_lstm_test.cc index a18e1bce34..9cc04907e1 100644 --- a/tensorflow/contrib/lite/kernels/bidirectional_sequence_lstm_test.cc +++ b/tensorflow/contrib/lite/kernels/bidirectional_sequence_lstm_test.cc @@ -35,8 +35,8 @@ class BidirectionalLSTMOpModel : public SingleOpModel { BidirectionalLSTMOpModel(int n_batch, int n_input, int n_cell, int n_output, int sequence_length, bool use_cifg, bool use_peephole, bool use_projection_weights, - bool use_projection_bias, float cell_clip, - float proj_clip, + bool use_projection_bias, bool merge_outputs, + float cell_clip, float proj_clip, const std::vector<std::vector<int>>& input_shapes) : n_batch_(n_batch), n_input_(n_input), @@ -102,10 +102,6 @@ class BidirectionalLSTMOpModel : public SingleOpModel { fw_projection_bias_ = AddNullInput(); } - fw_output_state_ = AddOutput(TensorType_FLOAT32); - fw_cell_state_ = AddOutput(TensorType_FLOAT32); - fw_output_ = AddOutput(TensorType_FLOAT32); - if (use_cifg) { bw_input_to_input_weights_ = AddNullInput(); } else { @@ -161,14 +157,43 @@ class BidirectionalLSTMOpModel : public SingleOpModel { bw_projection_bias_ = AddNullInput(); } - bw_output_state_ = AddOutput(TensorType_FLOAT32); - bw_cell_state_ = AddOutput(TensorType_FLOAT32); - bw_output_ = AddOutput(TensorType_FLOAT32); + // Adding the 2 input state tensors. + fw_input_activation_state_ = + AddInput(TensorData{TensorType_FLOAT32, {n_fw_output_ * n_batch_}}, + /*is_variable=*/true); + fw_input_cell_state_ = + AddInput(TensorData{TensorType_FLOAT32, {n_fw_cell_ * n_batch_}}, + /*is_variable=*/true); + + // Adding the 2 input state tensors. 
+ bw_input_activation_state_ = + AddInput(TensorData{TensorType_FLOAT32, {n_bw_output_ * n_batch_}}, + /*is_variable=*/true); + bw_input_cell_state_ = + AddInput(TensorData{TensorType_FLOAT32, {n_bw_cell_ * n_batch_}}, + /*is_variable=*/true); + + fw_output_ = AddOutput(TensorType_FLOAT32); + + if (!merge_outputs) { + bw_output_ = AddOutput(TensorType_FLOAT32); + } + + aux_input_ = AddNullInput(); + fw_aux_input_to_input_weights_ = AddNullInput(); + fw_aux_input_to_forget_weights_ = AddNullInput(); + fw_aux_input_to_cell_weights_ = AddNullInput(); + fw_aux_input_to_output_weights_ = AddNullInput(); + bw_aux_input_to_input_weights_ = AddNullInput(); + bw_aux_input_to_forget_weights_ = AddNullInput(); + bw_aux_input_to_cell_weights_ = AddNullInput(); + bw_aux_input_to_output_weights_ = AddNullInput(); SetBuiltinOp(BuiltinOperator_BIDIRECTIONAL_SEQUENCE_LSTM, - BuiltinOptions_LSTMOptions, - CreateLSTMOptions(builder_, ActivationFunctionType_TANH, - cell_clip, proj_clip) + BuiltinOptions_BidirectionalSequenceLSTMOptions, + CreateBidirectionalSequenceLSTMOptions( + builder_, ActivationFunctionType_TANH, cell_clip, + proj_clip, merge_outputs) .Union()); BuildInterpreter(input_shapes); } @@ -259,26 +284,6 @@ class BidirectionalLSTMOpModel : public SingleOpModel { PopulateTensor(bw_projection_bias_, f); } - void ResetFwOutputAndCellStates() { - const int zero_buffer_size = n_fw_cell_ * n_batch_; - std::unique_ptr<float[]> zero_buffer(new float[zero_buffer_size]); - memset(zero_buffer.get(), 0, zero_buffer_size * sizeof(float)); - PopulateTensor(fw_output_state_, 0, zero_buffer.get(), - zero_buffer.get() + zero_buffer_size); - PopulateTensor(fw_cell_state_, 0, zero_buffer.get(), - zero_buffer.get() + zero_buffer_size); - } - - void ResetBwOutputAndCellStates() { - const int zero_buffer_size = n_bw_cell_ * n_batch_; - std::unique_ptr<float[]> zero_buffer(new float[zero_buffer_size]); - memset(zero_buffer.get(), 0, zero_buffer_size * sizeof(float)); - PopulateTensor(bw_output_state_, 0, zero_buffer.get(), - zero_buffer.get() + zero_buffer_size); - PopulateTensor(bw_cell_state_, 0, zero_buffer.get(), - zero_buffer.get() + zero_buffer_size); - } - void SetInput(int offset, float* begin, float* end) { PopulateTensor(input_, offset, begin, end); } @@ -340,13 +345,23 @@ class BidirectionalLSTMOpModel : public SingleOpModel { int bw_projection_weights_; int bw_projection_bias_; - int fw_output_; - int fw_output_state_; - int fw_cell_state_; + int fw_input_activation_state_; + int fw_input_cell_state_; + int bw_input_activation_state_; + int bw_input_cell_state_; + int fw_output_; int bw_output_; - int bw_output_state_; - int bw_cell_state_; + + int aux_input_; + int fw_aux_input_to_input_weights_; + int fw_aux_input_to_forget_weights_; + int fw_aux_input_to_cell_weights_; + int fw_aux_input_to_output_weights_; + int bw_aux_input_to_input_weights_; + int bw_aux_input_to_forget_weights_; + int bw_aux_input_to_cell_weights_; + int bw_aux_input_to_output_weights_; int n_batch_; int n_input_; @@ -368,7 +383,8 @@ TEST(LSTMOpTest, BlackBoxTestNoCifgNoPeepholeNoProjectionNoClipping) { BidirectionalLSTMOpModel lstm( n_batch, n_input, n_cell, n_output, sequence_length, /*use_cifg=*/false, /*use_peephole=*/false, /*use_projection_weights=*/false, - /*use_projection_bias=*/false, /*cell_clip=*/0.0, /*proj_clip=*/0.0, + /*use_projection_bias=*/false, /*merge_outputs=*/false, /*cell_clip=*/0.0, + /*proj_clip=*/0.0, { {sequence_length, n_batch, n_input}, // input tensor @@ -417,6 +433,22 @@ TEST(LSTMOpTest, 
BlackBoxTestNoCifgNoPeepholeNoProjectionNoClipping) { {0, 0}, // projection_weight tensor {0}, // projection_bias tensor + + {n_batch, n_output}, // activation_state tensor + {n_batch, n_cell}, // cell_state tensor + + {n_batch, n_output}, // activation_state tensor + {n_batch, n_cell}, // cell_state tensor + + {n_batch, sequence_length, 0}, // aux_input tensor + {n_cell, 0}, // aux_fw_input_to_input tensor + {n_cell, 0}, // aux_fw_input_to_forget tensor + {n_cell, 0}, // aux_fw_input_to_cell tensor + {n_cell, 0}, // aux_fw_input_to_output tensor + {n_cell, 0}, // aux_bw_input_to_input tensor + {n_cell, 0}, // aux_bw_input_to_forget tensor + {n_cell, 0}, // aux_bw_input_to_cell tensor + {n_cell, 0}, // aux_bw_input_to_output tensor }); lstm.SetInputToInputWeights({-0.45018822, -0.02338299, -0.0870589, @@ -474,10 +506,6 @@ TEST(LSTMOpTest, BlackBoxTestNoCifgNoPeepholeNoProjectionNoClipping) { -0.0332076, 0.123838, 0.309777, -0.17621, -0.0490733, 0.0739237, 0.067706, -0.0208124}; - // Resetting cell_state and output_state - lstm.ResetFwOutputAndCellStates(); - lstm.ResetBwOutputAndCellStates(); - float* batch0_start = lstm_input; float* batch0_end = batch0_start + lstm.num_inputs() * lstm.sequence_length(); @@ -500,34 +528,318 @@ TEST(LSTMOpTest, BlackBoxTestNoCifgNoPeepholeNoProjectionNoClipping) { bw_expected.insert(bw_expected.end(), bw_golden_start, bw_golden_end); EXPECT_THAT(lstm.GetBwOutput(), ElementsAreArray(ArrayFloatNear(bw_expected))); +} + +// Same as the previous test, yet with a single merged output tensor. +TEST(LSTMOpTest, BlackBoxTestMergedOutput) { + const int n_batch = 1; + const int n_input = 2; + // n_cell and n_output have the same size when there is no projection. + const int n_cell = 4; + const int n_output = 4; + const int sequence_length = 3; + + BidirectionalLSTMOpModel lstm( + n_batch, n_input, n_cell, n_output, sequence_length, /*use_cifg=*/false, + /*use_peephole=*/false, /*use_projection_weights=*/false, + /*use_projection_bias=*/false, /*merge_outputs=*/true, /*cell_clip=*/0.0, + /*proj_clip=*/0.0, + { + {sequence_length, n_batch, n_input}, // input tensor + + // Forward cell + {n_cell, n_input}, // input_to_input_weight tensor + {n_cell, n_input}, // input_to_forget_weight tensor + {n_cell, n_input}, // input_to_cell_weight tensor + {n_cell, n_input}, // input_to_output_weight tensor + + {n_cell, n_output}, // recurrent_to_input_weight tensor + {n_cell, n_output}, // recurrent_to_forget_weight tensor + {n_cell, n_output}, // recurrent_to_cell_weight tensor + {n_cell, n_output}, // recurrent_to_output_weight tensor + + {0}, // cell_to_input_weight tensor + {0}, // cell_to_forget_weight tensor + {0}, // cell_to_output_weight tensor + + {n_cell}, // input_gate_bias tensor + {n_cell}, // forget_gate_bias tensor + {n_cell}, // cell_bias tensor + {n_cell}, // output_gate_bias tensor + + {0, 0}, // projection_weight tensor + {0}, // projection_bias tensor + + // Backward cell + {n_cell, n_input}, // input_to_input_weight tensor + {n_cell, n_input}, // input_to_forget_weight tensor + {n_cell, n_input}, // input_to_cell_weight tensor + {n_cell, n_input}, // input_to_output_weight tensor + + {n_cell, n_output}, // recurrent_to_input_weight tensor + {n_cell, n_output}, // recurrent_to_forget_weight tensor + {n_cell, n_output}, // recurrent_to_cell_weight tensor + {n_cell, n_output}, // recurrent_to_output_weight tensor + + {0}, // cell_to_input_weight tensor + {0}, // cell_to_forget_weight tensor + {0}, // cell_to_output_weight tensor + + {n_cell}, // input_gate_bias 
tensor + {n_cell}, // forget_gate_bias tensor + {n_cell}, // cell_bias tensor + {n_cell}, // output_gate_bias tensor + + {0, 0}, // projection_weight tensor + {0}, // projection_bias tensor + + {n_batch, n_output}, // activation_state tensor + {n_batch, n_cell}, // cell_state tensor + + {n_batch, n_output}, // activation_state tensor + {n_batch, n_cell}, // cell_state tensor + + {n_batch, sequence_length, 0}, // aux_input tensor + {n_cell, 0}, // aux_fw_input_to_input tensor + {n_cell, 0}, // aux_fw_input_to_forget tensor + {n_cell, 0}, // aux_fw_input_to_cell tensor + {n_cell, 0}, // aux_fw_input_to_output tensor + {n_cell, 0}, // aux_bw_input_to_input tensor + {n_cell, 0}, // aux_bw_input_to_forget tensor + {n_cell, 0}, // aux_bw_input_to_cell tensor + {n_cell, 0}, // aux_bw_input_to_output tensor + }); + + lstm.SetInputToInputWeights({-0.45018822, -0.02338299, -0.0870589, + -0.34550029, 0.04266912, -0.15680569, + -0.34856534, 0.43890524}); + + lstm.SetInputToCellWeights({-0.50013041, 0.1370284, 0.11810488, 0.2013163, + -0.20583314, 0.44344562, 0.22077113, + -0.29909778}); + + lstm.SetInputToForgetWeights({0.09701663, 0.20334584, -0.50592935, + -0.31343272, -0.40032279, 0.44781327, + 0.01387155, -0.35593212}); + + lstm.SetInputToOutputWeights({-0.25065863, -0.28290087, 0.04613829, + 0.40525138, 0.44272184, 0.03897077, -0.1556896, + 0.19487578}); + + lstm.SetInputGateBias({0., 0., 0., 0.}); + + lstm.SetCellBias({0., 0., 0., 0.}); + + lstm.SetForgetGateBias({1., 1., 1., 1.}); + + lstm.SetOutputGateBias({0., 0., 0., 0.}); + + lstm.SetRecurrentToInputWeights( + {-0.0063535, -0.2042388, 0.31454784, -0.35746509, 0.28902304, 0.08183324, + -0.16555229, 0.02286911, -0.13566875, 0.03034258, 0.48091322, + -0.12528998, 0.24077177, -0.51332325, -0.33502164, 0.10629296}); + + lstm.SetRecurrentToCellWeights( + {-0.3407414, 0.24443203, -0.2078532, 0.26320225, 0.05695659, -0.00123841, + -0.4744786, -0.35869038, -0.06418842, -0.13502428, -0.501764, 0.22830659, + -0.46367589, 0.26016325, -0.03894562, -0.16368064}); + + lstm.SetRecurrentToForgetWeights( + {-0.48684245, -0.06655136, 0.42224967, 0.2112639, 0.27654213, 0.20864892, + -0.07646349, 0.45877004, 0.00141793, -0.14609534, 0.36447752, 0.09196436, + 0.28053468, 0.01560611, -0.20127171, -0.01140004}); + lstm.SetRecurrentToOutputWeights( + {0.43385774, -0.17194885, 0.2718237, 0.09215671, 0.24107647, -0.39835793, + 0.18212086, 0.01301402, 0.48572797, -0.50656658, 0.20047462, -0.20607421, + -0.51818722, -0.15390486, 0.0468148, 0.39922136}); + + // Input should have n_input * sequence_length many values. 
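// With merge_outputs == true the test below expects a single output tensor in which every
// timestep holds the forward cell's outputs followed by the backward cell's outputs (this
// is what the kernel's bw_output_offset achieves). A small illustrative helper, under that
// assumption, for building such a merged row-major sequence:
#include <vector>

std::vector<float> MergeOutputs(const std::vector<float>& fw, const std::vector<float>& bw,
                                int sequence_length, int n_fw_output, int n_bw_output) {
  std::vector<float> merged;
  merged.reserve(fw.size() + bw.size());
  for (int t = 0; t < sequence_length; ++t) {
    merged.insert(merged.end(), fw.begin() + t * n_fw_output,
                  fw.begin() + (t + 1) * n_fw_output);
    merged.insert(merged.end(), bw.begin() + t * n_bw_output,
                  bw.begin() + (t + 1) * n_bw_output);
  }
  return merged;
}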
+ static float lstm_input[] = {2., 3., 3., 4., 1., 1.}; + static float lstm_fw_golden_output[] = { + -0.02973187, 0.1229473, 0.20885126, -0.15358765, + -0.03716109, 0.12507336, 0.41193449, -0.20860538, + -0.15053082, 0.09120187, 0.24278517, -0.12222792}; + static float lstm_bw_golden_output[] = { + -0.0806187, 0.139077, 0.400476, -0.197842, -0.0332076, 0.123838, + 0.309777, -0.17621, -0.0490733, 0.0739237, 0.067706, -0.0208124}; + + float* batch0_start = lstm_input; + float* batch0_end = batch0_start + lstm.num_inputs() * lstm.sequence_length(); + + lstm.SetInput(0, batch0_start, batch0_end); + + lstm.Invoke(); + + std::vector<float> merged_expected; + for (int k = 0; k < lstm.sequence_length(); k++) { + merged_expected.insert( + merged_expected.end(), + lstm_fw_golden_output + k * lstm.num_fw_outputs(), + lstm_fw_golden_output + (k + 1) * lstm.num_fw_outputs()); + merged_expected.insert( + merged_expected.end(), + lstm_bw_golden_output + k * lstm.num_bw_outputs(), + lstm_bw_golden_output + (k + 1) * lstm.num_bw_outputs()); + } + EXPECT_THAT(lstm.GetFwOutput(), + ElementsAreArray(ArrayFloatNear(merged_expected))); +} + +TEST(LSTMOpTest, BlackBoxTestNoCifgNoPeepholeNoProjectionNoClippingReverse) { + const int n_batch = 1; + const int n_input = 2; + // n_cell and n_output have the same size when there is no projection. + const int n_cell = 4; + const int n_output = 4; + const int sequence_length = 3; + + BidirectionalLSTMOpModel lstm( + n_batch, n_input, n_cell, n_output, sequence_length, /*use_cifg=*/false, + /*use_peephole=*/false, /*use_projection_weights=*/false, + /*use_projection_bias=*/false, /*merge_outputs=*/false, /*cell_clip=*/0.0, + /*proj_clip=*/0.0, + { + {sequence_length, n_batch, n_input}, // input tensor + + // Forward cell + {n_cell, n_input}, // input_to_input_weight tensor + {n_cell, n_input}, // input_to_forget_weight tensor + {n_cell, n_input}, // input_to_cell_weight tensor + {n_cell, n_input}, // input_to_output_weight tensor + + {n_cell, n_output}, // recurrent_to_input_weight tensor + {n_cell, n_output}, // recurrent_to_forget_weight tensor + {n_cell, n_output}, // recurrent_to_cell_weight tensor + {n_cell, n_output}, // recurrent_to_output_weight tensor + + {0}, // cell_to_input_weight tensor + {0}, // cell_to_forget_weight tensor + {0}, // cell_to_output_weight tensor + + {n_cell}, // input_gate_bias tensor + {n_cell}, // forget_gate_bias tensor + {n_cell}, // cell_bias tensor + {n_cell}, // output_gate_bias tensor + + {0, 0}, // projection_weight tensor + {0}, // projection_bias tensor + + // Backward cell + {n_cell, n_input}, // input_to_input_weight tensor + {n_cell, n_input}, // input_to_forget_weight tensor + {n_cell, n_input}, // input_to_cell_weight tensor + {n_cell, n_input}, // input_to_output_weight tensor + + {n_cell, n_output}, // recurrent_to_input_weight tensor + {n_cell, n_output}, // recurrent_to_forget_weight tensor + {n_cell, n_output}, // recurrent_to_cell_weight tensor + {n_cell, n_output}, // recurrent_to_output_weight tensor + + {0}, // cell_to_input_weight tensor + {0}, // cell_to_forget_weight tensor + {0}, // cell_to_output_weight tensor + + {n_cell}, // input_gate_bias tensor + {n_cell}, // forget_gate_bias tensor + {n_cell}, // cell_bias tensor + {n_cell}, // output_gate_bias tensor + + {0, 0}, // projection_weight tensor + {0}, // projection_bias tensor + + {n_batch, n_output}, // activation_state tensor + {n_batch, n_cell}, // cell_state tensor + + {n_batch, n_output}, // activation_state tensor + {n_batch, n_cell}, // cell_state 
tensor + + {n_batch, sequence_length, 0}, // aux_input tensor + {n_cell, 0}, // aux_fw_input_to_input tensor + {n_cell, 0}, // aux_fw_input_to_forget tensor + {n_cell, 0}, // aux_fw_input_to_cell tensor + {n_cell, 0}, // aux_fw_input_to_output tensor + {n_cell, 0}, // aux_bw_input_to_input tensor + {n_cell, 0}, // aux_bw_input_to_forget tensor + {n_cell, 0}, // aux_bw_input_to_cell tensor + {n_cell, 0}, // aux_bw_input_to_output tensor + }); + + lstm.SetInputToInputWeights({-0.45018822, -0.02338299, -0.0870589, + -0.34550029, 0.04266912, -0.15680569, + -0.34856534, 0.43890524}); + + lstm.SetInputToCellWeights({-0.50013041, 0.1370284, 0.11810488, 0.2013163, + -0.20583314, 0.44344562, 0.22077113, + -0.29909778}); + + lstm.SetInputToForgetWeights({0.09701663, 0.20334584, -0.50592935, + -0.31343272, -0.40032279, 0.44781327, + 0.01387155, -0.35593212}); + + lstm.SetInputToOutputWeights({-0.25065863, -0.28290087, 0.04613829, + 0.40525138, 0.44272184, 0.03897077, -0.1556896, + 0.19487578}); + + lstm.SetInputGateBias({0., 0., 0., 0.}); + + lstm.SetCellBias({0., 0., 0., 0.}); + + lstm.SetForgetGateBias({1., 1., 1., 1.}); + + lstm.SetOutputGateBias({0., 0., 0., 0.}); + + lstm.SetRecurrentToInputWeights( + {-0.0063535, -0.2042388, 0.31454784, -0.35746509, 0.28902304, 0.08183324, + -0.16555229, 0.02286911, -0.13566875, 0.03034258, 0.48091322, + -0.12528998, 0.24077177, -0.51332325, -0.33502164, 0.10629296}); + + lstm.SetRecurrentToCellWeights( + {-0.3407414, 0.24443203, -0.2078532, 0.26320225, 0.05695659, -0.00123841, + -0.4744786, -0.35869038, -0.06418842, -0.13502428, -0.501764, 0.22830659, + -0.46367589, 0.26016325, -0.03894562, -0.16368064}); + + lstm.SetRecurrentToForgetWeights( + {-0.48684245, -0.06655136, 0.42224967, 0.2112639, 0.27654213, 0.20864892, + -0.07646349, 0.45877004, 0.00141793, -0.14609534, 0.36447752, 0.09196436, + 0.28053468, 0.01560611, -0.20127171, -0.01140004}); + + lstm.SetRecurrentToOutputWeights( + {0.43385774, -0.17194885, 0.2718237, 0.09215671, 0.24107647, -0.39835793, + 0.18212086, 0.01301402, 0.48572797, -0.50656658, 0.20047462, -0.20607421, + -0.51818722, -0.15390486, 0.0468148, 0.39922136}); + + // Input should have n_input * sequence_length many values. // Check reversed inputs. 
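// The reversed input used below is the original sequence with its timesteps swapped while
// the values inside each timestep keep their order, e.g. {2,3, 3,4, 1,1} becomes
// {1,1, 3,4, 2,3} for n_input == 2. A minimal sketch of that transformation (the helper
// name is illustrative only):
#include <vector>

std::vector<float> ReverseTimesteps(const std::vector<float>& input, int sequence_length,
                                    int n_input) {
  std::vector<float> reversed(input.size());
  for (int t = 0; t < sequence_length; ++t) {
    for (int i = 0; i < n_input; ++i) {
      reversed[t * n_input + i] = input[(sequence_length - 1 - t) * n_input + i];
    }
  }
  return reversed;
}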
static float lstm_input_reversed[] = {1., 1., 3., 4., 2., 3.}; + static float lstm_fw_golden_output[] = { + -0.02973187, 0.1229473, 0.20885126, -0.15358765, + -0.03716109, 0.12507336, 0.41193449, -0.20860538, + -0.15053082, 0.09120187, 0.24278517, -0.12222792}; + static float lstm_bw_golden_output[] = { + -0.0806187, 0.139077, 0.400476, -0.197842, -0.0332076, 0.123838, + 0.309777, -0.17621, -0.0490733, 0.0739237, 0.067706, -0.0208124}; - // Resetting cell_state and output_state - lstm.ResetFwOutputAndCellStates(); - lstm.ResetBwOutputAndCellStates(); - - batch0_start = lstm_input_reversed; - batch0_end = batch0_start + lstm.num_inputs() * lstm.sequence_length(); + float* batch0_start = lstm_input_reversed; + float* batch0_end = batch0_start + lstm.num_inputs() * lstm.sequence_length(); lstm.SetInput(0, batch0_start, batch0_end); lstm.Invoke(); - fw_expected.clear(); + std::vector<float> fw_expected; for (int s = 0; s < lstm.sequence_length(); s++) { - fw_golden_start = lstm_fw_golden_output + s * lstm.num_fw_outputs(); - fw_golden_end = fw_golden_start + lstm.num_fw_outputs(); + float* fw_golden_start = lstm_fw_golden_output + s * lstm.num_fw_outputs(); + float* fw_golden_end = fw_golden_start + lstm.num_fw_outputs(); fw_expected.insert(fw_expected.begin(), fw_golden_start, fw_golden_end); } EXPECT_THAT(lstm.GetBwOutput(), ElementsAreArray(ArrayFloatNear(fw_expected))); - bw_expected.clear(); + std::vector<float> bw_expected; for (int s = 0; s < lstm.sequence_length(); s++) { - bw_golden_start = lstm_bw_golden_output + s * lstm.num_bw_outputs(); - bw_golden_end = bw_golden_start + lstm.num_bw_outputs(); + float* bw_golden_start = lstm_bw_golden_output + s * lstm.num_bw_outputs(); + float* bw_golden_end = bw_golden_start + lstm.num_bw_outputs(); bw_expected.insert(bw_expected.begin(), bw_golden_start, bw_golden_end); } EXPECT_THAT(lstm.GetFwOutput(), @@ -545,7 +857,8 @@ TEST(LSTMOpTest, BlackBoxTestWithCifgWithPeepholeNoProjectionNoClipping) { BidirectionalLSTMOpModel lstm( n_batch, n_input, n_cell, n_output, sequence_length, /*use_cifg=*/true, /*use_peephole=*/true, /*use_projection_weights=*/false, - /*use_projection_bias=*/false, /*cell_clip=*/0.0, /*proj_clip=*/0.0, + /*use_projection_bias=*/false, /*merge_outputs=*/false, /*cell_clip=*/0.0, + /*proj_clip=*/0.0, { {sequence_length, n_batch, n_input}, // input tensor @@ -592,6 +905,22 @@ TEST(LSTMOpTest, BlackBoxTestWithCifgWithPeepholeNoProjectionNoClipping) { {0, 0}, // projection_weight tensor {0}, // projection_bias tensor + + {n_batch, n_output}, // activation_state tensor + {n_batch, n_cell}, // cell_state tensor + + {n_batch, n_output}, // activation_state tensor + {n_batch, n_cell}, // cell_state tensor + + {n_batch, sequence_length, 0}, // aux_input tensor + {n_cell, 0}, // aux_fw_input_to_input tensor + {n_cell, 0}, // aux_fw_input_to_forget tensor + {n_cell, 0}, // aux_fw_input_to_cell tensor + {n_cell, 0}, // aux_fw_input_to_output tensor + {n_cell, 0}, // aux_bw_input_to_input tensor + {n_cell, 0}, // aux_bw_input_to_forget tensor + {n_cell, 0}, // aux_bw_input_to_cell tensor + {n_cell, 0}, // aux_bw_input_to_output tensor }); lstm.SetInputToCellWeights({-0.49770179, -0.27711356, -0.09624726, 0.05100781, @@ -642,10 +971,6 @@ TEST(LSTMOpTest, BlackBoxTestWithCifgWithPeepholeNoProjectionNoClipping) { -0.401685, -0.0232794, 0.288642, -0.123074, -0.42915, -0.00871577, 0.20912, -0.103567, -0.166398, -0.00486649, 0.0697471, -0.0537578}; - // Resetting cell_state and output_state - lstm.ResetFwOutputAndCellStates(); - 
lstm.ResetBwOutputAndCellStates(); - float* batch0_start = lstm_input; float* batch0_end = batch0_start + lstm.num_inputs() * lstm.sequence_length(); @@ -668,34 +993,154 @@ TEST(LSTMOpTest, BlackBoxTestWithCifgWithPeepholeNoProjectionNoClipping) { bw_expected.insert(bw_expected.end(), bw_golden_start, bw_golden_end); EXPECT_THAT(lstm.GetBwOutput(), ElementsAreArray(ArrayFloatNear(bw_expected))); +} - // Check reversed inputs. - static float lstm_input_reversed[] = {1., 1., 3., 4., 2., 3.}; +TEST(LSTMOpTest, + BlackBoxTestWithCifgWithPeepholeNoProjectionNoClippingReversed) { + const int n_batch = 1; + const int n_input = 2; + // n_cell and n_output have the same size when there is no projection. + const int n_cell = 4; + const int n_output = 4; + const int sequence_length = 3; - // Resetting cell_state and output_state - lstm.ResetFwOutputAndCellStates(); - lstm.ResetBwOutputAndCellStates(); + BidirectionalLSTMOpModel lstm( + n_batch, n_input, n_cell, n_output, sequence_length, /*use_cifg=*/true, + /*use_peephole=*/true, /*use_projection_weights=*/false, + /*use_projection_bias=*/false, /*merge_outputs=*/false, /*cell_clip=*/0.0, + /*proj_clip=*/0.0, + { + {sequence_length, n_batch, n_input}, // input tensor - batch0_start = lstm_input_reversed; - batch0_end = batch0_start + lstm.num_inputs() * lstm.sequence_length(); + {0, 0}, // input_to_input_weight tensor + {n_cell, n_input}, // input_to_forget_weight tensor + {n_cell, n_input}, // input_to_cell_weight tensor + {n_cell, n_input}, // input_to_output_weight tensor + + {0, 0}, // recurrent_to_input_weight tensor + {n_cell, n_output}, // recurrent_to_forget_weight tensor + {n_cell, n_output}, // recurrent_to_cell_weight tensor + {n_cell, n_output}, // recurrent_to_output_weight tensor + + {0}, // cell_to_input_weight tensor + {n_cell}, // cell_to_forget_weight tensor + {n_cell}, // cell_to_output_weight tensor + + {0}, // input_gate_bias tensor + {n_cell}, // forget_gate_bias tensor + {n_cell}, // cell_bias tensor + {n_cell}, // output_gate_bias tensor + + {0, 0}, // projection_weight tensor + {0}, // projection_bias tensor + + {0, 0}, // input_to_input_weight tensor + {n_cell, n_input}, // input_to_forget_weight tensor + {n_cell, n_input}, // input_to_cell_weight tensor + {n_cell, n_input}, // input_to_output_weight tensor + + {0, 0}, // recurrent_to_input_weight tensor + {n_cell, n_output}, // recurrent_to_forget_weight tensor + {n_cell, n_output}, // recurrent_to_cell_weight tensor + {n_cell, n_output}, // recurrent_to_output_weight tensor + + {0}, // cell_to_input_weight tensor + {n_cell}, // cell_to_forget_weight tensor + {n_cell}, // cell_to_output_weight tensor + + {0}, // input_gate_bias tensor + {n_cell}, // forget_gate_bias tensor + {n_cell}, // cell_bias tensor + {n_cell}, // output_gate_bias tensor + + {0, 0}, // projection_weight tensor + {0}, // projection_bias tensor + + {n_batch, n_output}, // activation_state tensor + {n_batch, n_cell}, // cell_state tensor + + {n_batch, n_output}, // activation_state tensor + {n_batch, n_cell}, // cell_state tensor + + {n_batch, sequence_length, 0}, // aux_input tensor + {n_cell, 0}, // aux_fw_input_to_input tensor + {n_cell, 0}, // aux_fw_input_to_forget tensor + {n_cell, 0}, // aux_fw_input_to_cell tensor + {n_cell, 0}, // aux_fw_input_to_output tensor + {n_cell, 0}, // aux_bw_input_to_input tensor + {n_cell, 0}, // aux_bw_input_to_forget tensor + {n_cell, 0}, // aux_bw_input_to_cell tensor + {n_cell, 0}, // aux_bw_input_to_output tensor + }); + + lstm.SetInputToCellWeights({-0.49770179, 
-0.27711356, -0.09624726, 0.05100781, + 0.04717243, 0.48944736, -0.38535351, + -0.17212132}); + + lstm.SetInputToForgetWeights({-0.55291498, -0.42866567, 0.13056988, + -0.3633365, -0.22755712, 0.28253698, 0.24407166, + 0.33826375}); + + lstm.SetInputToOutputWeights({0.10725588, -0.02335852, -0.55932593, + -0.09426838, -0.44257352, 0.54939759, + 0.01533556, 0.42751634}); + + lstm.SetCellBias({0., 0., 0., 0.}); + + lstm.SetForgetGateBias({1., 1., 1., 1.}); + + lstm.SetOutputGateBias({0., 0., 0., 0.}); + + lstm.SetRecurrentToCellWeights( + {0.54066205, -0.32668582, -0.43562764, -0.56094903, 0.42957711, + 0.01841056, -0.32764608, -0.33027974, -0.10826075, 0.20675004, + 0.19069612, -0.03026325, -0.54532051, 0.33003211, 0.44901288, + 0.21193194}); + + lstm.SetRecurrentToForgetWeights( + {-0.13832897, -0.0515101, -0.2359007, -0.16661474, -0.14340827, + 0.36986142, 0.23414481, 0.55899, 0.10798943, -0.41174671, 0.17751795, + -0.34484994, -0.35874045, -0.11352962, 0.27268326, 0.54058349}); + + lstm.SetRecurrentToOutputWeights( + {0.41613156, 0.42610586, -0.16495961, -0.5663873, 0.30579174, -0.05115908, + -0.33941799, 0.23364776, 0.11178309, 0.09481031, -0.26424935, 0.46261835, + 0.50248802, 0.26114327, -0.43736315, 0.33149987}); + + lstm.SetCellToForgetWeights( + {0.47485286, -0.51955009, -0.24458408, 0.31544167}); + lstm.SetCellToOutputWeights( + {-0.17135078, 0.82760304, 0.85573703, -0.77109635}); + + static float lstm_input_reversed[] = {1., 1., 3., 4., 2., 3.}; + static float lstm_fw_golden_output[] = { + -0.36444446, -0.00352185, 0.12886585, -0.05163646, + -0.42312205, -0.01218222, 0.24201041, -0.08124574, + -0.358325, -0.04621704, 0.21641694, -0.06471302}; + static float lstm_bw_golden_output[] = { + -0.401685, -0.0232794, 0.288642, -0.123074, -0.42915, -0.00871577, + 0.20912, -0.103567, -0.166398, -0.00486649, 0.0697471, -0.0537578}; + + float* batch0_start = lstm_input_reversed; + float* batch0_end = batch0_start + lstm.num_inputs() * lstm.sequence_length(); lstm.SetInput(0, batch0_start, batch0_end); lstm.Invoke(); - fw_expected.clear(); + std::vector<float> fw_expected; for (int s = 0; s < lstm.sequence_length(); s++) { - fw_golden_start = lstm_fw_golden_output + s * lstm.num_fw_outputs(); - fw_golden_end = fw_golden_start + lstm.num_fw_outputs(); + float* fw_golden_start = lstm_fw_golden_output + s * lstm.num_fw_outputs(); + float* fw_golden_end = fw_golden_start + lstm.num_fw_outputs(); fw_expected.insert(fw_expected.begin(), fw_golden_start, fw_golden_end); } EXPECT_THAT(lstm.GetBwOutput(), ElementsAreArray(ArrayFloatNear(fw_expected))); - bw_expected.clear(); + std::vector<float> bw_expected; for (int s = 0; s < lstm.sequence_length(); s++) { - bw_golden_start = lstm_bw_golden_output + s * lstm.num_bw_outputs(); - bw_golden_end = bw_golden_start + lstm.num_bw_outputs(); + float* bw_golden_start = lstm_bw_golden_output + s * lstm.num_bw_outputs(); + float* bw_golden_end = bw_golden_start + lstm.num_bw_outputs(); bw_expected.insert(bw_expected.begin(), bw_golden_start, bw_golden_end); } EXPECT_THAT(lstm.GetFwOutput(), @@ -712,7 +1157,8 @@ TEST(LSTMOpTest, BlackBoxTestWithPeepholeWithProjectionNoClipping) { BidirectionalLSTMOpModel lstm( n_batch, n_input, n_cell, n_output, sequence_length, /*use_cifg=*/false, /*use_peephole=*/true, /*use_projection_weights=*/true, - /*use_projection_bias=*/false, /*cell_clip=*/0.0, /*proj_clip=*/0.0, + /*use_projection_bias=*/false, /*merge_outputs=*/false, /*cell_clip=*/0.0, + /*proj_clip=*/0.0, { {sequence_length, n_batch, n_input}, // input tensor @@ 
-759,6 +1205,22 @@ TEST(LSTMOpTest, BlackBoxTestWithPeepholeWithProjectionNoClipping) { {n_output, n_cell}, // projection_weight tensor {0}, // projection_bias tensor + + {n_batch, n_output}, // activation_state tensor + {n_batch, n_cell}, // cell_state tensor + + {n_batch, n_output}, // activation_state tensor + {n_batch, n_cell}, // cell_state tensor + + {n_batch, sequence_length, 0}, // aux_input tensor + {n_cell, 0}, // aux_fw_input_to_input tensor + {n_cell, 0}, // aux_fw_input_to_forget tensor + {n_cell, 0}, // aux_fw_input_to_cell tensor + {n_cell, 0}, // aux_fw_input_to_output tensor + {n_cell, 0}, // aux_bw_input_to_input tensor + {n_cell, 0}, // aux_bw_input_to_forget tensor + {n_cell, 0}, // aux_bw_input_to_cell tensor + {n_cell, 0}, // aux_bw_input_to_output tensor }); lstm.SetInputToInputWeights( @@ -1343,10 +1805,6 @@ TEST(LSTMOpTest, BlackBoxTestWithPeepholeWithProjectionNoClipping) { 0.065133, 0.024321, 0.038473, 0.062438 }}; - // Resetting cell_state and output_state - lstm.ResetFwOutputAndCellStates(); - lstm.ResetBwOutputAndCellStates(); - for (int i = 0; i < lstm.sequence_length(); i++) { float* batch0_start = lstm_input[0] + i * lstm.num_inputs(); float* batch0_end = batch0_start + lstm.num_inputs(); diff --git a/tensorflow/contrib/lite/kernels/bidirectional_sequence_rnn.cc b/tensorflow/contrib/lite/kernels/bidirectional_sequence_rnn.cc index aa24c1f34c..c22a457a71 100644 --- a/tensorflow/contrib/lite/kernels/bidirectional_sequence_rnn.cc +++ b/tensorflow/contrib/lite/kernels/bidirectional_sequence_rnn.cc @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include <unistd.h> #include <cassert> #include <cmath> #include <cstdlib> @@ -20,10 +19,11 @@ limitations under the License. #include <iostream> #include <limits> -#include "tensorflow/contrib/lite/builtin_op_data.h" -#include "tensorflow/contrib/lite/context.h" +#include "tensorflow/contrib/lite/c/builtin_op_data.h" +#include "tensorflow/contrib/lite/c/c_api_internal.h" #include "tensorflow/contrib/lite/kernels/activation_functor.h" #include "tensorflow/contrib/lite/kernels/internal/kernel_utils.h" +#include "tensorflow/contrib/lite/kernels/kernel_util.h" #include "tensorflow/contrib/lite/kernels/op_macros.h" namespace tflite { @@ -36,34 +36,84 @@ constexpr int kInputTensor = 0; constexpr int kFwWeightsTensor = 1; constexpr int kFwRecurrentWeightsTensor = 2; constexpr int kFwBiasTensor = 3; -constexpr int kBwWeightsTensor = 4; -constexpr int kBwRecurrentWeightsTensor = 5; -constexpr int kBwBiasTensor = 6; -// State and output tensors. -constexpr int kFwHiddenStateTensor = 0; -constexpr int kFwOutputTensor = 1; -constexpr int kBwHiddenStateTensor = 2; -constexpr int kBwOutputTensor = 3; +constexpr int kFwHiddenStateTensor = 4; +constexpr int kBwWeightsTensor = 5; +constexpr int kBwRecurrentWeightsTensor = 6; +constexpr int kBwBiasTensor = 7; +constexpr int kBwHiddenStateTensor = 8; +// Auxiliary inputs. +constexpr int kAuxInputTensor = 9; // Optional. +constexpr int kFwAuxWeightsTensor = 10; // Optional. +constexpr int kBwAuxWeightsTensor = 11; // Optional. +// Output tensors. +constexpr int kFwOutputTensor = 0; +constexpr int kBwOutputTensor = 1; // Only if merge_outputs is false. + +// Temporary tensors. 
+enum TemporaryTensor { + kInputQuantized = 0, + kFwHiddenStateQuantized = 1, + kBwHiddenStateQuantized = 2, + kScalingFactors = 3, + kAuxInputQuantized = 4, + kNumTemporaryTensors = 5 +}; + +void* Init(TfLiteContext* context, const char* buffer, size_t length) { + auto* scratch_tensor_index = new int; + context->AddTensors(context, kNumTemporaryTensors, scratch_tensor_index); + return scratch_tensor_index; +} + +void Free(TfLiteContext* context, void* buffer) { + delete reinterpret_cast<int*>(buffer); +} TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { + const auto* params = reinterpret_cast<TfLiteBidirectionalSequenceRNNParams*>( + node->builtin_data); + // Check we have all the inputs and outputs we need. - TF_LITE_ENSURE_EQ(context, node->inputs->size, 7); - TF_LITE_ENSURE_EQ(context, node->outputs->size, 4); - - TfLiteTensor* input = &context->tensors[node->inputs->data[kInputTensor]]; - TfLiteTensor* fw_input_weights = - &context->tensors[node->inputs->data[kFwWeightsTensor]]; - TfLiteTensor* fw_recurrent_weights = - &context->tensors[node->inputs->data[kFwRecurrentWeightsTensor]]; - TfLiteTensor* fw_bias = &context->tensors[node->inputs->data[kFwBiasTensor]]; - TfLiteTensor* bw_input_weights = - &context->tensors[node->inputs->data[kBwWeightsTensor]]; - TfLiteTensor* bw_recurrent_weights = - &context->tensors[node->inputs->data[kBwRecurrentWeightsTensor]]; - TfLiteTensor* bw_bias = &context->tensors[node->inputs->data[kBwBiasTensor]]; + TF_LITE_ENSURE_EQ(context, node->inputs->size, 12); + TF_LITE_ENSURE_EQ(context, node->outputs->size, + params->merge_outputs ? 1 : 2); + + const TfLiteTensor* input = GetInput(context, node, kInputTensor); + const TfLiteTensor* fw_input_weights = + GetInput(context, node, kFwWeightsTensor); + const TfLiteTensor* fw_recurrent_weights = + GetInput(context, node, kFwRecurrentWeightsTensor); + const TfLiteTensor* fw_bias = GetInput(context, node, kFwBiasTensor); + const TfLiteTensor* fw_hidden_state = + GetInput(context, node, kFwHiddenStateTensor); + const TfLiteTensor* bw_input_weights = + GetInput(context, node, kBwWeightsTensor); + const TfLiteTensor* bw_recurrent_weights = + GetInput(context, node, kBwRecurrentWeightsTensor); + const TfLiteTensor* bw_bias = GetInput(context, node, kBwBiasTensor); + const TfLiteTensor* bw_hidden_state = + GetInput(context, node, kBwHiddenStateTensor); + + const TfLiteTensor* aux_input = + GetOptionalInputTensor(context, node, kAuxInputTensor); + const TfLiteTensor* fw_aux_input_weights = + GetOptionalInputTensor(context, node, kFwAuxWeightsTensor); + const TfLiteTensor* bw_aux_input_weights = + GetOptionalInputTensor(context, node, kBwAuxWeightsTensor); + + const bool aux_inputs_all_or_none = + ((aux_input != nullptr) && (fw_aux_input_weights != nullptr) && + (bw_aux_input_weights != nullptr)) || + ((aux_input == nullptr) && (fw_aux_input_weights == nullptr) && + (bw_aux_input_weights == nullptr)); + TF_LITE_ENSURE(context, aux_inputs_all_or_none); + const bool has_aux_input = (aux_input != nullptr); // Check all the parameters of tensor match within themselves and match the // input configuration. 
+ TF_LITE_ENSURE_EQ(context, input->type, kTfLiteFloat32); + + TF_LITE_ENSURE_EQ(context, input->dims->size, 3); const int batch_size = input->dims->data[0]; const int max_time = input->dims->data[1]; const int fw_num_units = fw_input_weights->dims->data[0]; @@ -76,77 +126,150 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { fw_bias->dims->data[0]); TF_LITE_ASSERT_EQ(bw_recurrent_weights->dims->data[1], bw_bias->dims->data[0]); + TF_LITE_ENSURE_EQ(context, NumDimensions(fw_hidden_state), 2); + TF_LITE_ENSURE_EQ(context, fw_hidden_state->dims->data[0], batch_size); + TF_LITE_ENSURE_EQ(context, fw_hidden_state->dims->data[1], fw_num_units); + TF_LITE_ENSURE_EQ(context, NumDimensions(bw_hidden_state), 2); + TF_LITE_ENSURE_EQ(context, bw_hidden_state->dims->data[0], batch_size); + TF_LITE_ENSURE_EQ(context, bw_hidden_state->dims->data[1], bw_num_units); - TfLiteTensor* fw_output = - &context->tensors[node->outputs->data[kFwOutputTensor]]; - TfLiteTensor* bw_output = - &context->tensors[node->outputs->data[kBwOutputTensor]]; + if (has_aux_input) { + // Check that aux_input has the same dimensions (except last) as the input. + TF_LITE_ASSERT_EQ(aux_input->dims->data[0], input->dims->data[0]); + TF_LITE_ASSERT_EQ(aux_input->dims->data[1], input->dims->data[1]); + // Check that aux_input_weights has the same dimensions (except last) as + // the input_weights. + TF_LITE_ASSERT_EQ(fw_aux_input_weights->dims->data[0], fw_num_units); + TF_LITE_ASSERT_EQ(bw_aux_input_weights->dims->data[0], bw_num_units); + TF_LITE_ASSERT_EQ(aux_input->dims->data[2], + fw_aux_input_weights->dims->data[1]); + TF_LITE_ASSERT_EQ(aux_input->dims->data[2], + bw_aux_input_weights->dims->data[1]); + } - // Resize hidden states. - TfLiteIntArray* fw_hidden_state_size_array = TfLiteIntArrayCreate(2); - fw_hidden_state_size_array->data[0] = batch_size; - fw_hidden_state_size_array->data[1] = fw_num_units; - TfLiteTensor* fw_hidden_state = - &context->tensors[node->outputs->data[kFwHiddenStateTensor]]; - TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, fw_hidden_state, - fw_hidden_state_size_array)); + const bool is_hybrid_op = + (fw_input_weights->type == kTfLiteUInt8 && input->type == kTfLiteFloat32); - TfLiteIntArray* bw_hidden_state_size_array = TfLiteIntArrayCreate(2); - bw_hidden_state_size_array->data[0] = batch_size; - bw_hidden_state_size_array->data[1] = fw_num_units; - TfLiteTensor* bw_hidden_state = - &context->tensors[node->outputs->data[kBwHiddenStateTensor]]; - TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, bw_hidden_state, - bw_hidden_state_size_array)); + if (is_hybrid_op) { + int* scratch_tensor_index = reinterpret_cast<int*>(node->user_data); + + TfLiteIntArrayFree(node->temporaries); + if (has_aux_input) { + node->temporaries = TfLiteIntArrayCreate(kNumTemporaryTensors); + } else { + // No need to create a temporary tensor for the non-existent aux_input. 
+ node->temporaries = TfLiteIntArrayCreate(kNumTemporaryTensors - 1); + } + + node->temporaries->data[kInputQuantized] = + *scratch_tensor_index + kInputQuantized; + TfLiteTensor* input_quantized = + GetTemporary(context, node, kInputQuantized); + input_quantized->type = kTfLiteUInt8; + input_quantized->allocation_type = kTfLiteArenaRw; + if (!TfLiteIntArrayEqual(input_quantized->dims, input->dims)) { + TfLiteIntArray* input_quantized_size = TfLiteIntArrayCopy(input->dims); + TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, input_quantized, + input_quantized_size)); + } - // Mark hidden states as a persistent tensor. - fw_hidden_state->allocation_type = kTfLiteArenaRwPersistent; - bw_hidden_state->allocation_type = kTfLiteArenaRwPersistent; + node->temporaries->data[kFwHiddenStateQuantized] = + *scratch_tensor_index + kFwHiddenStateQuantized; + TfLiteTensor* fw_hidden_state_quantized = + GetTemporary(context, node, kFwHiddenStateQuantized); + fw_hidden_state_quantized->type = kTfLiteUInt8; + fw_hidden_state_quantized->allocation_type = kTfLiteArenaRw; + if (!TfLiteIntArrayEqual(fw_hidden_state_quantized->dims, + fw_hidden_state->dims)) { + TfLiteIntArray* fw_hidden_state_quantized_size = + TfLiteIntArrayCopy(fw_hidden_state->dims); + TF_LITE_ENSURE_OK( + context, context->ResizeTensor(context, fw_hidden_state_quantized, + fw_hidden_state_quantized_size)); + } + + node->temporaries->data[kBwHiddenStateQuantized] = + *scratch_tensor_index + kBwHiddenStateQuantized; + TfLiteTensor* bw_hidden_state_quantized = + GetTemporary(context, node, kBwHiddenStateQuantized); + bw_hidden_state_quantized->type = kTfLiteUInt8; + bw_hidden_state_quantized->allocation_type = kTfLiteArenaRw; + if (!TfLiteIntArrayEqual(bw_hidden_state_quantized->dims, + bw_hidden_state->dims)) { + TfLiteIntArray* bw_hidden_state_quantized_size = + TfLiteIntArrayCopy(bw_hidden_state->dims); + TF_LITE_ENSURE_OK( + context, context->ResizeTensor(context, bw_hidden_state_quantized, + bw_hidden_state_quantized_size)); + } + + // Allocate temporary tensors to store scaling factors of quantization. + node->temporaries->data[kScalingFactors] = + *scratch_tensor_index + kScalingFactors; + TfLiteTensor* scaling_factors = + GetTemporary(context, node, kScalingFactors); + scaling_factors->type = kTfLiteFloat32; + scaling_factors->allocation_type = kTfLiteArenaRw; + TfLiteIntArray* scaling_factors_size = TfLiteIntArrayCreate(1); + scaling_factors_size->data[0] = batch_size; + if (!TfLiteIntArrayEqual(scaling_factors->dims, scaling_factors_size)) { + TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, scaling_factors, + scaling_factors_size)); + } + + if (has_aux_input) { + node->temporaries->data[kAuxInputQuantized] = + *scratch_tensor_index + kAuxInputQuantized; + TfLiteTensor* aux_input_quantized = + GetTemporary(context, node, kAuxInputQuantized); + aux_input_quantized->type = kTfLiteUInt8; + aux_input_quantized->allocation_type = kTfLiteArenaRw; + if (!TfLiteIntArrayEqual(aux_input_quantized->dims, aux_input->dims)) { + TfLiteIntArray* aux_input_quantized_size = + TfLiteIntArrayCopy(aux_input->dims); + TF_LITE_ENSURE_OK(context, + context->ResizeTensor(context, aux_input_quantized, + aux_input_quantized_size)); + } + } + } // Resize outputs. 
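Before the output resize that follows, note that each temporary above repeats the same resize-only-if-needed idiom; condensed into one hypothetical helper (the name ResizeTemporaryLike is illustrative and does not appear in the diff):

TfLiteStatus ResizeTemporaryLike(TfLiteContext* context, TfLiteTensor* temp,
                                 const TfLiteIntArray* reference_dims) {
  // Skip the resize when the temporary already has the right shape, so
  // repeated Prepare calls stay cheap.
  if (!TfLiteIntArrayEqual(temp->dims, reference_dims)) {
    TfLiteIntArray* new_size = TfLiteIntArrayCopy(reference_dims);
    return context->ResizeTensor(context, temp, new_size);
  }
  return kTfLiteOk;
}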
+ TfLiteTensor* fw_output = GetOutput(context, node, kFwOutputTensor); TfLiteIntArray* fw_output_size_array = TfLiteIntArrayCreate(3); fw_output_size_array->data[0] = batch_size; fw_output_size_array->data[1] = max_time; - fw_output_size_array->data[2] = fw_num_units; + fw_output_size_array->data[2] = + params->merge_outputs ? fw_num_units + bw_num_units : fw_num_units; TF_LITE_ENSURE_OK( context, context->ResizeTensor(context, fw_output, fw_output_size_array)); - TfLiteIntArray* bw_output_size_array = TfLiteIntArrayCreate(3); - bw_output_size_array->data[0] = batch_size; - bw_output_size_array->data[1] = max_time; - bw_output_size_array->data[2] = bw_num_units; - TF_LITE_ENSURE_OK( - context, context->ResizeTensor(context, bw_output, bw_output_size_array)); + if (!params->merge_outputs) { + TfLiteTensor* bw_output = GetOutput(context, node, kBwOutputTensor); + TfLiteIntArray* bw_output_size_array = TfLiteIntArrayCreate(3); + bw_output_size_array->data[0] = batch_size; + bw_output_size_array->data[1] = max_time; + bw_output_size_array->data[2] = bw_num_units; + TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, bw_output, + bw_output_size_array)); + } return kTfLiteOk; } -TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { - auto* params = reinterpret_cast<TfLiteSequenceRNNParams*>(node->builtin_data); - - TfLiteTensor* input = &context->tensors[node->inputs->data[kInputTensor]]; - TfLiteTensor* fw_input_weights = - &context->tensors[node->inputs->data[kFwWeightsTensor]]; - TfLiteTensor* fw_recurrent_weights = - &context->tensors[node->inputs->data[kFwRecurrentWeightsTensor]]; - TfLiteTensor* fw_bias = &context->tensors[node->inputs->data[kFwBiasTensor]]; - TfLiteTensor* fw_hidden_state = - &context->tensors[node->outputs->data[kFwHiddenStateTensor]]; - TfLiteTensor* fw_output = - &context->tensors[node->outputs->data[kFwOutputTensor]]; - - TfLiteTensor* bw_input_weights = - &context->tensors[node->inputs->data[kBwWeightsTensor]]; - TfLiteTensor* bw_recurrent_weights = - &context->tensors[node->inputs->data[kBwRecurrentWeightsTensor]]; - TfLiteTensor* bw_bias = &context->tensors[node->inputs->data[kBwBiasTensor]]; - TfLiteTensor* bw_hidden_state = - &context->tensors[node->outputs->data[kBwHiddenStateTensor]]; - TfLiteTensor* bw_output = - &context->tensors[node->outputs->data[kBwOutputTensor]]; - +TfLiteStatus EvalFloat( + const TfLiteTensor* input, const TfLiteTensor* fw_input_weights, + const TfLiteTensor* fw_recurrent_weights, const TfLiteTensor* fw_bias, + const TfLiteTensor* bw_input_weights, + const TfLiteTensor* bw_recurrent_weights, const TfLiteTensor* bw_bias, + const TfLiteTensor* aux_input, const TfLiteTensor* fw_aux_input_weights, + const TfLiteTensor* bw_aux_input_weights, + const TfLiteBidirectionalSequenceRNNParams* params, + TfLiteTensor* fw_hidden_state, TfLiteTensor* fw_output, + TfLiteTensor* bw_hidden_state, TfLiteTensor* bw_output) { const int batch_size = input->dims->data[0]; const int max_time = input->dims->data[1]; const int input_size = input->dims->data[2]; + const int aux_input_size = (aux_input) ? aux_input->dims->data[2] : 0; const int fw_num_units = fw_input_weights->dims->data[0]; const float* fw_bias_ptr = fw_bias->data.f; @@ -158,45 +281,258 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { const float* bw_input_weights_ptr = bw_input_weights->data.f; const float* bw_recurrent_weights_ptr = bw_recurrent_weights->data.f; + const float* fw_aux_input_weights_ptr = (fw_aux_input_weights != nullptr) + ? 
fw_aux_input_weights->data.f + : nullptr; + const float* bw_aux_input_weights_ptr = (bw_aux_input_weights != nullptr) + ? bw_aux_input_weights->data.f + : nullptr; + + const int fw_output_step = + params->merge_outputs ? fw_num_units + bw_num_units : fw_num_units; + const int bw_output_step = + params->merge_outputs ? fw_num_units + bw_num_units : bw_num_units; for (int b = 0; b < batch_size; b++) { // Forward cell. float* fw_hidden_state_ptr_batch = fw_hidden_state->data.f + b * fw_num_units; + float* fw_output_offset = fw_output->data.f + b * fw_output_step * max_time; for (int s = 0; s < max_time; s++) { const float* input_ptr_batch = input->data.f + b * input_size * max_time + s * input_size; - float* output_ptr_batch = - fw_output->data.f + b * fw_num_units * max_time + s * fw_num_units; + const float* aux_input_ptr_batch = + (aux_input != nullptr) + ? aux_input->data.f + b * input_size * max_time + s * input_size + : nullptr; + float* output_ptr_batch = fw_output_offset + s * fw_output_step; kernel_utils::RnnBatchStep( - input_ptr_batch, fw_input_weights_ptr, fw_recurrent_weights_ptr, - fw_bias_ptr, input_size, fw_num_units, /*batch_size=*/1, + input_ptr_batch, fw_input_weights_ptr, aux_input_ptr_batch, + fw_aux_input_weights_ptr, fw_recurrent_weights_ptr, fw_bias_ptr, + input_size, aux_input_size, fw_num_units, /*batch_size=*/1, params->activation, fw_hidden_state_ptr_batch, output_ptr_batch); } // Backward cell. float* bw_hidden_state_ptr_batch = bw_hidden_state->data.f + b * bw_num_units; + float* bw_output_offset = + params->merge_outputs + ? fw_output->data.f + b * bw_output_step * max_time + fw_num_units + : bw_output->data.f + b * bw_output_step * max_time; for (int s = max_time - 1; s >= 0; s--) { const float* input_ptr_batch = input->data.f + b * input_size * max_time + s * input_size; - float* output_ptr_batch = - bw_output->data.f + b * bw_num_units * max_time + s * bw_num_units; + const float* aux_input_ptr_batch = + (aux_input != nullptr) + ? aux_input->data.f + b * input_size * max_time + s * input_size + : nullptr; + float* output_ptr_batch = bw_output_offset + s * bw_output_step; kernel_utils::RnnBatchStep( - input_ptr_batch, bw_input_weights_ptr, bw_recurrent_weights_ptr, - bw_bias_ptr, input_size, bw_num_units, /*batch_size=*/1, + input_ptr_batch, bw_input_weights_ptr, aux_input_ptr_batch, + bw_aux_input_weights_ptr, bw_recurrent_weights_ptr, bw_bias_ptr, + input_size, aux_input_size, bw_num_units, /*batch_size=*/1, params->activation, bw_hidden_state_ptr_batch, output_ptr_batch); } } return kTfLiteOk; } +TfLiteStatus EvalHybrid( + const TfLiteTensor* input, const TfLiteTensor* fw_input_weights, + const TfLiteTensor* fw_recurrent_weights, const TfLiteTensor* fw_bias, + const TfLiteTensor* bw_input_weights, + const TfLiteTensor* bw_recurrent_weights, const TfLiteTensor* bw_bias, + const TfLiteTensor* aux_input, const TfLiteTensor* aux_fw_input_weights, + const TfLiteTensor* aux_bw_input_weights, + const TfLiteBidirectionalSequenceRNNParams* params, + TfLiteTensor* scaling_factors, TfLiteTensor* input_quantized, + TfLiteTensor* aux_input_quantized, TfLiteTensor* fw_hidden_state_quantized, + TfLiteTensor* fw_hidden_state, TfLiteTensor* fw_output, + TfLiteTensor* bw_hidden_state_quantized, TfLiteTensor* bw_hidden_state, + TfLiteTensor* bw_output) { + const int batch_size = input->dims->data[0]; + const int max_time = input->dims->data[1]; + const int input_size = input->dims->data[2]; + const int aux_input_size = (aux_input) ? 
aux_input->dims->data[2] : 0; + + const int fw_num_units = fw_input_weights->dims->data[0]; + const float* fw_bias_ptr = fw_bias->data.f; + const int8_t* fw_input_weights_ptr = + reinterpret_cast<const int8_t*>(fw_input_weights->data.uint8); + float fw_input_weights_scale = fw_input_weights->params.scale; + const int8_t* fw_recurrent_weights_ptr = + reinterpret_cast<const int8_t*>(fw_recurrent_weights->data.uint8); + float fw_recurrent_weights_scale = fw_recurrent_weights->params.scale; + + const int bw_num_units = bw_input_weights->dims->data[0]; + const float* bw_bias_ptr = bw_bias->data.f; + const int8_t* bw_input_weights_ptr = + reinterpret_cast<const int8_t*>(bw_input_weights->data.uint8); + float bw_input_weights_scale = bw_input_weights->params.scale; + const int8_t* bw_recurrent_weights_ptr = + reinterpret_cast<const int8_t*>(bw_recurrent_weights->data.uint8); + float bw_recurrent_weights_scale = bw_recurrent_weights->params.scale; + + // Set the auxiliary pointers and scales if needed. + int8_t* aux_fw_input_weights_ptr = nullptr; + float aux_fw_input_weights_scale = 0.0f; + int8_t* aux_bw_input_weights_ptr = nullptr; + float aux_bw_input_weights_scale = 0.0f; + int8_t* aux_quantized_input_ptr = nullptr; + if (aux_input_size > 0) { + aux_fw_input_weights_ptr = + reinterpret_cast<int8_t*>(aux_fw_input_weights->data.uint8); + aux_fw_input_weights_scale = aux_fw_input_weights->params.scale; + aux_bw_input_weights_ptr = + reinterpret_cast<int8_t*>(aux_bw_input_weights->data.uint8); + aux_bw_input_weights_scale = aux_bw_input_weights->params.scale; + aux_quantized_input_ptr = reinterpret_cast<int8_t*>(aux_input_quantized); + } + + // Initialize temporary storage for quantized values. + int8_t* quantized_input_ptr = + reinterpret_cast<int8_t*>(input_quantized->data.uint8); + int8_t* fw_quantized_hidden_state_ptr = + reinterpret_cast<int8_t*>(fw_hidden_state_quantized->data.uint8); + int8_t* bw_quantized_hidden_state_ptr = + reinterpret_cast<int8_t*>(bw_hidden_state_quantized->data.uint8); + float* scaling_factors_ptr = scaling_factors->data.f; + + const int fw_output_step = + params->merge_outputs ? fw_num_units + bw_num_units : fw_num_units; + const int bw_output_step = + params->merge_outputs ? fw_num_units + bw_num_units : bw_num_units; + for (int b = 0; b < batch_size; b++) { + // Forward cell. + float* fw_hidden_state_ptr_batch = + fw_hidden_state->data.f + b * fw_num_units; + float* fw_output_offset = fw_output->data.f + b * fw_output_step * max_time; + for (int s = 0; s < max_time; s++) { + const float* input_ptr_batch = + input->data.f + b * input_size * max_time + s * input_size; + const float* aux_input_ptr_batch = + (aux_input != nullptr) + ? aux_input->data.f + b * input_size * max_time + s * input_size + : nullptr; + float* output_ptr_batch = fw_output_offset + s * fw_output_step; + + kernel_utils::RnnBatchStep( + input_ptr_batch, fw_input_weights_ptr, fw_input_weights_scale, + aux_input_ptr_batch, aux_fw_input_weights_ptr, + aux_fw_input_weights_scale, fw_recurrent_weights_ptr, + fw_recurrent_weights_scale, fw_bias_ptr, input_size, aux_input_size, + fw_num_units, /*batch_size=*/1, params->activation, + quantized_input_ptr, aux_quantized_input_ptr, + fw_quantized_hidden_state_ptr, scaling_factors_ptr, + fw_hidden_state_ptr_batch, output_ptr_batch); + } + // Backward cell. + float* bw_hidden_state_ptr_batch = + bw_hidden_state->data.f + b * bw_num_units; + float* bw_output_offset = + params->merge_outputs + ? 
fw_output->data.f + b * bw_output_step * max_time + : bw_output->data.f + b * bw_output_step * max_time; + for (int s = max_time - 1; s >= 0; s--) { + const float* input_ptr_batch = + input->data.f + b * input_size * max_time + s * input_size; + const float* aux_input_ptr_batch = + (aux_input != nullptr) + ? aux_input->data.f + b * input_size * max_time + s * input_size + : nullptr; + float* output_ptr_batch = bw_output_offset + s * bw_output_step; + + kernel_utils::RnnBatchStep( + input_ptr_batch, bw_input_weights_ptr, bw_input_weights_scale, + aux_input_ptr_batch, aux_bw_input_weights_ptr, + aux_bw_input_weights_scale, bw_recurrent_weights_ptr, + bw_recurrent_weights_scale, bw_bias_ptr, input_size, aux_input_size, + bw_num_units, /*batch_size=*/1, params->activation, + quantized_input_ptr, aux_quantized_input_ptr, + bw_quantized_hidden_state_ptr, scaling_factors_ptr, + bw_hidden_state_ptr_batch, output_ptr_batch); + } + } + return kTfLiteOk; +} + +TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { + const auto* params = reinterpret_cast<TfLiteBidirectionalSequenceRNNParams*>( + node->builtin_data); + + const TfLiteTensor* input = GetInput(context, node, kInputTensor); + const TfLiteTensor* fw_input_weights = + GetInput(context, node, kFwWeightsTensor); + const TfLiteTensor* fw_recurrent_weights = + GetInput(context, node, kFwRecurrentWeightsTensor); + const TfLiteTensor* fw_bias = GetInput(context, node, kFwBiasTensor); + const TfLiteTensor* bw_input_weights = + GetInput(context, node, kBwWeightsTensor); + const TfLiteTensor* bw_recurrent_weights = + GetInput(context, node, kBwRecurrentWeightsTensor); + const TfLiteTensor* bw_bias = GetInput(context, node, kBwBiasTensor); + + // Get auxiliary inputs. + const TfLiteTensor* aux_input = + GetOptionalInputTensor(context, node, kAuxInputTensor); + const TfLiteTensor* fw_aux_input_weights = + GetOptionalInputTensor(context, node, kFwAuxWeightsTensor); + const TfLiteTensor* bw_aux_input_weights = + GetOptionalInputTensor(context, node, kBwAuxWeightsTensor); + + TfLiteTensor* fw_hidden_state = + GetVariableInput(context, node, kFwHiddenStateTensor); + TfLiteTensor* bw_hidden_state = + GetVariableInput(context, node, kBwHiddenStateTensor); + + TfLiteTensor* fw_output = GetOutput(context, node, kFwOutputTensor); + TfLiteTensor* bw_output = params->merge_outputs + ? nullptr + : GetOutput(context, node, kBwOutputTensor); + + switch (fw_input_weights->type) { + case kTfLiteFloat32: + return EvalFloat(input, fw_input_weights, fw_recurrent_weights, fw_bias, + bw_input_weights, bw_recurrent_weights, bw_bias, + aux_input, fw_aux_input_weights, bw_aux_input_weights, + params, fw_hidden_state, fw_output, bw_hidden_state, + bw_output); + case kTfLiteUInt8: { + TfLiteTensor* input_quantized = + GetTemporary(context, node, kInputQuantized); + TfLiteTensor* fw_hidden_state_quantized = + GetTemporary(context, node, kFwHiddenStateQuantized); + TfLiteTensor* bw_hidden_state_quantized = + GetTemporary(context, node, kBwHiddenStateQuantized); + TfLiteTensor* scaling_factors = + GetTemporary(context, node, kScalingFactors); + TfLiteTensor* aux_input_quantized = + (aux_input != nullptr) + ? 
GetTemporary(context, node, kAuxInputQuantized) + : nullptr; + + return EvalHybrid(input, fw_input_weights, fw_recurrent_weights, fw_bias, + bw_input_weights, bw_recurrent_weights, bw_bias, + aux_input, fw_aux_input_weights, bw_aux_input_weights, + params, scaling_factors, input_quantized, + aux_input_quantized, fw_hidden_state_quantized, + fw_hidden_state, fw_output, bw_hidden_state_quantized, + bw_hidden_state, bw_output); + } + default: + context->ReportError(context, "Type not currently supported."); + return kTfLiteError; + } + return kTfLiteOk; +} + } // namespace bidirectional_sequence_rnn TfLiteRegistration* Register_BIDIRECTIONAL_SEQUENCE_RNN() { - static TfLiteRegistration r = {/*init=*/nullptr, /*free=*/nullptr, - bidirectional_sequence_rnn::Prepare, - bidirectional_sequence_rnn::Eval}; + static TfLiteRegistration r = { + bidirectional_sequence_rnn::Init, bidirectional_sequence_rnn::Free, + bidirectional_sequence_rnn::Prepare, bidirectional_sequence_rnn::Eval}; return &r; } diff --git a/tensorflow/contrib/lite/kernels/bidirectional_sequence_rnn_test.cc b/tensorflow/contrib/lite/kernels/bidirectional_sequence_rnn_test.cc index 911b108eaa..f555c472f5 100644 --- a/tensorflow/contrib/lite/kernels/bidirectional_sequence_rnn_test.cc +++ b/tensorflow/contrib/lite/kernels/bidirectional_sequence_rnn_test.cc @@ -654,7 +654,7 @@ const std::initializer_list<float> recurrent_weights = { class BidirectionalRNNOpModel : public SingleOpModel { public: BidirectionalRNNOpModel(int batches, int sequence_len, int fw_units, - int bw_units, int input_size) + int bw_units, int input_size, bool merge_outputs) : batches_(batches), sequence_len_(sequence_len), fw_units_(fw_units), @@ -664,26 +664,40 @@ class BidirectionalRNNOpModel : public SingleOpModel { fw_weights_ = AddInput(TensorType_FLOAT32); fw_recurrent_weights_ = AddInput(TensorType_FLOAT32); fw_bias_ = AddInput(TensorType_FLOAT32); - fw_hidden_state_ = AddOutput(TensorType_FLOAT32); - fw_output_ = AddOutput(TensorType_FLOAT32); + fw_hidden_state_ = AddInput(TensorType_FLOAT32, true); bw_weights_ = AddInput(TensorType_FLOAT32); bw_recurrent_weights_ = AddInput(TensorType_FLOAT32); bw_bias_ = AddInput(TensorType_FLOAT32); - bw_hidden_state_ = AddOutput(TensorType_FLOAT32); - bw_output_ = AddOutput(TensorType_FLOAT32); + bw_hidden_state_ = AddInput(TensorType_FLOAT32, true); + + aux_input_ = AddNullInput(); + aux_fw_weights_ = AddNullInput(); + aux_bw_weights_ = AddNullInput(); + + fw_output_ = AddOutput(TensorType_FLOAT32); + if (!merge_outputs) { + bw_output_ = AddOutput(TensorType_FLOAT32); + } + SetBuiltinOp(BuiltinOperator_BIDIRECTIONAL_SEQUENCE_RNN, - BuiltinOptions_SequenceRNNOptions, - CreateSequenceRNNOptions(builder_, /*time_major=*/false, - ActivationFunctionType_RELU) + BuiltinOptions_BidirectionalSequenceRNNOptions, + CreateBidirectionalSequenceRNNOptions( + builder_, /*time_major=*/false, + ActivationFunctionType_RELU, merge_outputs) .Union()); BuildInterpreter({ {batches_, sequence_len_, input_size_}, // input {fw_units_, input_size_}, // fw_weights {fw_units_, fw_units_}, // fw_recurrent_weights {fw_units_}, // fw_bias + {batches_, fw_units_}, // fw_hidden_state {bw_units_, input_size_}, // bw_weights {bw_units_, bw_units_}, // bw_recurrent_weights - {bw_units_} // bw_bias + {bw_units_}, // bw_bias + {batches_, bw_units_}, // bw_hidden_state + {batches_, sequence_len_, 0}, // aux_input + {fw_units_, 0}, // aux_fw_weights + {bw_units_, 0}, // aux_bw_weights }); } @@ -719,19 +733,6 @@ class BidirectionalRNNOpModel : public 
SingleOpModel { PopulateTensor(input_, offset, begin, end); } - void ResetHiddenStates() { - const int fw_zero_buffer_size = fw_units_ * batches_; - std::unique_ptr<float[]> fw_zero_buffer(new float[fw_zero_buffer_size]); - memset(fw_zero_buffer.get(), 0, fw_zero_buffer_size * sizeof(float)); - PopulateTensor(fw_hidden_state_, 0, fw_zero_buffer.get(), - fw_zero_buffer.get() + fw_zero_buffer_size); - const int bw_zero_buffer_size = bw_units_ * batches_; - std::unique_ptr<float[]> bw_zero_buffer(new float[bw_zero_buffer_size]); - memset(bw_zero_buffer.get(), 0, bw_zero_buffer_size * sizeof(float)); - PopulateTensor(bw_hidden_state_, 0, bw_zero_buffer.get(), - bw_zero_buffer.get() + bw_zero_buffer_size); - } - std::vector<float> GetFwOutput() { return ExtractVector<float>(fw_output_); } std::vector<float> GetBwOutput() { return ExtractVector<float>(bw_output_); } @@ -753,6 +754,9 @@ class BidirectionalRNNOpModel : public SingleOpModel { int bw_bias_; int bw_hidden_state_; int bw_output_; + int aux_input_; + int aux_fw_weights_; + int aux_bw_weights_; int batches_; int sequence_len_; @@ -766,7 +770,7 @@ class BidirectionalRNNOpModel : public SingleOpModel { TEST(BidirectionalRNNOpTest, BlackBoxTest) { BidirectionalRNNOpModel rnn(/*batches=*/2, /*sequence_len=*/16, /*fw_units=*/16, /*bw_units=*/16, - /*input_size=*/8); + /*input_size=*/8, /*merge_outputs=*/false); rnn.SetFwWeights(weights); rnn.SetBwWeights(weights); rnn.SetFwBias(biases); @@ -774,7 +778,6 @@ TEST(BidirectionalRNNOpTest, BlackBoxTest) { rnn.SetFwRecurrentWeights(recurrent_weights); rnn.SetBwRecurrentWeights(recurrent_weights); - rnn.ResetHiddenStates(); const int input_sequence_size = rnn.input_size() * rnn.sequence_len(); float* batch_start = rnn_input; float* batch_end = batch_start + input_sequence_size; @@ -800,12 +803,49 @@ TEST(BidirectionalRNNOpTest, BlackBoxTest) { EXPECT_THAT(rnn.GetBwOutput(), ElementsAreArray(ArrayFloatNear(bw_expected))); } +// Same as the previous test, yet with merged outputs. +TEST(BidirectionalRNNOpTest, BlackBoxTestMergeOutputs) { + BidirectionalRNNOpModel rnn(/*batches=*/2, /*sequence_len=*/16, + /*fw_units=*/16, /*bw_units=*/16, + /*input_size=*/8, /*merge_outputs=*/true); + rnn.SetFwWeights(weights); + rnn.SetBwWeights(weights); + rnn.SetFwBias(biases); + rnn.SetBwBias(biases); + rnn.SetFwRecurrentWeights(recurrent_weights); + rnn.SetBwRecurrentWeights(recurrent_weights); + + const int input_sequence_size = rnn.input_size() * rnn.sequence_len(); + float* batch_start = rnn_input; + float* batch_end = batch_start + input_sequence_size; + rnn.SetInput(0, batch_start, batch_end); + rnn.SetInput(input_sequence_size, batch_start, batch_end); + + rnn.Invoke(); + + std::vector<float> merged_expected; + for (int bid = 0; bid < rnn.num_batches(); bid++) { + for (int step = 0; step < rnn.sequence_len(); step++) { + merged_expected.insert( + merged_expected.end(), + rnn_golden_fw_output + rnn.num_fw_units() * step, + rnn_golden_fw_output + rnn.num_fw_units() * (step + 1)); + merged_expected.insert( + merged_expected.end(), + rnn_golden_bw_output + rnn.num_bw_units() * step, + rnn_golden_bw_output + rnn.num_bw_units() * (step + 1)); + } + } + EXPECT_THAT(rnn.GetFwOutput(), + ElementsAreArray(ArrayFloatNear(merged_expected))); +} + // Check that if the input sequence is reversed the outputs are the same just // forward and backward are swapped (and reversed). 
TEST(BidirectionalRNNOpTest, BlackBoxTestReverseInputs) { BidirectionalRNNOpModel rnn(/*batches=*/2, /*sequence_len=*/16, /*fw_units=*/16, /*bw_units=*/16, - /*input_size=*/8); + /*input_size=*/8, /*merge_outputs=*/false); rnn.SetFwWeights(weights); rnn.SetBwWeights(weights); rnn.SetFwBias(biases); @@ -813,8 +853,6 @@ TEST(BidirectionalRNNOpTest, BlackBoxTestReverseInputs) { rnn.SetFwRecurrentWeights(recurrent_weights); rnn.SetBwRecurrentWeights(recurrent_weights); - rnn.ResetHiddenStates(); - // Reverse inputs in each batch: in_1, in_2,..., in_k is inserted in the // following order: [in_k,..., in_2, in_1, in_k,...,in_2, in_1]. for (int i = 0; i < rnn.sequence_len(); i++) { @@ -853,7 +891,7 @@ TEST(BidirectionalRNNOpTest, BlackBoxTestReverseInputs) { TEST(BidirectionalRNNOpTest, EndToEndTest) { BidirectionalRNNOpModel rnn(/*batches=*/1, /*sequence_len=*/4, /*fw_units=*/16, /*bw_units=*/16, - /*input_size=*/8); + /*input_size=*/8, /*merge_outputs=*/false); const int output_size = 4; float dnn_weights[] = { -0.5782342, -0.052212059, 0.73036242, -0.81216097, -0.80088139, @@ -880,8 +918,6 @@ TEST(BidirectionalRNNOpTest, EndToEndTest) { rnn.SetFwRecurrentWeights(recurrent_weights); rnn.SetBwRecurrentWeights(recurrent_weights); - rnn.ResetHiddenStates(); - const int input_sequence_size = rnn.input_size() * rnn.sequence_len(); const int output_sequence_size = output_size * rnn.sequence_len(); const int num_examples = 64; diff --git a/tensorflow/contrib/lite/kernels/cast.cc b/tensorflow/contrib/lite/kernels/cast.cc index 60770ca0aa..a7972140ac 100644 --- a/tensorflow/contrib/lite/kernels/cast.cc +++ b/tensorflow/contrib/lite/kernels/cast.cc @@ -14,8 +14,9 @@ limitations under the License. ==============================================================================*/ #include <string.h> #include <algorithm> -#include "tensorflow/contrib/lite/builtin_op_data.h" -#include "tensorflow/contrib/lite/context.h" +#include <complex> +#include "tensorflow/contrib/lite/c/builtin_op_data.h" +#include "tensorflow/contrib/lite/c/c_api_internal.h" #include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h" #include "tensorflow/contrib/lite/kernels/internal/tensor.h" #include "tensorflow/contrib/lite/kernels/kernel_util.h" @@ -53,6 +54,20 @@ void copyCast(const FromT* in, ToT* out, int num_elements) { [](FromT a) { return static_cast<ToT>(a); }); } +template <typename ToT> +void copyCast(const std::complex<float>* in, ToT* out, int num_elements) { + std::transform(in, in + num_elements, out, [](std::complex<float> a) { + return static_cast<ToT>(std::real(a)); + }); +} + +template <> +void copyCast(const std::complex<float>* in, std::complex<float>* out, + int num_elements) { + std::transform(in, in + num_elements, out, + [](std::complex<float> a) { return a; }); +} + template <typename FromT> TfLiteStatus copyToTensor(const FromT* in, TfLiteTensor* out, int num_elements) { @@ -72,6 +87,10 @@ TfLiteStatus copyToTensor(const FromT* in, TfLiteTensor* out, case kTfLiteBool: copyCast(in, out->data.b, num_elements); break; + case kTfLiteComplex64: + copyCast(in, reinterpret_cast<std::complex<float>*>(out->data.c64), + num_elements); + break; default: // Unsupported type. 
return kTfLiteError; @@ -95,6 +114,10 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { return copyToTensor(input->data.f, output, num_elements); case kTfLiteBool: return copyToTensor(input->data.b, output, num_elements); + case kTfLiteComplex64: + return copyToTensor( + reinterpret_cast<std::complex<float>*>(input->data.c64), output, + num_elements); default: // Unsupported type. return kTfLiteError; diff --git a/tensorflow/contrib/lite/kernels/cast_test.cc b/tensorflow/contrib/lite/kernels/cast_test.cc index 53e2000737..954f998206 100644 --- a/tensorflow/contrib/lite/kernels/cast_test.cc +++ b/tensorflow/contrib/lite/kernels/cast_test.cc @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include <complex> + #include <gtest/gtest.h> #include "tensorflow/contrib/lite/interpreter.h" #include "tensorflow/contrib/lite/kernels/register.h" @@ -73,6 +75,71 @@ TEST(CastOpModel, CastBoolToFloat) { ElementsAreArray({1.f, 1.0f, 0.f, 1.0f, 0.0f, 1.0f})); } +TEST(CastOpModel, CastComplex64ToFloat) { + CastOpModel m({TensorType_COMPLEX64, {2, 3}}, {TensorType_FLOAT32, {2, 3}}); + m.PopulateTensor<std::complex<float>>( + m.input(), + {std::complex<float>(1.0f, 11.0f), std::complex<float>(2.0f, 12.0f), + std::complex<float>(3.0f, 13.0f), std::complex<float>(4.0f, 14.0f), + std::complex<float>(5.0f, 15.0f), std::complex<float>(6.0f, 16.0f)}); + m.Invoke(); + EXPECT_THAT(m.ExtractVector<float>(m.output()), + ElementsAreArray({1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f})); +} + +TEST(CastOpModel, CastFloatToComplex64) { + CastOpModel m({TensorType_FLOAT32, {2, 3}}, {TensorType_COMPLEX64, {2, 3}}); + m.PopulateTensor<float>(m.input(), {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}); + m.Invoke(); + EXPECT_THAT( + m.ExtractVector<std::complex<float>>(m.output()), + ElementsAreArray( + {std::complex<float>(1.0f, 0.0f), std::complex<float>(2.0f, 0.0f), + std::complex<float>(3.0f, 0.0f), std::complex<float>(4.0f, 0.0f), + std::complex<float>(5.0f, 0.0f), std::complex<float>(6.0f, 0.0f)})); +} + +TEST(CastOpModel, CastComplex64ToInt) { + CastOpModel m({TensorType_COMPLEX64, {2, 3}}, {TensorType_INT32, {2, 3}}); + m.PopulateTensor<std::complex<float>>( + m.input(), + {std::complex<float>(1.0f, 11.0f), std::complex<float>(2.0f, 12.0f), + std::complex<float>(3.0f, 13.0f), std::complex<float>(4.0f, 14.0f), + std::complex<float>(5.0f, 15.0f), std::complex<float>(6.0f, 16.0f)}); + m.Invoke(); + EXPECT_THAT(m.ExtractVector<int>(m.output()), + ElementsAreArray({1, 2, 3, 4, 5, 6})); +} + +TEST(CastOpModel, CastIntToComplex64) { + CastOpModel m({TensorType_INT32, {2, 3}}, {TensorType_COMPLEX64, {2, 3}}); + m.PopulateTensor<int>(m.input(), {1, 2, 3, 4, 5, 6}); + m.Invoke(); + EXPECT_THAT( + m.ExtractVector<std::complex<float>>(m.output()), + ElementsAreArray( + {std::complex<float>(1.0f, 0.0f), std::complex<float>(2.0f, 0.0f), + std::complex<float>(3.0f, 0.0f), std::complex<float>(4.0f, 0.0f), + std::complex<float>(5.0f, 0.0f), std::complex<float>(6.0f, 0.0f)})); +} + +TEST(CastOpModel, CastComplex64ToComplex64) { + CastOpModel m({TensorType_COMPLEX64, {2, 3}}, {TensorType_COMPLEX64, {2, 3}}); + m.PopulateTensor<std::complex<float>>( + m.input(), + {std::complex<float>(1.0f, 11.0f), std::complex<float>(2.0f, 12.0f), + std::complex<float>(3.0f, 13.0f), std::complex<float>(4.0f, 14.0f), + 
std::complex<float>(5.0f, 15.0f), std::complex<float>(6.0f, 16.0f)}); + m.Invoke(); + EXPECT_THAT( + m.ExtractVector<std::complex<float>>(m.output()), + ElementsAreArray( + {std::complex<float>(1.0f, 11.0f), std::complex<float>(2.0f, 12.0f), + std::complex<float>(3.0f, 13.0f), std::complex<float>(4.0f, 14.0f), + std::complex<float>(5.0f, 15.0f), + std::complex<float>(6.0f, 16.0f)})); +} + } // namespace } // namespace tflite int main(int argc, char** argv) { diff --git a/tensorflow/contrib/lite/kernels/comparisons.cc b/tensorflow/contrib/lite/kernels/comparisons.cc index b948334b6d..3926af5b97 100644 --- a/tensorflow/contrib/lite/kernels/comparisons.cc +++ b/tensorflow/contrib/lite/kernels/comparisons.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/contrib/lite/context.h" +#include "tensorflow/contrib/lite/c/c_api_internal.h" #include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h" #include "tensorflow/contrib/lite/kernels/internal/tensor.h" #include "tensorflow/contrib/lite/kernels/kernel_util.h" @@ -23,6 +23,7 @@ namespace tflite { namespace ops { namespace builtin { namespace comparisons { +namespace { constexpr int kInputTensor1 = 0; constexpr int kInputTensor2 = 1; @@ -56,23 +57,131 @@ TfLiteStatus ComparisonPrepare(TfLiteContext* context, TfLiteNode* node) { return context->ResizeTensor(context, output, output_size); } -#define TF_LITE_COMPARISON(type, opname, requires_broadcast) \ - requires_broadcast \ - ? reference_ops::Broadcast##opname( \ - GetTensorData<type>(input1), GetTensorDims(input1), \ - GetTensorData<type>(input2), GetTensorDims(input2), \ - GetTensorData<bool>(output), GetTensorDims(output)) \ - : reference_ops::opname( \ - GetTensorData<type>(input1), GetTensorDims(input1), \ - GetTensorData<type>(input2), GetTensorDims(input2), \ - GetTensorData<bool>(output), GetTensorDims(output)); +// TODO(ruic): optimize macros below to using template functions. 
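Conceptually, the quantized comparison macro defined below rescales both uint8 inputs back toward real values before comparing, so tensors with different scales and zero points compare correctly; the kernel does this in fixed point via QuantizeMultiplierSmallerThanOneExp, but the idea reduces to the following float sketch (illustrative only, not taken from the diff):

// real_value = scale * (quantized_value - zero_point)
bool GreaterDequantized(uint8_t a, float a_scale, int32_t a_zero_point,
                        uint8_t b, float b_scale, int32_t b_zero_point) {
  const float a_real = a_scale * (static_cast<int32_t>(a) - a_zero_point);
  const float b_real = b_scale * (static_cast<int32_t>(b) - b_zero_point);
  return a_real > b_real;
}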
+#define TF_LITE_QUANTIZE_COMPARISON(opname) \ + void EvalQuantized##opname(TfLiteContext* context, TfLiteNode* node, \ + const TfLiteTensor* input1, \ + const TfLiteTensor* input2, TfLiteTensor* output, \ + bool requires_broadcast) { \ + if (input1->type == kTfLiteUInt8) { \ + auto input1_offset = -input1->params.zero_point; \ + auto input2_offset = -input2->params.zero_point; \ + const int left_shift = 8; \ + \ + int32 input1_multiplier; \ + int input1_shift; \ + QuantizeMultiplierSmallerThanOneExp(input1->params.scale, \ + &input1_multiplier, &input1_shift); \ + int32 input2_multiplier; \ + int input2_shift; \ + QuantizeMultiplierSmallerThanOneExp(input2->params.scale, \ + &input2_multiplier, &input2_shift); \ + \ + ComparisonParams op_params; \ + op_params.left_shift = left_shift; \ + op_params.input1_offset = input1_offset; \ + op_params.input1_multiplier = input1_multiplier; \ + op_params.input1_shift = input1_shift; \ + op_params.input2_offset = input2_offset; \ + op_params.input2_multiplier = input2_multiplier; \ + op_params.input2_shift = input2_shift; \ + if (requires_broadcast) { \ + reference_ops::Broadcast4DSlow##opname##WithScaling( \ + op_params, GetTensorShape(input1), GetTensorData<uint8_t>(input1), \ + GetTensorShape(input2), GetTensorData<uint8_t>(input2), \ + GetTensorShape(output), GetTensorData<bool>(output)); \ + } else { \ + reference_ops::opname##WithScaling( \ + op_params, GetTensorShape(input1), GetTensorData<uint8_t>(input1), \ + GetTensorShape(input2), GetTensorData<uint8_t>(input2), \ + GetTensorShape(output), GetTensorData<bool>(output)); \ + } \ + } \ + } +TF_LITE_QUANTIZE_COMPARISON(Equal); +TF_LITE_QUANTIZE_COMPARISON(NotEqual); +TF_LITE_QUANTIZE_COMPARISON(Greater); +TF_LITE_QUANTIZE_COMPARISON(GreaterEqual); +TF_LITE_QUANTIZE_COMPARISON(Less); +TF_LITE_QUANTIZE_COMPARISON(LessEqual); +#undef TF_LITE_QUANTIZE_COMPARISON + +#define TF_LITE_COMPARISON(type, opname, requires_broadcast) \ + { \ + ComparisonParams op_params; \ + requires_broadcast \ + ? reference_ops::Broadcast4DSlow##opname##NoScaling( \ + op_params, GetTensorShape(input1), GetTensorData<type>(input1), \ + GetTensorShape(input2), GetTensorData<type>(input2), \ + GetTensorShape(output), GetTensorData<bool>(output)) \ + : reference_ops::opname##NoScaling( \ + op_params, GetTensorShape(input1), GetTensorData<type>(input1), \ + GetTensorShape(input2), GetTensorData<type>(input2), \ + GetTensorShape(output), GetTensorData<bool>(output)); \ + } + +TfLiteStatus EqualEval(TfLiteContext* context, TfLiteNode* node) { + const TfLiteTensor* input1 = GetInput(context, node, kInputTensor1); + const TfLiteTensor* input2 = GetInput(context, node, kInputTensor2); + TfLiteTensor* output = GetOutput(context, node, kOutputTensor); + bool requires_broadcast = !HaveSameShapes(input1, input2); + switch (input1->type) { + case kTfLiteFloat32: + TF_LITE_COMPARISON(float, Equal, requires_broadcast); + break; + case kTfLiteInt32: + TF_LITE_COMPARISON(int32_t, Equal, requires_broadcast); + break; + case kTfLiteInt64: + TF_LITE_COMPARISON(int64_t, Equal, requires_broadcast); + break; + case kTfLiteUInt8: + EvalQuantizedEqual(context, node, input1, input2, output, + requires_broadcast); + break; + default: + context->ReportError(context, + "Does not support type %d, requires float|int|uint8", + input1->type); + return kTfLiteError; + } + return kTfLiteOk; +} + +// TODO(renjieliu): Refactor the logic to avoid duplications. 
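One plausible shape for the refactor that TODO hints at, sketched with invented names (ComparisonEvalImpl is hypothetical and not part of this change): a single helper templated on the element type and a comparison functor, with the per-op Eval functions reduced to thin wrappers.

template <typename T, typename CompareFn>
TfLiteStatus ComparisonEvalImpl(TfLiteContext* context, TfLiteNode* node,
                                CompareFn compare) {
  const TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
  const TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
  // For brevity this sketch assumes identical shapes; the real kernels also
  // handle broadcasting and the quantized uint8 path.
  const T* in1 = GetTensorData<T>(input1);
  const T* in2 = GetTensorData<T>(input2);
  bool* out = GetTensorData<bool>(output);
  const int64_t size = NumElements(input1);
  for (int64_t i = 0; i < size; ++i) {
    out[i] = compare(in1[i], in2[i]);
  }
  return kTfLiteOk;
}

A per-op wrapper would then call, for example, ComparisonEvalImpl<float>(context, node, [](float a, float b) { return a > b; }) inside its type switch.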
+TfLiteStatus NotEqualEval(TfLiteContext* context, TfLiteNode* node) { + const TfLiteTensor* input1 = GetInput(context, node, kInputTensor1); + const TfLiteTensor* input2 = GetInput(context, node, kInputTensor2); + TfLiteTensor* output = GetOutput(context, node, kOutputTensor); + bool requires_broadcast = !HaveSameShapes(input1, input2); + switch (input1->type) { + case kTfLiteFloat32: + TF_LITE_COMPARISON(float, NotEqual, requires_broadcast); + break; + case kTfLiteInt32: + TF_LITE_COMPARISON(int32_t, NotEqual, requires_broadcast); + break; + case kTfLiteInt64: + TF_LITE_COMPARISON(int64_t, NotEqual, requires_broadcast); + break; + case kTfLiteUInt8: + EvalQuantizedNotEqual(context, node, input1, input2, output, + requires_broadcast); + break; + default: + context->ReportError(context, + "Does not support type %d, requires float|int|uint8", + input1->type); + return kTfLiteError; + } + return kTfLiteOk; +} TfLiteStatus GreaterEval(TfLiteContext* context, TfLiteNode* node) { const TfLiteTensor* input1 = GetInput(context, node, kInputTensor1); const TfLiteTensor* input2 = GetInput(context, node, kInputTensor2); TfLiteTensor* output = GetOutput(context, node, kOutputTensor); bool requires_broadcast = !HaveSameShapes(input1, input2); - // TODO(renjieliu): Support quantized data. switch (input1->type) { case kTfLiteFloat32: TF_LITE_COMPARISON(float, Greater, requires_broadcast); @@ -83,9 +192,14 @@ TfLiteStatus GreaterEval(TfLiteContext* context, TfLiteNode* node) { case kTfLiteInt64: TF_LITE_COMPARISON(int64_t, Greater, requires_broadcast); break; + case kTfLiteUInt8: + EvalQuantizedGreater(context, node, input1, input2, output, + requires_broadcast); + break; default: context->ReportError(context, - "Does not support type other than float|int"); + "Does not support type %d, requires float|int|uint8", + input1->type); return kTfLiteError; } return kTfLiteOk; @@ -96,7 +210,6 @@ TfLiteStatus GreaterEqualEval(TfLiteContext* context, TfLiteNode* node) { const TfLiteTensor* input2 = GetInput(context, node, kInputTensor2); TfLiteTensor* output = GetOutput(context, node, kOutputTensor); bool requires_broadcast = !HaveSameShapes(input1, input2); - // TODO(renjieliu): Support quantized data. switch (input1->type) { case kTfLiteFloat32: TF_LITE_COMPARISON(float, GreaterEqual, requires_broadcast); @@ -107,9 +220,14 @@ TfLiteStatus GreaterEqualEval(TfLiteContext* context, TfLiteNode* node) { case kTfLiteInt64: TF_LITE_COMPARISON(int64_t, GreaterEqual, requires_broadcast); break; + case kTfLiteUInt8: + EvalQuantizedGreaterEqual(context, node, input1, input2, output, + requires_broadcast); + break; default: context->ReportError(context, - "Does not support type other than float|int"); + "Does not support type %d, requires float|int|uint8", + input1->type); return kTfLiteError; } return kTfLiteOk; @@ -120,7 +238,6 @@ TfLiteStatus LessEval(TfLiteContext* context, TfLiteNode* node) { const TfLiteTensor* input2 = GetInput(context, node, kInputTensor2); TfLiteTensor* output = GetOutput(context, node, kOutputTensor); bool requires_broadcast = !HaveSameShapes(input1, input2); - // TODO(renjieliu): Support quantized data. 
switch (input1->type) { case kTfLiteFloat32: TF_LITE_COMPARISON(float, Less, requires_broadcast); @@ -131,9 +248,14 @@ TfLiteStatus LessEval(TfLiteContext* context, TfLiteNode* node) { case kTfLiteInt64: TF_LITE_COMPARISON(int64_t, Less, requires_broadcast); break; + case kTfLiteUInt8: + EvalQuantizedLess(context, node, input1, input2, output, + requires_broadcast); + break; default: context->ReportError(context, - "Does not support type other than float|int"); + "Does not support type %d, requires float|int|uint8", + input1->type); return kTfLiteError; } return kTfLiteOk; @@ -144,7 +266,6 @@ TfLiteStatus LessEqualEval(TfLiteContext* context, TfLiteNode* node) { const TfLiteTensor* input2 = GetInput(context, node, kInputTensor2); TfLiteTensor* output = GetOutput(context, node, kOutputTensor); bool requires_broadcast = !HaveSameShapes(input1, input2); - // TODO(renjieliu): Support quantized data. switch (input1->type) { case kTfLiteFloat32: TF_LITE_COMPARISON(float, LessEqual, requires_broadcast); @@ -155,16 +276,35 @@ TfLiteStatus LessEqualEval(TfLiteContext* context, TfLiteNode* node) { case kTfLiteInt64: TF_LITE_COMPARISON(int64_t, LessEqual, requires_broadcast); break; + case kTfLiteUInt8: + EvalQuantizedLessEqual(context, node, input1, input2, output, + requires_broadcast); + break; default: context->ReportError(context, - "Does not support type other than float|int"); + "Does not support type %d, requires float|int|uint8", + input1->type); return kTfLiteError; } return kTfLiteOk; } +} // namespace } // namespace comparisons +TfLiteRegistration* Register_EQUAL() { + static TfLiteRegistration r = { + nullptr, nullptr, comparisons::ComparisonPrepare, comparisons::EqualEval}; + return &r; +} + +TfLiteRegistration* Register_NOT_EQUAL() { + static TfLiteRegistration r = {nullptr, nullptr, + comparisons::ComparisonPrepare, + comparisons::NotEqualEval}; + return &r; +} + TfLiteRegistration* Register_GREATER() { static TfLiteRegistration r = {nullptr, nullptr, comparisons::ComparisonPrepare, diff --git a/tensorflow/contrib/lite/kernels/comparisons_test.cc b/tensorflow/contrib/lite/kernels/comparisons_test.cc index 835d238d36..04c8bf2e30 100644 --- a/tensorflow/contrib/lite/kernels/comparisons_test.cc +++ b/tensorflow/contrib/lite/kernels/comparisons_test.cc @@ -21,21 +21,29 @@ limitations under the License. 
namespace tflite { namespace { -using ::testing::ElementsAreArray; +using ::testing::ElementsAre; -class GreaterOpModel : public SingleOpModel { +class ComparisonOpModel : public SingleOpModel { public: - GreaterOpModel(std::initializer_list<int> input1_shape, - std::initializer_list<int> input2_shape, - TensorType input_type) { + ComparisonOpModel(std::initializer_list<int> input1_shape, + std::initializer_list<int> input2_shape, + TensorType input_type, BuiltinOperator op) { input1_ = AddInput(input_type); input2_ = AddInput(input_type); output_ = AddOutput(TensorType_BOOL); - SetBuiltinOp(BuiltinOperator_GREATER, BuiltinOptions_GreaterOptions, - CreateGreaterOptions(builder_).Union()); + ConfigureBuiltinOp(op); BuildInterpreter({input1_shape, input2_shape}); } + ComparisonOpModel(const TensorData& input1, const TensorData& input2, + TensorType input_type, BuiltinOperator op) { + input1_ = AddInput(input1); + input2_ = AddInput(input2); + output_ = AddOutput(TensorType_BOOL); + ConfigureBuiltinOp(op); + BuildInterpreter({GetShape(input1_), GetShape(input2_)}); + } + int input1() { return input1_; } int input2() { return input2_; } @@ -46,245 +54,510 @@ class GreaterOpModel : public SingleOpModel { int input1_; int input2_; int output_; + + void ConfigureBuiltinOp(BuiltinOperator op) { + switch (op) { + case BuiltinOperator_EQUAL: { + SetBuiltinOp(op, BuiltinOptions_EqualOptions, + CreateEqualOptions(builder_).Union()); + break; + } + case BuiltinOperator_NOT_EQUAL: { + SetBuiltinOp(op, BuiltinOptions_NotEqualOptions, + CreateNotEqualOptions(builder_).Union()); + break; + } + case BuiltinOperator_GREATER: { + SetBuiltinOp(op, BuiltinOptions_GreaterOptions, + CreateGreaterOptions(builder_).Union()); + break; + } + case BuiltinOperator_GREATER_EQUAL: { + SetBuiltinOp(op, BuiltinOptions_GreaterEqualOptions, + CreateGreaterEqualOptions(builder_).Union()); + break; + } + case BuiltinOperator_LESS: { + SetBuiltinOp(op, BuiltinOptions_LessOptions, + CreateLessOptions(builder_).Union()); + break; + } + case BuiltinOperator_LESS_EQUAL: { + SetBuiltinOp(op, BuiltinOptions_LessEqualOptions, + CreateLessEqualOptions(builder_).Union()); + break; + } + default: { FAIL() << "We shouldn't get here."; } + } + } }; -TEST(ComparisonsTest, GreaterFloat) { - GreaterOpModel model({1, 1, 1, 4}, {1, 1, 1, 4}, TensorType_FLOAT32); +TEST(ComparisonsTest, EqualFloat) { + ComparisonOpModel model({1, 1, 1, 4}, {1, 1, 1, 4}, TensorType_FLOAT32, + BuiltinOperator_EQUAL); model.PopulateTensor<float>(model.input1(), {0.1, 0.9, 0.7, 0.3}); model.PopulateTensor<float>(model.input2(), {0.1, 0.2, 0.6, 0.5}); model.Invoke(); - EXPECT_THAT(model.GetOutput(), ElementsAreArray({false, true, true, false})); - EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1, 4})); + EXPECT_THAT(model.GetOutput(), ElementsAre(true, false, false, false)); + EXPECT_THAT(model.GetOutputShape(), ElementsAre(1, 1, 1, 4)); } -TEST(ComparisonsTest, GreaterInt) { - GreaterOpModel model({1, 1, 1, 4}, {1, 1, 1, 4}, TensorType_INT32); +TEST(ComparisonsTest, EqualInt) { + ComparisonOpModel model({1, 1, 1, 4}, {1, 1, 1, 4}, TensorType_INT32, + BuiltinOperator_EQUAL); model.PopulateTensor<int>(model.input1(), {-1, 9, 7, 3}); model.PopulateTensor<int>(model.input2(), {1, 2, 7, 5}); model.Invoke(); - EXPECT_THAT(model.GetOutput(), ElementsAreArray({false, true, false, false})); - EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1, 4})); + EXPECT_THAT(model.GetOutput(), ElementsAre(false, false, true, false)); + 
EXPECT_THAT(model.GetOutputShape(), ElementsAre(1, 1, 1, 4)); } -TEST(ComparisonsTest, GreaterBroadcast) { - GreaterOpModel model({1, 1, 1, 4}, {1, 1, 1, 1}, TensorType_INT32); +TEST(ComparisonsTest, EqualBroadcast) { + ComparisonOpModel model({1, 1, 1, 4}, {1, 1, 1, 1}, TensorType_INT32, + BuiltinOperator_EQUAL); model.PopulateTensor<int>(model.input1(), {-1, 9, 7, 3}); model.PopulateTensor<int>(model.input2(), {7}); model.Invoke(); - EXPECT_THAT(model.GetOutput(), ElementsAreArray({false, true, false, false})); - EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1, 4})); + EXPECT_THAT(model.GetOutput(), ElementsAre(false, false, true, false)); + EXPECT_THAT(model.GetOutputShape(), ElementsAre(1, 1, 1, 4)); } -TEST(ComparisonsTest, GreaterBroadcastTwoD) { - GreaterOpModel model({1, 1, 2, 4}, {1, 1, 1, 4}, TensorType_INT32); +TEST(ComparisonsTest, EqualBroadcastTwoD) { + ComparisonOpModel model({1, 1, 2, 4}, {1, 1, 1, 4}, TensorType_INT32, + BuiltinOperator_EQUAL); model.PopulateTensor<int>(model.input1(), {-1, 9, 7, 3, 2, 4, 2, 8}); model.PopulateTensor<int>(model.input2(), {7, 1, 2, 4}); model.Invoke(); - EXPECT_THAT(model.GetOutput(), ElementsAreArray({false, true, true, false, - false, true, false, true})); - EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 2, 4})); + EXPECT_THAT(model.GetOutput(), ElementsAre(false, false, false, false, false, + false, true, false)); + EXPECT_THAT(model.GetOutputShape(), ElementsAre(1, 1, 2, 4)); } -class GreaterEqualOpModel : public SingleOpModel { - public: - GreaterEqualOpModel(std::initializer_list<int> input1_shape, - std::initializer_list<int> input2_shape, - TensorType input_type) { - input1_ = AddInput(input_type); - input2_ = AddInput(input_type); - output_ = AddOutput(TensorType_BOOL); - SetBuiltinOp(BuiltinOperator_GREATER_EQUAL, - BuiltinOptions_GreaterEqualOptions, - CreateGreaterEqualOptions(builder_).Union()); - BuildInterpreter({input1_shape, input2_shape}); - } +TEST(ComparisonsTest, NotEqualFloat) { + ComparisonOpModel model({1, 1, 1, 4}, {1, 1, 1, 4}, TensorType_FLOAT32, + BuiltinOperator_NOT_EQUAL); + model.PopulateTensor<float>(model.input1(), {0.1, 0.9, 0.7, 0.3}); + model.PopulateTensor<float>(model.input2(), {0.1, 0.2, 0.6, 0.5}); + model.Invoke(); - int input1() { return input1_; } - int input2() { return input2_; } + EXPECT_THAT(model.GetOutput(), ElementsAre(false, true, true, true)); + EXPECT_THAT(model.GetOutputShape(), ElementsAre(1, 1, 1, 4)); +} - std::vector<bool> GetOutput() { return ExtractVector<bool>(output_); } - std::vector<int> GetOutputShape() { return GetTensorShape(output_); } +TEST(ComparisonsTest, NotEqualInt) { + ComparisonOpModel model({1, 1, 1, 4}, {1, 1, 1, 4}, TensorType_INT32, + BuiltinOperator_NOT_EQUAL); + model.PopulateTensor<int>(model.input1(), {-1, 9, 7, 3}); + model.PopulateTensor<int>(model.input2(), {1, 2, 7, 5}); + model.Invoke(); - private: - int input1_; - int input2_; - int output_; -}; + EXPECT_THAT(model.GetOutput(), ElementsAre(true, true, false, true)); + EXPECT_THAT(model.GetOutputShape(), ElementsAre(1, 1, 1, 4)); +} + +TEST(ComparisonsTest, NotEqualBroadcast) { + ComparisonOpModel model({1, 1, 1, 4}, {1, 1, 1, 1}, TensorType_INT32, + BuiltinOperator_NOT_EQUAL); + model.PopulateTensor<int>(model.input1(), {-1, 9, 7, 3}); + model.PopulateTensor<int>(model.input2(), {7}); + model.Invoke(); + + EXPECT_THAT(model.GetOutput(), ElementsAre(true, true, false, true)); + EXPECT_THAT(model.GetOutputShape(), ElementsAre(1, 1, 1, 4)); +} + +TEST(ComparisonsTest, 
NotEqualBroadcastTwoD) { + ComparisonOpModel model({1, 1, 2, 4}, {1, 1, 1, 4}, TensorType_INT32, + BuiltinOperator_NOT_EQUAL); + model.PopulateTensor<int>(model.input1(), {-1, 9, 7, 3, 2, 4, 2, 8}); + model.PopulateTensor<int>(model.input2(), {7, 1, 2, 4}); + model.Invoke(); + + EXPECT_THAT(model.GetOutput(), + ElementsAre(true, true, true, true, true, true, false, true)); + EXPECT_THAT(model.GetOutputShape(), ElementsAre(1, 1, 2, 4)); +} + +TEST(ComparisonsTest, GreaterFloat) { + ComparisonOpModel model({1, 1, 1, 4}, {1, 1, 1, 4}, TensorType_FLOAT32, + BuiltinOperator_GREATER); + model.PopulateTensor<float>(model.input1(), {0.1, 0.9, 0.7, 0.3}); + model.PopulateTensor<float>(model.input2(), {0.1, 0.2, 0.6, 0.5}); + model.Invoke(); + + EXPECT_THAT(model.GetOutput(), ElementsAre(false, true, true, false)); + EXPECT_THAT(model.GetOutputShape(), ElementsAre(1, 1, 1, 4)); +} + +TEST(ComparisonsTest, GreaterInt) { + ComparisonOpModel model({1, 1, 1, 4}, {1, 1, 1, 4}, TensorType_INT32, + BuiltinOperator_GREATER); + model.PopulateTensor<int>(model.input1(), {-1, 9, 7, 3}); + model.PopulateTensor<int>(model.input2(), {1, 2, 7, 5}); + model.Invoke(); + + EXPECT_THAT(model.GetOutput(), ElementsAre(false, true, false, false)); + EXPECT_THAT(model.GetOutputShape(), ElementsAre(1, 1, 1, 4)); +} + +TEST(ComparisonsTest, GreaterBroadcast) { + ComparisonOpModel model({1, 1, 1, 4}, {1, 1, 1, 1}, TensorType_INT32, + BuiltinOperator_GREATER); + model.PopulateTensor<int>(model.input1(), {-1, 9, 7, 3}); + model.PopulateTensor<int>(model.input2(), {7}); + model.Invoke(); + + EXPECT_THAT(model.GetOutput(), ElementsAre(false, true, false, false)); + EXPECT_THAT(model.GetOutputShape(), ElementsAre(1, 1, 1, 4)); +} + +TEST(ComparisonsTest, GreaterBroadcastTwoD) { + ComparisonOpModel model({1, 1, 2, 4}, {1, 1, 1, 4}, TensorType_INT32, + BuiltinOperator_GREATER); + model.PopulateTensor<int>(model.input1(), {-1, 9, 7, 3, 2, 4, 2, 8}); + model.PopulateTensor<int>(model.input2(), {7, 1, 2, 4}); + model.Invoke(); + + EXPECT_THAT(model.GetOutput(), + ElementsAre(false, true, true, false, false, true, false, true)); + EXPECT_THAT(model.GetOutputShape(), ElementsAre(1, 1, 2, 4)); +} TEST(ComparisonsTest, GreaterEqualFloat) { - GreaterEqualOpModel model({1, 1, 1, 4}, {1, 1, 1, 4}, TensorType_FLOAT32); + ComparisonOpModel model({1, 1, 1, 4}, {1, 1, 1, 4}, TensorType_FLOAT32, + BuiltinOperator_GREATER_EQUAL); model.PopulateTensor<float>(model.input1(), {0.1, 0.9, 0.7, 0.3}); model.PopulateTensor<float>(model.input2(), {0.1, 0.2, 0.6, 0.5}); model.Invoke(); - EXPECT_THAT(model.GetOutput(), ElementsAreArray({true, true, true, false})); - EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1, 4})); + EXPECT_THAT(model.GetOutput(), ElementsAre(true, true, true, false)); + EXPECT_THAT(model.GetOutputShape(), ElementsAre(1, 1, 1, 4)); } TEST(ComparisonsTest, GreaterEqualInt) { - GreaterEqualOpModel model({1, 1, 1, 4}, {1, 1, 1, 4}, TensorType_INT32); + ComparisonOpModel model({1, 1, 1, 4}, {1, 1, 1, 4}, TensorType_INT32, + BuiltinOperator_GREATER_EQUAL); model.PopulateTensor<int>(model.input1(), {-1, 9, 7, 3}); model.PopulateTensor<int>(model.input2(), {1, 2, 7, 5}); model.Invoke(); - EXPECT_THAT(model.GetOutput(), ElementsAreArray({false, true, true, false})); - EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1, 4})); + EXPECT_THAT(model.GetOutput(), ElementsAre(false, true, true, false)); + EXPECT_THAT(model.GetOutputShape(), ElementsAre(1, 1, 1, 4)); } TEST(ComparisonsTest, GreaterEqualBroadcast) { - 
GreaterEqualOpModel model({1, 1, 1, 4}, {1, 1, 1, 1}, TensorType_INT32); + ComparisonOpModel model({1, 1, 1, 4}, {1, 1, 1, 1}, TensorType_INT32, + BuiltinOperator_GREATER_EQUAL); model.PopulateTensor<int>(model.input1(), {-1, 9, 7, 3}); model.PopulateTensor<int>(model.input2(), {7}); model.Invoke(); - EXPECT_THAT(model.GetOutput(), ElementsAreArray({false, true, true, false})); - EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1, 4})); + EXPECT_THAT(model.GetOutput(), ElementsAre(false, true, true, false)); + EXPECT_THAT(model.GetOutputShape(), ElementsAre(1, 1, 1, 4)); } TEST(ComparisonsTest, GreaterEqualBroadcastTwoD) { - GreaterEqualOpModel model({1, 1, 2, 4}, {1, 1, 1, 4}, TensorType_INT32); + ComparisonOpModel model({1, 1, 2, 4}, {1, 1, 1, 4}, TensorType_INT32, + BuiltinOperator_GREATER_EQUAL); model.PopulateTensor<int>(model.input1(), {-1, 9, 7, 3, 2, 4, 2, 8}); model.PopulateTensor<int>(model.input2(), {7, 1, 2, 4}); model.Invoke(); - EXPECT_THAT(model.GetOutput(), ElementsAreArray({false, true, true, false, - false, true, true, true})); - EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 2, 4})); + EXPECT_THAT(model.GetOutput(), + ElementsAre(false, true, true, false, false, true, true, true)); + EXPECT_THAT(model.GetOutputShape(), ElementsAre(1, 1, 2, 4)); } -class LessOpModel : public SingleOpModel { - public: - LessOpModel(std::initializer_list<int> input1_shape, - std::initializer_list<int> input2_shape, TensorType input_type) { - input1_ = AddInput(input_type); - input2_ = AddInput(input_type); - output_ = AddOutput(TensorType_BOOL); - SetBuiltinOp(BuiltinOperator_LESS, BuiltinOptions_LessOptions, - CreateLessOptions(builder_).Union()); - BuildInterpreter({input1_shape, input2_shape}); - } - - int input1() { return input1_; } - int input2() { return input2_; } - - std::vector<bool> GetOutput() { return ExtractVector<bool>(output_); } - std::vector<int> GetOutputShape() { return GetTensorShape(output_); } - - private: - int input1_; - int input2_; - int output_; -}; TEST(ComparisonsTest, LessFloat) { - LessOpModel model({1, 1, 1, 4}, {1, 1, 1, 4}, TensorType_FLOAT32); + ComparisonOpModel model({1, 1, 1, 4}, {1, 1, 1, 4}, TensorType_FLOAT32, + BuiltinOperator_LESS); model.PopulateTensor<float>(model.input1(), {0.1, 0.9, 0.7, 0.3}); model.PopulateTensor<float>(model.input2(), {0.1, 0.2, 0.6, 0.5}); model.Invoke(); - EXPECT_THAT(model.GetOutput(), ElementsAreArray({false, false, false, true})); - EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1, 4})); + EXPECT_THAT(model.GetOutput(), ElementsAre(false, false, false, true)); + EXPECT_THAT(model.GetOutputShape(), ElementsAre(1, 1, 1, 4)); } TEST(ComparisonsTest, LessInt) { - LessOpModel model({1, 1, 1, 4}, {1, 1, 1, 4}, TensorType_INT32); + ComparisonOpModel model({1, 1, 1, 4}, {1, 1, 1, 4}, TensorType_INT32, + BuiltinOperator_LESS); model.PopulateTensor<int>(model.input1(), {-1, 9, 7, 3}); model.PopulateTensor<int>(model.input2(), {1, 2, 6, 5}); model.Invoke(); - EXPECT_THAT(model.GetOutput(), ElementsAreArray({true, false, false, true})); - EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1, 4})); + EXPECT_THAT(model.GetOutput(), ElementsAre(true, false, false, true)); + EXPECT_THAT(model.GetOutputShape(), ElementsAre(1, 1, 1, 4)); } TEST(ComparisonsTest, LessBroadcast) { - LessOpModel model({1, 1, 1, 4}, {1, 1, 1, 1}, TensorType_INT32); + ComparisonOpModel model({1, 1, 1, 4}, {1, 1, 1, 1}, TensorType_INT32, + BuiltinOperator_LESS); model.PopulateTensor<int>(model.input1(), {-1, 9, 7, 
3}); model.PopulateTensor<int>(model.input2(), {7}); model.Invoke(); - EXPECT_THAT(model.GetOutput(), ElementsAreArray({true, false, false, true})); - EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1, 4})); + EXPECT_THAT(model.GetOutput(), ElementsAre(true, false, false, true)); + EXPECT_THAT(model.GetOutputShape(), ElementsAre(1, 1, 1, 4)); } TEST(ComparisonsTest, LessBroadcastTwoD) { - LessOpModel model({1, 1, 2, 4}, {1, 1, 1, 4}, TensorType_INT32); + ComparisonOpModel model({1, 1, 2, 4}, {1, 1, 1, 4}, TensorType_INT32, + BuiltinOperator_LESS); model.PopulateTensor<int>(model.input1(), {-1, 9, 7, 3, 2, 4, 6, 8}); model.PopulateTensor<int>(model.input2(), {7, 1, 2, 4}); model.Invoke(); - EXPECT_THAT(model.GetOutput(), ElementsAreArray({true, false, false, true, - true, false, false, false})); - EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 2, 4})); + EXPECT_THAT(model.GetOutput(), + ElementsAre(true, false, false, true, true, false, false, false)); + EXPECT_THAT(model.GetOutputShape(), ElementsAre(1, 1, 2, 4)); } -class LessEqualOpModel : public SingleOpModel { - public: - LessEqualOpModel(std::initializer_list<int> input1_shape, - std::initializer_list<int> input2_shape, - TensorType input_type) { - input1_ = AddInput(input_type); - input2_ = AddInput(input_type); - output_ = AddOutput(TensorType_BOOL); - SetBuiltinOp(BuiltinOperator_LESS_EQUAL, BuiltinOptions_LessEqualOptions, - CreateLessEqualOptions(builder_).Union()); - BuildInterpreter({input1_shape, input2_shape}); - } - - int input1() { return input1_; } - int input2() { return input2_; } - - std::vector<bool> GetOutput() { return ExtractVector<bool>(output_); } - std::vector<int> GetOutputShape() { return GetTensorShape(output_); } - - private: - int input1_; - int input2_; - int output_; -}; - TEST(ComparisonsTest, LessEqualFloat) { - LessEqualOpModel model({1, 1, 1, 4}, {1, 1, 1, 4}, TensorType_FLOAT32); + ComparisonOpModel model({1, 1, 1, 4}, {1, 1, 1, 4}, TensorType_FLOAT32, + BuiltinOperator_LESS_EQUAL); model.PopulateTensor<float>(model.input1(), {0.1, 0.9, 0.7, 0.3}); model.PopulateTensor<float>(model.input2(), {0.1, 0.2, 0.6, 0.5}); model.Invoke(); - EXPECT_THAT(model.GetOutput(), ElementsAreArray({true, false, false, true})); - EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1, 4})); + EXPECT_THAT(model.GetOutput(), ElementsAre(true, false, false, true)); + EXPECT_THAT(model.GetOutputShape(), ElementsAre(1, 1, 1, 4)); } TEST(ComparisonsTest, LessEqualInt) { - LessEqualOpModel model({1, 1, 1, 4}, {1, 1, 1, 4}, TensorType_INT32); + ComparisonOpModel model({1, 1, 1, 4}, {1, 1, 1, 4}, TensorType_INT32, + BuiltinOperator_LESS_EQUAL); model.PopulateTensor<int>(model.input1(), {-1, 9, 7, 3}); model.PopulateTensor<int>(model.input2(), {1, 2, 7, 5}); model.Invoke(); - EXPECT_THAT(model.GetOutput(), ElementsAreArray({true, false, true, true})); - EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1, 4})); + EXPECT_THAT(model.GetOutput(), ElementsAre(true, false, true, true)); + EXPECT_THAT(model.GetOutputShape(), ElementsAre(1, 1, 1, 4)); } TEST(ComparisonsTest, LessEqualBroadcast) { - LessEqualOpModel model({1, 1, 1, 4}, {1, 1, 1, 1}, TensorType_INT32); + ComparisonOpModel model({1, 1, 1, 4}, {1, 1, 1, 1}, TensorType_INT32, + BuiltinOperator_LESS_EQUAL); model.PopulateTensor<int>(model.input1(), {-1, 9, 7, 3}); model.PopulateTensor<int>(model.input2(), {7}); model.Invoke(); - EXPECT_THAT(model.GetOutput(), ElementsAreArray({true, false, true, true})); - 
EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1, 4})); + EXPECT_THAT(model.GetOutput(), ElementsAre(true, false, true, true)); + EXPECT_THAT(model.GetOutputShape(), ElementsAre(1, 1, 1, 4)); } TEST(ComparisonsTest, LessEqualBroadcastTwoD) { - LessEqualOpModel model({1, 1, 2, 4}, {1, 1, 1, 4}, TensorType_INT32); + ComparisonOpModel model({1, 1, 2, 4}, {1, 1, 1, 4}, TensorType_INT32, + BuiltinOperator_LESS_EQUAL); model.PopulateTensor<int>(model.input1(), {-1, 9, 7, 3, 2, 4, 2, 8}); model.PopulateTensor<int>(model.input2(), {7, 1, 2, 4}); model.Invoke(); - EXPECT_THAT(model.GetOutput(), ElementsAreArray({true, false, false, true, - true, false, true, false})); - EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 2, 4})); + EXPECT_THAT(model.GetOutput(), + ElementsAre(true, false, false, true, true, false, true, false)); + EXPECT_THAT(model.GetOutputShape(), ElementsAre(1, 1, 2, 4)); +} + +TEST(QuantizedComparisonsTest, EqualQuantized) { + const float kMin = -1.f; + const float kMax = 128.f; + ComparisonOpModel model({TensorType_UINT8, {1, 2, 2, 1}, kMin, kMax}, + {TensorType_UINT8, {1, 2, 2, 1}, kMin, kMax}, + TensorType_UINT8, BuiltinOperator_EQUAL); + model.QuantizeAndPopulate<uint8_t>(model.input1(), {1, 9, 7, 3}); + model.QuantizeAndPopulate<uint8_t>(model.input2(), {1, 2, 7, 5}); + model.Invoke(); + + EXPECT_THAT(model.GetOutput(), ElementsAre(true, false, true, false)); +} + +TEST(QuantizedComparisonsTest, NotEqualQuantized) { + const float kMin = -1.f; + const float kMax = 128.f; + ComparisonOpModel model({TensorType_UINT8, {1, 2, 2, 1}, kMin, kMax}, + {TensorType_UINT8, {1, 2, 2, 1}, kMin, kMax}, + TensorType_UINT8, BuiltinOperator_NOT_EQUAL); + model.QuantizeAndPopulate<uint8_t>(model.input1(), {1, 9, 7, 3}); + model.QuantizeAndPopulate<uint8_t>(model.input2(), {1, 2, 7, 0}); + model.Invoke(); + + EXPECT_THAT(model.GetOutput(), ElementsAre(false, true, false, true)); +} + +TEST(ComparisonsTest, GreaterQuantized) { + const float kMin = -1.f; + const float kMax = 128.f; + ComparisonOpModel model({TensorType_UINT8, {1, 2, 2, 1}, kMin, kMax}, + {TensorType_UINT8, {1, 2, 2, 1}, kMin, kMax}, + TensorType_UINT8, BuiltinOperator_GREATER); + model.QuantizeAndPopulate<uint8_t>(model.input1(), {1, 9, 7, 3}); + model.QuantizeAndPopulate<uint8_t>(model.input2(), {1, 2, 6, 5}); + model.Invoke(); + + EXPECT_THAT(model.GetOutput(), ElementsAre(false, true, true, false)); +} + +TEST(ComparisonsTest, GreaterQuantizedSmallRange) { + ComparisonOpModel model({TensorType_UINT8, {1, 2, 2, 1}, 0.0, 1.0}, + {TensorType_UINT8, {1, 2, 2, 1}, 0.0, 2.0}, + TensorType_UINT8, BuiltinOperator_GREATER); + model.QuantizeAndPopulate<uint8_t>(model.input1(), {1.0, 0.5, 0.35, 0.1}); + model.QuantizeAndPopulate<uint8_t>(model.input2(), {1.01, 0.25, 0.3, 0.4}); + model.Invoke(); + + EXPECT_THAT(model.GetOutput(), ElementsAre(false, true, true, false)); +} + +TEST(ComparisonsTest, GreaterEqualQuantized) { + const float kMin = -1.f; + const float kMax = 128.f; + ComparisonOpModel model({TensorType_UINT8, {1, 2, 2, 1}, kMin, kMax}, + {TensorType_UINT8, {1, 2, 2, 1}, kMin, kMax}, + TensorType_UINT8, BuiltinOperator_GREATER_EQUAL); + model.QuantizeAndPopulate<uint8_t>(model.input1(), {1, 9, 7, 3}); + model.QuantizeAndPopulate<uint8_t>(model.input2(), {1, 2, 6, 5}); + model.Invoke(); + + EXPECT_THAT(model.GetOutput(), ElementsAre(true, true, true, false)); +} + +TEST(ComparisonsTest, LessQuantized) { + const float kMin = -1.f; + const float kMax = 128.f; + ComparisonOpModel model({TensorType_UINT8, {1, 
2, 2, 1}, kMin, kMax}, + {TensorType_UINT8, {1, 2, 2, 1}, kMin, kMax}, + TensorType_UINT8, BuiltinOperator_LESS); + model.QuantizeAndPopulate<uint8_t>(model.input1(), {1, 9, 7, 3}); + model.QuantizeAndPopulate<uint8_t>(model.input2(), {1, 2, 6, 5}); + model.Invoke(); + + EXPECT_THAT(model.GetOutput(), ElementsAre(false, false, false, true)); +} + +TEST(ComparisonsTest, LessEqualQuantized) { + const float kMin = -1.f; + const float kMax = 128.f; + ComparisonOpModel model({TensorType_UINT8, {1, 2, 2, 1}, kMin, kMax}, + {TensorType_UINT8, {1, 2, 2, 1}, kMin, kMax}, + TensorType_UINT8, BuiltinOperator_LESS_EQUAL); + model.QuantizeAndPopulate<uint8_t>(model.input1(), {1, 9, 7, 3}); + model.QuantizeAndPopulate<uint8_t>(model.input2(), {1, 2, 6, 5}); + model.Invoke(); + + EXPECT_THAT(model.GetOutput(), ElementsAre(true, false, false, true)); +} + +TEST(ComparisonsTest, QuantizedEqualWithBroadcast) { + const float kMin = -1.f; + const float kMax = 128.f; + std::vector<std::initializer_list<int>> test_shapes = { + {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}}; + for (int i = 0; i < test_shapes.size(); ++i) { + ComparisonOpModel model({TensorType_UINT8, test_shapes[i], kMin, kMax}, + {TensorType_UINT8, {}, kMin, kMax}, + TensorType_UINT8, BuiltinOperator_EQUAL); + model.QuantizeAndPopulate<uint8_t>(model.input1(), {20, 2, 7, 8, 11, 20}); + model.QuantizeAndPopulate<uint8_t>(model.input2(), {2}); + model.Invoke(); + EXPECT_THAT(model.GetOutput(), + ElementsAre(false, true, false, false, false, false)) + << "With shape number " << i; + } +} + +TEST(ComparisonsTest, QuantizedNotEqualWithBroadcast) { + const float kMin = -1.f; + const float kMax = 128.f; + std::vector<std::initializer_list<int>> test_shapes = { + {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}}; + for (int i = 0; i < test_shapes.size(); ++i) { + ComparisonOpModel model({TensorType_UINT8, test_shapes[i], kMin, kMax}, + {TensorType_UINT8, {}, kMin, kMax}, + TensorType_UINT8, BuiltinOperator_NOT_EQUAL); + model.QuantizeAndPopulate<uint8_t>(model.input1(), {20, 2, 7, 8, 11, 20}); + model.QuantizeAndPopulate<uint8_t>(model.input2(), {2}); + model.Invoke(); + EXPECT_THAT(model.GetOutput(), + ElementsAre(true, false, true, true, true, true)) + << "With shape number " << i; + } +} + +TEST(ComparisonsTest, QuantizedGreaterWithBroadcast) { + const float kMin = -1.f; + const float kMax = 128.f; + std::vector<std::initializer_list<int>> test_shapes = { + {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}}; + for (int i = 0; i < test_shapes.size(); ++i) { + ComparisonOpModel model({TensorType_UINT8, test_shapes[i], kMin, kMax}, + {TensorType_UINT8, {}, kMin, kMax}, + TensorType_UINT8, BuiltinOperator_GREATER); + model.QuantizeAndPopulate<uint8_t>(model.input1(), {20, 2, 7, 8, 11, 20}); + model.QuantizeAndPopulate<uint8_t>(model.input2(), {8}); + model.Invoke(); + EXPECT_THAT(model.GetOutput(), + ElementsAre(true, false, false, false, true, true)) + << "With shape number " << i; + } +} + +TEST(ComparisonsTest, QuantizedGreaterEqualWithBroadcast) { + const float kMin = -1.f; + const float kMax = 128.f; + std::vector<std::initializer_list<int>> test_shapes = { + {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}}; + for (int i = 0; i < test_shapes.size(); ++i) { + ComparisonOpModel model({TensorType_UINT8, test_shapes[i], kMin, kMax}, + {TensorType_UINT8, {}, kMin, kMax}, + TensorType_UINT8, BuiltinOperator_GREATER_EQUAL); + model.QuantizeAndPopulate<uint8_t>(model.input1(), {20, 2, 7, 8, 11, 20}); + model.QuantizeAndPopulate<uint8_t>(model.input2(), {8}); + model.Invoke(); + 
EXPECT_THAT(model.GetOutput(), + ElementsAre(true, false, false, true, true, true)) + << "With shape number " << i; + } +} + +TEST(ComparisonsTest, QuantizedLessWithBroadcast) { + const float kMin = -1.f; + const float kMax = 128.f; + std::vector<std::initializer_list<int>> test_shapes = { + {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}}; + for (int i = 0; i < test_shapes.size(); ++i) { + ComparisonOpModel model({TensorType_UINT8, test_shapes[i], kMin, kMax}, + {TensorType_UINT8, {}, kMin, kMax}, + TensorType_UINT8, BuiltinOperator_LESS); + model.QuantizeAndPopulate<uint8_t>(model.input1(), {20, 2, 7, 8, 11, 20}); + model.QuantizeAndPopulate<uint8_t>(model.input2(), {8}); + model.Invoke(); + EXPECT_THAT(model.GetOutput(), + ElementsAre(false, true, true, false, false, false)) + << "With shape number " << i; + } +} + +TEST(ComparisonsTest, QuantizedLessEqualWithBroadcast) { + const float kMin = -1.f; + const float kMax = 128.f; + std::vector<std::initializer_list<int>> test_shapes = { + {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}}; + for (int i = 0; i < test_shapes.size(); ++i) { + ComparisonOpModel model({TensorType_UINT8, test_shapes[i], kMin, kMax}, + {TensorType_UINT8, {}, kMin, kMax}, + TensorType_UINT8, BuiltinOperator_LESS_EQUAL); + model.QuantizeAndPopulate<uint8_t>(model.input1(), {20, 2, 7, 8, 11, 20}); + model.QuantizeAndPopulate<uint8_t>(model.input2(), {8}); + model.Invoke(); + EXPECT_THAT(model.GetOutput(), + ElementsAre(false, true, true, true, false, false)) + << "With shape number " << i; + } } } // namespace diff --git a/tensorflow/contrib/lite/kernels/concatenation.cc b/tensorflow/contrib/lite/kernels/concatenation.cc index 45ea8d0049..7ad3399ffd 100644 --- a/tensorflow/contrib/lite/kernels/concatenation.cc +++ b/tensorflow/contrib/lite/kernels/concatenation.cc @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include <unistd.h> #include <cassert> #include <cmath> #include <cstdio> @@ -20,8 +19,8 @@ limitations under the License. #include <iostream> #include <limits> -#include "tensorflow/contrib/lite/builtin_op_data.h" -#include "tensorflow/contrib/lite/context.h" +#include "tensorflow/contrib/lite/c/builtin_op_data.h" +#include "tensorflow/contrib/lite/c/c_api_internal.h" #include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h" #include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h" #include "tensorflow/contrib/lite/kernels/internal/tensor.h" @@ -58,7 +57,9 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE(context, t0->dims->size <= 4); TF_LITE_ENSURE_EQ(context, params->activation, kTfLiteActNone); TF_LITE_ENSURE(context, - input_type == kTfLiteFloat32 || input_type == kTfLiteUInt8); + input_type == kTfLiteFloat32 || input_type == kTfLiteUInt8 || + input_type == kTfLiteInt16 || input_type == kTfLiteInt32 || + input_type == kTfLiteInt64); // Output dimensions will match input dimensions, except 'axis', which // will be the sum of inputs @@ -99,20 +100,31 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { // allocate and populate these during Prepare(). // TODO(ycling): Activation function parameter is ignored. For now we dont have // a model with a Concatenation with fused activation function. 
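The Prepare() logic above keeps the output shape equal to the inputs' shape except along 'axis', where the input sizes are summed, and the type check now also admits int16/int32/int64 inputs. As a standalone illustration of that shape rule (not the TF Lite kernel code; the helper name and example shapes are only illustrative):

#include <cassert>
#include <vector>

// Output dims equal the input dims except along `axis`, where sizes are summed.
std::vector<int> ConcatOutputShape(
    const std::vector<std::vector<int>>& input_shapes, int axis) {
  std::vector<int> output = input_shapes[0];
  for (size_t i = 1; i < input_shapes.size(); ++i) {
    for (size_t d = 0; d < output.size(); ++d) {
      if (static_cast<int>(d) == axis) {
        output[d] += input_shapes[i][d];
      } else {
        assert(output[d] == input_shapes[i][d]);  // non-axis dims must match
      }
    }
  }
  return output;
}

// Example: concatenating shapes {1, 2, 3} and {1, 5, 3} along axis 1 yields {1, 7, 3}.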
-#define TF_LITE_CONCATENATION(type, scalar) \ - VectorOfTensors<scalar> all_inputs(*context, *node->inputs); \ - type::Concatenation<FusedActivationFunctionType::kNone, scalar>( \ - RemapDim(NumDimensions(output), axis), all_inputs.data(), \ - all_inputs.dims(), node->inputs->size, GetTensorData<scalar>(output), \ - GetTensorDims(output)) - -#define TF_LITE_CONCATENATION_QUANTIZED(type) \ - VectorOfQuantizedTensors all_inputs(*context, *node->inputs); \ - type::Concatenation( \ - RemapDim(NumDimensions(output), axis), all_inputs.data(), \ - all_inputs.dims(), all_inputs.zero_point(), all_inputs.scale(), \ - node->inputs->size, GetTensorData<uint8>(output), GetTensorDims(output), \ - output->params.zero_point, output->params.scale) +#define TF_LITE_CONCATENATION(type, scalar) \ + { \ + VectorOfTensors<scalar> all_inputs(*context, *node->inputs); \ + tflite::ConcatenationParams op_params; \ + op_params.axis = axis; \ + op_params.inputs_count = node->inputs->size; \ + type::Concatenation(op_params, all_inputs.shapes(), all_inputs.data(), \ + GetTensorShape(output), \ + GetTensorData<scalar>(output)); \ + } + +#define TF_LITE_CONCATENATION_QUANTIZED(type) \ + { \ + VectorOfQuantizedTensors all_inputs(*context, *node->inputs); \ + tflite::ConcatenationParams op_params; \ + op_params.axis = axis; \ + op_params.input_zeropoint = all_inputs.zero_point(); \ + op_params.input_scale = all_inputs.scale(); \ + op_params.inputs_count = node->inputs->size; \ + op_params.output_zeropoint = output->params.zero_point; \ + op_params.output_scale = output->params.scale; \ + type::ConcatenationWithScaling(op_params, all_inputs.shapes(), \ + all_inputs.data(), GetTensorShape(output), \ + GetTensorData<uint8>(output)); \ + } switch (output->type) { // Already know in/outtypes are same. case kTfLiteFloat32: @@ -122,6 +134,13 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { TF_LITE_CONCATENATION(optimized_ops, float); } break; + case kTfLiteInt32: + if (kernel_type == kReference) { + TF_LITE_CONCATENATION(reference_ops, int32); + } else { + TF_LITE_CONCATENATION(optimized_ops, int32); + } + break; case kTfLiteUInt8: if (kernel_type == kReference) { TF_LITE_CONCATENATION_QUANTIZED(reference_ops); @@ -129,6 +148,14 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { TF_LITE_CONCATENATION_QUANTIZED(optimized_ops); } break; + case kTfLiteInt64: + if (kernel_type == kReference) { + TF_LITE_CONCATENATION(reference_ops, int64_t); + } else { + TF_LITE_CONCATENATION(optimized_ops, int64_t); + } + break; + default: context->ReportError(context, "Only float32 and uint8 are currently supported."); diff --git a/tensorflow/contrib/lite/kernels/conv.cc b/tensorflow/contrib/lite/kernels/conv.cc index 2b7e455e3e..dbcadbee14 100644 --- a/tensorflow/contrib/lite/kernels/conv.cc +++ b/tensorflow/contrib/lite/kernels/conv.cc @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include <unistd.h> #include <algorithm> #include <cassert> #include <cmath> @@ -21,8 +20,8 @@ limitations under the License. 
#include <iostream> #include <limits> -#include "tensorflow/contrib/lite/builtin_op_data.h" -#include "tensorflow/contrib/lite/context.h" +#include "tensorflow/contrib/lite/c/builtin_op_data.h" +#include "tensorflow/contrib/lite/c/c_api_internal.h" #include "tensorflow/contrib/lite/kernels/eigen_support.h" #include "tensorflow/contrib/lite/kernels/gemm_support.h" #include "tensorflow/contrib/lite/kernels/internal/optimized/cblas_conv.h" @@ -31,6 +30,7 @@ limitations under the License. #include "tensorflow/contrib/lite/kernels/internal/quantization_util.h" #include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h" #include "tensorflow/contrib/lite/kernels/internal/tensor.h" +#include "tensorflow/contrib/lite/kernels/internal/tensor_utils.h" #include "tensorflow/contrib/lite/kernels/kernel_util.h" #include "tensorflow/contrib/lite/kernels/op_macros.h" #include "tensorflow/contrib/lite/kernels/padding.h" @@ -61,6 +61,8 @@ struct OpData { // memory buffers. int im2col_id = kTensorNotAllocated; int hwcn_weights_id = kTensorNotAllocated; + int input_quantized_id = kTensorNotAllocated; + int scaling_factors_id = kTensorNotAllocated; TfLitePaddingValues padding; // The scaling factor from input to output (aka the 'real multiplier') can @@ -75,6 +77,8 @@ struct OpData { // of the allocated temporaries. int32_t im2col_index; int32_t hwcn_weights_index; + int32_t input_quantized_index; + int32_t scaling_factors_index; bool need_hwcn_weights; bool have_weights_been_transposed; bool need_im2col; @@ -82,6 +86,18 @@ struct OpData { bool run_multithreaded_kernel; }; +inline PaddingType RuntimePaddingType(TfLitePadding padding) { + switch (padding) { + case TfLitePadding::kTfLitePaddingSame: + return PaddingType::kSame; + case TfLitePadding::kTfLitePaddingValid: + return PaddingType::kValid; + case TfLitePadding::kTfLitePaddingUnknown: + default: + return PaddingType::kNone; + } +} + void* Init(TfLiteContext* context, const char* buffer, size_t length) { // This is a builtin op, so we don't use the contents in 'buffer', if any. // Instead, we allocate a new object to use as scratch space for im2col, and @@ -126,6 +142,9 @@ static TfLiteStatus AllocateTemporaryTensorsIfRequired(TfLiteContext* context, TfLiteTensor* input = &context->tensors[node->inputs->data[0]]; TfLiteTensor* filter = &context->tensors[node->inputs->data[1]]; + const bool is_hybrid = + (input->type == kTfLiteFloat32 && filter->type == kTfLiteUInt8); + int filter_width = filter->dims->data[2]; int filter_height = filter->dims->data[1]; @@ -134,7 +153,9 @@ static TfLiteStatus AllocateTemporaryTensorsIfRequired(TfLiteContext* context, // optimized_ops.h, in order to avoid a DCHECK(!im2col_data). data->need_im2col = (params->stride_width != 1 || params->stride_height != 1 || - filter_width != 1 || filter_height != 1); + params->dilation_width_factor != 1 || + params->dilation_height_factor != 1 || filter_width != 1 || + filter_height != 1); // If we're using the optimized multithreaded EigenTensor implementation of // convolution, it expects the filter weights to be transposed compared to // the normal TF Lite buffer format. Typical TF Lite weights are @@ -144,8 +165,8 @@ static TfLiteStatus AllocateTemporaryTensorsIfRequired(TfLiteContext* context, // buffer to store the results. // This path is only used for float processing, so only create the buffer if // we're running with that data type. 
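The comment above describes re-laying-out the filter weights for the multithreaded EigenTensor path. A standalone sketch of that kind of reordering is below; it is not the kernel's actual transpose helper, and the assumption that the HWCN layout is [height, width, input_depth, filter_count] belongs to the sketch, not the patch.

#include <vector>

// Reorder float filter weights from TF Lite's usual
// [filter_count, height, width, input_depth] layout into an assumed
// [height, width, input_depth, filter_count] (HWCN) layout.
std::vector<float> TransposeToHwcn(const std::vector<float>& ohwi,
                                   int filter_count, int height, int width,
                                   int input_depth) {
  std::vector<float> hwcn(ohwi.size());
  for (int o = 0; o < filter_count; ++o) {
    for (int h = 0; h < height; ++h) {
      for (int w = 0; w < width; ++w) {
        for (int i = 0; i < input_depth; ++i) {
          const int src = ((o * height + h) * width + w) * input_depth + i;
          const int dst = ((h * width + w) * input_depth + i) * filter_count + o;
          hwcn[dst] = ohwi[src];
        }
      }
    }
  }
  return hwcn;
}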
- data->need_hwcn_weights = - (input->type == kTfLiteFloat32 && data->run_multithreaded_kernel); + data->need_hwcn_weights = (input->type == kTfLiteFloat32 && + data->run_multithreaded_kernel && !is_hybrid); int temporaries_count = 0; if (data->need_im2col) { @@ -163,6 +184,25 @@ static TfLiteStatus AllocateTemporaryTensorsIfRequired(TfLiteContext* context, ++temporaries_count; } + if (is_hybrid) { + // Allocate tensor to store the on-the-fly quantized inputs. + data->input_quantized_index = temporaries_count; + if (data->input_quantized_id == kTensorNotAllocated) { + TF_LITE_ENSURE_OK( + context, context->AddTensors(context, 1, &data->input_quantized_id)); + } + ++temporaries_count; + + // Allocate tensor to store the quantization params computed during + // on-the-fly input quantization. + data->scaling_factors_index = temporaries_count; + if (data->scaling_factors_id == kTensorNotAllocated) { + TF_LITE_ENSURE_OK( + context, context->AddTensors(context, 1, &data->scaling_factors_id)); + } + ++temporaries_count; + } + TfLiteIntArrayFree(node->temporaries); node->temporaries = TfLiteIntArrayCreate(temporaries_count); @@ -173,13 +213,9 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { auto* params = reinterpret_cast<TfLiteConvParams*>(node->builtin_data); OpData* data = reinterpret_cast<OpData*>(node->user_data); - data->run_multithreaded_kernel = context->recommended_num_threads != 1; - - TF_LITE_ENSURE_STATUS(AllocateTemporaryTensorsIfRequired(context, node)); - - bool hasBias = node->inputs->size == 3; + bool has_bias = node->inputs->size == 3; // Check number of inputs/outputs - TF_LITE_ENSURE(context, hasBias || node->inputs->size == 2); + TF_LITE_ENSURE(context, has_bias || node->inputs->size == 2); TF_LITE_ENSURE_EQ(context, node->outputs->size, 1); TfLiteTensor* output = &context->tensors[node->outputs->data[0]]; TfLiteTensor* input = &context->tensors[node->inputs->data[0]]; @@ -192,29 +228,40 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE_EQ(context, input->dims->data[3], filter->dims->data[3]); // Check types. (We assume that UINT8 refers to quantized tensors) - TfLiteType data_type = input->type; + TfLiteType input_type = input->type; TF_LITE_ENSURE(context, - data_type == kTfLiteFloat32 || data_type == kTfLiteUInt8); - TF_LITE_ENSURE_EQ(context, output->type, data_type); - TF_LITE_ENSURE_EQ(context, filter->type, data_type); + input_type == kTfLiteFloat32 || input_type == kTfLiteUInt8); + TF_LITE_ENSURE_EQ(context, output->type, input_type); TfLiteTensor* bias = nullptr; // TODO(ahentz): At this point the optimized versions require 'bias'. We can // either change that or document that convolution requires it. - TF_LITE_ENSURE(context, hasBias); + TF_LITE_ENSURE(context, has_bias); - if (hasBias) { + if (has_bias) { bias = &context->tensors[node->inputs->data[2]]; - if (data_type == kTfLiteUInt8) { + if (input_type == kTfLiteUInt8) { TF_LITE_ENSURE_EQ(context, bias->type, kTfLiteInt32); TF_LITE_ENSURE_EQ(context, bias->params.zero_point, 0); } else { - TF_LITE_ENSURE_EQ(context, bias->type, data_type); + TF_LITE_ENSURE_EQ(context, bias->type, input_type); } TF_LITE_ENSURE_EQ(context, NumElements(bias), SizeOfDimension(filter, 0)); } + const bool is_hybrid = + (input->type == kTfLiteFloat32 && filter->type == kTfLiteUInt8); + + data->run_multithreaded_kernel = context->recommended_num_threads != 1; + // Hybrid kernels don't support multithreading yet. 
+ if (is_hybrid) { + data->run_multithreaded_kernel = false; + } + + TF_LITE_ENSURE_STATUS(AllocateTemporaryTensorsIfRequired(context, node)); + + int channels_in = filter->dims->data[3]; int channels_out = filter->dims->data[0]; int width = input->dims->data[2]; int height = input->dims->data[1]; @@ -224,38 +271,41 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { // Matching GetWindowedOutputSize in TensorFlow. auto padding = params->padding; - auto computeOutSize = [padding](int imageSize, int filterSize, int stride, - int dilationRate) -> int { - int effectiveFilterSize = (filterSize - 1) * dilationRate + 1; + auto compute_out_size = [padding](int image_size, int filter_size, int stride, + int dilation_rate) -> int { + int effective_filter_size = (filter_size - 1) * dilation_rate + 1; return padding == kTfLitePaddingSame - ? (imageSize + stride - 1) / stride + ? (image_size + stride - 1) / stride : padding == kTfLitePaddingValid - ? (imageSize - effectiveFilterSize + stride) / stride + ? (image_size - effective_filter_size + stride) / stride : 0; }; - int outWidth = computeOutSize(width, filter_width, params->stride_width, - params->dilation_width_factor); - int outHeight = computeOutSize(height, filter_height, params->stride_height, - params->dilation_height_factor); + int out_width = compute_out_size(width, filter_width, params->stride_width, + params->dilation_width_factor); + int out_height = + compute_out_size(height, filter_height, params->stride_height, + params->dilation_height_factor); data->padding.height = ComputePadding(params->stride_height, params->dilation_height_factor, - height, filter_height, outHeight); + height, filter_height, out_height); data->padding.width = ComputePadding(params->stride_width, params->dilation_width_factor, width, - filter_width, outWidth); + filter_width, out_width); - TF_LITE_ENSURE(context, hasBias); + TF_LITE_ENSURE(context, has_bias); - // Note that quantized inference requires that all tensors have their + // Note that full fixed-point inference requires that all tensors have their // parameters set. This is usually done during quantized training. 
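For reference on the 'parameters' the note above refers to: each quantized tensor carries a scale and a zero_point, and a stored uint8 value q represents the real value scale * (q - zero_point). A tiny standalone illustration with made-up parameter values (not kernel code):

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

int main() {
  // Hypothetical uint8 tensor parameters: real = scale * (q - zero_point).
  const float scale = 0.5f;
  const int32_t zero_point = 128;

  const uint8_t q = 150;
  const float real = scale * (static_cast<int32_t>(q) - zero_point);  // 11.0

  // Quantizing goes the other way, clamped to the uint8 range.
  const float value = -3.0f;
  const int32_t unclamped =
      zero_point + static_cast<int32_t>(std::round(value / scale));
  const uint8_t requantized =
      static_cast<uint8_t>(std::min(255, std::max(0, unclamped)));  // 122

  std::printf("%f %d\n", real, requantized);
  return 0;
}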
- if (data_type != kTfLiteFloat32) { + if (input_type != kTfLiteFloat32) { double real_multiplier = 0.0; TF_LITE_ENSURE_STATUS(GetQuantizedConvolutionMultipler( context, input, filter, bias, output, &real_multiplier)); - QuantizeMultiplierSmallerThanOne(real_multiplier, &data->output_multiplier, - &data->output_shift); + + int exponent; + QuantizeMultiplier(real_multiplier, &data->output_multiplier, &exponent); + data->output_shift = -exponent; CalculateActivationRangeUint8(params->activation, output, &data->output_activation_min, &data->output_activation_max); @@ -263,8 +313,8 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TfLiteIntArray* output_size = TfLiteIntArrayCreate(4); output_size->data[0] = batches; - output_size->data[1] = outHeight; - output_size->data[2] = outWidth; + output_size->data[1] = out_height; + output_size->data[2] = out_width; output_size->data[3] = channels_out; auto output_status = context->ResizeTensor(context, output, output_size); @@ -283,7 +333,10 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TfLiteTensor* im2col = &context->tensors[node->temporaries->data[data->im2col_index]]; - im2col->type = data_type; + im2col->type = input->type; + if (is_hybrid) { + im2col->type = kTfLiteUInt8; + } im2col->allocation_type = kTfLiteArenaRw; auto im2col_status = context->ResizeTensor(context, im2col, im2col_size); if (im2col_status != kTfLiteOk) return im2col_status; @@ -303,19 +356,9 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TfLiteTensor* hwcn_weights = &context->tensors[node->temporaries->data[data->hwcn_weights_index]]; - hwcn_weights->type = data_type; - hwcn_weights->allocation_type = kTfLiteDynamic; - // Make sure we release any previous allocations before we reallocate. - // TODO(petewarden): Persistent arenas would be a better fit for this, but - // they aren't fully implemented yet. - if (hwcn_weights->data.raw) { - free(hwcn_weights->data.raw); - hwcn_weights->data.raw = nullptr; - } + hwcn_weights->type = input_type; + hwcn_weights->allocation_type = kTfLiteArenaRwPersistent; - // Note that hwcn_weights_status is a kTfLiteDynamic tensor, and - // ResizeTensor will actually allocate space for it. The would be more - // efficient if we placed hwcn_weights_status in the persistent arena. auto hwcn_weights_status = context->ResizeTensor(context, hwcn_weights, hwcn_weights_size); if (hwcn_weights_status != kTfLiteOk) return hwcn_weights_status; @@ -325,6 +368,36 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { data->have_weights_been_transposed = false; } + if (is_hybrid) { + node->temporaries->data[data->input_quantized_index] = + data->input_quantized_id; + TfLiteTensor* input_quantized = + GetTemporary(context, node, data->input_quantized_index); + input_quantized->type = kTfLiteUInt8; + input_quantized->allocation_type = kTfLiteArenaRw; + if (!TfLiteIntArrayEqual(input_quantized->dims, input->dims)) { + TfLiteIntArray* input_quantized_size = TfLiteIntArrayCopy(input->dims); + TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, input_quantized, + input_quantized_size)); + } + + node->temporaries->data[data->scaling_factors_index] = + data->scaling_factors_id; + TfLiteTensor* scaling_factors = + GetTemporary(context, node, data->scaling_factors_index); + scaling_factors->type = kTfLiteFloat32; + scaling_factors->allocation_type = kTfLiteArenaRw; + TfLiteIntArray* scaling_factors_size = TfLiteIntArrayCreate(1); + // Only one scale factor per batch is typically necessary. 
See optimized + // implementation for why we need to allocate for the height of the inputs + // flattened to 2D. + scaling_factors_size->data[0] = NumElements(input) / channels_in; + if (!TfLiteIntArrayEqual(scaling_factors->dims, scaling_factors_size)) { + TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, scaling_factors, + scaling_factors_size)); + } + } + return kTfLiteOk; } @@ -340,34 +413,70 @@ void EvalQuantized(TfLiteContext* context, TfLiteNode* node, auto filter_offset = -filter->params.zero_point; auto output_offset = output->params.zero_point; - switch (kernel_type) { - case kReference: + KernelType effective_kernel_type; + if ((kernel_type == kMultithreadOptimized || + kernel_type == kCblasOptimized) && + (params->dilation_width_factor != 1 || + params->dilation_height_factor != 1)) { + // kMultithreadOptimized and kCblasOptimized do not support dilation. + // Therefore, fallback to optimized. + effective_kernel_type = kGenericOptimized; + } else { + effective_kernel_type = kernel_type; + } + + switch (effective_kernel_type) { + case kReference: { + ConvParams op_params; + op_params.padding_type = PaddingType::kSame; + op_params.padding_values.width = data->padding.width; + op_params.padding_values.height = data->padding.height; + op_params.stride_width = params->stride_width; + op_params.stride_height = params->stride_height; + op_params.dilation_width_factor = params->dilation_width_factor; + op_params.dilation_height_factor = params->dilation_height_factor; + op_params.input_offset = input_offset; + op_params.weights_offset = filter_offset; + op_params.output_offset = output_offset; + op_params.output_multiplier = data->output_multiplier; + op_params.output_shift = -data->output_shift; + op_params.quantized_activation_min = data->output_activation_min; + op_params.quantized_activation_max = data->output_activation_max; reference_ops::Conv( - GetTensorData<uint8_t>(input), GetTensorDims(input), input_offset, - GetTensorData<uint8_t>(filter), GetTensorDims(filter), filter_offset, - GetTensorData<int32_t>(bias), GetTensorDims(bias), - params->stride_width, params->stride_height, data->padding.width, - data->padding.height, output_offset, data->output_multiplier, - data->output_shift, data->output_activation_min, - data->output_activation_max, GetTensorData<uint8_t>(output), - GetTensorDims(output), GetTensorData<uint8_t>(im2col), - GetTensorDims(im2col), gemm_context); + op_params, GetTensorShape(input), GetTensorData<uint8_t>(input), + GetTensorShape(filter), GetTensorData<uint8_t>(filter), + GetTensorShape(bias), GetTensorData<int32_t>(bias), + GetTensorShape(output), GetTensorData<uint8_t>(output), + GetTensorShape(im2col), GetTensorData<uint8_t>(im2col), gemm_context); break; + } case kGenericOptimized: case kMultithreadOptimized: - case kCblasOptimized: + case kCblasOptimized: { // There is only one optimized implementation for Quantized Conv. 
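For context on the output_multiplier / output_shift fields being filled in here: the real multiplier (roughly input_scale * filter_scale / output_scale) is now decomposed by QuantizeMultiplier into a Q31 fixed-point mantissa plus a base-2 exponent, and unlike the old QuantizeMultiplierSmallerThanOne path this also covers multipliers above 1, exercised by SimpleTestQuantizedOutputMultiplierGreaterThan1 later in this diff. A rough standalone sketch of that decomposition, assuming frexp-style behaviour, follows; it is illustrative, not the library routine.

#include <cmath>
#include <cstdint>
#include <cstdio>

// Split a positive real multiplier into a Q31 mantissa and a base-2 exponent.
// (The real routine also handles the carry when rounding pushes the mantissa to 1.0.)
void SplitMultiplier(double real_multiplier, int32_t* quantized_multiplier,
                     int* exponent) {
  const double mantissa = std::frexp(real_multiplier, exponent);  // in [0.5, 1)
  *quantized_multiplier =
      static_cast<int32_t>(std::round(mantissa * (1ll << 31)));
}

int main() {
  int32_t q;
  int exp;
  SplitMultiplier(1.0118, &q, &exp);
  // The mantissa is about 0.5059, so q is about 0.5059 * 2^31 and exp == 1; the
  // kernel stores output_shift = -exp and negates it again when filling
  // op_params.output_shift.
  std::printf("q=%ld exp=%d\n", static_cast<long>(q), exp);
  return 0;
}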
+ ConvParams op_params; + op_params.padding_type = PaddingType::kSame; + op_params.padding_values.width = data->padding.width; + op_params.padding_values.height = data->padding.height; + op_params.stride_width = params->stride_width; + op_params.stride_height = params->stride_height; + op_params.dilation_width_factor = params->dilation_width_factor; + op_params.dilation_height_factor = params->dilation_height_factor; + op_params.input_offset = input_offset; + op_params.weights_offset = filter_offset; + op_params.output_offset = output_offset; + op_params.output_multiplier = data->output_multiplier; + op_params.output_shift = -data->output_shift; + op_params.quantized_activation_min = data->output_activation_min; + op_params.quantized_activation_max = data->output_activation_max; optimized_ops::Conv( - GetTensorData<uint8_t>(input), GetTensorDims(input), input_offset, - GetTensorData<uint8_t>(filter), GetTensorDims(filter), filter_offset, - GetTensorData<int32_t>(bias), GetTensorDims(bias), - params->stride_width, params->stride_height, data->padding.width, - data->padding.height, output_offset, data->output_multiplier, - data->output_shift, data->output_activation_min, - data->output_activation_max, GetTensorData<uint8_t>(output), - GetTensorDims(output), GetTensorData<uint8_t>(im2col), - GetTensorDims(im2col), gemm_context); + op_params, GetTensorShape(input), GetTensorData<uint8_t>(input), + GetTensorShape(filter), GetTensorData<uint8_t>(filter), + GetTensorShape(bias), GetTensorData<int32_t>(bias), + GetTensorShape(output), GetTensorData<uint8_t>(output), + GetTensorShape(im2col), GetTensorData<uint8_t>(im2col), gemm_context); break; + } } } @@ -377,42 +486,46 @@ void EvalFloat(TfLiteContext* context, TfLiteNode* node, TfLiteTensor* filter, TfLiteTensor* bias, TfLiteTensor* im2col, TfLiteTensor* hwcn_weights, TfLiteTensor* output) { float output_activation_min, output_activation_max; - CalculateActivationRangeFloat(params->activation, &output_activation_min, - &output_activation_max); + CalculateActivationRange(params->activation, &output_activation_min, + &output_activation_max); KernelType effective_kernel_type; - if (((kernel_type == kMultithreadOptimized) || - (kernel_type == kCblasOptimized)) && - ((params->dilation_width_factor != 1) || - (params->dilation_height_factor != 1))) { + if ((kernel_type == kMultithreadOptimized || + kernel_type == kCblasOptimized) && + (params->dilation_width_factor != 1 || + params->dilation_height_factor != 1)) { // kMultithreadOptimized and kCblasOptimized do not support dilation. // Therefore, fallback to optimized. 
effective_kernel_type = kGenericOptimized; } else { effective_kernel_type = kernel_type; } + ConvParams op_params; + op_params.padding_type = RuntimePaddingType(params->padding); + op_params.padding_values.width = data->padding.width; + op_params.padding_values.height = data->padding.height; + op_params.stride_width = params->stride_width; + op_params.stride_height = params->stride_height; + op_params.dilation_width_factor = params->dilation_width_factor; + op_params.dilation_height_factor = params->dilation_height_factor; + op_params.float_activation_min = output_activation_min; + op_params.float_activation_max = output_activation_max; switch (effective_kernel_type) { case kReference: { - reference_ops::Conv( - GetTensorData<float>(input), GetTensorDims(input), - GetTensorData<float>(filter), GetTensorDims(filter), - GetTensorData<float>(bias), GetTensorDims(bias), params->stride_width, - params->stride_height, params->dilation_width_factor, - params->dilation_height_factor, data->padding.width, - data->padding.height, output_activation_min, output_activation_max, - GetTensorData<float>(output), GetTensorDims(output), - GetTensorData<float>(im2col), GetTensorDims(im2col)); + reference_ops::Conv(op_params, GetTensorShape(input), + GetTensorData<float>(input), GetTensorShape(filter), + GetTensorData<float>(filter), GetTensorShape(bias), + GetTensorData<float>(bias), GetTensorShape(output), + GetTensorData<float>(output), GetTensorShape(im2col), + GetTensorData<float>(im2col)); break; } case kGenericOptimized: { - optimized_ops::Conv( - GetTensorData<float>(input), GetTensorDims(input), - GetTensorData<float>(filter), GetTensorDims(filter), - GetTensorData<float>(bias), GetTensorDims(bias), params->stride_width, - params->stride_height, params->dilation_width_factor, - params->dilation_height_factor, data->padding.width, - data->padding.height, output_activation_min, output_activation_max, - GetTensorData<float>(output), GetTensorDims(output), - GetTensorData<float>(im2col), GetTensorDims(im2col)); + optimized_ops::Conv(op_params, GetTensorShape(input), + GetTensorData<float>(input), GetTensorShape(filter), + GetTensorData<float>(filter), GetTensorShape(bias), + GetTensorData<float>(bias), GetTensorShape(output), + GetTensorData<float>(output), GetTensorShape(im2col), + GetTensorData<float>(im2col)); break; } case kMultithreadOptimized: { @@ -423,24 +536,84 @@ void EvalFloat(TfLiteContext* context, TfLiteNode* node, filter_data = GetTensorData<float>(filter); } multithreaded_ops::Conv( - GetTensorData<float>(input), GetTensorDims(input), filter_data, - GetTensorDims(filter), GetTensorData<float>(bias), - GetTensorDims(bias), params->stride_width, params->stride_height, - data->padding.width, data->padding.height, params->padding, - output_activation_min, output_activation_max, - GetTensorData<float>(output), GetTensorDims(output), - GetTensorData<float>(im2col), GetTensorDims(im2col)); + *eigen_support::GetThreadPoolDevice(context), op_params, + GetTensorShape(input), GetTensorData<float>(input), + GetTensorShape(filter), filter_data, GetTensorShape(bias), + GetTensorData<float>(bias), GetTensorShape(output), + GetTensorData<float>(output), GetTensorShape(im2col), + GetTensorData<float>(im2col)); break; } case kCblasOptimized: { - cblas_ops::Conv(GetTensorData<float>(input), GetTensorDims(input), - GetTensorData<float>(filter), GetTensorDims(filter), - GetTensorData<float>(bias), GetTensorDims(bias), - params->stride_width, params->stride_height, - data->padding.width, data->padding.height, 
- output_activation_min, output_activation_max, - GetTensorData<float>(output), GetTensorDims(output), - GetTensorData<float>(im2col), GetTensorDims(im2col)); + cblas_ops::Conv(op_params, GetTensorShape(input), + GetTensorData<float>(input), GetTensorShape(filter), + GetTensorData<float>(filter), GetTensorShape(bias), + GetTensorData<float>(bias), GetTensorShape(output), + GetTensorData<float>(output), GetTensorShape(im2col), + GetTensorData<float>(im2col)); + break; + } + } +} + +template <KernelType kernel_type> +void EvalHybrid(TfLiteContext* context, TfLiteNode* node, + TfLiteConvParams* params, OpData* data, TfLiteTensor* input, + TfLiteTensor* filter, TfLiteTensor* bias, TfLiteTensor* im2col, + TfLiteTensor* hwcn_weights, TfLiteTensor* output) { + float output_activation_min, output_activation_max; + CalculateActivationRange(params->activation, &output_activation_min, + &output_activation_max); + + const int input_size = NumElements(input) / SizeOfDimension(input, 0); + const int batch_size = SizeOfDimension(input, 0); + + const TfLiteTensor* input_quantized = + GetTemporary(context, node, data->input_quantized_index); + int8_t* quantized_input_ptr_batch = + reinterpret_cast<int8_t*>(input_quantized->data.uint8); + float* scaling_factors_ptr = + GetTemporary(context, node, data->scaling_factors_index)->data.f; + + // Per-batch input quantization for higher accuracy. + for (int b = 0; b < batch_size; ++b) { + float unused_min, unused_max; + const int offset = b * input_size; + tensor_utils::SymmetricQuantizeFloats( + input->data.f + offset, input_size, quantized_input_ptr_batch + offset, + &unused_min, &unused_max, &scaling_factors_ptr[b]); + scaling_factors_ptr[b] *= filter->params.scale; + } + + int8_t* im2col_ptr = nullptr; + if (im2col != nullptr) { + im2col_ptr = reinterpret_cast<int8_t*>(im2col->data.uint8); + } + int8_t* filter_ptr = reinterpret_cast<int8_t*>(filter->data.uint8); + + switch (kernel_type) { + case kReference: + case kGenericOptimized: + case kMultithreadOptimized: + case kCblasOptimized: { + // There is only one implementation for hybrid kernel. Note + // this does not make use of gemmlowp nor supports multithreading. + ConvParams op_params; + op_params.padding_type = PaddingType::kSame; + op_params.padding_values.width = data->padding.width; + op_params.padding_values.height = data->padding.height; + op_params.stride_width = params->stride_width; + op_params.stride_height = params->stride_height; + op_params.dilation_width_factor = 1; + op_params.dilation_height_factor = 1; + op_params.float_activation_min = output_activation_min; + op_params.float_activation_max = output_activation_max; + optimized_ops::HybridConv( + op_params, scaling_factors_ptr, GetTensorShape(input), + quantized_input_ptr_batch, GetTensorShape(filter), filter_ptr, + GetTensorShape(bias), GetTensorData<float>(bias), + GetTensorShape(output), GetTensorData<float>(output), + GetTensorShape(im2col), im2col_ptr); break; } } @@ -454,9 +627,9 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { TfLiteTensor* output = &context->tensors[node->outputs->data[0]]; TfLiteTensor* input = &context->tensors[node->inputs->data[0]]; TfLiteTensor* filter = &context->tensors[node->inputs->data[1]]; - bool hasBias = node->inputs->size == 3; + bool has_bias = node->inputs->size == 3; TfLiteTensor* bias = - hasBias ? &context->tensors[node->inputs->data[2]] : nullptr; + has_bias ? &context->tensors[node->inputs->data[2]] : nullptr; TfLiteTensor* im2col = data->need_im2col ? 
&context->tensors[node->temporaries->data[data->im2col_index]] @@ -475,7 +648,10 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { // separate ops to avoid dispatch overhead here. switch (input->type) { // Already know in/outtypes are same. case kTfLiteFloat32: - if (data->run_multithreaded_kernel) { + if (filter->type == kTfLiteUInt8) { + EvalHybrid<kernel_type>(context, node, params, data, input, filter, + bias, im2col, hwcn_weights, output); + } else if (data->run_multithreaded_kernel) { EvalFloat<kernel_type>(context, node, params, data, input, filter, bias, im2col, hwcn_weights, output); } else { @@ -488,7 +664,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { bias, im2col, hwcn_weights, output); break; default: - context->ReportError(context, "Type not currently supported."); + context->ReportError(context, "Type %d not currently supported.", + input->type); return kTfLiteError; } return kTfLiteOk; diff --git a/tensorflow/contrib/lite/kernels/conv_test.cc b/tensorflow/contrib/lite/kernels/conv_test.cc index 0dcfc826fd..f7e6f083ed 100644 --- a/tensorflow/contrib/lite/kernels/conv_test.cc +++ b/tensorflow/contrib/lite/kernels/conv_test.cc @@ -64,12 +64,6 @@ class BaseConvolutionOpModel : public SingleOpModel { } output_ = AddOutput(output); - if (input.type != TensorType_FLOAT32) { - // The following is required by quantized inference. It is the unittest's - // responsibility to make sure the output scale falls into the correct - // range. - CHECK_LT(GetScale(input_) * GetScale(filter_), GetScale(output_)); - } SetBuiltinOp(BuiltinOperator_CONV_2D, BuiltinOptions_Conv2DOptions, CreateConv2DOptions( @@ -148,6 +142,128 @@ TEST_P(ConvolutionOpTest, SimpleTestFloat32) { })); } +// This test's output is equivalent to the SimpleTestFloat32 +// because we break each input into two channels, each with half of the value, +// while keeping the filters for each channel equivalent. +// +// 2 * (A/2) * B = A * B, where the left side is this new test. 
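A quick worked check of the equivalence claimed above, using the first output of the test that follows: in batch one the top-left 2x2 patch holds (0.5, 0.5) per pixel in row 1 and (1, 1) per pixel in row 2, and the first filter holds (1, 1), (2, 2), (3, 3), (4, 4), so the dot product is 2*(0.5*1) + 2*(0.5*2) + 2*(1*3) + 2*(1*4) = 1 + 2 + 6 + 8 = 17, and adding the bias of 1 gives the expected 18, the same value SimpleTestFloat32 produces.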
+TEST_P(ConvolutionOpTest, SimpleTestFloat32WithChannels) { + ConvolutionOpModel m(GetRegistration(), {TensorType_FLOAT32, {2, 2, 4, 2}}, + {TensorType_FLOAT32, {3, 2, 2, 2}}, + {TensorType_FLOAT32, {}}); + + m.SetInput({ + // First batch + 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, // row = 1 + 1, 1, 1, 1, 1, 1, 1, 1, // row = 2 + // Second batch + 0.5, 0.5, 1, 1, 1.5, 1.5, 2, 2, // row = 1 + 0.5, 0.5, 1, 1, 1.5, 1.5, 2, 2 // row = 2 + }); + m.SetFilter({ + 1, 1, 2, 2, 3, 3, 4, 4, // first 2x2 filter + -1, -1, 1, 1, -1, -1, 1, 1, // second 2x2 filter + -1, -1, -1, -1, 1, 1, 1, 1 // third 2x2 filter + }); + m.SetBias({1, 2, 3}); + + m.Invoke(); + + EXPECT_THAT(m.GetOutput(), ElementsAreArray({ + 18, 2, 5, // first batch, left + 18, 2, 5, // first batch, right + 17, 4, 3, // second batch, left + 37, 4, 3, // second batch, right + })); +} + +TEST_P(ConvolutionOpTest, InputAndFilterSameWidthHeight) { + ConvolutionOpModel m(GetRegistration(), {TensorType_FLOAT32, {2, 2, 4, 1}}, + {TensorType_FLOAT32, {1, 2, 4, 1}}, + {TensorType_FLOAT32, {}}); + + m.SetInput({ + // First batch + 1, 1, 1, 1, // row = 1 + 2, 2, 2, 2, // row = 2 + // Second batch + 1, 2, 3, 4, // row = 1 + 1, 2, 3, 4, // row = 2 + }); + m.SetFilter({ + 1, 2, 3, 4, // row = 1 + -1, -1, 1, 1, // row = 2 + }); + m.SetBias({0}); + + m.Invoke(); + + EXPECT_THAT(m.GetOutput(), ElementsAreArray({10, 34})); +} + +TEST_P(ConvolutionOpTest, PointwiseFloat32) { + ConvolutionOpModel m(GetRegistration(), {TensorType_FLOAT32, {2, 2, 4, 2}}, + {TensorType_FLOAT32, {1, 1, 1, 2}}, + {TensorType_FLOAT32, {}}, 1, 1); + + m.SetInput({ + // First batch + 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, // row = 1 + 1, 1, 1, 1, 1, 1, 1, 1, // row = 2 + // Second batch + 0.5, 0.5, 1, 1, 1.5, 1.5, 2, 2, // row = 1 + 0.5, 0.5, 1, 1, 1.5, 1.5, 2, 2 // row = 2 + }); + + m.SetFilter({ + 1, 2, // first filter + }); + m.SetBias({0}); + + m.Invoke(); + + EXPECT_THAT(m.GetOutput(), ElementsAreArray({ + // First batch + 1.5, 1.5, 1.5, 1.5, // row = 1 + 3., 3., 3., 3., // row = 2 + // Second batch + 1.5, 3., 4.5, 6., // row = 1 + 1.5, 3., 4.5, 6., // row = 2 + })); +} + +// TODO(alanchiao): this passes locally, but fails on continuous build system. +// Re-enable when root cause found. 
+TEST_P(ConvolutionOpTest, DISABLED_PointwiseMultifilterFloat32) { + ConvolutionOpModel m(GetRegistration(), {TensorType_FLOAT32, {2, 2, 4, 2}}, + {TensorType_FLOAT32, {2, 1, 1, 2}}, + {TensorType_FLOAT32, {}}, 1, 1); + + m.SetInput({ + // First batch + 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, // row = 1 + 1, 1, 1, 1, 1, 1, 1, 1, // row = 2 + // Second batch + 0.5, 0.5, 1, 1, 1.5, 1.5, 2, 2, // row = 1 + 0.5, 0.5, 1, 1, 1.5, 1.5, 2, 2 // row = 2 + }); + + m.SetFilter({ + 1, 2, // first filter + 2, 3, // second filter + }); + m.SetBias({0}); + + m.Invoke(); + + EXPECT_THAT(m.GetOutput(), + ElementsAreArray({ + 1.5, 2.5, 1.5, 2.5, 1.5, 2.5, 1.5, 2.5, 3., 5., 3., + 5., 3., 5., 3., 5., 1.5, 2.5, 3., 5., 4.5, 7.5, + 6., 10., 1.5, 2.5, 3., 5., 4.5, 7.5, 6., 10., + })); +} + TEST_P(ConvolutionOpTest, SimpleTestFloat32WithAnisotropicStrides) { ConvolutionOpModel m(GetRegistration(), {TensorType_FLOAT32, {1, 3, 6, 1}}, {TensorType_FLOAT32, {1, 2, 2, 1}}, @@ -376,6 +492,65 @@ TEST_P(ConvolutionOpTest, HandCalculatedValidFloat32) { EXPECT_THAT(m.GetOutput(), ElementsAreArray({312, 357})); } +TEST_P(ConvolutionOpTest, SimpleTestFloatWithDilation) { + const int depth = 1; + const int image_width = 9; + const int image_height = 9; + const int image_batch_count = 1; + const int filter_size = 3; + const int filter_count = 1; + const int stride_width = 1; + const int stride_height = 1; + const int dilation_width_factor = 3; + const int dilation_height_factor = 3; + const Padding padding = Padding_VALID; + ConvolutionOpModel m( + GetRegistration(), + {TensorType_FLOAT32, + {image_batch_count, image_height, image_width, depth}}, + {TensorType_FLOAT32, {depth, filter_size, filter_size, filter_count}}, + {TensorType_FLOAT32, {}}, stride_width, stride_height, padding, + ActivationFunctionType_NONE, dilation_width_factor, + dilation_height_factor); + + // The image matrix is: + // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | + // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | + // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | + // | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | + // | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | + // | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | + // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | + // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | + // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | + // clang-format off + m.SetInput({0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 1, 1, 1, 0, 0, 0, + 0, 0, 0, 1, 1, 1, 0, 0, 0, + 0, 0, 0, 1, 1, 1, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0}); + // clang-format on + // The filter matrix is: + // | 1 | 2 | 3 | + // | 4 | 5 | 6 | + // | 7 | 8 | 9 | + m.SetFilter({1, 2, 3, 4, 5, 6, 7, 8, 9}); + // No bias for this test. + m.SetBias({0}); + m.Invoke(); + + // Since the dilation rate is 3 this will reduce the size of the output from + // 10x10 to 3x3 of all 5s. 
Specifically: + // | 5 | 5 | 5 | + // | 5 | 5 | 5 | + // | 5 | 5 | 5 | + EXPECT_THAT(m.GetOutput(), ElementsAreArray({5, 5, 5, 5, 5, 5, 5, 5, 5})); +} + class QuantizedConvolutionOpModel : public BaseConvolutionOpModel { public: using BaseConvolutionOpModel::BaseConvolutionOpModel; @@ -441,6 +616,44 @@ TEST_P(ConvolutionOpTest, SimpleTestQuantized) { })); } +TEST_P(ConvolutionOpTest, SimpleTestQuantizedOutputMultiplierGreaterThan1) { + // output_multiplier = 1.0118 + QuantizedConvolutionOpModel quant_op( + GetRegistration(), {TensorType_UINT8, {2, 2, 4, 1}, -128.5, 128}, + {TensorType_UINT8, {3, 2, 2, 1}, -128.5, 128}, + {TensorType_UINT8, {}, -127, 128}); + ConvolutionOpModel float_op( + GetRegistration(), {TensorType_FLOAT32, {2, 2, 4, 1}}, + {TensorType_FLOAT32, {3, 2, 2, 1}}, {TensorType_FLOAT32, {}}); + std::initializer_list<float> input = { + // First batch + 1, 1, 1, 1, // row = 1 + 2, 2, 2, 2, // row = 2 + // Second batch + 1, 2, 3, 4, // row = 1 + 1, 2, 3, 4, // row = 2 + }; + std::initializer_list<float> filter = { + 1, 2, 3, 4, // first 2x2 filter + -1, 1, -1, 1, // second 2x2 filter + -1, -1, 1, 1, // third 2x2 filter + }; + std::initializer_list<float> bias = {1, 2, 3}; + + quant_op.SetInput(input); + quant_op.SetFilter(filter); + quant_op.SetBias(bias); + quant_op.Invoke(); + + float_op.SetInput(input); + float_op.SetFilter(filter); + float_op.SetBias(bias); + float_op.Invoke(); + + EXPECT_THAT(quant_op.GetDequantizedOutput(), + ElementsAreArray(ArrayFloatNear(float_op.GetOutput(), 1))); +} + TEST_P(ConvolutionOpTest, SimpleTestQuantizedWithAnisotropicStrides) { QuantizedConvolutionOpModel m(GetRegistration(), {TensorType_UINT8, {1, 3, 6, 1}, -63.5, 64}, @@ -468,6 +681,257 @@ TEST_P(ConvolutionOpTest, SimpleTestQuantizedWithAnisotropicStrides) { })); } +TEST_P(ConvolutionOpTest, SimpleTestQuantizedWithDilation) { + const int depth = 1; + const int image_width = 9; + const int image_height = 9; + const int image_batch_count = 1; + const int filter_size = 3; + const int filter_count = 1; + const int stride_width = 1; + const int stride_height = 1; + const int dilation_width_factor = 3; + const int dilation_height_factor = 3; + const Padding padding = Padding_VALID; + QuantizedConvolutionOpModel m( + GetRegistration(), + {TensorType_UINT8, + {image_batch_count, image_height, image_width, depth}, + 0, + 255}, + {TensorType_UINT8, + {depth, filter_size, filter_size, filter_count}, + 0, + 255}, + {TensorType_UINT8, {}, 0, 255}, stride_width, stride_height, padding, + ActivationFunctionType_NONE, dilation_width_factor, + dilation_height_factor); + + // The image matrix is: + // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | + // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | + // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | + // | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | + // | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | + // | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | + // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | + // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | + // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | + // clang-format off + m.SetInput({0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 1, 1, 1, 0, 0, 0, + 0, 0, 0, 1, 1, 1, 0, 0, 0, + 0, 0, 0, 1, 1, 1, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0}); + // clang-format on + // The filter matrix is: + // | 1 | 2 | 3 | + // | 4 | 5 | 6 | + // | 7 | 8 | 9 | + m.SetFilter({1, 2, 3, 4, 5, 6, 7, 8, 9}); + // No bias for this test. 
+ m.SetBias({0}); + m.Invoke(); + + // Since the dilation rate is 3 this will reduce the size of the output from + // 10x10 to 3x3 of all 5s. Specifically: + // | 5 | 5 | 5 | + // | 5 | 5 | 5 | + // | 5 | 5 | 5 | + EXPECT_THAT(m.GetDequantizedOutput(), + ElementsAreArray({5, 5, 5, 5, 5, 5, 5, 5, 5})); +} + +class HybridConvolutionOpModel : public BaseConvolutionOpModel { + public: + using BaseConvolutionOpModel::BaseConvolutionOpModel; + + void SetInput(std::initializer_list<float> data) { + PopulateTensor(input_, data); + } + + void SetFilter(std::initializer_list<float> f) { + SymmetricQuantizeAndPopulate(filter_, f); + } + + void SetBias(std::initializer_list<float> data) { + PopulateTensor(bias_, data); + } + + std::vector<float> GetOutput() { return ExtractVector<float>(output_); } +}; + +TEST_P(ConvolutionOpTest, SimpleTestHybrid) { + HybridConvolutionOpModel m( + GetRegistration(), {TensorType_FLOAT32, {2, 2, 4, 1}}, + {TensorType_UINT8, {3, 2, 2, 1}}, {TensorType_FLOAT32, {}}); + + m.SetInput({ + // First batch + 1, 1, 1, 1, // row = 1 + 2, 2, 2, 2, // row = 2 + // Second batch + 1, 2, 3, 4, // row = 1 + 1, 2, 3, 4, // row = 2 + }); + m.SetFilter({ + 1, 2, 3, 4, // first 2x2 filter + -1, 1, -1, 1, // second 2x2 filter + -1, -1, 1, 1, // third 2x2 filter + }); + m.SetBias({1, 2, 3}); + + m.Invoke(); + + // Example: we get 17.1577 instead of 17. + // + // Second batch: + // 1 2 3 4 -> 32 64 95 127 with scale factor 127/4. + // 1 2 3 4 32 64 95 127 + // + // First filter: + // 1 2 -> 32 64 with scale factor of 127/4. + // 3 4 95 127 + // + // The left half of the input gives us 16288. Multiply by (4/127)^2 for + // dequantization and adding 1 for the bias gives us the result. and adding + // the bias gives us the result. + // + // The optimized kernel converts the input into this matrix via Im2Col + // + // 1 1 2 2 + // 1 1 2 2 + // 1 2 1 2 + // 3 4 3 4 + // + // and multiplies it with the filter directly. + EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear( + { + 18, 2, 5, // first batch, left + 18, 2, 5, // first batch, right + 17, 4, 3, // second batch, left + 37, 4, 3, // second batch, right + }, + 0.16))); +} + +// This test's output is equivalent to the SimpleTestHybrid +// because we break each input into two channels, each with half of the value, +// while keeping the filters for each channel equivalent. +// +// 2 * (A/2) * B = A * B, where the left side is this new test. 
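The comment block in SimpleTestHybrid above walks through the hybrid arithmetic: the float input is symmetrically quantized per batch (as EvalHybrid does earlier in this diff), the integer dot product is taken against the already-quantized filter, and the accumulator is rescaled by the product of the two scales, which is why results land near but not exactly on the float values. A minimal standalone sketch of that flow, using the second-batch row from the test as hypothetical data (not the tensor_utils implementation):

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  const std::vector<float> input = {1.f, 2.f, 3.f, 4.f};
  const std::vector<int8_t> filter = {32, 64, 95, 127};  // filter already quantized
  const float filter_scale = 4.f / 127.f;

  // Symmetric quantization of the input row: scale is max-abs / 127.
  float max_abs = 0.f;
  for (float v : input) max_abs = std::max(max_abs, std::fabs(v));
  const float input_scale = max_abs / 127.f;
  std::vector<int8_t> quantized(input.size());
  for (size_t i = 0; i < input.size(); ++i) {
    quantized[i] = static_cast<int8_t>(std::round(input[i] / input_scale));
  }

  // Integer accumulation, then rescale by both scales (the kernel also adds bias).
  int32_t acc = 0;
  for (size_t i = 0; i < input.size(); ++i) acc += quantized[i] * filter[i];
  const float result = acc * input_scale * filter_scale;
  std::printf("%f\n", result);  // about 30.03, versus exactly 30 in float
  return 0;
}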
+TEST_P(ConvolutionOpTest, SimpleTestHybridWithChannels) { + HybridConvolutionOpModel m( + GetRegistration(), {TensorType_FLOAT32, {2, 2, 4, 2}}, + {TensorType_UINT8, {3, 2, 2, 2}}, {TensorType_FLOAT32, {}}); + + m.SetInput({ + // First batch + 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, // row = 1 + 1, 1, 1, 1, 1, 1, 1, 1, // row = 2 + // Second batch + 0.5, 0.5, 1, 1, 1.5, 1.5, 2, 2, // row = 1 + 0.5, 0.5, 1, 1, 1.5, 1.5, 2, 2 // row = 2 + }); + m.SetFilter({ + 1, 1, 2, 2, 3, 3, 4, 4, // first 2x2 filter + -1, -1, 1, 1, -1, -1, 1, 1, // second 2x2 filter + -1, -1, -1, -1, 1, 1, 1, 1 // third 2x2 filter + }); + m.SetBias({1, 2, 3}); + + m.Invoke(); + + EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear( + { + 18, 2, 5, // first batch, left + 18, 2, 5, // first batch, right + 17, 4, 3, // second batch, left + 37, 4, 3, // second batch, right + }, + 0.16))); +} + +TEST_P(ConvolutionOpTest, PointwiseHybrid) { + HybridConvolutionOpModel m( + GetRegistration(), {TensorType_FLOAT32, {2, 2, 4, 2}}, + {TensorType_UINT8, {1, 1, 1, 2}}, {TensorType_FLOAT32, {}}, 1, 1); + + m.SetInput({ + // First batch + 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, // row = 1 + 1, 1, 1, 1, 1, 1, 1, 1, // row = 2 + // Second batch + 0.5, 0.5, 1, 1, 1.5, 1.5, 2, 2, // row = 1 + 0.5, 0.5, 1, 1, 1.5, 1.5, 2, 2 // row = 2 + }); + + m.SetFilter({ + 1, 2, // first filter + }); + m.SetBias({0}); + + m.Invoke(); + + // Example: we get 3.03156 instead of 3. + // + // Second batch: + // 0.5 0.5 1 1 1.5 1.5 2 2 -> 32 32 64 64 95 95 127 127 with scale factor + // 127/2. We care about the two 64's. + // + // Filter: + // 64 127 with scale factor of 127/2. + // + // (64 * 64 + 64 * 127) * (2/127)^2 gives us the expected result. + EXPECT_THAT(m.GetOutput(), + ElementsAreArray(ArrayFloatNear( + { + 1.5, 1.5, 1.5, 1.5, // first batch, row = 1 + 3., 3., 3., 3., // first batch, row = 2 + 1.5, 3., 4.5, 6., // second batch, row = 1 + 1.5, 3., 4.5, 6., // second batch, row = 2 + }, + 0.0316))); +} + +// TODO(alanchiao): this passes locally, but fails on continuous build system. +// Re-enable when root cause found. +TEST_P(ConvolutionOpTest, DISABLED_PointwiseMultifilterHybrid) { + HybridConvolutionOpModel m( + GetRegistration(), {TensorType_FLOAT32, {2, 2, 4, 2}}, + {TensorType_UINT8, {2, 1, 1, 2}}, {TensorType_FLOAT32, {}}, 1, 1); + + m.SetInput({ + // First batch + 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, // row = 1 + 1, 1, 1, 1, 1, 1, 1, 1, // row = 2 + // Second batch + 0.5, 0.5, 1, 1, 1.5, 1.5, 2, 2, // row = 1 + 0.5, 0.5, 1, 1, 1.5, 1.5, 2, 2 // row = 2 + }); + + m.SetFilter({ + 1, 2, // first filter + 2, 3, // second filter + }); + m.SetBias({0}); + + m.Invoke(); + + EXPECT_THAT(m.GetOutput(), + ElementsAreArray(ArrayFloatNear( + { + 1.5, 2.5, 1.5, 2.5, 1.5, 2.5, 1.5, 2.5, 3., 5., 3., + 5., 3., 5., 3., 5., 1.5, 2.5, 3., 5., 4.5, 7.5, + 6., 10., 1.5, 2.5, 3., 5., 4.5, 7.5, 6., 10., + }, + 0.0474))); +} + INSTANTIATE_TEST_CASE_P( ConvolutionOpTest, ConvolutionOpTest, ::testing::ValuesIn(SingleOpTest::GetKernelTags(*kKernelMap))); diff --git a/tensorflow/contrib/lite/kernels/depthwise_conv.cc b/tensorflow/contrib/lite/kernels/depthwise_conv.cc index 3ad8d7d4e1..19958844a1 100644 --- a/tensorflow/contrib/lite/kernels/depthwise_conv.cc +++ b/tensorflow/contrib/lite/kernels/depthwise_conv.cc @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#include <unistd.h> #include <cassert> #include <cmath> #include <cstdio> @@ -20,8 +19,8 @@ limitations under the License. #include <iostream> #include <limits> -#include "tensorflow/contrib/lite/builtin_op_data.h" -#include "tensorflow/contrib/lite/context.h" +#include "tensorflow/contrib/lite/c/builtin_op_data.h" +#include "tensorflow/contrib/lite/c/c_api_internal.h" #include "tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_float.h" #include "tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8.h" #include "tensorflow/contrib/lite/kernels/internal/quantization_util.h" @@ -127,23 +126,28 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { // Matching GetWindowedOutputSize in TensorFlow. auto padding = params->padding; - auto compute_out_size = [padding](int imageSize, int filterSize, - int stride) -> int { + auto compute_out_size = [padding](int image_size, int filter_size, int stride, + int dilation_rate) -> int { + int effective_filter_size = (filter_size - 1) * dilation_rate + 1; return padding == kTfLitePaddingSame - ? (imageSize + stride - 1) / stride + ? (image_size + stride - 1) / stride : padding == kTfLitePaddingValid - ? (imageSize - filterSize + stride) / stride + ? (image_size - effective_filter_size + stride) / stride : 0; }; - int out_width = compute_out_size(width, filter_width, params->stride_width); + int out_width = compute_out_size(width, filter_width, params->stride_width, + params->dilation_width_factor); int out_height = - compute_out_size(height, filter_height, params->stride_height); + compute_out_size(height, filter_height, params->stride_height, + params->dilation_height_factor); - data->padding.height = ComputePadding(params->stride_height, 1, height, - filter_height, out_height); + data->padding.height = + ComputePadding(params->stride_height, params->dilation_height_factor, + height, filter_height, out_height); data->padding.width = - ComputePadding(params->stride_width, 1, width, filter_width, out_width); + ComputePadding(params->stride_width, params->dilation_width_factor, width, + filter_width, out_width); // Note that quantized inference requires that all tensors have their // parameters set. This is usually done during quantized training. 
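The Prepare() changes above thread the dilation factors through both the output-size and the padding computations, bringing depthwise convolution in line with the regular convolution kernel earlier in this diff. A standalone sketch of the two formulas, assuming ComputePadding uses the usual ((out - 1) * stride + effective_filter - in) / 2 rounding (an assumption, not a statement about padding.h), with helper names that are only illustrative:

#include <algorithm>
#include <cassert>

int EffectiveFilterSize(int filter_size, int dilation_rate) {
  return (filter_size - 1) * dilation_rate + 1;
}

// VALID-padding output size, mirroring the compute_out_size lambda above.
int ValidOutSize(int image_size, int filter_size, int stride, int dilation_rate) {
  return (image_size - EffectiveFilterSize(filter_size, dilation_rate) + stride) /
         stride;
}

// Assumed per-side SAME padding (see note above).
int SamePadding(int stride, int dilation_rate, int in_size, int filter_size,
                int out_size) {
  const int padding =
      ((out_size - 1) * stride +
       EffectiveFilterSize(filter_size, dilation_rate) - in_size) / 2;
  return std::max(0, padding);
}

int main() {
  // 9-wide input, 3-wide filter, stride 1, dilation 3: the effective filter is
  // 7 wide, so a VALID output is 3 wide, matching the dilation tests in this diff.
  assert(ValidOutSize(9, 3, 1, 3) == 3);
  // Keeping the output 9 wide (SAME) then needs 3 pixels of padding per side.
  assert(SamePadding(1, 3, 9, 3, 9) == 3);
  return 0;
}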
@@ -151,8 +155,9 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { double real_multiplier = 0.0; TF_LITE_ENSURE_STATUS(GetQuantizedConvolutionMultipler( context, input, filter, bias, output, &real_multiplier)); - QuantizeMultiplierSmallerThanOne(real_multiplier, &data->output_multiplier, - &data->output_shift); + int exponent; + QuantizeMultiplier(real_multiplier, &data->output_multiplier, &exponent); + data->output_shift = -exponent; CalculateActivationRangeUint8(params->activation, output, &data->output_activation_min, &data->output_activation_max); @@ -172,25 +177,34 @@ void EvalFloat(TfLiteContext* context, TfLiteNode* node, const TfLiteTensor* input, const TfLiteTensor* filter, const TfLiteTensor* bias, TfLiteTensor* output) { float output_activation_min, output_activation_max; - CalculateActivationRangeFloat(params->activation, &output_activation_min, - &output_activation_max); + CalculateActivationRange(params->activation, &output_activation_min, + &output_activation_max); - void (*depthwise_conv)(const float*, const Dims<4>&, const float*, - const Dims<4>&, const float*, const Dims<4>&, int, int, - int, int, int, float, float, float*, const Dims<4>&); + void (*depthwise_conv)(const DepthwiseParams&, const RuntimeShape&, + const float*, const RuntimeShape&, const float*, + const RuntimeShape&, const float*, const RuntimeShape&, + float*); if (kernel_type == kReference) { depthwise_conv = &reference_ops::DepthwiseConv; } else { depthwise_conv = &optimized_ops::DepthwiseConv; } - depthwise_conv( - GetTensorData<float>(input), GetTensorDims(input), - GetTensorData<float>(filter), GetTensorDims(filter), - GetTensorData<float>(bias), GetTensorDims(bias), params->stride_width, - params->stride_height, data->padding.width, data->padding.height, - params->depth_multiplier, output_activation_min, output_activation_max, - GetTensorData<float>(output), GetTensorDims(output)); + DepthwiseParams op_params; + op_params.padding_type = PaddingType::kSame; + op_params.padding_values.width = data->padding.width; + op_params.padding_values.height = data->padding.height; + op_params.stride_width = params->stride_width; + op_params.stride_height = params->stride_height; + op_params.dilation_width_factor = params->dilation_width_factor; + op_params.dilation_height_factor = params->dilation_height_factor; + op_params.depth_multiplier = params->depth_multiplier; + op_params.float_activation_min = output_activation_min; + op_params.float_activation_max = output_activation_max; + depthwise_conv(op_params, GetTensorShape(input), GetTensorData<float>(input), + GetTensorShape(filter), GetTensorData<float>(filter), + GetTensorShape(bias), GetTensorData<float>(bias), + GetTensorShape(output), GetTensorData<float>(output)); } template <KernelType kernel_type> @@ -202,25 +216,38 @@ void EvalQuantized(TfLiteContext* context, TfLiteNode* node, auto filter_offset = -filter->params.zero_point; auto output_offset = output->params.zero_point; - void (*depthwise_conv)(const uint8*, const Dims<4>&, int32, const uint8*, - const Dims<4>&, int32, const int32*, const Dims<4>&, - int, int, int, int, int, int32, int32, int, int32, - int32, uint8*, const Dims<4>&); + void (*depthwise_conv)(const DepthwiseParams&, const RuntimeShape&, + const uint8*, const RuntimeShape&, const uint8*, + const RuntimeShape&, const int32*, const RuntimeShape&, + uint8*); + if (kernel_type == kReference) { depthwise_conv = &reference_ops::DepthwiseConv; } else { depthwise_conv = &optimized_ops::DepthwiseConv; } - depthwise_conv( - 
GetTensorData<uint8_t>(input), GetTensorDims(input), input_offset, - GetTensorData<uint8_t>(filter), GetTensorDims(filter), filter_offset, - GetTensorData<int32_t>(bias), GetTensorDims(bias), params->stride_width, - params->stride_height, data->padding.width, data->padding.height, - params->depth_multiplier, output_offset, data->output_multiplier, - data->output_shift, data->output_activation_min, - data->output_activation_max, GetTensorData<uint8_t>(output), - GetTensorDims(output)); + DepthwiseParams op_params; + op_params.padding_type = PaddingType::kSame; + op_params.padding_values.width = data->padding.width; + op_params.padding_values.height = data->padding.height; + op_params.stride_width = params->stride_width; + op_params.stride_height = params->stride_height; + op_params.dilation_width_factor = params->dilation_width_factor; + op_params.dilation_height_factor = params->dilation_height_factor; + op_params.depth_multiplier = params->depth_multiplier; + op_params.input_offset = input_offset; + op_params.weights_offset = filter_offset; + op_params.output_offset = output_offset; + op_params.output_multiplier = data->output_multiplier; + op_params.output_shift = -data->output_shift; + op_params.quantized_activation_min = data->output_activation_min; + op_params.quantized_activation_max = data->output_activation_max; + depthwise_conv(op_params, GetTensorShape(input), + GetTensorData<uint8_t>(input), GetTensorShape(filter), + GetTensorData<uint8_t>(filter), GetTensorShape(bias), + GetTensorData<int32_t>(bias), GetTensorShape(output), + GetTensorData<uint8_t>(output)); } template <KernelType kernel_type> @@ -247,7 +274,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { bias, output); break; default: - context->ReportError(context, "Type not currently supported."); + context->ReportError(context, "Type %d not currently supported.", + input->type); return kTfLiteError; } return kTfLiteOk; diff --git a/tensorflow/contrib/lite/kernels/depthwise_conv_test.cc b/tensorflow/contrib/lite/kernels/depthwise_conv_test.cc index 1439c8bce1..4a33a0319d 100644 --- a/tensorflow/contrib/lite/kernels/depthwise_conv_test.cc +++ b/tensorflow/contrib/lite/kernels/depthwise_conv_test.cc @@ -14,12 +14,24 @@ limitations under the License. ==============================================================================*/ #include <cstdarg> #include <gtest/gtest.h> +#include "absl/memory/memory.h" #include "tensorflow/contrib/lite/interpreter.h" #include "tensorflow/contrib/lite/kernels/register.h" #include "tensorflow/contrib/lite/kernels/test_util.h" #include "tensorflow/contrib/lite/model.h" namespace tflite { + +namespace ops { +namespace builtin { + +TfLiteRegistration* Register_DEPTHWISE_CONVOLUTION_REF(); +TfLiteRegistration* Register_DEPTHWISE_CONVOLUTION_GENERIC_OPT(); +TfLiteRegistration* Register_DEPTHWISE_CONVOLUTION_NEON_OPT(); + +} // namespace builtin +} // namespace ops + namespace { using ::testing::ElementsAreArray; @@ -28,9 +40,12 @@ class BaseDepthwiseConvolutionOpModel : public SingleOpModel { public: // TODO(ahentz): Also test different activation types, bias, padding types, // stride values. 
- BaseDepthwiseConvolutionOpModel(const TensorData& input, + BaseDepthwiseConvolutionOpModel(TfLiteRegistration* registration, + const TensorData& input, const TensorData& filter, - const TensorData& output) { + const TensorData& output, + Padding padding_type, + int dilation_factor = 1) { input_ = AddInput(input); filter_ = AddInput(filter); @@ -47,12 +62,6 @@ class BaseDepthwiseConvolutionOpModel : public SingleOpModel { } output_ = AddOutput(output); - if (input.type != TensorType_FLOAT32) { - // The following is required by quantized inference. It is the unittest's - // responsibility to make sure the output scale falls into the correct - // range. - CHECK_LT(GetScale(input_) * GetScale(filter_), GetScale(output_)); - } int input_depth = GetShape(input_)[3]; int output_depth = GetShape(filter_)[3]; @@ -61,10 +70,14 @@ class BaseDepthwiseConvolutionOpModel : public SingleOpModel { SetBuiltinOp( BuiltinOperator_DEPTHWISE_CONV_2D, BuiltinOptions_DepthwiseConv2DOptions, - CreateDepthwiseConv2DOptions(builder_, Padding_VALID, 1, 1, depth_mul, - ActivationFunctionType_NONE) + CreateDepthwiseConv2DOptions(builder_, padding_type, 1, 1, depth_mul, + ActivationFunctionType_NONE, + dilation_factor, dilation_factor) .Union()); + resolver_ = absl::make_unique<SingleOpResolver>( + BuiltinOperator_DEPTHWISE_CONV_2D, registration); + BuildInterpreter({GetShape(input_), GetShape(filter_), GetShape(bias_)}); } @@ -90,10 +103,25 @@ class DepthwiseConvolutionOpModel : public BaseDepthwiseConvolutionOpModel { std::vector<float> GetOutput() { return ExtractVector<float>(output_); } }; -TEST(DepthwiseConvolutionOpTest, SimpleTest) { - DepthwiseConvolutionOpModel m({TensorType_FLOAT32, {1, 3, 2, 2}}, +const auto kKernelMap = new std::map<string, TfLiteRegistration*>({ + {"Reference", ops::builtin::Register_DEPTHWISE_CONVOLUTION_REF()}, + {"GenericOptimized", + ops::builtin::Register_DEPTHWISE_CONVOLUTION_GENERIC_OPT()}, + {"NeonOptimized", ops::builtin::Register_DEPTHWISE_CONVOLUTION_NEON_OPT()}, +}); + +class DepthwiseConvolutionOpTest : public SingleOpTest { + protected: + const std::map<string, TfLiteRegistration*>& GetKernelMap() override { + return *kKernelMap; + } +}; + +TEST_P(DepthwiseConvolutionOpTest, SimpleTest) { + DepthwiseConvolutionOpModel m(GetRegistration(), + {TensorType_FLOAT32, {1, 3, 2, 2}}, {TensorType_FLOAT32, {1, 2, 2, 4}}, - {TensorType_FLOAT32, {}}); + {TensorType_FLOAT32, {}}, Padding_VALID); m.SetInput({ 1, 2, 7, 8, // column 1 @@ -116,6 +144,94 @@ TEST(DepthwiseConvolutionOpTest, SimpleTest) { })); } +TEST_P(DepthwiseConvolutionOpTest, SimpleDilatedTestPaddingValid) { + const int depth = 1; + const int image_width = 9; + const int image_height = 9; + const int image_batch_count = 1; + const int filter_size = 3; + const int filter_count = 1; + const int dilation_factor = 3; + DepthwiseConvolutionOpModel m( + GetRegistration(), + {TensorType_FLOAT32, + {image_batch_count, image_height, image_width, depth}}, + {TensorType_FLOAT32, {depth, filter_size, filter_size, filter_count}}, + {TensorType_FLOAT32, {}}, Padding_VALID, dilation_factor); + + // The image matrix is: + // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | + // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | + // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | + // | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | + // | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | + // | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | + // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | + // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | + // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | + // clang-format off + 
m.SetInput({0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 1, 1, 1, 0, 0, 0, + 0, 0, 0, 1, 1, 1, 0, 0, 0, + 0, 0, 0, 1, 1, 1, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0}); + // clang-format on + // The filter matrix is: + // | 1 | 2 | 3 | + // | 4 | 5 | 6 | + // | 7 | 8 | 9 | + m.SetFilter({1, 2, 3, 4, 5, 6, 7, 8, 9}); + // No bias for this test. + m.SetBias({0}); + m.Invoke(); + + // Since the dilation rate is 3 this will reduce the size of the output from + // 10x10 to 3x3 of all 5s. Specifically: + // | 5 | 5 | 5 | + // | 5 | 5 | 5 | + // | 5 | 5 | 5 | + EXPECT_THAT(m.GetOutput(), ElementsAreArray({5, 5, 5, 5, 5, 5, 5, 5, 5})); +} + +TEST_P(DepthwiseConvolutionOpTest, SimpleDilatedTestPaddingSame) { + const int depth = 1; + const int image_width = 3; + const int image_height = 3; + const int image_batch_count = 1; + const int filter_size = 2; + const int filter_count = 1; + const int dilation_factor = 2; + DepthwiseConvolutionOpModel m( + GetRegistration(), + {TensorType_FLOAT32, + {image_batch_count, image_height, image_width, depth}}, + {TensorType_FLOAT32, {depth, filter_size, filter_size, filter_count}}, + {TensorType_FLOAT32, {}}, Padding_SAME, dilation_factor); + + // The image matrix is: + // | 1 | 1 | 1 | + // | 1 | 1 | 1 | + // | 1 | 1 | 1 | + m.SetInput({1, 1, 1, 1, 1, 1, 1, 1, 1}); + // The filter matrix is: + // | 1 | 2 | + // | 3 | 4 | + m.SetFilter({1, 2, 3, 4}); + // No bias for this test. + m.SetBias({0}); + m.Invoke(); + + // Output: + // | 4 | 7 | 3 | + // | 6 |10 | 4 | + // | 2 | 3 | 1 | + EXPECT_THAT(m.GetOutput(), ElementsAreArray({4, 7, 3, 6, 10, 4, 2, 3, 1})); +} + class QuantizedDepthwiseConvolutionOpModel : public BaseDepthwiseConvolutionOpModel { public: @@ -140,13 +256,20 @@ class QuantizedDepthwiseConvolutionOpModel } }; +class QuantizedDepthwiseConvolutionOpTest : public SingleOpTest { + protected: + const std::map<string, TfLiteRegistration*>& GetKernelMap() override { + return *kKernelMap; + } +}; + // In this test we set the input and output scales so that the results match // exactly the 'non-quantized' version. 
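The min/max ranges passed to the quantized model in the test that follows are chosen so that they map onto convenient uint8 parameters. A rough sketch of that mapping, assuming the conventional asymmetric uint8 scheme used by these test helpers; the helper name below is illustrative only:

#include <cmath>
#include <cstdint>
#include <utility>

// Illustrative only: derive (scale, zero_point) from a float range so that
// real = (quantized - zero_point) * scale and `min` maps to quantized value 0.
std::pair<float, int32_t> ParamsFromMinMax(float min, float max) {
  const float scale = (max - min) / 255.0f;
  const auto zero_point = static_cast<int32_t>(std::round(-min / scale));
  return {scale, zero_point};
}

// [-63.5, 64] -> scale 0.5, zero_point 127 (input and filter ranges below)
// [-127, 128] -> scale 1.0, zero_point 127 (output range below)
// With these scales the test values are exactly representable, so the
// quantized results can match the float reference exactly, as the comment
// above notes.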
-TEST(QuantizedDepthwiseConvolutionOpTest, SimpleTestQuantized) { +TEST_P(QuantizedDepthwiseConvolutionOpTest, SimpleTestQuantized) { QuantizedDepthwiseConvolutionOpModel m( - {TensorType_UINT8, {1, 3, 2, 2}, -63.5, 64}, + GetRegistration(), {TensorType_UINT8, {1, 3, 2, 2}, -63.5, 64}, {TensorType_UINT8, {1, 2, 2, 4}, -63.5, 64}, - {TensorType_UINT8, {}, -127, 128}); + {TensorType_UINT8, {}, -127, 128}, Padding_VALID); m.SetInput({ 1, 2, 7, 8, // column 1 @@ -176,6 +299,152 @@ TEST(QuantizedDepthwiseConvolutionOpTest, SimpleTestQuantized) { })); } +TEST_P(QuantizedDepthwiseConvolutionOpTest, + SimpleTestQuantizedFilterMultiplierGreaterThan1) { + QuantizedDepthwiseConvolutionOpModel quant_op( + GetRegistration(), {TensorType_UINT8, {1, 3, 2, 2}, -63.5, 64}, + {TensorType_UINT8, {1, 2, 2, 4}, -128.5, 128}, + {TensorType_UINT8, {}, -127, 128}, Padding_VALID); + DepthwiseConvolutionOpModel float_op(GetRegistration(), + {TensorType_FLOAT32, {1, 3, 2, 2}}, + {TensorType_FLOAT32, {1, 2, 2, 4}}, + {TensorType_FLOAT32, {}}, Padding_VALID); + + std::initializer_list<float> input = { + 1, 2, 7, 8, // column 1 + 3, 4, 9, 10, // column 2 + 5, 6, 11, 12, // column 3 + }; + std::initializer_list<float> filter = { + 1, 2, 3, 4, // + -9, 10, -11, 12, // + 5, 6, 7, 8, // + 13, -14, 15, -16, // + }; + std::initializer_list<float> bias = {1, 2, 3, 4}; + + quant_op.SetInput(input); + quant_op.SetFilter(filter); + quant_op.SetBias(bias); + quant_op.Invoke(); + + float_op.SetInput(input); + float_op.SetFilter(filter); + float_op.SetBias(bias); + float_op.Invoke(); + + EXPECT_THAT(quant_op.GetDequantizedOutput(), + ElementsAreArray(ArrayFloatNear(float_op.GetOutput(), 1))); +} + +TEST_P(QuantizedDepthwiseConvolutionOpTest, SimpleDilatedTestPaddingValid) { + const int depth = 1; + const int image_width = 9; + const int image_height = 9; + const int image_batch_count = 1; + const int filter_size = 3; + const int filter_count = 1; + const int dilation_factor = 3; + QuantizedDepthwiseConvolutionOpModel m( + GetRegistration(), + {TensorType_UINT8, + {image_batch_count, image_height, image_width, depth}, + 0, + 255}, + {TensorType_UINT8, + {depth, filter_size, filter_size, filter_count}, + 0, + 255}, + {TensorType_UINT8, {}, 0, 255}, Padding_VALID, dilation_factor); + + // The image matrix is: + // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | + // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | + // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | + // | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | + // | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | + // | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | + // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | + // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | + // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | + // clang-format off + m.SetInput({0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 1, 1, 1, 0, 0, 0, + 0, 0, 0, 1, 1, 1, 0, 0, 0, + 0, 0, 0, 1, 1, 1, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0}); + // clang-format on + // The filter matrix is: + // | 1 | 2 | 3 | + // | 4 | 5 | 6 | + // | 7 | 8 | 9 | + m.SetFilter({1, 2, 3, 4, 5, 6, 7, 8, 9}); + // No bias for this test. + m.SetBias({0}); + m.Invoke(); + + // Since the dilation rate is 3 this will reduce the size of the output from + // 10x10 to 3x3 of all 5s. 
Specifically: + // | 5 | 5 | 5 | + // | 5 | 5 | 5 | + // | 5 | 5 | 5 | + EXPECT_THAT(m.GetDequantizedOutput(), + ElementsAreArray({5, 5, 5, 5, 5, 5, 5, 5, 5})); +} + +TEST_P(QuantizedDepthwiseConvolutionOpTest, SimpleDilatedTestPaddingSame) { + const int depth = 1; + const int image_width = 3; + const int image_height = 3; + const int image_batch_count = 1; + const int filter_size = 2; + const int filter_count = 1; + const int dilation_factor = 2; + QuantizedDepthwiseConvolutionOpModel m( + GetRegistration(), + {TensorType_UINT8, + {image_batch_count, image_height, image_width, depth}, + 0, + 255}, + {TensorType_UINT8, + {depth, filter_size, filter_size, filter_count}, + 0, + 255}, + {TensorType_UINT8, {}, 0, 255}, Padding_SAME, dilation_factor); + + // The image matrix is: + // | 1 | 1 | 1 | + // | 1 | 1 | 1 | + // | 1 | 1 | 1 | + m.SetInput({1, 1, 1, 1, 1, 1, 1, 1, 1}); + // The filter matrix is: + // | 1 | 2 | + // | 3 | 4 | + m.SetFilter({1, 2, 3, 4}); + // No bias for this test. + m.SetBias({0}); + m.Invoke(); + + // Output: + // | 4 | 7 | 3 | + // | 6 |10 | 4 | + // | 2 | 3 | 1 | + EXPECT_THAT(m.GetDequantizedOutput(), + ElementsAreArray({4, 7, 3, 6, 10, 4, 2, 3, 1})); +} + +INSTANTIATE_TEST_CASE_P( + DepthwiseConvolutionOpTest, DepthwiseConvolutionOpTest, + ::testing::ValuesIn(SingleOpTest::GetKernelTags(*kKernelMap))); + +INSTANTIATE_TEST_CASE_P( + QuantizedDepthwiseConvolutionOpTest, QuantizedDepthwiseConvolutionOpTest, + ::testing::ValuesIn(SingleOpTest::GetKernelTags(*kKernelMap))); + } // namespace } // namespace tflite diff --git a/tensorflow/contrib/lite/kernels/dequantize.cc b/tensorflow/contrib/lite/kernels/dequantize.cc index 672b2170e4..59bf64e0af 100644 --- a/tensorflow/contrib/lite/kernels/dequantize.cc +++ b/tensorflow/contrib/lite/kernels/dequantize.cc @@ -15,8 +15,8 @@ limitations under the License. #include <string.h> #include <vector> -#include "tensorflow/contrib/lite/builtin_op_data.h" -#include "tensorflow/contrib/lite/context.h" +#include "tensorflow/contrib/lite/c/builtin_op_data.h" +#include "tensorflow/contrib/lite/c/c_api_internal.h" #include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h" #include "tensorflow/contrib/lite/kernels/internal/tensor.h" #include "tensorflow/contrib/lite/kernels/kernel_util.h" @@ -36,6 +36,21 @@ struct OpContext { TfLiteTensor* output; }; +struct OpData { + // This boolean value is only used when the input tensor is constant. + bool float_dequantized_weights_initialized; +}; + +void* Init(TfLiteContext* context, const char* buffer, size_t length) { + auto* op_data = new OpData(); + op_data->float_dequantized_weights_initialized = false; + return op_data; +} + +void Free(TfLiteContext* context, void* buffer) { + delete reinterpret_cast<OpData*>(buffer); +} + TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE_EQ(context, NumInputs(node), 1); TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); @@ -45,28 +60,43 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE(context, op_context.input->type == kTfLiteUInt8); op_context.output->type = kTfLiteFloat32; + // If the input tensor is constant, we can persist the dequantized value in + // the output tensor. Otherwise we run dequantize upon each eval. 
+ if (IsConstantTensor(op_context.input)) { + op_context.output->allocation_type = kTfLiteArenaRwPersistent; + } return context->ResizeTensor(context, op_context.output, TfLiteIntArrayCopy(op_context.input->dims)); } TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { + OpData* op_data = reinterpret_cast<OpData*>(node->user_data); OpContext op_context(context, node); + if (IsConstantTensor(op_context.input) && + op_data->float_dequantized_weights_initialized) { + return kTfLiteOk; + } - auto zero_point = op_context.input->params.zero_point; - auto scale = op_context.input->params.scale; + tflite::DequantizationParams op_params; + op_params.zero_point = op_context.input->params.zero_point; + op_params.scale = op_context.input->params.scale; + optimized_ops::Dequantize(op_params, GetTensorShape(op_context.input), + GetTensorData<uint8_t>(op_context.input), + GetTensorShape(op_context.output), + GetTensorData<float>(op_context.output)); + + if (IsConstantTensor(op_context.input)) { + op_data->float_dequantized_weights_initialized = true; + } - optimized_ops::Dequantize(GetTensorData<uint8_t>(op_context.input), - GetTensorDims(op_context.input), zero_point, scale, - GetTensorData<float>(op_context.output), - GetTensorDims(op_context.output)); return kTfLiteOk; } } // namespace dequantize TfLiteRegistration* Register_DEQUANTIZE_OPT() { - static TfLiteRegistration r = {nullptr, nullptr, dequantize::Prepare, - dequantize::Eval}; + static TfLiteRegistration r = {dequantize::Init, dequantize::Free, + dequantize::Prepare, dequantize::Eval}; return &r; } diff --git a/tensorflow/contrib/lite/kernels/detection_postprocess.cc b/tensorflow/contrib/lite/kernels/detection_postprocess.cc new file mode 100644 index 0000000000..e21dc5ced9 --- /dev/null +++ b/tensorflow/contrib/lite/kernels/detection_postprocess.cc @@ -0,0 +1,591 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include <string.h> +#include <numeric> +#include <vector> +#include "flatbuffers/flexbuffers.h" // TF:flatbuffers +#include "tensorflow/contrib/lite/c/builtin_op_data.h" +#include "tensorflow/contrib/lite/c/c_api_internal.h" +#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h" +#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h" +#include "tensorflow/contrib/lite/kernels/internal/tensor.h" +#include "tensorflow/contrib/lite/kernels/kernel_util.h" +#include "tensorflow/contrib/lite/kernels/op_macros.h" + +namespace tflite { +namespace ops { +namespace custom { +namespace detection_postprocess { + +// Input tensors +constexpr int kInputTensorBoxEncodings = 0; +constexpr int kInputTensorClassPredictions = 1; +constexpr int kInputTensorAnchors = 2; + +// Output tensors +constexpr int kOutputTensorDetectionBoxes = 0; +constexpr int kOutputTensorDetectionClasses = 1; +constexpr int kOutputTensorDetectionScores = 2; +constexpr int kOutputTensorNumDetections = 3; + +constexpr int kNumCoordBox = 4; +constexpr int kBatchSize = 1; + +// Object Detection model produces axis-aligned boxes in two formats: +// BoxCorner represents the upper right (xmin, ymin) and +// lower left corner (xmax, ymax). +// CenterSize represents the center (xcenter, ycenter), height and width. +// BoxCornerEncoding and CenterSizeEncoding are related as follows: +// ycenter = y / y_scale * anchor.h + anchor.y; +// xcenter = x / x_scale * anchor.w + anchor.x; +// half_h = 0.5*exp(h/ h_scale)) * anchor.h; +// half_w = 0.5*exp(w / w_scale)) * anchor.w; +// ymin = ycenter - half_h +// ymax = ycenter + half_h +// xmin = xcenter - half_w +// xmax = xcenter + half_w +struct BoxCornerEncoding { + float ymin; + float xmin; + float ymax; + float xmax; +}; + +struct CenterSizeEncoding { + float y; + float x; + float h; + float w; +}; +// We make sure that the memory allocations are contiguous with static assert. 
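The CenterSize-to-BoxCorner relation spelled out in the BoxCornerEncoding/CenterSizeEncoding comment above can be checked in isolation. Below is a small illustrative sketch; the struct and function names are local to this example and not part of the op:

#include <cmath>

struct Box { float ymin, xmin, ymax, xmax; };

// Decode one CenterSize-encoded box against its anchor, following the
// formulas in the comment above.
Box DecodeBox(float y, float x, float h, float w,       // encoded box
              float ay, float ax, float ah, float aw,   // anchor (center-size)
              float y_scale, float x_scale, float h_scale, float w_scale) {
  const float ycenter = y / y_scale * ah + ay;
  const float xcenter = x / x_scale * aw + ax;
  const float half_h = 0.5f * std::exp(h / h_scale) * ah;
  const float half_w = 0.5f * std::exp(w / w_scale) * aw;
  return {ycenter - half_h, xcenter - half_w, ycenter + half_h, xcenter + half_w};
}

// E.g. encoding (0, 1, 0, 0) with anchor (0.5, 10.5, 1, 1) and scales
// (10, 10, 5, 5) decodes to (ymin, xmin, ymax, xmax) = (0.0, 10.1, 1.0, 11.1),
// the fifth box listed in the FloatTest of detection_postprocess_test.cc below.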
+static_assert(sizeof(BoxCornerEncoding) == sizeof(float) * kNumCoordBox, + "Size of BoxCornerEncoding is 4 float values"); +static_assert(sizeof(CenterSizeEncoding) == sizeof(float) * kNumCoordBox, + "Size of CenterSizeEncoding is 4 float values"); + +struct OpData { + int max_detections; + int max_classes_per_detection; + float non_max_suppression_score_threshold; + float intersection_over_union_threshold; + int num_classes; + CenterSizeEncoding scale_values; + // Indices of Temporary tensors + int decoded_boxes_index; + int scores_index; + int active_candidate_index; +}; + +void* Init(TfLiteContext* context, const char* buffer, size_t length) { + auto* op_data = new OpData; + const uint8_t* buffer_t = reinterpret_cast<const uint8_t*>(buffer); + const flexbuffers::Map& m = flexbuffers::GetRoot(buffer_t, length).AsMap(); + op_data->max_detections = m["max_detections"].AsInt32(); + op_data->max_classes_per_detection = m["max_classes_per_detection"].AsInt32(); + op_data->non_max_suppression_score_threshold = + m["nms_score_threshold"].AsFloat(); + op_data->intersection_over_union_threshold = m["nms_iou_threshold"].AsFloat(); + op_data->num_classes = m["num_classes"].AsInt32(); + op_data->scale_values.y = m["y_scale"].AsFloat(); + op_data->scale_values.x = m["x_scale"].AsFloat(); + op_data->scale_values.h = m["h_scale"].AsFloat(); + op_data->scale_values.w = m["w_scale"].AsFloat(); + context->AddTensors(context, 1, &op_data->decoded_boxes_index); + context->AddTensors(context, 1, &op_data->scores_index); + context->AddTensors(context, 1, &op_data->active_candidate_index); + return op_data; +} + +void Free(TfLiteContext* context, void* buffer) { + delete reinterpret_cast<OpData*>(buffer); +} + +// TODO(chowdhery): Add to kernel_util.h +TfLiteStatus SetTensorSizes(TfLiteContext* context, TfLiteTensor* tensor, + std::initializer_list<int> values) { + TfLiteIntArray* size = TfLiteIntArrayCreate(values.size()); + int index = 0; + for (int v : values) { + size->data[index] = v; + ++index; + } + return context->ResizeTensor(context, tensor, size); +} + +TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { + auto* op_data = reinterpret_cast<OpData*>(node->user_data); + // Inputs: box_encodings, scores, anchors + TF_LITE_ENSURE_EQ(context, NumInputs(node), 3); + const TfLiteTensor* input_box_encodings = + GetInput(context, node, kInputTensorBoxEncodings); + const TfLiteTensor* input_class_predictions = + GetInput(context, node, kInputTensorClassPredictions); + const TfLiteTensor* input_anchors = + GetInput(context, node, kInputTensorAnchors); + TF_LITE_ENSURE_EQ(context, NumDimensions(input_box_encodings), 3); + TF_LITE_ENSURE_EQ(context, NumDimensions(input_class_predictions), 3); + TF_LITE_ENSURE_EQ(context, NumDimensions(input_anchors), 2); + // number of detected boxes + const int num_detected_boxes = + op_data->max_detections * op_data->max_classes_per_detection; + + // Outputs: detection_boxes, detection_scores, detection_classes, + // num_detections + TF_LITE_ENSURE_EQ(context, NumOutputs(node), 4); + // Output Tensor detection_boxes: size is set to (1, num_detected_boxes, 4) + TfLiteTensor* detection_boxes = + GetOutput(context, node, kOutputTensorDetectionBoxes); + detection_boxes->type = kTfLiteFloat32; + SetTensorSizes(context, detection_boxes, + {kBatchSize, num_detected_boxes, kNumCoordBox}); + + // Output Tensor detection_classes: size is set to (1, num_detected_boxes) + TfLiteTensor* detection_classes = + GetOutput(context, node, kOutputTensorDetectionClasses); + 
detection_classes->type = kTfLiteFloat32; + SetTensorSizes(context, detection_classes, {kBatchSize, num_detected_boxes}); + + // Output Tensor detection_scores: size is set to (1, num_detected_boxes) + TfLiteTensor* detection_scores = + GetOutput(context, node, kOutputTensorDetectionScores); + detection_scores->type = kTfLiteFloat32; + SetTensorSizes(context, detection_scores, {kBatchSize, num_detected_boxes}); + + // Output Tensor num_detections: size is set to 1 + TfLiteTensor* num_detections = + GetOutput(context, node, kOutputTensorNumDetections); + num_detections->type = kTfLiteFloat32; + // TODO (chowdhery): Make it a scalar when available + SetTensorSizes(context, num_detections, {1}); + + // Temporary tensors + TfLiteIntArrayFree(node->temporaries); + node->temporaries = TfLiteIntArrayCreate(3); + node->temporaries->data[0] = op_data->decoded_boxes_index; + node->temporaries->data[1] = op_data->scores_index; + node->temporaries->data[2] = op_data->active_candidate_index; + + // decoded_boxes + TfLiteTensor* decoded_boxes = &context->tensors[op_data->decoded_boxes_index]; + decoded_boxes->type = kTfLiteFloat32; + decoded_boxes->allocation_type = kTfLiteArenaRw; + SetTensorSizes(context, decoded_boxes, + {input_box_encodings->dims->data[1], kNumCoordBox}); + + // scores + TfLiteTensor* scores = &context->tensors[op_data->scores_index]; + scores->type = kTfLiteFloat32; + scores->allocation_type = kTfLiteArenaRw; + SetTensorSizes(context, scores, + {input_class_predictions->dims->data[1], + input_class_predictions->dims->data[2]}); + + // active_candidate + TfLiteTensor* active_candidate = + &context->tensors[op_data->active_candidate_index]; + active_candidate->type = kTfLiteUInt8; + active_candidate->allocation_type = kTfLiteArenaRw; + SetTensorSizes(context, active_candidate, + {input_box_encodings->dims->data[1]}); + + return kTfLiteOk; +} + +class Dequantizer { + public: + Dequantizer(int zero_point, float scale) + : zero_point_(zero_point), scale_(scale) {} + float operator()(uint8 x) { + return (static_cast<float>(x) - zero_point_) * scale_; + } + + private: + int zero_point_; + float scale_; +}; + +void DequantizeBoxEncodings(const TfLiteTensor* input_box_encodings, int idx, + float quant_zero_point, float quant_scale, + CenterSizeEncoding* box_centersize) { + const uint8* boxes = + GetTensorData<uint8>(input_box_encodings) + kNumCoordBox * idx; + Dequantizer dequantize(quant_zero_point, quant_scale); + box_centersize->y = dequantize(boxes[0]); + box_centersize->x = dequantize(boxes[1]); + box_centersize->h = dequantize(boxes[2]); + box_centersize->w = dequantize(boxes[3]); +} + +template <class T> +T ReInterpretTensor(const TfLiteTensor* tensor) { + // TODO (chowdhery): check float + const float* tensor_base = tensor->data.f; + return reinterpret_cast<T>(tensor_base); +} + +template <class T> +T ReInterpretTensor(TfLiteTensor* tensor) { + // TODO (chowdhery): check float + float* tensor_base = tensor->data.f; + return reinterpret_cast<T>(tensor_base); +} + +TfLiteStatus DecodeCenterSizeBoxes(TfLiteContext* context, TfLiteNode* node, + OpData* op_data) { + // Parse input tensor boxencodings + const TfLiteTensor* input_box_encodings = + GetInput(context, node, kInputTensorBoxEncodings); + TF_LITE_ENSURE_EQ(context, input_box_encodings->dims->data[0], kBatchSize); + const int num_boxes = input_box_encodings->dims->data[1]; + TF_LITE_ENSURE_EQ(context, input_box_encodings->dims->data[2], kNumCoordBox); + const TfLiteTensor* input_anchors = + GetInput(context, node, 
kInputTensorAnchors); + + // Decode the boxes to get (ymin, xmin, ymax, xmax) based on the anchors + CenterSizeEncoding box_centersize; + CenterSizeEncoding scale_values = op_data->scale_values; + CenterSizeEncoding anchor; + for (int idx = 0; idx < num_boxes; ++idx) { + switch (input_box_encodings->type) { + // Quantized + case kTfLiteUInt8: + DequantizeBoxEncodings( + input_box_encodings, idx, + static_cast<float>(input_box_encodings->params.zero_point), + static_cast<float>(input_box_encodings->params.scale), + &box_centersize); + DequantizeBoxEncodings( + input_anchors, idx, + static_cast<float>(input_anchors->params.zero_point), + static_cast<float>(input_anchors->params.scale), &anchor); + break; + // Float + case kTfLiteFloat32: + box_centersize = ReInterpretTensor<const CenterSizeEncoding*>( + input_box_encodings)[idx]; + anchor = + ReInterpretTensor<const CenterSizeEncoding*>(input_anchors)[idx]; + break; + default: + // Unsupported type. + return kTfLiteError; + } + + float ycenter = box_centersize.y / scale_values.y * anchor.h + anchor.y; + float xcenter = box_centersize.x / scale_values.x * anchor.w + anchor.x; + float half_h = + 0.5f * static_cast<float>(std::exp(box_centersize.h / scale_values.h)) * + anchor.h; + float half_w = + 0.5f * static_cast<float>(std::exp(box_centersize.w / scale_values.w)) * + anchor.w; + TfLiteTensor* decoded_boxes = + &context->tensors[op_data->decoded_boxes_index]; + auto& box = ReInterpretTensor<BoxCornerEncoding*>(decoded_boxes)[idx]; + box.ymin = ycenter - half_h; + box.xmin = xcenter - half_w; + box.ymax = ycenter + half_h; + box.xmax = xcenter + half_w; + } + return kTfLiteOk; +} + +void DecreasingPartialArgSort(const float* values, int num_values, + int num_to_sort, int* indices) { + std::iota(indices, indices + num_values, 0); + std::partial_sort( + indices, indices + num_to_sort, indices + num_values, + [&values](const int i, const int j) { return values[i] > values[j]; }); +} + +void SelectDetectionsAboveScoreThreshold(const std::vector<float>& values, + const float threshold, + std::vector<float>* keep_values, + std::vector<int>* keep_indices) { + for (int i = 0; i < values.size(); i++) { + if (values[i] >= threshold) { + keep_values->emplace_back(values[i]); + keep_indices->emplace_back(i); + } + } +} + +bool ValidateBoxes(const TfLiteTensor* decoded_boxes, const int num_boxes) { + for (int i = 0; i < num_boxes; ++i) { + // ymax>=ymin, xmax>=xmin + auto& box = ReInterpretTensor<const BoxCornerEncoding*>(decoded_boxes)[i]; + if (box.ymin >= box.ymax || box.xmin >= box.xmax) { + return false; + } + } + return true; +} + +float ComputeIntersectionOverUnion(const TfLiteTensor* decoded_boxes, + const int i, const int j) { + auto& box_i = ReInterpretTensor<const BoxCornerEncoding*>(decoded_boxes)[i]; + auto& box_j = ReInterpretTensor<const BoxCornerEncoding*>(decoded_boxes)[j]; + const float area_i = (box_i.ymax - box_i.ymin) * (box_i.xmax - box_i.xmin); + const float area_j = (box_j.ymax - box_j.ymin) * (box_j.xmax - box_j.xmin); + if (area_i <= 0 || area_j <= 0) return 0.0; + const float intersection_ymin = std::max<float>(box_i.ymin, box_j.ymin); + const float intersection_xmin = std::max<float>(box_i.xmin, box_j.xmin); + const float intersection_ymax = std::min<float>(box_i.ymax, box_j.ymax); + const float intersection_xmax = std::min<float>(box_i.xmax, box_j.xmax); + const float intersection_area = + std::max<float>(intersection_ymax - intersection_ymin, 0.0) * + std::max<float>(intersection_xmax - intersection_xmin, 0.0); + return 
intersection_area / (area_i + area_j - intersection_area); +} + +// NonMaxSuppressionSingleClass() is O(n^2) pairwise comparison between boxes +// It assumes all boxes are good in beginning and sorts based on the scores. +// If lower-scoring box has too much overlap with a higher-scoring box, +// we get rid of the lower-scoring box. +TfLiteStatus NonMaxSuppressionSingleClassHelper( + TfLiteContext* context, TfLiteNode* node, OpData* op_data, + const std::vector<float>& scores, std::vector<int>* selected) { + const TfLiteTensor* input_box_encodings = + GetInput(context, node, kInputTensorBoxEncodings); + const TfLiteTensor* decoded_boxes = + &context->tensors[op_data->decoded_boxes_index]; + const int num_boxes = input_box_encodings->dims->data[1]; + const int max_detections = op_data->max_detections; + const float non_max_suppression_score_threshold = + op_data->non_max_suppression_score_threshold; + const float intersection_over_union_threshold = + op_data->intersection_over_union_threshold; + // Maximum detections should be positive. + TF_LITE_ENSURE(context, (max_detections >= 0)); + // intersection_over_union_threshold should be positive + // and should be less than 1. + TF_LITE_ENSURE(context, (intersection_over_union_threshold > 0.0f) && + (intersection_over_union_threshold <= 1.0f)); + // Validate boxes + TF_LITE_ENSURE(context, ValidateBoxes(decoded_boxes, num_boxes)); + + // threshold scores + std::vector<int> keep_indices; + // TODO (chowdhery): Remove the dynamic allocation and replace it + // with temporaries, esp for std::vector<float> + std::vector<float> keep_scores; + SelectDetectionsAboveScoreThreshold( + scores, non_max_suppression_score_threshold, &keep_scores, &keep_indices); + + int num_scores_kept = keep_scores.size(); + std::vector<int> sorted_indices; + sorted_indices.resize(num_scores_kept); + DecreasingPartialArgSort(keep_scores.data(), num_scores_kept, num_scores_kept, + sorted_indices.data()); + + const int num_boxes_kept = num_scores_kept; + const int output_size = std::min(num_boxes_kept, max_detections); + selected->clear(); + TfLiteTensor* active_candidate = + &context->tensors[op_data->active_candidate_index]; + TF_LITE_ENSURE(context, (active_candidate->dims->data[0]) == num_boxes); + int num_active_candidate = num_boxes_kept; + uint8_t* active_box_candidate = (active_candidate->data.uint8); + for (int row = 0; row < num_boxes_kept; row++) { + active_box_candidate[row] = 1; + } + + for (int i = 0; i < num_boxes_kept; ++i) { + if (num_active_candidate == 0 || selected->size() >= output_size) break; + if (active_box_candidate[i] == 1) { + selected->push_back(keep_indices[sorted_indices[i]]); + active_box_candidate[i] = 0; + num_active_candidate--; + } else { + continue; + } + for (int j = i + 1; j < num_boxes_kept; ++j) { + if (active_box_candidate[j] == 1) { + float intersection_over_union = ComputeIntersectionOverUnion( + decoded_boxes, keep_indices[sorted_indices[i]], + keep_indices[sorted_indices[j]]); + + if (intersection_over_union > intersection_over_union_threshold) { + active_box_candidate[j] = 0; + num_active_candidate--; + } + } + } + } + return kTfLiteOk; +} + +// This function implements a fast version of Non Maximal Suppression for +// multiple classes where +// 1) we keep the top-k scores for each anchor and +// 2) during NMS, each anchor only uses the highest class score for sorting. 
+// 3) Compared to standard NMS, the worst runtime of this version is O(N^2) +// instead of O(KN^2) where N is the number of anchors and K the number of +// classes. +TfLiteStatus NonMaxSuppressionMultiClassFastHelper(TfLiteContext* context, + TfLiteNode* node, + OpData* op_data, + const float* scores) { + const TfLiteTensor* input_box_encodings = + GetInput(context, node, kInputTensorBoxEncodings); + const TfLiteTensor* decoded_boxes = + &context->tensors[op_data->decoded_boxes_index]; + + TfLiteTensor* detection_boxes = + GetOutput(context, node, kOutputTensorDetectionBoxes); + TfLiteTensor* detection_classes = + GetOutput(context, node, kOutputTensorDetectionClasses); + TfLiteTensor* detection_scores = + GetOutput(context, node, kOutputTensorDetectionScores); + TfLiteTensor* num_detections = + GetOutput(context, node, kOutputTensorNumDetections); + + const int num_boxes = input_box_encodings->dims->data[1]; + const int num_classes = op_data->num_classes; + const int max_categories_per_anchor = op_data->max_classes_per_detection; + // The row index offset is 1 if background class is included and 0 otherwise. + const int label_offset = 1; + TF_LITE_ENSURE(context, (label_offset != -1)); + TF_LITE_ENSURE(context, (max_categories_per_anchor > 0)); + const int num_classes_with_background = num_classes + label_offset; + const int num_categories_per_anchor = + std::min(max_categories_per_anchor, num_classes); + std::vector<float> max_scores; + max_scores.resize(num_boxes); + std::vector<int> sorted_class_indices; + sorted_class_indices.resize(num_boxes * num_classes); + for (int row = 0; row < num_boxes; row++) { + const float* box_scores = + scores + row * num_classes_with_background + label_offset; + int* class_indices = sorted_class_indices.data() + row * num_classes; + DecreasingPartialArgSort(box_scores, num_classes, num_categories_per_anchor, + class_indices); + max_scores[row] = box_scores[class_indices[0]]; + } + // Perform non-maximal suppression on max scores + std::vector<int> selected; + NonMaxSuppressionSingleClassHelper(context, node, op_data, max_scores, + &selected); + // Allocate output tensors + int output_box_index = 0; + for (const auto& selected_index : selected) { + const float* box_scores = + scores + selected_index * num_classes_with_background + label_offset; + const int* class_indices = + sorted_class_indices.data() + selected_index * num_classes; + + for (int col = 0; col < num_categories_per_anchor; ++col) { + int box_offset = num_categories_per_anchor * output_box_index + col; + // detection_boxes + ReInterpretTensor<BoxCornerEncoding*>(detection_boxes)[box_offset] = + ReInterpretTensor<const BoxCornerEncoding*>( + decoded_boxes)[selected_index]; + // detection_classes + detection_classes->data.f[box_offset] = class_indices[col]; + // detection_scores + detection_scores->data.f[box_offset] = box_scores[class_indices[col]]; + output_box_index++; + } + } + num_detections->data.f[0] = output_box_index; + return kTfLiteOk; +} + +void DequantizeClassPredictions(const TfLiteTensor* input_class_predictions, + const int num_boxes, + const int num_classes_with_background, + const TfLiteTensor* scores) { + float quant_zero_point = + static_cast<float>(input_class_predictions->params.zero_point); + float quant_scale = static_cast<float>(input_class_predictions->params.scale); + Dequantizer dequantize(quant_zero_point, quant_scale); + const uint8* scores_quant = GetTensorData<uint8>(input_class_predictions); + for (int idx = 0; idx < num_boxes * num_classes_with_background; 
++idx) { + scores->data.f[idx] = dequantize(scores_quant[idx]); + } +} + +TfLiteStatus NonMaxSuppressionMultiClass(TfLiteContext* context, + TfLiteNode* node, OpData* op_data) { + // Get the input tensors + const TfLiteTensor* input_box_encodings = + GetInput(context, node, kInputTensorBoxEncodings); + const TfLiteTensor* input_class_predictions = + GetInput(context, node, kInputTensorClassPredictions); + const int num_boxes = input_box_encodings->dims->data[1]; + const int num_classes = op_data->num_classes; + TF_LITE_ENSURE_EQ(context, input_class_predictions->dims->data[0], + kBatchSize); + TF_LITE_ENSURE_EQ(context, input_class_predictions->dims->data[1], num_boxes); + const int num_classes_with_background = + input_class_predictions->dims->data[2]; + + TF_LITE_ENSURE(context, (num_classes_with_background == num_classes + 1)); + + const TfLiteTensor* scores; + switch (input_class_predictions->type) { + case kTfLiteUInt8: { + TfLiteTensor* temporary_scores = &context->tensors[op_data->scores_index]; + DequantizeClassPredictions(input_class_predictions, num_boxes, + num_classes_with_background, temporary_scores); + scores = temporary_scores; + } break; + case kTfLiteFloat32: + scores = input_class_predictions; + break; + default: + // Unsupported type. + return kTfLiteError; + } + NonMaxSuppressionMultiClassFastHelper(context, node, op_data, + GetTensorData<float>(scores)); + return kTfLiteOk; +} + +TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { + // TODO(chowdhery): Generalize for any batch size + TF_LITE_ENSURE(context, (kBatchSize == 1)); + auto* op_data = reinterpret_cast<OpData*>(node->user_data); + // These two functions correspond to two blocks in the Object Detection model. + // In future, we would like to break the custom op in two blocks, which is + // currently not feasible because we would like to input quantized inputs + // and do all calculations in float. Mixed quantized/float calculations are + // currently not supported in TFLite. + + // This fills in temporary decoded_boxes + // by transforming input_box_encodings and input_anchors from + // CenterSizeEncodings to BoxCornerEncoding + DecodeCenterSizeBoxes(context, node, op_data); + // This fills in the output tensors + // by choosing effective set of decoded boxes + // based on Non Maximal Suppression, i.e. selecting + // highest scoring non-overlapping boxes. + NonMaxSuppressionMultiClass(context, node, op_data); + + return kTfLiteOk; +} +} // namespace detection_postprocess + +TfLiteRegistration* Register_DETECTION_POSTPROCESS() { + static TfLiteRegistration r = {detection_postprocess::Init, + detection_postprocess::Free, + detection_postprocess::Prepare, + detection_postprocess::Eval}; + return &r; +} + +} // namespace custom +} // namespace ops +} // namespace tflite diff --git a/tensorflow/contrib/lite/kernels/detection_postprocess_test.cc b/tensorflow/contrib/lite/kernels/detection_postprocess_test.cc new file mode 100644 index 0000000000..1e8caebd82 --- /dev/null +++ b/tensorflow/contrib/lite/kernels/detection_postprocess_test.cc @@ -0,0 +1,235 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include <functional> +#include <memory> +#include <vector> + +#include <gtest/gtest.h> +#include "flatbuffers/flexbuffers.h" // TF:flatbuffers +#include "tensorflow/contrib/lite/interpreter.h" +#include "tensorflow/contrib/lite/kernels/register.h" +#include "tensorflow/contrib/lite/kernels/test_util.h" +#include "tensorflow/contrib/lite/model.h" + +namespace tflite { +namespace ops { +namespace custom { + +TfLiteRegistration* Register_DETECTION_POSTPROCESS(); + +namespace { + +using ::testing::ElementsAre; +using ::testing::ElementsAreArray; + +class BaseDetectionPostprocessOpModel : public SingleOpModel { + public: + BaseDetectionPostprocessOpModel(const TensorData& input1, + const TensorData& input2, + const TensorData& input3, + const TensorData& output1, + const TensorData& output2, + const TensorData& output3, + const TensorData& output4) { + input1_ = AddInput(input1); + input2_ = AddInput(input2); + input3_ = AddInput(input3); + output1_ = AddOutput(output1); + output2_ = AddOutput(output2); + output3_ = AddOutput(output3); + output4_ = AddOutput(output4); + + flexbuffers::Builder fbb; + fbb.Map([&]() { + fbb.Int("max_detections", 3); + fbb.Int("max_classes_per_detection", 1); + fbb.Float("nms_score_threshold", 0.0); + fbb.Float("nms_iou_threshold", 0.5); + fbb.Int("num_classes", 2); + fbb.Float("y_scale", 10.0); + fbb.Float("x_scale", 10.0); + fbb.Float("h_scale", 5.0); + fbb.Float("w_scale", 5.0); + }); + fbb.Finish(); + SetCustomOp("TFLite_Detection_PostProcess", fbb.GetBuffer(), + Register_DETECTION_POSTPROCESS); + BuildInterpreter({GetShape(input1_), GetShape(input2_), GetShape(input3_)}); + } + + int input1() { return input1_; } + int input2() { return input2_; } + int input3() { return input3_; } + + template <class T> + void SetInput1(std::initializer_list<T> data) { + PopulateTensor<T>(input1_, data); + } + + template <class T> + void SetInput2(std::initializer_list<T> data) { + PopulateTensor<T>(input2_, data); + } + + template <class T> + void SetInput3(std::initializer_list<T> data) { + PopulateTensor<T>(input3_, data); + } + + template <class T> + std::vector<T> GetOutput1() { + return ExtractVector<T>(output1_); + } + + template <class T> + std::vector<T> GetOutput2() { + return ExtractVector<T>(output2_); + } + + template <class T> + std::vector<T> GetOutput3() { + return ExtractVector<T>(output3_); + } + + template <class T> + std::vector<T> GetOutput4() { + return ExtractVector<T>(output4_); + } + + std::vector<int> GetOutputShape1() { return GetTensorShape(output1_); } + std::vector<int> GetOutputShape2() { return GetTensorShape(output2_); } + std::vector<int> GetOutputShape3() { return GetTensorShape(output3_); } + std::vector<int> GetOutputShape4() { return GetTensorShape(output4_); } + + protected: + int input1_; + int input2_; + int input3_; + int output1_; + int output2_; + int output3_; + int output4_; +}; + +TEST(DetectionPostprocessOpTest, FloatTest) { + BaseDetectionPostprocessOpModel m( + {TensorType_FLOAT32, {1, 6, 4}}, {TensorType_FLOAT32, {1, 6, 3}}, + {TensorType_FLOAT32, {6, 
4}}, {TensorType_FLOAT32, {}}, + {TensorType_FLOAT32, {}}, {TensorType_FLOAT32, {}}, + {TensorType_FLOAT32, {}}); + + // six boxes in center-size encoding + m.SetInput1<float>({0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, + 0.0, -1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}); + // class scores - two classes with background + m.SetInput2<float>({0., .9, .8, 0., .75, .72, 0., .6, .5, 0., .93, .95, 0., + .5, .4, 0., .3, .2}); + // six anchors in center-size encoding + m.SetInput3<float>({0.5, 0.5, 1.0, 1.0, 0.5, 0.5, 1.0, 1.0, + 0.5, 0.5, 1.0, 1.0, 0.5, 10.5, 1.0, 1.0, + 0.5, 10.5, 1.0, 1.0, 0.5, 100.5, 1.0, 1.0}); + // Same boxes in box-corner encoding: + // { 0.0, 0.0, 1.0, 1.0, + // 0.0, 0.1, 1.0, 1.1, + // 0.0, -0.1, 1.0, 0.9, + // 0.0, 10.0, 1.0, 11.0, + // 0.0, 10.1, 1.0, 11.1, + // 0.0, 100.0, 1.0, 101.0} + m.Invoke(); + // detection_boxes + // in center-size + std::vector<int> output_shape1 = m.GetOutputShape1(); + EXPECT_THAT(output_shape1, ElementsAre(1, 3, 4)); + EXPECT_THAT( + m.GetOutput1<float>(), + ElementsAreArray(ArrayFloatNear( + {0.0, 10.0, 1.0, 11.0, 0.0, 0.0, 1.0, 1.0, 0.0, 100.0, 1.0, 101.0}, + 1e-1))); + // detection_classes + std::vector<int> output_shape2 = m.GetOutputShape2(); + EXPECT_THAT(output_shape2, ElementsAre(1, 3)); + EXPECT_THAT(m.GetOutput2<float>(), + ElementsAreArray(ArrayFloatNear({1, 0, 0}, 1e-1))); + // detection_scores + std::vector<int> output_shape3 = m.GetOutputShape3(); + EXPECT_THAT(output_shape3, ElementsAre(1, 3)); + EXPECT_THAT(m.GetOutput3<float>(), + ElementsAreArray(ArrayFloatNear({0.95, 0.9, 0.3}, 1e-1))); + // num_detections + std::vector<int> output_shape4 = m.GetOutputShape4(); + EXPECT_THAT(output_shape4, ElementsAre(1)); + EXPECT_THAT(m.GetOutput4<float>(), + ElementsAreArray(ArrayFloatNear({3.0}, 1e-1))); +} + +TEST(DetectionPostprocessOpTest, QuantizedTest) { + BaseDetectionPostprocessOpModel m( + {TensorType_UINT8, {1, 6, 4}, -1.0, 1.0}, + {TensorType_UINT8, {1, 6, 3}, 0.0, 1.0}, + {TensorType_UINT8, {6, 4}, 0.0, 100.5}, {TensorType_FLOAT32, {}}, + {TensorType_FLOAT32, {}}, {TensorType_FLOAT32, {}}, + {TensorType_FLOAT32, {}}); + // six boxes in center-size encoding + std::vector<std::initializer_list<float>> inputs1 = { + {0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}}; + m.QuantizeAndPopulate<uint8_t>(m.input1(), inputs1[0]); + // class scores - two classes with background + std::vector<std::initializer_list<float>> inputs2 = { + {0., .9, .8, 0., .75, .72, 0., .6, .5, 0., .93, .95, 0., .5, .4, 0., .3, + .2}}; + m.QuantizeAndPopulate<uint8_t>(m.input2(), inputs2[0]); + // six anchors in center-size encoding + std::vector<std::initializer_list<float>> inputs3 = { + {0.5, 0.5, 1.0, 1.0, 0.5, 0.5, 1.0, 1.0, 0.5, 0.5, 1.0, 1.0, + 0.5, 10.5, 1.0, 1.0, 0.5, 10.5, 1.0, 1.0, 0.5, 100.5, 1.0, 1.0}}; + m.QuantizeAndPopulate<uint8_t>(m.input3(), inputs3[0]); + m.Invoke(); + // detection_boxes + // in center-size + std::vector<int> output_shape1 = m.GetOutputShape1(); + EXPECT_THAT(output_shape1, ElementsAre(1, 3, 4)); + EXPECT_THAT( + m.GetOutput1<float>(), + ElementsAreArray(ArrayFloatNear( + {0.0, 10.0, 1.0, 11.0, 0.0, 0.0, 1.0, 1.0, 0.0, 100.0, 1.0, 101.0}, + 3e-1))); + // detection_classes + std::vector<int> output_shape2 = m.GetOutputShape2(); + EXPECT_THAT(output_shape2, ElementsAre(1, 3)); + EXPECT_THAT(m.GetOutput2<float>(), + ElementsAreArray(ArrayFloatNear({1, 0, 0}, 1e-1))); + // detection_scores + std::vector<int> 
output_shape3 = m.GetOutputShape3(); + EXPECT_THAT(output_shape3, ElementsAre(1, 3)); + EXPECT_THAT(m.GetOutput3<float>(), + ElementsAreArray(ArrayFloatNear({0.95, 0.9, 0.3}, 1e-1))); + // num_detections + std::vector<int> output_shape4 = m.GetOutputShape4(); + EXPECT_THAT(output_shape4, ElementsAre(1)); + EXPECT_THAT(m.GetOutput4<float>(), + ElementsAreArray(ArrayFloatNear({3.0}, 1e-1))); +} +} // namespace +} // namespace custom +} // namespace ops +} // namespace tflite + +int main(int argc, char** argv) { + ::tflite::LogToStderr(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/tensorflow/contrib/lite/kernels/div.cc b/tensorflow/contrib/lite/kernels/div.cc index e52e4fe535..8d4bb51006 100644 --- a/tensorflow/contrib/lite/kernels/div.cc +++ b/tensorflow/contrib/lite/kernels/div.cc @@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/contrib/lite/builtin_op_data.h" -#include "tensorflow/contrib/lite/context.h" +#include "tensorflow/contrib/lite/c/builtin_op_data.h" +#include "tensorflow/contrib/lite/c/c_api_internal.h" #include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h" #include "tensorflow/contrib/lite/kernels/internal/quantization_util.h" #include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h" @@ -78,29 +78,47 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { } template <KernelType kernel_type> -void EvalFloat(TfLiteContext* context, TfLiteNode* node, - TfLiteDivParams* params, const OpData* data, - const TfLiteTensor* input1, const TfLiteTensor* input2, - TfLiteTensor* output) { - float output_activation_min, output_activation_max; - CalculateActivationRangeFloat(params->activation, &output_activation_min, - &output_activation_max); -#define TF_LITE_DIV(type, opname) \ - type::opname(GetTensorData<float>(input1), GetTensorDims(input1), \ - GetTensorData<float>(input2), GetTensorDims(input2), \ - output_activation_min, output_activation_max, \ - GetTensorData<float>(output), GetTensorDims(output)) - if (kernel_type == kReference) { - if (data->requires_broadcast) { - TF_LITE_DIV(reference_ops, BroadcastDiv); +void EvalDiv(TfLiteContext* context, TfLiteNode* node, TfLiteDivParams* params, + const OpData* data, const TfLiteTensor* input1, + const TfLiteTensor* input2, TfLiteTensor* output) { +#define TF_LITE_DIV(type, opname, data_type) \ + tflite::ArithmeticParams op_params; \ + data_type output_activation_min, output_activation_max; \ + CalculateActivationRange(params->activation, &output_activation_min, \ + &output_activation_max); \ + SetActivationParams(output_activation_min, output_activation_max, \ + &op_params); \ + type::opname(op_params, GetTensorShape(input1), \ + GetTensorData<data_type>(input1), GetTensorShape(input2), \ + GetTensorData<data_type>(input2), GetTensorShape(output), \ + GetTensorData<data_type>(output)) + if (output->type == kTfLiteInt32) { + if (kernel_type == kReference) { + if (data->requires_broadcast) { + TF_LITE_DIV(reference_ops, BroadcastDiv4DSlow, int32_t); + } else { + TF_LITE_DIV(reference_ops, Div, int32_t); + } } else { - TF_LITE_DIV(reference_ops, Div); + if (data->requires_broadcast) { + TF_LITE_DIV(optimized_ops, BroadcastDiv4DSlow, int32_t); + } else { + TF_LITE_DIV(optimized_ops, Div, int32_t); + } } - } 
else { - if (data->requires_broadcast) { - TF_LITE_DIV(optimized_ops, BroadcastDiv); + } else if (output->type == kTfLiteFloat32) { + if (kernel_type == kReference) { + if (data->requires_broadcast) { + TF_LITE_DIV(reference_ops, BroadcastDiv4DSlow, float); + } else { + TF_LITE_DIV(reference_ops, Div, float); + } } else { - TF_LITE_DIV(optimized_ops, Div); + if (data->requires_broadcast) { + TF_LITE_DIV(optimized_ops, BroadcastDiv4DSlow, float); + } else { + TF_LITE_DIV(optimized_ops, Div, float); + } } } #undef TF_LITE_DIV @@ -115,11 +133,13 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { const TfLiteTensor* input2 = GetInput(context, node, kInputTensor2); TfLiteTensor* output = GetOutput(context, node, kOutputTensor); - if (output->type == kTfLiteFloat32) { - EvalFloat<kernel_type>(context, node, params, data, input1, input2, output); + if (output->type == kTfLiteFloat32 || output->type == kTfLiteInt32) { + EvalDiv<kernel_type>(context, node, params, data, input1, input2, output); } else { - context->ReportError(context, - "Div only supports FLOAT32 and quantized UINT8 now."); + context->ReportError( + context, + "Div only supports FLOAT32, INT32 and quantized UINT8 now, got %d.", + output->type); return kTfLiteError; } diff --git a/tensorflow/contrib/lite/kernels/div_test.cc b/tensorflow/contrib/lite/kernels/div_test.cc index 276b8289fb..97aa2fe04e 100644 --- a/tensorflow/contrib/lite/kernels/div_test.cc +++ b/tensorflow/contrib/lite/kernels/div_test.cc @@ -52,6 +52,13 @@ class FloatDivOpModel : public BaseDivOpModel { std::vector<float> GetOutput() { return ExtractVector<float>(output_); } }; +class IntegerDivOpModel : public BaseDivOpModel { + public: + using BaseDivOpModel::BaseDivOpModel; + + std::vector<int32_t> GetOutput() { return ExtractVector<int32_t>(output_); } +}; + TEST(FloatDivOpTest, NoActivation) { FloatDivOpModel m({TensorType_FLOAT32, {1, 2, 2, 1}}, {TensorType_FLOAT32, {1, 2, 2, 1}}, @@ -75,7 +82,7 @@ TEST(FloatDivOpTest, ActivationRELU_N1_TO_1) { } TEST(FloatDivOpTest, VariousInputShapes) { - std::vector<std::initializer_list<int>> test_shapes = { + std::vector<std::vector<int>> test_shapes = { {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}}; for (int i = 0; i < test_shapes.size(); ++i) { FloatDivOpModel m({TensorType_FLOAT32, test_shapes[i]}, @@ -92,7 +99,7 @@ TEST(FloatDivOpTest, VariousInputShapes) { } TEST(FloatDivOpTest, WithBroadcast) { - std::vector<std::initializer_list<int>> test_shapes = { + std::vector<std::vector<int>> test_shapes = { {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}}; for (int i = 0; i < test_shapes.size(); ++i) { FloatDivOpModel m({TensorType_FLOAT32, test_shapes[i]}, @@ -108,6 +115,56 @@ TEST(FloatDivOpTest, WithBroadcast) { } } +TEST(IntegerDivOpTest, NoActivation) { + IntegerDivOpModel m({TensorType_INT32, {1, 2, 2, 1}}, + {TensorType_INT32, {1, 2, 2, 1}}, {TensorType_INT32, {}}, + ActivationFunctionType_NONE); + m.PopulateTensor<int32_t>(m.input1(), {-2, 2, -15, 8}); + m.PopulateTensor<int32_t>(m.input2(), {5, -2, -3, 5}); + m.Invoke(); + EXPECT_THAT(m.GetOutput(), ElementsAreArray({0, -1, 5, 1})); +} + +TEST(IntegerDivOpTest, ActivationRELU_N1_TO_1) { + IntegerDivOpModel m({TensorType_INT32, {1, 2, 2, 1}}, + {TensorType_INT32, {1, 2, 2, 1}}, {TensorType_INT32, {}}, + ActivationFunctionType_RELU_N1_TO_1); + m.PopulateTensor<int32_t>(m.input1(), {-2, 2, -12, 8}); + m.PopulateTensor<int32_t>(m.input2(), {1, 2, -15, 5}); + m.Invoke(); + EXPECT_THAT(m.GetOutput(), ElementsAreArray({-1, 1, 0, 1})); +} + +TEST(IntegerDivOpTest, 
VariousInputShapes) { + std::vector<std::vector<int>> test_shapes = { + {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}}; + for (int i = 0; i < test_shapes.size(); ++i) { + IntegerDivOpModel m({TensorType_INT32, test_shapes[i]}, + {TensorType_INT32, test_shapes[i]}, + {TensorType_INT32, {}}, ActivationFunctionType_NONE); + m.PopulateTensor<int32_t>(m.input1(), {-20, 2, 3, 8, 11, -20}); + m.PopulateTensor<int32_t>(m.input2(), {1, 2, 6, 5, -11, -1}); + m.Invoke(); + EXPECT_THAT(m.GetOutput(), ElementsAreArray({-20, 1, 0, 1, -1, 20})) + << "With shape number " << i; + } +} + +TEST(IntegerDivOpTest, WithBroadcast) { + std::vector<std::vector<int>> test_shapes = { + {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}}; + for (int i = 0; i < test_shapes.size(); ++i) { + IntegerDivOpModel m({TensorType_INT32, test_shapes[i]}, + {TensorType_INT32, {}}, // always a scalar + {TensorType_INT32, {}}, ActivationFunctionType_NONE); + m.PopulateTensor<int32_t>(m.input1(), {-20, 21, 7, 8, 11, -123}); + m.PopulateTensor<int32_t>(m.input2(), {3}); + m.Invoke(); + EXPECT_THAT(m.GetOutput(), ElementsAreArray({-6, 7, 2, 2, 3, -41})) + << "With shape number " << i; + } +} + } // namespace } // namespace tflite diff --git a/tensorflow/contrib/lite/kernels/eigen_support.cc b/tensorflow/contrib/lite/kernels/eigen_support.cc index f1fdb42624..e542ad0765 100644 --- a/tensorflow/contrib/lite/kernels/eigen_support.cc +++ b/tensorflow/contrib/lite/kernels/eigen_support.cc @@ -14,31 +14,100 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/contrib/lite/kernels/eigen_support.h" -#include "third_party/eigen3/Eigen/Core" +#include <utility> + +#include "tensorflow/contrib/lite/arena_planner.h" +#include "tensorflow/contrib/lite/kernels/internal/optimized/eigen_spatial_convolutions.h" #include "tensorflow/contrib/lite/kernels/op_macros.h" namespace tflite { namespace eigen_support { +namespace { + +#ifndef EIGEN_DONT_ALIGN +// Eigen may require buffers to be aligned to 16, 32 or 64 bytes depending on +// hardware architecture and build configurations. +// If the static assertion fails, try to increase `kDefaultTensorAlignment` +// in `arena_planner.h` to 32 or 64. +static_assert( + kDefaultTensorAlignment % EIGEN_MAX_ALIGN_BYTES == 0, + "kDefaultTensorAlignment doesn't comply with Eigen alignment requirement."); +#endif  // EIGEN_DONT_ALIGN + +// We have a single global threadpool for all convolution operations. This +// means that inferences started from different threads may block each other, +// but since the underlying resource of CPU cores should be consumed by the +// operations anyway, it shouldn't affect overall performance.
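// Usage sketch (illustrative, not part of this patch): a kernel that wants the
// shared Eigen device is expected to bump the refcount in Init()/Free() and
// fetch the device in Eval(), using the helpers defined in this file. The
// kernel structure below is assumed for illustration only:
//
//   void* Init(TfLiteContext* context, const char* buffer, size_t length) {
//     eigen_support::IncrementUsageCounter(context);
//     return nullptr;
//   }
//   void Free(TfLiteContext* context, void* buffer) {
//     eigen_support::DecrementUsageCounter(context);
//   }
//   TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
//     const Eigen::ThreadPoolDevice* device =
//         eigen_support::GetThreadPoolDevice(context);
//     // Eigen expressions assigned via .device(*device) then execute on the
//     // shared pool, e.g. output_tensor.device(*device) = input_a + input_b;
//     return kTfLiteOk;
//   }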
+class EigenThreadPoolWrapper : public Eigen::ThreadPoolInterface { + public: + // Takes ownership of 'pool' + explicit EigenThreadPoolWrapper(Eigen::ThreadPool* pool) : pool_(pool) {} + ~EigenThreadPoolWrapper() override {} -struct RefCountedEigenContext { + void Schedule(std::function<void()> fn) override { + pool_->Schedule(std::move(fn)); + } + int NumThreads() const override { return pool_->NumThreads(); } + int CurrentThreadId() const override { return pool_->CurrentThreadId(); } + + private: + std::unique_ptr<Eigen::ThreadPool> pool_; +}; + +struct RefCountedEigenContext : public TfLiteExternalContext { + std::unique_ptr<Eigen::ThreadPoolInterface> thread_pool_wrapper; + std::unique_ptr<Eigen::ThreadPoolDevice> device; int num_references = 0; }; +RefCountedEigenContext* GetEigenContext(TfLiteContext* context) { + return reinterpret_cast<RefCountedEigenContext*>( + context->GetExternalContext(context, kTfLiteEigenContext)); +} + +void InitDevice(TfLiteContext* context, RefCountedEigenContext* ptr) { + int num_threads = 4; + if (context->recommended_num_threads != -1) { + num_threads = context->recommended_num_threads; + } + ptr->device.reset(); // destroy before we invalidate the thread pool + ptr->thread_pool_wrapper.reset( + new EigenThreadPoolWrapper(new Eigen::ThreadPool(num_threads))); + ptr->device.reset( + new Eigen::ThreadPoolDevice(ptr->thread_pool_wrapper.get(), num_threads)); +} + +TfLiteStatus Refresh(TfLiteContext* context) { + Eigen::setNbThreads(context->recommended_num_threads); + + auto* ptr = GetEigenContext(context); + if (ptr != nullptr) { + InitDevice(context, ptr); + } + + return kTfLiteOk; +} + +} // namespace + void IncrementUsageCounter(TfLiteContext* context) { - auto* ptr = reinterpret_cast<RefCountedEigenContext*>(context->eigen_context); + auto* ptr = GetEigenContext(context); if (ptr == nullptr) { if (context->recommended_num_threads != -1) { Eigen::setNbThreads(context->recommended_num_threads); } ptr = new RefCountedEigenContext; + ptr->type = kTfLiteEigenContext; + ptr->Refresh = Refresh; ptr->num_references = 0; - context->eigen_context = ptr; + InitDevice(context, ptr); + context->SetExternalContext(context, kTfLiteEigenContext, ptr); } ptr->num_references++; } void DecrementUsageCounter(TfLiteContext* context) { - auto* ptr = reinterpret_cast<RefCountedEigenContext*>(context->eigen_context); + auto* ptr = GetEigenContext(context); if (ptr == nullptr) { TF_LITE_FATAL( "Call to DecrementUsageCounter() not preceded by " @@ -46,14 +115,17 @@ void DecrementUsageCounter(TfLiteContext* context) { } if (--ptr->num_references == 0) { delete ptr; - context->eigen_context = nullptr; + context->SetExternalContext(context, kTfLiteEigenContext, nullptr); } } -void SetNumThreads(TfLiteContext* context, int num_threads) { - IncrementUsageCounter(context); - Eigen::setNbThreads(num_threads); - DecrementUsageCounter(context); +const Eigen::ThreadPoolDevice* GetThreadPoolDevice(TfLiteContext* context) { + auto* ptr = GetEigenContext(context); + if (ptr == nullptr) { + TF_LITE_FATAL( + "Call to GetFromContext() not preceded by IncrementUsageCounter()"); + } + return ptr->device.get(); } } // namespace eigen_support diff --git a/tensorflow/contrib/lite/kernels/eigen_support.h b/tensorflow/contrib/lite/kernels/eigen_support.h index aa8c351fd8..feb1543f7b 100644 --- a/tensorflow/contrib/lite/kernels/eigen_support.h +++ b/tensorflow/contrib/lite/kernels/eigen_support.h @@ -15,7 +15,11 @@ limitations under the License. 
#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_EIGEN_SUPPORT_H_ #define TENSORFLOW_CONTRIB_LITE_KERNELS_EIGEN_SUPPORT_H_ -#include "tensorflow/contrib/lite/context.h" +#include "tensorflow/contrib/lite/c/c_api_internal.h" + +namespace EigenForTFLite { +struct ThreadPoolDevice; +} namespace tflite { namespace eigen_support { @@ -28,8 +32,8 @@ void IncrementUsageCounter(TfLiteContext* context); // usages all temporary Eigen objects will be deleted. void DecrementUsageCounter(TfLiteContext* context); -// Set the number of threads that can be used by Eigen. -void SetNumThreads(TfLiteContext* context, int num_threads); +const EigenForTFLite::ThreadPoolDevice* GetThreadPoolDevice( + TfLiteContext* context); } // namespace eigen_support } // namespace tflite diff --git a/tensorflow/contrib/lite/kernels/elementwise.cc b/tensorflow/contrib/lite/kernels/elementwise.cc index b719a08394..8c624b3208 100644 --- a/tensorflow/contrib/lite/kernels/elementwise.cc +++ b/tensorflow/contrib/lite/kernels/elementwise.cc @@ -14,7 +14,7 @@ limitations under the License. ==============================================================================*/ #include <cmath> -#include "tensorflow/contrib/lite/context.h" +#include "tensorflow/contrib/lite/c/c_api_internal.h" #include "tensorflow/contrib/lite/kernels/internal/tensor.h" #include "tensorflow/contrib/lite/kernels/kernel_util.h" @@ -22,43 +22,130 @@ namespace tflite { namespace ops { namespace builtin { namespace elementwise { +namespace { -TfLiteStatus SinPrepare(TfLiteContext* context, TfLiteNode* node) { +bool IsNumericSupportedType(const TfLiteType type) { + return type == kTfLiteFloat32; +} + +bool IsLogicalSupportedType(const TfLiteType type) { + return type == kTfLiteBool; +} + +typedef bool (*IsSupportedType)(TfLiteType); +template <IsSupportedType> +TfLiteStatus GenericPrepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE_EQ(context, NumInputs(node), 1); TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); const TfLiteTensor* input = GetInput(context, node, 0); TfLiteTensor* output = GetOutput(context, node, 0); TF_LITE_ENSURE_EQ(context, input->type, output->type); - // Quantized float is not supported yet. 
- TF_LITE_ENSURE_EQ(context, input->type, kTfLiteFloat32); + if (!IsSupportedType(input->type)) { + context->ReportError(context, "Current data type %d is not supported.", + input->type); + return kTfLiteError; + } return context->ResizeTensor(context, output, TfLiteIntArrayCopy(input->dims)); } -TfLiteStatus SinEval(TfLiteContext* context, TfLiteNode* node) { +template <typename T> +inline TfLiteStatus EvalImpl(TfLiteContext* context, TfLiteNode* node, + T func(T), TfLiteType expected_type) { const TfLiteTensor* input = GetInput(context, node, 0); TfLiteTensor* output = GetOutput(context, node, 0); - switch (input->type) { - case kTfLiteFloat32: { - size_t elements = NumElements(input); - const float* in = GetTensorData<float>(input); - const float* in_end = in + elements; - float* out = output->data.f; - for (; in < in_end; in++, out++) *out = std::sin(*in); - return kTfLiteOk; - } - default: { - context->ReportError(context, "Only float32 is supported currently"); - return kTfLiteError; - } + TF_LITE_ENSURE_EQ(context, input->type, expected_type); + const int64_t num_elements = NumElements(input); + const T* in_data = GetTensorData<T>(input); + T* out_data = GetTensorData<T>(output); + for (int64_t i = 0; i < num_elements; ++i) { + out_data[i] = func(in_data[i]); } + return kTfLiteOk; +} + +inline TfLiteStatus EvalNumeric(TfLiteContext* context, TfLiteNode* node, + float float_func(float)) { + return EvalImpl<float>(context, node, float_func, kTfLiteFloat32); +} + +inline TfLiteStatus EvalLogical(TfLiteContext* context, TfLiteNode* node, + bool bool_func(bool)) { + return EvalImpl<bool>(context, node, bool_func, kTfLiteBool); +} + +TfLiteStatus SinEval(TfLiteContext* context, TfLiteNode* node) { + return EvalNumeric(context, node, std::sin); +} + +TfLiteStatus LogEval(TfLiteContext* context, TfLiteNode* node) { + return EvalNumeric(context, node, std::log); +} + +TfLiteStatus SqrtEval(TfLiteContext* context, TfLiteNode* node) { + return EvalNumeric(context, node, std::sqrt); +} + +TfLiteStatus RsqrtEval(TfLiteContext* context, TfLiteNode* node) { + return EvalNumeric(context, node, [](float f) { return 1.f / std::sqrt(f); }); } +TfLiteStatus SquareEval(TfLiteContext* context, TfLiteNode* node) { + return EvalNumeric(context, node, [](float f) { return f * f; }); +} + +TfLiteStatus LogicalNotEval(TfLiteContext* context, TfLiteNode* node) { + return EvalLogical(context, node, [](bool v) { return !v; }); +} + +} // namespace } // namespace elementwise TfLiteRegistration* Register_SIN() { - static TfLiteRegistration r = {nullptr, nullptr, elementwise::SinPrepare, - elementwise::SinEval}; + static TfLiteRegistration r = { + /*init=*/nullptr, /*free=*/nullptr, + elementwise::GenericPrepare<elementwise::IsNumericSupportedType>, + elementwise::SinEval}; + return &r; +} + +TfLiteRegistration* Register_LOG() { + static TfLiteRegistration r = { + /*init=*/nullptr, /*free=*/nullptr, + elementwise::GenericPrepare<elementwise::IsNumericSupportedType>, + elementwise::LogEval}; + return &r; +} + +TfLiteRegistration* Register_SQRT() { + static TfLiteRegistration r = { + /*init=*/nullptr, /*free=*/nullptr, + elementwise::GenericPrepare<elementwise::IsNumericSupportedType>, + elementwise::SqrtEval}; + return &r; +} + +TfLiteRegistration* Register_RSQRT() { + static TfLiteRegistration r = { + /*init=*/nullptr, /*free=*/nullptr, + elementwise::GenericPrepare<elementwise::IsNumericSupportedType>, + elementwise::RsqrtEval}; + return &r; +} + +TfLiteRegistration* Register_SQUARE() { + static TfLiteRegistration 
r = { + /*init=*/nullptr, /*free=*/nullptr, + elementwise::GenericPrepare<elementwise::IsNumericSupportedType>, + elementwise::SquareEval}; + return &r; +} + +TfLiteRegistration* Register_LOGICAL_NOT() { + static TfLiteRegistration r = { + /*init=*/nullptr, /*free=*/nullptr, + elementwise::GenericPrepare<elementwise::IsLogicalSupportedType>, + elementwise::LogicalNotEval}; return &r; } diff --git a/tensorflow/contrib/lite/kernels/elementwise_test.cc b/tensorflow/contrib/lite/kernels/elementwise_test.cc index 412ffb04b9..5dd89a0eae 100644 --- a/tensorflow/contrib/lite/kernels/elementwise_test.cc +++ b/tensorflow/contrib/lite/kernels/elementwise_test.cc @@ -24,25 +24,40 @@ namespace { using ::testing::ElementsAreArray; -class SinOpModel : public SingleOpModel { +class ElementWiseOpBaseModel : public SingleOpModel { public: - SinOpModel(std::initializer_list<int> input_shape) { - input_ = AddInput(TensorType_FLOAT32); - output_ = AddOutput(TensorType_FLOAT32); - SetBuiltinOp(BuiltinOperator_SIN, BuiltinOptions_NONE, 0); - BuildInterpreter({input_shape}); - } - int input() const { return input_; } int output() const { return output_; } - private: + protected: int input_; int output_; }; +class ElementWiseOpFloatModel : public ElementWiseOpBaseModel { + public: + ElementWiseOpFloatModel(BuiltinOperator op, + std::initializer_list<int> input_shape) { + input_ = AddInput(TensorType_FLOAT32); + output_ = AddOutput(TensorType_FLOAT32); + SetBuiltinOp(op, BuiltinOptions_NONE, 0); + BuildInterpreter({input_shape}); + } +}; + +class ElementWiseOpBoolModel : public ElementWiseOpBaseModel { + public: + ElementWiseOpBoolModel(BuiltinOperator op, + std::initializer_list<int> input_shape) { + input_ = AddInput(TensorType_BOOL); + output_ = AddOutput(TensorType_BOOL); + SetBuiltinOp(op, BuiltinOptions_NONE, 0); + BuildInterpreter({input_shape}); + } +}; + TEST(ElementWise, Sin) { - SinOpModel m({1, 1, 4, 1}); + ElementWiseOpFloatModel m(BuiltinOperator_SIN, {1, 1, 4, 1}); m.PopulateTensor<float>(m.input(), {0, 3.1415926, -3.1415926, 1}); m.Invoke(); EXPECT_THAT(m.ExtractVector<float>(m.output()), @@ -50,6 +65,51 @@ TEST(ElementWise, Sin) { EXPECT_THAT(m.GetTensorShape(m.output()), ElementsAreArray({1, 1, 4, 1})); } +TEST(ElementWise, Log) { + ElementWiseOpFloatModel m(BuiltinOperator_LOG, {1, 1, 4, 1}); + m.PopulateTensor<float>(m.input(), {1, 3.1415926, 1, 1}); + m.Invoke(); + EXPECT_THAT(m.ExtractVector<float>(m.output()), + ElementsAreArray(ArrayFloatNear({0, 1.14473, 0, 0}))); + EXPECT_THAT(m.GetTensorShape(m.output()), ElementsAreArray({1, 1, 4, 1})); +} + +TEST(ElementWise, Sqrt) { + ElementWiseOpFloatModel m(BuiltinOperator_SQRT, {1, 1, 4, 1}); + m.PopulateTensor<float>(m.input(), {0, 1, 2, 4}); + m.Invoke(); + EXPECT_THAT(m.ExtractVector<float>(m.output()), + ElementsAreArray(ArrayFloatNear({0, 1, 1.41421, 2}))); + EXPECT_THAT(m.GetTensorShape(m.output()), ElementsAreArray({1, 1, 4, 1})); +} + +TEST(ElementWise, Rsqrt) { + ElementWiseOpFloatModel m(BuiltinOperator_RSQRT, {1, 1, 4, 1}); + m.PopulateTensor<float>(m.input(), {1, 2, 4, 9}); + m.Invoke(); + EXPECT_THAT(m.ExtractVector<float>(m.output()), + ElementsAreArray(ArrayFloatNear({1, 0.7071, 0.5, 0.33333}))); + EXPECT_THAT(m.GetTensorShape(m.output()), ElementsAreArray({1, 1, 4, 1})); +} + +TEST(ElementWise, Square) { + ElementWiseOpFloatModel m(BuiltinOperator_SQUARE, {1, 1, 4, 1}); + m.PopulateTensor<float>(m.input(), {1, 2, 0.5, -3.0}); + m.Invoke(); + EXPECT_THAT(m.ExtractVector<float>(m.output()), + ElementsAreArray(ArrayFloatNear({1, 4.0, 
0.25, 9.0}))); + EXPECT_THAT(m.GetTensorShape(m.output()), ElementsAreArray({1, 1, 4, 1})); +} + +TEST(ElementWise, LogicalNot) { + ElementWiseOpBoolModel m(BuiltinOperator_LOGICAL_NOT, {1, 1, 4, 1}); + m.PopulateTensor<bool>(m.input(), {true, false, true, false}); + m.Invoke(); + EXPECT_THAT(m.ExtractVector<bool>(m.output()), + ElementsAreArray({false, true, false, true})); + EXPECT_THAT(m.GetTensorShape(m.output()), ElementsAreArray({1, 1, 4, 1})); +} + } // namespace } // namespace tflite diff --git a/tensorflow/contrib/lite/kernels/embedding_lookup.cc b/tensorflow/contrib/lite/kernels/embedding_lookup.cc index 7539c0b30d..fe33f98eb0 100644 --- a/tensorflow/contrib/lite/kernels/embedding_lookup.cc +++ b/tensorflow/contrib/lite/kernels/embedding_lookup.cc @@ -24,11 +24,11 @@ limitations under the License. // Output: // Output.dim[0] == Tensor[0].dim[0], num of lookups // Output.dim[1] == Tensor[1].dim[1], num of items per row -// Each item in output is a raw bytes copy of corresponding item in input. +// Each item in output is a raw bytes copy of the corresponding item in input, +// or a dequantized value in the case of a uint8 input. // When indices are out of bound, the ops will not succeed. // -#include <unistd.h> #include <cassert> #include <cmath> #include <cstdio> @@ -37,8 +37,8 @@ limitations under the License. #include <iostream> #include <limits> -#include "tensorflow/contrib/lite/builtin_op_data.h" -#include "tensorflow/contrib/lite/context.h" +#include "tensorflow/contrib/lite/c/builtin_op_data.h" +#include "tensorflow/contrib/lite/c/c_api_internal.h" #include "tensorflow/contrib/lite/kernels/kernel_util.h" #include "tensorflow/contrib/lite/kernels/op_macros.h" @@ -69,11 +69,9 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { return context->ResizeTensor(context, output, outputSize); } -TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { - TfLiteTensor* output = GetOutput(context, node, 0); - const TfLiteTensor* lookup = GetInput(context, node, 0); - const TfLiteTensor* value = GetInput(context, node, 1); - +TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node, + const TfLiteTensor* lookup, const TfLiteTensor* value, + TfLiteTensor* output) { const int row_size = SizeOfDimension(value, 0); const int row_bytes = value->bytes / row_size; @@ -91,6 +89,53 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { return kTfLiteOk; } +TfLiteStatus EvalHybrid(TfLiteContext* context, TfLiteNode* node, + const TfLiteTensor* lookup, const TfLiteTensor* value, + TfLiteTensor* output) { + const int row_size = SizeOfDimension(value, 0); + const double scaling_factor = value->params.scale; + + // col_size after we flatten tensor into 2D. + int col_size = 1; + for (int i = 1; i < NumDimensions(value); i++) { + col_size *= SizeOfDimension(value, i); + } + + for (int i = 0; i < SizeOfDimension(lookup, 0); i++) { + int idx = lookup->data.i32[i]; + if (idx >= row_size || idx < 0) { + context->ReportError(context, "Embedding Lookup: index out of bounds."); + return kTfLiteError; + } else { + // Dequantize embedding values. + // TODO(alanchiao): refactor scalar multiply into separate function + // for ease of adding a neon equivalent if ever necessary. 
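      // The loop below dequantizes on the fly: each int8 weight stored in the
      // uint8 buffer is mapped back to float as weight * scaling_factor, where
      // scaling_factor is the symmetric per-tensor scale read from
      // value->params.scale above.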
+ for (int j = 0; j < col_size; j++) { + const int8_t* value_ptr = reinterpret_cast<int8_t*>(value->data.uint8); + output->data.f[j + i * col_size] = + value_ptr[j + idx * col_size] * scaling_factor; + } + } + } + + return kTfLiteOk; +} + +TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { + const TfLiteTensor* lookup = GetInput(context, node, 0); + const TfLiteTensor* value = GetInput(context, node, 1); + TfLiteTensor* output = GetOutput(context, node, 0); + switch (value->type) { + case kTfLiteFloat32: + return EvalFloat(context, node, lookup, value, output); + case kTfLiteUInt8: + return EvalHybrid(context, node, lookup, value, output); + default: + context->ReportError(context, "Type not currently supported."); + return kTfLiteError; + } +} + } // namespace embedding_lookup TfLiteRegistration* Register_EMBEDDING_LOOKUP() { diff --git a/tensorflow/contrib/lite/kernels/embedding_lookup_sparse.cc b/tensorflow/contrib/lite/kernels/embedding_lookup_sparse.cc index d3be36993c..aa75b03990 100644 --- a/tensorflow/contrib/lite/kernels/embedding_lookup_sparse.cc +++ b/tensorflow/contrib/lite/kernels/embedding_lookup_sparse.cc @@ -65,8 +65,8 @@ limitations under the License. #include <algorithm> #include <cmath> -#include "tensorflow/contrib/lite/builtin_op_data.h" -#include "tensorflow/contrib/lite/context.h" +#include "tensorflow/contrib/lite/c/builtin_op_data.h" +#include "tensorflow/contrib/lite/c/c_api_internal.h" #include "tensorflow/contrib/lite/kernels/internal/tensor_utils.h" #include "tensorflow/contrib/lite/kernels/kernel_util.h" #include "tensorflow/contrib/lite/kernels/op_macros.h" diff --git a/tensorflow/contrib/lite/kernels/embedding_lookup_test.cc b/tensorflow/contrib/lite/kernels/embedding_lookup_test.cc index 9b501878f1..4a88d168c6 100644 --- a/tensorflow/contrib/lite/kernels/embedding_lookup_test.cc +++ b/tensorflow/contrib/lite/kernels/embedding_lookup_test.cc @@ -7,13 +7,14 @@ You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. +distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License +for the specific language governing permissions and limitations under the +License. ==============================================================================*/ // Unit test for TFLite Lookup op. 
+#include <initializer_list> #include <iomanip> #include <vector> @@ -29,12 +30,13 @@ namespace { using ::testing::ElementsAreArray; -class EmbeddingLookupOpModel : public SingleOpModel { +class BaseEmbeddingLookupOpModel : public SingleOpModel { public: - EmbeddingLookupOpModel(std::initializer_list<int> index_shape, - std::initializer_list<int> weight_shape) { + BaseEmbeddingLookupOpModel(std::initializer_list<int> index_shape, + std::initializer_list<int> weight_shape, + TensorType weight_type = TensorType_FLOAT32) { input_ = AddInput(TensorType_INT32); - weight_ = AddInput(TensorType_FLOAT32); + weight_ = AddInput(weight_type); output_ = AddOutput(TensorType_FLOAT32); SetBuiltinOp(BuiltinOperator_EMBEDDING_LOOKUP, BuiltinOptions_NONE, 0); BuildInterpreter({index_shape, weight_shape}); @@ -44,6 +46,18 @@ class EmbeddingLookupOpModel : public SingleOpModel { PopulateTensor(input_, data); } + std::vector<float> GetOutput() { return ExtractVector<float>(output_); } + + protected: + int input_; + int weight_; + int output_; +}; + +class EmbeddingLookupOpModel : public BaseEmbeddingLookupOpModel { + public: + using BaseEmbeddingLookupOpModel::BaseEmbeddingLookupOpModel; + void Set3DWeightMatrix(const std::function<float(int, int, int)>& function) { TfLiteTensor* tensor = interpreter_->tensor(weight_); int rows = tensor->dims->data[0]; @@ -57,20 +71,25 @@ class EmbeddingLookupOpModel : public SingleOpModel { } } } +}; - std::vector<float> GetOutput() { return ExtractVector<float>(output_); } +class HybridEmbeddingLookupOpModel : public BaseEmbeddingLookupOpModel { + public: + HybridEmbeddingLookupOpModel(std::initializer_list<int> index_shape, + std::initializer_list<int> weight_shape) + : BaseEmbeddingLookupOpModel(index_shape, weight_shape, + TensorType_UINT8) {} - private: - int input_; - int weight_; - int output_; + void SetWeight(std::initializer_list<float> data) { + SymmetricQuantizeAndPopulate(weight_, data); + } }; // TODO(ahentz): write more tests that exercise the details of the op, such as // lookup errors and variable input shapes. 
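// The hybrid tests below feed the float weights through
// SymmetricQuantizeAndPopulate, which stores them as int8 with a single
// per-tensor scale (roughly max(|w|) / 127). The dequantized values returned
// by the op therefore only match the originals to within a fraction of that
// scale, which is why ArrayFloatNear is given a relatively loose tolerance.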
TEST(EmbeddingLookupOpTest, SimpleTest) { EmbeddingLookupOpModel m({3}, {3, 2, 4}); - m.PopulateTensor<int>(0, {1, 0, 2}); + m.SetInput({1, 0, 2}); m.Set3DWeightMatrix( [](int i, int j, int k) { return i + j / 10.0f + k / 100.0f; }); @@ -84,6 +103,69 @@ TEST(EmbeddingLookupOpTest, SimpleTest) { }))); } +TEST(HybridEmbeddingLookupHybridOpTest, Simple2DTest) { + HybridEmbeddingLookupOpModel m({3}, {3, 8}); + m.SetInput({1, 0, 2}); + m.SetWeight({ + 0.00, 0.01, 0.02, 0.03, 0.10, 0.11, 0.12, 0.13, // Row 0 + 1.00, -1.01, 1.02, 1.03, 1.10, 1.11, 1.12, 1.13, // Row 1 + 2.00, 2.01, 2.02, 2.03, 2.10, 2.11, 2.12, 2.13, // Row 2 + }); + + m.Invoke(); + + EXPECT_THAT(m.GetOutput(), + ElementsAreArray(ArrayFloatNear( + { + 1.00, -1.01, 1.02, 1.03, 1.10, 1.11, 1.12, 1.13, // Row 1 + 0.00, 0.01, 0.02, 0.03, 0.10, 0.11, 0.12, 0.13, // Row 0 + 2.00, 2.01, 2.02, 2.03, 2.10, 2.11, 2.12, 2.13, // Row 2 + }, + 7.41e-03))); +} + +TEST(HybridEmbeddingLookupHybridOpTest, Simple3DTest) { + HybridEmbeddingLookupOpModel m({3}, {3, 2, 4}); + m.SetInput({1, 0, 2}); + m.SetWeight({ + 0.00, 0.01, 0.02, 0.03, 0.10, 0.11, 0.12, 0.13, // Row 0 + 1.00, -1.01, 1.02, 1.03, 1.10, 1.11, 1.12, 1.13, // Row 1 + 2.00, 2.01, 2.02, 2.03, 2.10, 2.11, 2.12, 2.13, // Row 2 + }); + + m.Invoke(); + + EXPECT_THAT(m.GetOutput(), + ElementsAreArray(ArrayFloatNear( + { + 1.00, -1.01, 1.02, 1.03, 1.10, 1.11, 1.12, 1.13, // Row 1 + 0.00, 0.01, 0.02, 0.03, 0.10, 0.11, 0.12, 0.13, // Row 0 + 2.00, 2.01, 2.02, 2.03, 2.10, 2.11, 2.12, 2.13, // Row 2 + }, + 7.41e-03))); +} + +TEST(HybridEmbeddingLookupHybridOpTest, Simple4DTest) { + HybridEmbeddingLookupOpModel m({3}, {3, 2, 2, 2}); + m.SetInput({1, 0, 2}); + m.SetWeight({ + 0.00, 0.01, 0.02, 0.03, 0.10, 0.11, 0.12, 0.13, // Row 0 + 1.00, -1.01, 1.02, 1.03, 1.10, 1.11, 1.12, 1.13, // Row 1 + 2.00, 2.01, 2.02, 2.03, 2.10, 2.11, 2.12, 2.13, // Row 2 + }); + + m.Invoke(); + + EXPECT_THAT(m.GetOutput(), + ElementsAreArray(ArrayFloatNear( + { + 1.00, -1.01, 1.02, 1.03, 1.10, 1.11, 1.12, 1.13, // Row 1 + 0.00, 0.01, 0.02, 0.03, 0.10, 0.11, 0.12, 0.13, // Row 0 + 2.00, 2.01, 2.02, 2.03, 2.10, 2.11, 2.12, 2.13, // Row 2 + }, + 7.41e-03))); +} + } // namespace } // namespace tflite diff --git a/tensorflow/contrib/lite/kernels/exp.cc b/tensorflow/contrib/lite/kernels/exp.cc index ce03cdfe26..673e7be90a 100644 --- a/tensorflow/contrib/lite/kernels/exp.cc +++ b/tensorflow/contrib/lite/kernels/exp.cc @@ -14,8 +14,8 @@ limitations under the License. ==============================================================================*/ #include <string.h> #include <vector> -#include "tensorflow/contrib/lite/builtin_op_data.h" -#include "tensorflow/contrib/lite/context.h" +#include "tensorflow/contrib/lite/c/builtin_op_data.h" +#include "tensorflow/contrib/lite/c/c_api_internal.h" #include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h" #include "tensorflow/contrib/lite/kernels/internal/tensor.h" #include "tensorflow/contrib/lite/kernels/kernel_util.h" diff --git a/tensorflow/contrib/lite/kernels/expand_dims.cc b/tensorflow/contrib/lite/kernels/expand_dims.cc new file mode 100644 index 0000000000..fa1140b19c --- /dev/null +++ b/tensorflow/contrib/lite/kernels/expand_dims.cc @@ -0,0 +1,113 @@ + +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include <string.h> +#include <vector> +#include "tensorflow/contrib/lite/c/builtin_op_data.h" +#include "tensorflow/contrib/lite/c/c_api_internal.h" +#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h" +#include "tensorflow/contrib/lite/kernels/internal/tensor.h" +#include "tensorflow/contrib/lite/kernels/kernel_util.h" +#include "tensorflow/contrib/lite/kernels/op_macros.h" +namespace tflite { +namespace ops { +namespace builtin { +namespace expand_dims { +constexpr int kInput = 0; +constexpr int kAxis = 1; +constexpr int kOutput = 0; + +namespace { +TfLiteStatus ExpandTensorDim(TfLiteContext* context, const TfLiteTensor& input, + int axis, TfLiteTensor* output) { + const TfLiteIntArray& input_dims = *input.dims; + if (axis < 0) { + axis = input_dims.size + 1 + axis; + } + TF_LITE_ENSURE(context, axis <= input_dims.size); + + TfLiteIntArray* output_dims = TfLiteIntArrayCreate(input_dims.size + 1); + for (int i = 0; i < output_dims->size; ++i) { + if (i < axis) { + output_dims->data[i] = input_dims.data[i]; + } else if (i == axis) { + output_dims->data[i] = 1; + } else { + output_dims->data[i] = input_dims.data[i - 1]; + } + } + + return context->ResizeTensor(context, output, output_dims); +} + +TfLiteStatus GetAxisValueFromTensor(TfLiteContext* context, + const TfLiteTensor& axis, int* axis_value) { + TF_LITE_ENSURE_EQ(context, NumElements(&axis), 1); + switch (axis.type) { + case kTfLiteInt32: + *axis_value = *GetTensorData<int32_t>(&axis); + return kTfLiteOk; + case kTfLiteInt64: + *axis_value = *GetTensorData<int64_t>(&axis); + return kTfLiteOk; + default: + return kTfLiteError; + } +} + +} // namespace + +TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { + TF_LITE_ENSURE_EQ(context, NumInputs(node), 2); + TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); + const TfLiteTensor* input = GetInput(context, node, kInput); + const TfLiteTensor* axis = GetInput(context, node, kAxis); + TfLiteTensor* output = GetOutput(context, node, 0); + output->type = input->type; + if (IsConstantTensor(axis)) { + int axis_value; + TF_LITE_ENSURE_OK(context, + GetAxisValueFromTensor(context, *axis, &axis_value)); + return ExpandTensorDim(context, *input, axis_value, output); + } + SetTensorToDynamic(output); + return kTfLiteOk; +} + +TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { + // Just copy input to output. 
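  // ExpandDims never rewrites element data: the output holds the same values
  // as the input, with an extra dimension of size 1 spliced into dims at the
  // requested (possibly negative) axis, so a single memcpy of the raw buffer
  // is enough once the output has been resized.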
+ const TfLiteTensor* input = GetInput(context, node, kInput); + TfLiteTensor* output = GetOutput(context, node, 0); + const TfLiteTensor* axis = GetInput(context, node, kAxis); + if (IsDynamicTensor(output)) { + int axis_value; + TF_LITE_ENSURE_OK(context, + GetAxisValueFromTensor(context, *axis, &axis_value)); + TF_LITE_ENSURE_OK(context, + ExpandTensorDim(context, *input, axis_value, output)); + } + memcpy(output->data.raw, input->data.raw, input->bytes); + return kTfLiteOk; +} + +} // namespace expand_dims +TfLiteRegistration* Register_EXPAND_DIMS() { + static TfLiteRegistration r = {nullptr, nullptr, expand_dims::Prepare, + expand_dims::Eval}; + return &r; +} +} // namespace builtin +} // namespace ops +} // namespace tflite diff --git a/tensorflow/contrib/lite/kernels/expand_dims_test.cc b/tensorflow/contrib/lite/kernels/expand_dims_test.cc new file mode 100644 index 0000000000..a3bc1813db --- /dev/null +++ b/tensorflow/contrib/lite/kernels/expand_dims_test.cc @@ -0,0 +1,83 @@ + +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include <gtest/gtest.h> +#include "tensorflow/contrib/lite/c/builtin_op_data.h" +#include "tensorflow/contrib/lite/interpreter.h" +#include "tensorflow/contrib/lite/kernels/register.h" +#include "tensorflow/contrib/lite/kernels/test_util.h" +#include "tensorflow/contrib/lite/model.h" + +namespace tflite { +namespace { + +using ::testing::ElementsAreArray; + +class ExpandDimsOpModel : public SingleOpModel { + public: + ExpandDimsOpModel(std::initializer_list<int> input_shape, + TensorType input_type) { + input_ = AddInput(input_type); + axis_ = AddInput(TensorType_INT32); + output_ = AddOutput(input_type); + SetBuiltinOp(BuiltinOperator_EXPAND_DIMS, BuiltinOptions_ExpandDimsOptions, + 0); + BuildInterpreter({input_shape, {1}}); + } + void SetInputFloat(std::initializer_list<float> data) { + PopulateTensor<float>(input_, data); + } + void SetAxis(int axis) { PopulateTensor<int32_t>(axis_, {axis}); } + std::vector<float> GetValuesFloat() { return ExtractVector<float>(output_); } + std::vector<int> GetOutputShape() { return GetTensorShape(output_); } + + protected: + int input_; + int axis_; + int output_; +}; + +TEST(ExpandDimsOpTest, DifferentAxis) { + ExpandDimsOpModel m({2, 2}, TensorType_FLOAT32); + std::initializer_list<float> values = {-1.f, 1.f, -2.f, 2.f}; + m.SetInputFloat(values); + m.SetAxis(0); + m.Invoke(); + EXPECT_THAT(m.GetValuesFloat(), ElementsAreArray(values)); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2, 2})); + + m.SetAxis(1); + m.Invoke(); + EXPECT_THAT(m.GetValuesFloat(), ElementsAreArray(values)); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 1, 2})); + + m.SetAxis(2); + m.Invoke(); + EXPECT_THAT(m.GetValuesFloat(), ElementsAreArray(values)); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 2, 1})); + + m.SetAxis(-1); + m.Invoke(); + EXPECT_THAT(m.GetValuesFloat(), ElementsAreArray(values)); 
+ EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 2, 1})); +} +} // namespace +} // namespace tflite + +int main(int argc, char** argv) { + ::tflite::LogToStderr(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/tensorflow/contrib/lite/kernels/fake_quant.cc b/tensorflow/contrib/lite/kernels/fake_quant.cc new file mode 100644 index 0000000000..b51af72fe6 --- /dev/null +++ b/tensorflow/contrib/lite/kernels/fake_quant.cc @@ -0,0 +1,95 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include <string.h> +#include <vector> +#include "tensorflow/contrib/lite/c/builtin_op_data.h" +#include "tensorflow/contrib/lite/c/c_api_internal.h" +#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h" +#include "tensorflow/contrib/lite/kernels/internal/tensor.h" +#include "tensorflow/contrib/lite/kernels/kernel_util.h" +#include "tensorflow/contrib/lite/kernels/op_macros.h" + +namespace tflite { +namespace ops { +namespace builtin { +namespace fake_quant { + +// This file has reference implementation of FakeQuant. +enum KernelType { + kReference, +}; + +struct OpContext { + OpContext(TfLiteContext* context, TfLiteNode* node) { + input = GetInput(context, node, 0); + output = GetOutput(context, node, 0); + } + const TfLiteTensor* input; + TfLiteTensor* output; +}; + +TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { + TF_LITE_ENSURE_EQ(context, NumInputs(node), 1); + TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); + + const auto* params = + reinterpret_cast<TfLiteFakeQuantParams*>(node->builtin_data); + + if (params->narrow_range) { + context->ReportError( + context, + "narrow_range FakeQuant is not currently supported at runtime. 
" + "narrow_range is only meant to be applied to weights, not activations"); + return kTfLiteError; + } + + OpContext op_context(context, node); + TfLiteIntArray* output_dims = TfLiteIntArrayCopy(op_context.input->dims); + op_context.output->type = op_context.input->type; + return context->ResizeTensor(context, op_context.output, output_dims); +} + +template <KernelType kernel_type> +TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { + OpContext op_context(context, node); + + const auto* params = + reinterpret_cast<TfLiteFakeQuantParams*>(node->builtin_data); + + tflite::FakeQuantParams op_params; + op_params.num_bits = params->num_bits; + op_params.minmax.min = params->min; + op_params.minmax.max = params->max; + reference_ops::FakeQuant(op_params, GetTensorShape(op_context.input), + GetTensorData<float>(op_context.input), + GetTensorShape(op_context.output), + GetTensorData<float>(op_context.output)); + + return kTfLiteOk; +} + +} // namespace fake_quant + +TfLiteRegistration* Register_FAKE_QUANT_REF() { + static TfLiteRegistration r = {nullptr, nullptr, fake_quant::Prepare, + fake_quant::Eval<fake_quant::kReference>}; + return &r; +} + +TfLiteRegistration* Register_FAKE_QUANT() { return Register_FAKE_QUANT_REF(); } + +} // namespace builtin +} // namespace ops +} // namespace tflite diff --git a/tensorflow/contrib/lite/kernels/fake_quant_test.cc b/tensorflow/contrib/lite/kernels/fake_quant_test.cc new file mode 100644 index 0000000000..11a02f7ed7 --- /dev/null +++ b/tensorflow/contrib/lite/kernels/fake_quant_test.cc @@ -0,0 +1,112 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include <gtest/gtest.h> +#include "tensorflow/contrib/lite/interpreter.h" +#include "tensorflow/contrib/lite/kernels/register.h" +#include "tensorflow/contrib/lite/kernels/test_util.h" +#include "tensorflow/contrib/lite/model.h" + +namespace tflite { +namespace { + +using ::testing::ElementsAreArray; + +class FakeQuantOpModel : public SingleOpModel { + public: + FakeQuantOpModel(const TensorData& input, const TensorType& output, float min, + float max, int num_bits) { + input_ = AddInput(input); + output_ = AddOutput(output); + SetBuiltinOp(BuiltinOperator_FAKE_QUANT, BuiltinOptions_FakeQuantOptions, + CreateFakeQuantOptions(builder_, min, max, num_bits).Union()); + BuildInterpreter({GetShape(input_)}); + } + + template <class T> + void SetInput(std::initializer_list<T> data) { + PopulateTensor(input_, data); + } + + template <class T> + std::vector<T> GetOutput() { + return ExtractVector<T>(output_); + } + std::vector<int> GetOutputShape() { return GetTensorShape(output_); } + + protected: + int input_; + int output_; +}; + +TEST(FakeQuantOpTest, FloatPositiveRange8Test) { + std::initializer_list<float> data = {0.0, 1.0, 0.25, + 0.50, 0.4444444, 0.00001}; + FakeQuantOpModel m({TensorType_FLOAT32, {3, 1, 2}}, TensorType_FLOAT32, 0.0f, + 1.0f, 8); + m.SetInput<float>(data); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3, 1, 2})); + EXPECT_THAT( + m.GetOutput<float>(), + ElementsAreArray(ArrayFloatNear({0, 1, 0.25098, 0.498039, 0.443137, 0}))); +} + +TEST(FakeQuantOpTest, FloatNegativeRange8Test) { + std::initializer_list<float> data = {0.0, -0.9, 0.25, + 0.50, 0.4444444, -0.00001}; + FakeQuantOpModel m({TensorType_FLOAT32, {3, 1, 2}}, TensorType_FLOAT32, -0.9f, + 0.9f, 8); + m.SetInput<float>(data); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3, 1, 2})); + EXPECT_THAT(m.GetOutput<float>(), + ElementsAreArray(ArrayFloatNear( + {0, -0.896471, 0.247059, 0.501176, 0.444706, 0}))); +} + +TEST(FakeQuantOpTest, FloatPositiveRange16Test) { + std::initializer_list<float> data = {0.0, 1.0, 0.25, + 0.50, 0.4444444, 0.00001}; + FakeQuantOpModel m({TensorType_FLOAT32, {3, 1, 2}}, TensorType_FLOAT32, 0.0f, + 1.0f, 16); + m.SetInput<float>(data); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3, 1, 2})); + EXPECT_THAT(m.GetOutput<float>(), + ElementsAreArray(ArrayFloatNear( + {0, 1, 0.250004, 0.500008, 0.44445, 1.5259e-05}))); +} + +TEST(FakeQuantOpTest, FloatNegativeRange16Test) { + std::initializer_list<float> data = {0.0, -0.9, 0.25, + 0.50, 0.4444444, -0.00001}; + FakeQuantOpModel m({TensorType_FLOAT32, {3, 1, 2}}, TensorType_FLOAT32, -0.9f, + 0.9f, 16); + m.SetInput<float>(data); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3, 1, 2})); + EXPECT_THAT(m.GetOutput<float>(), + ElementsAreArray(ArrayFloatNear( + {0, -0.900014, 0.249998, 0.499995, 0.444431, 0}))); +} + +} // namespace +} // namespace tflite + +int main(int argc, char** argv) { + ::tflite::LogToStderr(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/tensorflow/contrib/lite/kernels/floor.cc b/tensorflow/contrib/lite/kernels/floor.cc index 697b777693..59ff77f35b 100644 --- a/tensorflow/contrib/lite/kernels/floor.cc +++ b/tensorflow/contrib/lite/kernels/floor.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#include "tensorflow/contrib/lite/context.h" +#include "tensorflow/contrib/lite/c/c_api_internal.h" #include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h" #include "tensorflow/contrib/lite/kernels/internal/tensor.h" #include "tensorflow/contrib/lite/kernels/kernel_util.h" @@ -41,8 +41,9 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { const TfLiteTensor* input = GetInput(context, node, kInputTensor); TfLiteTensor* output = GetOutput(context, node, kOutputTensor); - optimized_ops::Floor(GetTensorData<float>(input), GetTensorDims(input), - GetTensorData<float>(output), GetTensorDims(output)); + optimized_ops::Floor(GetTensorShape(input), GetTensorData<float>(input), + GetTensorShape(output), GetTensorData<float>(output)); + return kTfLiteOk; } } // namespace floor diff --git a/tensorflow/contrib/lite/kernels/floor_div.cc b/tensorflow/contrib/lite/kernels/floor_div.cc new file mode 100644 index 0000000000..5d62cd2755 --- /dev/null +++ b/tensorflow/contrib/lite/kernels/floor_div.cc @@ -0,0 +1,146 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include <cmath> +#include <functional> + +#include "tensorflow/contrib/lite/c/c_api_internal.h" +#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h" +#include "tensorflow/contrib/lite/kernels/internal/tensor.h" +#include "tensorflow/contrib/lite/kernels/kernel_util.h" +#include "tensorflow/contrib/lite/kernels/op_macros.h" + +namespace tflite { +namespace ops { +namespace builtin { +namespace floor_div { +namespace { + +// Input/output tensor index. +constexpr int kInputTensor1 = 0; +constexpr int kInputTensor2 = 1; +constexpr int kOutputTensor = 0; + +// Op data for floor_div op. +struct OpData { + bool requires_broadcast; +}; + +template <typename T> +T FloorDiv(T input1, T input2) { + return std::floor(std::divides<double>()(static_cast<double>(input1), + static_cast<double>(input2))); +} + +void* Init(TfLiteContext* context, const char* buffer, size_t length) { + auto* data = new OpData; + data->requires_broadcast = false; + return data; +} + +void Free(TfLiteContext* context, void* buffer) { + delete reinterpret_cast<OpData*>(buffer); +} + +TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { + TF_LITE_ENSURE_EQ(context, NumInputs(node), 2); + TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); + + // Reinterpret the opaque data provided by the user.
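  // node->user_data is the OpData allocated in Init() above; Prepare() fills
  // in requires_broadcast so that Eval() can choose between the broadcast and
  // the plain element-wise paths.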
+ OpData* data = reinterpret_cast<OpData*>(node->user_data); + + const TfLiteTensor* input1 = GetInput(context, node, kInputTensor1); + const TfLiteTensor* input2 = GetInput(context, node, kInputTensor2); + TfLiteTensor* output = GetOutput(context, node, kOutputTensor); + + TF_LITE_ENSURE_EQ(context, input1->type, input2->type); + + const TfLiteType type = input1->type; + if (type != kTfLiteInt32) { + context->ReportError(context, "Currently floor_div only supports int32."); + return kTfLiteError; + } + output->type = type; + + data->requires_broadcast = !HaveSameShapes(input1, input2); + + TfLiteIntArray* output_size = nullptr; + if (data->requires_broadcast) { + TF_LITE_ENSURE_OK(context, CalculateShapeForBroadcast( + context, input1, input2, &output_size)); + } else { + output_size = TfLiteIntArrayCopy(input1->dims); + } + + return context->ResizeTensor(context, output, output_size); +} + +template <typename T> +TfLiteStatus EvalImpl(TfLiteContext* context, bool requires_broadcast, + const TfLiteTensor* input1, const TfLiteTensor* input2, + TfLiteTensor* output) { + const T* denominator_data = GetTensorData<T>(input2); + + // Validate the denominator. + for (int i = 0; i < NumElements(input2); ++i) { + if (std::equal_to<T>()(denominator_data[i], 0)) { + context->ReportError(context, "Division by 0"); + return kTfLiteError; + } + } + if (requires_broadcast) { + reference_ops::BroadcastBinaryFunction4DSlow<T, T, T>( + GetTensorShape(input1), GetTensorData<T>(input1), + GetTensorShape(input2), denominator_data, GetTensorShape(output), + GetTensorData<T>(output), FloorDiv<T>); + } else { + reference_ops::BinaryFunction<T, T, T>( + GetTensorShape(input1), GetTensorData<T>(input1), + GetTensorShape(input2), GetTensorData<T>(input2), + GetTensorShape(output), GetTensorData<T>(output), FloorDiv<T>); + } + + return kTfLiteOk; +} + +TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { + OpData* data = reinterpret_cast<OpData*>(node->user_data); + + const TfLiteTensor* input1 = GetInput(context, node, kInputTensor1); + const TfLiteTensor* input2 = GetInput(context, node, kInputTensor2); + TfLiteTensor* output = GetOutput(context, node, kOutputTensor); + + switch (input1->type) { + case kTfLiteInt32: { + return EvalImpl<int32_t>(context, data->requires_broadcast, input1, + input2, output); + } + default: { + context->ReportError(context, "Currently floor_div only supports int32."); + return kTfLiteError; + } + } +} + +} // namespace +} // namespace floor_div + +TfLiteRegistration* Register_FLOOR_DIV() { + // Init, Free, Prepare, Eval are satisfying the Interface required by + // TfLiteRegistration. + static TfLiteRegistration r = {floor_div::Init, floor_div::Free, + floor_div::Prepare, floor_div::Eval}; + return &r; +} + +} // namespace builtin +} // namespace ops +} // namespace tflite diff --git a/tensorflow/contrib/lite/kernels/floor_div_test.cc b/tensorflow/contrib/lite/kernels/floor_div_test.cc new file mode 100644 index 0000000000..eea69b61ac --- /dev/null +++ b/tensorflow/contrib/lite/kernels/floor_div_test.cc @@ -0,0 +1,90 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include <gtest/gtest.h> +#include "tensorflow/contrib/lite/interpreter.h" +#include "tensorflow/contrib/lite/kernels/register.h" +#include "tensorflow/contrib/lite/kernels/test_util.h" +#include "tensorflow/contrib/lite/model.h" + +namespace tflite { +namespace { + +using ::testing::ElementsAre; + +template <typename T> +class FloorDivModel : public SingleOpModel { + public: + FloorDivModel(const TensorData& input1, const TensorData& input2, + const TensorData& output) { + input1_ = AddInput(input1); + input2_ = AddInput(input2); + output_ = AddOutput(output); + SetBuiltinOp(BuiltinOperator_FLOOR_DIV, BuiltinOptions_FloorDivOptions, + CreateFloorDivOptions(builder_).Union()); + BuildInterpreter({GetShape(input1_), GetShape(input2_)}); + } + + int input1() { return input1_; } + int input2() { return input2_; } + + std::vector<T> GetOutput() { return ExtractVector<T>(output_); } + std::vector<int> GetOutputShape() { return GetTensorShape(output_); } + + private: + int input1_; + int input2_; + int output_; +}; + +TEST(PowOpModel, Simple) { + FloorDivModel<int32_t> model({TensorType_INT32, {1, 2, 2, 1}}, + {TensorType_INT32, {1, 2, 2, 1}}, + {TensorType_INT32, {}}); + model.PopulateTensor<int32_t>(model.input1(), {10, 9, 11, 3}); + model.PopulateTensor<int32_t>(model.input2(), {2, 2, 3, 4}); + model.Invoke(); + EXPECT_THAT(model.GetOutputShape(), ElementsAre(1, 2, 2, 1)); + EXPECT_THAT(model.GetOutput(), ElementsAre(5, 4, 3, 0)); +} + +TEST(PowOpModel, NegativeValue) { + FloorDivModel<int32_t> model({TensorType_INT32, {1, 2, 2, 1}}, + {TensorType_INT32, {1, 2, 2, 1}}, + {TensorType_INT32, {}}); + model.PopulateTensor<int32_t>(model.input1(), {10, -9, -11, 7}); + model.PopulateTensor<int32_t>(model.input2(), {2, 2, -3, -4}); + model.Invoke(); + EXPECT_THAT(model.GetOutputShape(), ElementsAre(1, 2, 2, 1)); + EXPECT_THAT(model.GetOutput(), ElementsAre(5, -5, 3, -2)); +} + +TEST(PowOpModel, BroadcastFloorDiv) { + FloorDivModel<int32_t> model({TensorType_INT32, {1, 2, 2, 1}}, + {TensorType_INT32, {1}}, {TensorType_INT32, {}}); + model.PopulateTensor<int32_t>(model.input1(), {10, -9, -11, 7}); + model.PopulateTensor<int32_t>(model.input2(), {-3}); + model.Invoke(); + EXPECT_THAT(model.GetOutputShape(), ElementsAre(1, 2, 2, 1)); + EXPECT_THAT(model.GetOutput(), ElementsAre(-4, 3, 3, -3)); +} + +} // namespace +} // namespace tflite + +int main(int argc, char** argv) { + ::tflite::LogToStderr(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/tensorflow/contrib/lite/kernels/fully_connected.cc b/tensorflow/contrib/lite/kernels/fully_connected.cc index a486b81d76..f6d2f76dbe 100644 --- a/tensorflow/contrib/lite/kernels/fully_connected.cc +++ b/tensorflow/contrib/lite/kernels/fully_connected.cc @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#include <unistd.h> #include <cassert> #include <cmath> #include <cstdio> @@ -21,8 +20,8 @@ limitations under the License. #include <iostream> #include <limits> -#include "tensorflow/contrib/lite/builtin_op_data.h" -#include "tensorflow/contrib/lite/context.h" +#include "tensorflow/contrib/lite/c/builtin_op_data.h" +#include "tensorflow/contrib/lite/c/c_api_internal.h" #include "tensorflow/contrib/lite/kernels/activation_functor.h" #include "tensorflow/contrib/lite/kernels/gemm_support.h" #include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h" @@ -63,6 +62,7 @@ constexpr int kInputTensor = 0; constexpr int kWeightsTensor = 1; constexpr int kBiasTensor = 2; constexpr int kOutputTensor = 0; +constexpr int kShuffledInputWorkspaceTensor = 1; constexpr int kScratchBufferTensor = 1; void* Init(TfLiteContext* context, const char* buffer, size_t length) { @@ -70,7 +70,7 @@ void* Init(TfLiteContext* context, const char* buffer, size_t length) { // Instead, we allocate a new object to carry information from Prepare() to // Eval(). gemm_support::IncrementUsageCounter(context); - auto* op_data = new OpData; + auto* op_data = new OpData(); context->AddTensors(context, 1, &op_data->input_quantized_index); return op_data; } @@ -87,7 +87,11 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { // Check we have all the inputs and outputs we need. TF_LITE_ENSURE_EQ(context, node->inputs->size, 3); - TF_LITE_ENSURE_EQ(context, node->outputs->size, 1); + // Shuffled formats need a workspace to store the shuffled input activations. + const int expected_outputs_count = + params->weights_format == kTfLiteFullyConnectedWeightsFormatDefault ? 1 + : 2; + TF_LITE_ENSURE_EQ(context, node->outputs->size, expected_outputs_count); const TfLiteTensor* input = GetInput(context, node, kInputTensor); const TfLiteTensor* filter = GetInput(context, node, kWeightsTensor); @@ -101,16 +105,15 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { input_size *= input->dims->data[i]; } + TF_LITE_ENSURE_EQ(context, NumDimensions(filter), 2); const int batch_size = input_size / filter->dims->data[1]; const int num_units = filter->dims->data[0]; - TF_LITE_ASSERT_EQ(input_size, batch_size * filter->dims->data[1]); + TF_LITE_ENSURE_EQ(context, input_size, batch_size * filter->dims->data[1]); if (bias) { TF_LITE_ENSURE_EQ(context, NumElements(bias), SizeOfDimension(filter, 0)); } - TF_LITE_ENSURE_EQ(context, NumDimensions(filter), 2); - // Note that quantized inference requires that all tensors have their // parameters set. This is usually done during quantized training. 
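  // Sketch of the quantized path a few lines below (helper behavior summarized
  // here, not shown in this diff): GetQuantizedConvolutionMultipler() yields
  // real_multiplier, typically input_scale * filter_scale / output_scale, and
  // QuantizeMultiplier() re-expresses it as a Q31 fixed-point multiplier plus
  // a power-of-two exponent; the kernel stores the negated exponent in
  // data->output_shift.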
TfLiteType data_type = input->type; @@ -118,11 +121,12 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { double real_multiplier = 0.0; TF_LITE_ENSURE_STATUS(GetQuantizedConvolutionMultipler( context, input, filter, bias, output, &real_multiplier)); - QuantizeMultiplierSmallerThanOne(real_multiplier, &data->output_multiplier, - &data->output_shift); - CalculateActivationRangeUint8(params->activation, output, - &data->output_activation_min, - &data->output_activation_max); + int exponent; + QuantizeMultiplier(real_multiplier, &data->output_multiplier, &exponent); + data->output_shift = -exponent; + TF_LITE_ENSURE_STATUS(CalculateActivationRangeQuantized( + context, params->activation, output, &data->output_activation_min, + &data->output_activation_max)); } // If we have to perform on-the-fly quantization (with quantized weights and @@ -218,11 +222,8 @@ TfLiteStatus EvalPieQuantized(TfLiteContext* context, TfLiteNode* node, tensor_utils::ZeroVector(output->data.f, batch_size * num_units); } - // TODO(mirkov): change std::minmax_element with a vectorized call. - auto minmax_element = - std::minmax_element(input->data.f, input->data.f + total_input_size); // Save matrix multiplication computation for all zero input. - if (*minmax_element.first == 0.0 && *minmax_element.second == 0.0) { + if (tensor_utils::IsZeroVector(input->data.f, total_input_size)) { tensor_utils::ApplyActivationToVector(output->data.f, batch_size * num_units, params->activation, output->data.f); @@ -280,30 +281,57 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node, int32_t input_offset = -input->params.zero_point; int32_t filter_offset = -filter->params.zero_point; int32_t output_offset = output->params.zero_point; -#define TF_LITE_FULLY_CONNECTED(type) \ - type::FullyConnected( \ - GetTensorData<uint8_t>(input), GetTensorDims(input), input_offset, \ - GetTensorData<uint8_t>(filter), GetTensorDims(filter), filter_offset, \ - GetTensorData<int32_t>(bias), GetTensorDims(bias), output_offset, \ - data->output_multiplier, data->output_shift, \ - data->output_activation_min, data->output_activation_max, \ - GetTensorData<uint8_t>(output), GetTensorDims(output), gemm_context) +#define TF_LITE_FULLY_CONNECTED(type, output_data_type) \ + { \ + FullyConnectedParams op_params; \ + op_params.input_offset = input_offset; \ + op_params.weights_offset = filter_offset; \ + op_params.output_offset = output_offset; \ + op_params.output_multiplier = data->output_multiplier; \ + op_params.output_shift = -data->output_shift; \ + op_params.quantized_activation_min = data->output_activation_min; \ + op_params.quantized_activation_max = data->output_activation_max; \ + type::FullyConnected( \ + op_params, GetTensorShape(input), GetTensorData<uint8_t>(input), \ + GetTensorShape(filter), GetTensorData<uint8_t>(filter), \ + GetTensorShape(bias), GetTensorData<int32_t>(bias), \ + GetTensorShape(output), GetTensorData<output_data_type>(output), \ + gemm_context); \ + } if (kernel_type == kReference) { - TF_LITE_FULLY_CONNECTED(reference_ops); - } else if (kernel_type == kPie) { - if (input->type == kTfLiteFloat32) { - // Pie currently only supports quantized models and float inputs/outputs. - TfLiteTensor* input_quantized = - &context->tensors[node->temporaries->data[0]]; - return EvalPieQuantized(context, node, params, data, input, filter, bias, - input_quantized, output); - } else { - // TODO(ahentz): we don't have a quantized version of the PIE kernels, so - // we just defer to the MINI ones. 
- TF_LITE_FULLY_CONNECTED(optimized_ops); + switch (output->type) { + case kTfLiteUInt8: + TF_LITE_FULLY_CONNECTED(reference_ops, uint8_t); + break; + case kTfLiteInt16: + TF_LITE_FULLY_CONNECTED(reference_ops, int16_t); + break; + default: + context->ReportError( + context, + "Quantized FullyConnected expects output data type uint8 or int16"); + return kTfLiteError; } + } else if (kernel_type == kPie && input->type == kTfLiteFloat32) { + // Pie currently only supports quantized models and float inputs/outputs. + TfLiteTensor* input_quantized = + &context->tensors[node->temporaries->data[0]]; + return EvalPieQuantized(context, node, params, data, input, filter, bias, + input_quantized, output); } else { - TF_LITE_FULLY_CONNECTED(optimized_ops); + switch (output->type) { + case kTfLiteUInt8: + TF_LITE_FULLY_CONNECTED(optimized_ops, uint8_t); + break; + case kTfLiteInt16: + TF_LITE_FULLY_CONNECTED(optimized_ops, int16_t); + break; + default: + context->ReportError( + context, + "Quantized FullyConnected expects output data type uint8 or int16"); + return kTfLiteError; + } } #undef TF_LITE_FULLY_CONNECTED @@ -311,19 +339,67 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node, } template <KernelType kernel_type> +TfLiteStatus EvalShuffledQuantized(TfLiteContext* context, TfLiteNode* node, + TfLiteFullyConnectedParams* params, + OpData* data, const TfLiteTensor* input, + const TfLiteTensor* filter, + const TfLiteTensor* bias, + TfLiteTensor* output, + TfLiteTensor* shuffled_input_workspace) { + gemmlowp::GemmContext* gemm_context = gemm_support::GetFromContext(context); + + // TODO(b/110697972) decide more consistently if / how / where we want + // to perform this kind of runtime data type checks. + if (input->type != kTfLiteUInt8 || filter->type != kTfLiteUInt8 || + bias->type != kTfLiteInt32 || output->type != kTfLiteInt16 || + shuffled_input_workspace->type != kTfLiteUInt8) { + context->ReportError(context, "Unexpected data type"); + return kTfLiteError; + } + +#define TF_LITE_SHUFFLED_FULLY_CONNECTED(type) \ + { \ + FullyConnectedParams op_params; \ + op_params.output_multiplier = data->output_multiplier; \ + op_params.output_shift = -data->output_shift; \ + op_params.quantized_activation_min = data->output_activation_min; \ + op_params.quantized_activation_max = data->output_activation_max; \ + type::ShuffledFullyConnected( \ + op_params, GetTensorShape(input), GetTensorData<uint8_t>(input), \ + GetTensorShape(filter), GetTensorData<uint8_t>(filter), \ + GetTensorShape(bias), GetTensorData<int32_t>(bias), \ + GetTensorShape(output), GetTensorData<int16_t>(output), \ + GetTensorData<uint8_t>(shuffled_input_workspace), gemm_context); \ + } + if (kernel_type == kReference) { + TF_LITE_SHUFFLED_FULLY_CONNECTED(reference_ops); + } else { + TF_LITE_SHUFFLED_FULLY_CONNECTED(optimized_ops); + } +#undef TF_LITE_SHUFFLED_FULLY_CONNECTED + + return kTfLiteOk; +} + +template <KernelType kernel_type> TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node, TfLiteFullyConnectedParams* params, OpData* data, const TfLiteTensor* input, const TfLiteTensor* filter, const TfLiteTensor* bias, TfLiteTensor* output) { float output_activation_min, output_activation_max; - CalculateActivationRangeFloat(params->activation, &output_activation_min, - &output_activation_max); -#define TF_LITE_FULLY_CONNECTED(type) \ - type::FullyConnected(GetTensorData<float>(input), GetTensorDims(input), \ - GetTensorData<float>(filter), GetTensorDims(filter), \ - GetTensorData<float>(bias), 
GetTensorDims(bias), \ - output_activation_min, output_activation_max, \ - GetTensorData<float>(output), GetTensorDims(output)) + CalculateActivationRange(params->activation, &output_activation_min, + &output_activation_max); +#define TF_LITE_FULLY_CONNECTED(type) \ + { \ + FullyConnectedParams op_params; \ + op_params.float_activation_min = output_activation_min; \ + op_params.float_activation_max = output_activation_max; \ + type::FullyConnected(op_params, GetTensorShape(input), \ + GetTensorData<float>(input), GetTensorShape(filter), \ + GetTensorData<float>(filter), GetTensorShape(bias), \ + GetTensorData<float>(bias), GetTensorShape(output), \ + GetTensorData<float>(output)); \ + } if (kernel_type == kReference) { TF_LITE_FULLY_CONNECTED(reference_ops); } else if (kernel_type == kPie) { @@ -354,10 +430,25 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { return EvalFloat<kernel_type>(context, node, params, data, input, filter, bias, output); case kTfLiteUInt8: - return EvalQuantized<kernel_type>(context, node, params, data, input, - filter, bias, output); + if (params->weights_format == + kTfLiteFullyConnectedWeightsFormatShuffled4x16Int8) { + TfLiteTensor* shuffled_input_workspace = + GetOutput(context, node, kShuffledInputWorkspaceTensor); + return EvalShuffledQuantized<kernel_type>(context, node, params, data, + input, filter, bias, output, + shuffled_input_workspace); + } else if (params->weights_format == + kTfLiteFullyConnectedWeightsFormatDefault) { + return EvalQuantized<kernel_type>(context, node, params, data, input, + filter, bias, output); + } else { + context->ReportError(context, + "Unhandled fully-connected weights format"); + return kTfLiteError; + } default: - context->ReportError(context, "Type not currently supported."); + context->ReportError(context, "Type %d not currently supported.", + filter->type); return kTfLiteError; } return kTfLiteOk; diff --git a/tensorflow/contrib/lite/kernels/fully_connected_test.cc b/tensorflow/contrib/lite/kernels/fully_connected_test.cc index 05dd028b48..08b4320946 100644 --- a/tensorflow/contrib/lite/kernels/fully_connected_test.cc +++ b/tensorflow/contrib/lite/kernels/fully_connected_test.cc @@ -15,6 +15,7 @@ limitations under the License. // Unit test for TFLite FULLY_CONNECTED op. #include <iomanip> +#include <random> #include <vector> #include <gmock/gmock.h> @@ -133,9 +134,12 @@ static float fully_connected_golden_output[] = { class BaseFullyConnectedOpModel : public SingleOpModel { public: // TODO(ahentz): test different activation types too. 
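Stripped of the FullyConnectedParams plumbing, the float path that both the reference and optimized kernels implement is a batched matrix-vector product followed by the fused activation clamp computed via CalculateActivationRange. A naive sketch under the same layout assumption (weights stored row-major as [num_units, input_depth]); this is an illustration, not the library implementation:

#include <algorithm>

// For every batch b and output unit o: accumulate the dot product of the
// input row with the weight row, add the bias, and clamp to the activation
// range.
inline void FullyConnectedFloatSketch(const float* input, const float* weights,
                                      const float* bias, float* output,
                                      int batch_size, int input_depth,
                                      int num_units, float activation_min,
                                      float activation_max) {
  for (int b = 0; b < batch_size; ++b) {
    for (int o = 0; o < num_units; ++o) {
      float acc = (bias != nullptr) ? bias[o] : 0.0f;
      for (int i = 0; i < input_depth; ++i) {
        acc += input[b * input_depth + i] * weights[o * input_depth + i];
      }
      output[b * num_units + o] =
          std::min(std::max(acc, activation_min), activation_max);
    }
  }
}

The expected_output_data loop in the new int16 test below performs essentially the same computation in float before comparing against the dequantized kernel output.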
- BaseFullyConnectedOpModel(TfLiteRegistration* registration, int units, - int batches, const TensorData& input, - const TensorData& output = {TensorType_FLOAT32}) + BaseFullyConnectedOpModel( + TfLiteRegistration* registration, int units, int batches, + const TensorData& input, const TensorData& output = {TensorType_FLOAT32}, + ActivationFunctionType activation_func = ActivationFunctionType_RELU, + FullyConnectedOptionsWeightsFormat weights_format = + FullyConnectedOptionsWeightsFormat_DEFAULT) : batches_(batches), units_(units) { int total_input_size = 1; for (int i = 0; i < input.shape.size(); ++i) { @@ -159,10 +163,13 @@ class BaseFullyConnectedOpModel : public SingleOpModel { } output_ = AddOutput(output); + if (weights_format != FullyConnectedOptionsWeightsFormat_DEFAULT) { + AddOutput({TensorType_UINT8, input.shape}); + } SetBuiltinOp( BuiltinOperator_FULLY_CONNECTED, BuiltinOptions_FullyConnectedOptions, - CreateFullyConnectedOptions(builder_, ActivationFunctionType_RELU) + CreateFullyConnectedOptions(builder_, activation_func, weights_format) .Union()); resolver_ = absl::make_unique<SingleOpResolver>( BuiltinOperator_FULLY_CONNECTED, registration); @@ -188,13 +195,11 @@ class FloatFullyConnectedOpModel : public BaseFullyConnectedOpModel { public: using BaseFullyConnectedOpModel::BaseFullyConnectedOpModel; - void SetBias(std::initializer_list<float> f) { PopulateTensor(bias_, f); } + void SetBias(const std::vector<float>& f) { PopulateTensor(bias_, f); } - void SetWeights(std::initializer_list<float> f) { - PopulateTensor(weights_, f); - } + void SetWeights(const std::vector<float>& f) { PopulateTensor(weights_, f); } - void SetInput(std::initializer_list<float> data) { + void SetInput(const std::vector<float>& data) { PopulateTensor(input_, data); } void SetInput(int offset, float* begin, float* end) { @@ -208,20 +213,50 @@ class QuantizedFullyConnectedOpModel : public BaseFullyConnectedOpModel { public: using BaseFullyConnectedOpModel::BaseFullyConnectedOpModel; - void SetBias(std::initializer_list<float> data) { + void SetBias(const std::vector<float>& data) { QuantizeAndPopulate<int32_t>(bias_, data); } - void SetWeights(std::initializer_list<float> data) { + void SetWeights(const std::vector<float>& data) { QuantizeAndPopulate<uint8_t>(weights_, data); } - void SetInput(std::initializer_list<float> data) { + void ShuffleAndSetWeights(const std::vector<float>& data, int input_depth, + int output_depth) { + std::vector<float> shuffled_data(data.size()); + CHECK_EQ(input_depth % 16, 0); + CHECK_EQ(output_depth % 4, 0); + float* shuffled_data_ptr = shuffled_data.data(); + for (int block_o = 0; block_o < output_depth; block_o += 4) { + for (int block_i = 0; block_i < input_depth; block_i += 16) { + for (int o = 0; o < 4; o++) { + for (int i = 0; i < 16; i++) { + *shuffled_data_ptr++ = + data[(block_o + o) * input_depth + block_i + i]; + } + } + } + } + TfLiteTensor* t = interpreter_->tensor(weights_); + auto quantized_data = + Quantize<uint8_t>(shuffled_data, t->params.scale, t->params.zero_point); + for (uint8_t& q : quantized_data) { + q ^= 0x80; + } + PopulateTensor(weights_, 0, quantized_data.data(), + quantized_data.data() + quantized_data.size()); + } + void SetInput(const std::vector<float>& data) { QuantizeAndPopulate<uint8_t>(input_, data); } - std::vector<uint8_t> GetOutput() { return ExtractVector<uint8_t>(output_); } + template <typename T> + std::vector<T> GetOutput() { + return ExtractVector<T>(output_); + } + + template <typename T> std::vector<float> 
GetDequantizedOutput() { - return Dequantize<uint8_t>(ExtractVector<uint8_t>(output_), - GetScale(output_), GetZeroPoint(output_)); + return Dequantize<T>(ExtractVector<T>(output_), GetScale(output_), + GetZeroPoint(output_)); } }; @@ -256,12 +291,12 @@ class HybridFullyConnectedOpModel : public SingleOpModel { ops::builtin::Register_FULLY_CONNECTED_PIE()); BuildInterpreter({GetShape(input_), GetShape(weights_), GetShape(bias_)}); } - void SetBias(std::initializer_list<float> f) { PopulateTensor(bias_, f); } - void SetWeights(std::initializer_list<float> data) { + void SetBias(const std::vector<float>& f) { PopulateTensor(bias_, f); } + void SetWeights(const std::vector<float>& data) { SymmetricQuantizeAndPopulate(weights_, data); } - void SetInput(std::initializer_list<float> f) { PopulateTensor(input_, f); } + void SetInput(const std::vector<float>& f) { PopulateTensor(input_, f); } std::vector<float> GetOutput() { return ExtractVector<float>(output_); } int input_size() { return input_size_; } @@ -340,6 +375,24 @@ TEST_P(FloatFullyConnectedOpTest, SimpleTest) { EXPECT_THAT(m.GetOutput(), ElementsAre(24, 25, 26, 58, 59, 60)); } +TEST_P(FloatFullyConnectedOpTest, SimpleTest2) { + FloatFullyConnectedOpModel m(GetRegistration(), /*units=*/1, /*batches=*/2, + /*input=*/{TensorType_FLOAT32, {2, 2}}); + m.SetWeights({ + 2, 4, // u = 0 + }); + m.SetBias({1}); + + m.SetInput({ + 1, 2, // b = 0 + 2, 1, // b = 1 + }); + + m.Invoke(); + + EXPECT_THAT(m.GetOutput(), ElementsAre(11, 9)); +} + TEST_P(QuantizedFullyConnectedOpTest, SimpleTestQuantized) { QuantizedFullyConnectedOpModel m( GetRegistration(), /*units=*/3, /*batches*/ 2, @@ -350,7 +403,38 @@ TEST_P(QuantizedFullyConnectedOpTest, SimpleTestQuantized) { m.SetWeights({ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 0 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 1 + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 2 + }); + m.SetBias({1, 2, 3}); + + m.SetInput({ + 1, 2, 3, 4, 5, 6, 7, 8, -9, -10, // b = 0 + 1, 2, 3, 4, 5, 6, 7, -8, 9, -10, // b = 1 + }); + + m.Invoke(); + + EXPECT_THAT(m.GetDequantizedOutput<uint8_t>(), + ElementsAreArray(ArrayFloatNear({ + 24, 25, 26, // + 58, 59, 60, // + }))); + EXPECT_THAT(m.GetOutput<uint8_t>(), + ElementsAre(151, 152, 153, 185, 186, 187)); +} + +TEST_P(QuantizedFullyConnectedOpTest, + SimpleTestQuantizedOutputMultiplierGreaterThan1) { + // real_multiplier = 2. 
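The raw uint8 expectations in these tests follow from the affine quantization used by the test harness, real_value = scale * (quantized - zero_point), assuming the min/max pairs map linearly onto the 256 uint8 levels. An output range of [-127, 128] gives scale 1.0 and zero point 127, so the dequantized outputs 24..60 correspond to the raw values 151..187 asserted above; the "real_multiplier = 2" test narrows the output range to [-63.5, 64], giving scale 0.5 and zero point 127, which makes the effective multiplier input_scale * filter_scale / output_scale = 1.0 * 1.0 / 0.5 = 2 and maps the same real outputs to 175..243. A tiny sketch of that arithmetic (hypothetical helper, not the test code):

#include <cmath>
#include <cstdint>

// Quantize a real value with the affine scheme assumed above.
inline uint8_t AffineQuantize(float real, float scale, int zero_point) {
  return static_cast<uint8_t>(std::lround(real / scale) + zero_point);
}

// scale 1.0, zero point 127: AffineQuantize(24.0f, 1.0f, 127) == 151
// scale 0.5, zero point 127: AffineQuantize(24.0f, 0.5f, 127) == 175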
+ QuantizedFullyConnectedOpModel m( + GetRegistration(), /*units=*/3, /*batches*/ 2, + /*input=*/{TensorType_UINT8, {2, 10}, -127, 128}, + /*output=*/{TensorType_UINT8, {}, -63.5, 64}); + + m.SetWeights({ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 0 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 1 + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 2 }); m.SetBias({1, 2, 3}); @@ -361,11 +445,136 @@ TEST_P(QuantizedFullyConnectedOpTest, SimpleTestQuantized) { m.Invoke(); - EXPECT_THAT(m.GetDequantizedOutput(), ElementsAreArray(ArrayFloatNear({ - 24, 25, 26, // - 58, 59, 60, // - }))); - EXPECT_THAT(m.GetOutput(), ElementsAre(151, 152, 153, 185, 186, 187)); + EXPECT_THAT(m.GetDequantizedOutput<uint8_t>(), + ElementsAreArray(ArrayFloatNear({ + 24, 25, 26, // first batch + 58, 59, 60, // second batch + }))); + EXPECT_THAT(m.GetOutput<uint8_t>(), + ElementsAre(175, 177, 179, 243, 245, 247)); +} + +void SimpleTestQuantizedInt16OutputCase( + TfLiteRegistration* registration, int input_depth, int output_depth, + int batches, FullyConnectedOptionsWeightsFormat weights_format) { + const uint8_t kWeightsZeroPoint = 128; + const float kWeightsScale = 1.f / 128.f; + const uint8_t kInputZeroPoint = 128; + const float kInputScale = 1.f / 128.f; + const float kInputMin = (0 - kInputZeroPoint) * kInputScale; + const float kInputMax = (255 - kInputZeroPoint) * kInputScale; + // Output ranges in [-8..8] encoded as int16 + const float kOutputScale = 8.f / 32768.f; + const float kOutputMin = -32768 * kOutputScale; + const float kOutputMax = 32767 * kOutputScale; + + QuantizedFullyConnectedOpModel m( + registration, output_depth, batches, + /*input=*/ + {TensorType_UINT8, {batches, input_depth}, kInputMin, kInputMax}, + /*output=*/{TensorType_INT16, {}, kOutputMin, kOutputMax}, + /*activation_func=*/ActivationFunctionType_NONE, weights_format); + + std::mt19937 random_engine; + std::uniform_int_distribution<uint8_t> weights_dist; + + std::vector<float> weights_data(input_depth * output_depth); + for (auto& w : weights_data) { + uint8_t q = weights_dist(random_engine); + w = (q - kWeightsZeroPoint) * kWeightsScale; + } + + // Based on weights_format, enforce any shape requirement for that format/path + // and set the (possibly shuffled) weights. + switch (weights_format) { + case FullyConnectedOptionsWeightsFormat_DEFAULT: + m.SetWeights(weights_data); + break; + case FullyConnectedOptionsWeightsFormat_SHUFFLED4x16INT8: + // The shuffled path currently supports only a restrictive subset of + // shapes, described by the following assertions: + CHECK_EQ(input_depth % 16, 0); + CHECK_EQ(output_depth % 4, 0); + CHECK(batches == 1 || batches == 4); + m.ShuffleAndSetWeights(weights_data, input_depth, output_depth); + break; + default: + LOG(FATAL) << "Unhandled weights format"; + } + + std::uniform_int_distribution<uint8_t> input_dist; + std::vector<float> input_data(input_depth * batches); + for (auto& i : input_data) { + uint8_t q = input_dist(random_engine); + i = (q - kInputZeroPoint) * kInputScale; + } + + std::vector<float> bias_data(output_depth); + // As the output ranges in [-8, 8], it's reasonable to have bias values + // in [-1, 1], this won't result in too much saturation. 
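Two aspects of the shuffled-weights path exercised here are worth spelling out. First, ShuffleAndSetWeights above rearranges the row-major [output_depth, input_depth] weight matrix into consecutive 4 (output) x 16 (input) blocks, which is why input_depth must be a multiple of 16 and output_depth a multiple of 4. Second, the per-byte XOR with 0x80 turns a uint8 value quantized with zero point 128 into the bit pattern of the corresponding two's-complement int8 value, presumably so the SHUFFLED4x16INT8 kernel can consume the weights as signed bytes. A minimal sketch of the XOR step only (standalone, not the test helper):

#include <cassert>
#include <cstdint>

// XOR-ing the sign bit maps uint8-with-zero-point-128 onto int8:
// 128 -> 0, 255 -> 127, 0 -> -128.
inline int8_t Uint8ToInt8ViaXor(uint8_t q) {
  return static_cast<int8_t>(q ^ 0x80);
}

inline void CheckXorMapping() {
  assert(Uint8ToInt8ViaXor(128) == 0);
  assert(Uint8ToInt8ViaXor(255) == 127);
  assert(Uint8ToInt8ViaXor(0) == -128);
}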
+ std::uniform_real_distribution<float> bias_dist(-1.f, 1.f); + for (auto& b : bias_data) { + b = bias_dist(random_engine); + } + + m.SetBias(bias_data); + m.SetInput(input_data); + + m.Invoke(); + + std::vector<float> expected_output_data(output_depth * batches); + for (int b = 0; b < batches; b++) { + for (int o = 0; o < output_depth; o++) { + float accum = bias_data[o]; + for (int i = 0; i < input_depth; i++) { + accum += + input_data[b * input_depth + i] * weights_data[o * input_depth + i]; + } + accum = std::min(accum, kOutputMax); + accum = std::max(accum, kOutputMin); + expected_output_data[b * output_depth + o] = accum; + } + } + + EXPECT_THAT(m.GetDequantizedOutput<int16_t>(), + ElementsAreArray(ArrayFloatNear(expected_output_data, 3e-4f))); +} + +TEST_P(QuantizedFullyConnectedOpTest, + SimpleTestQuantizedInt16OutputDefaultWeights) { + for (int input_depth : {1, 3, 10, 100}) { + for (int output_depth : {1, 3, 10, 100}) { + for (int batch : {1, 3, 10, 100}) { + SimpleTestQuantizedInt16OutputCase( + GetRegistration(), input_depth, output_depth, batch, + FullyConnectedOptionsWeightsFormat_DEFAULT); + } + } + } +} + +TEST_P(QuantizedFullyConnectedOpTest, + SimpleTestQuantizedInt16OutputShuffled4x16Int8Weights) { + // The shuffled weights block shape is 4x16. The shape of the weights matrix + // is: rows = output_depth, cols = input_depth. It must be a multiple of 4x16. + // This means that output_depth must be a multiple of 4, and input_depth must + // be a multiple of 16. + for (int input_depth_numblocks : {1, 3}) { + for (int output_depth_numblocks : {1, 3}) { + int input_depth = 16 * input_depth_numblocks; + int output_depth = 4 * output_depth_numblocks; + // The fast shuffled path currently supports only batch sizes of 1 + // and 4. The idea is that the whole point of that path is to go as fast + // as possible for small batch size, which requires fully specializing + // it for each batch size, and for larger batch sizes the generic + // gemmlowp-based implementation is fast enough. + for (int batch : {1, 4}) { + SimpleTestQuantizedInt16OutputCase( + GetRegistration(), input_depth, output_depth, batch, + FullyConnectedOptionsWeightsFormat_SHUFFLED4x16INT8); + } + } + } } TEST(HybridFullyConnectedOpTest, SimpleTestQuantized) { @@ -396,11 +605,11 @@ TEST(HybridFullyConnectedOpTest, SimpleTestQuantized) { /*max_abs_error=*/1.3f))); } -TEST(FloatFullyConnectedOpTest, SimpleTest4DInput) { +TEST_P(FloatFullyConnectedOpTest, SimpleTest4DInput) { // Note that it is not required that the first dimension be the number of // batches. All we care about is that the input can be evenly distributed in // batches. In this case, we need the input to have multiples of '2'.
- FloatFullyConnectedOpModel m(ops::builtin::Register_FULLY_CONNECTED_PIE(), + FloatFullyConnectedOpModel m(GetRegistration(), /*units=*/3, /*batches=*/2, /*input=*/{TensorType_FLOAT32, {4, 1, 5, 1}}); m.SetWeights({ @@ -444,11 +653,44 @@ TEST_P(QuantizedFullyConnectedOpTest, SimpleTest4dInputQuantized) { m.Invoke(); - EXPECT_THAT(m.GetDequantizedOutput(), ElementsAreArray(ArrayFloatNear({ - 24, 25, 26, // - 58, 59, 60, // - }))); - EXPECT_THAT(m.GetOutput(), ElementsAre(151, 152, 153, 185, 186, 187)); + EXPECT_THAT(m.GetDequantizedOutput<uint8_t>(), + ElementsAreArray(ArrayFloatNear({ + 24, 25, 26, // + 58, 59, 60, // + }))); + EXPECT_THAT(m.GetOutput<uint8_t>(), + ElementsAre(151, 152, 153, 185, 186, 187)); +} + +TEST_P(QuantizedFullyConnectedOpTest, + SimpleTest4dInputQuantizedOutputMultiplierGreaterThan1) { + // real_multiplier = 2. + QuantizedFullyConnectedOpModel m( + GetRegistration(), /*units=*/3, /*batches=*/2, + /*input=*/{TensorType_UINT8, {4, 1, 5, 1}, -127, 128}, + /*output=*/{TensorType_UINT8, {}, -63.5, 64}); + + m.SetWeights({ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 0 + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 1 + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 1 + }); + m.SetBias({1, 2, 3}); + + m.SetInput({ + 1, 2, 3, 4, 5, 6, 7, 8, -9, -10, // b = 0 + 1, 2, 3, 4, 5, 6, 7, -8, 9, -10, // b = 1 + }); + + m.Invoke(); + + EXPECT_THAT(m.GetDequantizedOutput<uint8_t>(), + ElementsAreArray(ArrayFloatNear({ + 24, 25, 26, // first batch + 58, 59, 60, // second batch + }))); + EXPECT_THAT(m.GetOutput<uint8_t>(), + ElementsAre(175, 177, 179, 243, 245, 247)); } INSTANTIATE_TEST_CASE_P( diff --git a/tensorflow/contrib/lite/kernels/gather.cc b/tensorflow/contrib/lite/kernels/gather.cc index c452d3ebac..b5afeb1a7b 100644 --- a/tensorflow/contrib/lite/kernels/gather.cc +++ b/tensorflow/contrib/lite/kernels/gather.cc @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ #include <string.h> -#include "tensorflow/contrib/lite/builtin_op_data.h" -#include "tensorflow/contrib/lite/context.h" +#include "tensorflow/contrib/lite/c/builtin_op_data.h" +#include "tensorflow/contrib/lite/c/c_api_internal.h" #include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h" #include "tensorflow/contrib/lite/kernels/internal/tensor.h" #include "tensorflow/contrib/lite/kernels/kernel_util.h" @@ -40,10 +40,8 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TfLiteTensor* output = GetOutput(context, node, kOutputTensor); // Only INT32 positions are supported. TF_LITE_ENSURE_EQ(context, positions->type, kTfLiteInt32); - // Check that input and output types match. - TF_LITE_ENSURE_EQ(context, input->type, output->type); - // TODO(mgubin): only 0D or 1D positions are currently supported. - TF_LITE_ENSURE(context, NumDimensions(positions) <= 1); + // Assign to output the input type. + output->type = input->type; // TODO(mgubin): Only default axis == 0 is supported. TF_LITE_ENSURE_EQ(context, params->axis, 0); // Check conditions for different types. 
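With the Prepare() change above, gather now simply propagates the input type to the output and keeps the axis == 0 restriction, so the core operation is: each int32 position selects one slice along the first input dimension. A naive sketch of the non-string case (illustration only; it does not reproduce the kernel's error handling):

#include <cstdint>
#include <cstring>

// Gather along axis 0: output row r is a copy of input row positions[r].
// slice_size is the element count of one axis-0 slice of the input.
template <typename T>
void GatherAxis0Sketch(const T* input, int input_rows, int slice_size,
                       const int32_t* positions, int num_positions, T* output) {
  for (int r = 0; r < num_positions; ++r) {
    const int32_t p = positions[r];
    if (p < 0 || p >= input_rows) continue;  // Real kernel behavior for bad indices not shown here.
    std::memcpy(output + r * slice_size, input + p * slice_size,
                slice_size * sizeof(T));
  }
}

// For example, input {1.0, 2.0, 3.0} with positions {1, 0} produces
// {2.0, 1.0}, as in the Test2DIndexWith2DResult case added to gather_test.cc
// below.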
@@ -59,8 +57,9 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE_EQ(context, NumDimensions(input), 1); } break; default: - context->ReportError(context, - "Only float32 and string types are supported"); + context->ReportError( + context, "Only float32 and string types are supported, got %d", + input->type); return kTfLiteError; } const int num_dimensions = @@ -85,11 +84,15 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { const TfLiteTensor* positions = GetInput(context, node, kInputPositions); TfLiteTensor* output = GetOutput(context, node, kOutputTensor); const int input_rank = NumDimensions(input); -#define TF_LITE_GATHER(data_type, index_type) \ - optimized_ops::Gather( \ - GetTensorData<data_type>(input), GetTensorDims(input), input_rank, \ - GetTensorData<index_type>(positions), GetTensorDims(positions), \ - GetTensorData<data_type>(output), GetTensorDims(output)); +#define TF_LITE_GATHER(data_type, index_type) \ + { \ + tflite::GatherParams op_params; \ + op_params.input_rank = input_rank; \ + optimized_ops::Gather( \ + op_params, GetTensorShape(input), GetTensorData<data_type>(input), \ + GetTensorShape(positions), GetTensorData<index_type>(positions), \ + GetTensorShape(output), GetTensorData<data_type>(output)); \ + } switch (input->type) { case kTfLiteFloat32: TF_LITE_GATHER(float, int32_t); @@ -101,6 +104,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { TF_LITE_GATHER(int32_t, int32_t); break; case kTfLiteString: { + // TODO(mgubin): Currently support only for 1D output tensors. DynamicBuffer buffer; const int32* indexes = positions->data.i32; const int num_strings = GetStringCount(input); diff --git a/tensorflow/contrib/lite/kernels/gather_test.cc b/tensorflow/contrib/lite/kernels/gather_test.cc index cdadbeda18..1b48884e09 100644 --- a/tensorflow/contrib/lite/kernels/gather_test.cc +++ b/tensorflow/contrib/lite/kernels/gather_test.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ #include <gtest/gtest.h> -#include "tensorflow/contrib/lite/builtin_op_data.h" +#include "tensorflow/contrib/lite/c/builtin_op_data.h" #include "tensorflow/contrib/lite/interpreter.h" #include "tensorflow/contrib/lite/kernels/register.h" #include "tensorflow/contrib/lite/kernels/test_util.h" @@ -96,6 +96,15 @@ TEST(GatherOpTest, Test0DIndexWith0DResult) { EXPECT_TRUE(m.GetOutputShape().empty()); } +TEST(GatherOpTest, Test2DIndexWith2DResult) { + GatherOpModel m({3}, TensorType_FLOAT32, {1, 2}); + m.SetInputFloat({1.0, 2.0, 3.0}); + m.SetPositions({1, 0}); + m.Invoke(); + EXPECT_THAT(m.GetOutputFloat(), ElementsAreArray(ArrayFloatNear({2.0, 1.0}))); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2})); +} + TEST(FloatGatherOpTest, Duplicate) { GatherOpModel m({1, 2, 2}, TensorType_FLOAT32, {2}); m.SetInputFloat({-2.0, 0.2, 0.7, 0.8}); diff --git a/tensorflow/contrib/lite/kernels/gemm_support.cc b/tensorflow/contrib/lite/kernels/gemm_support.cc index 95f45ea768..ed334af2da 100644 --- a/tensorflow/contrib/lite/kernels/gemm_support.cc +++ b/tensorflow/contrib/lite/kernels/gemm_support.cc @@ -14,57 +14,70 @@ limitations under the License. 
==============================================================================*/ #include "tensorflow/contrib/lite/kernels/gemm_support.h" +#include <memory> + #include "tensorflow/contrib/lite/kernels/op_macros.h" namespace tflite { namespace gemm_support { +namespace { -struct RefCountedGemmContext { - gemmlowp::GemmContext* gemm_context_ = nullptr; - int num_references_ = 0; +struct RefCountedGemmContext : public TfLiteExternalContext { + std::unique_ptr<gemmlowp::GemmContext> gemm_context; + int num_references = 0; }; +RefCountedGemmContext* GetGemmLowpContext(TfLiteContext* context) { + return reinterpret_cast<RefCountedGemmContext*>( + context->GetExternalContext(context, kTfLiteGemmLowpContext)); +} + +TfLiteStatus Refresh(TfLiteContext* context) { + auto* ptr = GetGemmLowpContext(context); + if (ptr != nullptr) { + ptr->gemm_context->set_max_num_threads(context->recommended_num_threads); + } + return kTfLiteOk; +} + +} // namespace + void IncrementUsageCounter(TfLiteContext* context) { - auto* ptr = reinterpret_cast<RefCountedGemmContext*>(context->gemm_context); + auto* ptr = GetGemmLowpContext(context); if (ptr == nullptr) { ptr = new RefCountedGemmContext; - ptr->gemm_context_ = new gemmlowp::GemmContext(); + ptr->type = kTfLiteGemmLowpContext; + ptr->Refresh = Refresh; + ptr->gemm_context.reset(new gemmlowp::GemmContext()); if (context->recommended_num_threads != -1) { - ptr->gemm_context_->set_max_num_threads(context->recommended_num_threads); + ptr->gemm_context->set_max_num_threads(context->recommended_num_threads); } - ptr->num_references_ = 0; - context->gemm_context = ptr; + ptr->num_references = 0; + context->SetExternalContext(context, kTfLiteGemmLowpContext, ptr); } - ptr->num_references_++; + ptr->num_references++; } void DecrementUsageCounter(TfLiteContext* context) { - auto* ptr = reinterpret_cast<RefCountedGemmContext*>(context->gemm_context); + auto* ptr = GetGemmLowpContext(context); if (ptr == nullptr) { TF_LITE_FATAL( "Call to DecrementUsageCounter() not preceded by " "IncrementUsageCounter()"); } - if (--ptr->num_references_ == 0) { - delete ptr->gemm_context_; + if (--ptr->num_references == 0) { delete ptr; - context->gemm_context = nullptr; + context->SetExternalContext(context, kTfLiteGemmLowpContext, nullptr); } } gemmlowp::GemmContext* GetFromContext(TfLiteContext* context) { - auto* ptr = reinterpret_cast<RefCountedGemmContext*>(context->gemm_context); + auto* ptr = GetGemmLowpContext(context); if (ptr == nullptr) { TF_LITE_FATAL( "Call to GetFromContext() not preceded by IncrementUsageCounter()"); } - return ptr->gemm_context_; -} - -void SetNumThreads(TfLiteContext* context, int num_threads) { - IncrementUsageCounter(context); - GetFromContext(context)->set_max_num_threads(num_threads); - DecrementUsageCounter(context); + return ptr->gemm_context.get(); } } // namespace gemm_support diff --git a/tensorflow/contrib/lite/kernels/gemm_support.h b/tensorflow/contrib/lite/kernels/gemm_support.h index f033501cb6..43cd2b3055 100644 --- a/tensorflow/contrib/lite/kernels/gemm_support.h +++ b/tensorflow/contrib/lite/kernels/gemm_support.h @@ -16,7 +16,7 @@ limitations under the License. #define TENSORFLOW_CONTRIB_LITE_KERNELS_GEMM_SUPPORT_H_ #include "public/gemmlowp.h" -#include "tensorflow/contrib/lite/context.h" +#include "tensorflow/contrib/lite/c/c_api_internal.h" namespace tflite { namespace gemm_support { @@ -45,9 +45,6 @@ void IncrementUsageCounter(TfLiteContext* context); // 'context'. If there are no more usages the GemmContext will be deleted. 
void DecrementUsageCounter(TfLiteContext* context); -// Set the number of threads that can be used by gemmlowp. -void SetNumThreads(TfLiteContext* context, int num_threads); - } // namespace gemm_support } // namespace tflite diff --git a/tensorflow/contrib/lite/kernels/hashtable_lookup.cc b/tensorflow/contrib/lite/kernels/hashtable_lookup.cc index 41211d41aa..c0b3c3c0c5 100644 --- a/tensorflow/contrib/lite/kernels/hashtable_lookup.cc +++ b/tensorflow/contrib/lite/kernels/hashtable_lookup.cc @@ -31,7 +31,6 @@ limitations under the License. // Each item indicates whether the corresponding lookup has a returned value. // 0 for missing key, 1 for found key. -#include <unistd.h> #include <cassert> #include <cmath> #include <cstdio> @@ -40,8 +39,8 @@ limitations under the License. #include <iostream> #include <limits> -#include "tensorflow/contrib/lite/builtin_op_data.h" -#include "tensorflow/contrib/lite/context.h" +#include "tensorflow/contrib/lite/c/builtin_op_data.h" +#include "tensorflow/contrib/lite/c/c_api_internal.h" #include "tensorflow/contrib/lite/kernels/kernel_util.h" #include "tensorflow/contrib/lite/kernels/op_macros.h" #include "tensorflow/contrib/lite/string_util.h" diff --git a/tensorflow/contrib/lite/kernels/internal/BUILD b/tensorflow/contrib/lite/kernels/internal/BUILD index aabbb0685c..afb5ec05df 100644 --- a/tensorflow/contrib/lite/kernels/internal/BUILD +++ b/tensorflow/contrib/lite/kernels/internal/BUILD @@ -43,6 +43,10 @@ cc_library( "compatibility.h", "types.h", ], + deps = [ + "//tensorflow/contrib/lite/kernels:op_macros", + "@com_google_absl//absl/base:core_headers", + ], ) config_setting( @@ -160,9 +164,45 @@ cc_library( ":types", ":reference_base", ":round", + ":tensor_utils", + "//third_party/eigen3", + "@gemmlowp", + "//tensorflow/contrib/lite/c:c_api_internal", + ] + select({ + ":haswell": tflite_deps_intel, + ":ios_x86_64": tflite_deps_intel, + ":k8": tflite_deps_intel, + ":x86": tflite_deps_intel, + ":x86_64": tflite_deps_intel, + ":darwin": tflite_deps_intel, + ":darwin_x86_64": tflite_deps_intel, + ":freebsd": tflite_deps_intel, + "//conditions:default": [], + }), +) + +cc_library( + name = "legacy_optimized_base", + srcs = [], + hdrs = [ + "common.h", + "optimized/depthwiseconv_float.h", + "optimized/depthwiseconv_uint8.h", + "optimized/depthwiseconv_uint8_3x3_filter.h", + "optimized/legacy_optimized_ops.h", + "optimized/optimized_ops.h", + ], + copts = tflite_copts(), + deps = [ + ":quantization_util", + ":strided_slice_logic", + ":tensor_utils", + ":types", + ":legacy_reference_base", + ":round", "//third_party/eigen3", "@gemmlowp", - "//tensorflow/contrib/lite:builtin_op_data", + "//tensorflow/contrib/lite/c:c_api_internal", ] + select({ ":haswell": tflite_deps_intel, ":ios_x86_64": tflite_deps_intel, @@ -184,13 +224,15 @@ cc_library( "optimized/eigen_spatial_convolutions.h", "optimized/eigen_tensor_reduced_instantiations_oss.h", "optimized/multithreaded_conv.h", + # FIXME(petewarden) - This should be removed, since it's a header from the + # :tensor dependency below. 
"tensor.h", ], deps = [ ":optimized_base", + ":tensor", ":types", - "//tensorflow/contrib/lite:builtin_op_data", - "//tensorflow/contrib/lite:context", + "//tensorflow/contrib/lite/c:c_api_internal", "//third_party/eigen3", ], ) @@ -198,8 +240,9 @@ cc_library( cc_test( name = "tensor_test", srcs = ["tensor_test.cc"], + tags = ["no_oss"], deps = [ - ":reference", + ":tensor", "@com_google_googletest//:gtest", ], ) @@ -220,12 +263,14 @@ cc_library( deps = [ ":round", ":types", + "//tensorflow/contrib/lite/kernels:op_macros", ], ) cc_test( name = "quantization_util_test", srcs = ["quantization_util_test.cc"], + tags = ["no_oss"], deps = [ ":quantization_util", "@com_google_googletest//:gtest", @@ -250,16 +295,18 @@ cc_library( "common.h", "reference/depthwiseconv_float.h", "reference/depthwiseconv_uint8.h", + "reference/fully_connected.h", "reference/reference_ops.h", + "reference/softmax.h", ], deps = [ ":quantization_util", ":round", ":strided_slice_logic", ":types", - "//third_party/eigen3", "@gemmlowp", - "//tensorflow/contrib/lite:builtin_op_data", + "//tensorflow/contrib/lite/c:c_api_internal", + "//tensorflow/contrib/lite/kernels:op_macros", ] + select({ ":haswell": tflite_deps_intel, ":ios_x86_64": tflite_deps_intel, @@ -274,11 +321,60 @@ cc_library( ) cc_library( + name = "legacy_reference_base", + srcs = [], + hdrs = [ + "common.h", + "reference/depthwiseconv_float.h", + "reference/depthwiseconv_uint8.h", + "reference/fully_connected.h", + "reference/legacy_reference_ops.h", + "reference/reference_ops.h", + "reference/softmax.h", + ], + deps = [ + ":quantization_util", + ":round", + ":strided_slice_logic", + ":types", + "@gemmlowp", + "//tensorflow/contrib/lite/c:c_api_internal", + "//tensorflow/contrib/lite/kernels:op_macros", + ] + select({ + ":haswell": tflite_deps_intel, + ":ios_x86_64": tflite_deps_intel, + ":k8": tflite_deps_intel, + ":x86": tflite_deps_intel, + ":x86_64": tflite_deps_intel, + ":darwin": tflite_deps_intel, + ":darwin_x86_64": tflite_deps_intel, + ":freebsd": tflite_deps_intel, + "//conditions:default": [], + }), +) + +cc_library( + name = "tensor", + hdrs = [ + "tensor.h", + "tensor_ctypes.h", + ], + deps = [ + ":types", + "//tensorflow/contrib/lite/c:c_api_internal", + ], +) + +# Deprecated version of :tensor, kept for backwards compatibility. 
+cc_library( name = "reference", - hdrs = ["tensor.h"], + hdrs = [ + "tensor.h", + "tensor_ctypes.h", + ], deps = [ ":types", - "//tensorflow/contrib/lite:context", + "//tensorflow/contrib/lite/c:c_api_internal", ], ) @@ -292,7 +388,7 @@ cc_library( ], deps = [ ":round", - "//tensorflow/contrib/lite:builtin_op_data", + "//tensorflow/contrib/lite/c:c_api_internal", "//tensorflow/contrib/lite/kernels:activation_functor", "//tensorflow/contrib/lite/kernels:op_macros", ], @@ -317,7 +413,7 @@ cc_library( ":cpu_check", ":round", ":types", - "//tensorflow/contrib/lite:builtin_op_data", + "//tensorflow/contrib/lite/c:c_api_internal", "//tensorflow/contrib/lite/kernels:activation_functor", "//tensorflow/contrib/lite/kernels:op_macros", "@arm_neon_2_x86_sse", @@ -331,7 +427,7 @@ cc_library( hdrs = ["kernel_utils.h"], deps = [ ":tensor_utils", - "//tensorflow/contrib/lite:builtin_op_data", + "//tensorflow/contrib/lite/c:c_api_internal", ], ) @@ -373,9 +469,10 @@ cc_library( ], copts = NEON_FLAGS_IF_APPLICABLE, deps = [ - "//tensorflow/contrib/lite/kernels:activation_functor", - "//tensorflow/contrib/lite:builtin_op_data", + "@com_google_absl//absl/base:core_headers", + "//tensorflow/contrib/lite/c:c_api_internal", "@arm_neon_2_x86_sse", + "//tensorflow/contrib/lite/kernels:op_macros", "@gemmlowp", ] + select({ ":arm": [ @@ -414,12 +511,25 @@ cc_library( ":darwin": [ ":neon_tensor_utils", ], + ":darwin_x86_64": [ + ":neon_tensor_utils", + ], "//conditions:default": [ ":portable_tensor_utils", ], }), ) +cc_library( + name = "test_util", + srcs = ["test_util.cc"], + hdrs = ["test_util.h"], + deps = [ + ":types", + "//tensorflow/contrib/lite:string", + ], +) + cc_test( name = "tensor_utils_test", srcs = ["tensor_utils_test.cc"], @@ -431,15 +541,112 @@ cc_test( "//conditions:default": [], }), linkstatic = 1, - tags = ["tflite_not_portable_ios"], + tags = [ + "no_oss", + "tflite_not_portable_ios", + ], deps = [ ":tensor_utils", - "//tensorflow/contrib/lite:builtin_op_data", + "//tensorflow/contrib/lite/c:c_api_internal", "//tensorflow/contrib/lite/kernels:test_util", "@com_google_googletest//:gtest_main", ], ) +cc_test( + name = "depthwiseconv_float_test", + srcs = ["depthwiseconv_float_test.cc"], + tags = ["no_oss"], + deps = [ + ":optimized_base", + ":reference_base", + ":test_util", + ":types", + "@com_google_googletest//:gtest_main", + ], +) + +cc_test( + name = "depthwiseconv_quantized_test", + srcs = ["depthwiseconv_quantized_test.cc"], + tags = [ + "no_oss", + "tflite_not_portable_ios", + ], + deps = [ + ":optimized_base", + ":reference_base", + ":test_util", + ":types", + "@com_google_googletest//:gtest_main", + ], +) + +cc_test( + name = "resize_bilinear_test", + srcs = ["resize_bilinear_test.cc"], + tags = [ + "no_oss", + "tflite_not_portable", + ], + deps = [ + ":optimized_base", + ":reference_base", + ":test_util", + ":types", + "@com_google_googletest//:gtest_main", + ], +) + +cc_test( + name = "softmax_quantized_test", + timeout = "long", + srcs = [ + "softmax_quantized_test.cc", + ], + tags = ["no_oss"], + deps = [ + ":optimized_base", + ":quantization_util", + ":reference_base", + ":test_util", + "//tensorflow/contrib/lite:string", + "@com_google_googletest//:gtest_main", + ], +) + +cc_test( + name = "logsoftmax_quantized_test", + timeout = "long", + srcs = [ + "logsoftmax_quantized_test.cc", + ], + tags = [ + "no_oss", + "tflite_not_portable", + ], + deps = [ + ":optimized_base", + ":quantization_util", + ":reference_base", + ":test_util", + "//tensorflow/contrib/lite:string", + 
"@com_google_googletest//:gtest_main", + ], +) + +cc_test( + name = "log_quantized_test", + srcs = ["log_quantized_test.cc"], + tags = ["no_oss"], + deps = [ + ":optimized_base", + ":reference_base", + "//tensorflow/contrib/lite:string", + "@com_google_googletest//:gtest_main", + ], +) + cc_library( name = "cpu_check", hdrs = [ @@ -459,6 +666,7 @@ cc_library( cc_test( name = "batch_to_space_nd_test", srcs = ["batch_to_space_nd_test.cc"], + tags = ["no_oss"], deps = [ ":optimized_base", "@com_google_googletest//:gtest_main", diff --git a/tensorflow/contrib/lite/kernels/internal/common.h b/tensorflow/contrib/lite/kernels/internal/common.h index ede95dfee0..e67fee11b8 100644 --- a/tensorflow/contrib/lite/kernels/internal/common.h +++ b/tensorflow/contrib/lite/kernels/internal/common.h @@ -45,7 +45,7 @@ limitations under the License. #endif #endif -#include "public/gemmlowp.h" +#include "fixedpoint/fixedpoint.h" #include "tensorflow/contrib/lite/kernels/internal/types.h" namespace tflite { @@ -87,12 +87,12 @@ float ActivationFunction(float x) { output_activation_max); } -inline int32 MultiplyByQuantizedMultiplierSmallerThanOne( - int32 x, int32 quantized_multiplier, int right_shift) { +inline int32 MultiplyByQuantizedMultiplierSmallerThanOneExp( + int32 x, int32 quantized_multiplier, int left_shift) { using gemmlowp::RoundingDivideByPOT; using gemmlowp::SaturatingRoundingDoublingHighMul; return RoundingDivideByPOT( - SaturatingRoundingDoublingHighMul(x, quantized_multiplier), right_shift); + SaturatingRoundingDoublingHighMul(x, quantized_multiplier), -left_shift); } inline int32 MultiplyByQuantizedMultiplierGreaterThanOne( @@ -117,6 +117,9 @@ template <typename T> int CountLeadingZeros(T integer_input) { static_assert(std::is_unsigned<T>::value, "Only unsigned integer types handled."); +#if defined(__GNUC__) + return integer_input ? __builtin_clz(integer_input) : 0; +#else const T one_in_leading_positive = static_cast<T>(1) << (std::numeric_limits<T>::digits - 1); int leading_zeros = 0; @@ -125,6 +128,140 @@ int CountLeadingZeros(T integer_input) { ++leading_zeros; } return leading_zeros; +#endif +} + +// DO NOT USE THIS STRUCT FOR NEW FUNCTIONALITY BEYOND IMPLEMENTING +// BROADCASTING. +// +// NdArrayDesc<N> describes the shape and memory layout of an N-dimensional +// rectangular array of numbers. +// +// NdArrayDesc<N> is basically identical to Dims<N> defined in types.h. +// However, as Dims<N> is to be deprecated, this class exists as an adaptor +// to enable simple unoptimized implementations of element-wise broadcasting +// operations. +template <int N> +struct NdArrayDesc { + // The "extent" of each dimension. Indices along dimension d must be in the + // half-open interval [0, extents[d]). + int extents[N]; + + // The number of *elements* (not bytes) between consecutive indices of each + // dimension. + int strides[N]; +}; + +// DO NOT USE THIS FUNCTION FOR NEW FUNCTIONALITY BEYOND IMPLEMENTING +// BROADCASTING. +// +// Same as Offset(), except takes as NdArrayDesc<N> instead of Dims<N>. 
+inline int SubscriptToIndex(const NdArrayDesc<4>& desc, int i0, int i1, int i2, + int i3) { + TFLITE_DCHECK(i0 >= 0 && i0 < desc.extents[0]); + TFLITE_DCHECK(i1 >= 0 && i1 < desc.extents[1]); + TFLITE_DCHECK(i2 >= 0 && i2 < desc.extents[2]); + TFLITE_DCHECK(i3 >= 0 && i3 < desc.extents[3]); + return i0 * desc.strides[0] + i1 * desc.strides[1] + i2 * desc.strides[2] + + i3 * desc.strides[3]; +} + +// Given the dimensions of the operands for an element-wise binary broadcast, +// adjusts them so that they can be directly iterated over with simple loops. +// Returns the adjusted dims as instances of NdArrayDesc in 'desc0_out' and +// 'desc1_out'. 'desc0_out' and 'desc1_out' cannot be nullptr. +// +// This function assumes that the two input shapes are compatible up to +// broadcasting and the shorter one has already been prepended with 1s to be the +// same length. E.g., if shape0 is (1, 16, 16, 64) and shape1 is (1, 64), +// shape1 must already have been prepended to be (1, 1, 1, 64). Recall that +// Dims<N> refer to shapes in reverse order. In this case, input0_dims will be +// (64, 16, 16, 1) and input1_dims will be (64, 1, 1, 1). +// +// When two shapes are compatible up to broadcasting, for each dimension d, +// the input extents are either equal, or one of them is 1. +// +// This function performs the following for each dimension d: +// - If the extents are equal, then do nothing since the loop that walks over +// both of the input arrays is correct. +// - Otherwise, one (and only one) of the extents must be 1. Say extent0 is 1 +// and extent1 is e1. Then set extent0 to e1 and stride0 *to 0*. This allows +// array0 to be referenced *at any index* in dimension d and still access the +// same slice. +template <int N> +inline void NdArrayDescsForElementwiseBroadcast(const Dims<N>& input0_dims, + const Dims<N>& input1_dims, + NdArrayDesc<N>* desc0_out, + NdArrayDesc<N>* desc1_out) { + TFLITE_DCHECK(desc0_out != nullptr); + TFLITE_DCHECK(desc1_out != nullptr); + + // Copy dims to desc. + for (int i = 0; i < N; ++i) { + desc0_out->extents[i] = input0_dims.sizes[i]; + desc0_out->strides[i] = input0_dims.strides[i]; + desc1_out->extents[i] = input1_dims.sizes[i]; + desc1_out->strides[i] = input1_dims.strides[i]; + } + + // Walk over each dimension. If the extents are equal do nothing. + // Otherwise, set the desc with extent 1 to have extent equal to the other and + // stride 0. + for (int i = 0; i < N; ++i) { + const int extent0 = ArraySize(input0_dims, i); + const int extent1 = ArraySize(input1_dims, i); + if (extent0 != extent1) { + if (extent0 == 1) { + desc0_out->strides[i] = 0; + desc0_out->extents[i] = extent1; + } else { + TFLITE_DCHECK_EQ(extent1, 1); + desc1_out->strides[i] = 0; + desc1_out->extents[i] = extent0; + } + } + } +} + +template <int N> +inline void NdArrayDescsForElementwiseBroadcast( + const RuntimeShape& input0_shape, const RuntimeShape& input1_shape, + NdArrayDesc<N>* desc0_out, NdArrayDesc<N>* desc1_out) { + TFLITE_DCHECK(desc0_out != nullptr); + TFLITE_DCHECK(desc1_out != nullptr); + + auto extended_input0_shape = RuntimeShape::ExtendedShape(N, input0_shape); + auto extended_input1_shape = RuntimeShape::ExtendedShape(N, input1_shape); + + // Copy dims to desc, calculating strides. 
+ int desc0_stride = 1; + int desc1_stride = 1; + for (int i = N - 1; i >= 0; --i) { + desc0_out->extents[i] = extended_input0_shape.Dims(i); + desc0_out->strides[i] = desc0_stride; + desc0_stride *= extended_input0_shape.Dims(i); + desc1_out->extents[i] = extended_input1_shape.Dims(i); + desc1_out->strides[i] = desc1_stride; + desc1_stride *= extended_input1_shape.Dims(i); + } + + // Walk over each dimension. If the extents are equal do nothing. + // Otherwise, set the desc with extent 1 to have extent equal to the other and + // stride 0. + for (int i = 0; i < N; ++i) { + const int extent0 = extended_input0_shape.Dims(i); + const int extent1 = extended_input1_shape.Dims(i); + if (extent0 != extent1) { + if (extent0 == 1) { + desc0_out->strides[i] = 0; + desc0_out->extents[i] = extent1; + } else { + TFLITE_DCHECK_EQ(extent1, 1); + desc1_out->strides[i] = 0; + desc1_out->extents[i] = extent0; + } + } + } } } // namespace tflite diff --git a/tensorflow/contrib/lite/kernels/internal/compatibility.h b/tensorflow/contrib/lite/kernels/internal/compatibility.h index 93fc6b6a76..7c176e0fa1 100644 --- a/tensorflow/contrib/lite/kernels/internal/compatibility.h +++ b/tensorflow/contrib/lite/kernels/internal/compatibility.h @@ -15,65 +15,65 @@ limitations under the License. #ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_COMPATIBILITY_H_ #define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_COMPATIBILITY_H_ -#include <cassert> #include <cstdint> -#include <cstdlib> + +#include "tensorflow/contrib/lite/kernels/op_macros.h" #ifndef TFLITE_DCHECK -#define TFLITE_DCHECK(condition) (condition) ? (void)0 : assert(false) +#define TFLITE_DCHECK(condition) (condition) ? (void)0 : TFLITE_ASSERT_FALSE #endif #ifndef TFLITE_DCHECK_EQ -#define TFLITE_DCHECK_EQ(x, y) ((x) == (y)) ? (void)0 : assert(false) +#define TFLITE_DCHECK_EQ(x, y) ((x) == (y)) ? (void)0 : TFLITE_ASSERT_FALSE #endif #ifndef TFLITE_DCHECK_NE -#define TFLITE_DCHECK_NE(x, y) ((x) != (y)) ? (void)0 : assert(false) +#define TFLITE_DCHECK_NE(x, y) ((x) != (y)) ? (void)0 : TFLITE_ASSERT_FALSE #endif #ifndef TFLITE_DCHECK_GE -#define TFLITE_DCHECK_GE(x, y) ((x) >= (y)) ? (void)0 : assert(false) +#define TFLITE_DCHECK_GE(x, y) ((x) >= (y)) ? (void)0 : TFLITE_ASSERT_FALSE #endif #ifndef TFLITE_DCHECK_GT -#define TFLITE_DCHECK_GT(x, y) ((x) > (y)) ? (void)0 : assert(false) +#define TFLITE_DCHECK_GT(x, y) ((x) > (y)) ? (void)0 : TFLITE_ASSERT_FALSE #endif #ifndef TFLITE_DCHECK_LE -#define TFLITE_DCHECK_LE(x, y) ((x) <= (y)) ? (void)0 : assert(false) +#define TFLITE_DCHECK_LE(x, y) ((x) <= (y)) ? (void)0 : TFLITE_ASSERT_FALSE #endif #ifndef TFLITE_DCHECK_LT -#define TFLITE_DCHECK_LT(x, y) ((x) < (y)) ? (void)0 : assert(false) +#define TFLITE_DCHECK_LT(x, y) ((x) < (y)) ? (void)0 : TFLITE_ASSERT_FALSE #endif // TODO(ahentz): Clean up: We should stick to the DCHECK versions. #ifndef TFLITE_CHECK -#define TFLITE_CHECK(condition) (condition) ? (void)0 : abort() +#define TFLITE_CHECK(condition) (condition) ? (void)0 : TFLITE_ABORT #endif #ifndef TFLITE_CHECK_EQ -#define TFLITE_CHECK_EQ(x, y) ((x) == (y)) ? (void)0 : abort() +#define TFLITE_CHECK_EQ(x, y) ((x) == (y)) ? (void)0 : TFLITE_ABORT #endif #ifndef TFLITE_CHECK_NE -#define TFLITE_CHECK_NE(x, y) ((x) != (y)) ? (void)0 : abort() +#define TFLITE_CHECK_NE(x, y) ((x) != (y)) ? (void)0 : TFLITE_ABORT #endif #ifndef TFLITE_CHECK_GE -#define TFLITE_CHECK_GE(x, y) ((x) >= (y)) ? (void)0 : abort() +#define TFLITE_CHECK_GE(x, y) ((x) >= (y)) ? 
(void)0 : TFLITE_ABORT #endif #ifndef TFLITE_CHECK_GT -#define TFLITE_CHECK_GT(x, y) ((x) > (y)) ? (void)0 : abort() +#define TFLITE_CHECK_GT(x, y) ((x) > (y)) ? (void)0 : TFLITE_ABORT #endif #ifndef TFLITE_CHECK_LE -#define TFLITE_CHECK_LE(x, y) ((x) <= (y)) ? (void)0 : abort() +#define TFLITE_CHECK_LE(x, y) ((x) <= (y)) ? (void)0 : TFLITE_ABORT #endif #ifndef TFLITE_CHECK_LT -#define TFLITE_CHECK_LT(x, y) ((x) < (y)) ? (void)0 : abort() +#define TFLITE_CHECK_LT(x, y) ((x) < (y)) ? (void)0 : TFLITE_ABORT #endif // TODO(ahentz): Clean up. @@ -84,4 +84,27 @@ using uint16 = std::uint16_t; using int32 = std::int32_t; using uint32 = std::uint32_t; +// TFLITE_DEPRECATED() +// +// Duplicated from absl/base/macros.h to avoid pulling in that library. +// Marks a deprecated class, struct, enum, function, method and variable +// declarations. The macro argument is used as a custom diagnostic message (e.g. +// suggestion of a better alternative). +// +// Example: +// +// class TFLITE_DEPRECATED("Use Bar instead") Foo {...}; +// TFLITE_DEPRECATED("Use Baz instead") void Bar() {...} +// +// Every usage of a deprecated entity will trigger a warning when compiled with +// clang's `-Wdeprecated-declarations` option. This option is turned off by +// default, but the warnings will be reported by clang-tidy. +#if defined(__clang__) && __cplusplus >= 201103L +#define TFLITE_DEPRECATED(message) __attribute__((deprecated(message))) +#endif + +#ifndef TFLITE_DEPRECATED +#define TFLITE_DEPRECATED(message) +#endif + #endif // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_COMPATIBILITY_H_ diff --git a/tensorflow/contrib/lite/kernels/internal/depthwiseconv_float_test.cc b/tensorflow/contrib/lite/kernels/internal/depthwiseconv_float_test.cc new file mode 100644 index 0000000000..41862a21a6 --- /dev/null +++ b/tensorflow/contrib/lite/kernels/internal/depthwiseconv_float_test.cc @@ -0,0 +1,157 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include <algorithm> +#include <cmath> +#include <vector> + +#include <gtest/gtest.h> +#include "tensorflow/contrib/lite/kernels/internal/common.h" +#include "tensorflow/contrib/lite/kernels/internal/test_util.h" +#include "tensorflow/contrib/lite/kernels/internal/types.h" + +#define ALLOW_SLOW_GENERIC_DEPTHWISECONV_FALLBACK +#include "tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_float.h" +#include "tensorflow/contrib/lite/kernels/internal/reference/depthwiseconv_float.h" + +namespace tflite { +namespace { + +// Runs the DepthwiseConv and compares against the reference implementation. 
+void TestOneDepthwiseConv( + const DepthwiseParams& params, const RuntimeShape& input_shape, + const float* input_data, const RuntimeShape& filter_shape, + const float* filter_data, const RuntimeShape& bias_shape, + const float* bias_data, const RuntimeShape& output_shape) { + const int output_buffer_size = output_shape.FlatSize(); + std::vector<float> output_data(output_buffer_size); + std::vector<float> reference_output_data(output_buffer_size); + reference_ops::DepthwiseConv(params, input_shape, input_data, filter_shape, + filter_data, bias_shape, bias_data, output_shape, + reference_output_data.data()); + optimized_ops::DepthwiseConv(params, input_shape, input_data, filter_shape, + filter_data, bias_shape, bias_data, output_shape, + output_data.data()); + + double sum_abs_diff = 0; + float max_abs_val = 0; + for (int i = 0; i < output_buffer_size; i++) { + sum_abs_diff += std::abs(output_data[i] - reference_output_data[i]); + max_abs_val = std::max(max_abs_val, std::abs(reference_output_data[i])); + } + if (sum_abs_diff != 0.f) { + const float mean_diff = + static_cast<float>(sum_abs_diff / output_buffer_size); + const float relative_error = std::abs(mean_diff) / max_abs_val; + ASSERT_LT(relative_error, 1e-5f); + } +} + +// This function picks some random DepthwiseConv params, which may or may not +// be legal. If they're not legal, it returns false. If they're legal, +// it runs the DepthwiseConv test and returns true. This allows the caller +// to loop until a test has been run. +bool TryTestOneDepthwiseConv() { + // We have to pick a lot of positive values, where we are particularly + // interested in small values because they are most likely to be special + // cases in optimized implementations, and secondarily because they allow + // tests to run fast, which means we can run more tests and get more + // coverage. + const int batch = ExponentialRandomPositiveInt(0.9f, 3, 20); + const int input_depth = ExponentialRandomPositiveInt(0.9f, 6, 50); + const int input_width = ExponentialRandomPositiveInt(0.9f, 20, 200); + const int input_height = ExponentialRandomPositiveInt(0.9f, 20, 200); + const int filter_width = ExponentialRandomPositiveInt(0.9f, 4, 10); + const int filter_height = ExponentialRandomPositiveInt(0.9f, 4, 10); + const int depth_multiplier = ExponentialRandomPositiveInt(0.8f, 6, 50); + const int stride = ExponentialRandomPositiveInt(0.9f, 3, 8); + const int output_depth = input_depth * depth_multiplier; + const int dilation_width_factor = RandomElement(std::vector<int>({1, 2, 4})); + const int dilation_height_factor = RandomElement(std::vector<int>({1, 2, 4})); + float output_activation_min, output_activation_max; + FusedActivationFunctionType ac = + RandomElement(std::vector<FusedActivationFunctionType>( + {FusedActivationFunctionType::kNone, + FusedActivationFunctionType::kRelu, + FusedActivationFunctionType::kRelu1, + FusedActivationFunctionType::kRelu6})); + GetActivationMinMax(ac, &output_activation_min, &output_activation_max); + // The optimized DepthwiseConv implementation currently uses a fixed-size + // accumulator buffer on the stack, with that size. This currently means + // that it does not support larger output depths. It CHECK's for it, + // so it's safe in the sense that if a larger output depth was encountered, + // it would explicitly fail. We just need to adjust our testing to that + // constraint. 
+ const int kMaxSupportedOutputDepth = 1024; + if (output_depth > kMaxSupportedOutputDepth) { + return false; + } + RuntimeShape input_shape_inference( + {batch, input_height, input_width, input_depth}); + RuntimeShape output_shape_inference; + int pad_width, pad_height; + const auto padding_type = + UniformRandomInt(0, 1) ? PaddingType::kSame : PaddingType::kValid; + if (!ComputeConvSizes(input_shape_inference, output_depth, filter_width, + filter_height, stride, dilation_width_factor, + dilation_height_factor, padding_type, + &output_shape_inference, &pad_width, &pad_height)) { + return false; + } + RuntimeShape filter_shape_inference( + {1, filter_height, filter_width, output_depth}); + RuntimeShape bias_shape_inference({1, 1, 1, output_depth}); + const int input_buffer_size = input_shape_inference.FlatSize(); + const int filter_buffer_size = filter_shape_inference.FlatSize(); + std::vector<float> input_data(input_buffer_size); + std::vector<float> filter_data(filter_buffer_size); + std::vector<float> bias_data(output_depth); + const float input_amplitude = 1.f; + const float filter_amplitude = 1.f; + const float bias_amplitude = + filter_width * filter_height * input_amplitude * filter_amplitude; + FillRandom(&input_data, -input_amplitude, input_amplitude); + FillRandom(&filter_data, -filter_amplitude, filter_amplitude); + FillRandom(&bias_data, -bias_amplitude, bias_amplitude); + DepthwiseParams op_params; + op_params.padding_type = PaddingType::kSame; + op_params.padding_values.width = pad_width; + op_params.padding_values.height = pad_height; + op_params.stride_width = stride; + op_params.stride_height = stride; + op_params.dilation_width_factor = dilation_width_factor; + op_params.dilation_height_factor = dilation_height_factor; + op_params.depth_multiplier = depth_multiplier; + op_params.float_activation_min = output_activation_min; + op_params.float_activation_max = output_activation_max; + TestOneDepthwiseConv(op_params, input_shape_inference, input_data.data(), + filter_shape_inference, filter_data.data(), + bias_shape_inference, bias_data.data(), + output_shape_inference); + return true; +} + +void TestOneDepthwiseConv() { + while (!TryTestOneDepthwiseConv()) { + } +} + +TEST(TestDepthwiseConv, TestDepthwiseConv) { + const int kTestsToRun = 100 * 1000; + for (int i = 0; i < kTestsToRun; i++) { + TestOneDepthwiseConv(); + } +} +} // namespace +} // namespace tflite diff --git a/tensorflow/contrib/lite/kernels/internal/depthwiseconv_quantized_test.cc b/tensorflow/contrib/lite/kernels/internal/depthwiseconv_quantized_test.cc new file mode 100644 index 0000000000..9414e109c3 --- /dev/null +++ b/tensorflow/contrib/lite/kernels/internal/depthwiseconv_quantized_test.cc @@ -0,0 +1,349 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include <sys/types.h> +#include <algorithm> +#include <cmath> +#include <cstdint> +#include <cstdlib> +#include <iterator> +#include <limits> +#include <vector> + +#include <gtest/gtest.h> +#include "tensorflow/contrib/lite/kernels/internal/test_util.h" +#include "tensorflow/contrib/lite/kernels/internal/types.h" + +#define ALLOW_SLOW_GENERIC_DEPTHWISECONV_FALLBACK +#include "tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8.h" +#include "tensorflow/contrib/lite/kernels/internal/reference/depthwiseconv_uint8.h" + +namespace tflite { +namespace { + +// Runs the DepthwiseConv and compares against the reference implementation. +template <FusedActivationFunctionType Ac> +int TestOneDepthwiseConvWithGivenOutputShift( + const std::uint8_t* input_data, const RuntimeShape& input_shape, + std::int32_t input_offset, const std::uint8_t* filter_data, + const RuntimeShape& filter_shape, std::int32_t filter_offset, + const std::int32_t* bias_data, const RuntimeShape& bias_shape, int stride, + int pad_width, int pad_height, int depth_multiplier, + std::int32_t output_offset, std::int32_t output_multiplier, + int output_shift, std::int32_t output_activation_min, + std::int32_t output_activation_max, const RuntimeShape& output_shape) { + const int output_buffer_size = output_shape.FlatSize(); + std::vector<std::uint8_t> output_data(output_buffer_size); + std::vector<std::uint8_t> reference_output_data(output_buffer_size); + + tflite::DepthwiseParams op_params; + op_params.padding_type = PaddingType::kSame; + op_params.padding_values.width = pad_width; + op_params.padding_values.height = pad_height; + op_params.stride_width = stride; + op_params.stride_height = stride; + op_params.dilation_width_factor = 1; + op_params.dilation_height_factor = 1; + op_params.depth_multiplier = depth_multiplier; + op_params.quantized_activation_min = output_activation_min; + op_params.quantized_activation_max = output_activation_max; + op_params.input_offset = input_offset; + op_params.weights_offset = filter_offset; + op_params.output_offset = output_offset; + op_params.output_multiplier = output_multiplier; + op_params.output_shift = -output_shift; + reference_ops::DepthwiseConv(op_params, input_shape, input_data, filter_shape, + filter_data, bias_shape, bias_data, output_shape, + reference_output_data.data()); + optimized_ops::DepthwiseConv(op_params, input_shape, input_data, filter_shape, + filter_data, bias_shape, bias_data, output_shape, + output_data.data()); + int saturated_min = 0; + int saturated_max = 0; + std::vector<int> diff(output_buffer_size); + std::int64_t sum_diff = 0; + std::int64_t sum_abs_diff = 0; + for (int i = 0; i < output_buffer_size; i++) { + diff[i] = static_cast<int>(output_data[i]) - + static_cast<int>(reference_output_data[i]); + sum_diff += diff[i]; + sum_abs_diff += std::abs(diff[i]); + saturated_min += output_data[i] == output_activation_min; + saturated_max += output_data[i] == output_activation_max; + } + // These stats help understand test failures. + std::sort(std::begin(diff), std::end(diff)); + const int min_diff = diff.front(); + const int max_diff = diff.back(); + const int median_diff = diff[diff.size() / 2]; + const float mean_diff = static_cast<float>(sum_diff) / output_buffer_size; + const float mean_abs_diff = + static_cast<float>(sum_abs_diff) / output_buffer_size; + // Normally we should require bit-for-bit exact results. 
Unfortunately a bug + // in the Intel arm_neon_sse.h translation header that we use for x86 tests + // causes 1-bit inaccuracy in + // the vqrdmulh_n_s32 intrinsic, which causes off-by-1 errors in quantized + // DepthwiseConv ops. So we have to live with a few off-by-one errors for now, + // yet still ensure that no more than a small minority of values are wrong. + EXPECT_TRUE(std::abs(mean_diff) < 1e-5f && mean_abs_diff < 1e-5f && + std::abs(median_diff) == 0 && std::abs(min_diff) <= 1 && + std::abs(max_diff) <= 1); + if (saturated_min > 2 * saturated_max) { + return -1; + } + if (saturated_max > 2 * saturated_min) { + return 1; + } + return 0; +} + +// The point of this function is that we can't practically know which +// output_shift value to pass to test DepthwiseConv. It's not easy to guess (we +// could do some +// statistics for large size, but they would be fragile at smaller sizes), and +// guessing wrong would mean that all the values get saturated so the test +// becomes +// vacuous. So we just bisect our way to reasonable output_shift values. +template <FusedActivationFunctionType Ac> +void TestOneDepthwiseConvBisectOutputShift( + const std::uint8_t* input_data, const RuntimeShape& input_shape, + std::int32_t input_offset, const std::uint8_t* filter_data, + const RuntimeShape& filter_shape, std::int32_t filter_offset, + const std::int32_t* bias_data, const RuntimeShape& bias_shape, int stride, + int pad_width, int pad_height, int depth_multiplier, + std::int32_t output_offset, std::int32_t output_multiplier, + int output_activation_bisect_start, int output_activation_bisect_end, + std::int32_t output_activation_min, std::int32_t output_activation_max, + const RuntimeShape& output_shape) { + ASSERT_LT(output_activation_bisect_start, output_activation_bisect_end) + << "Bisection failed ?!?!"; + int output_shift_bisect_midpoint = + (output_activation_bisect_start + output_activation_bisect_end) / 2; + int bisect_result = TestOneDepthwiseConvWithGivenOutputShift<Ac>( + input_data, input_shape, input_offset, filter_data, filter_shape, + filter_offset, bias_data, bias_shape, stride, pad_width, pad_height, + depth_multiplier, output_offset, output_multiplier, + output_shift_bisect_midpoint, output_activation_min, + output_activation_max, output_shape); + // At this point we know that the test succeeded (otherwise it would have + // aborted). + if (bisect_result == 0) { + // The result isn't particularly saturated on one or the other side. + // All good, we're done. + return; + } + if (output_activation_bisect_start == output_activation_bisect_end - 1) { + // There is still some saturation on one side, but the bisection is + // finished anyways. We're done; nothing more we can do about it. This + // happens + // in particular when using an activation with a narrow range. + return; + } + // Continue the bisection based on the present result. + int new_output_activation_bisect_start = bisect_result == 1 + ? output_shift_bisect_midpoint + : output_activation_bisect_start; + int new_output_activation_bisect_end = bisect_result == 1 + ? 
output_activation_bisect_end + : output_shift_bisect_midpoint; + TestOneDepthwiseConvBisectOutputShift<Ac>( + input_data, input_shape, input_offset, filter_data, filter_shape, + filter_offset, bias_data, bias_shape, stride, pad_width, pad_height, + depth_multiplier, output_offset, output_multiplier, + new_output_activation_bisect_start, new_output_activation_bisect_end, + output_activation_min, output_activation_max, output_shape); +} + +template <FusedActivationFunctionType Ac> +void TestOneDepthwiseConv( + const std::uint8_t* input_data, const RuntimeShape& input_shape, + std::int32_t input_offset, const std::uint8_t* filter_data, + const RuntimeShape& filter_shape, std::int32_t filter_offset, + const std::int32_t* bias_data, const RuntimeShape& bias_shape, int stride, + int pad_width, int pad_height, int depth_multiplier, + std::int32_t output_offset, std::int32_t output_multiplier, + std::int32_t output_activation_min, std::int32_t output_activation_max, + const RuntimeShape& output_shape) { + TestOneDepthwiseConvBisectOutputShift<Ac>( + input_data, input_shape, input_offset, filter_data, filter_shape, + filter_offset, bias_data, bias_shape, stride, pad_width, pad_height, + depth_multiplier, output_offset, output_multiplier, 0, 32, + output_activation_min, output_activation_max, output_shape); +} + +void TestOneDepthwiseConv( + FusedActivationFunctionType Ac, const std::uint8_t* input_data, + const RuntimeShape& input_shape, std::int32_t input_offset, + const std::uint8_t* filter_data, const RuntimeShape& filter_shape, + std::int32_t filter_offset, const std::int32_t* bias_data, + const RuntimeShape& bias_shape, int stride, int pad_width, int pad_height, + int depth_multiplier, std::int32_t output_offset, + std::int32_t output_multiplier, std::int32_t output_activation_min, + std::int32_t output_activation_max, const RuntimeShape& output_shape) { +#define TOCO_HANDLE_CASE(AC_TYPE) \ + if (AC_TYPE == Ac) { \ + TestOneDepthwiseConv<AC_TYPE>( \ + input_data, input_shape, input_offset, filter_data, filter_shape, \ + filter_offset, bias_data, bias_shape, stride, pad_width, pad_height, \ + depth_multiplier, output_offset, output_multiplier, \ + output_activation_min, output_activation_max, output_shape); \ + return; \ + } + TOCO_HANDLE_CASE(FusedActivationFunctionType::kNone) + TOCO_HANDLE_CASE(FusedActivationFunctionType::kRelu) + TOCO_HANDLE_CASE(FusedActivationFunctionType::kRelu1) + TOCO_HANDLE_CASE(FusedActivationFunctionType::kRelu6) +#undef TOCO_HANDLE_CASE +} + +bool TryTestDepthwiseConv(int batch, int input_depth, int input_width, + int input_height, int filter_width, int filter_height, + int depth_multiplier, int stride, + int dilation_width_factor, int dilation_height_factor, + PaddingType padding_type) { + const int output_depth = input_depth * depth_multiplier; + // The optimized DepthwiseConv implementation currently uses a fixed-size + // accumulator buffer on the stack, with that size. This currently means + // that it does not support larger output depths. It CHECK's for it, + // so it's safe in the sense that if a larger output depth was encountered, + // it would explicitly fail. We just need to adjust our testing to that + // constraint. 
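To see why the bisection above is needed, it helps to look at the requantization step that turns an int32 accumulator into a uint8 output. The sketch below is illustrative only (the helper name is invented and the real kernel's rounding/nudging details are omitted): with output_shift chosen far too large, every accumulator collapses toward output_offset and the outputs pile up on one activation bound, which is exactly the saturation the bisection detects and corrects.

    #include <algorithm>
    #include <cstdint>

    // Hypothetical helper, not the TFLite kernel: scale an int32 accumulator by a
    // Q0.31 multiplier, apply a power-of-two right shift (arithmetic shift
    // assumed), add the output offset, and clamp to the fused activation range.
    std::uint8_t RequantizeSketch(std::int32_t acc, std::int32_t output_multiplier,
                                  int output_shift, std::int32_t output_offset,
                                  std::int32_t activation_min,
                                  std::int32_t activation_max) {
      std::int64_t scaled =
          (static_cast<std::int64_t>(acc) * output_multiplier) >> 31;
      scaled >>= output_shift;  // Larger shifts squash all values toward zero.
      scaled += output_offset;
      scaled = std::max<std::int64_t>(
          activation_min, std::min<std::int64_t>(activation_max, scaled));
      return static_cast<std::uint8_t>(scaled);
    }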
+ const int kMaxSupportedOutputDepth = 1024; + if (output_depth > kMaxSupportedOutputDepth) { + return false; + } + const auto ac = RandomElement(std::vector<FusedActivationFunctionType>( + {FusedActivationFunctionType::kNone, FusedActivationFunctionType::kRelu, + FusedActivationFunctionType::kRelu6, + FusedActivationFunctionType::kRelu1})); + int output_activation_min = 0; + int output_activation_max = 255; + if (ac != FusedActivationFunctionType::kNone && UniformRandomInt(0, 1)) { + output_activation_min = UniformRandomInt(0, 50); + output_activation_max = UniformRandomInt(200, 255); + } + const std::int32_t output_multiplier = + UniformRandomInt(1 << 29, std::numeric_limits<std::int32_t>::max()); + const std::int32_t input_offset = UniformRandomInt(-256, 0); + const std::int32_t filter_offset = UniformRandomInt(-256, 0); + const std::int32_t output_offset = UniformRandomInt(-256, 0); + RuntimeShape input_shape_inference( + {batch, input_height, input_width, input_depth}); + RuntimeShape output_shape_inference; + int pad_width, pad_height; + if (!ComputeConvSizes(input_shape_inference, output_depth, filter_width, + filter_height, stride, dilation_width_factor, + dilation_height_factor, padding_type, + &output_shape_inference, &pad_width, &pad_height)) { + return false; + } + RuntimeShape filter_shape_inference( + {1, filter_height, filter_width, output_depth}); + RuntimeShape bias_shape_inference({1, 1, 1, output_depth}); + const int input_buffer_size = input_shape_inference.FlatSize(); + const int filter_buffer_size = filter_shape_inference.FlatSize(); + std::vector<std::uint8_t> input_data(input_buffer_size); + std::vector<std::uint8_t> filter_data(filter_buffer_size); + std::vector<std::int32_t> bias_data(output_depth); + FillRandom(&input_data); + FillRandom(&filter_data); + FillRandom(&bias_data, -10000, 10000); + TestOneDepthwiseConv(ac, input_data.data(), input_shape_inference, + input_offset, filter_data.data(), filter_shape_inference, + filter_offset, bias_data.data(), bias_shape_inference, + stride, pad_width, pad_height, depth_multiplier, + output_offset, output_multiplier, output_activation_min, + output_activation_max, output_shape_inference); + return true; +} + +// This function picks some random DepthwiseConv params, which may or may not +// be legal. If they're not legal, it returns false. If they're legal, +// it runs the DepthwiseConv test and returns true. This allows the caller +// to loop until a test has been run. +bool TryTestOneDepthwiseConv() { + // We have to pick a lot of positive values, where we are particularly + // interested in small values because they are most likely to be special + // cases in optimized implementations, and secondarily because they allow + // tests to run fast, which means we can run more tests and get more + // coverage. 
+ const int batch = ExponentialRandomPositiveInt(0.9f, 3, 20); + const int input_depth = ExponentialRandomPositiveInt(0.9f, 6, 50); + const int input_width = ExponentialRandomPositiveInt(0.9f, 20, 200); + const int input_height = ExponentialRandomPositiveInt(0.9f, 20, 200); + const int filter_width = ExponentialRandomPositiveInt(0.9f, 4, 10); + const int filter_height = ExponentialRandomPositiveInt(0.9f, 4, 10); + const int depth_multiplier = ExponentialRandomPositiveInt(0.8f, 6, 50); + const int stride = ExponentialRandomPositiveInt(0.9f, 3, 8); + const int dilation_width_factor = RandomElement(std::vector<int>({1, 2, 4})); + const int dilation_height_factor = RandomElement(std::vector<int>({1, 2, 4})); + const auto padding_type = + UniformRandomInt(0, 1) ? PaddingType::kSame : PaddingType::kValid; + + return TryTestDepthwiseConv(batch, input_depth, input_width, input_height, + filter_width, filter_height, depth_multiplier, + stride, dilation_width_factor, + dilation_height_factor, padding_type); +} + +// Tests parameters for the 3x3 filter kernel. +bool TryTestOneDepthwiseConv3x3Filter() { + const int batch = ExponentialRandomPositiveInt(0.9f, 3, 20); + const int input_depth = 8 * ExponentialRandomPositiveInt(0.9f, 10, 50); + const int input_width = ExponentialRandomPositiveInt(0.9f, 20, 200); + const int input_height = ExponentialRandomPositiveInt(0.9f, 20, 200); + const int filter_width = 3; + const int filter_height = 3; + const int depth_multiplier = 1; + const int stride = UniformRandomInt(1, 2); + // We don't support dilations in the 3x3 filter. + const int dilation_width_factor = 1; + const int dilation_height_factor = 1; + // Although the kernel supports only kValid padding, we test that kSame + // is using the correct code path. + const auto padding_type = + UniformRandomInt(0, 1) ? PaddingType::kSame : PaddingType::kValid; + + return TryTestDepthwiseConv(batch, input_depth, input_width, input_height, + filter_width, filter_height, depth_multiplier, + stride, dilation_width_factor, + dilation_height_factor, padding_type); +} + +void TestOneDepthwiseConv() { + while (!TryTestOneDepthwiseConv()) { + } +} + +void TestOneDepthwiseConv3x3Filter() { + while (!TryTestOneDepthwiseConv3x3Filter()) { + } +} + +TEST(TestDepthwiseConv, TestDepthwiseConv) { + const int kTestsToRun = 10 * 1000; + for (int i = 0; i < kTestsToRun; i++) { + TestOneDepthwiseConv(); + } +} + +TEST(TestDepthwiseConv3x3Filter, TestDepthwiseConv) { + const int kTestsToRun = 3 * 1000; + for (int i = 0; i < kTestsToRun; i++) { + TestOneDepthwiseConv3x3Filter(); + } +} + +} // namespace +} // namespace tflite diff --git a/tensorflow/contrib/lite/kernels/internal/kernel_utils.cc b/tensorflow/contrib/lite/kernels/internal/kernel_utils.cc index 5f9cfc450d..083e5839bd 100644 --- a/tensorflow/contrib/lite/kernels/internal/kernel_utils.cc +++ b/tensorflow/contrib/lite/kernels/internal/kernel_utils.cc @@ -14,8 +14,6 @@ limitations under the License. 
==============================================================================*/ #include "tensorflow/contrib/lite/kernels/internal/kernel_utils.h" -#include <algorithm> - #include "tensorflow/contrib/lite/kernels/internal/tensor_utils.h" namespace tflite { @@ -26,6 +24,21 @@ void RnnBatchStep(const float* input_ptr_batch, const float* input_weights_ptr, int input_size, int num_units, int batch_size, TfLiteFusedActivation activation, float* hidden_state_ptr_batch, float* output_ptr_batch) { + RnnBatchStep(input_ptr_batch, input_weights_ptr, + /*aux_input_ptr_batch=*/nullptr, + /*aux_input_weights_ptr=*/nullptr, recurrent_weights_ptr, + bias_ptr, input_size, /*aux_input_size=*/0, num_units, + batch_size, activation, hidden_state_ptr_batch, + output_ptr_batch); +} + +void RnnBatchStep(const float* input_ptr_batch, const float* input_weights_ptr, + const float* aux_input_ptr_batch, + const float* aux_input_weights_ptr, + const float* recurrent_weights_ptr, const float* bias_ptr, + int input_size, int aux_input_size, int num_units, + int batch_size, TfLiteFusedActivation activation, + float* hidden_state_ptr_batch, float* output_ptr_batch) { // Output = bias tensor_utils::VectorBatchVectorAssign(bias_ptr, num_units, batch_size, output_ptr_batch); @@ -33,6 +46,12 @@ void RnnBatchStep(const float* input_ptr_batch, const float* input_weights_ptr, tensor_utils::MatrixBatchVectorMultiplyAccumulate( input_weights_ptr, num_units, input_size, input_ptr_batch, batch_size, output_ptr_batch, /*result_stride=*/1); + // Output += aux_input * aux_input_weights (if they are not empty). + if (aux_input_size > 0) { + tensor_utils::MatrixBatchVectorMultiplyAccumulate( + aux_input_weights_ptr, num_units, aux_input_size, aux_input_ptr_batch, + batch_size, output_ptr_batch, /*result_stride=*/1); + } // Output += recurrent_weights * hidden_state tensor_utils::MatrixBatchVectorMultiplyAccumulate( recurrent_weights_ptr, num_units, num_units, hidden_state_ptr_batch, @@ -52,21 +71,41 @@ void RnnBatchStep(const float* input_ptr_batch, const int8_t* input_weights_ptr, TfLiteFusedActivation activation, int8_t* quantized_input_ptr_batch, int8_t* quantized_hidden_state_ptr_batch, - float* hidden_state_ptr_batch, float* output_ptr_batch) { + float* scaling_factors, float* hidden_state_ptr_batch, + float* output_ptr_batch) { + RnnBatchStep(input_ptr_batch, input_weights_ptr, input_weights_scale, + /*aux_input_ptr_batch=*/nullptr, + /*aux_input_weights_ptr=*/nullptr, + /*aux_input_weights_scale=*/0.0f, recurrent_weights_ptr, + recurrent_weights_scale, bias_ptr, input_size, + /*aux_input_size=*/0, num_units, batch_size, activation, + quantized_input_ptr_batch, + /*aux_quantized_input_ptr_batch=*/nullptr, + quantized_hidden_state_ptr_batch, scaling_factors, + hidden_state_ptr_batch, output_ptr_batch); +} + +void RnnBatchStep( + const float* input_ptr_batch, const int8_t* input_weights_ptr, + float input_weights_scale, const float* aux_input_ptr_batch, + const int8_t* aux_input_weights_ptr, float aux_input_weights_scale, + const int8_t* recurrent_weights_ptr, float recurrent_weights_scale, + const float* bias_ptr, int input_size, int aux_input_size, int num_units, + int batch_size, TfLiteFusedActivation activation, + int8_t* quantized_input_ptr_batch, int8_t* aux_quantized_input_ptr_batch, + int8_t* quantized_hidden_state_ptr_batch, float* scaling_factors, + float* hidden_state_ptr_batch, float* output_ptr_batch) { // Output = bias tensor_utils::VectorBatchVectorAssign(bias_ptr, num_units, batch_size, output_ptr_batch); - // 
TODO(mirkov): change std::minmax_element with a vectorized call. - auto minmax_element = std::minmax_element( - input_ptr_batch, input_ptr_batch + batch_size * input_size); - // Save quantization and matmul computation for all zero input. - if (!(*minmax_element.first == 0.0 && *minmax_element.second == 0.0)) { + if (!tensor_utils::IsZeroVector(input_ptr_batch, batch_size * input_size)) { // Quantize input from float to uint8 + quantization params (scaling // factor). float unused_min, unused_max; - float* scaling_factors = new float[batch_size]; + // TODO(mirkov,raziel): replace this for-loop with a MACRO (or function) + // whichever is faster. for (int b = 0; b < batch_size; ++b) { const int offset = b * input_size; tensor_utils::SymmetricQuantizeFloats( @@ -80,16 +119,33 @@ void RnnBatchStep(const float* input_ptr_batch, const int8_t* input_weights_ptr, tensor_utils::MatrixBatchVectorMultiplyAccumulate( input_weights_ptr, num_units, input_size, quantized_input_ptr_batch, scaling_factors, batch_size, output_ptr_batch, /*result_stride=*/1); - delete[] scaling_factors; } - minmax_element = std::minmax_element( - hidden_state_ptr_batch, hidden_state_ptr_batch + batch_size * num_units); + if (aux_input_ptr_batch && + !tensor_utils::IsZeroVector(aux_input_ptr_batch, + batch_size * aux_input_size)) { + float unused_min, unused_max; + for (int b = 0; b < batch_size; ++b) { + const int offset = b * aux_input_size; + tensor_utils::SymmetricQuantizeFloats( + aux_input_ptr_batch + offset, aux_input_size, + aux_quantized_input_ptr_batch + offset, &unused_min, &unused_max, + &scaling_factors[b]); + scaling_factors[b] *= aux_input_weights_scale; + } + + // Output += aux_input * aux_input_weights + tensor_utils::MatrixBatchVectorMultiplyAccumulate( + aux_input_weights_ptr, num_units, aux_input_size, + aux_quantized_input_ptr_batch, scaling_factors, batch_size, + output_ptr_batch, /*result_stride=*/1); + } + // Save quantization and matmul computation for all zero input. 
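The IsZeroVector fast path and the preallocated scaling_factors used in this hunk follow the usual hybrid-quantization scheme: each float row is symmetrically quantized to int8 with its own scale, that scale is folded together with the weight scale before the int8 matmul, and all-zero rows skip quantization and the matmul entirely. A rough per-row sketch, assuming tensor_utils::SymmetricQuantizeFloats behaves along these lines (the helper below is illustrative, not the real implementation):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // Illustrative per-row symmetric quantization. Returns the row's scale so the
    // caller can multiply it by the weight scale, as the code above does.
    float SymmetricQuantizeRowSketch(const float* values, int size,
                                     std::int8_t* quantized) {
      float max_abs = 0.f;
      for (int i = 0; i < size; ++i) {
        max_abs = std::max(max_abs, std::fabs(values[i]));
      }
      if (max_abs == 0.f) {  // The all-zero case the kernel skips outright.
        std::fill(quantized, quantized + size, 0);
        return 0.f;
      }
      const float scale = max_abs / 127.f;
      for (int i = 0; i < size; ++i) {
        const int q = static_cast<int>(std::round(values[i] / scale));
        quantized[i] = static_cast<std::int8_t>(std::min(127, std::max(-127, q)));
      }
      return scale;
    }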
- if (!(*minmax_element.first == 0.0 && *minmax_element.second == 0.0)) { + if (!tensor_utils::IsZeroVector(hidden_state_ptr_batch, + batch_size * num_units)) { // Quantize hidden_state float unused_min, unused_max; - float* scaling_factors = new float[batch_size]; for (int b = 0; b < batch_size; ++b) { const int offset = b * num_units; tensor_utils::SymmetricQuantizeFloats( @@ -104,7 +160,6 @@ void RnnBatchStep(const float* input_ptr_batch, const int8_t* input_weights_ptr, recurrent_weights_ptr, num_units, num_units, quantized_hidden_state_ptr_batch, scaling_factors, batch_size, output_ptr_batch, /*result_stride=*/1); - delete[] scaling_factors; } // Output = activation(Output) and update hidden_state @@ -114,152 +169,5 @@ void RnnBatchStep(const float* input_ptr_batch, const int8_t* input_weights_ptr, hidden_state_ptr_batch); } -void LstmStep( - const float* input_ptr_batch, const float* input_to_input_weights_ptr, - const float* input_to_forget_weights_ptr, - const float* input_to_cell_weights_ptr, - const float* input_to_output_weights_ptr, - const float* recurrent_to_input_weights_ptr, - const float* recurrent_to_forget_weights_ptr, - const float* recurrent_to_cell_weights_ptr, - const float* recurrent_to_output_weights_ptr, - const float* cell_to_input_weights_ptr, - const float* cell_to_forget_weights_ptr, - const float* cell_to_output_weights_ptr, const float* input_gate_bias_ptr, - const float* forget_gate_bias_ptr, const float* cell_bias_ptr, - const float* output_gate_bias_ptr, const float* projection_weights_ptr, - const float* projection_bias_ptr, const TfLiteLSTMParams* params, - int n_batch, int n_cell, int n_input, int n_output, float* output_state_ptr, - float* cell_state_ptr, float* input_gate_scratch, - float* forget_gate_scratch, float* cell_scratch, float* output_gate_scratch, - float* output_ptr_batch) { - // Since we have already checked that weights are all there or none, we can - // check the existense of only one to the get the condition. - const bool use_cifg = (input_to_input_weights_ptr == nullptr); - const bool use_peephole = (cell_to_output_weights_ptr != nullptr); - // Initialize scratch buffers with bias. - if (!use_cifg) { - tensor_utils::VectorBatchVectorAssign(input_gate_bias_ptr, n_cell, n_batch, - input_gate_scratch); - } - tensor_utils::VectorBatchVectorAssign(forget_gate_bias_ptr, n_cell, n_batch, - forget_gate_scratch); - tensor_utils::VectorBatchVectorAssign(cell_bias_ptr, n_cell, n_batch, - cell_scratch); - tensor_utils::VectorBatchVectorAssign(output_gate_bias_ptr, n_cell, n_batch, - output_gate_scratch); - - // For each batch and cell: compute input_weight * input. - if (!use_cifg) { - tensor_utils::MatrixBatchVectorMultiplyAccumulate( - input_to_input_weights_ptr, n_cell, n_input, input_ptr_batch, n_batch, - input_gate_scratch, /*result_stride=*/1); - } - tensor_utils::MatrixBatchVectorMultiplyAccumulate( - input_to_forget_weights_ptr, n_cell, n_input, input_ptr_batch, n_batch, - forget_gate_scratch, /*result_stride=*/1); - tensor_utils::MatrixBatchVectorMultiplyAccumulate( - input_to_cell_weights_ptr, n_cell, n_input, input_ptr_batch, n_batch, - cell_scratch, /*result_stride=*/1); - tensor_utils::MatrixBatchVectorMultiplyAccumulate( - input_to_output_weights_ptr, n_cell, n_input, input_ptr_batch, n_batch, - output_gate_scratch, /*result_stride=*/1); - - // For each batch and cell: compute recurrent_weight * output_state. 
- if (!use_cifg) { - tensor_utils::MatrixBatchVectorMultiplyAccumulate( - recurrent_to_input_weights_ptr, n_cell, n_output, output_state_ptr, - n_batch, input_gate_scratch, - /*result_stride=*/1); - } - tensor_utils::MatrixBatchVectorMultiplyAccumulate( - recurrent_to_forget_weights_ptr, n_cell, n_output, output_state_ptr, - n_batch, forget_gate_scratch, - /*result_stride=*/1); - tensor_utils::MatrixBatchVectorMultiplyAccumulate( - recurrent_to_cell_weights_ptr, n_cell, n_output, output_state_ptr, - n_batch, cell_scratch, /*result_stride=*/1); - tensor_utils::MatrixBatchVectorMultiplyAccumulate( - recurrent_to_output_weights_ptr, n_cell, n_output, output_state_ptr, - n_batch, output_gate_scratch, - /*result_stride=*/1); - - // For each batch and cell: update input gate. - if (!use_cifg) { - if (use_peephole) { - tensor_utils::VectorBatchVectorCwiseProductAccumulate( - cell_to_input_weights_ptr, n_cell, cell_state_ptr, n_batch, - input_gate_scratch); - } - tensor_utils::ApplySigmoidToVector(input_gate_scratch, n_cell * n_batch, - input_gate_scratch); - } - - // For each batch and cell: update forget gate. - if (use_peephole) { - tensor_utils::VectorBatchVectorCwiseProductAccumulate( - cell_to_forget_weights_ptr, n_cell, cell_state_ptr, n_batch, - forget_gate_scratch); - } - tensor_utils::ApplySigmoidToVector(forget_gate_scratch, n_cell * n_batch, - forget_gate_scratch); - - // For each batch and cell: update the cell. - tensor_utils::VectorVectorCwiseProduct(forget_gate_scratch, cell_state_ptr, - n_batch * n_cell, cell_state_ptr); - tensor_utils::ApplyActivationToVector(cell_scratch, n_batch * n_cell, - params->activation, cell_scratch); - if (use_cifg) { - tensor_utils::Sub1Vector(forget_gate_scratch, n_batch * n_cell, - forget_gate_scratch); - tensor_utils::VectorVectorCwiseProductAccumulate( - cell_scratch, forget_gate_scratch, n_batch * n_cell, cell_state_ptr); - } else { - tensor_utils::VectorVectorCwiseProductAccumulate( - cell_scratch, input_gate_scratch, n_batch * n_cell, cell_state_ptr); - } - if (params->cell_clip > 0.0) { - tensor_utils::ClipVector(cell_state_ptr, n_batch * n_cell, - params->cell_clip, cell_state_ptr); - } - - // For each batch and cell: update the output gate. - if (use_peephole) { - tensor_utils::VectorBatchVectorCwiseProductAccumulate( - cell_to_output_weights_ptr, n_cell, cell_state_ptr, n_batch, - output_gate_scratch); - } - tensor_utils::ApplySigmoidToVector(output_gate_scratch, n_batch * n_cell, - output_gate_scratch); - tensor_utils::ApplyActivationToVector(cell_state_ptr, n_batch * n_cell, - params->activation, cell_scratch); - tensor_utils::VectorVectorCwiseProduct(output_gate_scratch, cell_scratch, - n_batch * n_cell, output_gate_scratch); - - // For each batch: update the projection and output_state. 
- const bool use_projection_weight = (projection_weights_ptr != nullptr); - const bool use_projection_bias = (projection_bias_ptr != nullptr); - if (use_projection_weight) { - if (use_projection_bias) { - tensor_utils::VectorBatchVectorAssign(projection_bias_ptr, n_output, - n_batch, output_ptr_batch); - } else { - tensor_utils::ZeroVector(output_ptr_batch, n_batch * n_output); - } - tensor_utils::MatrixBatchVectorMultiplyAccumulate( - projection_weights_ptr, n_output, n_cell, output_gate_scratch, n_batch, - output_ptr_batch, /*result_stride=*/1); - if (params->proj_clip > 0.0) { - tensor_utils::ClipVector(output_ptr_batch, n_batch * n_output, - params->proj_clip, output_ptr_batch); - } - } else { - tensor_utils::CopyVector(output_gate_scratch, n_batch * n_output, - output_ptr_batch); - } - tensor_utils::CopyVector(output_ptr_batch, n_batch * n_output, - output_state_ptr); -} - } // namespace kernel_utils } // namespace tflite diff --git a/tensorflow/contrib/lite/kernels/internal/kernel_utils.h b/tensorflow/contrib/lite/kernels/internal/kernel_utils.h index cbfbcbeefc..74e0a4a53d 100644 --- a/tensorflow/contrib/lite/kernels/internal/kernel_utils.h +++ b/tensorflow/contrib/lite/kernels/internal/kernel_utils.h @@ -15,7 +15,7 @@ limitations under the License. #ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_KERNEL_UTILS_H_ #define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_KERNEL_UTILS_H_ -#include "tensorflow/contrib/lite/builtin_op_data.h" +#include "tensorflow/contrib/lite/c/builtin_op_data.h" namespace tflite { namespace kernel_utils { @@ -35,12 +35,24 @@ void RnnBatchStep(const float* input_ptr_batch, const float* input_weights_ptr, TfLiteFusedActivation activation, float* hidden_state_ptr_batch, float* output_ptr_batch); +// Same as above but includes an auxiliary input with the corresponding weights. +void RnnBatchStep(const float* input_ptr_batch, const float* input_weights_ptr, + const float* aux_input_ptr_batch, + const float* aux_input_weights_ptr, + const float* recurrent_weights_ptr, const float* bias_ptr, + int input_size, int aux_input_size, int num_units, + int batch_size, TfLiteFusedActivation activation, + float* hidden_state_ptr_batch, float* output_ptr_batch); + // Performs a quantized RNN batch inference step. Same as above, but for // quantization purposes, we also pass in quantized_hidden_state_ptr_batch and // quantized_input_ptr_batch pointers for temporary storage of the quantized // values of hidden_state_ptr_batch and input_ptr_batch, respectively. // These temporary storages are expected to be preallocated to the same size as // the respective pointers. +// An additional preallocated temporary storage 'scaling_factors' (of size +// batch_size) is used to store the scaling factors of the quantization (used +// for recovery). // {input,recurrent}_weights_scale params are used for dequantization/recovery. void RnnBatchStep(const float* input_ptr_batch, const int8_t* input_weights_ptr, float input_weights_scale, @@ -50,43 +62,19 @@ void RnnBatchStep(const float* input_ptr_batch, const int8_t* input_weights_ptr, TfLiteFusedActivation activation, int8_t* quantized_input_ptr_batch, int8_t* quantized_hidden_state_ptr_batch, - float* hidden_state_ptr_batch, float* output_ptr_batch); + float* scaling_factors, float* hidden_state_ptr_batch, + float* output_ptr_batch); -// Performs an LSTM batch inference step for input specified by input_ptr_batch. 
-// The LSTM cell is specified by the pointers to its weights (*_weights_ptr) and -// biases (*_bias_ptr), and buffers (*_scratch), along with additional -// parameters: -// - params: various LSTM params including activation, clipping, etc., -// - n_batch: size of batch, -// - n_cell: number of cells (or units), -// - n_input: the input size, -// - n_output: the output size. -// -// The pointers to the cell and output state and the output are updated. Unless -// projection is specified output and output state contain the same data. -// -// The pointers with the suffix "_batch" point to data aligned in batch_major -// order, and each step processes batch_size many inputs from input_ptr_batch, -// and updates batch_size many cell and output states. -void LstmStep( - const float* input_ptr_batch, const float* input_to_input_weights_ptr, - const float* input_to_forget_weights_ptr, - const float* input_to_cell_weights_ptr, - const float* input_to_output_weights_ptr, - const float* recurrent_to_input_weights_ptr, - const float* recurrent_to_forget_weights_ptr, - const float* recurrent_to_cell_weights_ptr, - const float* recurrent_to_output_weights_ptr, - const float* cell_to_input_weights_ptr, - const float* cell_to_forget_weights_ptr, - const float* cell_to_output_weights_ptr, const float* input_gate_bias_ptr, - const float* forget_gate_bias_ptr, const float* cell_bias_ptr, - const float* output_gate_bias_ptr, const float* projection_weights_ptr, - const float* projection_bias_ptr, const TfLiteLSTMParams* params, - int n_batch, int n_cell, int n_input, int n_output, float* output_state_ptr, - float* cell_state_ptr, float* input_gate_scratch, - float* forget_gate_scratch, float* cell_scratch, float* output_gate_scratch, - float* output_ptr_batch); +void RnnBatchStep( + const float* input_ptr_batch, const int8_t* input_weights_ptr, + float input_weights_scale, const float* aux_input_ptr_batch, + const int8_t* aux_input_weights_ptr, float aux_input_weights_scale, + const int8_t* recurrent_weights_ptr, float recurrent_weights_scale, + const float* bias_ptr, int input_size, int aux_input_size, int num_units, + int batch_size, TfLiteFusedActivation activation, + int8_t* quantized_input_ptr_batch, int8_t* aux_quantized_input_ptr_batch, + int8_t* quantized_hidden_state_ptr_batch, float* scaling_factors, + float* hidden_state_ptr_batch, float* output_ptr_batch); } // namespace kernel_utils } // namespace tflite diff --git a/tensorflow/contrib/lite/kernels/internal/log_quantized_test.cc b/tensorflow/contrib/lite/kernels/internal/log_quantized_test.cc new file mode 100644 index 0000000000..8963abb9af --- /dev/null +++ b/tensorflow/contrib/lite/kernels/internal/log_quantized_test.cc @@ -0,0 +1,334 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include <algorithm> +#include <cmath> +#include <cstdlib> +#include <functional> +#include <iterator> +#include <limits> +#include <random> +#include <sstream> +#include <string> +#include <vector> + +#define GEMMLOWP_ENABLE_FIXEDPOINT_CONSTANTS_CHECKS + +#include <gmock/gmock.h> +#include <gtest/gtest.h> +#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h" +#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h" +#include "tensorflow/contrib/lite/string.h" + +namespace tflite { + +class NumberGenerator { + public: + std::vector<int> RandomIntVector(int n, int min_val, int max_val) { + std::vector<int> vec(n); + double scale = static_cast<double>(max_val + 1 - min_val) / engine_.max(); + for (auto& it : vec) { + it = min_val + std::floor(engine_() * scale); + } + return vec; + } + + std::mt19937 engine_; +}; + +class LogQuantizedTest : public ::testing::Test { + public: + NumberGenerator generator_; +}; + +// input_integer_bits <= 30. output_integer_bits > 0. +inline int32 LogPositiveValuesViaFloat(int32 input_val, int input_integer_bits, + int output_integer_bits) { + const double float_log_sum_of_exps = std::log( + static_cast<double>(input_val) * 0.5 / (1 << (30 - input_integer_bits))); + static constexpr double min_int = + static_cast<double>(std::numeric_limits<int32>::min()); + static constexpr double max_int = + static_cast<double>(std::numeric_limits<int32>::max()); + double double_result = tflite::TfLiteRound(float_log_sum_of_exps * + (1 << (31 - output_integer_bits))); + return static_cast<std::int32_t>( + std::min(max_int, std::max(min_int, double_result))); +} + +void CheckOutputData(const std::vector<int32>& test_output, + const std::vector<int32>& reference_output, + const std::vector<int32>& test_input, + const string& check_label, int input_integer_bits, + int output_integer_bits, int tolerance) { + // In the special case of small input, specifically raw value of 5, a rounding + // up leads to difference in the output. We do not aim to be accurate for + // very small input values, and there should be sufficient input fractional + // bits that this is a small input. + static constexpr double error_from_rounding_up = 0.0224585; + const int n = test_output.size(); + ASSERT_EQ(n, reference_output.size()); + for (int i = 0; i < n; ++i) { + // Adjust tolerance when input <= 5*2^-(31-input_integer_bits). + const int adjusted_tolerance = + test_input[i] > 5 + ? 
tolerance + : std::max(tolerance, static_cast<int>(std::ceil( + error_from_rounding_up * + (1 << (31 - output_integer_bits))))); + ASSERT_LE(std::abs(test_output[i] - reference_output[i]), + adjusted_tolerance) + << "Failure in \"" << check_label << "\" at i=" << i + << ", test_input[i]=" << test_input[i] << "=" + << static_cast<double>(test_input[i]) / (1 << (31 - input_integer_bits)) + << ", test_output[i]=" << test_output[i] << "=" + << static_cast<double>(test_output[i]) / + (1 << (31 - output_integer_bits)) + << ", reference_output[i]=" << reference_output[i] << "=" + << static_cast<double>(reference_output[i]) / + (1 << (31 - output_integer_bits)) + << ", difference[i]=" << std::abs(reference_output[i] - test_output[i]) + << "=" + << static_cast<double>(std::abs(reference_output[i] - test_output[i])) / + (1 << (31 - output_integer_bits)) + << "; tolerance=" << tolerance + << ", adj tolerance=" << adjusted_tolerance; + } +} + +void RightShiftVector(const std::vector<int32>& shifts, + std::vector<int32>* vec) { + const int n = vec->size(); + ASSERT_EQ(n, shifts.size()); + for (int i = 0; i < n; ++i) { + vec->at(i) = std::max(1, vec->at(i) >> shifts[i]); + } +} + +template <int OutputIntegerBits, int InputIntegerBits> +void RunSingleTest(const std::vector<int32>& test_input, + const string& check_label, int tolerance) { + const int n = test_input.size(); + std::vector<int32> float_gen_output(n, 0); + std::vector<int32> reference_output(n, 0); + std::vector<int32> optimized_output(n, 0); + + // Workaround the stupid things that intelligent humans do. + // Consequence of __builtin_clz(0u) may equal 31 instead of 32. + std::vector<int32> fudged_input(n, 0); + for (int i = 0; i < n; ++i) { + fudged_input[i] = std::max(test_input[i], 2); + } + + for (int i = 0; i < n; ++i) { + reference_output[i] = + tflite::reference_ops::log_x_for_x_greater_than_or_equal_to_1_impl< + OutputIntegerBits, InputIntegerBits>( + gemmlowp::FixedPoint<int32, InputIntegerBits>::FromRaw( + fudged_input[i])) + .raw(); + optimized_output[i] = + tflite::optimized_ops::log_x_for_x_greater_than_or_equal_to_1_impl< + OutputIntegerBits, InputIntegerBits>( + gemmlowp::FixedPoint<int32, InputIntegerBits>::FromRaw( + fudged_input[i])) + .raw(); + float_gen_output[i] = LogPositiveValuesViaFloat( + fudged_input[i], InputIntegerBits, OutputIntegerBits); + } + // Note that first check is intolerant. 
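All three outputs compared below use the same fixed-point convention as LogPositiveValuesViaFloat and the diagnostic output above: a raw int32 with N integer bits represents raw / 2^(31 - N). A one-line restatement for reference:

    #include <cstdint>

    // E.g. with 5 integer bits, raw = 1 << 26 represents exactly 1.0.
    double FixedPointRawToDouble(std::int32_t raw, int integer_bits) {
      return static_cast<double>(raw) /
             static_cast<double>(1u << (31 - integer_bits));
    }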
+ { + std::ostringstream label; + label << check_label << " / optimized vs reference / InputIntegerBits=" + << InputIntegerBits << ", OutputIntegerBits=" << OutputIntegerBits; + CheckOutputData( + optimized_output, reference_output, test_input, label.str(), + InputIntegerBits, OutputIntegerBits, 0); + } + { + std::ostringstream label; + label << check_label << " / reference vs float-gen / InputIntegerBits=" + << InputIntegerBits << ", OutputIntegerBits=" << OutputIntegerBits; + CheckOutputData( + reference_output, float_gen_output, test_input, label.str(), + InputIntegerBits, OutputIntegerBits, tolerance); + } + { + std::ostringstream label; + label << check_label << " optimized vs float-gen / InputIntegerBits=" + << InputIntegerBits << ", OutputIntegerBits=" << OutputIntegerBits; + CheckOutputData( + optimized_output, float_gen_output, test_input, label.str(), + InputIntegerBits, OutputIntegerBits, tolerance); + } +} + +template <int OutputIntegerBits> +void RunSingleTest(const std::vector<int32>& test_input, int input_integer_bits, + const string& check_label, int tolerance) { +#define INPUT_CASE(K) \ + case K: \ + return RunSingleTest<OutputIntegerBits, K>(test_input, check_label, \ + tolerance) + switch (input_integer_bits) { + INPUT_CASE(0); + INPUT_CASE(1); + INPUT_CASE(2); + INPUT_CASE(3); + INPUT_CASE(4); + INPUT_CASE(5); + INPUT_CASE(6); + INPUT_CASE(7); + INPUT_CASE(8); + INPUT_CASE(9); + INPUT_CASE(10); + INPUT_CASE(11); + INPUT_CASE(12); + INPUT_CASE(13); + INPUT_CASE(14); + INPUT_CASE(15); + INPUT_CASE(16); + INPUT_CASE(17); + INPUT_CASE(18); + INPUT_CASE(19); + INPUT_CASE(20); + INPUT_CASE(21); + INPUT_CASE(22); + INPUT_CASE(23); + INPUT_CASE(24); + INPUT_CASE(25); + INPUT_CASE(26); + INPUT_CASE(27); + INPUT_CASE(28); + INPUT_CASE(29); + default: + ASSERT_LE(input_integer_bits, 30) + << "Input integer bits not handled: " << input_integer_bits; + } +#undef INPUT_CASE +} + +void RunSingleTest(const std::vector<int32>& test_input, int input_integer_bits, + int output_integer_bits, const string& check_label, + int tolerance) { +#define OUTPUT_CASE(K) \ + case K: \ + return RunSingleTest<K>(test_input, input_integer_bits, check_label, \ + tolerance) + switch (output_integer_bits) { + OUTPUT_CASE(0); + OUTPUT_CASE(1); + OUTPUT_CASE(2); + OUTPUT_CASE(3); + OUTPUT_CASE(4); + OUTPUT_CASE(5); + OUTPUT_CASE(6); + OUTPUT_CASE(7); + OUTPUT_CASE(8); + OUTPUT_CASE(9); + OUTPUT_CASE(10); + OUTPUT_CASE(11); + OUTPUT_CASE(12); + OUTPUT_CASE(13); + OUTPUT_CASE(14); + OUTPUT_CASE(15); + OUTPUT_CASE(16); + OUTPUT_CASE(17); + OUTPUT_CASE(18); + OUTPUT_CASE(19); + OUTPUT_CASE(20); + OUTPUT_CASE(21); + OUTPUT_CASE(22); + OUTPUT_CASE(23); + OUTPUT_CASE(24); + OUTPUT_CASE(25); + OUTPUT_CASE(26); + OUTPUT_CASE(27); + OUTPUT_CASE(28); + OUTPUT_CASE(29); + default: + ASSERT_LE(input_integer_bits, 30) + << "Input integer bits not handled: " << input_integer_bits; + } +#undef OUTPUT_CASE +} + +void RunUniformTest(int test_size, int input_integer_bits, + int output_integer_bits, const string& check_label, + int tolerance, NumberGenerator* generator) { + std::vector<int> test_data = generator->RandomIntVector( + test_size, 2, std::numeric_limits<int>::max() - 1); + test_data[0] = 2; + test_data[1] = 3; + test_data[2] = 4; + test_data[3] = std::numeric_limits<int32>::max() - 2; + test_data[4] = std::numeric_limits<int32>::max() - 1; + test_data[5] = std::numeric_limits<int32>::max(); + + RunSingleTest(test_data, input_integer_bits, output_integer_bits, + check_label + " / uniform test", tolerance); +} + +void 
RunUniformShiftUniformTest(int test_size, int input_integer_bits, + int output_integer_bits, + const string& check_label, int tolerance, + NumberGenerator* generator) { + std::vector<int> test_data = generator->RandomIntVector( + test_size, 2, std::numeric_limits<int>::max() - 1); + std::vector<int> shifts = generator->RandomIntVector(test_size, 0, 29); + RightShiftVector(shifts, &test_data); + + RunSingleTest(test_data, input_integer_bits, output_integer_bits, + check_label + " / shifted test", tolerance); +} + +TEST_F(LogQuantizedTest, VariedIntegerBits) { + static constexpr int kVariations = 250; + static constexpr int kRunSize = 250; + static constexpr int kIntegerTolerance = 8; + static constexpr double kOutputFloatTolerance = 7.0e-7; + + std::vector<int> input_integer_bits = + generator_.RandomIntVector(kVariations, 0, 24); + std::vector<int> output_integer_bits = + generator_.RandomIntVector(kVariations, 1, 10); + + for (int i = 0; i < kVariations; ++i) { + int var_output_integer_bits = output_integer_bits[i]; + int tolerance = + std::max(1.0 * kIntegerTolerance, + (1 << (31 - var_output_integer_bits)) * kOutputFloatTolerance); + + RunUniformTest(kRunSize, input_integer_bits[i], var_output_integer_bits, + "VariedIntegerBits", tolerance, &generator_); + RunUniformShiftUniformTest(kRunSize, input_integer_bits[i], + var_output_integer_bits, "VariedIntegerBits", + tolerance, &generator_); + } +} + +TEST_F(LogQuantizedTest, SelectedIntegerBits) { + static constexpr int kInputBits = 12; + static constexpr int kOutputBits = 5; + static constexpr int kRunSize = 100000; + static constexpr int kIntegerTolerance = 4; + + RunUniformTest(kRunSize, kInputBits, kOutputBits, "SelectedIntegerBits", + kIntegerTolerance, &generator_); + RunUniformShiftUniformTest(kRunSize, kInputBits, kOutputBits, + "SelectedIntegerBits", kIntegerTolerance, + &generator_); +} + +} // namespace tflite diff --git a/tensorflow/contrib/lite/kernels/internal/logsoftmax_quantized_test.cc b/tensorflow/contrib/lite/kernels/internal/logsoftmax_quantized_test.cc new file mode 100644 index 0000000000..2252ca1bcc --- /dev/null +++ b/tensorflow/contrib/lite/kernels/internal/logsoftmax_quantized_test.cc @@ -0,0 +1,251 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include <algorithm> +#include <cmath> +#include <cstdlib> +#include <functional> +#include <iterator> +#include <limits> +#include <random> +#include <string> +#include <vector> + +#include <gtest/gtest.h> +#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h" +#include "tensorflow/contrib/lite/kernels/internal/quantization_util.h" +#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h" +#include "tensorflow/contrib/lite/kernels/internal/test_util.h" +#include "tensorflow/contrib/lite/string.h" + +namespace tflite { +namespace { + +void RunLogSoftmaxFloatReference(const uint8* input_data, + const RuntimeShape& shape_common, + int32 input_offset, const double input_scale, + int stride, float beta, + uint8* reference_output_data) { + const int ref_buffer_size = shape_common.FlatSize(); + std::vector<float> reference_dequant_data(ref_buffer_size); + std::vector<float> reference_output_float_data(ref_buffer_size); + + // Reference data generated via Dequant of input into float, and then applying + // float LogSoftmax. + DequantizationParams dq_params; + dq_params.zero_point = input_offset; + dq_params.scale = input_scale; + reference_ops::Dequantize(dq_params, shape_common, input_data, shape_common, + reference_dequant_data.data()); + SoftmaxParams sm_params; + optimized_ops::LogSoftmax(sm_params, shape_common, + reference_dequant_data.data(), shape_common, + reference_output_float_data.data()); + // Work with quantized scaling for LogSoftmax, under which 255 represents 0, + // and -16 gets nudged up to 0. + for (int i = 0; i < ref_buffer_size; i++) { + reference_output_data[i] = std::max( + 0, static_cast<int>( + 255 + std::round(16.0f * reference_output_float_data[i]))); + } +} + +void CheckOutputData(const uint8* test_output, const uint8* reference_output, + const RuntimeShape& shape_common, + const string& check_label, bool be_exacting) { + const int buffer_size = shape_common.FlatSize(); + // While calculating some metrics in floating point, we work with quantized + // scaling. + std::vector<int> diff(buffer_size); + int64_t sum_diff = 0; + int64_t sum_abs_diff = 0; + for (int i = 0; i < buffer_size; i++) { + diff[i] = static_cast<int>(test_output[i]) - reference_output[i]; + sum_diff += diff[i]; + sum_abs_diff += std::abs(diff[i]); + } + // These stats help understand test failures. + std::sort(std::begin(diff), std::end(diff)); + const int min_diff = diff.front(); + const int max_diff = diff.back(); + const int median_diff = diff[diff.size() / 2]; + const float mean_diff = static_cast<float>(sum_diff) / buffer_size; + const float mean_abs_diff = static_cast<float>(sum_abs_diff) / buffer_size; + // We either check for bit exactness (against the reference quantized version) + // or for general accuracy, allowing off-by-one (against the float reference). + if (be_exacting) { + ASSERT_TRUE(std::abs(min_diff) == 0 && std::abs(max_diff) == 0) + << check_label << ": " + << "std::abs(min_diff)=" << std::abs(min_diff) + << ", std::abs(max_diff)=" << std::abs(max_diff); + } else { + // For small numbers of samples, the estimates of the means vary more. + // Rather than widen the tolerances, we skip the smaller tests. 
+ ASSERT_TRUE(((std::abs(mean_diff) < 2e-2f && mean_abs_diff < 3e-2f) || + buffer_size < 10000) && + std::abs(median_diff) == 0 && std::abs(min_diff) <= 1 && + std::abs(max_diff) <= 1) + << check_label << ": " + << "buffer_size=" << buffer_size << ", mean_diff=" << mean_diff + << ", mean_abs_diff=" << mean_abs_diff + << ", median_diff=" << median_diff << ", min_diff=" << min_diff + << ", max_diff=" << max_diff; + } +} + +// Runs the LogSoftmax and compares against the float reference implementation +// and the quantized reference implementation. +void RunOneLogSoftmaxTest(const uint8* input_data, + const RuntimeShape& shape_common, int32 input_offset, + const double input_scale, int stride, float beta) { + const int buffer_size = shape_common.FlatSize(); + std::vector<uint8> optimized_logsoftmax_output(buffer_size); + std::vector<uint8> reference_float_logsoftmax_output(buffer_size); + std::vector<uint8> reference_quant_logsoftmax_output(buffer_size); + + RunLogSoftmaxFloatReference(input_data, shape_common, input_offset, + input_scale, stride, beta, + reference_float_logsoftmax_output.data()); + + int32 input_beta_multiplier; + int input_beta_left_shift; + int32 reverse_scaling_divisor; + int reverse_scaling_right_shift; + static const int kScaledDiffIntegerBits = 5; + tflite::PreprocessLogSoftmaxScalingExp( + beta, input_scale, kScaledDiffIntegerBits, &input_beta_multiplier, + &input_beta_left_shift, &reverse_scaling_divisor, + &reverse_scaling_right_shift); + reverse_scaling_right_shift *= -1; + // diff_min has a negative value, and is used to limit the maximum magnitude + // of the diffs, which are <= 0. + const int diff_min = -tflite::CalculateInputRadius(kScaledDiffIntegerBits, + input_beta_left_shift); + + SoftmaxParams params; + params.input_multiplier = input_beta_multiplier; + params.input_left_shift = input_beta_left_shift; + params.reverse_scaling_divisor = reverse_scaling_divisor; + params.reverse_scaling_right_shift = reverse_scaling_right_shift; + params.diff_min = diff_min; + optimized_ops::LogSoftmax(params, shape_common, input_data, shape_common, + optimized_logsoftmax_output.data()); + reference_ops::LogSoftmax(params, shape_common, input_data, shape_common, + reference_quant_logsoftmax_output.data()); + + CheckOutputData(optimized_logsoftmax_output.data(), + reference_float_logsoftmax_output.data(), shape_common, + "Optimized vs float reference", false); + CheckOutputData(optimized_logsoftmax_output.data(), + reference_quant_logsoftmax_output.data(), shape_common, + "Optimized vs quant reference", true); + CheckOutputData(reference_quant_logsoftmax_output.data(), + reference_float_logsoftmax_output.data(), shape_common, + "Quant reference vs float reference", false); +} + +// This function picks some random LogSoftmax params, which are checked for +// desirability. If not acceptable, it returns false. If they're OK, +// it runs the LogSoftmax test and returns true. This allows the caller +// to loop until a test has been run. +// +// Currently we do not reject for any reason. +bool TryOneUniformLogSoftmax() { + // We pick mostly positive values, on the whole emphasizing smaller values and + // therefore faster tests. We test a wider range of depths. In the case of + // LogSoftmax, the width and height really just create test repetitions. 
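The uint8 encoding produced by the float reference above is worth spelling out: LogSoftmax outputs are always <= 0 and are stored with scale 1/16 and zero point 255, so 255 encodes 0.0, 239 encodes -1.0, and anything below about -15.94 clamps to 0. A small restatement of that mapping:

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // Quantize a (non-positive) log-probability with scale 1/16, zero point 255.
    std::uint8_t QuantizeLogSoftmaxOutput(float log_prob) {
      const int q = static_cast<int>(255 + std::round(16.0f * log_prob));
      return static_cast<std::uint8_t>(std::min(255, std::max(0, q)));
    }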
+ const int batch = ExponentialRandomPositiveInt(0.9f, 3, 20); + const int input_depth = ExponentialRandomPositiveInt(0.75f, 175, 500); + const int input_width = ExponentialRandomPositiveInt(0.8f, 20, 200); + const int input_height = ExponentialRandomPositiveInt(0.8f, 20, 200); + const int stride = ExponentialRandomPositiveInt(0.9f, 3, 8); + const double input_scale = std::pow(10.0, UniformRandomFloat(-2.0, 1.0)); + const int32 input_offset = UniformRandomInt(-256, 0); + static constexpr float beta = 1.0f; + + auto shape_common = + RuntimeShape({batch, input_height, input_width, input_depth}); + const int buffer_size = shape_common.FlatSize(); + + std::vector<uint8> input_data(buffer_size); + FillRandom(&input_data); + RunOneLogSoftmaxTest(input_data.data(), shape_common, input_offset, + input_scale, stride, beta); + return true; +} + +// See TryOneUniformLogSoftmax() for a general description. +// +// Tests with "skyscraper" input patterns are included for two reasons. (a) +// Bimodal distributions are potentially challenging and perhaps more +// realistic than simple uniform random inputs. (b) Some implementations of +// LogSoftmax may adapt as they traverse the depth, and so we test handling of +// cases where relatively small values are encountered at the beginning and end. +bool TryOneSkyscraperLogSoftmax(bool small_depth) { + // We pick mostly positive values, on the whole emphasizing smaller values and + // therefore faster tests. We test a wider range of depths. In the case of + // LogSoftmax, the width and height really just create test repetitions. + const int batch = ExponentialRandomPositiveInt(0.9f, 3, 20); + const int input_depth = small_depth + ? ExponentialRandomPositiveInt(0.75f, 40, 500) + : ExponentialRandomPositiveInt(0.75f, 175, 500); + const int input_width = ExponentialRandomPositiveInt(0.7f, 20, 200); + const int input_height = ExponentialRandomPositiveInt(0.7f, 20, 200); + const int stride = ExponentialRandomPositiveInt(0.9f, 3, 8); + const double input_scale = std::pow(10.0, UniformRandomFloat(-2.0, 1.0)); + const int32 input_offset = UniformRandomInt(-256, 0); + static constexpr float beta = 1.0f; + // Extra parameters for skyscraper input patterns. 
+ const double middle_proportion = + ExponentialRandomPositiveFloat(0.65f, 0.1, 1.0); + const int middle_min = UniformRandomInt(0, 255); + const int sides_max = UniformRandomInt(0, middle_min); + + auto shape_common = + RuntimeShape({batch, input_height, input_width, input_depth}); + const int buffer_size = shape_common.FlatSize(); + + std::vector<uint8> input_data(buffer_size); + FillRandomSkyscraper(&input_data, input_depth, middle_proportion, middle_min, + sides_max); + RunOneLogSoftmaxTest(input_data.data(), shape_common, input_offset, + input_scale, stride, beta); + return true; +} + +TEST(TestQuantizedLogSoftmax, UniformLogSoftmaxTests) { + const int kTestsToRun = 1000; + for (int i = 0; i < kTestsToRun; i++) { + while (!TryOneUniformLogSoftmax()) { + } + } +} + +TEST(TestQuantizedLogSoftmax, SkyscraperLogSoftmaxTests) { + const int kTestsToRun = 1000; + for (int i = 0; i < kTestsToRun; i++) { + while (!TryOneSkyscraperLogSoftmax(false)) { + } + } +} + +TEST(TestQuantizedLogSoftmax, SmallSkyscraperLogSoftmaxTests) { + const int kTestsToRun = 1000; + for (int i = 0; i < kTestsToRun; i++) { + while (!TryOneSkyscraperLogSoftmax(true)) { + } + } +} +} // namespace +} // namespace tflite diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/cblas_conv.h b/tensorflow/contrib/lite/kernels/internal/optimized/cblas_conv.h index 4a90e7e640..2d96da65c3 100644 --- a/tensorflow/contrib/lite/kernels/internal/optimized/cblas_conv.h +++ b/tensorflow/contrib/lite/kernels/internal/optimized/cblas_conv.h @@ -31,33 +31,50 @@ limitations under the License. namespace tflite { namespace cblas_ops { -inline void Conv(const float* input_data, const Dims<4>& input_dims, - const float* filter_data, const Dims<4>& filter_dims, - const float* bias_data, const Dims<4>& bias_dims, - int stride_width, int stride_height, int pad_width, - int pad_height, float output_activation_min, - float output_activation_max, float* output_data, - const Dims<4>& output_dims, float* im2col_data, - const Dims<4>& im2col_dims) { +inline void Conv(const ConvParams& params, const RuntimeShape& input_shape, + const float* input_data, const RuntimeShape& filter_shape, + const float* filter_data, const RuntimeShape& bias_shape, + const float* bias_data, const RuntimeShape& output_shape, + float* output_data, const RuntimeShape& im2col_shape, + float* im2col_data) { + const int stride_width = params.stride_width; + const int stride_height = params.stride_height; + const int pad_width = params.padding_values.width; + const int pad_height = params.padding_values.height; + const int dilation_width_factor = params.dilation_width_factor; + const int dilation_height_factor = params.dilation_height_factor; + const float output_activation_min = params.float_activation_min; + const float output_activation_max = params.float_activation_max; + TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4); + TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4); + TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4); gemmlowp::ScopedProfilingLabel label("Conv/cblas"); const float* gemm_input_data = nullptr; - const Dims<4>* gemm_input_dims = nullptr; - const int filter_width = ArraySize(filter_dims, 1); - const int filter_height = ArraySize(filter_dims, 2); + const RuntimeShape* gemm_input_shape = nullptr; + const int filter_width = filter_shape.Dims(2); + const int filter_height = filter_shape.Dims(1); const bool need_im2col = stride_width != 1 || stride_height != 1 || filter_width != 1 || filter_height != 1; if (need_im2col) { 
TFLITE_DCHECK(im2col_data); - optimized_ops::Im2col(input_data, input_dims, stride_width, stride_height, - pad_width, pad_height, filter_height, filter_width, 0, - im2col_data, im2col_dims); + ConvParams op_params; + op_params.padding_type = PaddingType::kSame; + op_params.padding_values.width = pad_width; + op_params.padding_values.height = pad_height; + op_params.stride_width = stride_width; + op_params.stride_height = stride_height; + op_params.dilation_width_factor = dilation_width_factor; + op_params.dilation_height_factor = dilation_height_factor; + optimized_ops::Im2col(op_params, filter_height, filter_width, 0, + input_shape, input_data, im2col_shape, im2col_data); + gemm_input_data = im2col_data; - gemm_input_dims = &im2col_dims; + gemm_input_shape = &im2col_shape; } else { TFLITE_DCHECK(!im2col_data); gemm_input_data = input_data; - gemm_input_dims = &input_dims; + gemm_input_shape = &input_shape; } // The following code computes matrix multiplication c = a * transponse(b) @@ -69,10 +86,10 @@ inline void Conv(const float* input_data, const Dims<4>& input_dims, const float* a = gemm_input_data; const float* b = filter_data; float* c = output_data; - int m = gemm_input_dims->sizes[1] * gemm_input_dims->sizes[2] * - gemm_input_dims->sizes[3]; - int n = output_dims.sizes[0]; - int k = gemm_input_dims->sizes[0]; + const int gemm_input_dims = gemm_input_shape->DimensionsCount(); + int m = FlatSizeSkipDim(*gemm_input_shape, gemm_input_dims - 1); + int n = output_shape.Dims(3); + int k = gemm_input_shape->Dims(gemm_input_dims - 1); // The stride of matrix a, b and c respectively. int stride_a = k; int stride_b = k; @@ -82,8 +99,8 @@ inline void Conv(const float* input_data, const Dims<4>& input_dims, stride_a, b, stride_b, 0.0f, c, stride_c); optimized_ops::AddBiasAndEvalActivationFunction( - bias_data, bias_dims, output_data, output_dims, output_activation_min, - output_activation_max); + output_activation_min, output_activation_max, bias_shape, bias_data, + output_shape, output_data); } } // namespace cblas_ops diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/cpu_check.h b/tensorflow/contrib/lite/kernels/internal/optimized/cpu_check.h index 3a53d3ab07..934308ef29 100644 --- a/tensorflow/contrib/lite/kernels/internal/optimized/cpu_check.h +++ b/tensorflow/contrib/lite/kernels/internal/optimized/cpu_check.h @@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_CPU_CHECK_ -#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_CPU_CHECK_ +#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_CPU_CHECK_H_ +#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_CPU_CHECK_H_ namespace tflite { @@ -58,4 +58,4 @@ inline bool TestCPUFeatureNeon() { return false; } : Portable##funcname(__VA_ARGS__) #endif -#endif // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_CPU_CHECK_ +#endif // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_CPU_CHECK_H_ diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_float.h b/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_float.h index 7f6eea2d5d..d8dd7bba89 100644 --- a/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_float.h +++ b/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_float.h @@ -761,7 +761,8 @@ struct FloatDepthwiseConvKernel<true, 4, 1> { // Accumulates the effect of one row of the filter, on a segment of one row // of the output, accessing the corresponding one row of the input. template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier> -void FloatDepthwiseConvAccumRow(int stride, int input_depth, int input_width, +void FloatDepthwiseConvAccumRow(int stride, int dilation_factor, + int input_depth, int input_width, const float* input_data, int pad_width, int depth_multiplier, int filter_width, const float* filter_data, @@ -835,10 +836,10 @@ void FloatDepthwiseConvAccumRow(int stride, int input_depth, int input_width, // generic fallback of FloatDepthwiseConvAccumRow, portable, non-templatized. inline void FloatDepthwiseConvAccumRowGeneric( - int stride, int input_depth, int input_width, const float* input_data, - int pad_width, int depth_multiplier, int filter_width, - const float* filter_data, int out_x_buffer_start, int out_x_buffer_end, - int output_depth, float* acc_buffer) { + int stride, int dilation_factor, int input_depth, int input_width, + const float* input_data, int pad_width, int depth_multiplier, + int filter_width, const float* filter_data, int out_x_buffer_start, + int out_x_buffer_end, int output_depth, float* acc_buffer) { gemmlowp::ScopedProfilingLabel label("DepthwiseConvAccumRowGeneric (slow)"); #ifdef TFLITE_PREVENT_SLOW_GENERIC_DEPTHWISECONV_FALLBACK #ifndef ALLOW_SLOW_GENERIC_DEPTHWISECONV_FALLBACK @@ -860,6 +861,7 @@ inline void FloatDepthwiseConvAccumRowGeneric( << "* stride = " << stride << "\n" << "* input_depth = " << input_depth << "\n" << "* depth_multiplier = " << depth_multiplier << "\n" + << "* dilation_factor = " << dilation_factor << "\n" << "*\n" << "* Please do not hesitate to contact benoitjacob@ with this\n" << "* information.\n" @@ -869,14 +871,17 @@ inline void FloatDepthwiseConvAccumRowGeneric( const float* filter_base_ptr = filter_data; for (int filter_x = 0; filter_x < filter_width; ++filter_x) { const int out_x_loop_start = std::max( - out_x_buffer_start, (pad_width - filter_x + stride - 1) / stride); - const int out_x_loop_end = - std::min(out_x_buffer_end, - (pad_width + input_width - filter_x + stride - 1) / stride); + out_x_buffer_start, + (pad_width - dilation_factor * filter_x + stride - 1) / stride); + const int out_x_loop_end = std::min( + out_x_buffer_end, + (pad_width + input_width - dilation_factor * filter_x + stride - 1) / + stride); float* acc_buffer_ptr = acc_buffer + (out_x_loop_start - 
out_x_buffer_start) * output_depth; - const int in_x_origin = (out_x_loop_start * stride) - pad_width + filter_x; + const int in_x_origin = + (out_x_loop_start * stride) - pad_width + dilation_factor * filter_x; const float* input_ptr = input_data + in_x_origin * input_depth; const int input_ptr_increment = (stride - 1) * input_depth; for (int out_x = out_x_loop_start; out_x < out_x_loop_end; out_x++) { @@ -907,25 +912,37 @@ inline void DepthwiseConvInitAccBuffer(int num_output_pixels, int output_depth, } } -inline void DepthwiseConv(const float* input_data, const Dims<4>& input_dims, - const float* filter_data, const Dims<4>& filter_dims, - const float* bias_data, const Dims<4>& bias_dims, - int stride_width, int stride_height, int pad_width, - int pad_height, int depth_multiplier, - float output_activation_min, - float output_activation_max, float* output_data, - const Dims<4>& output_dims) { +inline void DepthwiseConv( + const DepthwiseParams& params, const RuntimeShape& input_shape, + const float* input_data, const RuntimeShape& filter_shape, + const float* filter_data, const RuntimeShape& bias_shape, + const float* bias_data, const RuntimeShape& output_shape, + float* output_data) { gemmlowp::ScopedProfilingLabel label("DepthwiseConv"); - const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); - const int output_depth = MatchingArraySize(filter_dims, 0, output_dims, 0); - const int input_height = ArraySize(input_dims, 2); - const int input_width = ArraySize(input_dims, 1); - const int input_depth = ArraySize(input_dims, 0); - const int filter_height = ArraySize(filter_dims, 2); - const int filter_width = ArraySize(filter_dims, 1); - const int output_height = ArraySize(output_dims, 2); - const int output_width = ArraySize(output_dims, 1); - TFLITE_DCHECK(output_depth == input_depth * depth_multiplier); + const int stride_width = params.stride_width; + const int stride_height = params.stride_height; + const int pad_width = params.padding_values.width; + const int pad_height = params.padding_values.height; + const int depth_multiplier = params.depth_multiplier; + const float output_activation_min = params.float_activation_min; + const float output_activation_max = params.float_activation_max; + const int dilation_width_factor = params.dilation_width_factor; + const int dilation_height_factor = params.dilation_height_factor; + TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4); + TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4); + TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4); + + const int batches = MatchingDim(input_shape, 0, output_shape, 0); + const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3); + const int input_height = input_shape.Dims(1); + const int input_width = input_shape.Dims(2); + const int input_depth = input_shape.Dims(3); + const int filter_height = filter_shape.Dims(1); + const int filter_width = filter_shape.Dims(2); + const int output_height = output_shape.Dims(1); + const int output_width = output_shape.Dims(2); + TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier); + TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth); static const int kAccBufferMaxSize = 2048; float acc_buffer[kAccBufferMaxSize]; @@ -946,7 +963,8 @@ inline void DepthwiseConv(const float* input_data, const Dims<4>& input_dims, FIXED_DEPTH_MULTIPLIER) \ if (!row_accum_func && (stride_width == 1 || ALLOW_STRIDED) && \ (input_depth == FIXED_INPUT_DEPTH || FIXED_INPUT_DEPTH == 0) && \ - depth_multiplier == FIXED_DEPTH_MULTIPLIER) { \ + 
depth_multiplier == FIXED_DEPTH_MULTIPLIER && \ + dilation_height_factor == 1 && dilation_width_factor == 1) { \ row_accum_func = \ FloatDepthwiseConvAccumRow<ALLOW_STRIDED, FIXED_INPUT_DEPTH, \ FIXED_DEPTH_MULTIPLIER>; \ @@ -990,14 +1008,22 @@ inline void DepthwiseConv(const float* input_data, const Dims<4>& input_dims, row_accum_func = FloatDepthwiseConvAccumRowGeneric; } + const int input_height_stride = input_shape.Dims(3) * input_shape.Dims(2); + const int input_batch_stride = input_height_stride * input_shape.Dims(1); + const int filter_height_stride = filter_shape.Dims(3) * filter_shape.Dims(2); + // Now that we have determined row_accum_func, we can start work. float* output_ptr = output_data; for (int b = 0; b < batches; ++b) { for (int out_y = 0; out_y < output_height; ++out_y) { const int in_y_origin = (out_y * stride_height) - pad_height; - const int filter_y_start = std::max(0, -in_y_origin); + const int filter_y_start = + std::max(0, (-in_y_origin + dilation_height_factor - 1) / + dilation_height_factor); const int filter_y_end = - std::min(filter_height, input_height - in_y_origin); + std::min(filter_height, + (input_height - in_y_origin + dilation_height_factor - 1) / + dilation_height_factor); for (int out_x_buffer_start = 0; out_x_buffer_start < output_width; out_x_buffer_start += kOutputPixelsInAccBuffer) { const int out_x_buffer_end = std::min( @@ -1013,14 +1039,13 @@ inline void DepthwiseConv(const float* input_data, const Dims<4>& input_dims, // Accumulation loop. Most of the time should be spent in here. for (int filter_y = filter_y_start; filter_y < filter_y_end; ++filter_y) { - const int in_y = in_y_origin + filter_y; - row_accum_func(stride_width, input_depth, input_width, - input_data + in_y * input_dims.strides[2] + - b * input_dims.strides[3], - pad_width, depth_multiplier, filter_width, - filter_data + filter_y * filter_dims.strides[2], - out_x_buffer_start, out_x_buffer_end, output_depth, - acc_buffer); + const int in_y = in_y_origin + dilation_height_factor * filter_y; + row_accum_func( + stride_width, dilation_width_factor, input_depth, input_width, + input_data + in_y * input_height_stride + b * input_batch_stride, + pad_width, depth_multiplier, filter_width, + filter_data + filter_y * filter_height_stride, out_x_buffer_start, + out_x_buffer_end, output_depth, acc_buffer); } // Finished accumulating. Now store to destination. 
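The dilated loop bounds introduced above follow from requiring that every filter tap read a valid input row: tap filter_y touches in_y = in_y_origin + dilation_height_factor * filter_y, and demanding 0 <= in_y < input_height gives the two ceiling divisions. A small standalone sketch of that derivation (helper name illustrative, same convention as the kernel):

#include <algorithm>

// Returns the half-open range [*filter_y_start, *filter_y_end) of vertical
// filter taps whose dilated input row lies inside the image.
inline void DilatedFilterRange(int in_y_origin, int input_height,
                               int filter_height, int dilation,
                               int* filter_y_start, int* filter_y_end) {
  // Smallest tap with in_y_origin + dilation * filter_y >= 0.
  *filter_y_start = std::max(0, (-in_y_origin + dilation - 1) / dilation);
  // One past the largest tap with in_y_origin + dilation * filter_y < input_height.
  *filter_y_end = std::min(
      filter_height, (input_height - in_y_origin + dilation - 1) / dilation);
}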
const int num_output_values = output_depth * num_output_pixels; @@ -1067,34 +1092,6 @@ inline void DepthwiseConv(const float* input_data, const Dims<4>& input_dims, } } -// legacy, for compatibility with old checked-in code -template <FusedActivationFunctionType Ac> -void DepthwiseConv(const float* input_data, const Dims<4>& input_dims, - const float* filter_data, const Dims<4>& filter_dims, - const float* bias_data, const Dims<4>& bias_dims, - int stride_width, int stride_height, int pad_width, - int pad_height, int depth_multiplier, float* output_data, - const Dims<4>& output_dims) { - float output_activation_min, output_activation_max; - GetActivationMinMax(Ac, &output_activation_min, &output_activation_max); - DepthwiseConv(input_data, input_dims, filter_data, filter_dims, bias_data, - bias_dims, stride_width, stride_height, pad_width, pad_height, - depth_multiplier, output_activation_min, output_activation_max, - output_data, output_dims); -} - -// legacy, for compatibility with old checked-in code -template <FusedActivationFunctionType Ac> -void DepthwiseConv(const float* input_data, const Dims<4>& input_dims, - const float* filter_data, const Dims<4>& filter_dims, - const float* bias_data, const Dims<4>& bias_dims, int stride, - int pad_width, int pad_height, int depth_multiplier, - float* output_data, const Dims<4>& output_dims) { - DepthwiseConv<Ac>(input_data, input_dims, filter_data, filter_dims, bias_data, - bias_dims, stride, stride, pad_width, pad_height, - depth_multiplier, output_data, output_dims); -} - } // namespace optimized_ops } // namespace tflite diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8.h b/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8.h index dd6932ffe7..803eff292a 100644 --- a/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8.h +++ b/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8.h @@ -1466,11 +1466,14 @@ struct QuantizedDepthwiseConvKernel<false, 12, 1> { // Accumulates the effect of one row of the filter, on a segment of one row // of the output, accessing the corresponding one row of the input. template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier> -void QuantizedDepthwiseConvAccumRow( - int stride, int input_depth, int input_width, const uint8* input_data, - int16 input_offset, int pad_width, int depth_multiplier, int filter_width, - const uint8* filter_data, int16 filter_offset, int out_x_buffer_start, - int out_x_buffer_end, int output_depth, int32* acc_buffer) { +void QuantizedDepthwiseConvAccumRow(int stride, int dilation_factor, + int input_depth, int input_width, + const uint8* input_data, int16 input_offset, + int pad_width, int depth_multiplier, + int filter_width, const uint8* filter_data, + int16 filter_offset, int out_x_buffer_start, + int out_x_buffer_end, int output_depth, + int32* acc_buffer) { #ifdef GEMMLOWP_PROFILING gemmlowp::ScopedProfilingLabel label(__PRETTY_FUNCTION__); #endif @@ -1537,10 +1540,11 @@ void QuantizedDepthwiseConvAccumRow( // generic fallback of DepthwiseConvAccumRow, portable, non-templatized. 
inline void QuantizedDepthwiseConvAccumRowGeneric( - int stride, int input_depth, int input_width, const uint8* input_data, - int16 input_offset, int pad_width, int depth_multiplier, int filter_width, - const uint8* filter_data, int16 filter_offset, int out_x_buffer_start, - int out_x_buffer_end, int output_depth, int32* acc_buffer) { + int stride, int dilation_factor, int input_depth, int input_width, + const uint8* input_data, int16 input_offset, int pad_width, + int depth_multiplier, int filter_width, const uint8* filter_data, + int16 filter_offset, int out_x_buffer_start, int out_x_buffer_end, + int output_depth, int32* acc_buffer) { gemmlowp::ScopedProfilingLabel label("DepthwiseConvAccumRowGeneric (slow)"); #ifdef TFLITE_PREVENT_SLOW_GENERIC_DEPTHWISECONV_FALLBACK #ifndef ALLOW_SLOW_GENERIC_DEPTHWISECONV_FALLBACK @@ -1562,6 +1566,7 @@ inline void QuantizedDepthwiseConvAccumRowGeneric( << "* stride = " << stride << "\n" << "* input_depth = " << input_depth << "\n" << "* depth_multiplier = " << depth_multiplier << "\n" + << "* dilation_factor = " << dilation_factor << "\n" << "*\n" << "* Please do not hesitate to contact benoitjacob@ with this\n" << "* information.\n" @@ -1571,14 +1576,17 @@ inline void QuantizedDepthwiseConvAccumRowGeneric( const uint8* filter_base_ptr = filter_data; for (int filter_x = 0; filter_x < filter_width; ++filter_x) { const int out_x_loop_start = std::max( - out_x_buffer_start, (pad_width - filter_x + stride - 1) / stride); - const int out_x_loop_end = - std::min(out_x_buffer_end, - (pad_width + input_width - filter_x + stride - 1) / stride); + out_x_buffer_start, + (pad_width - dilation_factor * filter_x + stride - 1) / stride); + const int out_x_loop_end = std::min( + out_x_buffer_end, + (pad_width + input_width - dilation_factor * filter_x + stride - 1) / + stride); int32* acc_buffer_ptr = acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth; - const int in_x_origin = (out_x_loop_start * stride) - pad_width + filter_x; + const int in_x_origin = + (out_x_loop_start * stride) - pad_width + dilation_factor * filter_x; const uint8* input_ptr = input_data + in_x_origin * input_depth; const int input_ptr_increment = (stride - 1) * input_depth; for (int out_x = out_x_loop_start; out_x < out_x_loop_end; out_x++) { @@ -1669,42 +1677,61 @@ inline void DepthwiseConvInitAccBuffer(int num_output_pixels, int output_depth, } } -inline void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims, - int32 input_offset, const uint8* filter_data, - const Dims<4>& filter_dims, int32 filter_offset, - const int32* bias_data, const Dims<4>& bias_dims, - int stride_width, int stride_height, int pad_width, - int pad_height, int depth_multiplier, - int32 output_offset, int32 output_multiplier, - int output_shift, int32 output_activation_min, - int32 output_activation_max, uint8* output_data, - const Dims<4>& output_dims) { +inline void DepthwiseConv( + const DepthwiseParams& params, const RuntimeShape& input_shape, + const uint8* input_data, const RuntimeShape& filter_shape, + const uint8* filter_data, const RuntimeShape& bias_shape, + const int32* bias_data, const RuntimeShape& output_shape, + uint8* output_data) { gemmlowp::ScopedProfilingLabel label("DepthwiseConv/8bit"); + const int stride_width = params.stride_width; + const int stride_height = params.stride_height; + const int pad_width = params.padding_values.width; + const int pad_height = params.padding_values.height; + const int depth_multiplier = params.depth_multiplier; + const int32 
output_activation_min = params.quantized_activation_min; + const int32 output_activation_max = params.quantized_activation_max; + const int32 input_offset = params.input_offset; + const int32 filter_offset = params.weights_offset; + const int32 output_offset = params.output_offset; + const int32 output_multiplier = params.output_multiplier; + const int output_shift = params.output_shift; + const int dilation_width_factor = params.dilation_width_factor; + const int dilation_height_factor = params.dilation_height_factor; + TFLITE_DCHECK_GE(dilation_width_factor, 1); + TFLITE_DCHECK_GE(dilation_height_factor, 1); + TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4); + TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4); + TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4); TFLITE_DCHECK_LE(output_activation_min, output_activation_max); + const int batches = MatchingDim(input_shape, 0, output_shape, 0); + const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3); + const int input_height = input_shape.Dims(1); + const int input_width = input_shape.Dims(2); + const int input_depth = input_shape.Dims(3); + const int filter_height = filter_shape.Dims(1); + const int filter_width = filter_shape.Dims(2); + const int output_height = output_shape.Dims(1); + const int output_width = output_shape.Dims(2); +#ifdef USE_NEON + const bool shift_left = (output_shift > 0); + const int32 multiplier_power_of_two = shift_left ? (1 << output_shift) : 1; +#endif + TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier); + TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth); - const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); - const int output_depth = MatchingArraySize(filter_dims, 0, output_dims, 0); - const int input_height = ArraySize(input_dims, 2); - const int input_width = ArraySize(input_dims, 1); - const int input_depth = ArraySize(input_dims, 0); - const int filter_height = ArraySize(filter_dims, 2); - const int filter_width = ArraySize(filter_dims, 1); - const int output_height = ArraySize(output_dims, 2); - const int output_width = ArraySize(output_dims, 1); - TFLITE_DCHECK(output_depth == input_depth * depth_multiplier); - -#ifdef __aarch64__ +// Enable for arm64 except for the Nvidia Linux 4 Tegra (L4T) running on +// Jetson TX-2. This compiler does not support the offsetof() macro. +#if defined(__aarch64__) && !defined(GOOGLE_L4T) // Call kernel optimized for depthwise convolutions using 3x3 filters if // parameters are supported. 
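For callers migrating off the Dims<4> overloads (removed further down), a usage sketch of the new params-based uint8 entry point; every field written here is one the kernel above reads, but the numeric quantization values are placeholders rather than values taken from this change:

#include <cstdint>

#include "tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8.h"

// Illustrative wrapper: fills DepthwiseParams and dispatches to the kernel.
void RunDepthwiseConvSketch(const tflite::RuntimeShape& input_shape,
                            const uint8_t* input_data,
                            const tflite::RuntimeShape& filter_shape,
                            const uint8_t* filter_data,
                            const tflite::RuntimeShape& bias_shape,
                            const int32_t* bias_data,
                            const tflite::RuntimeShape& output_shape,
                            uint8_t* output_data) {
  tflite::DepthwiseParams params;
  params.stride_width = 1;
  params.stride_height = 1;
  params.dilation_width_factor = 2;  // dilation is now honored (placeholder value)
  params.dilation_height_factor = 2;
  params.padding_values.width = 1;   // placeholder padding
  params.padding_values.height = 1;
  params.depth_multiplier = 1;
  params.input_offset = -128;        // placeholder quantization parameters
  params.weights_offset = -128;
  params.output_offset = 128;
  params.output_multiplier = 1 << 30;
  params.output_shift = -3;          // negative means a right shift (see below)
  params.quantized_activation_min = 0;
  params.quantized_activation_max = 255;
  tflite::optimized_ops::DepthwiseConv(params, input_shape, input_data,
                                       filter_shape, filter_data, bias_shape,
                                       bias_data, output_shape, output_data);
}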
- if (Fast3x3FilterKernelSupported(input_dims, filter_dims, stride_width, - stride_height, pad_width, pad_height, - depth_multiplier, output_dims)) { - DepthwiseConv3x3Filter(input_data, input_dims, input_offset, filter_data, - filter_dims, filter_offset, bias_data, bias_dims, - stride_width, stride_height, pad_width, pad_height, - depth_multiplier, output_offset, output_multiplier, - output_shift, output_activation_min, - output_activation_max, output_data, output_dims); + if (Fast3x3FilterKernelSupported( + input_shape, filter_shape, stride_width, stride_height, + dilation_width_factor, dilation_height_factor, pad_width, pad_height, + depth_multiplier, output_shape, output_shift)) { + DepthwiseConv3x3Filter(params, input_shape, input_data, filter_shape, + filter_data, bias_shape, bias_data, output_shape, + output_data); return; } #endif @@ -1728,7 +1755,8 @@ inline void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims, FIXED_DEPTH_MULTIPLIER) \ if (!row_accum_func && (stride_width == 1 || ALLOW_STRIDED) && \ (input_depth == FIXED_INPUT_DEPTH || FIXED_INPUT_DEPTH == 0) && \ - depth_multiplier == FIXED_DEPTH_MULTIPLIER) { \ + depth_multiplier == FIXED_DEPTH_MULTIPLIER && \ + dilation_width_factor == 1 && dilation_height_factor == 1) { \ row_accum_func = \ QuantizedDepthwiseConvAccumRow<ALLOW_STRIDED, FIXED_INPUT_DEPTH, \ FIXED_DEPTH_MULTIPLIER>; \ @@ -1779,14 +1807,22 @@ inline void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims, #undef TFMINI_USE_DEPTHWISECONV_KERNEL + const int input_height_stride = input_shape.Dims(3) * input_shape.Dims(2); + const int input_batch_stride = input_height_stride * input_shape.Dims(1); + const int filter_height_stride = filter_shape.Dims(3) * filter_shape.Dims(2); + // Now that we have determined row_accum_func, we can start work. uint8* output_ptr = output_data; for (int b = 0; b < batches; ++b) { for (int out_y = 0; out_y < output_height; ++out_y) { const int in_y_origin = (out_y * stride_height) - pad_height; - const int filter_y_start = std::max(0, -in_y_origin); + const int filter_y_start = + std::max(0, (-in_y_origin + dilation_height_factor - 1) / + dilation_height_factor); const int filter_y_end = - std::min(filter_height, input_height - in_y_origin); + std::min(filter_height, + (input_height - in_y_origin + dilation_height_factor - 1) / + dilation_height_factor); for (int out_x_buffer_start = 0; out_x_buffer_start < output_width; out_x_buffer_start += kOutputPixelsInAccBuffer) { const int out_x_buffer_end = std::min( @@ -1802,13 +1838,12 @@ inline void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims, // Accumulation loop. Most of the time should be spent in here. for (int filter_y = filter_y_start; filter_y < filter_y_end; ++filter_y) { - const int in_y = in_y_origin + filter_y; + const int in_y = in_y_origin + dilation_height_factor * filter_y; row_accum_func( - stride_width, input_depth, input_width, - input_data + in_y * input_dims.strides[2] + - b * input_dims.strides[3], + stride_width, dilation_width_factor, input_depth, input_width, + input_data + in_y * input_height_stride + b * input_batch_stride, input_offset, pad_width, depth_multiplier, filter_width, - filter_data + filter_y * filter_dims.strides[2], filter_offset, + filter_data + filter_y * filter_height_stride, filter_offset, out_x_buffer_start, out_x_buffer_end, output_depth, acc_buffer); } // Finished accumulating int32 values. 
Now need to convert them to @@ -1833,12 +1868,20 @@ inline void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims, acc[j] = vld1q_s32(acc_buffer + i + 4 * j); } - // Fixed-point multiplication. - for (int j = 0; j < 4; j++) { - acc[j] = vqrdmulhq_n_s32(acc[j], output_multiplier); - } - for (int j = 0; j < 4; j++) { - acc[j] = RoundingDivideByPOT(acc[j], output_shift); + if (!shift_left) { + // Fixed-point multiplication. + for (int j = 0; j < 4; j++) { + acc[j] = vqrdmulhq_n_s32(acc[j], output_multiplier); + } + for (int j = 0; j < 4; j++) { + acc[j] = RoundingDivideByPOT(acc[j], -output_shift); + } + } else { + // Fixed-point multiplication. + for (int j = 0; j < 4; j++) { + acc[j] = vmulq_n_s32(acc[j], multiplier_power_of_two); + acc[j] = vqrdmulhq_n_s32(acc[j], output_multiplier); + } } // Add the output offset. for (int j = 0; j < 4; j++) { @@ -1870,12 +1913,21 @@ inline void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims, for (; i <= num_output_values - 8; i += 8) { int32x4_t acc0 = vld1q_s32(acc_buffer + i); int32x4_t acc1 = vld1q_s32(acc_buffer + i + 4); - // Fixed-point multiplication. - acc0 = vqrdmulhq_n_s32(acc0, output_multiplier); - acc1 = vqrdmulhq_n_s32(acc1, output_multiplier); - // Rounding right shift. - acc0 = RoundingDivideByPOT(acc0, output_shift); - acc1 = RoundingDivideByPOT(acc1, output_shift); + if (!shift_left) { + // Fixed-point multiplication. + acc0 = vqrdmulhq_n_s32(acc0, output_multiplier); + acc1 = vqrdmulhq_n_s32(acc1, output_multiplier); + // Rounding right shift. + acc0 = RoundingDivideByPOT(acc0, -output_shift); + acc1 = RoundingDivideByPOT(acc1, -output_shift); + } else { + // Fixed-point multiplication. + acc0 = vmulq_n_s32(acc0, multiplier_power_of_two); + acc0 = vqrdmulhq_n_s32(acc0, output_multiplier); + + acc1 = vmulq_n_s32(acc1, multiplier_power_of_two); + acc1 = vqrdmulhq_n_s32(acc1, output_multiplier); + } // Add the output offset. acc0 = vaddq_s32(acc0, output_offset_vec); acc1 = vaddq_s32(acc1, output_offset_vec); @@ -1899,10 +1951,16 @@ inline void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims, // that will have to go through the very slow scalar code. for (; i <= num_output_values - 4; i += 4) { int32x4_t acc = vld1q_s32(acc_buffer + i); - // Fixed-point multiplication. - acc = vqrdmulhq_n_s32(acc, output_multiplier); - // Rounding right shift. - acc = RoundingDivideByPOT(acc, output_shift); + if (!shift_left) { + // Fixed-point multiplication. + acc = vqrdmulhq_n_s32(acc, output_multiplier); + // Rounding right shift. + acc = RoundingDivideByPOT(acc, -output_shift); + } else { + // Fixed-point multiplication. + acc = vmulq_n_s32(acc, multiplier_power_of_two); + acc = vqrdmulhq_n_s32(acc, output_multiplier); + } // Add the output offset. acc = vaddq_s32(acc, output_offset_vec); // Apply the activation function. @@ -1923,8 +1981,8 @@ inline void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims, // Handle leftover values, one by one. This is very slow. for (; i < num_output_values; i++) { int32 acc = acc_buffer[i]; - acc = MultiplyByQuantizedMultiplierSmallerThanOne( - acc, output_multiplier, output_shift); + acc = MultiplyByQuantizedMultiplier(acc, output_multiplier, + output_shift); acc += output_offset; acc = std::max(acc, output_activation_min); acc = std::min(acc, output_activation_max); @@ -1935,48 +1993,6 @@ inline void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims, } } -// Legacy, for compatibility with old checked-in code. 
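The requantization branches above encode the new sign convention for output_shift: a positive shift is applied as a power-of-two multiply before the fixed-point multiplication, while a negative shift becomes a rounding right shift after it, which is also what the scalar leftover path expects of MultiplyByQuantizedMultiplier. A scalar sketch of that convention, assuming gemmlowp's fixed-point helpers:

#include <cstdint>

#include "fixedpoint/fixedpoint.h"  // gemmlowp

// Scalar equivalent of the two NEON branches above.
inline std::int32_t RequantizeSketch(std::int32_t acc,
                                     std::int32_t output_multiplier,
                                     int output_shift) {
  if (output_shift > 0) {
    // Left shift first, then the saturating rounding doubling high multiply.
    return gemmlowp::SaturatingRoundingDoublingHighMul(
        acc * (1 << output_shift), output_multiplier);
  }
  // Multiply first, then a rounding right shift by -output_shift.
  return gemmlowp::RoundingDivideByPOT(
      gemmlowp::SaturatingRoundingDoublingHighMul(acc, output_multiplier),
      -output_shift);
}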
-template <FusedActivationFunctionType Ac> -void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims, - int32 input_offset, const uint8* filter_data, - const Dims<4>& filter_dims, int32 filter_offset, - const int32* bias_data, const Dims<4>& bias_dims, - int stride_width, int stride_height, int pad_width, - int pad_height, int depth_multiplier, int32 output_offset, - int32 output_multiplier, int output_shift, - int32 output_activation_min, int32 output_activation_max, - uint8* output_data, const Dims<4>& output_dims) { - if (Ac == FusedActivationFunctionType::kNone) { - TFLITE_DCHECK_EQ(output_activation_min, 0); - TFLITE_DCHECK_EQ(output_activation_max, 255); - } - DepthwiseConv(input_data, input_dims, input_offset, filter_data, filter_dims, - filter_offset, bias_data, bias_dims, stride_width, - stride_height, pad_width, pad_height, depth_multiplier, - output_offset, output_multiplier, output_shift, - output_activation_min, output_activation_max, output_data, - output_dims); -} - -// Legacy, for compatibility with old checked-in code. -template <FusedActivationFunctionType Ac> -void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims, - int32 input_offset, const uint8* filter_data, - const Dims<4>& filter_dims, int32 filter_offset, - const int32* bias_data, const Dims<4>& bias_dims, int stride, - int pad_width, int pad_height, int depth_multiplier, - int32 output_offset, int32 output_multiplier, - int output_shift, int32 output_activation_min, - int32 output_activation_max, uint8* output_data, - const Dims<4>& output_dims) { - DepthwiseConv<Ac>(input_data, input_dims, input_offset, filter_data, - filter_dims, filter_offset, bias_data, bias_dims, stride, - stride, pad_width, pad_height, depth_multiplier, - output_offset, output_multiplier, output_shift, - output_activation_min, output_activation_max, output_data, - output_dims); -} - } // namespace optimized_ops } // namespace tflite diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h b/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h index 55e0d5c3aa..4809ddd02a 100644 --- a/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h +++ b/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h @@ -23,3848 +23,2912 @@ limitations under the License. namespace tflite { namespace optimized_ops { -#ifdef __aarch64__ - -inline void preload_l1_keep(const uint8* ptr) { -#ifdef GEMMLOWP_ARM_64 - asm volatile("prfm pldl1keep, [%[ptr]]\n" ::[ptr] "r"(ptr) :); -#else - gemmlowp::Prefetch(ptr); -#endif -} - -// Implementation of quantized DepthwiseConv for 3x3 filters. - -// Below are helper structs to remove the use of arrays. -// There is an llvm bug that causes significant slowdown when using arrays for -// NEON intrinsics vector data types. -// See: https://bugs.llvm.org/show_bug.cgi?id=34945 - -struct Int32x8 { - int32x4_t low, high; -}; - -struct Filter3x3x8 { - int16x8_t f0, f1, f2, f3, f4, f5, f6, f7, f8; -}; - -// Loads 3x3 filter of depth 8 and adds filter offsets. 
-inline Filter3x3x8 Load3x3Filter(const uint8* filter_ptr, int32 filter_offset, - int output_depth) { - Filter3x3x8 filter; - - uint8x8_t temp_u8_0, temp_u8_1, temp_u8_2, temp_u8_3, temp_u8_4, temp_u8_5, - temp_u8_6, temp_u8_7, temp_u8_8; - int16x8_t filter_offset_vec = vdupq_n_s16(filter_offset); - - temp_u8_0 = vld1_u8(filter_ptr + 0 * output_depth); - temp_u8_1 = vld1_u8(filter_ptr + 1 * output_depth); - temp_u8_2 = vld1_u8(filter_ptr + 2 * output_depth); - temp_u8_3 = vld1_u8(filter_ptr + 3 * output_depth); - temp_u8_4 = vld1_u8(filter_ptr + 4 * output_depth); - temp_u8_5 = vld1_u8(filter_ptr + 5 * output_depth); - temp_u8_6 = vld1_u8(filter_ptr + 6 * output_depth); - temp_u8_7 = vld1_u8(filter_ptr + 7 * output_depth); - temp_u8_8 = vld1_u8(filter_ptr + 8 * output_depth); - - filter.f0 = vreinterpretq_s16_u16(vmovl_u8(temp_u8_0)); - filter.f1 = vreinterpretq_s16_u16(vmovl_u8(temp_u8_1)); - filter.f2 = vreinterpretq_s16_u16(vmovl_u8(temp_u8_2)); - filter.f3 = vreinterpretq_s16_u16(vmovl_u8(temp_u8_3)); - filter.f4 = vreinterpretq_s16_u16(vmovl_u8(temp_u8_4)); - filter.f5 = vreinterpretq_s16_u16(vmovl_u8(temp_u8_5)); - filter.f6 = vreinterpretq_s16_u16(vmovl_u8(temp_u8_6)); - filter.f7 = vreinterpretq_s16_u16(vmovl_u8(temp_u8_7)); - filter.f8 = vreinterpretq_s16_u16(vmovl_u8(temp_u8_8)); - - filter.f0 = vaddq_s16(filter.f0, filter_offset_vec); - filter.f1 = vaddq_s16(filter.f1, filter_offset_vec); - filter.f2 = vaddq_s16(filter.f2, filter_offset_vec); - filter.f3 = vaddq_s16(filter.f3, filter_offset_vec); - filter.f4 = vaddq_s16(filter.f4, filter_offset_vec); - filter.f5 = vaddq_s16(filter.f5, filter_offset_vec); - filter.f6 = vaddq_s16(filter.f6, filter_offset_vec); - filter.f7 = vaddq_s16(filter.f7, filter_offset_vec); - filter.f8 = vaddq_s16(filter.f8, filter_offset_vec); - - return filter; -} - -// Applies activation, offset and downquantize on a set of accumulator -// registers that correspond to a 2x2 output of depth 8. -// Stores results to output. -inline void DownquantizeAndStore2x2Output( - Int32x8 acc_0, Int32x8 acc_1, Int32x8 acc_2, Int32x8 acc_3, - int32 output_offset, int32 output_multiplier, int output_shift, - int32 output_activation_min, int32 output_activation_max, uint8* output_ptr, - int output_depth, int output_width) { - using gemmlowp::RoundingDivideByPOT; - const int32x4_t output_offset_vec = vdupq_n_s32(output_offset); - const int32x4_t output_activation_min_vec = - vdupq_n_s32(output_activation_min); - const int32x4_t output_activation_max_vec = - vdupq_n_s32(output_activation_max); - - // Fixed-point multiplication. 
- acc_0.low = vqrdmulhq_n_s32(acc_0.low, output_multiplier); - acc_0.high = vqrdmulhq_n_s32(acc_0.high, output_multiplier); - acc_1.low = vqrdmulhq_n_s32(acc_1.low, output_multiplier); - acc_1.high = vqrdmulhq_n_s32(acc_1.high, output_multiplier); - acc_2.low = vqrdmulhq_n_s32(acc_2.low, output_multiplier); - acc_2.high = vqrdmulhq_n_s32(acc_2.high, output_multiplier); - acc_3.low = vqrdmulhq_n_s32(acc_3.low, output_multiplier); - acc_3.high = vqrdmulhq_n_s32(acc_3.high, output_multiplier); - - acc_0.low = RoundingDivideByPOT(acc_0.low, output_shift); - acc_0.high = RoundingDivideByPOT(acc_0.high, output_shift); - acc_1.low = RoundingDivideByPOT(acc_1.low, output_shift); - acc_1.high = RoundingDivideByPOT(acc_1.high, output_shift); - acc_2.low = RoundingDivideByPOT(acc_2.low, output_shift); - acc_2.high = RoundingDivideByPOT(acc_2.high, output_shift); - acc_3.low = RoundingDivideByPOT(acc_3.low, output_shift); - acc_3.high = RoundingDivideByPOT(acc_3.high, output_shift); - - // Add the output offset. - acc_0.low = vaddq_s32(acc_0.low, output_offset_vec); - acc_0.high = vaddq_s32(acc_0.high, output_offset_vec); - acc_1.low = vaddq_s32(acc_1.low, output_offset_vec); - acc_1.high = vaddq_s32(acc_1.high, output_offset_vec); - acc_2.low = vaddq_s32(acc_2.low, output_offset_vec); - acc_2.high = vaddq_s32(acc_2.high, output_offset_vec); - acc_3.low = vaddq_s32(acc_3.low, output_offset_vec); - acc_3.high = vaddq_s32(acc_3.high, output_offset_vec); - - // Apply the activation function. - acc_0.low = vmaxq_s32(acc_0.low, output_activation_min_vec); - acc_0.high = vmaxq_s32(acc_0.high, output_activation_min_vec); - acc_1.low = vmaxq_s32(acc_1.low, output_activation_min_vec); - acc_1.high = vmaxq_s32(acc_1.high, output_activation_min_vec); - acc_2.low = vmaxq_s32(acc_2.low, output_activation_min_vec); - acc_2.high = vmaxq_s32(acc_2.high, output_activation_min_vec); - acc_3.low = vmaxq_s32(acc_3.low, output_activation_min_vec); - acc_3.high = vmaxq_s32(acc_3.high, output_activation_min_vec); - - acc_0.low = vminq_s32(acc_0.low, output_activation_max_vec); - acc_0.high = vminq_s32(acc_0.high, output_activation_max_vec); - acc_1.low = vminq_s32(acc_1.low, output_activation_max_vec); - acc_1.high = vminq_s32(acc_1.high, output_activation_max_vec); - acc_2.low = vminq_s32(acc_2.low, output_activation_max_vec); - acc_2.high = vminq_s32(acc_2.high, output_activation_max_vec); - acc_3.low = vminq_s32(acc_3.low, output_activation_max_vec); - acc_3.high = vminq_s32(acc_3.high, output_activation_max_vec); - - // Saturating cast to uint8 and store to destination. 
- int16x4_t acc_0_low_s16 = vqmovn_s32(acc_0.low); - int16x4_t acc_0_high_s16 = vqmovn_s32(acc_0.high); - int16x4_t acc_1_low_s16 = vqmovn_s32(acc_1.low); - int16x4_t acc_1_high_s16 = vqmovn_s32(acc_1.high); - int16x4_t acc_2_low_s16 = vqmovn_s32(acc_2.low); - int16x4_t acc_2_high_s16 = vqmovn_s32(acc_2.high); - int16x4_t acc_3_low_s16 = vqmovn_s32(acc_3.low); - int16x4_t acc_3_high_s16 = vqmovn_s32(acc_3.high); - - int16x8_t res_0_s16 = vcombine_s16(acc_0_low_s16, acc_0_high_s16); - int16x8_t res_1_s16 = vcombine_s16(acc_1_low_s16, acc_1_high_s16); - int16x8_t res_2_s16 = vcombine_s16(acc_2_low_s16, acc_2_high_s16); - int16x8_t res_3_s16 = vcombine_s16(acc_3_low_s16, acc_3_high_s16); - - uint8x8_t res_0_u8 = vqmovun_s16(res_0_s16); - uint8x8_t res_1_u8 = vqmovun_s16(res_1_s16); - uint8x8_t res_2_u8 = vqmovun_s16(res_2_s16); - uint8x8_t res_3_u8 = vqmovun_s16(res_3_s16); - - vst1_u8(output_ptr, res_0_u8); - vst1_u8(output_ptr + output_depth, res_1_u8); - vst1_u8(output_ptr + output_depth * output_width, res_2_u8); - vst1_u8(output_ptr + output_depth * output_width + output_depth, res_3_u8); -} - -inline void DownquantizeAndStore(Int32x8 acc, int32 output_offset, - int32 output_multiplier, int output_shift, - int32 output_activation_min, - int32 output_activation_max, - uint8* output_ptr) { - using gemmlowp::RoundingDivideByPOT; - const int32x4_t output_offset_vec = vdupq_n_s32(output_offset); - const int32x4_t output_activation_min_vec = - vdupq_n_s32(output_activation_min); - const int32x4_t output_activation_max_vec = - vdupq_n_s32(output_activation_max); - - acc.low = vqrdmulhq_n_s32(acc.low, output_multiplier); - acc.high = vqrdmulhq_n_s32(acc.high, output_multiplier); - - acc.low = RoundingDivideByPOT(acc.low, output_shift); - acc.high = RoundingDivideByPOT(acc.high, output_shift); - - acc.low = vaddq_s32(acc.low, output_offset_vec); - acc.high = vaddq_s32(acc.high, output_offset_vec); - - acc.low = vmaxq_s32(acc.low, output_activation_min_vec); - acc.high = vmaxq_s32(acc.high, output_activation_min_vec); - - acc.low = vminq_s32(acc.low, output_activation_max_vec); - acc.high = vminq_s32(acc.high, output_activation_max_vec); - - int16x4_t acc_low_s16 = vqmovn_s32(acc.low); - int16x4_t acc_high_s16 = vqmovn_s32(acc.high); - - int16x8_t res_s16 = vcombine_s16(acc_low_s16, acc_high_s16); - uint8x8_t res_u8 = vqmovun_s16(res_s16); - vst1_u8(output_ptr, res_u8); -} +// Enable for arm64 except for the Nvidia Linux 4 Tegra (L4T) running on +// Jetson TX-2. This compiler does not support the offsetof() macro. +#if defined(__aarch64__) && !defined(GOOGLE_L4T) +#include <stddef.h> +// clang-format gets confused with this file and ends up formatting lines to +// be larger than 80 characters. Turn off here and back on at the end of the +// file. -inline void DownquantizeAndStore2Output( - Int32x8 acc_0, Int32x8 acc_1, int32 output_offset, int32 output_multiplier, - int output_shift, int32 output_activation_min, int32 output_activation_max, - uint8* output_ptr, int output_ptr_offset) { - { - using gemmlowp::RoundingDivideByPOT; - const int32x4_t output_offset_vec = vdupq_n_s32(output_offset); - const int32x4_t output_activation_min_vec = - vdupq_n_s32(output_activation_min); - const int32x4_t output_activation_max_vec = - vdupq_n_s32(output_activation_max); - - // Fixed-point multiplication. 
- acc_0.low = vqrdmulhq_n_s32(acc_0.low, output_multiplier); - acc_0.high = vqrdmulhq_n_s32(acc_0.high, output_multiplier); - acc_1.low = vqrdmulhq_n_s32(acc_1.low, output_multiplier); - acc_1.high = vqrdmulhq_n_s32(acc_1.high, output_multiplier); - - acc_0.low = RoundingDivideByPOT(acc_0.low, output_shift); - acc_0.high = RoundingDivideByPOT(acc_0.high, output_shift); - acc_1.low = RoundingDivideByPOT(acc_1.low, output_shift); - acc_1.high = RoundingDivideByPOT(acc_1.high, output_shift); - - // Add the output offset. - acc_0.low = vaddq_s32(acc_0.low, output_offset_vec); - acc_0.high = vaddq_s32(acc_0.high, output_offset_vec); - acc_1.low = vaddq_s32(acc_1.low, output_offset_vec); - acc_1.high = vaddq_s32(acc_1.high, output_offset_vec); - - // Apply the activation function. - acc_0.low = vmaxq_s32(acc_0.low, output_activation_min_vec); - acc_0.high = vmaxq_s32(acc_0.high, output_activation_min_vec); - acc_1.low = vmaxq_s32(acc_1.low, output_activation_min_vec); - acc_1.high = vmaxq_s32(acc_1.high, output_activation_min_vec); - - acc_0.low = vminq_s32(acc_0.low, output_activation_max_vec); - acc_0.high = vminq_s32(acc_0.high, output_activation_max_vec); - acc_1.low = vminq_s32(acc_1.low, output_activation_max_vec); - acc_1.high = vminq_s32(acc_1.high, output_activation_max_vec); - } - - // Saturating cast to uint8 and store to destination. - int16x8_t res_0_s16; - { - int16x4_t acc_0_low_s16 = vqmovn_s32(acc_0.low); - int16x4_t acc_0_high_s16 = vqmovn_s32(acc_0.high); - res_0_s16 = vcombine_s16(acc_0_low_s16, acc_0_high_s16); - } - - int16x8_t res_1_s16; - { - int16x4_t acc_1_low_s16 = vqmovn_s32(acc_1.low); - int16x4_t acc_1_high_s16 = vqmovn_s32(acc_1.high); - res_1_s16 = vcombine_s16(acc_1_low_s16, acc_1_high_s16); - } - - uint8x8_t res_0_u8 = vqmovun_s16(res_0_s16); - uint8x8_t res_1_u8 = vqmovun_s16(res_1_s16); - vst1_u8(output_ptr, res_0_u8); - vst1_u8(output_ptr + output_ptr_offset, res_1_u8); -} - -// Performs multiply accumulate on 3 inputs of depth 8. -inline Int32x8 MultiplyAccumulateRow(Int32x8 accum, int16x8_t f0, int16x8_t f1, - int16x8_t f2, int16x8_t i0, int16x8_t i1, - int16x8_t i2) { - accum.low = vmlal_s16(accum.low, vget_low_s16(f0), vget_low_s16(i0)); - accum.high = vmlal_s16(accum.high, vget_high_s16(f0), vget_high_s16(i0)); - accum.low = vmlal_s16(accum.low, vget_low_s16(f1), vget_low_s16(i1)); - accum.high = vmlal_s16(accum.high, vget_high_s16(f1), vget_high_s16(i1)); - accum.low = vmlal_s16(accum.low, vget_low_s16(f2), vget_low_s16(i2)); - accum.high = vmlal_s16(accum.high, vget_high_s16(f2), vget_high_s16(i2)); - return accum; -} - -// Performs multiply accumulate on 3 inputs of depth 8. 
-inline Int32x8 MultiplyAccumulate3x3Filter(const Filter3x3x8& f, int16x8_t i0, - int16x8_t i1, int16x8_t i2, - int16x8_t i3, int16x8_t i4, - int16x8_t i5, int16x8_t i6, - int16x8_t i7, int16x8_t i8, - Int32x8 accum) { - accum.low = vmlal_s16(accum.low, vget_low_s16(f.f0), vget_low_s16(i0)); - accum.high = vmlal_s16(accum.high, vget_high_s16(f.f0), vget_high_s16(i0)); - accum.low = vmlal_s16(accum.low, vget_low_s16(f.f1), vget_low_s16(i1)); - accum.high = vmlal_s16(accum.high, vget_high_s16(f.f1), vget_high_s16(i1)); - accum.low = vmlal_s16(accum.low, vget_low_s16(f.f2), vget_low_s16(i2)); - accum.high = vmlal_s16(accum.high, vget_high_s16(f.f2), vget_high_s16(i2)); - accum.low = vmlal_s16(accum.low, vget_low_s16(f.f3), vget_low_s16(i3)); - accum.high = vmlal_s16(accum.high, vget_high_s16(f.f3), vget_high_s16(i3)); - accum.low = vmlal_s16(accum.low, vget_low_s16(f.f4), vget_low_s16(i4)); - accum.high = vmlal_s16(accum.high, vget_high_s16(f.f4), vget_high_s16(i4)); - accum.low = vmlal_s16(accum.low, vget_low_s16(f.f5), vget_low_s16(i5)); - accum.high = vmlal_s16(accum.high, vget_high_s16(f.f5), vget_high_s16(i5)); - accum.low = vmlal_s16(accum.low, vget_low_s16(f.f6), vget_low_s16(i6)); - accum.high = vmlal_s16(accum.high, vget_high_s16(f.f6), vget_high_s16(i6)); - accum.low = vmlal_s16(accum.low, vget_low_s16(f.f7), vget_low_s16(i7)); - accum.high = vmlal_s16(accum.high, vget_high_s16(f.f7), vget_high_s16(i7)); - accum.low = vmlal_s16(accum.low, vget_low_s16(f.f8), vget_low_s16(i8)); - accum.high = vmlal_s16(accum.high, vget_high_s16(f.f8), vget_high_s16(i8)); - return accum; -} - -inline void DotProductAndStore(const Filter3x3x8& filter, int16x8_t i0, - int16x8_t i1, int16x8_t i2, int16x8_t i3, - int16x8_t i4, int16x8_t i5, int16x8_t i6, - int16x8_t i7, int16x8_t i8, - const int32* bias_ptr, int32 output_offset, - int32 output_multiplier, int output_shift, - int32 output_activation_min, - int32 output_activation_max, uint8* output_ptr) { - Int32x8 acc; - acc.low = vld1q_s32(bias_ptr); - acc.high = vld1q_s32(bias_ptr + 4); - - acc = MultiplyAccumulate3x3Filter(filter, i0, i1, i2, i3, i4, i5, i6, i7, i8, - acc); - - DownquantizeAndStore(acc, output_offset, output_multiplier, output_shift, - output_activation_min, output_activation_max, - output_ptr); -} - -// Performs multiply-accumulate on a 3x4 input for 2 horizontal outputs. -inline void DotProductAndStore2xStride1( - const Filter3x3x8& filter, int16x8_t i0, int16x8_t i1, int16x8_t i2, - int16x8_t i3, int16x8_t i4, int16x8_t i5, int16x8_t i6, int16x8_t i7, - int16x8_t i8, int16x8_t i9, int16x8_t i10, int16x8_t i11, - const int32* bias_ptr, int32 output_offset, int32 output_multiplier, - int output_shift, int32 output_activation_min, int32 output_activation_max, - uint8* output_ptr, int output_ptr_offset) { - Int32x8 acc_0, acc_1; - acc_0.low = vld1q_s32(bias_ptr); - acc_1.low = vld1q_s32(bias_ptr); - acc_0.high = vld1q_s32(bias_ptr + 4); - acc_1.high = vld1q_s32(bias_ptr + 4); - - acc_0 = MultiplyAccumulate3x3Filter(filter, i0, i1, i2, i4, i5, i6, i8, i9, - i10, acc_0); - acc_1 = MultiplyAccumulate3x3Filter(filter, i1, i2, i3, i5, i6, i7, i9, i10, - i11, acc_1); - DownquantizeAndStore2Output(acc_0, acc_1, output_offset, output_multiplier, - output_shift, output_activation_min, - output_activation_max, output_ptr, - output_ptr_offset); -} - -// Performs multiply-accumulate on a 4x3 input for 2 vertical outputs. 
-inline void DotProductAndStore2yStride1( - const Filter3x3x8& filter, int16x8_t i0, int16x8_t i1, int16x8_t i2, - int16x8_t i3, int16x8_t i4, int16x8_t i5, int16x8_t i6, int16x8_t i7, - int16x8_t i8, int16x8_t i9, int16x8_t i10, int16x8_t i11, - const int32* bias_ptr, int32 output_offset, int32 output_multiplier, - int output_shift, int32 output_activation_min, int32 output_activation_max, - uint8* output_ptr, int output_ptr_offset) { - Int32x8 acc_0, acc_1; - acc_0.low = vld1q_s32(bias_ptr); - acc_1.low = vld1q_s32(bias_ptr); - acc_0.high = vld1q_s32(bias_ptr + 4); - acc_1.high = vld1q_s32(bias_ptr + 4); - - acc_0 = MultiplyAccumulate3x3Filter(filter, i0, i1, i2, i3, i4, i5, i6, i7, - i8, acc_0); - acc_1 = MultiplyAccumulate3x3Filter(filter, i3, i4, i5, i6, i7, i8, i9, i10, - i11, acc_1); - DownquantizeAndStore2Output(acc_0, acc_1, output_offset, output_multiplier, - output_shift, output_activation_min, - output_activation_max, output_ptr, - output_ptr_offset); -} - -// A kernel that is optimized on the number of output cells in the x and y -// direction, and the stride. Assumes 3x3 filters of 8 depth. -template <int kFixedOutputY, int kFixedOutputX, int kFixedStrideWidth, - int kFixedStrideHeight> -struct ConvKernel3x3FilterDepth8 {}; - -template <> -struct ConvKernel3x3FilterDepth8<8, 8, 1, 1> { - static inline void Run(const uint8* input_ptr, int input_depth, - int32 input_offset, int input_row_size, - const uint8* filter_ptr, int32 filter_offset, - const int32* bias_ptr, int32 output_offset, - int32 output_multiplier, int output_shift, - int32 output_activation_min, - int32 output_activation_max, uint8* output_ptr, - int output_depth, int output_width) { - Filter3x3x8 filter = Load3x3Filter(filter_ptr, filter_offset, output_depth); - - const int16x8_t input_offset_vec = vdupq_n_s16(input_offset); - const int output_row_size = output_depth * output_width; - - // To process 8x8 outputs using a 3x3 filter, we require 10x10 inputs. - // Load inputs for the first 2 filters on the top left, then slide to - // the right, down, left, down, right, etc. in a snake-like path. This - // minimizes the total number of loads. - // - // INPUT OUTPUT - // |\----------------\ |\------------\ - // | \ \ | \ \ - // | \----------------\ | \------------\ - // | | 0 ... 9 | | | 0 ... 7 | - // | | 10 ... 19 | ---> | | 8 ... 15 | - // | | 20 ... 29 | \ | .. ... .. | - // \ | .. ... .. | \| 56 ... 63 | - // \| 90 ... 109 | |------------| - // |----------------| - // - // The first set of loads corresponds to: - // - // INPUT OUTPUT - // |\----------------- |\----------- - // | \ | \ - // | \----------------- | \---------- - // | | 0 1 2 3 ... | | 0 1 ... - // | | 10 11 12 13 ... ---> | | .. ... - // | | 20 21 22 23 ... | .. ... - // | | .. ... ... - // - // The next set of loads correspond to a sliding window to the right. - // It loads inputs 4, 5, 14, 15, 23, 24 and keeps 2, 3, 12, 13, and 22: - // - // INPUT OUTPUT - // |\------------------- |\------------- - // | \ | \ - // | \------------------- | \------------ - // | | .. 2 3 4 5 ... | | .. 2 3 ... - // | | .. 12 13 14 15 ... ---> | | .. ... - // | | .. 21 22 23 24 ... | .. ... - // | | .. ... ... - // - // And so on... - - int16x8_t input_0, input_1, input_2, input_3, input_4, input_5, input_6, - input_7, input_8, input_9, input_10, input_11; - - // Load inputs for 1x2 outputs starting from the top left. Referring to the - // indexes in the diagram above, this corresponds to outputs (0) and (1). 
- { - uint8x8_t temp_0, temp_1, temp_2, temp_3; - - const uint8* ptr = input_ptr; - temp_0 = vld1_u8(ptr); - temp_1 = vld1_u8(ptr + input_depth); - temp_2 = vld1_u8(ptr + 2 * input_depth); - temp_3 = vld1_u8(ptr + 3 * input_depth); - - input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0)); - input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1)); - input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2)); - input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3)); - - input_0 = vaddq_s16(input_0, input_offset_vec); - input_1 = vaddq_s16(input_1, input_offset_vec); - input_2 = vaddq_s16(input_2, input_offset_vec); - input_3 = vaddq_s16(input_3, input_offset_vec); - - ptr += input_row_size; - temp_0 = vld1_u8(ptr); - temp_1 = vld1_u8(ptr + input_depth); - temp_2 = vld1_u8(ptr + 2 * input_depth); - temp_3 = vld1_u8(ptr + 3 * input_depth); - - input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0)); - input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1)); - input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_2)); - input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_3)); - - input_4 = vaddq_s16(input_4, input_offset_vec); - input_5 = vaddq_s16(input_5, input_offset_vec); - input_6 = vaddq_s16(input_6, input_offset_vec); - input_7 = vaddq_s16(input_7, input_offset_vec); - - ptr += input_row_size; - temp_0 = vld1_u8(ptr); - temp_1 = vld1_u8(ptr + input_depth); - temp_2 = vld1_u8(ptr + 2 * input_depth); - temp_3 = vld1_u8(ptr + 3 * input_depth); - - input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_0)); - input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_1)); - input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_2)); - input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_3)); - - input_8 = vaddq_s16(input_8, input_offset_vec); - input_9 = vaddq_s16(input_9, input_offset_vec); - input_10 = vaddq_s16(input_10, input_offset_vec); - input_11 = vaddq_s16(input_11, input_offset_vec); - } - - DotProductAndStore2xStride1( - filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6, - input_7, input_8, input_9, input_10, input_11, bias_ptr, output_offset, - output_multiplier, output_shift, output_activation_min, - output_activation_max, output_ptr, output_depth); - - // Slide to the right for outputs x = [2, 3], y = 0. Referring to the - // indexes in the diagram above, this corresponds to outputs (2) and (3). 
- { - uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5; - - const uint8* ptr = input_ptr + 4 * input_depth; - temp_0 = vld1_u8(ptr); - temp_1 = vld1_u8(ptr + input_depth); - - ptr += input_row_size; - temp_2 = vld1_u8(ptr); - temp_3 = vld1_u8(ptr + input_depth); - - ptr += input_row_size; - temp_4 = vld1_u8(ptr); - temp_5 = vld1_u8(ptr + input_depth); - - input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0)); - input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1)); - input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_2)); - input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_3)); - input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_4)); - input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_5)); - - input_0 = vaddq_s16(input_0, input_offset_vec); - input_1 = vaddq_s16(input_1, input_offset_vec); - input_4 = vaddq_s16(input_4, input_offset_vec); - input_5 = vaddq_s16(input_5, input_offset_vec); - input_8 = vaddq_s16(input_8, input_offset_vec); - input_9 = vaddq_s16(input_9, input_offset_vec); - } - - DotProductAndStore2xStride1( - filter, input_2, input_3, input_0, input_1, input_6, input_7, input_4, - input_5, input_10, input_11, input_8, input_9, bias_ptr, output_offset, - output_multiplier, output_shift, output_activation_min, - output_activation_max, output_ptr + 2 * output_depth, output_depth); - - // Slide to the right again for outputs x = [4, 5], y = 0. Referring to the - // indexes in the diagram above, this corresponds to outputs (4) and (5). - { - uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5; - - const uint8* ptr = input_ptr + 6 * input_depth; - temp_0 = vld1_u8(ptr); - temp_1 = vld1_u8(ptr + input_depth); - - ptr += input_row_size; - temp_2 = vld1_u8(ptr); - temp_3 = vld1_u8(ptr + input_depth); - - ptr += input_row_size; - temp_4 = vld1_u8(ptr); - temp_5 = vld1_u8(ptr + input_depth); - - input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_0)); - input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_1)); - input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_2)); - input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_3)); - input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_4)); - input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_5)); - - input_2 = vaddq_s16(input_2, input_offset_vec); - input_3 = vaddq_s16(input_3, input_offset_vec); - input_6 = vaddq_s16(input_6, input_offset_vec); - input_7 = vaddq_s16(input_7, input_offset_vec); - input_10 = vaddq_s16(input_10, input_offset_vec); - input_11 = vaddq_s16(input_11, input_offset_vec); - } - - DotProductAndStore2xStride1( - filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6, - input_7, input_8, input_9, input_10, input_11, bias_ptr, output_offset, - output_multiplier, output_shift, output_activation_min, - output_activation_max, output_ptr + 4 * output_depth, output_depth); - - // Slide to the right one last time for outputs x = [6, 7], y = 0. - // Referring to the indexes in the diagram above, this corresponds to - // outputs (6) and (7). 
- { - uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5; - - const uint8* ptr = input_ptr + 8 * input_depth; - temp_0 = vld1_u8(ptr); - temp_1 = vld1_u8(ptr + input_depth); - - ptr += input_row_size; - temp_2 = vld1_u8(ptr); - temp_3 = vld1_u8(ptr + input_depth); - - ptr += input_row_size; - temp_4 = vld1_u8(ptr); - temp_5 = vld1_u8(ptr + input_depth); - - input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0)); - input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1)); - input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_2)); - input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_3)); - input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_4)); - input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_5)); - - input_0 = vaddq_s16(input_0, input_offset_vec); - input_1 = vaddq_s16(input_1, input_offset_vec); - input_4 = vaddq_s16(input_4, input_offset_vec); - input_5 = vaddq_s16(input_5, input_offset_vec); - input_8 = vaddq_s16(input_8, input_offset_vec); - input_9 = vaddq_s16(input_9, input_offset_vec); - } - - DotProductAndStore2xStride1( - filter, input_2, input_3, input_0, input_1, input_6, input_7, input_4, - input_5, input_10, input_11, input_8, input_9, bias_ptr, output_offset, - output_multiplier, output_shift, output_activation_min, - output_activation_max, output_ptr + 6 * output_depth, output_depth); - - // Slide to down for outputs x = [6, 7], y = 1. Referring to the indexes in - // the diagram above, this corresponds to outputs (14) and (15). - { - uint8x8_t temp_0, temp_1, temp_2, temp_3; - - const uint8* ptr = input_ptr + 6 * input_depth + 3 * input_row_size; - temp_0 = vld1_u8(ptr); - temp_1 = vld1_u8(ptr + input_depth); - temp_2 = vld1_u8(ptr + 2 * input_depth); - temp_3 = vld1_u8(ptr + 3 * input_depth); - - input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_0)); - input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_1)); - input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_2)); - input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_3)); - - input_2 = vaddq_s16(input_2, input_offset_vec); - input_3 = vaddq_s16(input_3, input_offset_vec); - input_0 = vaddq_s16(input_0, input_offset_vec); - input_1 = vaddq_s16(input_1, input_offset_vec); - } - - DotProductAndStore2xStride1( - filter, input_6, input_7, input_4, input_5, input_10, input_11, input_8, - input_9, input_2, input_3, input_0, input_1, bias_ptr, output_offset, - output_multiplier, output_shift, output_activation_min, - output_activation_max, output_ptr + 6 * output_depth + output_row_size, - output_depth); - - // Slide left for outputs x = [4, 5], y = 1. Referring to the indexes in - // the diagram above, this corresponds to outputs (12) and (13). 
- { - uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5; - - const uint8* ptr = input_ptr + 4 * input_depth + input_row_size; - temp_0 = vld1_u8(ptr); - temp_1 = vld1_u8(ptr + input_depth); - - ptr += input_row_size; - temp_2 = vld1_u8(ptr); - temp_3 = vld1_u8(ptr + input_depth); - - ptr += input_row_size; - temp_4 = vld1_u8(ptr); - temp_5 = vld1_u8(ptr + input_depth); - - input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0)); - input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1)); - input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_2)); - input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_3)); - input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_4)); - input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_5)); - - input_4 = vaddq_s16(input_4, input_offset_vec); - input_5 = vaddq_s16(input_5, input_offset_vec); - input_8 = vaddq_s16(input_8, input_offset_vec); - input_9 = vaddq_s16(input_9, input_offset_vec); - input_0 = vaddq_s16(input_0, input_offset_vec); - input_1 = vaddq_s16(input_1, input_offset_vec); - } - - DotProductAndStore2xStride1( - filter, input_4, input_5, input_6, input_7, input_8, input_9, input_10, - input_11, input_0, input_1, input_2, input_3, bias_ptr, output_offset, - output_multiplier, output_shift, output_activation_min, - output_activation_max, output_ptr + 4 * output_depth + output_row_size, - output_depth); - - // Slide left again for outputs x = [2, 3], y = 1. Referring to the indexes - // in the diagram above, this corresponds to outputs (10) and (11). - { - uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5; - - const uint8* ptr = input_ptr + 2 * input_depth + input_row_size; - temp_0 = vld1_u8(ptr); - temp_1 = vld1_u8(ptr + input_depth); - - ptr += input_row_size; - temp_2 = vld1_u8(ptr); - temp_3 = vld1_u8(ptr + input_depth); - - ptr += input_row_size; - temp_4 = vld1_u8(ptr); - temp_5 = vld1_u8(ptr + input_depth); - - input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_0)); - input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_1)); - input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_2)); - input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_3)); - input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_4)); - input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_5)); - - input_6 = vaddq_s16(input_6, input_offset_vec); - input_7 = vaddq_s16(input_7, input_offset_vec); - input_10 = vaddq_s16(input_10, input_offset_vec); - input_11 = vaddq_s16(input_11, input_offset_vec); - input_2 = vaddq_s16(input_2, input_offset_vec); - input_3 = vaddq_s16(input_3, input_offset_vec); - } - - DotProductAndStore2xStride1( - filter, input_6, input_7, input_4, input_5, input_10, input_11, input_8, - input_9, input_2, input_3, input_0, input_1, bias_ptr, output_offset, - output_multiplier, output_shift, output_activation_min, - output_activation_max, output_ptr + 2 * output_depth + output_row_size, - output_depth); - - // Slide left one more time for outputs x = [0, 1], y = 1. Referring to the - // indexes in the diagram above, this corresponds to outputs (8) and (9). 
- { - uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5; - - const uint8* ptr = input_ptr + input_row_size; - temp_0 = vld1_u8(ptr); - temp_1 = vld1_u8(ptr + input_depth); - - ptr += input_row_size; - temp_2 = vld1_u8(ptr); - temp_3 = vld1_u8(ptr + input_depth); - - ptr += input_row_size; - temp_4 = vld1_u8(ptr); - temp_5 = vld1_u8(ptr + input_depth); - - input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0)); - input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1)); - input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_2)); - input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_3)); - input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_4)); - input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_5)); - - input_4 = vaddq_s16(input_4, input_offset_vec); - input_5 = vaddq_s16(input_5, input_offset_vec); - input_8 = vaddq_s16(input_8, input_offset_vec); - input_9 = vaddq_s16(input_9, input_offset_vec); - input_0 = vaddq_s16(input_0, input_offset_vec); - input_1 = vaddq_s16(input_1, input_offset_vec); - } - - DotProductAndStore2xStride1( - filter, input_4, input_5, input_6, input_7, input_8, input_9, input_10, - input_11, input_0, input_1, input_2, input_3, bias_ptr, output_offset, - output_multiplier, output_shift, output_activation_min, - output_activation_max, output_ptr + output_row_size, output_depth); - - // Slide down for outputs x = [0, 1], y = 2. Referring to the - // indexes in the diagram above, this corresponds to outputs (16) and (17). - { - uint8x8_t temp_0, temp_1, temp_2, temp_3; - - const uint8* ptr = input_ptr + 4 * input_row_size; - temp_0 = vld1_u8(ptr); - temp_1 = vld1_u8(ptr + input_depth); - temp_2 = vld1_u8(ptr + 2 * input_depth); - temp_3 = vld1_u8(ptr + 3 * input_depth); - - input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0)); - input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1)); - input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_2)); - input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_3)); - - input_4 = vaddq_s16(input_4, input_offset_vec); - input_5 = vaddq_s16(input_5, input_offset_vec); - input_6 = vaddq_s16(input_6, input_offset_vec); - input_7 = vaddq_s16(input_7, input_offset_vec); - } - - DotProductAndStore2xStride1( - filter, input_8, input_9, input_10, input_11, input_0, input_1, input_2, - input_3, input_4, input_5, input_6, input_7, bias_ptr, output_offset, - output_multiplier, output_shift, output_activation_min, - output_activation_max, output_ptr + 2 * output_row_size, output_depth); - - // Slide right for outputs x = [2, 3], y = 2. Referring to the - // indexes in the diagram above, this corresponds to outputs (18) and (19). 
- { - uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5; - - const uint8* ptr = input_ptr + 4 * input_depth + 2 * input_row_size; - temp_0 = vld1_u8(ptr); - temp_1 = vld1_u8(ptr + input_depth); - - ptr += input_row_size; - temp_2 = vld1_u8(ptr); - temp_3 = vld1_u8(ptr + input_depth); - - ptr += input_row_size; - temp_4 = vld1_u8(ptr); - temp_5 = vld1_u8(ptr + input_depth); - - input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_0)); - input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_1)); - input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_2)); - input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_3)); - input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4)); - input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5)); - - input_8 = vaddq_s16(input_8, input_offset_vec); - input_9 = vaddq_s16(input_9, input_offset_vec); - input_0 = vaddq_s16(input_0, input_offset_vec); - input_1 = vaddq_s16(input_1, input_offset_vec); - input_4 = vaddq_s16(input_4, input_offset_vec); - input_5 = vaddq_s16(input_5, input_offset_vec); - } - - DotProductAndStore2xStride1( - filter, input_10, input_11, input_8, input_9, input_2, input_3, input_0, - input_1, input_6, input_7, input_4, input_5, bias_ptr, output_offset, - output_multiplier, output_shift, output_activation_min, - output_activation_max, - output_ptr + 2 * output_depth + 2 * output_row_size, output_depth); - - // Slide right for outputs x = [4, 5], y = 2. Referring to the - // indexes in the diagram above, this corresponds to outputs (20) and (21). - { - uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5; - - const uint8* ptr = input_ptr + 6 * input_depth + 2 * input_row_size; - temp_0 = vld1_u8(ptr); - temp_1 = vld1_u8(ptr + input_depth); - - ptr += input_row_size; - temp_2 = vld1_u8(ptr); - temp_3 = vld1_u8(ptr + input_depth); - - ptr += input_row_size; - temp_4 = vld1_u8(ptr); - temp_5 = vld1_u8(ptr + input_depth); - - input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_0)); - input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_1)); - input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2)); - input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3)); - input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_4)); - input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_5)); - - input_10 = vaddq_s16(input_10, input_offset_vec); - input_11 = vaddq_s16(input_11, input_offset_vec); - input_2 = vaddq_s16(input_2, input_offset_vec); - input_3 = vaddq_s16(input_3, input_offset_vec); - input_6 = vaddq_s16(input_6, input_offset_vec); - input_7 = vaddq_s16(input_7, input_offset_vec); - } - - DotProductAndStore2xStride1( - filter, input_8, input_9, input_10, input_11, input_0, input_1, input_2, - input_3, input_4, input_5, input_6, input_7, bias_ptr, output_offset, - output_multiplier, output_shift, output_activation_min, - output_activation_max, - output_ptr + 4 * output_depth + 2 * output_row_size, output_depth); - - // Slide right one more time for outputs x = [6, 7], y = 2. Referring to the - // indexes in the diagram above, this corresponds to outputs (22) and (23). 
- { - uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5; - - const uint8* ptr = input_ptr + 8 * input_depth + 2 * input_row_size; - temp_0 = vld1_u8(ptr); - temp_1 = vld1_u8(ptr + input_depth); - - ptr += input_row_size; - temp_2 = vld1_u8(ptr); - temp_3 = vld1_u8(ptr + input_depth); - - ptr += input_row_size; - temp_4 = vld1_u8(ptr); - temp_5 = vld1_u8(ptr + input_depth); - - input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_0)); - input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_1)); - input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_2)); - input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_3)); - input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4)); - input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5)); - - input_8 = vaddq_s16(input_8, input_offset_vec); - input_9 = vaddq_s16(input_9, input_offset_vec); - input_0 = vaddq_s16(input_0, input_offset_vec); - input_1 = vaddq_s16(input_1, input_offset_vec); - input_4 = vaddq_s16(input_4, input_offset_vec); - input_5 = vaddq_s16(input_5, input_offset_vec); - } - - DotProductAndStore2xStride1( - filter, input_10, input_11, input_8, input_9, input_2, input_3, input_0, - input_1, input_6, input_7, input_4, input_5, bias_ptr, output_offset, - output_multiplier, output_shift, output_activation_min, - output_activation_max, - output_ptr + 6 * output_depth + 2 * output_row_size, output_depth); - - // Slide down for outputs x = [6, 7], y = 3. Referring to the indexes in - // the diagram above, this corresponds to outputs (30) and (31). - { - uint8x8_t temp_0, temp_1, temp_2, temp_3; - - const uint8* ptr = input_ptr + 6 * input_depth + 5 * input_row_size; - temp_0 = vld1_u8(ptr); - temp_1 = vld1_u8(ptr + input_depth); - temp_2 = vld1_u8(ptr + 2 * input_depth); - temp_3 = vld1_u8(ptr + 3 * input_depth); - - input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_0)); - input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_1)); - input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_2)); - input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_3)); - - input_10 = vaddq_s16(input_10, input_offset_vec); - input_11 = vaddq_s16(input_11, input_offset_vec); - input_8 = vaddq_s16(input_8, input_offset_vec); - input_9 = vaddq_s16(input_9, input_offset_vec); - } - - DotProductAndStore2xStride1( - filter, input_2, input_3, input_0, input_1, input_6, input_7, input_4, - input_5, input_10, input_11, input_8, input_9, bias_ptr, output_offset, - output_multiplier, output_shift, output_activation_min, - output_activation_max, - output_ptr + 6 * output_depth + 3 * output_row_size, output_depth); - - // Slide left for outputs x = [4, 5], y = 3. Referring to the indexes in - // the diagram above, this corresponds to outputs (28) and (29). 
- { - uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5; - - const uint8* ptr = input_ptr + 4 * input_depth + 3 * input_row_size; - temp_0 = vld1_u8(ptr); - temp_1 = vld1_u8(ptr + input_depth); - - ptr += input_row_size; - temp_2 = vld1_u8(ptr); - temp_3 = vld1_u8(ptr + input_depth); - - ptr += input_row_size; - temp_4 = vld1_u8(ptr); - temp_5 = vld1_u8(ptr + input_depth); - - input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0)); - input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1)); - input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_2)); - input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_3)); - input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_4)); - input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_5)); - - input_0 = vaddq_s16(input_0, input_offset_vec); - input_1 = vaddq_s16(input_1, input_offset_vec); - input_4 = vaddq_s16(input_4, input_offset_vec); - input_5 = vaddq_s16(input_5, input_offset_vec); - input_8 = vaddq_s16(input_8, input_offset_vec); - input_9 = vaddq_s16(input_9, input_offset_vec); - } - - DotProductAndStore2xStride1( - filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6, - input_7, input_8, input_9, input_10, input_11, bias_ptr, output_offset, - output_multiplier, output_shift, output_activation_min, - output_activation_max, - output_ptr + 4 * output_depth + 3 * output_row_size, output_depth); - - // Slide left for outputs x = [2, 3], y = 3. Referring to the indexes in - // the diagram above, this corresponds to outputs (26) and (27). - { - uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5; - - const uint8* ptr = input_ptr + 2 * input_depth + 3 * input_row_size; - temp_0 = vld1_u8(ptr); - temp_1 = vld1_u8(ptr + input_depth); - - ptr += input_row_size; - temp_2 = vld1_u8(ptr); - temp_3 = vld1_u8(ptr + input_depth); - - ptr += input_row_size; - temp_4 = vld1_u8(ptr); - temp_5 = vld1_u8(ptr + input_depth); - - input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_0)); - input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_1)); - input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_2)); - input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_3)); - input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_4)); - input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_5)); - - input_2 = vaddq_s16(input_2, input_offset_vec); - input_3 = vaddq_s16(input_3, input_offset_vec); - input_6 = vaddq_s16(input_6, input_offset_vec); - input_7 = vaddq_s16(input_7, input_offset_vec); - input_10 = vaddq_s16(input_10, input_offset_vec); - input_11 = vaddq_s16(input_11, input_offset_vec); - } - - DotProductAndStore2xStride1( - filter, input_2, input_3, input_0, input_1, input_6, input_7, input_4, - input_5, input_10, input_11, input_8, input_9, bias_ptr, output_offset, - output_multiplier, output_shift, output_activation_min, - output_activation_max, - output_ptr + 2 * output_depth + 3 * output_row_size, output_depth); - - // Slide left one more time for outputs x = [0, 1], y = 3. Referring to the - // indexes in the diagram above, this corresponds to outputs (24) and (25). 
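The "slide right / slide down / slide left" comments describe a serpentine walk over the output tile whose only purpose is to reuse the twelve cached input registers: for a stride-1 3x3 filter, a horizontal step of two outputs exposes just two new input columns (2 columns x 3 rows = 6 fresh vectors) and a vertical step of one output row exposes just one new input row (4 fresh vectors), which is exactly why the blocks above alternate between six and four vld1_u8 loads while the untouched registers are passed to DotProductAndStore2xStride1 in rotated order. The size arithmetic behind that, as a compile-checked sketch:

    // For a stride-1 3x3 filter, `out` outputs along one dimension need
    // out + 3 - 1 inputs along that dimension.
    constexpr int InputExtent(int output_extent) { return output_extent + 3 - 1; }
    static_assert(InputExtent(8) == 10, "the 8x8 kernel reads a 10x10 input patch");
    static_assert(InputExtent(4) == 6, "the 4x4 kernel below reads a 6x6 patch");
    static_assert(InputExtent(2) == 4, "a 2-wide output window spans 4 input columns");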
- { - uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5; - - const uint8* ptr = input_ptr + 3 * input_row_size; - temp_0 = vld1_u8(ptr); - temp_1 = vld1_u8(ptr + input_depth); - - ptr += input_row_size; - temp_2 = vld1_u8(ptr); - temp_3 = vld1_u8(ptr + input_depth); - - ptr += input_row_size; - temp_4 = vld1_u8(ptr); - temp_5 = vld1_u8(ptr + input_depth); - - input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0)); - input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1)); - input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_2)); - input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_3)); - input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_4)); - input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_5)); - - input_0 = vaddq_s16(input_0, input_offset_vec); - input_1 = vaddq_s16(input_1, input_offset_vec); - input_4 = vaddq_s16(input_4, input_offset_vec); - input_5 = vaddq_s16(input_5, input_offset_vec); - input_8 = vaddq_s16(input_8, input_offset_vec); - input_9 = vaddq_s16(input_9, input_offset_vec); - } - - DotProductAndStore2xStride1( - filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6, - input_7, input_8, input_9, input_10, input_11, bias_ptr, output_offset, - output_multiplier, output_shift, output_activation_min, - output_activation_max, output_ptr + 3 * output_row_size, output_depth); - - // Slide down for outputs x = [0, 1], y = 4. Referring to the indexes in - // the diagram above, this corresponds to outputs (32) and (33). - { - uint8x8_t temp_0, temp_1, temp_2, temp_3; - - const uint8* ptr = input_ptr + 6 * input_row_size; - temp_0 = vld1_u8(ptr); - temp_1 = vld1_u8(ptr + input_depth); - temp_2 = vld1_u8(ptr + 2 * input_depth); - temp_3 = vld1_u8(ptr + 3 * input_depth); - - input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0)); - input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1)); - input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2)); - input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3)); - - input_0 = vaddq_s16(input_0, input_offset_vec); - input_1 = vaddq_s16(input_1, input_offset_vec); - input_2 = vaddq_s16(input_2, input_offset_vec); - input_3 = vaddq_s16(input_3, input_offset_vec); - } - - DotProductAndStore2xStride1( - filter, input_4, input_5, input_6, input_7, input_8, input_9, input_10, - input_11, input_0, input_1, input_2, input_3, bias_ptr, output_offset, - output_multiplier, output_shift, output_activation_min, - output_activation_max, output_ptr + 4 * output_row_size, output_depth); - - // Slide right for outputs x = [2, 3], y = 4. Referring to the indexes in - // the diagram above, this corresponds to outputs (34) and (35). 
- { - uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5; - - const uint8* ptr = input_ptr + 4 * input_depth + 4 * input_row_size; - temp_0 = vld1_u8(ptr); - temp_1 = vld1_u8(ptr + input_depth); - - ptr += input_row_size; - temp_2 = vld1_u8(ptr); - temp_3 = vld1_u8(ptr + input_depth); - - ptr += input_row_size; - temp_4 = vld1_u8(ptr); - temp_5 = vld1_u8(ptr + input_depth); - - input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0)); - input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1)); - input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_2)); - input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_3)); - input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_4)); - input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_5)); - - input_4 = vaddq_s16(input_4, input_offset_vec); - input_5 = vaddq_s16(input_5, input_offset_vec); - input_8 = vaddq_s16(input_8, input_offset_vec); - input_9 = vaddq_s16(input_9, input_offset_vec); - input_0 = vaddq_s16(input_0, input_offset_vec); - input_1 = vaddq_s16(input_1, input_offset_vec); - } - - DotProductAndStore2xStride1( - filter, input_6, input_7, input_4, input_5, input_10, input_11, input_8, - input_9, input_2, input_3, input_0, input_1, bias_ptr, output_offset, - output_multiplier, output_shift, output_activation_min, - output_activation_max, - output_ptr + 2 * output_depth + 4 * output_row_size, output_depth); - - // Slide right for outputs x = [4, 5], y = 4. Referring to the indexes in - // the diagram above, this corresponds to outputs (36) and (37). - { - uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5; - - const uint8* ptr = input_ptr + 6 * input_depth + 4 * input_row_size; - temp_0 = vld1_u8(ptr); - temp_1 = vld1_u8(ptr + input_depth); - - ptr += input_row_size; - temp_2 = vld1_u8(ptr); - temp_3 = vld1_u8(ptr + input_depth); - - ptr += input_row_size; - temp_4 = vld1_u8(ptr); - temp_5 = vld1_u8(ptr + input_depth); - - input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_0)); - input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_1)); - input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_2)); - input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_3)); - input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_4)); - input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_5)); - - input_6 = vaddq_s16(input_6, input_offset_vec); - input_7 = vaddq_s16(input_7, input_offset_vec); - input_10 = vaddq_s16(input_10, input_offset_vec); - input_11 = vaddq_s16(input_11, input_offset_vec); - input_2 = vaddq_s16(input_2, input_offset_vec); - input_3 = vaddq_s16(input_3, input_offset_vec); - } - - DotProductAndStore2xStride1( - filter, input_4, input_5, input_6, input_7, input_8, input_9, input_10, - input_11, input_0, input_1, input_2, input_3, bias_ptr, output_offset, - output_multiplier, output_shift, output_activation_min, - output_activation_max, - output_ptr + 4 * output_depth + 4 * output_row_size, output_depth); - - // Slide right one more time for outputs x = [6, 7], y = 4. Referring to the - // indexes in the diagram above, this corresponds to outputs (38) and (39). 
- { - uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5; - - const uint8* ptr = input_ptr + 8 * input_depth + 4 * input_row_size; - temp_0 = vld1_u8(ptr); - temp_1 = vld1_u8(ptr + input_depth); - - ptr += input_row_size; - temp_2 = vld1_u8(ptr); - temp_3 = vld1_u8(ptr + input_depth); - - ptr += input_row_size; - temp_4 = vld1_u8(ptr); - temp_5 = vld1_u8(ptr + input_depth); - - input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0)); - input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1)); - input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_2)); - input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_3)); - input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_4)); - input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_5)); - - input_4 = vaddq_s16(input_4, input_offset_vec); - input_5 = vaddq_s16(input_5, input_offset_vec); - input_8 = vaddq_s16(input_8, input_offset_vec); - input_9 = vaddq_s16(input_9, input_offset_vec); - input_0 = vaddq_s16(input_0, input_offset_vec); - input_1 = vaddq_s16(input_1, input_offset_vec); - } - - DotProductAndStore2xStride1( - filter, input_6, input_7, input_4, input_5, input_10, input_11, input_8, - input_9, input_2, input_3, input_0, input_1, bias_ptr, output_offset, - output_multiplier, output_shift, output_activation_min, - output_activation_max, - output_ptr + 6 * output_depth + 4 * output_row_size, output_depth); - - // Slide down for outputs x = [6, 7], y = 5. Referring to the indexes in - // the diagram above, this corresponds to outputs (46) and (47). - { - uint8x8_t temp_0, temp_1, temp_2, temp_3; - - const uint8* ptr = input_ptr + 6 * input_depth + 7 * input_row_size; - temp_0 = vld1_u8(ptr); - temp_1 = vld1_u8(ptr + input_depth); - temp_2 = vld1_u8(ptr + 2 * input_depth); - temp_3 = vld1_u8(ptr + 3 * input_depth); - - input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_0)); - input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_1)); - input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_2)); - input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_3)); - - input_6 = vaddq_s16(input_6, input_offset_vec); - input_7 = vaddq_s16(input_7, input_offset_vec); - input_4 = vaddq_s16(input_4, input_offset_vec); - input_5 = vaddq_s16(input_5, input_offset_vec); - } - - DotProductAndStore2xStride1( - filter, input_10, input_11, input_8, input_9, input_2, input_3, input_0, - input_1, input_6, input_7, input_4, input_5, bias_ptr, output_offset, - output_multiplier, output_shift, output_activation_min, - output_activation_max, - output_ptr + 6 * output_depth + 5 * output_row_size, output_depth); - - // Slide left for outputs x = [4, 5], y = 5. Referring to the indexes in - // the diagram above, this corresponds to outputs (44) and (45). 
- { - uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5; - - const uint8* ptr = input_ptr + 4 * input_depth + 5 * input_row_size; - temp_0 = vld1_u8(ptr); - temp_1 = vld1_u8(ptr + input_depth); - - ptr += input_row_size; - temp_2 = vld1_u8(ptr); - temp_3 = vld1_u8(ptr + input_depth); - - ptr += input_row_size; - temp_4 = vld1_u8(ptr); - temp_5 = vld1_u8(ptr + input_depth); - - input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_0)); - input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_1)); - input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_2)); - input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_3)); - input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4)); - input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5)); - - input_8 = vaddq_s16(input_8, input_offset_vec); - input_9 = vaddq_s16(input_9, input_offset_vec); - input_0 = vaddq_s16(input_0, input_offset_vec); - input_1 = vaddq_s16(input_1, input_offset_vec); - input_4 = vaddq_s16(input_4, input_offset_vec); - input_5 = vaddq_s16(input_5, input_offset_vec); - } - - DotProductAndStore2xStride1( - filter, input_8, input_9, input_10, input_11, input_0, input_1, input_2, - input_3, input_4, input_5, input_6, input_7, bias_ptr, output_offset, - output_multiplier, output_shift, output_activation_min, - output_activation_max, - output_ptr + 4 * output_depth + 5 * output_row_size, output_depth); - - // Slide left for outputs x = [2, 3], y = 5. Referring to the indexes in - // the diagram above, this corresponds to outputs (42) and (43). - { - uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5; - - const uint8* ptr = input_ptr + 2 * input_depth + 5 * input_row_size; - temp_0 = vld1_u8(ptr); - temp_1 = vld1_u8(ptr + input_depth); - - ptr += input_row_size; - temp_2 = vld1_u8(ptr); - temp_3 = vld1_u8(ptr + input_depth); - - ptr += input_row_size; - temp_4 = vld1_u8(ptr); - temp_5 = vld1_u8(ptr + input_depth); - - input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_0)); - input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_1)); - input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2)); - input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3)); - input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_4)); - input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_5)); - - input_10 = vaddq_s16(input_10, input_offset_vec); - input_11 = vaddq_s16(input_11, input_offset_vec); - input_2 = vaddq_s16(input_2, input_offset_vec); - input_3 = vaddq_s16(input_3, input_offset_vec); - input_6 = vaddq_s16(input_6, input_offset_vec); - input_7 = vaddq_s16(input_7, input_offset_vec); - } - - DotProductAndStore2xStride1( - filter, input_10, input_11, input_8, input_9, input_2, input_3, input_0, - input_1, input_6, input_7, input_4, input_5, bias_ptr, output_offset, - output_multiplier, output_shift, output_activation_min, - output_activation_max, - output_ptr + 2 * output_depth + 5 * output_row_size, output_depth); - - // Slide left one more time for outputs x = [0, 1], y = 5. Referring to the - // indexes in the diagram above, this corresponds to outputs (40) and (41). 
- { - uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5; - - const uint8* ptr = input_ptr + 5 * input_row_size; - temp_0 = vld1_u8(ptr); - temp_1 = vld1_u8(ptr + input_depth); - - ptr += input_row_size; - temp_2 = vld1_u8(ptr); - temp_3 = vld1_u8(ptr + input_depth); - - ptr += input_row_size; - temp_4 = vld1_u8(ptr); - temp_5 = vld1_u8(ptr + input_depth); - - input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_0)); - input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_1)); - input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_2)); - input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_3)); - input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4)); - input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5)); - - input_8 = vaddq_s16(input_8, input_offset_vec); - input_9 = vaddq_s16(input_9, input_offset_vec); - input_0 = vaddq_s16(input_0, input_offset_vec); - input_1 = vaddq_s16(input_1, input_offset_vec); - input_4 = vaddq_s16(input_4, input_offset_vec); - input_5 = vaddq_s16(input_5, input_offset_vec); - } - - DotProductAndStore2xStride1( - filter, input_8, input_9, input_10, input_11, input_0, input_1, input_2, - input_3, input_4, input_5, input_6, input_7, bias_ptr, output_offset, - output_multiplier, output_shift, output_activation_min, - output_activation_max, output_ptr + 5 * output_row_size, output_depth); - - // Slide down for outputs x = [0, 1], y = 6. Referring to the indexes in - // the diagram above, this corresponds to outputs (48) and (49). - { - uint8x8_t temp_0, temp_1, temp_2, temp_3; - - const uint8* ptr = input_ptr + 8 * input_row_size; - temp_0 = vld1_u8(ptr); - temp_1 = vld1_u8(ptr + input_depth); - temp_2 = vld1_u8(ptr + 2 * input_depth); - temp_3 = vld1_u8(ptr + 3 * input_depth); - - input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_0)); - input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_1)); - input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_2)); - input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_3)); - - input_8 = vaddq_s16(input_8, input_offset_vec); - input_9 = vaddq_s16(input_9, input_offset_vec); - input_10 = vaddq_s16(input_10, input_offset_vec); - input_11 = vaddq_s16(input_11, input_offset_vec); - } - - DotProductAndStore2xStride1( - filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6, - input_7, input_8, input_9, input_10, input_11, bias_ptr, output_offset, - output_multiplier, output_shift, output_activation_min, - output_activation_max, output_ptr + 6 * output_row_size, output_depth); - - // Slide right for outputs x = [2, 3], y = 6. Referring to the indexes in - // the diagram above, this corresponds to outputs (50) and (51). 
- { - uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5; - - const uint8* ptr = input_ptr + 4 * input_depth + 6 * input_row_size; - temp_0 = vld1_u8(ptr); - temp_1 = vld1_u8(ptr + input_depth); - - ptr += input_row_size; - temp_2 = vld1_u8(ptr); - temp_3 = vld1_u8(ptr + input_depth); - - ptr += input_row_size; - temp_4 = vld1_u8(ptr); - temp_5 = vld1_u8(ptr + input_depth); - - input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0)); - input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1)); - input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_2)); - input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_3)); - input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_4)); - input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_5)); - - input_0 = vaddq_s16(input_0, input_offset_vec); - input_1 = vaddq_s16(input_1, input_offset_vec); - input_4 = vaddq_s16(input_4, input_offset_vec); - input_5 = vaddq_s16(input_5, input_offset_vec); - input_8 = vaddq_s16(input_8, input_offset_vec); - input_9 = vaddq_s16(input_9, input_offset_vec); - } - - DotProductAndStore2xStride1( - filter, input_2, input_3, input_0, input_1, input_6, input_7, input_4, - input_5, input_10, input_11, input_8, input_9, bias_ptr, output_offset, - output_multiplier, output_shift, output_activation_min, - output_activation_max, - output_ptr + 2 * output_depth + 6 * output_row_size, output_depth); - - // Slide right for outputs x = [4, 5], y = 6. Referring to the indexes in - // the diagram above, this corresponds to outputs (52) and (53). - { - uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5; - - const uint8* ptr = input_ptr + 6 * input_depth + 6 * input_row_size; - temp_0 = vld1_u8(ptr); - temp_1 = vld1_u8(ptr + input_depth); - - ptr += input_row_size; - temp_2 = vld1_u8(ptr); - temp_3 = vld1_u8(ptr + input_depth); - - ptr += input_row_size; - temp_4 = vld1_u8(ptr); - temp_5 = vld1_u8(ptr + input_depth); - - input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_0)); - input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_1)); - input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_2)); - input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_3)); - input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_4)); - input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_5)); - - input_2 = vaddq_s16(input_2, input_offset_vec); - input_3 = vaddq_s16(input_3, input_offset_vec); - input_6 = vaddq_s16(input_6, input_offset_vec); - input_7 = vaddq_s16(input_7, input_offset_vec); - input_10 = vaddq_s16(input_10, input_offset_vec); - input_11 = vaddq_s16(input_11, input_offset_vec); - } - - DotProductAndStore2xStride1( - filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6, - input_7, input_8, input_9, input_10, input_11, bias_ptr, output_offset, - output_multiplier, output_shift, output_activation_min, - output_activation_max, - output_ptr + 4 * output_depth + 6 * output_row_size, output_depth); - - // Slide right one more time for outputs x = [6, 7], y = 6. Referring to the - // indexes in the diagram above, this corresponds to outputs (54) and (55). 
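Each store in these hunks lands at output_ptr + x * output_depth + y * output_row_size, with output_row_size = output_depth * output_width, i.e. the usual NHWC layout with the eight-channel depth slice innermost. A small helper equivalent to that addressing (illustrative, not from the file):

    #include <cstdint>

    // NHWC addressing used by the deleted kernels: channels innermost, then x,
    // then y; output_row_size == output_depth * output_width.
    inline uint8_t* OutputAddress(uint8_t* output_ptr, int x, int y,
                                  int output_depth, int output_row_size) {
      return output_ptr + x * output_depth + y * output_row_size;
    }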
- { - uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5; - - const uint8* ptr = input_ptr + 8 * input_depth + 6 * input_row_size; - temp_0 = vld1_u8(ptr); - temp_1 = vld1_u8(ptr + input_depth); - - ptr += input_row_size; - temp_2 = vld1_u8(ptr); - temp_3 = vld1_u8(ptr + input_depth); - - ptr += input_row_size; - temp_4 = vld1_u8(ptr); - temp_5 = vld1_u8(ptr + input_depth); - - input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0)); - input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1)); - input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_2)); - input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_3)); - input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_4)); - input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_5)); - - input_0 = vaddq_s16(input_0, input_offset_vec); - input_1 = vaddq_s16(input_1, input_offset_vec); - input_4 = vaddq_s16(input_4, input_offset_vec); - input_5 = vaddq_s16(input_5, input_offset_vec); - input_8 = vaddq_s16(input_8, input_offset_vec); - input_9 = vaddq_s16(input_9, input_offset_vec); - } - - DotProductAndStore2xStride1( - filter, input_2, input_3, input_0, input_1, input_6, input_7, input_4, - input_5, input_10, input_11, input_8, input_9, bias_ptr, output_offset, - output_multiplier, output_shift, output_activation_min, - output_activation_max, - output_ptr + 6 * output_depth + 6 * output_row_size, output_depth); - - // Slide down for outputs x = [6, 7], y = 7. Referring to the indexes in the - // diagram above, this corresponds to outputs (62) and (63). - { - uint8x8_t temp_0, temp_1, temp_2, temp_3; - - const uint8* ptr = input_ptr + 6 * input_depth + 9 * input_row_size; - temp_0 = vld1_u8(ptr); - temp_1 = vld1_u8(ptr + input_depth); - temp_2 = vld1_u8(ptr + 2 * input_depth); - temp_3 = vld1_u8(ptr + 3 * input_depth); - - input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_0)); - input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_1)); - input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_2)); - input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_3)); - - input_2 = vaddq_s16(input_2, input_offset_vec); - input_3 = vaddq_s16(input_3, input_offset_vec); - input_0 = vaddq_s16(input_0, input_offset_vec); - input_1 = vaddq_s16(input_1, input_offset_vec); - } - - DotProductAndStore2xStride1( - filter, input_6, input_7, input_4, input_5, input_10, input_11, input_8, - input_9, input_2, input_3, input_0, input_1, bias_ptr, output_offset, - output_multiplier, output_shift, output_activation_min, - output_activation_max, - output_ptr + 6 * output_depth + 7 * output_row_size, output_depth); - - // Slide left for outputs x = [4, 5], y = 7. Referring to the indexes in the - // diagram above, this corresponds to outputs (60) and (61). 
- { - uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5; - - const uint8* ptr = input_ptr + 4 * input_depth + 7 * input_row_size; - temp_0 = vld1_u8(ptr); - temp_1 = vld1_u8(ptr + input_depth); - - ptr += input_row_size; - temp_2 = vld1_u8(ptr); - temp_3 = vld1_u8(ptr + input_depth); - - ptr += input_row_size; - temp_4 = vld1_u8(ptr); - temp_5 = vld1_u8(ptr + input_depth); - - input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0)); - input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1)); - input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_2)); - input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_3)); - input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_4)); - input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_5)); - - input_4 = vaddq_s16(input_4, input_offset_vec); - input_5 = vaddq_s16(input_5, input_offset_vec); - input_8 = vaddq_s16(input_8, input_offset_vec); - input_9 = vaddq_s16(input_9, input_offset_vec); - input_0 = vaddq_s16(input_0, input_offset_vec); - input_1 = vaddq_s16(input_1, input_offset_vec); - } - - DotProductAndStore2xStride1( - filter, input_4, input_5, input_6, input_7, input_8, input_9, input_10, - input_11, input_0, input_1, input_2, input_3, bias_ptr, output_offset, - output_multiplier, output_shift, output_activation_min, - output_activation_max, - output_ptr + 4 * output_depth + 7 * output_row_size, output_depth); - - // Slide left for outputs x = [2, 3], y = 7. Referring to the indexes in the - // diagram above, this corresponds to outputs (58) and (59). - { - uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5; - - const uint8* ptr = input_ptr + 2 * input_depth + 7 * input_row_size; - temp_0 = vld1_u8(ptr); - temp_1 = vld1_u8(ptr + input_depth); - - ptr += input_row_size; - temp_2 = vld1_u8(ptr); - temp_3 = vld1_u8(ptr + input_depth); - - ptr += input_row_size; - temp_4 = vld1_u8(ptr); - temp_5 = vld1_u8(ptr + input_depth); - - input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_0)); - input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_1)); - input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_2)); - input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_3)); - input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_4)); - input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_5)); - - input_6 = vaddq_s16(input_6, input_offset_vec); - input_7 = vaddq_s16(input_7, input_offset_vec); - input_10 = vaddq_s16(input_10, input_offset_vec); - input_11 = vaddq_s16(input_11, input_offset_vec); - input_2 = vaddq_s16(input_2, input_offset_vec); - input_3 = vaddq_s16(input_3, input_offset_vec); - } - - DotProductAndStore2xStride1( - filter, input_6, input_7, input_4, input_5, input_10, input_11, input_8, - input_9, input_2, input_3, input_0, input_1, bias_ptr, output_offset, - output_multiplier, output_shift, output_activation_min, - output_activation_max, - output_ptr + 2 * output_depth + 7 * output_row_size, output_depth); - - // Slide left one more time for outputs x = [0, 1], y = 7. Referring to the - // indexes in the diagram above, this corresponds to outputs (56) and (57). 
- { - uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5; - - const uint8* ptr = input_ptr + 7 * input_row_size; - temp_0 = vld1_u8(ptr); - temp_1 = vld1_u8(ptr + input_depth); - - ptr += input_row_size; - temp_2 = vld1_u8(ptr); - temp_3 = vld1_u8(ptr + input_depth); - - ptr += input_row_size; - temp_4 = vld1_u8(ptr); - temp_5 = vld1_u8(ptr + input_depth); - - input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0)); - input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1)); - input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_2)); - input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_3)); - input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_4)); - input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_5)); - - input_4 = vaddq_s16(input_4, input_offset_vec); - input_5 = vaddq_s16(input_5, input_offset_vec); - input_8 = vaddq_s16(input_8, input_offset_vec); - input_9 = vaddq_s16(input_9, input_offset_vec); - input_0 = vaddq_s16(input_0, input_offset_vec); - input_1 = vaddq_s16(input_1, input_offset_vec); - } - - DotProductAndStore2xStride1( - filter, input_4, input_5, input_6, input_7, input_8, input_9, input_10, - input_11, input_0, input_1, input_2, input_3, bias_ptr, output_offset, - output_multiplier, output_shift, output_activation_min, - output_activation_max, output_ptr + 7 * output_row_size, output_depth); - } -}; - -template <> -struct ConvKernel3x3FilterDepth8<4, 4, 1, 1> { - static inline void Run(const uint8* input_ptr, int input_depth, - int32 input_offset, int input_row_size, - const uint8* filter_ptr, int32 filter_offset, - const int32* bias_ptr, int32 output_offset, - int32 output_multiplier, int output_shift, - int32 output_activation_min, - int32 output_activation_max, uint8* output_ptr, - int output_depth, int output_width) { - Filter3x3x8 filter = Load3x3Filter(filter_ptr, filter_offset, output_depth); - - const int16x8_t input_offset_vec = vdupq_n_s16(input_offset); - const int output_row_size = output_depth * output_width; - - // To process 4x4 outputs using a 3x3 filter, we require 6x6 inputs. - // Load inputs for the first 2 filters on the top left, then slide to - // the right, down, left, down, right, etc. in a snake-like path. This - // minimizes the total number of loads. - int16x8_t input_0, input_1, input_2, input_3, input_4, input_5, input_6, - input_7, input_8, input_9, input_10, input_11; - - // Load inputs for 1x2 outputs starting from the top left. 
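The specialization opened just above, like the <4, 2>, <4, 1>, <2, 2>, <2, 4> and <1, 4> ones further down, differs from the 8x8 case only in the output tile it produces per call; judging from the visible specializations, the first two template arguments are the tile height and width and the trailing two the strides. A hypothetical sketch of how a caller might tile an output plane with them (the real dispatch lives elsewhere in this file and may differ):

    #include <algorithm>

    // Hypothetical tiling loop: cover the interior with 8x8 tiles and fall
    // back to the smaller specializations at the right and bottom edges.
    void ForEachTile(int output_rows, int output_cols) {
      for (int y = 0; y < output_rows; y += 8) {
        for (int x = 0; x < output_cols; x += 8) {
          const int tile_h = std::min(8, output_rows - y);
          const int tile_w = std::min(8, output_cols - x);
          (void)tile_h;
          (void)tile_w;
          // A real caller would pick ConvKernel3x3FilterDepth8<tile_h, tile_w,
          // 1, 1>::Run here; since template arguments must be compile-time
          // constants, that pick is a switch over the shapes specialized above.
        }
      }
    }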
- { - uint8x8_t temp_0, temp_1, temp_2, temp_3; - - const uint8* ptr = input_ptr; - temp_0 = vld1_u8(ptr); - temp_1 = vld1_u8(ptr + input_depth); - temp_2 = vld1_u8(ptr + 2 * input_depth); - temp_3 = vld1_u8(ptr + 3 * input_depth); - - input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0)); - input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1)); - input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2)); - input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3)); - - input_0 = vaddq_s16(input_0, input_offset_vec); - input_1 = vaddq_s16(input_1, input_offset_vec); - input_2 = vaddq_s16(input_2, input_offset_vec); - input_3 = vaddq_s16(input_3, input_offset_vec); - - ptr += input_row_size; - temp_0 = vld1_u8(ptr); - temp_1 = vld1_u8(ptr + input_depth); - temp_2 = vld1_u8(ptr + 2 * input_depth); - temp_3 = vld1_u8(ptr + 3 * input_depth); - - input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0)); - input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1)); - input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_2)); - input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_3)); - - input_4 = vaddq_s16(input_4, input_offset_vec); - input_5 = vaddq_s16(input_5, input_offset_vec); - input_6 = vaddq_s16(input_6, input_offset_vec); - input_7 = vaddq_s16(input_7, input_offset_vec); - - ptr += input_row_size; - temp_0 = vld1_u8(ptr); - temp_1 = vld1_u8(ptr + input_depth); - temp_2 = vld1_u8(ptr + 2 * input_depth); - temp_3 = vld1_u8(ptr + 3 * input_depth); - - input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_0)); - input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_1)); - input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_2)); - input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_3)); - - input_8 = vaddq_s16(input_8, input_offset_vec); - input_9 = vaddq_s16(input_9, input_offset_vec); - input_10 = vaddq_s16(input_10, input_offset_vec); - input_11 = vaddq_s16(input_11, input_offset_vec); - } - - DotProductAndStore2xStride1( - filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6, - input_7, input_8, input_9, input_10, input_11, bias_ptr, output_offset, - output_multiplier, output_shift, output_activation_min, - output_activation_max, output_ptr, output_depth); - - // Now load 1x2 inputs on the top right. - { - uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5; - - const uint8* ptr = input_ptr + 4 * input_depth; - temp_0 = vld1_u8(ptr); - temp_1 = vld1_u8(ptr + input_depth); - - ptr += input_row_size; - temp_2 = vld1_u8(ptr); - temp_3 = vld1_u8(ptr + input_depth); - - ptr += input_row_size; - temp_4 = vld1_u8(ptr); - temp_5 = vld1_u8(ptr + input_depth); - - input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0)); - input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1)); - input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_2)); - input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_3)); - input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_4)); - input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_5)); - - input_0 = vaddq_s16(input_0, input_offset_vec); - input_1 = vaddq_s16(input_1, input_offset_vec); - input_4 = vaddq_s16(input_4, input_offset_vec); - input_5 = vaddq_s16(input_5, input_offset_vec); - input_8 = vaddq_s16(input_8, input_offset_vec); - input_9 = vaddq_s16(input_9, input_offset_vec); - } - - DotProductAndStore2xStride1( - filter, input_2, input_3, input_0, input_1, input_6, input_7, input_4, - input_5, input_10, input_11, input_8, input_9, bias_ptr, output_offset, - output_multiplier, output_shift, output_activation_min, - output_activation_max, output_ptr + 2 * output_depth, output_depth); - - // Now load next inputs when sliding window down. 
- { - uint8x8_t temp_0, temp_1, temp_2, temp_3; - - const uint8* ptr = input_ptr + 2 * input_depth + 3 * input_row_size; - temp_0 = vld1_u8(ptr); - temp_1 = vld1_u8(ptr + input_depth); - temp_2 = vld1_u8(ptr + 2 * input_depth); - temp_3 = vld1_u8(ptr + 3 * input_depth); - - input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_0)); - input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_1)); - input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_2)); - input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_3)); - - input_2 = vaddq_s16(input_2, input_offset_vec); - input_3 = vaddq_s16(input_3, input_offset_vec); - input_0 = vaddq_s16(input_0, input_offset_vec); - input_1 = vaddq_s16(input_1, input_offset_vec); - } - - DotProductAndStore2xStride1( - filter, input_6, input_7, input_4, input_5, input_10, input_11, input_8, - input_9, input_2, input_3, input_0, input_1, bias_ptr, output_offset, - output_multiplier, output_shift, output_activation_min, - output_activation_max, output_ptr + 2 * output_depth + output_row_size, - output_depth); - - // Now load next inputs when sliding window left. - { - uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5; - - const uint8* ptr = input_ptr + input_row_size; - temp_0 = vld1_u8(ptr); - temp_1 = vld1_u8(ptr + input_depth); - - ptr += input_row_size; - temp_2 = vld1_u8(ptr); - temp_3 = vld1_u8(ptr + input_depth); - - ptr += input_row_size; - temp_4 = vld1_u8(ptr); - temp_5 = vld1_u8(ptr + input_depth); - - input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0)); - input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1)); - input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_2)); - input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_3)); - input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_4)); - input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_5)); - - input_4 = vaddq_s16(input_4, input_offset_vec); - input_5 = vaddq_s16(input_5, input_offset_vec); - input_8 = vaddq_s16(input_8, input_offset_vec); - input_9 = vaddq_s16(input_9, input_offset_vec); - input_0 = vaddq_s16(input_0, input_offset_vec); - input_1 = vaddq_s16(input_1, input_offset_vec); - } - - DotProductAndStore2xStride1( - filter, input_4, input_5, input_6, input_7, input_8, input_9, input_10, - input_11, input_0, input_1, input_2, input_3, bias_ptr, output_offset, - output_multiplier, output_shift, output_activation_min, - output_activation_max, output_ptr + output_row_size, output_depth); - - // Now load next inputs when sliding window down. - { - uint8x8_t temp_0, temp_1, temp_2, temp_3; - - const uint8* ptr = input_ptr + 4 * input_row_size; - temp_0 = vld1_u8(ptr); - temp_1 = vld1_u8(ptr + input_depth); - temp_2 = vld1_u8(ptr + 2 * input_depth); - temp_3 = vld1_u8(ptr + 3 * input_depth); - - input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0)); - input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1)); - input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_2)); - input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_3)); - - input_4 = vaddq_s16(input_4, input_offset_vec); - input_5 = vaddq_s16(input_5, input_offset_vec); - input_6 = vaddq_s16(input_6, input_offset_vec); - input_7 = vaddq_s16(input_7, input_offset_vec); - } - - DotProductAndStore2xStride1( - filter, input_8, input_9, input_10, input_11, input_0, input_1, input_2, - input_3, input_4, input_5, input_6, input_7, bias_ptr, output_offset, - output_multiplier, output_shift, output_activation_min, - output_activation_max, output_ptr + 2 * output_row_size, output_depth); - - // Now load next inputs when sliding window right. 
- { - uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5; - - const uint8* ptr = input_ptr + 4 * input_depth + 2 * input_row_size; - temp_0 = vld1_u8(ptr); - temp_1 = vld1_u8(ptr + input_depth); - - ptr += input_row_size; - temp_2 = vld1_u8(ptr); - temp_3 = vld1_u8(ptr + input_depth); - - ptr += input_row_size; - temp_4 = vld1_u8(ptr); - temp_5 = vld1_u8(ptr + input_depth); - - input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_0)); - input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_1)); - input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_2)); - input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_3)); - input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4)); - input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5)); - - input_8 = vaddq_s16(input_8, input_offset_vec); - input_9 = vaddq_s16(input_9, input_offset_vec); - input_0 = vaddq_s16(input_0, input_offset_vec); - input_1 = vaddq_s16(input_1, input_offset_vec); - input_4 = vaddq_s16(input_4, input_offset_vec); - input_5 = vaddq_s16(input_5, input_offset_vec); - } - - DotProductAndStore2xStride1( - filter, input_10, input_11, input_8, input_9, input_2, input_3, input_0, - input_1, input_6, input_7, input_4, input_5, bias_ptr, output_offset, - output_multiplier, output_shift, output_activation_min, - output_activation_max, - output_ptr + 2 * output_depth + 2 * output_row_size, output_depth); - - // Now load next inputs when sliding window down. - { - uint8x8_t temp_0, temp_1, temp_2, temp_3; - - const uint8* ptr = input_ptr + 2 * input_depth + 5 * input_row_size; - temp_0 = vld1_u8(ptr); - temp_1 = vld1_u8(ptr + input_depth); - temp_2 = vld1_u8(ptr + 2 * input_depth); - temp_3 = vld1_u8(ptr + 3 * input_depth); - - input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_0)); - input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_1)); - input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_2)); - input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_3)); - - input_10 = vaddq_s16(input_10, input_offset_vec); - input_11 = vaddq_s16(input_11, input_offset_vec); - input_8 = vaddq_s16(input_8, input_offset_vec); - input_9 = vaddq_s16(input_9, input_offset_vec); - } - - DotProductAndStore2xStride1( - filter, input_2, input_3, input_0, input_1, input_6, input_7, input_4, - input_5, input_10, input_11, input_8, input_9, bias_ptr, output_offset, - output_multiplier, output_shift, output_activation_min, - output_activation_max, - output_ptr + 2 * output_depth + 3 * output_row_size, output_depth); - - // Now load next inputs when sliding window left. 
- { - uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5; - - const uint8* ptr = input_ptr + 3 * input_row_size; - temp_0 = vld1_u8(ptr); - temp_1 = vld1_u8(ptr + input_depth); - - ptr += input_row_size; - temp_2 = vld1_u8(ptr); - temp_3 = vld1_u8(ptr + input_depth); - - ptr += input_row_size; - temp_4 = vld1_u8(ptr); - temp_5 = vld1_u8(ptr + input_depth); - - input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0)); - input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1)); - input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_2)); - input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_3)); - input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_4)); - input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_5)); - - input_0 = vaddq_s16(input_0, input_offset_vec); - input_1 = vaddq_s16(input_1, input_offset_vec); - input_4 = vaddq_s16(input_4, input_offset_vec); - input_5 = vaddq_s16(input_5, input_offset_vec); - input_8 = vaddq_s16(input_8, input_offset_vec); - input_9 = vaddq_s16(input_9, input_offset_vec); - } - - DotProductAndStore2xStride1( - filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6, - input_7, input_8, input_9, input_10, input_11, bias_ptr, output_offset, - output_multiplier, output_shift, output_activation_min, - output_activation_max, output_ptr + 3 * output_row_size, output_depth); - } -}; - -template <> -struct ConvKernel3x3FilterDepth8<4, 2, 1, 1> { - static inline void Run(const uint8* input_ptr, int input_depth, - int32 input_offset, int input_row_size, - const uint8* filter_ptr, int32 filter_offset, - const int32* bias_ptr, int32 output_offset, - int32 output_multiplier, int output_shift, - int32 output_activation_min, - int32 output_activation_max, uint8* output_ptr, - int output_depth, int output_width) { - Filter3x3x8 filter = Load3x3Filter(filter_ptr, filter_offset, output_depth); - - const int16x8_t input_offset_vec = vdupq_n_s16(input_offset); - const int output_row_size = output_depth * output_width; - - int16x8_t input_0, input_1, input_2, input_3, input_4, input_5, input_6, - input_7, input_8, input_9, input_10, input_11; - - // Load inputs for 1x2 outputs starting from the top. 
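Load3x3Filter is called once per Run and its result is reused for every output in the tile; from the call sites it presumably gathers the nine eight-channel filter taps, widens them to int16 and folds in filter_offset, the same treatment the inputs get. A scalar sketch under the assumption that the depthwise filter is stored with depth innermost (the layout and the struct are assumptions, not the file's own definitions):

    #include <cstdint>

    // Presumed shape of the cache built by Load3x3Filter: nine depth-8 taps,
    // widened to int16 with the filter zero-point folded in.
    struct Filter3x3x8Sketch {
      int16_t taps[9][8];
    };

    inline Filter3x3x8Sketch LoadFilterSketch(const uint8_t* filter_ptr,
                                              int32_t filter_offset, int depth) {
      Filter3x3x8Sketch f;
      for (int tap = 0; tap < 9; ++tap) {
        for (int c = 0; c < 8; ++c) {
          f.taps[tap][c] = static_cast<int16_t>(
              static_cast<int32_t>(filter_ptr[tap * depth + c]) + filter_offset);
        }
      }
      return f;
    }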
- { - uint8x8_t temp_0, temp_1, temp_2, temp_3; - - const uint8* ptr = input_ptr; - temp_0 = vld1_u8(ptr); - temp_1 = vld1_u8(ptr + input_depth); - temp_2 = vld1_u8(ptr + 2 * input_depth); - temp_3 = vld1_u8(ptr + 3 * input_depth); - - input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0)); - input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1)); - input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2)); - input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3)); - - input_0 = vaddq_s16(input_0, input_offset_vec); - input_1 = vaddq_s16(input_1, input_offset_vec); - input_2 = vaddq_s16(input_2, input_offset_vec); - input_3 = vaddq_s16(input_3, input_offset_vec); - - ptr += input_row_size; - temp_0 = vld1_u8(ptr); - temp_1 = vld1_u8(ptr + input_depth); - temp_2 = vld1_u8(ptr + 2 * input_depth); - temp_3 = vld1_u8(ptr + 3 * input_depth); - - input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0)); - input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1)); - input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_2)); - input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_3)); - - input_4 = vaddq_s16(input_4, input_offset_vec); - input_5 = vaddq_s16(input_5, input_offset_vec); - input_6 = vaddq_s16(input_6, input_offset_vec); - input_7 = vaddq_s16(input_7, input_offset_vec); - - ptr += input_row_size; - temp_0 = vld1_u8(ptr); - temp_1 = vld1_u8(ptr + input_depth); - temp_2 = vld1_u8(ptr + 2 * input_depth); - temp_3 = vld1_u8(ptr + 3 * input_depth); - - input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_0)); - input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_1)); - input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_2)); - input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_3)); - - input_8 = vaddq_s16(input_8, input_offset_vec); - input_9 = vaddq_s16(input_9, input_offset_vec); - input_10 = vaddq_s16(input_10, input_offset_vec); - input_11 = vaddq_s16(input_11, input_offset_vec); - } - - DotProductAndStore2xStride1( - filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6, - input_7, input_8, input_9, input_10, input_11, bias_ptr, output_offset, - output_multiplier, output_shift, output_activation_min, - output_activation_max, output_ptr, output_depth); - - output_ptr += output_row_size; - - // Now load next inputs one row down. - { - uint8x8_t temp_0, temp_1, temp_2, temp_3; - - const uint8* ptr = input_ptr + 3 * input_row_size; - temp_0 = vld1_u8(ptr); - temp_1 = vld1_u8(ptr + input_depth); - temp_2 = vld1_u8(ptr + 2 * input_depth); - temp_3 = vld1_u8(ptr + 3 * input_depth); - - input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0)); - input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1)); - input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2)); - input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3)); - - input_0 = vaddq_s16(input_0, input_offset_vec); - input_1 = vaddq_s16(input_1, input_offset_vec); - input_2 = vaddq_s16(input_2, input_offset_vec); - input_3 = vaddq_s16(input_3, input_offset_vec); - } - - DotProductAndStore2xStride1( - filter, input_4, input_5, input_6, input_7, input_8, input_9, input_10, - input_11, input_0, input_1, input_2, input_3, bias_ptr, output_offset, - output_multiplier, output_shift, output_activation_min, - output_activation_max, output_ptr, output_depth); - - output_ptr += output_row_size; - - // Now load next row. 
- { - uint8x8_t temp_0, temp_1, temp_2, temp_3; - - const uint8* ptr = input_ptr + 4 * input_row_size; - temp_0 = vld1_u8(ptr); - temp_1 = vld1_u8(ptr + input_depth); - temp_2 = vld1_u8(ptr + 2 * input_depth); - temp_3 = vld1_u8(ptr + 3 * input_depth); - - input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0)); - input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1)); - input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_2)); - input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_3)); - - input_4 = vaddq_s16(input_4, input_offset_vec); - input_5 = vaddq_s16(input_5, input_offset_vec); - input_6 = vaddq_s16(input_6, input_offset_vec); - input_7 = vaddq_s16(input_7, input_offset_vec); - } - - DotProductAndStore2xStride1( - filter, input_8, input_9, input_10, input_11, input_0, input_1, input_2, - input_3, input_4, input_5, input_6, input_7, bias_ptr, output_offset, - output_multiplier, output_shift, output_activation_min, - output_activation_max, output_ptr, output_depth); - - output_ptr += output_row_size; - - // Now load last row. - { - uint8x8_t temp_0, temp_1, temp_2, temp_3; - - const uint8* ptr = input_ptr + 5 * input_row_size; - temp_0 = vld1_u8(ptr); - temp_1 = vld1_u8(ptr + input_depth); - temp_2 = vld1_u8(ptr + 2 * input_depth); - temp_3 = vld1_u8(ptr + 3 * input_depth); - - input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_0)); - input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_1)); - input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_2)); - input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_3)); - - input_8 = vaddq_s16(input_8, input_offset_vec); - input_9 = vaddq_s16(input_9, input_offset_vec); - input_10 = vaddq_s16(input_10, input_offset_vec); - input_11 = vaddq_s16(input_11, input_offset_vec); - } - - DotProductAndStore2xStride1( - filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6, - input_7, input_8, input_9, input_10, input_11, bias_ptr, output_offset, - output_multiplier, output_shift, output_activation_min, - output_activation_max, output_ptr, output_depth); - } -}; - -template <> -struct ConvKernel3x3FilterDepth8<4, 1, 1, 1> { - static inline void Run(const uint8* input_ptr, int input_depth, - int32 input_offset, int input_row_size, - const uint8* filter_ptr, int32 filter_offset, - const int32* bias_ptr, int32 output_offset, - int32 output_multiplier, int output_shift, - int32 output_activation_min, - int32 output_activation_max, uint8* output_ptr, - int output_depth, int output_width) { - Filter3x3x8 filter = Load3x3Filter(filter_ptr, filter_offset, output_depth); - - const int16x8_t input_offset_vec = vdupq_n_s16(input_offset); - const int output_row_size = output_depth * output_width; - - int16x8_t input_0, input_1, input_2, input_3, input_4, input_5, input_6, - input_7, input_8, input_9, input_10, input_11; - - // Load inputs for 2x1 outputs starting from the top. 
- { - uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5; - - const uint8* ptr = input_ptr; - temp_0 = vld1_u8(ptr); - temp_1 = vld1_u8(ptr + input_depth); - temp_2 = vld1_u8(ptr + 2 * input_depth); - ptr += input_row_size; - temp_3 = vld1_u8(ptr); - temp_4 = vld1_u8(ptr + input_depth); - temp_5 = vld1_u8(ptr + 2 * input_depth); - - input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0)); - input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1)); - input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2)); - input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3)); - input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4)); - input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5)); - - input_0 = vaddq_s16(input_0, input_offset_vec); - input_1 = vaddq_s16(input_1, input_offset_vec); - input_2 = vaddq_s16(input_2, input_offset_vec); - input_3 = vaddq_s16(input_3, input_offset_vec); - input_4 = vaddq_s16(input_4, input_offset_vec); - input_5 = vaddq_s16(input_5, input_offset_vec); - - ptr += input_row_size; - temp_0 = vld1_u8(ptr); - temp_1 = vld1_u8(ptr + input_depth); - temp_2 = vld1_u8(ptr + 2 * input_depth); - ptr += input_row_size; - temp_3 = vld1_u8(ptr); - temp_4 = vld1_u8(ptr + input_depth); - temp_5 = vld1_u8(ptr + 2 * input_depth); - - input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_0)); - input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_1)); - input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_2)); - input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_3)); - input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_4)); - input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_5)); - - input_6 = vaddq_s16(input_6, input_offset_vec); - input_7 = vaddq_s16(input_7, input_offset_vec); - input_8 = vaddq_s16(input_8, input_offset_vec); - input_9 = vaddq_s16(input_9, input_offset_vec); - input_10 = vaddq_s16(input_10, input_offset_vec); - input_11 = vaddq_s16(input_11, input_offset_vec); - } - - DotProductAndStore2yStride1( - filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6, - input_7, input_8, input_9, input_10, input_11, bias_ptr, output_offset, - output_multiplier, output_shift, output_activation_min, - output_activation_max, output_ptr, output_row_size); - - // Load inputs for bottom 2 rows. 
- { - uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5; - - const uint8* ptr = input_ptr + 4 * input_row_size; - temp_0 = vld1_u8(ptr); - temp_1 = vld1_u8(ptr + input_depth); - temp_2 = vld1_u8(ptr + 2 * input_depth); - ptr += input_row_size; - temp_3 = vld1_u8(ptr); - temp_4 = vld1_u8(ptr + input_depth); - temp_5 = vld1_u8(ptr + 2 * input_depth); - - input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0)); - input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1)); - input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2)); - input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3)); - input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4)); - input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5)); - - input_0 = vaddq_s16(input_0, input_offset_vec); - input_1 = vaddq_s16(input_1, input_offset_vec); - input_2 = vaddq_s16(input_2, input_offset_vec); - input_3 = vaddq_s16(input_3, input_offset_vec); - input_4 = vaddq_s16(input_4, input_offset_vec); - input_5 = vaddq_s16(input_5, input_offset_vec); - } - - DotProductAndStore2yStride1( - filter, input_6, input_7, input_8, input_9, input_10, input_11, input_0, - input_1, input_2, input_3, input_4, input_5, bias_ptr, output_offset, - output_multiplier, output_shift, output_activation_min, - output_activation_max, output_ptr + 2 * output_row_size, - output_row_size); - } -}; - -template <> -struct ConvKernel3x3FilterDepth8<2, 2, 1, 1> { - static inline void Run(const uint8* input_ptr, int input_depth, - int32 input_offset, int input_row_size, - const uint8* filter_ptr, int32 filter_offset, - const int32* bias_ptr, int32 output_offset, - int32 output_multiplier, int output_shift, - int32 output_activation_min, - int32 output_activation_max, uint8* output_ptr, - int output_depth, int output_width) { - Filter3x3x8 filter = Load3x3Filter(filter_ptr, filter_offset, output_depth); - - Int32x8 acc_0, acc_1, acc_2, acc_3; - - acc_0.low = vld1q_s32(bias_ptr); - acc_1.low = vld1q_s32(bias_ptr); - acc_2.low = vld1q_s32(bias_ptr); - acc_3.low = vld1q_s32(bias_ptr); - - bias_ptr += 4; - acc_0.high = vld1q_s32(bias_ptr); - acc_1.high = vld1q_s32(bias_ptr); - acc_2.high = vld1q_s32(bias_ptr); - acc_3.high = vld1q_s32(bias_ptr); - - const int16x8_t input_offset_vec = vdupq_n_s16(input_offset); - - // Add scope for input registers to help the compiler know that it is - // not needed. - { - // To process 2x2 outputs using a 3x3 filter, we require 4x4 inputs. - // Load inputs for the top two filters first. - int16x8_t input_0, input_1, input_2, input_3, input_4, input_5, input_6, - input_7, input_8, input_9, input_10, input_11; - - const uint8* ptr = input_ptr; - - // Load top 3 rows. 
- { - uint8x8_t temp_0, temp_1, temp_2, temp_3; - - temp_0 = vld1_u8(ptr); - temp_1 = vld1_u8(ptr + input_depth); - temp_2 = vld1_u8(ptr + 2 * input_depth); - temp_3 = vld1_u8(ptr + 3 * input_depth); - - input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0)); - input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1)); - input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2)); - input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3)); - - input_0 = vaddq_s16(input_0, input_offset_vec); - input_1 = vaddq_s16(input_1, input_offset_vec); - input_2 = vaddq_s16(input_2, input_offset_vec); - input_3 = vaddq_s16(input_3, input_offset_vec); - - ptr += input_row_size; - temp_0 = vld1_u8(ptr); - temp_1 = vld1_u8(ptr + input_depth); - temp_2 = vld1_u8(ptr + 2 * input_depth); - temp_3 = vld1_u8(ptr + 3 * input_depth); - - input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0)); - input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1)); - input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_2)); - input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_3)); - - input_4 = vaddq_s16(input_4, input_offset_vec); - input_5 = vaddq_s16(input_5, input_offset_vec); - input_6 = vaddq_s16(input_6, input_offset_vec); - input_7 = vaddq_s16(input_7, input_offset_vec); - - ptr += input_row_size; - temp_0 = vld1_u8(ptr); - temp_1 = vld1_u8(ptr + input_depth); - temp_2 = vld1_u8(ptr + 2 * input_depth); - temp_3 = vld1_u8(ptr + 3 * input_depth); - - input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_0)); - input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_1)); - input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_2)); - input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_3)); - - input_8 = vaddq_s16(input_8, input_offset_vec); - input_9 = vaddq_s16(input_9, input_offset_vec); - input_10 = vaddq_s16(input_10, input_offset_vec); - input_11 = vaddq_s16(input_11, input_offset_vec); - } - - // Multiply-accum for top-left output. - acc_0 = MultiplyAccumulate3x3Filter(filter, input_0, input_1, input_2, - input_4, input_5, input_6, input_8, - input_9, input_10, acc_0); - - // Multiply-accum for top-right output. - acc_1 = MultiplyAccumulate3x3Filter(filter, input_1, input_2, input_3, - input_5, input_6, input_7, input_9, - input_10, input_11, acc_1); - - // Now load the bottom row. - { - uint8x8_t temp_0, temp_1, temp_2, temp_3; - - ptr += input_row_size; - temp_0 = vld1_u8(ptr); - temp_1 = vld1_u8(ptr + input_depth); - temp_2 = vld1_u8(ptr + 2 * input_depth); - temp_3 = vld1_u8(ptr + 3 * input_depth); - - input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0)); - input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1)); - input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2)); - input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3)); - - input_0 = vaddq_s16(input_0, input_offset_vec); - input_1 = vaddq_s16(input_1, input_offset_vec); - input_2 = vaddq_s16(input_2, input_offset_vec); - input_3 = vaddq_s16(input_3, input_offset_vec); - } +// clang-format off - // Multiply-accum for bottom-left output. - acc_2 = MultiplyAccumulate3x3Filter(filter, input_4, input_5, input_6, - input_8, input_9, input_10, input_0, - input_1, input_2, acc_2); - - // Multiply-accum for bottom-right output. 
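MultiplyAccumulate3x3Filter takes the nine offset-adjusted input vectors framing one output point plus the cached filter, and accumulates into 32-bit lanes that were pre-seeded with the bias above. Per depth lane it is a plain 9-tap multiply-accumulate; a scalar picture inferred from the call sites (names are illustrative):

    #include <cstdint>

    // One output point, eight depth channels: acc[c] += sum over the 9 taps of
    // filter[tap][c] * input[tap][c], all in 32-bit.
    inline void MacOneOutput(const int16_t filter[9][8],
                             const int16_t inputs[9][8], int32_t acc[8]) {
      for (int c = 0; c < 8; ++c) {
        for (int tap = 0; tap < 9; ++tap) {
          acc[c] += static_cast<int32_t>(filter[tap][c]) *
                    static_cast<int32_t>(inputs[tap][c]);
        }
      }
    }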
- acc_3 = MultiplyAccumulate3x3Filter(filter, input_5, input_6, input_7, - input_9, input_10, input_11, input_1, - input_2, input_3, acc_3); - } - - DownquantizeAndStore2x2Output(acc_0, acc_1, acc_2, acc_3, output_offset, - output_multiplier, output_shift, - output_activation_min, output_activation_max, - output_ptr, output_depth, output_width); - } -}; - -template <> -struct ConvKernel3x3FilterDepth8<2, 4, 1, 1> { - static inline void Run(const uint8* input_ptr, int input_depth, - int32 input_offset, int input_row_size, - const uint8* filter_ptr, int32 filter_offset, - const int32* bias_ptr, int32 output_offset, - int32 output_multiplier, int output_shift, - int32 output_activation_min, - int32 output_activation_max, uint8* output_ptr, - int output_depth, int output_width) { - Filter3x3x8 filter = Load3x3Filter(filter_ptr, filter_offset, output_depth); - - const int16x8_t input_offset_vec = vdupq_n_s16(input_offset); - const int output_row_size = output_depth * output_width; - - int16x8_t input_0, input_1, input_2, input_3, input_4, input_5, input_6, - input_7, input_8, input_9, input_10, input_11; - - // Load inputs for 1x2 outputs starting from the top left. - { - uint8x8_t temp_0, temp_1, temp_2, temp_3; - - const uint8* ptr = input_ptr; - temp_0 = vld1_u8(ptr); - temp_1 = vld1_u8(ptr + input_depth); - temp_2 = vld1_u8(ptr + 2 * input_depth); - temp_3 = vld1_u8(ptr + 3 * input_depth); - - input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0)); - input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1)); - input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2)); - input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3)); - - input_0 = vaddq_s16(input_0, input_offset_vec); - input_1 = vaddq_s16(input_1, input_offset_vec); - input_2 = vaddq_s16(input_2, input_offset_vec); - input_3 = vaddq_s16(input_3, input_offset_vec); - - ptr += input_row_size; - temp_0 = vld1_u8(ptr); - temp_1 = vld1_u8(ptr + input_depth); - temp_2 = vld1_u8(ptr + 2 * input_depth); - temp_3 = vld1_u8(ptr + 3 * input_depth); - - input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0)); - input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1)); - input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_2)); - input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_3)); - - input_4 = vaddq_s16(input_4, input_offset_vec); - input_5 = vaddq_s16(input_5, input_offset_vec); - input_6 = vaddq_s16(input_6, input_offset_vec); - input_7 = vaddq_s16(input_7, input_offset_vec); - - ptr += input_row_size; - temp_0 = vld1_u8(ptr); - temp_1 = vld1_u8(ptr + input_depth); - temp_2 = vld1_u8(ptr + 2 * input_depth); - temp_3 = vld1_u8(ptr + 3 * input_depth); - - input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_0)); - input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_1)); - input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_2)); - input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_3)); - - input_8 = vaddq_s16(input_8, input_offset_vec); - input_9 = vaddq_s16(input_9, input_offset_vec); - input_10 = vaddq_s16(input_10, input_offset_vec); - input_11 = vaddq_s16(input_11, input_offset_vec); - } - - DotProductAndStore2xStride1( - filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6, - input_7, input_8, input_9, input_10, input_11, bias_ptr, output_offset, - output_multiplier, output_shift, output_activation_min, - output_activation_max, output_ptr, output_depth); - - // Now load 1x2 inputs on the top right. 
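The DownquantizeAndStore2x2Output call above is the usual TFLite uint8 requantization: scale the 32-bit accumulator back into the output's quantized domain with output_multiplier and output_shift, add the output zero point, clamp to the fused activation range, and narrow to uint8. A scalar sketch of one lane, using a plain rounding shift where the real code relies on gemmlowp-style fixed-point rounding helpers:

    #include <algorithm>
    #include <cstdint>

    // Approximate requantization of one accumulator lane; output_multiplier is
    // treated as a Q0.31 fixed-point value and output_shift as a right shift.
    inline uint8_t RequantizeOne(int32_t acc, int32_t output_multiplier,
                                 int output_shift, int32_t output_offset,
                                 int32_t activation_min, int32_t activation_max) {
      const int total_shift = 31 + output_shift;
      int64_t scaled = static_cast<int64_t>(acc) * output_multiplier;
      scaled = (scaled + (int64_t{1} << (total_shift - 1))) >> total_shift;
      int32_t result = static_cast<int32_t>(scaled) + output_offset;
      result = std::min(activation_max, std::max(activation_min, result));
      return static_cast<uint8_t>(result);
    }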
- { - uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5; - - const uint8* ptr = input_ptr + 4 * input_depth; - temp_0 = vld1_u8(ptr); - temp_1 = vld1_u8(ptr + input_depth); - - ptr += input_row_size; - temp_2 = vld1_u8(ptr); - temp_3 = vld1_u8(ptr + input_depth); - - ptr += input_row_size; - temp_4 = vld1_u8(ptr); - temp_5 = vld1_u8(ptr + input_depth); - - input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0)); - input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1)); - input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_2)); - input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_3)); - input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_4)); - input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_5)); - - input_0 = vaddq_s16(input_0, input_offset_vec); - input_1 = vaddq_s16(input_1, input_offset_vec); - input_4 = vaddq_s16(input_4, input_offset_vec); - input_5 = vaddq_s16(input_5, input_offset_vec); - input_8 = vaddq_s16(input_8, input_offset_vec); - input_9 = vaddq_s16(input_9, input_offset_vec); - } - - DotProductAndStore2xStride1( - filter, input_2, input_3, input_0, input_1, input_6, input_7, input_4, - input_5, input_10, input_11, input_8, input_9, bias_ptr, output_offset, - output_multiplier, output_shift, output_activation_min, - output_activation_max, output_ptr + 2 * output_depth, output_depth); - - // Now load next inputs when sliding window down. - { - uint8x8_t temp_0, temp_1, temp_2, temp_3; - - const uint8* ptr = input_ptr + 2 * input_depth + 3 * input_row_size; - temp_0 = vld1_u8(ptr); - temp_1 = vld1_u8(ptr + input_depth); - temp_2 = vld1_u8(ptr + 2 * input_depth); - temp_3 = vld1_u8(ptr + 3 * input_depth); - - input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_0)); - input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_1)); - input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_2)); - input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_3)); - - input_2 = vaddq_s16(input_2, input_offset_vec); - input_3 = vaddq_s16(input_3, input_offset_vec); - input_0 = vaddq_s16(input_0, input_offset_vec); - input_1 = vaddq_s16(input_1, input_offset_vec); - } - - DotProductAndStore2xStride1( - filter, input_6, input_7, input_4, input_5, input_10, input_11, input_8, - input_9, input_2, input_3, input_0, input_1, bias_ptr, output_offset, - output_multiplier, output_shift, output_activation_min, - output_activation_max, output_ptr + 2 * output_depth + output_row_size, - output_depth); - - // Now load next inputs when sliding window left. 
- { - uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5; - - const uint8* ptr = input_ptr + input_row_size; - temp_0 = vld1_u8(ptr); - temp_1 = vld1_u8(ptr + input_depth); - - ptr += input_row_size; - temp_2 = vld1_u8(ptr); - temp_3 = vld1_u8(ptr + input_depth); - - ptr += input_row_size; - temp_4 = vld1_u8(ptr); - temp_5 = vld1_u8(ptr + input_depth); - - input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0)); - input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1)); - input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_2)); - input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_3)); - input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_4)); - input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_5)); - - input_4 = vaddq_s16(input_4, input_offset_vec); - input_5 = vaddq_s16(input_5, input_offset_vec); - input_8 = vaddq_s16(input_8, input_offset_vec); - input_9 = vaddq_s16(input_9, input_offset_vec); - input_0 = vaddq_s16(input_0, input_offset_vec); - input_1 = vaddq_s16(input_1, input_offset_vec); - } - - DotProductAndStore2xStride1( - filter, input_4, input_5, input_6, input_7, input_8, input_9, input_10, - input_11, input_0, input_1, input_2, input_3, bias_ptr, output_offset, - output_multiplier, output_shift, output_activation_min, - output_activation_max, output_ptr + output_row_size, output_depth); - } -}; - -template <> -struct ConvKernel3x3FilterDepth8<1, 4, 1, 1> { - static inline void Run(const uint8* input_ptr, int input_depth, - int32 input_offset, int input_row_size, - const uint8* filter_ptr, int32 filter_offset, - const int32* bias_ptr, int32 output_offset, - int32 output_multiplier, int output_shift, - int32 output_activation_min, - int32 output_activation_max, uint8* output_ptr, - int output_depth, int output_width) { - Filter3x3x8 filter = Load3x3Filter(filter_ptr, filter_offset, output_depth); - - const int16x8_t input_offset_vec = vdupq_n_s16(input_offset); - - int16x8_t input_0, input_1, input_2, input_3, input_4, input_5, input_6, - input_7, input_8, input_9, input_10, input_11; - - // Load inputs for 1x2 outputs starting from the left. 
- { - uint8x8_t temp_0, temp_1, temp_2, temp_3; - - const uint8* ptr = input_ptr; - temp_0 = vld1_u8(ptr); - temp_1 = vld1_u8(ptr + input_depth); - temp_2 = vld1_u8(ptr + 2 * input_depth); - temp_3 = vld1_u8(ptr + 3 * input_depth); - - input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0)); - input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1)); - input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2)); - input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3)); - - input_0 = vaddq_s16(input_0, input_offset_vec); - input_1 = vaddq_s16(input_1, input_offset_vec); - input_2 = vaddq_s16(input_2, input_offset_vec); - input_3 = vaddq_s16(input_3, input_offset_vec); - - ptr += input_row_size; - temp_0 = vld1_u8(ptr); - temp_1 = vld1_u8(ptr + input_depth); - temp_2 = vld1_u8(ptr + 2 * input_depth); - temp_3 = vld1_u8(ptr + 3 * input_depth); - - input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0)); - input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1)); - input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_2)); - input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_3)); - - input_4 = vaddq_s16(input_4, input_offset_vec); - input_5 = vaddq_s16(input_5, input_offset_vec); - input_6 = vaddq_s16(input_6, input_offset_vec); - input_7 = vaddq_s16(input_7, input_offset_vec); - - ptr += input_row_size; - temp_0 = vld1_u8(ptr); - temp_1 = vld1_u8(ptr + input_depth); - temp_2 = vld1_u8(ptr + 2 * input_depth); - temp_3 = vld1_u8(ptr + 3 * input_depth); - - input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_0)); - input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_1)); - input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_2)); - input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_3)); - - input_8 = vaddq_s16(input_8, input_offset_vec); - input_9 = vaddq_s16(input_9, input_offset_vec); - input_10 = vaddq_s16(input_10, input_offset_vec); - input_11 = vaddq_s16(input_11, input_offset_vec); - } - - DotProductAndStore2xStride1( - filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6, - input_7, input_8, input_9, input_10, input_11, bias_ptr, output_offset, - output_multiplier, output_shift, output_activation_min, - output_activation_max, output_ptr, output_depth); - - // Now load 1x2 inputs on the right. 
- { - uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5; - - const uint8* ptr = input_ptr + input_depth * 4; - temp_0 = vld1_u8(ptr); - temp_1 = vld1_u8(ptr + input_depth); - - ptr += input_row_size; - temp_2 = vld1_u8(ptr); - temp_3 = vld1_u8(ptr + input_depth); - - ptr += input_row_size; - temp_4 = vld1_u8(ptr); - temp_5 = vld1_u8(ptr + input_depth); - - input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0)); - input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1)); - input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_2)); - input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_3)); - input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_4)); - input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_5)); - - input_0 = vaddq_s16(input_0, input_offset_vec); - input_1 = vaddq_s16(input_1, input_offset_vec); - input_4 = vaddq_s16(input_4, input_offset_vec); - input_5 = vaddq_s16(input_5, input_offset_vec); - input_8 = vaddq_s16(input_8, input_offset_vec); - input_9 = vaddq_s16(input_9, input_offset_vec); - } +#define DEPTHWISECONV_SHUFFLE_WORKSPACE_SIZE 10 * 10 * 64 - DotProductAndStore2xStride1( - filter, input_2, input_3, input_0, input_1, input_6, input_7, input_4, - input_5, input_10, input_11, input_8, input_9, bias_ptr, output_offset, - output_multiplier, output_shift, output_activation_min, - output_activation_max, output_ptr + 2 * output_depth, output_depth); - } +// Encapsulates constant parameters used in DepthwiseConv. +// 64-bit is used for types that will be added to 64-bit addresses in asm. +struct DepthwiseConvParams { + int64_t input_depth; + int64_t input_row_size; + int64_t output_depth; + int64_t output_row_size; + int64_t filter_row_size; + int32 input_offset; + int32 output_offset; + int32 filter_offset; + int32 output_multiplier; + int32 output_activation_min; + int32 output_activation_max; + int32 output_right_shift; + int32 input_width; + int32 input_height; + int32 stride_width; + int32 stride_height; + int32 output_width; + int32 output_height; }; -template <> -struct ConvKernel3x3FilterDepth8<2, 1, 1, 1> { - static inline void Run(const uint8* input_ptr, int input_depth, - int32 input_offset, int input_row_size, - const uint8* filter_ptr, int32 filter_offset, - const int32* bias_ptr, int32 output_offset, - int32 output_multiplier, int output_shift, - int32 output_activation_min, - int32 output_activation_max, uint8* output_ptr, - int output_depth, int output_width) { - Filter3x3x8 filter = Load3x3Filter(filter_ptr, filter_offset, output_depth); - - // To process 2x1 outputs using a 3x3 filter, we require 4x3 inputs. - // Load all inputs at the beginning. - int16x8_t input_0, input_1, input_2, input_3, input_4, input_5, input_6, - input_7, input_8, input_9, input_10, input_11; - - // Load inputs for 1x2 outputs starting from the top left. 
- { - const int16x8_t input_offset_vec = vdupq_n_s16(input_offset); - uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5; - - const uint8* ptr = input_ptr; - temp_0 = vld1_u8(ptr); - temp_1 = vld1_u8(ptr + input_depth); - temp_2 = vld1_u8(ptr + 2 * input_depth); - ptr += input_row_size; - temp_3 = vld1_u8(ptr); - temp_4 = vld1_u8(ptr + input_depth); - temp_5 = vld1_u8(ptr + 2 * input_depth); - - input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0)); - input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1)); - input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2)); - input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3)); - input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4)); - input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5)); - - input_0 = vaddq_s16(input_0, input_offset_vec); - input_1 = vaddq_s16(input_1, input_offset_vec); - input_2 = vaddq_s16(input_2, input_offset_vec); - input_3 = vaddq_s16(input_3, input_offset_vec); - input_4 = vaddq_s16(input_4, input_offset_vec); - input_5 = vaddq_s16(input_5, input_offset_vec); - - ptr += input_row_size; - temp_0 = vld1_u8(ptr); - temp_1 = vld1_u8(ptr + input_depth); - temp_2 = vld1_u8(ptr + 2 * input_depth); - ptr += input_row_size; - temp_3 = vld1_u8(ptr); - temp_4 = vld1_u8(ptr + input_depth); - temp_5 = vld1_u8(ptr + 2 * input_depth); - - input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_0)); - input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_1)); - input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_2)); - input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_3)); - input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_4)); - input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_5)); - - input_6 = vaddq_s16(input_6, input_offset_vec); - input_7 = vaddq_s16(input_7, input_offset_vec); - input_8 = vaddq_s16(input_8, input_offset_vec); - input_9 = vaddq_s16(input_9, input_offset_vec); - input_10 = vaddq_s16(input_10, input_offset_vec); - input_11 = vaddq_s16(input_11, input_offset_vec); - } - - DotProductAndStore2yStride1( - filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6, - input_7, input_8, input_9, input_10, input_11, bias_ptr, output_offset, - output_multiplier, output_shift, output_activation_min, - output_activation_max, output_ptr, output_depth * output_width); - } -}; +#define STR(s) STR_UNEXPANDED(s) +#define STR_UNEXPANDED(s) #s + +// Represents the number of bytes offset from the start of the +// DepthwiseConvParams struct. This is used in the asm to load parameters. +// Keep these values in sync with the static_asserts below. 
+#define OFFSET_INPUT_DEPTH 0 +#define OFFSET_INPUT_ROW_SIZE 8 +#define OFFSET_OUTPUT_DEPTH 16 +#define OFFSET_OUTPUT_ROW_SIZE 24 +#define OFFSET_FILTER_ROW_SIZE 32 +#define OFFSET_INPUT_OFFSET 40 +#define OFFSET_OUTPUT_OFFSET 44 +#define OFFSET_FILTER_OFFSET 48 +#define OFFSET_OUTPUT_MULTIPLIER 52 +#define OFFSET_OUTPUT_ACTIVATION_MIN 56 +#define OFFSET_OUTPUT_ACTIVATION_MAX 60 +#define OFFSET_OUTPUT_RIGHT_SHIFT 64 +#define OFFSET_INPUT_WIDTH 68 +#define OFFSET_INPUT_HEIGHT 72 +#define OFFSET_STRIDE_WIDTH 76 +#define OFFSET_STRIDE_HEIGHT 80 +#define OFFSET_OUTPUT_WIDTH 84 +#define OFFSET_OUTPUT_HEIGHT 88 + +static_assert(offsetof(DepthwiseConvParams, input_depth) == + OFFSET_INPUT_DEPTH, ""); +static_assert(offsetof(DepthwiseConvParams, input_row_size) == + OFFSET_INPUT_ROW_SIZE, ""); +static_assert(offsetof(DepthwiseConvParams, output_depth) == + OFFSET_OUTPUT_DEPTH, ""); +static_assert(offsetof(DepthwiseConvParams, output_row_size) == + OFFSET_OUTPUT_ROW_SIZE, ""); +static_assert(offsetof(DepthwiseConvParams, filter_row_size) == + OFFSET_FILTER_ROW_SIZE, ""); +static_assert(offsetof(DepthwiseConvParams, input_offset) == + OFFSET_INPUT_OFFSET, ""); +static_assert(offsetof(DepthwiseConvParams, output_offset) == + OFFSET_OUTPUT_OFFSET, ""); +static_assert(offsetof(DepthwiseConvParams, filter_offset) == + OFFSET_FILTER_OFFSET, ""); +static_assert(offsetof(DepthwiseConvParams, output_multiplier) == + OFFSET_OUTPUT_MULTIPLIER, ""); +static_assert(offsetof(DepthwiseConvParams, output_activation_min) == + OFFSET_OUTPUT_ACTIVATION_MIN, ""); +static_assert(offsetof(DepthwiseConvParams, output_activation_max) == + OFFSET_OUTPUT_ACTIVATION_MAX, ""); +static_assert(offsetof(DepthwiseConvParams, output_right_shift) == + OFFSET_OUTPUT_RIGHT_SHIFT, ""); +static_assert(offsetof(DepthwiseConvParams, input_width) == + OFFSET_INPUT_WIDTH, ""); +static_assert(offsetof(DepthwiseConvParams, input_height) == + OFFSET_INPUT_HEIGHT, ""); +static_assert(offsetof(DepthwiseConvParams, stride_width) == + OFFSET_STRIDE_WIDTH, ""); +static_assert(offsetof(DepthwiseConvParams, stride_height) == + OFFSET_STRIDE_HEIGHT, ""); +static_assert(offsetof(DepthwiseConvParams, output_width) == + OFFSET_OUTPUT_WIDTH, ""); +static_assert(offsetof(DepthwiseConvParams, output_height) == + OFFSET_OUTPUT_HEIGHT, ""); + +template <int32 kDepth, int32 kStrideWidth, int32 kStrideHeight> +struct DepthwiseConvWindow {}; template <> -struct ConvKernel3x3FilterDepth8<4, 2, 2, 2> { - static inline void Run(const uint8* input_ptr, int input_depth, - int32 input_offset, int input_row_size, - const uint8* filter_ptr, int32 filter_offset, - const int32* bias_ptr, int32 output_offset, - int32 output_multiplier, int output_shift, - int32 output_activation_min, - int32 output_activation_max, uint8* output_ptr, - int output_depth, int output_width) { - const int output_row_size = output_depth * output_width; - - Filter3x3x8 filter = Load3x3Filter(filter_ptr, filter_offset, output_depth); - - Int32x8 acc_0, acc_1; - acc_0.low = vld1q_s32(bias_ptr); - acc_1.low = vld1q_s32(bias_ptr); - acc_0.high = vld1q_s32(bias_ptr + 4); - acc_1.high = vld1q_s32(bias_ptr + 4); - - const int16x8_t input_offset_vec = vdupq_n_s16(input_offset); - - int16x8_t input_0, input_1, input_2, input_3, input_4, input_5, input_6, - input_7, input_8, input_9; - - const uint8* ptr = input_ptr; - uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4; - - // Load first 2 rows. 
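The OFFSET_* constants above exist only so the inline assembly can address DepthwiseConvParams fields by a literal byte offset: the STR()/STR_UNEXPANDED() pair stringizes each constant into the `ldr` operand, and the static_asserts keep the literals in sync with offsetof(). A minimal self-contained sketch of the same pattern, using a hypothetical two-field Params struct and SKETCH_* names that do not appear in this patch, might look like:

// Hedged sketch only: hypothetical Params struct and SKETCH_* names; the
// point is the stringized-offset + static_assert + ldr-by-offset idiom.
#include <cstddef>
#include <cstdint>

struct Params {
  int64_t depth;         // bytes 0..7
  int32_t input_offset;  // bytes 8..11
};

#define SKETCH_STR(s) SKETCH_STR_UNEXPANDED(s)
#define SKETCH_STR_UNEXPANDED(s) #s
#define SKETCH_OFFSET_INPUT_OFFSET 8
static_assert(offsetof(Params, input_offset) == SKETCH_OFFSET_INPUT_OFFSET, "");

inline int32_t LoadInputOffset(const Params* params) {
#if defined(__aarch64__)
  int32_t value;
  // Expands to "ldr wN, [xM, #8]" -- the literal 8 comes from the stringized
  // macro, exactly as the OFFSET_* constants are spliced into the asm below.
  asm("ldr %w[value], [%[params], #" SKETCH_STR(SKETCH_OFFSET_INPUT_OFFSET) "]"
      : [value] "=r"(value)
      : [params] "r"(params)
      : "memory");
  return value;
#else
  return params->input_offset;  // Portable fallback for non-AArch64 builds.
#endif
}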
- temp_0 = vld1_u8(ptr); - temp_1 = vld1_u8(ptr + input_depth); - temp_2 = vld1_u8(ptr + 2 * input_depth); - temp_3 = vld1_u8(ptr + 3 * input_depth); - temp_4 = vld1_u8(ptr + 4 * input_depth); - - input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0)); - input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1)); - input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2)); - input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3)); - input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4)); - - input_0 = vaddq_s16(input_0, input_offset_vec); - input_1 = vaddq_s16(input_1, input_offset_vec); - input_2 = vaddq_s16(input_2, input_offset_vec); - input_3 = vaddq_s16(input_3, input_offset_vec); - input_4 = vaddq_s16(input_4, input_offset_vec); - - ptr += input_row_size; - temp_0 = vld1_u8(ptr); - temp_1 = vld1_u8(ptr + input_depth); - temp_2 = vld1_u8(ptr + 2 * input_depth); - temp_3 = vld1_u8(ptr + 3 * input_depth); - temp_4 = vld1_u8(ptr + 4 * input_depth); - - input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_0)); - input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_1)); - input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_2)); - input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_3)); - input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_4)); - - input_5 = vaddq_s16(input_5, input_offset_vec); - input_6 = vaddq_s16(input_6, input_offset_vec); - input_7 = vaddq_s16(input_7, input_offset_vec); - input_8 = vaddq_s16(input_8, input_offset_vec); - input_9 = vaddq_s16(input_9, input_offset_vec); - - acc_0 = MultiplyAccumulateRow(acc_0, filter.f0, filter.f1, filter.f2, - input_0, input_1, input_2); - - acc_1 = MultiplyAccumulateRow(acc_1, filter.f0, filter.f1, filter.f2, - input_2, input_3, input_4); - - acc_0 = MultiplyAccumulateRow(acc_0, filter.f3, filter.f4, filter.f5, - input_5, input_6, input_7); - - acc_1 = MultiplyAccumulateRow(acc_1, filter.f3, filter.f4, filter.f5, - input_7, input_8, input_9); - - // Load next 2 rows. 
- ptr += input_row_size; - temp_0 = vld1_u8(ptr); - temp_1 = vld1_u8(ptr + input_depth); - temp_2 = vld1_u8(ptr + 2 * input_depth); - temp_3 = vld1_u8(ptr + 3 * input_depth); - temp_4 = vld1_u8(ptr + 4 * input_depth); - - input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0)); - input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1)); - input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2)); - input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3)); - input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4)); - - input_0 = vaddq_s16(input_0, input_offset_vec); - input_1 = vaddq_s16(input_1, input_offset_vec); - input_2 = vaddq_s16(input_2, input_offset_vec); - input_3 = vaddq_s16(input_3, input_offset_vec); - input_4 = vaddq_s16(input_4, input_offset_vec); - - ptr += input_row_size; - temp_0 = vld1_u8(ptr); - temp_1 = vld1_u8(ptr + input_depth); - temp_2 = vld1_u8(ptr + 2 * input_depth); - temp_3 = vld1_u8(ptr + 3 * input_depth); - temp_4 = vld1_u8(ptr + 4 * input_depth); - - input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_0)); - input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_1)); - input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_2)); - input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_3)); - input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_4)); - - input_5 = vaddq_s16(input_5, input_offset_vec); - input_6 = vaddq_s16(input_6, input_offset_vec); - input_7 = vaddq_s16(input_7, input_offset_vec); - input_8 = vaddq_s16(input_8, input_offset_vec); - input_9 = vaddq_s16(input_9, input_offset_vec); - - acc_0 = MultiplyAccumulateRow(acc_0, filter.f6, filter.f7, filter.f8, - input_0, input_1, input_2); - - acc_1 = MultiplyAccumulateRow(acc_1, filter.f6, filter.f7, filter.f8, - input_2, input_3, input_4); - - DownquantizeAndStore2Output( - acc_0, acc_1, output_offset, output_multiplier, output_shift, - output_activation_min, output_activation_max, output_ptr, output_depth); - - output_ptr += output_row_size; - - // Moving onto the next row of outputs. - acc_0.low = vld1q_s32(bias_ptr); - acc_1.low = vld1q_s32(bias_ptr); - acc_0.high = vld1q_s32(bias_ptr + 4); - acc_1.high = vld1q_s32(bias_ptr + 4); - - acc_0 = MultiplyAccumulateRow(acc_0, filter.f0, filter.f1, filter.f2, - input_0, input_1, input_2); - - acc_1 = MultiplyAccumulateRow(acc_1, filter.f0, filter.f1, filter.f2, - input_2, input_3, input_4); - - acc_0 = MultiplyAccumulateRow(acc_0, filter.f3, filter.f4, filter.f5, - input_5, input_6, input_7); - - acc_1 = MultiplyAccumulateRow(acc_1, filter.f3, filter.f4, filter.f5, - input_7, input_8, input_9); - - // Load next 2 rows. 
- ptr += input_row_size; - temp_0 = vld1_u8(ptr); - temp_1 = vld1_u8(ptr + input_depth); - temp_2 = vld1_u8(ptr + 2 * input_depth); - temp_3 = vld1_u8(ptr + 3 * input_depth); - temp_4 = vld1_u8(ptr + 4 * input_depth); - - input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0)); - input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1)); - input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2)); - input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3)); - input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4)); - - input_0 = vaddq_s16(input_0, input_offset_vec); - input_1 = vaddq_s16(input_1, input_offset_vec); - input_2 = vaddq_s16(input_2, input_offset_vec); - input_3 = vaddq_s16(input_3, input_offset_vec); - input_4 = vaddq_s16(input_4, input_offset_vec); - - ptr += input_row_size; - temp_0 = vld1_u8(ptr); - temp_1 = vld1_u8(ptr + input_depth); - temp_2 = vld1_u8(ptr + 2 * input_depth); - temp_3 = vld1_u8(ptr + 3 * input_depth); - temp_4 = vld1_u8(ptr + 4 * input_depth); - - input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_0)); - input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_1)); - input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_2)); - input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_3)); - input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_4)); - - input_5 = vaddq_s16(input_5, input_offset_vec); - input_6 = vaddq_s16(input_6, input_offset_vec); - input_7 = vaddq_s16(input_7, input_offset_vec); - input_8 = vaddq_s16(input_8, input_offset_vec); - input_9 = vaddq_s16(input_9, input_offset_vec); - - acc_0 = MultiplyAccumulateRow(acc_0, filter.f6, filter.f7, filter.f8, - input_0, input_1, input_2); - - acc_1 = MultiplyAccumulateRow(acc_1, filter.f6, filter.f7, filter.f8, - input_2, input_3, input_4); - - DownquantizeAndStore2Output( - acc_0, acc_1, output_offset, output_multiplier, output_shift, - output_activation_min, output_activation_max, output_ptr, output_depth); - - output_ptr += output_row_size; - - // Moving onto the next row of outputs. - acc_0.low = vld1q_s32(bias_ptr); - acc_1.low = vld1q_s32(bias_ptr); - acc_0.high = vld1q_s32(bias_ptr + 4); - acc_1.high = vld1q_s32(bias_ptr + 4); - - acc_0 = MultiplyAccumulateRow(acc_0, filter.f0, filter.f1, filter.f2, - input_0, input_1, input_2); - - acc_1 = MultiplyAccumulateRow(acc_1, filter.f0, filter.f1, filter.f2, - input_2, input_3, input_4); - - acc_0 = MultiplyAccumulateRow(acc_0, filter.f3, filter.f4, filter.f5, - input_5, input_6, input_7); - - acc_1 = MultiplyAccumulateRow(acc_1, filter.f3, filter.f4, filter.f5, - input_7, input_8, input_9); - - // Load next 2 rows. 
- ptr += input_row_size; - temp_0 = vld1_u8(ptr); - temp_1 = vld1_u8(ptr + input_depth); - temp_2 = vld1_u8(ptr + 2 * input_depth); - temp_3 = vld1_u8(ptr + 3 * input_depth); - temp_4 = vld1_u8(ptr + 4 * input_depth); - - input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0)); - input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1)); - input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2)); - input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3)); - input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4)); - - input_0 = vaddq_s16(input_0, input_offset_vec); - input_1 = vaddq_s16(input_1, input_offset_vec); - input_2 = vaddq_s16(input_2, input_offset_vec); - input_3 = vaddq_s16(input_3, input_offset_vec); - input_4 = vaddq_s16(input_4, input_offset_vec); - - ptr += input_row_size; - temp_0 = vld1_u8(ptr); - temp_1 = vld1_u8(ptr + input_depth); - temp_2 = vld1_u8(ptr + 2 * input_depth); - temp_3 = vld1_u8(ptr + 3 * input_depth); - temp_4 = vld1_u8(ptr + 4 * input_depth); - - input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_0)); - input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_1)); - input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_2)); - input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_3)); - input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_4)); - - input_5 = vaddq_s16(input_5, input_offset_vec); - input_6 = vaddq_s16(input_6, input_offset_vec); - input_7 = vaddq_s16(input_7, input_offset_vec); - input_8 = vaddq_s16(input_8, input_offset_vec); - input_9 = vaddq_s16(input_9, input_offset_vec); - - acc_0 = MultiplyAccumulateRow(acc_0, filter.f6, filter.f7, filter.f8, - input_0, input_1, input_2); - - acc_1 = MultiplyAccumulateRow(acc_1, filter.f6, filter.f7, filter.f8, - input_2, input_3, input_4); - - DownquantizeAndStore2Output( - acc_0, acc_1, output_offset, output_multiplier, output_shift, - output_activation_min, output_activation_max, output_ptr, output_depth); - - output_ptr += output_row_size; - - // Moving onto the next row of outputs. - acc_0.low = vld1q_s32(bias_ptr); - acc_1.low = vld1q_s32(bias_ptr); - acc_0.high = vld1q_s32(bias_ptr + 4); - acc_1.high = vld1q_s32(bias_ptr + 4); - - acc_0 = MultiplyAccumulateRow(acc_0, filter.f0, filter.f1, filter.f2, - input_0, input_1, input_2); - - acc_1 = MultiplyAccumulateRow(acc_1, filter.f0, filter.f1, filter.f2, - input_2, input_3, input_4); - - acc_0 = MultiplyAccumulateRow(acc_0, filter.f3, filter.f4, filter.f5, - input_5, input_6, input_7); - - acc_1 = MultiplyAccumulateRow(acc_1, filter.f3, filter.f4, filter.f5, - input_7, input_8, input_9); - - // Load last row. 
- ptr += input_row_size; - temp_0 = vld1_u8(ptr); - temp_1 = vld1_u8(ptr + input_depth); - temp_2 = vld1_u8(ptr + 2 * input_depth); - temp_3 = vld1_u8(ptr + 3 * input_depth); - temp_4 = vld1_u8(ptr + 4 * input_depth); - - input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0)); - input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1)); - input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2)); - input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3)); - input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4)); - - input_0 = vaddq_s16(input_0, input_offset_vec); - input_1 = vaddq_s16(input_1, input_offset_vec); - input_2 = vaddq_s16(input_2, input_offset_vec); - input_3 = vaddq_s16(input_3, input_offset_vec); - input_4 = vaddq_s16(input_4, input_offset_vec); - - acc_0 = MultiplyAccumulateRow(acc_0, filter.f6, filter.f7, filter.f8, - input_0, input_1, input_2); - - acc_1 = MultiplyAccumulateRow(acc_1, filter.f6, filter.f7, filter.f8, - input_2, input_3, input_4); - - DownquantizeAndStore2Output( - acc_0, acc_1, output_offset, output_multiplier, output_shift, - output_activation_min, output_activation_max, output_ptr, output_depth); +struct DepthwiseConvWindow<8, 1, 1> { + public: + static inline void Run(const uint8* input_ptr, const uint8* filter_ptr, + const int32* bias_ptr, uint8* output_ptr, int64_t input_depth, + int64_t input_row_size, int32 output_window_height, + int32 output_window_width, + const DepthwiseConvParams* params_ptr) { + const int64_t input_width_increment = 2 * input_depth; + const int64_t input_height_increment = 2 * input_row_size; + const int64_t output_height_increment = 2 * params_ptr->output_row_size; + +#define DEPTHWISECONV_LABEL_HEIGHT_2_LOOP "1" +#define DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LOOP "2" +#define DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_1_LEFTOVER "3" +#define DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LEFTOVER "4" +#define DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_AFTER_LOOP "5" +#define DEPTHWISECONV_LABEL_HEIGHT_2_AFTER_LOOP "6" +#define DEPTHWISECONV_LABEL_HEIGHT_1 "7" +#define DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LOOP "8" +#define DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_1_LEFTOVER "9" +#define DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LEFTOVER "10" +#define DEPTHWISECONV_LABEL_HEIGHT_1_END "11" + + asm volatile( + // Performs depthwise convolutions for a window specified by + // |output_window_height| and |output_window_width|. The inner-most loop + // processes 2x2 outputs, and any leftovers at the end. + // + // Algorithm works as follows: + // + // 1. Load filters of 8 depth (8x3x3). Registers v0--v8 hold filter + // values. + // 2. For 2 output heights at a time: + // i. For 2 output widths at a time, load inputs for a 2x1 (2 + // height, 1 width) output window (4x3 input window). + // Registers v9--v20 hold input values. Mul-add with + // accumulators v21--v24. Then run activation, downquantize + // and store. Repeat for the next 2x1 output window, + // leveraging overlapping inputs. + // ii. Handle single leftover width if exists. + // 3. Handle single leftover height if exists. + // i. For 2 output widths at a time, load inputs for a 1x2 (1 + // height, 2 width) output window (3x4 input window). + // Registers v9--v20 hold input values. Mul-add with + // accumulators v21--v24. Then run activation, downquantize + // and store. Repeat for the next 1x2 output window, + // leveraging overlapping inputs. + // ii. Handle single leftover width if exists. 
+ // + // Loads are placed as soon as the register is no longer needed and + // interleaved with arithmetic operations to take advantage of + // dual-issue pipelines. We also add input offsets as far from the loads + // as possible to give loads enough cycles to fetch data from memory. + + // Set "constant" registers. These registers may be replaced with temp + // values from time to time when there are not enough NEON registers. + // We use x9--x15 general purpose registers as they are caller-saved + // temporary registers (see http://infocenter.arm.com/help/topic/com.arm.doc.ihi0055b/IHI0055B_aapcs64.pdf). // NOLINT + "ldr w9, [%[params_ptr], #" STR(OFFSET_INPUT_OFFSET) "]\n" + "ldr x3, [%[params_ptr], #" STR(OFFSET_OUTPUT_DEPTH) "]\n" + "cmp %w[output_window_height], #2\n" + "dup v26.8h, w9\n" + "ldr w9, [%[params_ptr], #" STR(OFFSET_OUTPUT_MULTIPLIER) "]\n" + "ldr w2, [%[params_ptr], #" STR(OFFSET_OUTPUT_OFFSET) "]\n" + "dup v27.4s, w9\n" + "ldr w9, [%[params_ptr], #" STR(OFFSET_OUTPUT_RIGHT_SHIFT) "]\n" + "dup v29.4s, w2\n" + "ldr w4, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MIN) "]\n" + "dup v30.4s, w4\n" + "ldr w0, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MAX) "]\n" + "dup v31.4s, w0\n" + "neg w9, w9\n" + "dup v28.4s, w9\n" + "ldr w9, [%[params_ptr], #" STR(OFFSET_FILTER_OFFSET) "]\n" + "add x10, %[bias_ptr], #16\n" + "ldr x1, [%[params_ptr], #" STR(OFFSET_OUTPUT_ROW_SIZE) "]\n" + "dup v9.8h, w9\n" + + // Load filters and add offsets. + "ld1 {v0.8b}, [%[filter_ptr]], x3\n" + "ld1 {v1.8b}, [%[filter_ptr]], x3\n" + "uaddw v0.8h, v9.8h, v0.8b\n" + "ld1 {v2.8b}, [%[filter_ptr]], x3\n" + "uaddw v1.8h, v9.8h, v1.8b\n" + "ld1 {v3.8b}, [%[filter_ptr]], x3\n" + "uaddw v2.8h, v9.8h, v2.8b\n" + "ld1 {v4.8b}, [%[filter_ptr]], x3\n" + "uaddw v3.8h, v9.8h, v3.8b\n" + "ld1 {v5.8b}, [%[filter_ptr]], x3\n" + "uaddw v4.8h, v9.8h, v4.8b\n" + "ld1 {v6.8b}, [%[filter_ptr]], x3\n" + "uaddw v5.8h, v9.8h, v5.8b\n" + "ld1 {v7.8b}, [%[filter_ptr]], x3\n" + "uaddw v6.8h, v9.8h, v6.8b\n" + "ld1 {v8.8b}, [%[filter_ptr]], x3\n" + "uaddw v7.8h, v9.8h, v7.8b\n" + "uaddw v8.8h, v9.8h, v8.8b\n" + + "blt " DEPTHWISECONV_LABEL_HEIGHT_2_AFTER_LOOP "f\n" + + //"loop_%=:\n" + DEPTHWISECONV_LABEL_HEIGHT_2_LOOP ":\n" + // This loop processes 2x2 outputs. To avoid register exhaustion, + // inputs for the left 2 outputs are loaded first, then the right + // two outputs. + "mov x11, %[input_ptr]\n" + "mov x12, x11\n" + "ld1 {v9.8b}, [x12], %[input_depth]\n" + "add x13, x11, %[input_row_size]\n" + "ld1 {v10.8b}, [x12], %[input_depth]\n" + "add x14, x13, %[input_row_size]\n" + "ld1 {v11.8b}, [x12], %[input_depth]\n" + "add x15, x14, %[input_row_size]\n" + "ld1 {v12.8b}, [x13], %[input_depth]\n" + "mov w5, %w[output_window_width]\n" + "ld1 {v13.8b}, [x13], %[input_depth]\n" + "mov x6, %[output_ptr]\n" + "ld1 {v14.8b}, [x13], %[input_depth]\n" + "add x7, %[output_ptr], x1\n" + "ld1 {v15.8b}, [x14], %[input_depth]\n" + // The height 2 / width 2 loop loads an extra 2x1 outputs (2 height, + // 1 width) in anticipation for the next iteration. Make sure + // |output_window_width| is large enough to handle the additional + // loads, otherwise jump to specific the appropriate label to handle + // smaller widths. 
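The block of sqrdmulh / and / sshr / sqadd / srshl / add / smax / smin / sqxtn / sqxtun instructions that follows (and recurs after every multiply-accumulate section) implements the usual gemmlowp-style fixed-point requantization of the int32 accumulators back to uint8. A scalar model of what a single lane goes through may help when reading it; this is an illustration of the arithmetic only, not a helper that exists in this file:

#include <algorithm>
#include <cstdint>

// One-lane scalar model of the NEON downquantization sequence:
//   sqrdmulh             -> saturating rounding doubling high multiply
//   and/sshr/sqadd/srshl -> rounding divide by 2^right_shift
//   add/smax/smin        -> output offset and activation clamping
//   sqxtn/sqxtun         -> narrow to uint8
// (The INT32_MIN * INT32_MIN saturation corner of sqrdmulh is ignored here
// for brevity.)
inline uint8_t RequantizeLane(int32_t acc, int32_t output_multiplier,
                              int right_shift, int32_t output_offset,
                              int32_t activation_min, int32_t activation_max) {
  const int64_t ab = static_cast<int64_t>(acc) * output_multiplier;
  const int32_t nudge = ab >= 0 ? (1 << 30) : (1 - (1 << 30));
  const int32_t high = static_cast<int32_t>((ab + nudge) / (1ll << 31));
  // Round-to-nearest shift with ties away from zero -- the same behaviour
  // the and/sshr/sqadd fixup gives the srshl rounding shift.
  const int32_t mask = (1 << right_shift) - 1;
  const int32_t remainder = high & mask;
  const int32_t threshold = (mask >> 1) + (high < 0 ? 1 : 0);
  const int32_t shifted = (high >> right_shift) + (remainder > threshold ? 1 : 0);
  const int32_t clamped = std::min(
      activation_max, std::max(activation_min, shifted + output_offset));
  return static_cast<uint8_t>(clamped);
}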
+ "cmp w5, #2\n" + "uaddw v9.8h, v26.8h, v9.8b\n" + "ld1 {v16.8b}, [x14], %[input_depth]\n" + "uaddw v10.8h, v26.8h, v10.8b\n" + "ld1 {v17.8b}, [x14], %[input_depth]\n" + "uaddw v11.8h, v26.8h, v11.8b\n" + "ld1 {v18.8b}, [x15], %[input_depth]\n" + "uaddw v12.8h, v26.8h, v12.8b\n" + "ld1 {v19.8b}, [x15], %[input_depth]\n" + "uaddw v13.8h, v26.8h, v13.8b\n" + "ld1 {v20.8b}, [x15], %[input_depth]\n" + "uaddw v14.8h, v26.8h, v14.8b\n" + "ld1 {v21.4s}, [%[bias_ptr]]\n" + "uaddw v15.8h, v26.8h, v15.8b\n" + "ld1 {v22.4s}, [x10]\n" + "uaddw v16.8h, v26.8h, v16.8b\n" + "ld1 {v23.4s}, [%[bias_ptr]]\n" + "uaddw v17.8h, v26.8h, v17.8b\n" + "ld1 {v24.4s}, [x10]\n" + "uaddw v18.8h, v26.8h, v18.8b\n" + "uaddw v19.8h, v26.8h, v19.8b\n" + "uaddw v20.8h, v26.8h, v20.8b\n" + + "beq " DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LEFTOVER "f\n" + "cmp w5, #1\n" + "beq " DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_1_LEFTOVER "f\n" + + //"loop_%=:\n" + DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LOOP ":\n" + // Mul-add left outputs. + "smlal v21.4s, v0.4h, v9.4h\n" + "subs w5, w5, #2\n" + "smlal2 v22.4s, v0.8h, v9.8h\n" + "cmp w5, #3\n" + "smlal v23.4s, v0.4h, v12.4h\n" + "ld1 {v9.8b}, [x12]\n" + "smlal2 v24.4s, v0.8h, v12.8h\n" + "smlal v21.4s, v1.4h, v10.4h\n" + "smlal2 v22.4s, v1.8h, v10.8h\n" + "smlal v23.4s, v1.4h, v13.4h\n" + "smlal2 v24.4s, v1.8h, v13.8h\n" + "smlal v21.4s, v2.4h, v11.4h\n" + "smlal2 v22.4s, v2.8h, v11.8h\n" + "smlal v23.4s, v2.4h, v14.4h\n" + "smlal2 v24.4s, v2.8h, v14.8h\n" + "smlal v21.4s, v3.4h, v12.4h\n" + "smlal2 v22.4s, v3.8h, v12.8h\n" + "ld1 {v12.8b}, [x13]\n" + "smlal v23.4s, v3.4h, v15.4h\n" + "smlal2 v24.4s, v3.8h, v15.8h\n" + "smlal v21.4s, v4.4h, v13.4h\n" + "smlal2 v22.4s, v4.8h, v13.8h\n" + "smlal v23.4s, v4.4h, v16.4h\n" + "smlal2 v24.4s, v4.8h, v16.8h\n" + "smlal v21.4s, v5.4h, v14.4h\n" + "smlal2 v22.4s, v5.8h, v14.8h\n" + "smlal v23.4s, v5.4h, v17.4h\n" + "smlal2 v24.4s, v5.8h, v17.8h\n" + "smlal v21.4s, v6.4h, v15.4h\n" + "smlal2 v22.4s, v6.8h, v15.8h\n" + "ld1 {v15.8b}, [x14]\n" + "smlal v23.4s, v6.4h, v18.4h\n" + "smlal2 v24.4s, v6.8h, v18.8h\n" + "ld1 {v18.8b}, [x15]\n" + "smlal v21.4s, v7.4h, v16.4h\n" + "smlal2 v22.4s, v7.8h, v16.8h\n" + "smlal v23.4s, v7.4h, v19.4h\n" + "smlal2 v24.4s, v7.8h, v19.8h\n" + "smlal v21.4s, v8.4h, v17.4h\n" + "smlal2 v22.4s, v8.8h, v17.8h\n" + "smlal v23.4s, v8.4h, v20.4h\n" + "smlal2 v24.4s, v8.8h, v20.8h\n" + + "sqrdmulh v21.4s, v21.4s, v27.4s\n" + "sqrdmulh v22.4s, v22.4s, v27.4s\n" + "sqrdmulh v23.4s, v23.4s, v27.4s\n" + "sqrdmulh v24.4s, v24.4s, v27.4s\n" + "and v25.16b, v21.16b, v28.16b\n" + "and v29.16b, v22.16b, v28.16b\n" + "and v30.16b, v23.16b, v28.16b\n" + "and v31.16b, v24.16b, v28.16b\n" + "sshr v25.4s, v25.4s, #31\n" + "sshr v29.4s, v29.4s, #31\n" + "sshr v30.4s, v30.4s, #31\n" + "sshr v31.4s, v31.4s, #31\n" + "sqadd v21.4s, v21.4s, v25.4s\n" + "sqadd v22.4s, v22.4s, v29.4s\n" + "dup v29.4s, w2\n" + "sqadd v23.4s, v23.4s, v30.4s\n" + "dup v30.4s, w4\n" + "sqadd v24.4s, v24.4s, v31.4s\n" + "dup v31.4s, w0\n" + "srshl v21.4s, v21.4s, v28.4s\n" + "srshl v22.4s, v22.4s, v28.4s\n" + "srshl v23.4s, v23.4s, v28.4s\n" + "srshl v24.4s, v24.4s, v28.4s\n" + "add v21.4s, v21.4s, v29.4s\n" + "add v22.4s, v22.4s, v29.4s\n" + "add v23.4s, v23.4s, v29.4s\n" + "add v24.4s, v24.4s, v29.4s\n" + "smax v21.4s, v21.4s, v30.4s\n" + "smax v22.4s, v22.4s, v30.4s\n" + "smax v23.4s, v23.4s, v30.4s\n" + "smax v24.4s, v24.4s, v30.4s\n" + "smin v21.4s, v21.4s, v31.4s\n" + "smin v22.4s, v22.4s, v31.4s\n" + "smin v23.4s, v23.4s, v31.4s\n" + "smin v24.4s, v24.4s, v31.4s\n" + 
"sqxtn v21.4h, v21.4s\n" + "sqxtn v23.4h, v23.4s\n" + "sqxtn2 v21.8h, v22.4s\n" + "ld1 {v22.4s}, [x10]\n" + "sqxtn2 v23.8h, v24.4s\n" + "ld1 {v24.4s}, [x10]\n" + "sqxtun v21.8b, v21.8h\n" + "sqxtun v23.8b, v23.8h\n" + "uaddw v9.8h, v26.8h, v9.8b\n" + "st1 {v21.8b}, [x6], x3\n" + "uaddw v12.8h, v26.8h, v12.8b\n" + "st1 {v23.8b}, [x7], x3\n" + "uaddw v15.8h, v26.8h, v15.8b\n" + "ld1 {v21.4s}, [%[bias_ptr]]\n" + "uaddw v18.8h, v26.8h, v18.8b\n" + "ld1 {v23.4s}, [%[bias_ptr]]\n" + + // Mul-add right outputs. + "smlal v21.4s, v0.4h, v10.4h\n" + "add x11, x11, %[input_width_increment]\n" + "smlal2 v22.4s, v0.8h, v10.8h\n" + "mov x12, x11\n" + "smlal v23.4s, v0.4h, v13.4h\n" + "add x13, x11, %[input_row_size]\n" + "smlal2 v24.4s, v0.8h, v13.8h\n" + "add x14, x13, %[input_row_size]\n" + "smlal v21.4s, v1.4h, v11.4h\n" + "add x15, x14, %[input_row_size]\n" + "smlal2 v22.4s, v1.8h, v11.8h\n" + "smlal v23.4s, v1.4h, v14.4h\n" + "smlal2 v24.4s, v1.8h, v14.8h\n" + "smlal v21.4s, v2.4h, v9.4h\n" + "smlal2 v22.4s, v2.8h, v9.8h\n" + "ld1 {v9.8b}, [x12], %[input_depth]\n" + "smlal v23.4s, v2.4h, v12.4h\n" + "ld1 {v10.8b}, [x12], %[input_depth]\n" + "smlal2 v24.4s, v2.8h, v12.8h\n" + "ld1 {v11.8b}, [x12], %[input_depth]\n" + "smlal v21.4s, v3.4h, v13.4h\n" + "smlal2 v22.4s, v3.8h, v13.8h\n" + "smlal v23.4s, v3.4h, v16.4h\n" + "smlal2 v24.4s, v3.8h, v16.8h\n" + "smlal v21.4s, v4.4h, v14.4h\n" + "smlal2 v22.4s, v4.8h, v14.8h\n" + "smlal v23.4s, v4.4h, v17.4h\n" + "smlal2 v24.4s, v4.8h, v17.8h\n" + "smlal v21.4s, v5.4h, v12.4h\n" + "smlal2 v22.4s, v5.8h, v12.8h\n" + "ld1 {v12.8b}, [x13], %[input_depth]\n" + "smlal v23.4s, v5.4h, v15.4h\n" + "ld1 {v13.8b}, [x13], %[input_depth]\n" + "smlal2 v24.4s, v5.8h, v15.8h\n" + "ld1 {v14.8b}, [x13], %[input_depth]\n" + "smlal v21.4s, v6.4h, v16.4h\n" + "smlal2 v22.4s, v6.8h, v16.8h\n" + "smlal v23.4s, v6.4h, v19.4h\n" + "smlal2 v24.4s, v6.8h, v19.8h\n" + "smlal v21.4s, v7.4h, v17.4h\n" + "smlal2 v22.4s, v7.8h, v17.8h\n" + "smlal v23.4s, v7.4h, v20.4h\n" + "smlal2 v24.4s, v7.8h, v20.8h\n" + "smlal v21.4s, v8.4h, v15.4h\n" + "smlal2 v22.4s, v8.8h, v15.8h\n" + "ld1 {v15.8b}, [x14], %[input_depth]\n" + "smlal v23.4s, v8.4h, v18.4h\n" + "ld1 {v16.8b}, [x14], %[input_depth]\n" + "smlal2 v24.4s, v8.8h, v18.8h\n" + "ld1 {v17.8b}, [x14], %[input_depth]\n" + + "sqrdmulh v21.4s, v21.4s, v27.4s\n" + "ld1 {v18.8b}, [x15], %[input_depth]\n" + "sqrdmulh v22.4s, v22.4s, v27.4s\n" + "ld1 {v19.8b}, [x15], %[input_depth]\n" + "sqrdmulh v23.4s, v23.4s, v27.4s\n" + "ld1 {v20.8b}, [x15], %[input_depth]\n" + "sqrdmulh v24.4s, v24.4s, v27.4s\n" + "and v25.16b, v21.16b, v28.16b\n" + "and v29.16b, v22.16b, v28.16b\n" + "and v30.16b, v23.16b, v28.16b\n" + "and v31.16b, v24.16b, v28.16b\n" + "sshr v25.4s, v25.4s, #31\n" + "sshr v29.4s, v29.4s, #31\n" + "sshr v30.4s, v30.4s, #31\n" + "sshr v31.4s, v31.4s, #31\n" + "sqadd v21.4s, v21.4s, v25.4s\n" + "sqadd v22.4s, v22.4s, v29.4s\n" + "dup v29.4s, w2\n" + "sqadd v23.4s, v23.4s, v30.4s\n" + "dup v30.4s, w4\n" + "sqadd v24.4s, v24.4s, v31.4s\n" + "dup v31.4s, w0\n" + "srshl v21.4s, v21.4s, v28.4s\n" + "srshl v22.4s, v22.4s, v28.4s\n" + "srshl v23.4s, v23.4s, v28.4s\n" + "srshl v24.4s, v24.4s, v28.4s\n" + "add v21.4s, v21.4s, v29.4s\n" + "add v22.4s, v22.4s, v29.4s\n" + "add v23.4s, v23.4s, v29.4s\n" + "add v24.4s, v24.4s, v29.4s\n" + "smax v21.4s, v21.4s, v30.4s\n" + "smax v22.4s, v22.4s, v30.4s\n" + "smax v23.4s, v23.4s, v30.4s\n" + "smax v24.4s, v24.4s, v30.4s\n" + "smin v21.4s, v21.4s, v31.4s\n" + "smin v22.4s, v22.4s, v31.4s\n" + "smin v23.4s, 
v23.4s, v31.4s\n" + "smin v24.4s, v24.4s, v31.4s\n" + "sqxtn v21.4h, v21.4s\n" + "sqxtn v23.4h, v23.4s\n" + "sqxtn2 v21.8h, v22.4s\n" + "ld1 {v22.4s}, [x10]\n" + "sqxtn2 v23.8h, v24.4s\n" + "ld1 {v24.4s}, [x10]\n" + "sqxtun v21.8b, v21.8h\n" + "sqxtun v23.8b, v23.8h\n" + "uaddw v9.8h, v26.8h, v9.8b\n" + "st1 {v21.8b}, [x6], x3\n" + "uaddw v10.8h, v26.8h, v10.8b\n" + "st1 {v23.8b}, [x7], x3\n" + "uaddw v11.8h, v26.8h, v11.8b\n" + "uaddw v12.8h, v26.8h, v12.8b\n" + "uaddw v13.8h, v26.8h, v13.8b\n" + "uaddw v14.8h, v26.8h, v14.8b\n" + "uaddw v15.8h, v26.8h, v15.8b\n" + "ld1 {v21.4s}, [%[bias_ptr]]\n" + "uaddw v16.8h, v26.8h, v16.8b\n" + "ld1 {v23.4s}, [%[bias_ptr]]\n" + "uaddw v17.8h, v26.8h, v17.8b\n" + "uaddw v18.8h, v26.8h, v18.8b\n" + "uaddw v19.8h, v26.8h, v19.8b\n" + "uaddw v20.8h, v26.8h, v20.8b\n" + + "bge " DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LOOP "b\n" + + // At this point, there will be one of 2 width or 1 width leftover, + // not both. + "cmp w5, #2\n" + "blt " DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_1_LEFTOVER "f\n" + + // Handle last 2 columns if exists. + DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LEFTOVER ":\n" + // Mul-add left outputs. + "smlal v21.4s, v0.4h, v9.4h\n" + "smlal2 v22.4s, v0.8h, v9.8h\n" + "smlal v23.4s, v0.4h, v12.4h\n" + "ld1 {v9.8b}, [x12]\n" + "smlal2 v24.4s, v0.8h, v12.8h\n" + "smlal v21.4s, v1.4h, v10.4h\n" + "smlal2 v22.4s, v1.8h, v10.8h\n" + "smlal v23.4s, v1.4h, v13.4h\n" + "smlal2 v24.4s, v1.8h, v13.8h\n" + "smlal v21.4s, v2.4h, v11.4h\n" + "smlal2 v22.4s, v2.8h, v11.8h\n" + "smlal v23.4s, v2.4h, v14.4h\n" + "smlal2 v24.4s, v2.8h, v14.8h\n" + "smlal v21.4s, v3.4h, v12.4h\n" + "smlal2 v22.4s, v3.8h, v12.8h\n" + "ld1 {v12.8b}, [x13]\n" + "smlal v23.4s, v3.4h, v15.4h\n" + "smlal2 v24.4s, v3.8h, v15.8h\n" + "smlal v21.4s, v4.4h, v13.4h\n" + "smlal2 v22.4s, v4.8h, v13.8h\n" + "smlal v23.4s, v4.4h, v16.4h\n" + "smlal2 v24.4s, v4.8h, v16.8h\n" + "smlal v21.4s, v5.4h, v14.4h\n" + "smlal2 v22.4s, v5.8h, v14.8h\n" + "smlal v23.4s, v5.4h, v17.4h\n" + "smlal2 v24.4s, v5.8h, v17.8h\n" + "smlal v21.4s, v6.4h, v15.4h\n" + "smlal2 v22.4s, v6.8h, v15.8h\n" + "ld1 {v15.8b}, [x14]\n" + "smlal v23.4s, v6.4h, v18.4h\n" + "smlal2 v24.4s, v6.8h, v18.8h\n" + "ld1 {v18.8b}, [x15]\n" + "smlal v21.4s, v7.4h, v16.4h\n" + "smlal2 v22.4s, v7.8h, v16.8h\n" + "smlal v23.4s, v7.4h, v19.4h\n" + "smlal2 v24.4s, v7.8h, v19.8h\n" + "smlal v21.4s, v8.4h, v17.4h\n" + "smlal2 v22.4s, v8.8h, v17.8h\n" + "smlal v23.4s, v8.4h, v20.4h\n" + "smlal2 v24.4s, v8.8h, v20.8h\n" + + "sqrdmulh v21.4s, v21.4s, v27.4s\n" + "sqrdmulh v22.4s, v22.4s, v27.4s\n" + "sqrdmulh v23.4s, v23.4s, v27.4s\n" + "sqrdmulh v24.4s, v24.4s, v27.4s\n" + "and v25.16b, v21.16b, v28.16b\n" + "and v29.16b, v22.16b, v28.16b\n" + "and v30.16b, v23.16b, v28.16b\n" + "and v31.16b, v24.16b, v28.16b\n" + "sshr v25.4s, v25.4s, #31\n" + "sshr v29.4s, v29.4s, #31\n" + "sshr v30.4s, v30.4s, #31\n" + "sshr v31.4s, v31.4s, #31\n" + "sqadd v21.4s, v21.4s, v25.4s\n" + "sqadd v22.4s, v22.4s, v29.4s\n" + "dup v29.4s, w2\n" + "sqadd v23.4s, v23.4s, v30.4s\n" + "dup v30.4s, w4\n" + "sqadd v24.4s, v24.4s, v31.4s\n" + "dup v31.4s, w0\n" + "srshl v21.4s, v21.4s, v28.4s\n" + "srshl v22.4s, v22.4s, v28.4s\n" + "srshl v23.4s, v23.4s, v28.4s\n" + "srshl v24.4s, v24.4s, v28.4s\n" + "add v21.4s, v21.4s, v29.4s\n" + "add v22.4s, v22.4s, v29.4s\n" + "add v23.4s, v23.4s, v29.4s\n" + "add v24.4s, v24.4s, v29.4s\n" + "smax v21.4s, v21.4s, v30.4s\n" + "smax v22.4s, v22.4s, v30.4s\n" + "smax v23.4s, v23.4s, v30.4s\n" + "smax v24.4s, v24.4s, v30.4s\n" + "smin v21.4s, 
v21.4s, v31.4s\n" + "smin v22.4s, v22.4s, v31.4s\n" + "smin v23.4s, v23.4s, v31.4s\n" + "smin v24.4s, v24.4s, v31.4s\n" + "sqxtn v21.4h, v21.4s\n" + "sqxtn v23.4h, v23.4s\n" + "sqxtn2 v21.8h, v22.4s\n" + "ld1 {v22.4s}, [x10]\n" + "sqxtn2 v23.8h, v24.4s\n" + "ld1 {v24.4s}, [x10]\n" + "sqxtun v21.8b, v21.8h\n" + "sqxtun v23.8b, v23.8h\n" + "uaddw v9.8h, v26.8h, v9.8b\n" + "st1 {v21.8b}, [x6], x3\n" + "uaddw v12.8h, v26.8h, v12.8b\n" + "st1 {v23.8b}, [x7], x3\n" + "uaddw v15.8h, v26.8h, v15.8b\n" + "ld1 {v21.4s}, [%[bias_ptr]]\n" + "uaddw v18.8h, v26.8h, v18.8b\n" + "ld1 {v23.4s}, [%[bias_ptr]]\n" + + // Mul-add right outputs. + "smlal v21.4s, v0.4h, v10.4h\n" + "smlal2 v22.4s, v0.8h, v10.8h\n" + "smlal v23.4s, v0.4h, v13.4h\n" + "smlal2 v24.4s, v0.8h, v13.8h\n" + "smlal v21.4s, v1.4h, v11.4h\n" + "smlal2 v22.4s, v1.8h, v11.8h\n" + "smlal v23.4s, v1.4h, v14.4h\n" + "smlal2 v24.4s, v1.8h, v14.8h\n" + "smlal v21.4s, v2.4h, v9.4h\n" + "smlal2 v22.4s, v2.8h, v9.8h\n" + "smlal v23.4s, v2.4h, v12.4h\n" + "smlal2 v24.4s, v2.8h, v12.8h\n" + "smlal v21.4s, v3.4h, v13.4h\n" + "smlal2 v22.4s, v3.8h, v13.8h\n" + "smlal v23.4s, v3.4h, v16.4h\n" + "smlal2 v24.4s, v3.8h, v16.8h\n" + "smlal v21.4s, v4.4h, v14.4h\n" + "smlal2 v22.4s, v4.8h, v14.8h\n" + "smlal v23.4s, v4.4h, v17.4h\n" + "smlal2 v24.4s, v4.8h, v17.8h\n" + "smlal v21.4s, v5.4h, v12.4h\n" + "smlal2 v22.4s, v5.8h, v12.8h\n" + "smlal v23.4s, v5.4h, v15.4h\n" + "smlal2 v24.4s, v5.8h, v15.8h\n" + "smlal v21.4s, v6.4h, v16.4h\n" + "smlal2 v22.4s, v6.8h, v16.8h\n" + "smlal v23.4s, v6.4h, v19.4h\n" + "smlal2 v24.4s, v6.8h, v19.8h\n" + "smlal v21.4s, v7.4h, v17.4h\n" + "smlal2 v22.4s, v7.8h, v17.8h\n" + "smlal v23.4s, v7.4h, v20.4h\n" + "smlal2 v24.4s, v7.8h, v20.8h\n" + "smlal v21.4s, v8.4h, v15.4h\n" + "smlal2 v22.4s, v8.8h, v15.8h\n" + "smlal v23.4s, v8.4h, v18.4h\n" + "smlal2 v24.4s, v8.8h, v18.8h\n" + + "sqrdmulh v21.4s, v21.4s, v27.4s\n" + "sqrdmulh v22.4s, v22.4s, v27.4s\n" + "sqrdmulh v23.4s, v23.4s, v27.4s\n" + "sqrdmulh v24.4s, v24.4s, v27.4s\n" + "and v25.16b, v21.16b, v28.16b\n" + "and v29.16b, v22.16b, v28.16b\n" + "and v30.16b, v23.16b, v28.16b\n" + "and v31.16b, v24.16b, v28.16b\n" + "sshr v25.4s, v25.4s, #31\n" + "sshr v29.4s, v29.4s, #31\n" + "sshr v30.4s, v30.4s, #31\n" + "sshr v31.4s, v31.4s, #31\n" + "sqadd v21.4s, v21.4s, v25.4s\n" + "sqadd v22.4s, v22.4s, v29.4s\n" + "dup v29.4s, w2\n" + "sqadd v23.4s, v23.4s, v30.4s\n" + "dup v30.4s, w4\n" + "sqadd v24.4s, v24.4s, v31.4s\n" + "dup v31.4s, w0\n" + "srshl v21.4s, v21.4s, v28.4s\n" + "srshl v22.4s, v22.4s, v28.4s\n" + "srshl v23.4s, v23.4s, v28.4s\n" + "srshl v24.4s, v24.4s, v28.4s\n" + "add v21.4s, v21.4s, v29.4s\n" + "add v22.4s, v22.4s, v29.4s\n" + "add v23.4s, v23.4s, v29.4s\n" + "add v24.4s, v24.4s, v29.4s\n" + "smax v21.4s, v21.4s, v30.4s\n" + "smax v22.4s, v22.4s, v30.4s\n" + "smax v23.4s, v23.4s, v30.4s\n" + "smax v24.4s, v24.4s, v30.4s\n" + "smin v21.4s, v21.4s, v31.4s\n" + "smin v22.4s, v22.4s, v31.4s\n" + "smin v23.4s, v23.4s, v31.4s\n" + "smin v24.4s, v24.4s, v31.4s\n" + "sqxtn v21.4h, v21.4s\n" + "sqxtn v23.4h, v23.4s\n" + "sqxtn2 v21.8h, v22.4s\n" + "sqxtn2 v23.8h, v24.4s\n" + "sqxtun v21.8b, v21.8h\n" + "sqxtun v23.8b, v23.8h\n" + "st1 {v21.8b}, [x6], x3\n" + "st1 {v23.8b}, [x7], x3\n" + "b " DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_AFTER_LOOP "f\n" + + DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_1_LEFTOVER ":\n" + "smlal v21.4s, v0.4h, v9.4h\n" + "smlal2 v22.4s, v0.8h, v9.8h\n" + "smlal v23.4s, v0.4h, v12.4h\n" + "smlal2 v24.4s, v0.8h, v12.8h\n" + "smlal v21.4s, v1.4h, v10.4h\n" + 
"smlal2 v22.4s, v1.8h, v10.8h\n" + "smlal v23.4s, v1.4h, v13.4h\n" + "smlal2 v24.4s, v1.8h, v13.8h\n" + "smlal v21.4s, v2.4h, v11.4h\n" + "smlal2 v22.4s, v2.8h, v11.8h\n" + "smlal v23.4s, v2.4h, v14.4h\n" + "smlal2 v24.4s, v2.8h, v14.8h\n" + "smlal v21.4s, v3.4h, v12.4h\n" + "smlal2 v22.4s, v3.8h, v12.8h\n" + "smlal v23.4s, v3.4h, v15.4h\n" + "smlal2 v24.4s, v3.8h, v15.8h\n" + "smlal v21.4s, v4.4h, v13.4h\n" + "smlal2 v22.4s, v4.8h, v13.8h\n" + "smlal v23.4s, v4.4h, v16.4h\n" + "smlal2 v24.4s, v4.8h, v16.8h\n" + "smlal v21.4s, v5.4h, v14.4h\n" + "smlal2 v22.4s, v5.8h, v14.8h\n" + "smlal v23.4s, v5.4h, v17.4h\n" + "smlal2 v24.4s, v5.8h, v17.8h\n" + "smlal v21.4s, v6.4h, v15.4h\n" + "smlal2 v22.4s, v6.8h, v15.8h\n" + "smlal v23.4s, v6.4h, v18.4h\n" + "smlal2 v24.4s, v6.8h, v18.8h\n" + "smlal v21.4s, v7.4h, v16.4h\n" + "smlal2 v22.4s, v7.8h, v16.8h\n" + "smlal v23.4s, v7.4h, v19.4h\n" + "smlal2 v24.4s, v7.8h, v19.8h\n" + "smlal v21.4s, v8.4h, v17.4h\n" + "smlal2 v22.4s, v8.8h, v17.8h\n" + "smlal v23.4s, v8.4h, v20.4h\n" + "smlal2 v24.4s, v8.8h, v20.8h\n" + + "sqrdmulh v21.4s, v21.4s, v27.4s\n" + "sqrdmulh v22.4s, v22.4s, v27.4s\n" + "sqrdmulh v23.4s, v23.4s, v27.4s\n" + "sqrdmulh v24.4s, v24.4s, v27.4s\n" + "and v9.16b, v21.16b, v28.16b\n" + "and v12.16b, v22.16b, v28.16b\n" + "and v15.16b, v23.16b, v28.16b\n" + "and v18.16b, v24.16b, v28.16b\n" + "sshr v9.4s, v9.4s, #31\n" + "sshr v12.4s, v12.4s, #31\n" + "sshr v15.4s, v15.4s, #31\n" + "sshr v18.4s, v18.4s, #31\n" + "sqadd v21.4s, v21.4s, v9.4s\n" + "sqadd v22.4s, v22.4s, v12.4s\n" + "sqadd v23.4s, v23.4s, v15.4s\n" + "sqadd v24.4s, v24.4s, v18.4s\n" + "srshl v21.4s, v21.4s, v28.4s\n" + "srshl v22.4s, v22.4s, v28.4s\n" + "srshl v23.4s, v23.4s, v28.4s\n" + "srshl v24.4s, v24.4s, v28.4s\n" + "add v21.4s, v21.4s, v29.4s\n" + "add v22.4s, v22.4s, v29.4s\n" + "add v23.4s, v23.4s, v29.4s\n" + "add v24.4s, v24.4s, v29.4s\n" + "smax v21.4s, v21.4s, v30.4s\n" + "smax v22.4s, v22.4s, v30.4s\n" + "smax v23.4s, v23.4s, v30.4s\n" + "smax v24.4s, v24.4s, v30.4s\n" + "smin v21.4s, v21.4s, v31.4s\n" + "smin v22.4s, v22.4s, v31.4s\n" + "smin v23.4s, v23.4s, v31.4s\n" + "smin v24.4s, v24.4s, v31.4s\n" + "sqxtn v21.4h, v21.4s\n" + "sqxtn v23.4h, v23.4s\n" + "sqxtn2 v21.8h, v22.4s\n" + "sqxtn2 v23.8h, v24.4s\n" + "sqxtun v21.8b, v21.8h\n" + "sqxtun v23.8b, v23.8h\n" + "st1 {v21.8b}, [x6], x3\n" + "st1 {v23.8b}, [x7], x3\n" + + DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_AFTER_LOOP ":\n" + "subs %w[output_window_height], %w[output_window_height], #2\n" + "add %[input_ptr], %[input_ptr], %[input_height_increment]\n" + "cmp %w[output_window_height], #2\n" + "add %[output_ptr], %[output_ptr], %[output_height_increment]\n" + "bge " DEPTHWISECONV_LABEL_HEIGHT_2_LOOP "b\n" + + DEPTHWISECONV_LABEL_HEIGHT_2_AFTER_LOOP ":\n" + "cmp %w[output_window_height], #1\n" + "blt " DEPTHWISECONV_LABEL_HEIGHT_1_END "f\n" + + DEPTHWISECONV_LABEL_HEIGHT_1 ":\n" + "mov x12, %[input_ptr]\n" + "ld1 {v9.8b}, [x12], %[input_depth]\n" + "add x13, %[input_ptr], %[input_row_size]\n" + "ld1 {v10.8b}, [x12], %[input_depth]\n" + "add x14, x13, %[input_row_size]\n" + "ld1 {v11.8b}, [x12], %[input_depth]\n" + "add x15, x14, %[input_row_size]\n" + "mov w5, %w[output_window_width]\n" + "ld1 {v13.8b}, [x13], %[input_depth]\n" + "mov x6, %[output_ptr]\n" + "ld1 {v14.8b}, [x13], %[input_depth]\n" + "add x7, %[output_ptr], x1\n" + "ld1 {v15.8b}, [x13], %[input_depth]\n" + // The height 1 / width 2 loop loads an extra 1x1 output in anticipation + // for the next iteration. 
Make sure |output_window_width| is large + // enough to handle the additional load, otherwise jump to the + // appropriate label to handle smaller widths. + "cmp w5, #2\n" + "ld1 {v17.8b}, [x14], %[input_depth]\n" + "ld1 {v18.8b}, [x14], %[input_depth]\n" + "ld1 {v19.8b}, [x14], %[input_depth]\n" + "ld1 {v21.4s}, [%[bias_ptr]]\n" + "ld1 {v22.4s}, [x10]\n" + "ld1 {v23.4s}, [%[bias_ptr]]\n" + "ld1 {v24.4s}, [x10]\n" + + "uaddw v9.8h, v26.8h, v9.8b\n" + "uaddw v10.8h, v26.8h, v10.8b\n" + "uaddw v11.8h, v26.8h, v11.8b\n" + "uaddw v13.8h, v26.8h, v13.8b\n" + "uaddw v14.8h, v26.8h, v14.8b\n" + "uaddw v15.8h, v26.8h, v15.8b\n" + "uaddw v17.8h, v26.8h, v17.8b\n" + "uaddw v18.8h, v26.8h, v18.8b\n" + "uaddw v19.8h, v26.8h, v19.8b\n" + + "beq " DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LEFTOVER "f\n" + "cmp w5, #1\n" + "beq " DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_1_LEFTOVER "f\n" + + //"loop_%=:\n" + DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LOOP ":\n" + // Load inputs for 3x4 input window which corresponds to a 1x2 output + // window. + "smlal v21.4s, v0.4h, v9.4h\n" + "ld1 {v12.8b}, [x12]\n" + "smlal2 v22.4s, v0.8h, v9.8h\n" + "ld1 {v16.8b}, [x13]\n" + "smlal v23.4s, v0.4h, v10.4h\n" + "ld1 {v20.8b}, [x14]\n" + "smlal2 v24.4s, v0.8h, v10.8h\n" + "subs w5, w5, #2\n" + "smlal v21.4s, v1.4h, v10.4h\n" + "cmp w5, #3\n" + "smlal2 v22.4s, v1.8h, v10.8h\n" + "add %[input_ptr], %[input_ptr], %[input_width_increment]\n" + "smlal v23.4s, v1.4h, v11.4h\n" + "mov x12, %[input_ptr]\n" + "smlal2 v24.4s, v1.8h, v11.8h\n" + "ld1 {v9.8b}, [x12], %[input_depth]\n" + "smlal v21.4s, v2.4h, v11.4h\n" + "ld1 {v10.8b}, [x12], %[input_depth]\n" + "uaddw v12.8h, v26.8h, v12.8b\n" + "smlal2 v22.4s, v2.8h, v11.8h\n" + "ld1 {v11.8b}, [x12], %[input_depth]\n" + "add x13, %[input_ptr], %[input_row_size]\n" + "smlal v23.4s, v2.4h, v12.4h\n" + "add x14, x13, %[input_row_size]\n" + "smlal2 v24.4s, v2.8h, v12.8h\n" + "smlal v21.4s, v3.4h, v13.4h\n" + "add x15, x14, %[input_row_size]\n" + "smlal2 v22.4s, v3.8h, v13.8h\n" + "ld1 {v13.8b}, [x13], %[input_depth]\n" + "smlal v23.4s, v3.4h, v14.4h\n" + "smlal2 v24.4s, v3.8h, v14.8h\n" + "smlal v21.4s, v4.4h, v14.4h\n" + "smlal2 v22.4s, v4.8h, v14.8h\n" + "ld1 {v14.8b}, [x13], %[input_depth]\n" + "smlal v23.4s, v4.4h, v15.4h\n" + "smlal2 v24.4s, v4.8h, v15.8h\n" + "smlal v21.4s, v5.4h, v15.4h\n" + "uaddw v16.8h, v26.8h, v16.8b\n" + "smlal2 v22.4s, v5.8h, v15.8h\n" + "ld1 {v15.8b}, [x13], %[input_depth]\n" + "smlal v23.4s, v5.4h, v16.4h\n" + "smlal2 v24.4s, v5.8h, v16.8h\n" + "smlal v21.4s, v6.4h, v17.4h\n" + "smlal2 v22.4s, v6.8h, v17.8h\n" + "ld1 {v17.8b}, [x14], %[input_depth]\n" + "smlal v23.4s, v6.4h, v18.4h\n" + "smlal2 v24.4s, v6.8h, v18.8h\n" + "smlal v21.4s, v7.4h, v18.4h\n" + "smlal2 v22.4s, v7.8h, v18.8h\n" + "ld1 {v18.8b}, [x14], %[input_depth]\n" + "smlal v23.4s, v7.4h, v19.4h\n" + "smlal2 v24.4s, v7.8h, v19.8h\n" + "smlal v21.4s, v8.4h, v19.4h\n" + "uaddw v20.8h, v26.8h, v20.8b\n" + "smlal2 v22.4s, v8.8h, v19.8h\n" + "ld1 {v19.8b}, [x14], %[input_depth]\n" + "smlal v23.4s, v8.4h, v20.4h\n" + "smlal2 v24.4s, v8.8h, v20.8h\n" + + "sqrdmulh v21.4s, v21.4s, v27.4s\n" + "sqrdmulh v22.4s, v22.4s, v27.4s\n" + "sqrdmulh v23.4s, v23.4s, v27.4s\n" + "sqrdmulh v24.4s, v24.4s, v27.4s\n" + "and v25.16b, v21.16b, v28.16b\n" + "and v29.16b, v22.16b, v28.16b\n" + "and v30.16b, v23.16b, v28.16b\n" + "and v31.16b, v24.16b, v28.16b\n" + "sshr v25.4s, v25.4s, #31\n" + "sshr v29.4s, v29.4s, #31\n" + "sshr v30.4s, v30.4s, #31\n" + "sshr v31.4s, v31.4s, #31\n" + "sqadd v21.4s, v21.4s, v25.4s\n" + "sqadd v22.4s, 
v22.4s, v29.4s\n" + "dup v29.4s, w2\n" + "sqadd v23.4s, v23.4s, v30.4s\n" + "dup v30.4s, w4\n" + "sqadd v24.4s, v24.4s, v31.4s\n" + "dup v31.4s, w0\n" + "srshl v21.4s, v21.4s, v28.4s\n" + "srshl v22.4s, v22.4s, v28.4s\n" + "srshl v23.4s, v23.4s, v28.4s\n" + "srshl v24.4s, v24.4s, v28.4s\n" + "add v21.4s, v21.4s, v29.4s\n" + "add v22.4s, v22.4s, v29.4s\n" + "add v23.4s, v23.4s, v29.4s\n" + "add v24.4s, v24.4s, v29.4s\n" + "smax v21.4s, v21.4s, v30.4s\n" + "smax v22.4s, v22.4s, v30.4s\n" + "smax v23.4s, v23.4s, v30.4s\n" + "smax v24.4s, v24.4s, v30.4s\n" + "smin v21.4s, v21.4s, v31.4s\n" + "smin v22.4s, v22.4s, v31.4s\n" + "smin v23.4s, v23.4s, v31.4s\n" + "smin v24.4s, v24.4s, v31.4s\n" + "sqxtn v21.4h, v21.4s\n" + "sqxtn v23.4h, v23.4s\n" + "sqxtn2 v21.8h, v22.4s\n" + "ld1 {v22.4s}, [x10]\n" + "sqxtn2 v23.8h, v24.4s\n" + "ld1 {v24.4s}, [x10]\n" + "sqxtun v21.8b, v21.8h\n" + "sqxtun v23.8b, v23.8h\n" + "uaddw v9.8h, v26.8h, v9.8b\n" + "st1 {v21.8b}, [%[output_ptr]], x3\n" + "uaddw v10.8h, v26.8h, v10.8b\n" + "st1 {v23.8b}, [%[output_ptr]], x3\n" + "uaddw v11.8h, v26.8h, v11.8b\n" + "uaddw v12.8h, v26.8h, v12.8b\n" + "uaddw v13.8h, v26.8h, v13.8b\n" + "uaddw v14.8h, v26.8h, v14.8b\n" + "uaddw v15.8h, v26.8h, v15.8b\n" + "ld1 {v21.4s}, [%[bias_ptr]]\n" + "uaddw v16.8h, v26.8h, v16.8b\n" + "ld1 {v23.4s}, [%[bias_ptr]]\n" + "uaddw v17.8h, v26.8h, v17.8b\n" + "uaddw v18.8h, v26.8h, v18.8b\n" + "uaddw v19.8h, v26.8h, v19.8b\n" + "uaddw v20.8h, v26.8h, v20.8b\n" + + "bge " DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LOOP "b\n" + + // At this point, there will be one of 2 width or 1 width leftover, + // not both. + "cmp w5, #2\n" + "blt " DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_1_LEFTOVER "f\n" + + // Handle last two horizontal outputs if exists. + DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LEFTOVER ":\n" + "smlal v21.4s, v0.4h, v9.4h\n" + "ld1 {v12.8b}, [x12], %[input_depth]\n" + "smlal2 v22.4s, v0.8h, v9.8h\n" + "ld1 {v16.8b}, [x13], %[input_depth]\n" + "smlal v23.4s, v0.4h, v10.4h\n" + "ld1 {v20.8b}, [x14], %[input_depth]\n" + "smlal2 v24.4s, v0.8h, v10.8h\n" + "smlal v21.4s, v1.4h, v10.4h\n" + "smlal2 v22.4s, v1.8h, v10.8h\n" + "smlal v23.4s, v1.4h, v11.4h\n" + "smlal2 v24.4s, v1.8h, v11.8h\n" + "smlal v21.4s, v2.4h, v11.4h\n" + "uaddw v12.8h, v26.8h, v12.8b\n" + "smlal2 v22.4s, v2.8h, v11.8h\n" + "smlal v23.4s, v2.4h, v12.4h\n" + "smlal2 v24.4s, v2.8h, v12.8h\n" + "smlal v21.4s, v3.4h, v13.4h\n" + "smlal2 v22.4s, v3.8h, v13.8h\n" + "smlal v23.4s, v3.4h, v14.4h\n" + "smlal2 v24.4s, v3.8h, v14.8h\n" + "smlal v21.4s, v4.4h, v14.4h\n" + "smlal2 v22.4s, v4.8h, v14.8h\n" + "smlal v23.4s, v4.4h, v15.4h\n" + "smlal2 v24.4s, v4.8h, v15.8h\n" + "smlal v21.4s, v5.4h, v15.4h\n" + "uaddw v16.8h, v26.8h, v16.8b\n" + "smlal2 v22.4s, v5.8h, v15.8h\n" + "smlal v23.4s, v5.4h, v16.4h\n" + "smlal2 v24.4s, v5.8h, v16.8h\n" + "smlal v21.4s, v6.4h, v17.4h\n" + "smlal2 v22.4s, v6.8h, v17.8h\n" + "smlal v23.4s, v6.4h, v18.4h\n" + "smlal2 v24.4s, v6.8h, v18.8h\n" + "smlal v21.4s, v7.4h, v18.4h\n" + "smlal2 v22.4s, v7.8h, v18.8h\n" + "smlal v23.4s, v7.4h, v19.4h\n" + "smlal2 v24.4s, v7.8h, v19.8h\n" + "smlal v21.4s, v8.4h, v19.4h\n" + "uaddw v20.8h, v26.8h, v20.8b\n" + "smlal2 v22.4s, v8.8h, v19.8h\n" + "smlal v23.4s, v8.4h, v20.4h\n" + "smlal2 v24.4s, v8.8h, v20.8h\n" + + "sqrdmulh v21.4s, v21.4s, v27.4s\n" + "sqrdmulh v22.4s, v22.4s, v27.4s\n" + "sqrdmulh v23.4s, v23.4s, v27.4s\n" + "sqrdmulh v24.4s, v24.4s, v27.4s\n" + "and v25.16b, v21.16b, v28.16b\n" + "and v29.16b, v22.16b, v28.16b\n" + "and v30.16b, v23.16b, v28.16b\n" + "and 
v31.16b, v24.16b, v28.16b\n" + "sshr v25.4s, v25.4s, #31\n" + "sshr v29.4s, v29.4s, #31\n" + "sshr v30.4s, v30.4s, #31\n" + "sshr v31.4s, v31.4s, #31\n" + "sqadd v21.4s, v21.4s, v25.4s\n" + "sqadd v22.4s, v22.4s, v29.4s\n" + "dup v29.4s, w2\n" + "sqadd v23.4s, v23.4s, v30.4s\n" + "dup v30.4s, w4\n" + "sqadd v24.4s, v24.4s, v31.4s\n" + "dup v31.4s, w0\n" + "srshl v21.4s, v21.4s, v28.4s\n" + "srshl v22.4s, v22.4s, v28.4s\n" + "srshl v23.4s, v23.4s, v28.4s\n" + "srshl v24.4s, v24.4s, v28.4s\n" + "add v21.4s, v21.4s, v29.4s\n" + "add v22.4s, v22.4s, v29.4s\n" + "add v23.4s, v23.4s, v29.4s\n" + "add v24.4s, v24.4s, v29.4s\n" + "smax v21.4s, v21.4s, v30.4s\n" + "smax v22.4s, v22.4s, v30.4s\n" + "smax v23.4s, v23.4s, v30.4s\n" + "smax v24.4s, v24.4s, v30.4s\n" + "smin v21.4s, v21.4s, v31.4s\n" + "smin v22.4s, v22.4s, v31.4s\n" + "smin v23.4s, v23.4s, v31.4s\n" + "smin v24.4s, v24.4s, v31.4s\n" + "sqxtn v21.4h, v21.4s\n" + "sqxtn v23.4h, v23.4s\n" + "sqxtn2 v21.8h, v22.4s\n" + "sqxtn2 v23.8h, v24.4s\n" + "sqxtun v21.8b, v21.8h\n" + "sqxtun v23.8b, v23.8h\n" + "st1 {v21.8b}, [%[output_ptr]], x3\n" + "st1 {v23.8b}, [%[output_ptr]], x3\n" + "b " DEPTHWISECONV_LABEL_HEIGHT_1_END "f\n" + + // Handle bottom right output if exists. + DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_1_LEFTOVER ":\n" + "smlal v21.4s, v0.4h, v9.4h\n" + "smlal2 v22.4s, v0.8h, v9.8h\n" + "smlal v21.4s, v1.4h, v10.4h\n" + "smlal2 v22.4s, v1.8h, v10.8h\n" + "smlal v21.4s, v2.4h, v11.4h\n" + "smlal2 v22.4s, v2.8h, v11.8h\n" + "smlal v21.4s, v3.4h, v13.4h\n" + "smlal2 v22.4s, v3.8h, v13.8h\n" + "smlal v21.4s, v4.4h, v14.4h\n" + "smlal2 v22.4s, v4.8h, v14.8h\n" + "smlal v21.4s, v5.4h, v15.4h\n" + "smlal2 v22.4s, v5.8h, v15.8h\n" + "smlal v21.4s, v6.4h, v17.4h\n" + "smlal2 v22.4s, v6.8h, v17.8h\n" + "smlal v21.4s, v7.4h, v18.4h\n" + "smlal2 v22.4s, v7.8h, v18.8h\n" + "smlal v21.4s, v8.4h, v19.4h\n" + "smlal2 v22.4s, v8.8h, v19.8h\n" + + "sqrdmulh v21.4s, v21.4s, v27.4s\n" + "sqrdmulh v22.4s, v22.4s, v27.4s\n" + "and v9.16b, v21.16b, v28.16b\n" + "and v12.16b, v22.16b, v28.16b\n" + "sshr v9.4s, v9.4s, #31\n" + "sshr v12.4s, v12.4s, #31\n" + "sqadd v21.4s, v21.4s, v9.4s\n" + "sqadd v22.4s, v22.4s, v12.4s\n" + "srshl v21.4s, v21.4s, v28.4s\n" + "srshl v22.4s, v22.4s, v28.4s\n" + "add v21.4s, v21.4s, v29.4s\n" + "add v22.4s, v22.4s, v29.4s\n" + "smax v21.4s, v21.4s, v30.4s\n" + "smax v22.4s, v22.4s, v30.4s\n" + "smin v21.4s, v21.4s, v31.4s\n" + "smin v22.4s, v22.4s, v31.4s\n" + "sqxtn v21.4h, v21.4s\n" + "sqxtn2 v21.8h, v22.4s\n" + "sqxtun v21.8b, v21.8h\n" + "st1 {v21.8b}, [%[output_ptr]]\n" + DEPTHWISECONV_LABEL_HEIGHT_1_END ":\n" + : + // Outputs. + [filter_ptr] "+r"(filter_ptr), [input_ptr] "+r"(input_ptr), + [output_ptr] "+r"(output_ptr), + [output_window_height] "+r"(output_window_height) + : + // Inputs. + [bias_ptr] "r"(bias_ptr), [input_row_size] "r"(input_row_size), + [input_depth] "r"(input_depth), + [output_window_width] "r"(output_window_width), + [input_width_increment] "r"(input_width_increment), + [input_height_increment] "r"(input_height_increment), + [output_height_increment] "r"(output_height_increment), + [params_ptr] "r"(params_ptr) + : + // Clobbers. + "cc", "memory", + // We use these NEON registers. + "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", + "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", + "v30", "v31", + // We use these general-purpose registers. 
+ "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", + "x9", "x10", "x11", "x12", "x13", "x14", "x15"); +#undef DEPTHWISECONV_LABEL_HEIGHT_2_LOOP +#undef DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LOOP +#undef DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_1_LEFTOVER +#undef DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LEFTOVER +#undef DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_AFTER_LOOP +#undef DEPTHWISECONV_LABEL_HEIGHT_2_AFTER_LOOP +#undef DEPTHWISECONV_LABEL_HEIGHT_1 +#undef DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LOOP +#undef DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_1_LEFTOVER +#undef DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LEFTOVER +#undef DEPTHWISECONV_LABEL_HEIGHT_1_END } }; template <> -struct ConvKernel3x3FilterDepth8<4, 4, 2, 2> { - static inline void Run(const uint8* input_ptr, int input_depth, - int32 input_offset, int input_row_size, - const uint8* filter_ptr, int32 filter_offset, - const int32* bias_ptr, int32 output_offset, - int32 output_multiplier, int output_shift, - int32 output_activation_min, - int32 output_activation_max, uint8* output_ptr, - int output_depth, int output_width) { - // Reuse 4x2 kernel twice. - ConvKernel3x3FilterDepth8<4, 2, 2, 2>::Run( - input_ptr, input_depth, input_offset, input_row_size, filter_ptr, - filter_offset, bias_ptr, output_offset, output_multiplier, output_shift, - output_activation_min, output_activation_max, output_ptr, output_depth, - output_width); - - ConvKernel3x3FilterDepth8<4, 2, 2, 2>::Run( - input_ptr + 4 * input_depth, input_depth, input_offset, input_row_size, - filter_ptr, filter_offset, bias_ptr, output_offset, output_multiplier, - output_shift, output_activation_min, output_activation_max, - output_ptr + 2 * output_depth, output_depth, output_width); +struct DepthwiseConvWindow<8, 2, 2> { + static inline void Run(const uint8* input_ptr, const uint8* filter_ptr, + const int32* bias_ptr, uint8* output_ptr, int64_t input_depth, + int64_t input_row_size, int32 output_window_height, + int32 output_window_width, + const DepthwiseConvParams* params_ptr) { + const int64_t input_width_increment = 4 * input_depth; + const int64_t input_height_increment = 4 * input_row_size; + const int64_t output_height_increment = 2 * params_ptr->output_row_size; + +#define DEPTHWISECONV_LABEL_HEIGHT_2_LOOP "1" +#define DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LOOP "2" +#define DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_1_LEFTOVER "3" +#define DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LEFTOVER "4" +#define DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_AFTER_LOOP "5" +#define DEPTHWISECONV_LABEL_HEIGHT_2_AFTER_LOOP "6" +#define DEPTHWISECONV_LABEL_HEIGHT_1 "7" +#define DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LOOP "8" +#define DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_1_LEFTOVER "9" +#define DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LEFTOVER "10" +#define DEPTHWISECONV_LABEL_HEIGHT_1_END "11" + + asm volatile( + // Performs depthwise convolutions for a window specified by + // |output_window_height| and |output_window_width|. The inner-most loop + // processes 2x2 outputs, and any leftovers at the end. + // + // Algorithm works as follows: + // + // 1. Load filters of 8 depth (8x3x3). Registers v0--v8 hold filter + // values. + // 2. For 2 output heights at a time: + // i. For 2 output widths at a time at stride 2, a 5x5 input + // window is required. To avoid register exhaustion, we load + // the first 2 rows of the 5x5 input window into registers + // v9--v18, and use the same registers to load the next 2 + // rows, and finally v9--v13 to load the last row. 
+ // Accumulators for all 2x2 outputs are reserved by registers + // v21-v22 (top left output), v23-v24 (top right output), + // v19-v20 (bottom left output), v25-v26 (bottom right + // output). + // ii. Handle single leftover width if exists. + // 3. Handle single leftover height if exists. + // i. For 2 output widths at a time at stride 2, load inputs for + // a 1x2 (1 height, 2 width) output window (3x5 input + // window). Registers v9--v24 hold input values. Mul-add with + // accumulators v24--v27. + // ii. Handle single leftover width if exists. + // + // Loads are placed as soon as the register is no longer needed and + // interleaved with arithmetic operations to take advantage of + // dual-issue pipelines. We also add input offsets as far from the loads + // as possible to give loads enough cycles to fetch data from memory. + + // Set "constant" registers. These registers may be replaced with temp + // values from time to time when there are not enough NEON registers. + // We use x9--x15 general purpose registers as they are caller-saved + // temporary registers (see http://infocenter.arm.com/help/topic/com.arm.doc.ihi0055b/IHI0055B_aapcs64.pdf). // NOLINT + "ldr w9, [%[params_ptr], #" STR(OFFSET_OUTPUT_RIGHT_SHIFT) "]\n" + "ldr w0, [%[params_ptr], #" STR(OFFSET_INPUT_OFFSET) "]\n" + "cmp %w[output_window_height], #2\n" + "dup v28.8h, w0\n" + "neg w9, w9\n" + "ldr w1, [%[params_ptr], #" STR(OFFSET_OUTPUT_MULTIPLIER) "]\n" + "dup v26.4s, w9\n" + "ldr w2, [%[params_ptr], #" STR(OFFSET_OUTPUT_OFFSET) "]\n" + "dup v27.4s, w1\n" + "ldr w3, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MIN) "]\n" + "dup v29.4s, w2\n" + "ldr w4, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MAX) "]\n" + "dup v30.4s, w3\n" + "ldr x5, [%[params_ptr], #" STR(OFFSET_OUTPUT_DEPTH) "]\n" + "dup v31.4s, w4\n" + "ldr x19, [%[params_ptr], #" STR(OFFSET_OUTPUT_ROW_SIZE) "]\n" + "ldr w20, [%[params_ptr], #" STR(OFFSET_FILTER_OFFSET) "]\n" + + // Load filters and add offsets. + "add x10, %[bias_ptr], #16\n" + "ld1 {v0.8b}, [%[filter_ptr]], x5\n" + "dup v9.8h, w20\n" + "ld1 {v1.8b}, [%[filter_ptr]], x5\n" + "uaddw v0.8h, v9.8h, v0.8b\n" + "ld1 {v2.8b}, [%[filter_ptr]], x5\n" + "uaddw v1.8h, v9.8h, v1.8b\n" + "ld1 {v3.8b}, [%[filter_ptr]], x5\n" + "uaddw v2.8h, v9.8h, v2.8b\n" + "ld1 {v4.8b}, [%[filter_ptr]], x5\n" + "uaddw v3.8h, v9.8h, v3.8b\n" + "ld1 {v5.8b}, [%[filter_ptr]], x5\n" + "uaddw v4.8h, v9.8h, v4.8b\n" + "ld1 {v6.8b}, [%[filter_ptr]], x5\n" + "uaddw v5.8h, v9.8h, v5.8b\n" + "ld1 {v7.8b}, [%[filter_ptr]], x5\n" + "uaddw v6.8h, v9.8h, v6.8b\n" + "ld1 {v8.8b}, [%[filter_ptr]]\n" + "uaddw v7.8h, v9.8h, v7.8b\n" + "uaddw v8.8h, v9.8h, v8.8b\n" + + "blt " DEPTHWISECONV_LABEL_HEIGHT_2_AFTER_LOOP "f\n" + + //"loop_%=:\n" + DEPTHWISECONV_LABEL_HEIGHT_2_LOOP ":\n" + // Load the first two rows of the 5x5 input window, then reuse the + // same registers to load subsequent rows as they become available. + "mov x11, %[input_ptr]\n" + "mov x12, x11\n" + "add x13, x12, %[input_row_size]\n" + "ld1 {v9.8b}, [x12], %[input_depth]\n" + "mov w14, %w[output_window_width]\n" + "ld1 {v10.8b}, [x12], %[input_depth]\n" + // The height 2 / width 2 loop loads an extra 1 output horizontally in + // anticipation for the next iteration. Make sure + // |output_window_width| is large enough to handle the additional + // load, otherwise jump to the appropriate label to handle smaller + // widths. 
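For reference, a hedged scalar sketch of what the stride-2 window kernel above accumulates for one 8-channel output pixel before requantization. The function name and the explicit tap/row strides are illustrative assumptions, not part of this diff; the assembly does the same widen-with-offset (uaddw) and multiply-accumulate (smlal/smlal2) work across v0--v8 (filters) and v9--v20 (inputs), interleaved over four output pixels at once.

// Hedged reference sketch (illustrative, not from this diff): one 8-deep
// output pixel of the 3x3 depthwise convolution, accumulated in int32 the
// same way the smlal/smlal2 instructions above do.
#include <cstdint>

inline void Reference3x3DepthwiseAccumulate(
    const uint8_t* input_ptr,   // top-left of the 3x3 input patch
    const uint8_t* filter_ptr,  // 9 filter taps, each 8 channels deep
    const int32_t* bias_ptr, int32_t input_offset, int32_t filter_offset,
    int64_t input_depth, int64_t input_row_size, int64_t filter_tap_stride,
    int32_t acc[8]) {
  for (int c = 0; c < 8; ++c) acc[c] = bias_ptr[c];
  for (int ky = 0; ky < 3; ++ky) {
    for (int kx = 0; kx < 3; ++kx) {
      const uint8_t* in = input_ptr + ky * input_row_size + kx * input_depth;
      const uint8_t* flt = filter_ptr + (ky * 3 + kx) * filter_tap_stride;
      for (int c = 0; c < 8; ++c) {
        // uaddw: widen uint8 to int16 with the zero-point offset applied;
        // smlal/smlal2: multiply-accumulate into 32-bit lanes.
        acc[c] += (static_cast<int32_t>(in[c]) + input_offset) *
                  (static_cast<int32_t>(flt[c]) + filter_offset);
      }
    }
  }
}

The requantization of these accumulators (sqrdmulh, rounding shift, offset, clamp, narrow) is sketched separately at the end of this section.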
+ "cmp w14, #2\n" + "ld1 {v11.8b}, [x12], %[input_depth]\n" + "add x15, x13, %[input_row_size]\n" + "ld1 {v14.8b}, [x13], %[input_depth]\n" + "mov x6, %[output_ptr]\n" + "ld1 {v15.8b}, [x13], %[input_depth]\n" + "add x7, %[output_ptr], x19\n" + "ld1 {v16.8b}, [x13], %[input_depth]\n" + "ld1 {v21.4s}, [%[bias_ptr]]\n" + "ld1 {v22.4s}, [x10]\n" + "ld1 {v23.4s}, [%[bias_ptr]]\n" + "uaddw v9.8h, v28.8h, v9.8b\n" + "ld1 {v24.4s}, [x10]\n" + "uaddw v10.8h, v28.8h, v10.8b\n" + "ld1 {v19.4s}, [%[bias_ptr]]\n" + "uaddw v11.8h, v28.8h, v11.8b\n" + "ld1 {v20.4s}, [x10]\n" + "uaddw v14.8h, v28.8h, v14.8b\n" + "ld1 {v25.4s}, [%[bias_ptr]]\n" + "uaddw v15.8h, v28.8h, v15.8b\n" + "ld1 {v26.4s}, [x10]\n" + "uaddw v16.8h, v28.8h, v16.8b\n" + + "beq " DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LEFTOVER "f\n" + "cmp w14, #1\n" + "beq " DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_1_LEFTOVER "f\n" + + //"loop_%=:\n" + DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LOOP ":\n" + "smlal v21.4s, v0.4h, v9.4h\n" + "ld1 {v12.8b}, [x12], %[input_depth]\n" + "smlal2 v22.4s, v0.8h, v9.8h\n" + "ld1 {v13.8b}, [x12]\n" + "add x12, x15, %[input_row_size]\n" + "smlal v23.4s, v0.4h, v11.4h\n" + "ld1 {v17.8b}, [x13], %[input_depth]\n" + "smlal2 v24.4s, v0.8h, v11.8h\n" + "ld1 {v18.8b}, [x13]\n" + "add x13, x12, %[input_row_size]\n" + "smlal v21.4s, v1.4h, v10.4h\n" + "ld1 {v9.8b}, [x15], %[input_depth]\n" + "smlal2 v22.4s, v1.8h, v10.8h\n" + "ld1 {v10.8b}, [x15], %[input_depth]\n" + "smlal v21.4s, v2.4h, v11.4h\n" + "smlal2 v22.4s, v2.8h, v11.8h\n" + "ld1 {v11.8b}, [x15], %[input_depth]\n" + "smlal v21.4s, v3.4h, v14.4h\n" + "smlal2 v22.4s, v3.8h, v14.8h\n" + "ld1 {v14.8b}, [x12], %[input_depth]\n" + "smlal v23.4s, v3.4h, v16.4h\n" + "subs w14, w14, #2\n" + "smlal2 v24.4s, v3.8h, v16.8h\n" + "cmp w14, #3\n" + "smlal v21.4s, v4.4h, v15.4h\n" + "uaddw v12.8h, v28.8h, v12.8b\n" + "smlal2 v22.4s, v4.8h, v15.8h\n" + "ld1 {v15.8b}, [x12], %[input_depth]\n" + "smlal v21.4s, v5.4h, v16.4h\n" + "uaddw v13.8h, v28.8h, v13.8b\n" + "smlal2 v22.4s, v5.8h, v16.8h\n" + "ld1 {v16.8b}, [x12], %[input_depth]\n" + "smlal v23.4s, v1.4h, v12.4h\n" + "uaddw v17.8h, v28.8h, v17.8b\n" + "smlal2 v24.4s, v1.8h, v12.8h\n" + "ld1 {v12.8b}, [x15], %[input_depth]\n" + "smlal v23.4s, v2.4h, v13.4h\n" + "uaddw v18.8h, v28.8h, v18.8b\n" + "smlal2 v24.4s, v2.8h, v13.8h\n" + "ld1 {v13.8b}, [x15]\n" + "smlal v23.4s, v4.4h, v17.4h\n" + "uaddw v9.8h, v28.8h, v9.8b\n" + "smlal2 v24.4s, v4.8h, v17.8h\n" + "ld1 {v17.8b}, [x12], %[input_depth]\n" + "smlal v23.4s, v5.4h, v18.4h\n" + "uaddw v10.8h, v28.8h, v10.8b\n" + "smlal2 v24.4s, v5.8h, v18.8h\n" + "ld1 {v18.8b}, [x12]\n" + + "smlal v21.4s, v6.4h, v9.4h\n" + "smlal2 v22.4s, v6.8h, v9.8h\n" + "smlal v19.4s, v0.4h, v9.4h\n" + "uaddw v11.8h, v28.8h, v11.8b\n" + "smlal2 v20.4s, v0.8h, v9.8h\n" + "ld1 {v9.8b}, [x13], %[input_depth]\n" + "smlal v23.4s, v6.4h, v11.4h\n" + "smlal2 v24.4s, v6.8h, v11.8h\n" + "smlal v21.4s, v7.4h, v10.4h\n" + "smlal2 v22.4s, v7.8h, v10.8h\n" + "uaddw v12.8h, v28.8h, v12.8b\n" + "smlal v19.4s, v1.4h, v10.4h\n" + "smlal2 v20.4s, v1.8h, v10.8h\n" + "ld1 {v10.8b}, [x13], %[input_depth]\n" + "smlal v23.4s, v7.4h, v12.4h\n" + "smlal2 v24.4s, v7.8h, v12.8h\n" + "smlal v25.4s, v1.4h, v12.4h\n" + "smlal2 v26.4s, v1.8h, v12.8h\n" + "smlal v21.4s, v8.4h, v11.4h\n" + "smlal2 v22.4s, v8.8h, v11.8h\n" + "add x11, x11, %[input_width_increment]\n" + "smlal v19.4s, v2.4h, v11.4h\n" + "mov x12, x11\n" + "smlal2 v20.4s, v2.8h, v11.8h\n" + "uaddw v13.8h, v28.8h, v13.8b\n" + "smlal v25.4s, v0.4h, v11.4h\n" + "smlal2 v26.4s, v0.8h, 
v11.8h\n" + "ld1 {v11.8b}, [x13], %[input_depth]\n" + "smlal v23.4s, v8.4h, v13.4h\n" + "ld1 {v12.8b}, [x13], %[input_depth]\n" + "smlal2 v24.4s, v8.8h, v13.8h\n" + "smlal v25.4s, v2.4h, v13.4h\n" + "smlal2 v26.4s, v2.8h, v13.8h\n" + "ld1 {v13.8b}, [x13]\n" + "add x13, x12, %[input_row_size]\n" + "add x15, x13, %[input_row_size]\n" + + "dup v28.4s, w9\n" + "sqrdmulh v21.4s, v21.4s, v27.4s\n" + "sqrdmulh v22.4s, v22.4s, v27.4s\n" + "sqrdmulh v23.4s, v23.4s, v27.4s\n" + "sqrdmulh v24.4s, v24.4s, v27.4s\n" + "and v27.16b, v21.16b, v28.16b\n" + "and v29.16b, v22.16b, v28.16b\n" + "and v30.16b, v23.16b, v28.16b\n" + "and v31.16b, v24.16b, v28.16b\n" + "sshr v27.4s, v27.4s, #31\n" + "sshr v29.4s, v29.4s, #31\n" + "sshr v30.4s, v30.4s, #31\n" + "sshr v31.4s, v31.4s, #31\n" + "sqadd v21.4s, v21.4s, v27.4s\n" + "dup v27.4s, w1\n" + "sqadd v22.4s, v22.4s, v29.4s\n" + "dup v29.4s, w2\n" + "sqadd v23.4s, v23.4s, v30.4s\n" + "dup v30.4s, w3\n" + "sqadd v24.4s, v24.4s, v31.4s\n" + "dup v31.4s, w4\n" + "srshl v21.4s, v21.4s, v28.4s\n" + "srshl v22.4s, v22.4s, v28.4s\n" + "srshl v23.4s, v23.4s, v28.4s\n" + "srshl v24.4s, v24.4s, v28.4s\n" + "dup v28.8h, w0\n" + "add v21.4s, v21.4s, v29.4s\n" + "add v22.4s, v22.4s, v29.4s\n" + "add v23.4s, v23.4s, v29.4s\n" + "add v24.4s, v24.4s, v29.4s\n" + "smax v21.4s, v21.4s, v30.4s\n" + "smax v22.4s, v22.4s, v30.4s\n" + "smax v23.4s, v23.4s, v30.4s\n" + "smax v24.4s, v24.4s, v30.4s\n" + "smin v21.4s, v21.4s, v31.4s\n" + "smin v22.4s, v22.4s, v31.4s\n" + "smin v23.4s, v23.4s, v31.4s\n" + "smin v24.4s, v24.4s, v31.4s\n" + "sqxtn v21.4h, v21.4s\n" + "sqxtn v23.4h, v23.4s\n" + "sqxtn2 v21.8h, v22.4s\n" + "ld1 {v22.4s}, [x10]\n" + "sqxtn2 v23.8h, v24.4s\n" + "ld1 {v24.4s}, [x10]\n" + "sqxtun v21.8b, v21.8h\n" + "sqxtun v23.8b, v23.8h\n" + "uaddw v9.8h, v28.8h, v9.8b\n" + "st1 {v21.8b}, [x6], x5\n" + "uaddw v10.8h, v28.8h, v10.8b\n" + "st1 {v23.8b}, [x6], x5\n" + "uaddw v11.8h, v28.8h, v11.8b\n" + + "smlal v19.4s, v6.4h, v9.4h\n" + "smlal2 v20.4s, v6.8h, v9.8h\n" + "ld1 {v9.8b}, [x12], %[input_depth]\n" + "smlal v25.4s, v6.4h, v11.4h\n" + "smlal2 v26.4s, v6.8h, v11.8h\n" + "smlal v19.4s, v7.4h, v10.4h\n" + "uaddw v12.8h, v28.8h, v12.8b\n" + "smlal2 v20.4s, v7.8h, v10.8h\n" + "ld1 {v10.8b}, [x12], %[input_depth]\n" + "smlal v25.4s, v7.4h, v12.4h\n" + "smlal2 v26.4s, v7.8h, v12.8h\n" + "smlal v19.4s, v8.4h, v11.4h\n" + "uaddw v13.8h, v28.8h, v13.8b\n" + "smlal2 v20.4s, v8.8h, v11.8h\n" + "ld1 {v11.8b}, [x12], %[input_depth]\n" + "smlal v25.4s, v8.4h, v13.4h\n" + "uaddw v14.8h, v28.8h, v14.8b\n" + "smlal2 v26.4s, v8.8h, v13.8h\n" + "uaddw v16.8h, v28.8h, v16.8b\n" + "smlal v19.4s, v3.4h, v14.4h\n" + "uaddw v15.8h, v28.8h, v15.8b\n" + "smlal2 v20.4s, v3.8h, v14.8h\n" + "ld1 {v14.8b}, [x13], %[input_depth]\n" + "smlal v25.4s, v3.4h, v16.4h\n" + "ld1 {v21.4s}, [%[bias_ptr]]\n" + "smlal2 v26.4s, v3.8h, v16.8h\n" + "ld1 {v23.4s}, [%[bias_ptr]]\n" + "smlal v19.4s, v4.4h, v15.4h\n" + "uaddw v17.8h, v28.8h, v17.8b\n" + "smlal2 v20.4s, v4.8h, v15.8h\n" + "ld1 {v15.8b}, [x13], %[input_depth]\n" + "smlal v25.4s, v4.4h, v17.4h\n" + "smlal2 v26.4s, v4.8h, v17.8h\n" + "smlal v19.4s, v5.4h, v16.4h\n" + "uaddw v18.8h, v28.8h, v18.8b\n" + "smlal2 v20.4s, v5.8h, v16.8h\n" + "ld1 {v16.8b}, [x13], %[input_depth]\n" + "smlal v25.4s, v5.4h, v18.4h\n" + "smlal2 v26.4s, v5.8h, v18.8h\n" + + "dup v28.4s, w9\n" + "sqrdmulh v19.4s, v19.4s, v27.4s\n" + "sqrdmulh v20.4s, v20.4s, v27.4s\n" + "sqrdmulh v25.4s, v25.4s, v27.4s\n" + "sqrdmulh v26.4s, v26.4s, v27.4s\n" + "and v27.16b, v19.16b, v28.16b\n" + "and 
v29.16b, v20.16b, v28.16b\n" + "and v30.16b, v25.16b, v28.16b\n" + "and v31.16b, v26.16b, v28.16b\n" + "sshr v27.4s, v27.4s, #31\n" + "sshr v29.4s, v29.4s, #31\n" + "sshr v30.4s, v30.4s, #31\n" + "sshr v31.4s, v31.4s, #31\n" + "sqadd v19.4s, v19.4s, v27.4s\n" + "dup v27.4s, w1\n" + "sqadd v20.4s, v20.4s, v29.4s\n" + "dup v29.4s, w2\n" + "sqadd v25.4s, v25.4s, v30.4s\n" + "dup v30.4s, w3\n" + "sqadd v26.4s, v26.4s, v31.4s\n" + "dup v31.4s, w4\n" + "srshl v19.4s, v19.4s, v28.4s\n" + "srshl v20.4s, v20.4s, v28.4s\n" + "srshl v25.4s, v25.4s, v28.4s\n" + "srshl v26.4s, v26.4s, v28.4s\n" + "dup v28.8h, w0\n" + "add v19.4s, v19.4s, v29.4s\n" + "add v20.4s, v20.4s, v29.4s\n" + "add v25.4s, v25.4s, v29.4s\n" + "add v26.4s, v26.4s, v29.4s\n" + "smax v19.4s, v19.4s, v30.4s\n" + "smax v20.4s, v20.4s, v30.4s\n" + "smax v25.4s, v25.4s, v30.4s\n" + "smax v26.4s, v26.4s, v30.4s\n" + "smin v19.4s, v19.4s, v31.4s\n" + "smin v20.4s, v20.4s, v31.4s\n" + "smin v25.4s, v25.4s, v31.4s\n" + "smin v26.4s, v26.4s, v31.4s\n" + "sqxtn v19.4h, v19.4s\n" + "sqxtn v25.4h, v25.4s\n" + "sqxtn2 v19.8h, v20.4s\n" + "ld1 {v20.4s}, [x10]\n" + "sqxtn2 v25.8h, v26.4s\n" + "ld1 {v26.4s}, [x10]\n" + "sqxtun v19.8b, v19.8h\n" + "sqxtun v25.8b, v25.8h\n" + "uaddw v9.8h, v28.8h, v9.8b\n" + "st1 {v19.8b}, [x7], x5\n" + "uaddw v10.8h, v28.8h, v10.8b\n" + "st1 {v25.8b}, [x7], x5\n" + "uaddw v11.8h, v28.8h, v11.8b\n" + "ld1 {v19.4s}, [%[bias_ptr]]\n" + "uaddw v14.8h, v28.8h, v14.8b\n" + "ld1 {v25.4s}, [%[bias_ptr]]\n" + "uaddw v15.8h, v28.8h, v15.8b\n" + "uaddw v16.8h, v28.8h, v16.8b\n" + + "bge " DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LOOP "b\n" + + // At this point, there will be one of 2 width or 1 width leftover, + // not both. + "cmp w14, #2\n" + "blt " DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_1_LEFTOVER "f\n" + + // Handle last 2 columns if exists. 
+ DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LEFTOVER ":\n" + "smlal v21.4s, v0.4h, v9.4h\n" + "ld1 {v12.8b}, [x12], %[input_depth]\n" + "smlal2 v22.4s, v0.8h, v9.8h\n" + "ld1 {v13.8b}, [x12]\n" + "add x12, x15, %[input_row_size]\n" + "smlal v23.4s, v0.4h, v11.4h\n" + "ld1 {v17.8b}, [x13], %[input_depth]\n" + "smlal2 v24.4s, v0.8h, v11.8h\n" + "ld1 {v18.8b}, [x13]\n" + "add x13, x12, %[input_row_size]\n" + "smlal v21.4s, v1.4h, v10.4h\n" + "ld1 {v9.8b}, [x15], %[input_depth]\n" + "smlal2 v22.4s, v1.8h, v10.8h\n" + "ld1 {v10.8b}, [x15], %[input_depth]\n" + "smlal v21.4s, v2.4h, v11.4h\n" + "smlal2 v22.4s, v2.8h, v11.8h\n" + "ld1 {v11.8b}, [x15], %[input_depth]\n" + "smlal v21.4s, v3.4h, v14.4h\n" + "smlal2 v22.4s, v3.8h, v14.8h\n" + "ld1 {v14.8b}, [x12], %[input_depth]\n" + "smlal v23.4s, v3.4h, v16.4h\n" + "smlal2 v24.4s, v3.8h, v16.8h\n" + "smlal v21.4s, v4.4h, v15.4h\n" + "uaddw v12.8h, v28.8h, v12.8b\n" + "smlal2 v22.4s, v4.8h, v15.8h\n" + "ld1 {v15.8b}, [x12], %[input_depth]\n" + "smlal v21.4s, v5.4h, v16.4h\n" + "uaddw v13.8h, v28.8h, v13.8b\n" + "smlal2 v22.4s, v5.8h, v16.8h\n" + "ld1 {v16.8b}, [x12], %[input_depth]\n" + "smlal v23.4s, v1.4h, v12.4h\n" + "uaddw v17.8h, v28.8h, v17.8b\n" + "smlal2 v24.4s, v1.8h, v12.8h\n" + "ld1 {v12.8b}, [x15], %[input_depth]\n" + "smlal v23.4s, v2.4h, v13.4h\n" + "uaddw v18.8h, v28.8h, v18.8b\n" + "smlal2 v24.4s, v2.8h, v13.8h\n" + "ld1 {v13.8b}, [x15]\n" + "smlal v23.4s, v4.4h, v17.4h\n" + "uaddw v9.8h, v28.8h, v9.8b\n" + "smlal2 v24.4s, v4.8h, v17.8h\n" + "ld1 {v17.8b}, [x12], %[input_depth]\n" + "smlal v23.4s, v5.4h, v18.4h\n" + "uaddw v10.8h, v28.8h, v10.8b\n" + "smlal2 v24.4s, v5.8h, v18.8h\n" + "ld1 {v18.8b}, [x12]\n" + + "smlal v21.4s, v6.4h, v9.4h\n" + "smlal2 v22.4s, v6.8h, v9.8h\n" + "smlal v19.4s, v0.4h, v9.4h\n" + "uaddw v11.8h, v28.8h, v11.8b\n" + "smlal2 v20.4s, v0.8h, v9.8h\n" + "ld1 {v9.8b}, [x13], %[input_depth]\n" + "smlal v23.4s, v6.4h, v11.4h\n" + "smlal2 v24.4s, v6.8h, v11.8h\n" + "smlal v21.4s, v7.4h, v10.4h\n" + "smlal2 v22.4s, v7.8h, v10.8h\n" + "uaddw v12.8h, v28.8h, v12.8b\n" + "smlal v19.4s, v1.4h, v10.4h\n" + "smlal2 v20.4s, v1.8h, v10.8h\n" + "ld1 {v10.8b}, [x13], %[input_depth]\n" + "smlal v23.4s, v7.4h, v12.4h\n" + "smlal2 v24.4s, v7.8h, v12.8h\n" + "smlal v25.4s, v1.4h, v12.4h\n" + "smlal2 v26.4s, v1.8h, v12.8h\n" + "smlal v21.4s, v8.4h, v11.4h\n" + "smlal2 v22.4s, v8.8h, v11.8h\n" + "smlal v19.4s, v2.4h, v11.4h\n" + "smlal2 v20.4s, v2.8h, v11.8h\n" + "uaddw v13.8h, v28.8h, v13.8b\n" + "smlal v25.4s, v0.4h, v11.4h\n" + "smlal2 v26.4s, v0.8h, v11.8h\n" + "ld1 {v11.8b}, [x13], %[input_depth]\n" + "smlal v23.4s, v8.4h, v13.4h\n" + "ld1 {v12.8b}, [x13], %[input_depth]\n" + "smlal2 v24.4s, v8.8h, v13.8h\n" + "smlal v25.4s, v2.4h, v13.4h\n" + "smlal2 v26.4s, v2.8h, v13.8h\n" + "ld1 {v13.8b}, [x13]\n" + + "dup v28.4s, w9\n" + "sqrdmulh v21.4s, v21.4s, v27.4s\n" + "sqrdmulh v22.4s, v22.4s, v27.4s\n" + "sqrdmulh v23.4s, v23.4s, v27.4s\n" + "sqrdmulh v24.4s, v24.4s, v27.4s\n" + "and v27.16b, v21.16b, v28.16b\n" + "and v29.16b, v22.16b, v28.16b\n" + "and v30.16b, v23.16b, v28.16b\n" + "and v31.16b, v24.16b, v28.16b\n" + "sshr v27.4s, v27.4s, #31\n" + "sshr v29.4s, v29.4s, #31\n" + "sshr v30.4s, v30.4s, #31\n" + "sshr v31.4s, v31.4s, #31\n" + "sqadd v21.4s, v21.4s, v27.4s\n" + "dup v27.4s, w1\n" + "sqadd v22.4s, v22.4s, v29.4s\n" + "dup v29.4s, w2\n" + "sqadd v23.4s, v23.4s, v30.4s\n" + "dup v30.4s, w3\n" + "sqadd v24.4s, v24.4s, v31.4s\n" + "dup v31.4s, w4\n" + "srshl v21.4s, v21.4s, v28.4s\n" + "srshl v22.4s, v22.4s, v28.4s\n" + 
"srshl v23.4s, v23.4s, v28.4s\n" + "srshl v24.4s, v24.4s, v28.4s\n" + "dup v28.8h, w0\n" + "add v21.4s, v21.4s, v29.4s\n" + "add v22.4s, v22.4s, v29.4s\n" + "add v23.4s, v23.4s, v29.4s\n" + "add v24.4s, v24.4s, v29.4s\n" + "smax v21.4s, v21.4s, v30.4s\n" + "smax v22.4s, v22.4s, v30.4s\n" + "smax v23.4s, v23.4s, v30.4s\n" + "smax v24.4s, v24.4s, v30.4s\n" + "smin v21.4s, v21.4s, v31.4s\n" + "smin v22.4s, v22.4s, v31.4s\n" + "smin v23.4s, v23.4s, v31.4s\n" + "smin v24.4s, v24.4s, v31.4s\n" + "sqxtn v21.4h, v21.4s\n" + "sqxtn v23.4h, v23.4s\n" + "sqxtn2 v21.8h, v22.4s\n" + "ld1 {v22.4s}, [x10]\n" + "sqxtn2 v23.8h, v24.4s\n" + "ld1 {v24.4s}, [x10]\n" + "sqxtun v21.8b, v21.8h\n" + "sqxtun v23.8b, v23.8h\n" + "uaddw v9.8h, v28.8h, v9.8b\n" + "st1 {v21.8b}, [x6], x5\n" + "uaddw v10.8h, v28.8h, v10.8b\n" + "st1 {v23.8b}, [x6]\n" + "uaddw v11.8h, v28.8h, v11.8b\n" + + "smlal v19.4s, v6.4h, v9.4h\n" + "smlal2 v20.4s, v6.8h, v9.8h\n" + "smlal v25.4s, v6.4h, v11.4h\n" + "smlal2 v26.4s, v6.8h, v11.8h\n" + "smlal v19.4s, v7.4h, v10.4h\n" + "uaddw v12.8h, v28.8h, v12.8b\n" + "smlal2 v20.4s, v7.8h, v10.8h\n" + "smlal v25.4s, v7.4h, v12.4h\n" + "smlal2 v26.4s, v7.8h, v12.8h\n" + "smlal v19.4s, v8.4h, v11.4h\n" + "uaddw v13.8h, v28.8h, v13.8b\n" + "smlal2 v20.4s, v8.8h, v11.8h\n" + "smlal v25.4s, v8.4h, v13.4h\n" + "uaddw v14.8h, v28.8h, v14.8b\n" + "smlal2 v26.4s, v8.8h, v13.8h\n" + "uaddw v16.8h, v28.8h, v16.8b\n" + "smlal v19.4s, v3.4h, v14.4h\n" + "uaddw v15.8h, v28.8h, v15.8b\n" + "smlal2 v20.4s, v3.8h, v14.8h\n" + "smlal v25.4s, v3.4h, v16.4h\n" + "smlal2 v26.4s, v3.8h, v16.8h\n" + "smlal v19.4s, v4.4h, v15.4h\n" + "uaddw v17.8h, v28.8h, v17.8b\n" + "smlal2 v20.4s, v4.8h, v15.8h\n" + "smlal v25.4s, v4.4h, v17.4h\n" + "smlal2 v26.4s, v4.8h, v17.8h\n" + "smlal v19.4s, v5.4h, v16.4h\n" + "uaddw v18.8h, v28.8h, v18.8b\n" + "smlal2 v20.4s, v5.8h, v16.8h\n" + "smlal v25.4s, v5.4h, v18.4h\n" + "smlal2 v26.4s, v5.8h, v18.8h\n" + + "dup v28.4s, w9\n" + "sqrdmulh v19.4s, v19.4s, v27.4s\n" + "sqrdmulh v20.4s, v20.4s, v27.4s\n" + "sqrdmulh v25.4s, v25.4s, v27.4s\n" + "sqrdmulh v26.4s, v26.4s, v27.4s\n" + "and v27.16b, v19.16b, v28.16b\n" + "and v29.16b, v20.16b, v28.16b\n" + "and v30.16b, v25.16b, v28.16b\n" + "and v31.16b, v26.16b, v28.16b\n" + "sshr v27.4s, v27.4s, #31\n" + "sshr v29.4s, v29.4s, #31\n" + "sshr v30.4s, v30.4s, #31\n" + "sshr v31.4s, v31.4s, #31\n" + "sqadd v19.4s, v19.4s, v27.4s\n" + "dup v27.4s, w1\n" + "sqadd v20.4s, v20.4s, v29.4s\n" + "dup v29.4s, w2\n" + "sqadd v25.4s, v25.4s, v30.4s\n" + "dup v30.4s, w3\n" + "sqadd v26.4s, v26.4s, v31.4s\n" + "dup v31.4s, w4\n" + "srshl v19.4s, v19.4s, v28.4s\n" + "srshl v20.4s, v20.4s, v28.4s\n" + "srshl v25.4s, v25.4s, v28.4s\n" + "srshl v26.4s, v26.4s, v28.4s\n" + "dup v28.8h, w0\n" + "add v19.4s, v19.4s, v29.4s\n" + "add v20.4s, v20.4s, v29.4s\n" + "add v25.4s, v25.4s, v29.4s\n" + "add v26.4s, v26.4s, v29.4s\n" + "smax v19.4s, v19.4s, v30.4s\n" + "smax v20.4s, v20.4s, v30.4s\n" + "smax v25.4s, v25.4s, v30.4s\n" + "smax v26.4s, v26.4s, v30.4s\n" + "smin v19.4s, v19.4s, v31.4s\n" + "smin v20.4s, v20.4s, v31.4s\n" + "smin v25.4s, v25.4s, v31.4s\n" + "smin v26.4s, v26.4s, v31.4s\n" + "sqxtn v19.4h, v19.4s\n" + "sqxtn v25.4h, v25.4s\n" + "sqxtn2 v19.8h, v20.4s\n" + "sqxtn2 v25.8h, v26.4s\n" + "sqxtun v19.8b, v19.8h\n" + "sqxtun v25.8b, v25.8h\n" + "st1 {v19.8b}, [x7], x5\n" + "st1 {v25.8b}, [x7]\n" + "b " DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_AFTER_LOOP "f\n" + + // Handle last column if exists. 
+ DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_1_LEFTOVER ":\n" + // Registers v9, v10, v11, v14, v15, and v16 have already been loaded + // with the correct values at this point. This corresponds to the + // first two input rows of the top left output. Now load the last + // input row for this output. Once these inputs are no longer needed, + // load the input rows for the bottom left output. + "add x12, x15, %[input_row_size]\n" + "add x13, x12, %[input_row_size]\n" + + "ld1 {v12.8b}, [x15], %[input_depth]\n" + "smlal v21.4s, v0.4h, v9.4h\n" + "ld1 {v13.8b}, [x15], %[input_depth]\n" + "smlal2 v22.4s, v0.8h, v9.8h\n" + "ld1 {v17.8b}, [x15]\n" + "smlal v21.4s, v1.4h, v10.4h\n" + "ld1 {v9.8b}, [x12], %[input_depth]\n" + "smlal2 v22.4s, v1.8h, v10.8h\n" + "ld1 {v10.8b}, [x12], %[input_depth]\n" + "smlal v21.4s, v2.4h, v11.4h\n" + "smlal2 v22.4s, v2.8h, v11.8h\n" + "ld1 {v11.8b}, [x12]\n" + "smlal v21.4s, v3.4h, v14.4h\n" + "smlal2 v22.4s, v3.8h, v14.8h\n" + "ld1 {v14.8b}, [x13], %[input_depth]\n" + "smlal v21.4s, v4.4h, v15.4h\n" + "smlal2 v22.4s, v4.8h, v15.8h\n" + "ld1 {v15.8b}, [x13], %[input_depth]\n" + "smlal v21.4s, v5.4h, v16.4h\n" + "uaddw v12.8h, v28.8h, v12.8b\n" + "smlal2 v22.4s, v5.8h, v16.8h\n" + "uaddw v13.8h, v28.8h, v13.8b\n" + "ld1 {v16.8b}, [x13]\n" + + "smlal v21.4s, v6.4h, v12.4h\n" + "smlal2 v22.4s, v6.8h, v12.8h\n" + "smlal v23.4s, v0.4h, v12.4h\n" + "uaddw v17.8h, v28.8h, v17.8b\n" + "smlal2 v24.4s, v0.8h, v12.8h\n" + "smlal v21.4s, v7.4h, v13.4h\n" + "smlal2 v22.4s, v7.8h, v13.8h\n" + "smlal v23.4s, v1.4h, v13.4h\n" + "smlal2 v24.4s, v1.8h, v13.8h\n" + "smlal v21.4s, v8.4h, v17.4h\n" + "smlal2 v22.4s, v8.8h, v17.8h\n" + "smlal v23.4s, v2.4h, v17.4h\n" + "smlal2 v24.4s, v2.8h, v17.8h\n" + + "dup v26.4s, w9\n" + "sqrdmulh v21.4s, v21.4s, v27.4s\n" + "sqrdmulh v22.4s, v22.4s, v27.4s\n" + "and v18.16b, v21.16b, v26.16b\n" + "and v19.16b, v22.16b, v26.16b\n" + "sshr v18.4s, v18.4s, #31\n" + "sshr v19.4s, v19.4s, #31\n" + "sqadd v21.4s, v21.4s, v18.4s\n" + "sqadd v22.4s, v22.4s, v19.4s\n" + "srshl v21.4s, v21.4s, v26.4s\n" + "srshl v22.4s, v22.4s, v26.4s\n" + "add v21.4s, v21.4s, v29.4s\n" + "add v22.4s, v22.4s, v29.4s\n" + "smax v21.4s, v21.4s, v30.4s\n" + "smax v22.4s, v22.4s, v30.4s\n" + "smin v21.4s, v21.4s, v31.4s\n" + "smin v22.4s, v22.4s, v31.4s\n" + "sqxtn v21.4h, v21.4s\n" + "sqxtn2 v21.8h, v22.4s\n" + "sqxtun v21.8b, v21.8h\n" + "uaddw v9.8h, v28.8h, v9.8b\n" + "st1 {v21.8b}, [x6]\n" + "uaddw v10.8h, v28.8h, v10.8b\n" + + "smlal v23.4s, v3.4h, v9.4h\n" + "uaddw v11.8h, v28.8h, v11.8b\n" + "smlal2 v24.4s, v3.8h, v9.8h\n" + "uaddw v14.8h, v28.8h, v14.8b\n" + "smlal v23.4s, v4.4h, v10.4h\n" + "uaddw v15.8h, v28.8h, v15.8b\n" + "smlal2 v24.4s, v4.8h, v10.8h\n" + "uaddw v16.8h, v28.8h, v16.8b\n" + "smlal v23.4s, v5.4h, v11.4h\n" + "smlal2 v24.4s, v5.8h, v11.8h\n" + + "smlal v23.4s, v6.4h, v14.4h\n" + "smlal2 v24.4s, v6.8h, v14.8h\n" + "smlal v23.4s, v7.4h, v15.4h\n" + "smlal2 v24.4s, v7.8h, v15.8h\n" + "smlal v23.4s, v8.4h, v16.4h\n" + "smlal2 v24.4s, v8.8h, v16.8h\n" + + "sqrdmulh v23.4s, v23.4s, v27.4s\n" + "sqrdmulh v24.4s, v24.4s, v27.4s\n" + "and v18.16b, v23.16b, v26.16b\n" + "and v19.16b, v24.16b, v26.16b\n" + "sshr v18.4s, v18.4s, #31\n" + "sshr v19.4s, v19.4s, #31\n" + "sqadd v23.4s, v23.4s, v18.4s\n" + "sqadd v24.4s, v24.4s, v19.4s\n" + "srshl v23.4s, v23.4s, v26.4s\n" + "srshl v24.4s, v24.4s, v26.4s\n" + "add v23.4s, v23.4s, v29.4s\n" + "add v24.4s, v24.4s, v29.4s\n" + "smax v23.4s, v23.4s, v30.4s\n" + "smax v24.4s, v24.4s, v30.4s\n" + "smin v23.4s, v23.4s, v31.4s\n" 
+ "smin v24.4s, v24.4s, v31.4s\n" + "sqxtn v23.4h, v23.4s\n" + "sqxtn2 v23.8h, v24.4s\n" + "sqxtun v23.8b, v23.8h\n" + "st1 {v23.8b}, [x7]\n" + + DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_AFTER_LOOP ":\n" + "subs %w[output_window_height], %w[output_window_height], #2\n" + "add %[input_ptr], %[input_ptr], %[input_height_increment]\n" + "cmp %w[output_window_height], #2\n" + "add %[output_ptr], %[output_ptr], %[output_height_increment]\n" + "bge " DEPTHWISECONV_LABEL_HEIGHT_2_LOOP "b\n" + + DEPTHWISECONV_LABEL_HEIGHT_2_AFTER_LOOP ":\n" + "cmp %w[output_window_height], #1\n" + "blt " DEPTHWISECONV_LABEL_HEIGHT_1_END "f\n" + + DEPTHWISECONV_LABEL_HEIGHT_1 ":\n" + "mov x11, %[input_ptr]\n" + "mov x12, x11\n" + "add x13, x12, %[input_row_size]\n" + "ld1 {v9.8b}, [x12], %[input_depth]\n" + "add x15, x13, %[input_row_size]\n" + "ld1 {v10.8b}, [x12], %[input_depth]\n" + "mov x6, %[output_ptr]\n" + "ld1 {v11.8b}, [x12], %[input_depth]\n" + "mov w14, %w[output_window_width]\n" + // The height 1 / width 2 loop loads an extra 1x1 output in anticipation + // for the next iteration. Make sure |output_window_width| is large + // enough to handle the additional load, otherwise jump to the + // appropriate label to handle smaller widths. + "cmp w14, #2\n" + "ld1 {v12.8b}, [x13], %[input_depth]\n" + "ld1 {v13.8b}, [x13], %[input_depth]\n" + "ld1 {v14.8b}, [x13], %[input_depth]\n" + "ld1 {v15.8b}, [x15], %[input_depth]\n" + "ld1 {v16.8b}, [x15], %[input_depth]\n" + "ld1 {v17.8b}, [x15], %[input_depth]\n" + + "uaddw v9.8h, v28.8h, v9.8b\n" + "ld1 {v24.4s}, [%[bias_ptr]]\n" + "uaddw v10.8h, v28.8h, v10.8b\n" + "ld1 {v25.4s}, [x10]\n" + "uaddw v11.8h, v28.8h, v11.8b\n" + "ld1 {v26.4s}, [%[bias_ptr]]\n" + "ld1 {v27.4s}, [x10]\n" + "uaddw v12.8h, v28.8h, v12.8b\n" + "uaddw v13.8h, v28.8h, v13.8b\n" + "uaddw v14.8h, v28.8h, v14.8b\n" + "uaddw v15.8h, v28.8h, v15.8b\n" + "uaddw v16.8h, v28.8h, v16.8b\n" + "uaddw v17.8h, v28.8h, v17.8b\n" + + "beq " DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LEFTOVER "f\n" + "cmp w14, #1\n" + "beq " DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_1_LEFTOVER "f\n" + + //"loop_%=:\n" + DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LOOP ":\n" + "smlal v24.4s, v0.4h, v9.4h\n" + "ld1 {v18.8b}, [x12], %[input_depth]\n" + "smlal2 v25.4s, v0.8h, v9.8h\n" + "ld1 {v19.8b}, [x12]\n" + "smlal v26.4s, v0.4h, v11.4h\n" + "ld1 {v20.8b}, [x13], %[input_depth]\n" + "smlal2 v27.4s, v0.8h, v11.8h\n" + "ld1 {v21.8b}, [x13]\n" + "smlal v24.4s, v1.4h, v10.4h\n" + "ld1 {v22.8b}, [x15], %[input_depth]\n" + "smlal2 v25.4s, v1.8h, v10.8h\n" + "ld1 {v23.8b}, [x15]\n" + "smlal v24.4s, v2.4h, v11.4h\n" + "subs w14, w14, #2\n" + "smlal2 v25.4s, v2.8h, v11.8h\n" + "cmp w14, #3\n" + "smlal v24.4s, v3.4h, v12.4h\n" + "add x11, x11, %[input_width_increment]\n" + "smlal2 v25.4s, v3.8h, v12.8h\n" + "mov x12, x11\n" + "smlal v26.4s, v3.4h, v14.4h\n" + "add x13, x12, %[input_row_size]\n" + "smlal2 v27.4s, v3.8h, v14.8h\n" + "add x15, x13, %[input_row_size]\n" + "smlal v24.4s, v4.4h, v13.4h\n" + "ld1 {v9.8b}, [x12], %[input_depth]\n" + "smlal2 v25.4s, v4.8h, v13.8h\n" + "ld1 {v10.8b}, [x12], %[input_depth]\n" + "smlal v24.4s, v5.4h, v14.4h\n" + "ld1 {v11.8b}, [x12], %[input_depth]\n" + "smlal2 v25.4s, v5.8h, v14.8h\n" + "ld1 {v12.8b}, [x13], %[input_depth]\n" + "smlal v24.4s, v6.4h, v15.4h\n" + "ld1 {v13.8b}, [x13], %[input_depth]\n" + "smlal2 v25.4s, v6.8h, v15.8h\n" + "ld1 {v14.8b}, [x13], %[input_depth]\n" + "smlal v26.4s, v6.4h, v17.4h\n" + "ld1 {v15.8b}, [x15], %[input_depth]\n" + "smlal2 v27.4s, v6.8h, v17.8h\n" + "smlal v24.4s, v7.4h, v16.4h\n" + 
"smlal2 v25.4s, v7.8h, v16.8h\n" + "ld1 {v16.8b}, [x15], %[input_depth]\n" + "smlal v24.4s, v8.4h, v17.4h\n" + "uaddw v18.8h, v28.8h, v18.8b\n" + "smlal2 v25.4s, v8.8h, v17.8h\n" + "ld1 {v17.8b}, [x15], %[input_depth]\n" + "uaddw v19.8h, v28.8h, v19.8b\n" + + "smlal v26.4s, v1.4h, v18.4h\n" + "uaddw v20.8h, v28.8h, v20.8b\n" + "smlal2 v27.4s, v1.8h, v18.8h\n" + "smlal v26.4s, v2.4h, v19.4h\n" + "uaddw v21.8h, v28.8h, v21.8b\n" + "smlal2 v27.4s, v2.8h, v19.8h\n" + "smlal v26.4s, v4.4h, v20.4h\n" + "smlal v26.4s, v5.4h, v21.4h\n" + "smlal2 v27.4s, v4.8h, v20.8h\n" + "uaddw v22.8h, v28.8h, v22.8b\n" + "smlal2 v27.4s, v5.8h, v21.8h\n" + "uaddw v23.8h, v28.8h, v23.8b\n" + "smlal v26.4s, v7.4h, v22.4h\n" + "smlal2 v27.4s, v7.8h, v22.8h\n" + "smlal v26.4s, v8.4h, v23.4h\n" + "smlal2 v27.4s, v8.8h, v23.8h\n" + + "dup v28.4s, w1\n" + "dup v29.4s, w9\n" + "sqrdmulh v24.4s, v24.4s, v28.4s\n" + "sqrdmulh v25.4s, v25.4s, v28.4s\n" + "sqrdmulh v26.4s, v26.4s, v28.4s\n" + "sqrdmulh v27.4s, v27.4s, v28.4s\n" + "dup v28.4s, w2\n" + "and v30.16b, v24.16b, v29.16b\n" + "and v31.16b, v25.16b, v29.16b\n" + "sshr v30.4s, v30.4s, #31\n" + "sshr v31.4s, v31.4s, #31\n" + "sqadd v24.4s, v24.4s, v30.4s\n" + "sqadd v25.4s, v25.4s, v31.4s\n" + "and v30.16b, v26.16b, v29.16b\n" + "and v31.16b, v27.16b, v29.16b\n" + "sshr v30.4s, v30.4s, #31\n" + "sshr v31.4s, v31.4s, #31\n" + "sqadd v26.4s, v26.4s, v30.4s\n" + "dup v30.4s, w3\n" + "sqadd v27.4s, v27.4s, v31.4s\n" + "dup v31.4s, w4\n" + "srshl v24.4s, v24.4s, v29.4s\n" + "srshl v25.4s, v25.4s, v29.4s\n" + "srshl v26.4s, v26.4s, v29.4s\n" + "srshl v27.4s, v27.4s, v29.4s\n" + "add v24.4s, v24.4s, v28.4s\n" + "add v25.4s, v25.4s, v28.4s\n" + "add v26.4s, v26.4s, v28.4s\n" + "add v27.4s, v27.4s, v28.4s\n" + "dup v28.8h, w0\n" + "smax v24.4s, v24.4s, v30.4s\n" + "smax v25.4s, v25.4s, v30.4s\n" + "smax v26.4s, v26.4s, v30.4s\n" + "smax v27.4s, v27.4s, v30.4s\n" + "smin v24.4s, v24.4s, v31.4s\n" + "smin v25.4s, v25.4s, v31.4s\n" + "smin v26.4s, v26.4s, v31.4s\n" + "smin v27.4s, v27.4s, v31.4s\n" + "sqxtn v24.4h, v24.4s\n" + "sqxtn v26.4h, v26.4s\n" + "sqxtn2 v24.8h, v25.4s\n" + "ld1 {v25.4s}, [x10]\n" + "sqxtn2 v26.8h, v27.4s\n" + "ld1 {v27.4s}, [x10]\n" + "sqxtun v24.8b, v24.8h\n" + "sqxtun v26.8b, v26.8h\n" + "uaddw v9.8h, v28.8h, v9.8b\n" + "st1 {v24.8b}, [x6], x5\n" + "uaddw v10.8h, v28.8h, v10.8b\n" + "st1 {v26.8b}, [x6], x5\n" + "uaddw v11.8h, v28.8h, v11.8b\n" + "uaddw v12.8h, v28.8h, v12.8b\n" + "uaddw v13.8h, v28.8h, v13.8b\n" + "uaddw v14.8h, v28.8h, v14.8b\n" + "ld1 {v24.4s}, [%[bias_ptr]]\n" + "uaddw v15.8h, v28.8h, v15.8b\n" + "ld1 {v26.4s}, [%[bias_ptr]]\n" + "uaddw v16.8h, v28.8h, v16.8b\n" + "uaddw v17.8h, v28.8h, v17.8b\n" + + "bge " DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LOOP "b\n" + + // At this point, there will be one of 2 width or 1 width leftover, + // not both. + "cmp w14, #2\n" + "blt " DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_1_LEFTOVER "f\n" + + // Handle last two horizontal outputs if exists. 
+ DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LEFTOVER ":\n" + "smlal v24.4s, v0.4h, v9.4h\n" + "ld1 {v18.8b}, [x12], %[input_depth]\n" + "smlal2 v25.4s, v0.8h, v9.8h\n" + "ld1 {v19.8b}, [x12]\n" + "smlal v26.4s, v0.4h, v11.4h\n" + "ld1 {v20.8b}, [x13], %[input_depth]\n" + "smlal2 v27.4s, v0.8h, v11.8h\n" + "ld1 {v21.8b}, [x13]\n" + "smlal v24.4s, v1.4h, v10.4h\n" + "ld1 {v22.8b}, [x15], %[input_depth]\n" + "smlal2 v25.4s, v1.8h, v10.8h\n" + "ld1 {v23.8b}, [x15]\n" + "smlal v24.4s, v2.4h, v11.4h\n" + "smlal2 v25.4s, v2.8h, v11.8h\n" + "smlal v24.4s, v3.4h, v12.4h\n" + "smlal2 v25.4s, v3.8h, v12.8h\n" + "smlal v26.4s, v3.4h, v14.4h\n" + "smlal2 v27.4s, v3.8h, v14.8h\n" + "smlal v24.4s, v4.4h, v13.4h\n" + "smlal2 v25.4s, v4.8h, v13.8h\n" + "smlal v24.4s, v5.4h, v14.4h\n" + "smlal2 v25.4s, v5.8h, v14.8h\n" + "smlal v24.4s, v6.4h, v15.4h\n" + "smlal2 v25.4s, v6.8h, v15.8h\n" + "smlal v26.4s, v6.4h, v17.4h\n" + "smlal2 v27.4s, v6.8h, v17.8h\n" + "smlal v24.4s, v7.4h, v16.4h\n" + "smlal2 v25.4s, v7.8h, v16.8h\n" + "smlal v24.4s, v8.4h, v17.4h\n" + "uaddw v18.8h, v28.8h, v18.8b\n" + "smlal2 v25.4s, v8.8h, v17.8h\n" + "uaddw v19.8h, v28.8h, v19.8b\n" + + "smlal v26.4s, v1.4h, v18.4h\n" + "uaddw v20.8h, v28.8h, v20.8b\n" + "smlal2 v27.4s, v1.8h, v18.8h\n" + "smlal v26.4s, v2.4h, v19.4h\n" + "uaddw v21.8h, v28.8h, v21.8b\n" + "smlal2 v27.4s, v2.8h, v19.8h\n" + "smlal v26.4s, v4.4h, v20.4h\n" + "smlal v26.4s, v5.4h, v21.4h\n" + "smlal2 v27.4s, v4.8h, v20.8h\n" + "uaddw v22.8h, v28.8h, v22.8b\n" + "smlal2 v27.4s, v5.8h, v21.8h\n" + "uaddw v23.8h, v28.8h, v23.8b\n" + "smlal v26.4s, v7.4h, v22.4h\n" + "smlal2 v27.4s, v7.8h, v22.8h\n" + "smlal v26.4s, v8.4h, v23.4h\n" + "smlal2 v27.4s, v8.8h, v23.8h\n" + + "dup v28.4s, w1\n" + "dup v29.4s, w9\n" + "sqrdmulh v24.4s, v24.4s, v28.4s\n" + "sqrdmulh v25.4s, v25.4s, v28.4s\n" + "sqrdmulh v26.4s, v26.4s, v28.4s\n" + "sqrdmulh v27.4s, v27.4s, v28.4s\n" + "dup v28.4s, w2\n" + "and v30.16b, v24.16b, v29.16b\n" + "and v31.16b, v25.16b, v29.16b\n" + "sshr v30.4s, v30.4s, #31\n" + "sshr v31.4s, v31.4s, #31\n" + "sqadd v24.4s, v24.4s, v30.4s\n" + "sqadd v25.4s, v25.4s, v31.4s\n" + "and v30.16b, v26.16b, v29.16b\n" + "and v31.16b, v27.16b, v29.16b\n" + "sshr v30.4s, v30.4s, #31\n" + "sshr v31.4s, v31.4s, #31\n" + "sqadd v26.4s, v26.4s, v30.4s\n" + "dup v30.4s, w3\n" + "sqadd v27.4s, v27.4s, v31.4s\n" + "dup v31.4s, w4\n" + "srshl v24.4s, v24.4s, v29.4s\n" + "srshl v25.4s, v25.4s, v29.4s\n" + "srshl v26.4s, v26.4s, v29.4s\n" + "srshl v27.4s, v27.4s, v29.4s\n" + "add v24.4s, v24.4s, v28.4s\n" + "add v25.4s, v25.4s, v28.4s\n" + "add v26.4s, v26.4s, v28.4s\n" + "add v27.4s, v27.4s, v28.4s\n" + "dup v28.8h, w0\n" + "smax v24.4s, v24.4s, v30.4s\n" + "smax v25.4s, v25.4s, v30.4s\n" + "smax v26.4s, v26.4s, v30.4s\n" + "smax v27.4s, v27.4s, v30.4s\n" + "smin v24.4s, v24.4s, v31.4s\n" + "smin v25.4s, v25.4s, v31.4s\n" + "smin v26.4s, v26.4s, v31.4s\n" + "smin v27.4s, v27.4s, v31.4s\n" + "sqxtn v24.4h, v24.4s\n" + "sqxtn v26.4h, v26.4s\n" + "sqxtn2 v24.8h, v25.4s\n" + "sqxtn2 v26.8h, v27.4s\n" + "sqxtun v24.8b, v24.8h\n" + "sqxtun v26.8b, v26.8h\n" + "st1 {v24.8b}, [x6], x5\n" + "st1 {v26.8b}, [x6]\n" + "b " DEPTHWISECONV_LABEL_HEIGHT_1_END "f\n" + + // Handle bottom right output if exists. 
+ DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_1_LEFTOVER ":\n" + "dup v26.4s, w9\n" + "dup v27.4s, w1\n" + "dup v29.4s, w2\n" + + "smlal v24.4s, v0.4h, v9.4h\n" + "smlal2 v25.4s, v0.8h, v9.8h\n" + "smlal v24.4s, v1.4h, v10.4h\n" + "smlal2 v25.4s, v1.8h, v10.8h\n" + "smlal v24.4s, v2.4h, v11.4h\n" + "smlal2 v25.4s, v2.8h, v11.8h\n" + "smlal v24.4s, v3.4h, v12.4h\n" + "smlal2 v25.4s, v3.8h, v12.8h\n" + "smlal v24.4s, v4.4h, v13.4h\n" + "smlal2 v25.4s, v4.8h, v13.8h\n" + "smlal v24.4s, v5.4h, v14.4h\n" + "smlal2 v25.4s, v5.8h, v14.8h\n" + "smlal v24.4s, v6.4h, v15.4h\n" + "smlal2 v25.4s, v6.8h, v15.8h\n" + "smlal v24.4s, v7.4h, v16.4h\n" + "smlal2 v25.4s, v7.8h, v16.8h\n" + "smlal v24.4s, v8.4h, v17.4h\n" + "smlal2 v25.4s, v8.8h, v17.8h\n" + + "sqrdmulh v24.4s, v24.4s, v27.4s\n" + "sqrdmulh v25.4s, v25.4s, v27.4s\n" + "and v18.16b, v24.16b, v26.16b\n" + "and v19.16b, v25.16b, v26.16b\n" + "sshr v18.4s, v18.4s, #31\n" + "sshr v19.4s, v19.4s, #31\n" + "sqadd v24.4s, v24.4s, v18.4s\n" + "sqadd v25.4s, v25.4s, v19.4s\n" + "srshl v24.4s, v24.4s, v26.4s\n" + "srshl v25.4s, v25.4s, v26.4s\n" + "add v24.4s, v24.4s, v29.4s\n" + "add v25.4s, v25.4s, v29.4s\n" + "smax v24.4s, v24.4s, v30.4s\n" + "smax v25.4s, v25.4s, v30.4s\n" + "smin v24.4s, v24.4s, v31.4s\n" + "smin v25.4s, v25.4s, v31.4s\n" + "sqxtn v24.4h, v24.4s\n" + "sqxtn2 v24.8h, v25.4s\n" + "sqxtun v24.8b, v24.8h\n" + "st1 {v24.8b}, [x6]\n" + + DEPTHWISECONV_LABEL_HEIGHT_1_END ":\n" + : + // Outputs. + [filter_ptr] "+r"(filter_ptr), [input_ptr] "+r"(input_ptr), + [output_ptr] "+r"(output_ptr), + [output_window_height] "+r"(output_window_height) + : + // Inputs. + [bias_ptr] "r"(bias_ptr), [input_row_size] "r"(input_row_size), + [input_depth] "r"(input_depth), + [output_window_width] "r"(output_window_width), + [input_width_increment] "r"(input_width_increment), + [input_height_increment] "r"(input_height_increment), + [output_height_increment] "r"(output_height_increment), + [params_ptr] "r"(params_ptr) + : + // Clobbers. + "cc", "memory", + // We use these NEON registers. + "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", + "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", + "v30", "v31", + // We use these general-purpose registers. 
+ "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", + "x9", "x10", "x11", "x12", "x13", "x14", "x15", + "x19", "x20"); +#undef DEPTHWISECONV_LABEL_HEIGHT_2_LOOP +#undef DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LOOP +#undef DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_1_LEFTOVER +#undef DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LEFTOVER +#undef DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_AFTER_LOOP +#undef DEPTHWISECONV_LABEL_HEIGHT_2_AFTER_LOOP +#undef DEPTHWISECONV_LABEL_HEIGHT_1 +#undef DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LOOP +#undef DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_1_LEFTOVER +#undef DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LEFTOVER +#undef DEPTHWISECONV_LABEL_HEIGHT_1_END } }; -template <> -struct ConvKernel3x3FilterDepth8<4, 1, 2, 2> { - static inline void Run(const uint8* input_ptr, int input_depth, - int32 input_offset, int input_row_size, - const uint8* filter_ptr, int32 filter_offset, - const int32* bias_ptr, int32 output_offset, - int32 output_multiplier, int output_shift, - int32 output_activation_min, - int32 output_activation_max, uint8* output_ptr, - int output_depth, int output_width) { - const int output_row_size = output_depth * output_width; - - Filter3x3x8 filter = Load3x3Filter(filter_ptr, filter_offset, output_depth); - - const int16x8_t input_offset_vec = vdupq_n_s16(input_offset); - int16x8_t input_0, input_1, input_2, input_3, input_4, input_5, input_6, - input_7, input_8; - uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5, temp_6, temp_7, - temp_8; - - const uint8* ptr = input_ptr; - - // Load all inputs for top output. - temp_0 = vld1_u8(ptr); - temp_1 = vld1_u8(ptr + input_depth); - temp_2 = vld1_u8(ptr + 2 * input_depth); - ptr += input_row_size; - temp_3 = vld1_u8(ptr); - temp_4 = vld1_u8(ptr + input_depth); - temp_5 = vld1_u8(ptr + 2 * input_depth); - ptr += input_row_size; - temp_6 = vld1_u8(ptr); - temp_7 = vld1_u8(ptr + input_depth); - temp_8 = vld1_u8(ptr + 2 * input_depth); - - input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0)); - input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1)); - input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2)); - input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3)); - input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4)); - input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5)); - input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_6)); - input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_7)); - input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_8)); - - input_0 = vaddq_s16(input_0, input_offset_vec); - input_1 = vaddq_s16(input_1, input_offset_vec); - input_2 = vaddq_s16(input_2, input_offset_vec); - input_3 = vaddq_s16(input_3, input_offset_vec); - input_4 = vaddq_s16(input_4, input_offset_vec); - input_5 = vaddq_s16(input_5, input_offset_vec); - input_6 = vaddq_s16(input_6, input_offset_vec); - input_7 = vaddq_s16(input_7, input_offset_vec); - input_8 = vaddq_s16(input_8, input_offset_vec); - - DotProductAndStore( - filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6, - input_7, input_8, bias_ptr, output_offset, output_multiplier, - output_shift, output_activation_min, output_activation_max, output_ptr); - - // Second output. 
- output_ptr += output_row_size; - - ptr += input_row_size; - temp_0 = vld1_u8(ptr); - temp_1 = vld1_u8(ptr + input_depth); - temp_2 = vld1_u8(ptr + 2 * input_depth); - ptr += input_row_size; - temp_3 = vld1_u8(ptr); - temp_4 = vld1_u8(ptr + input_depth); - temp_5 = vld1_u8(ptr + 2 * input_depth); - - input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0)); - input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1)); - input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2)); - input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3)); - input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4)); - input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5)); - - input_0 = vaddq_s16(input_0, input_offset_vec); - input_1 = vaddq_s16(input_1, input_offset_vec); - input_2 = vaddq_s16(input_2, input_offset_vec); - input_3 = vaddq_s16(input_3, input_offset_vec); - input_4 = vaddq_s16(input_4, input_offset_vec); - input_5 = vaddq_s16(input_5, input_offset_vec); - - DotProductAndStore( - filter, input_6, input_7, input_8, input_0, input_1, input_2, input_3, - input_4, input_5, bias_ptr, output_offset, output_multiplier, - output_shift, output_activation_min, output_activation_max, output_ptr); - - // Third output. - output_ptr += output_row_size; - - ptr += input_row_size; - temp_6 = vld1_u8(ptr); - temp_7 = vld1_u8(ptr + input_depth); - temp_8 = vld1_u8(ptr + 2 * input_depth); - ptr += input_row_size; - temp_0 = vld1_u8(ptr); - temp_1 = vld1_u8(ptr + input_depth); - temp_2 = vld1_u8(ptr + 2 * input_depth); - - input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_6)); - input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_7)); - input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_8)); - input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0)); - input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1)); - input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2)); - - input_6 = vaddq_s16(input_6, input_offset_vec); - input_7 = vaddq_s16(input_7, input_offset_vec); - input_8 = vaddq_s16(input_8, input_offset_vec); - input_0 = vaddq_s16(input_0, input_offset_vec); - input_1 = vaddq_s16(input_1, input_offset_vec); - input_2 = vaddq_s16(input_2, input_offset_vec); - - DotProductAndStore( - filter, input_3, input_4, input_5, input_6, input_7, input_8, input_0, - input_1, input_2, bias_ptr, output_offset, output_multiplier, - output_shift, output_activation_min, output_activation_max, output_ptr); - - // Fourth output. 
- output_ptr += output_row_size; - - ptr += input_row_size; - temp_3 = vld1_u8(ptr); - temp_4 = vld1_u8(ptr + input_depth); - temp_5 = vld1_u8(ptr + 2 * input_depth); - ptr += input_row_size; - temp_6 = vld1_u8(ptr); - temp_7 = vld1_u8(ptr + input_depth); - temp_8 = vld1_u8(ptr + 2 * input_depth); - - input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3)); - input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4)); - input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5)); - input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_6)); - input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_7)); - input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_8)); - - input_3 = vaddq_s16(input_3, input_offset_vec); - input_4 = vaddq_s16(input_4, input_offset_vec); - input_5 = vaddq_s16(input_5, input_offset_vec); - input_6 = vaddq_s16(input_6, input_offset_vec); - input_7 = vaddq_s16(input_7, input_offset_vec); - input_8 = vaddq_s16(input_8, input_offset_vec); - - DotProductAndStore( - filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6, - input_7, input_8, bias_ptr, output_offset, output_multiplier, - output_shift, output_activation_min, output_activation_max, output_ptr); - } -}; - -template <> -struct ConvKernel3x3FilterDepth8<2, 2, 2, 2> { - static inline void Run(const uint8* input_ptr, int input_depth, - int32 input_offset, int input_row_size, - const uint8* filter_ptr, int32 filter_offset, - const int32* bias_ptr, int32 output_offset, - int32 output_multiplier, int output_shift, - int32 output_activation_min, - int32 output_activation_max, uint8* output_ptr, - int output_depth, int output_width) { - Filter3x3x8 filter = Load3x3Filter(filter_ptr, filter_offset, output_depth); - - Int32x8 acc_0, acc_1, acc_2, acc_3; - acc_0.low = vld1q_s32(bias_ptr); - acc_1.low = vld1q_s32(bias_ptr); - acc_2.low = vld1q_s32(bias_ptr); - acc_3.low = vld1q_s32(bias_ptr); - - bias_ptr += 4; - acc_0.high = vld1q_s32(bias_ptr); - acc_1.high = vld1q_s32(bias_ptr); - acc_2.high = vld1q_s32(bias_ptr); - acc_3.high = vld1q_s32(bias_ptr); - - const int16x8_t input_offset_vec = vdupq_n_s16(input_offset); - - // Add scope for input registers to help the compiler know that it is - // not needed. - { - // To process 2x2 outputs using a 3x3 filter at stride 2, we require - // 5x5 inputs. We load the first 5x2 inputs at a time. - int16x8_t input_0, input_1, input_2, input_3, input_4, input_5, input_6, - input_7, input_8, input_9; - - const uint8* ptr = input_ptr; - - // Load inputs. 
- { - uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4; - - temp_0 = vld1_u8(ptr); - temp_1 = vld1_u8(ptr + input_depth); - temp_2 = vld1_u8(ptr + 2 * input_depth); - temp_3 = vld1_u8(ptr + 3 * input_depth); - temp_4 = vld1_u8(ptr + 4 * input_depth); - - input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0)); - input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1)); - input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2)); - input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3)); - input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4)); - - input_0 = vaddq_s16(input_0, input_offset_vec); - input_1 = vaddq_s16(input_1, input_offset_vec); - input_2 = vaddq_s16(input_2, input_offset_vec); - input_3 = vaddq_s16(input_3, input_offset_vec); - input_4 = vaddq_s16(input_4, input_offset_vec); - - ptr += input_row_size; - temp_0 = vld1_u8(ptr); - temp_1 = vld1_u8(ptr + input_depth); - temp_2 = vld1_u8(ptr + 2 * input_depth); - temp_3 = vld1_u8(ptr + 3 * input_depth); - temp_4 = vld1_u8(ptr + 4 * input_depth); - - input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_0)); - input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_1)); - input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_2)); - input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_3)); - input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_4)); - - input_5 = vaddq_s16(input_5, input_offset_vec); - input_6 = vaddq_s16(input_6, input_offset_vec); - input_7 = vaddq_s16(input_7, input_offset_vec); - input_8 = vaddq_s16(input_8, input_offset_vec); - input_9 = vaddq_s16(input_9, input_offset_vec); - } - - acc_0 = MultiplyAccumulateRow(acc_0, filter.f0, filter.f1, filter.f2, - input_0, input_1, input_2); - - acc_1 = MultiplyAccumulateRow(acc_1, filter.f0, filter.f1, filter.f2, - input_2, input_3, input_4); - - acc_0 = MultiplyAccumulateRow(acc_0, filter.f3, filter.f4, filter.f5, - input_5, input_6, input_7); - - acc_1 = MultiplyAccumulateRow(acc_1, filter.f3, filter.f4, filter.f5, - input_7, input_8, input_9); - - // Load next inputs. 
- { - uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4; - - ptr += input_row_size; - temp_0 = vld1_u8(ptr); - temp_1 = vld1_u8(ptr + input_depth); - temp_2 = vld1_u8(ptr + 2 * input_depth); - temp_3 = vld1_u8(ptr + 3 * input_depth); - temp_4 = vld1_u8(ptr + 4 * input_depth); - - input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0)); - input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1)); - input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2)); - input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3)); - input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4)); - - input_0 = vaddq_s16(input_0, input_offset_vec); - input_1 = vaddq_s16(input_1, input_offset_vec); - input_2 = vaddq_s16(input_2, input_offset_vec); - input_3 = vaddq_s16(input_3, input_offset_vec); - input_4 = vaddq_s16(input_4, input_offset_vec); - - ptr += input_row_size; - temp_0 = vld1_u8(ptr); - temp_1 = vld1_u8(ptr + input_depth); - temp_2 = vld1_u8(ptr + 2 * input_depth); - temp_3 = vld1_u8(ptr + 3 * input_depth); - temp_4 = vld1_u8(ptr + 4 * input_depth); - - input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_0)); - input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_1)); - input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_2)); - input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_3)); - input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_4)); - - input_5 = vaddq_s16(input_5, input_offset_vec); - input_6 = vaddq_s16(input_6, input_offset_vec); - input_7 = vaddq_s16(input_7, input_offset_vec); - input_8 = vaddq_s16(input_8, input_offset_vec); - input_9 = vaddq_s16(input_9, input_offset_vec); - } - - acc_0 = MultiplyAccumulateRow(acc_0, filter.f6, filter.f7, filter.f8, - input_0, input_1, input_2); - - acc_1 = MultiplyAccumulateRow(acc_1, filter.f6, filter.f7, filter.f8, - input_2, input_3, input_4); - - // Moving onto the two bottom outputs. - acc_2 = MultiplyAccumulateRow(acc_2, filter.f0, filter.f1, filter.f2, - input_0, input_1, input_2); - - acc_3 = MultiplyAccumulateRow(acc_3, filter.f0, filter.f1, filter.f2, - input_2, input_3, input_4); - - acc_2 = MultiplyAccumulateRow(acc_2, filter.f3, filter.f4, filter.f5, - input_5, input_6, input_7); - - acc_3 = MultiplyAccumulateRow(acc_3, filter.f3, filter.f4, filter.f5, - input_7, input_8, input_9); - - // Load last input row. 
- { - uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4; +enum class EdgeType { kCorner, kHorizontal, kVertical, kCenter }; - ptr += input_row_size; - temp_0 = vld1_u8(ptr); - temp_1 = vld1_u8(ptr + input_depth); - temp_2 = vld1_u8(ptr + 2 * input_depth); - temp_3 = vld1_u8(ptr + 3 * input_depth); - temp_4 = vld1_u8(ptr + 4 * input_depth); - - input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0)); - input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1)); - input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2)); - input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3)); - input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4)); - - input_0 = vaddq_s16(input_0, input_offset_vec); - input_1 = vaddq_s16(input_1, input_offset_vec); - input_2 = vaddq_s16(input_2, input_offset_vec); - input_3 = vaddq_s16(input_3, input_offset_vec); - input_4 = vaddq_s16(input_4, input_offset_vec); - } - - acc_2 = MultiplyAccumulateRow(acc_2, filter.f6, filter.f7, filter.f8, - input_0, input_1, input_2); - - acc_3 = MultiplyAccumulateRow(acc_3, filter.f6, filter.f7, filter.f8, - input_2, input_3, input_4); - } - - DownquantizeAndStore2x2Output(acc_0, acc_1, acc_2, acc_3, output_offset, - output_multiplier, output_shift, - output_activation_min, output_activation_max, - output_ptr, output_depth, output_width); - } -}; +template <EdgeType kEdgeType, int kPadWidth, int kPadHeight> +struct DepthwiseConvPartial {}; template <> -struct ConvKernel3x3FilterDepth8<2, 4, 2, 2> { - static inline void Run(const uint8* input_ptr, int input_depth, - int32 input_offset, int input_row_size, - const uint8* filter_ptr, int32 filter_offset, - const int32* bias_ptr, int32 output_offset, - int32 output_multiplier, int output_shift, - int32 output_activation_min, - int32 output_activation_max, uint8* output_ptr, - int output_depth, int output_width) { - // Reuse 2x2 kernel twice. - ConvKernel3x3FilterDepth8<2, 2, 2, 2>::Run( - input_ptr, input_depth, input_offset, input_row_size, filter_ptr, - filter_offset, bias_ptr, output_offset, output_multiplier, output_shift, - output_activation_min, output_activation_max, output_ptr, output_depth, - output_width); - - ConvKernel3x3FilterDepth8<2, 2, 2, 2>::Run( - input_ptr + 4 * input_depth, input_depth, input_offset, input_row_size, - filter_ptr, filter_offset, bias_ptr, output_offset, output_multiplier, - output_shift, output_activation_min, output_activation_max, - output_ptr + 2 * output_depth, output_depth, output_width); +struct DepthwiseConvPartial<EdgeType::kCenter, 1, 1> { + static inline void Run(const uint8* input_ptr, const uint8* filter_ptr, + const int32* bias_ptr, uint8* output_ptr, + const DepthwiseConvParams* params_ptr) { +#define DEPTHWISECONV_LABEL_DEPTH_8_LOOP "1" +#define DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP "2" + asm volatile( + // Performs depthwise convolutions for an input window of size 1x1 and + // padding of 1 across the full depth. Expects |input_ptr| and + // |filter_ptr| to be pointing to the 1x1 input and filter values. 
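As a rough scalar equivalent of the kCenter partial kernel above: with a 1x1 input window and padding of 1, only the single loaded filter tap contributes, so the work per depth channel reduces to one offset-adjusted multiply-add plus the bias. A hedged sketch follows (the function name and explicit scalar parameters are illustrative; the assembly reads the same values through the OFFSET_* byte offsets into DepthwiseConvParams), with requantization treated as a separate step sketched at the end of this section.

// Hedged sketch, not from this diff: accumulate stage of the 1x1 (center)
// partial depthwise kernel over the full depth, 1 channel at a time instead
// of the 8-wide NEON lanes used above.
#include <cstdint>

inline void ReferencePartialCenterAccumulate(
    const uint8_t* input_ptr, const uint8_t* filter_ptr,
    const int32_t* bias_ptr, int32_t input_offset, int32_t filter_offset,
    int64_t depth, int32_t* acc) {
  for (int64_t c = 0; c < depth; ++c) {
    // Single filter tap per channel: bias + (input + in_off) * (filter + f_off).
    acc[c] = bias_ptr[c] +
             (static_cast<int32_t>(input_ptr[c]) + input_offset) *
                 (static_cast<int32_t>(filter_ptr[c]) + filter_offset);
  }
}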
+ "ld1 {v8.8b}, [%[input_ptr]], #8\n" + "ldr w9, [%[params_ptr], #" STR(OFFSET_INPUT_OFFSET) "]\n" + "ldr x11, [%[params_ptr], #" STR(OFFSET_OUTPUT_DEPTH) "]\n" + "ldr w10, [%[params_ptr], #" STR(OFFSET_OUTPUT_MULTIPLIER) "]\n" + "dup v26.8h, w9\n" + "ldr w9, [%[params_ptr], #" STR(OFFSET_OUTPUT_OFFSET) "]\n" + "dup v27.4s, w10\n" + "ld1 {v0.8b}, [%[filter_ptr]], #8\n" + "cmp x11, #16\n" + "ldr w10, [%[params_ptr], #" STR(OFFSET_OUTPUT_RIGHT_SHIFT) "]\n" + "dup v28.4s, w9\n" + "ldr w9, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MIN) "]\n" + "neg w10, w10\n" + "dup v29.4s, w10\n" + "ldr w10, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MAX) "]\n" + "dup v30.4s, w9\n" + "ldr w9, [%[params_ptr], #" STR(OFFSET_FILTER_OFFSET) "]\n" + "dup v31.4s, w10\n" + "dup v25.8h, w9\n" + + "ld1 {v16.4s}, [%[bias_ptr]], #16\n" + "uaddw v8.8h, v26.8h, v8.8b\n" + "ld1 {v17.4s}, [%[bias_ptr]], #16\n" + "uaddw v0.8h, v25.8h, v0.8b\n" + + "blt " DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP "f\n" + + //"loop_%=:\n" + DEPTHWISECONV_LABEL_DEPTH_8_LOOP ":\n" + "smlal v16.4s, v0.4h, v8.4h\n" + "subs x11, x11, #8\n" + "smlal2 v17.4s, v0.8h, v8.8h\n" + "ld1 {v8.8b}, [%[input_ptr]], #8\n" + "cmp x11, #16\n" + "ld1 {v0.8b}, [%[filter_ptr]], #8\n" + + "sqrdmulh v16.4s, v16.4s, v27.4s\n" + "sqrdmulh v17.4s, v17.4s, v27.4s\n" + "and v18.16b, v16.16b, v29.16b\n" + "and v19.16b, v17.16b, v29.16b\n" + "sshr v18.4s, v18.4s, #31\n" + "sshr v19.4s, v19.4s, #31\n" + "sqadd v16.4s, v16.4s, v18.4s\n" + "sqadd v17.4s, v17.4s, v19.4s\n" + "srshl v16.4s, v16.4s, v29.4s\n" + "srshl v17.4s, v17.4s, v29.4s\n" + "add v16.4s, v16.4s, v28.4s\n" + "add v17.4s, v17.4s, v28.4s\n" + "smax v16.4s, v16.4s, v30.4s\n" + "smax v17.4s, v17.4s, v30.4s\n" + "smin v16.4s, v16.4s, v31.4s\n" + "smin v17.4s, v17.4s, v31.4s\n" + "sqxtn v16.4h, v16.4s\n" + "sqxtn2 v16.8h, v17.4s\n" + "sqxtun v16.8b, v16.8h\n" + "st1 {v16.8b}, [%[output_ptr]], #8\n" + "uaddw v8.8h, v26.8h, v8.8b\n" + "ld1 {v16.4s}, [%[bias_ptr]], #16\n" + "uaddw v0.8h, v25.8h, v0.8b\n" + "ld1 {v17.4s}, [%[bias_ptr]], #16\n" + + "bge " DEPTHWISECONV_LABEL_DEPTH_8_LOOP "b\n" + + DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP ":\n" + "smlal v16.4s, v0.4h, v8.4h\n" + "smlal2 v17.4s, v0.8h, v8.8h\n" + + "sqrdmulh v16.4s, v16.4s, v27.4s\n" + "sqrdmulh v17.4s, v17.4s, v27.4s\n" + "and v18.16b, v16.16b, v29.16b\n" + "and v19.16b, v17.16b, v29.16b\n" + "sshr v18.4s, v18.4s, #31\n" + "sshr v19.4s, v19.4s, #31\n" + "sqadd v16.4s, v16.4s, v18.4s\n" + "sqadd v17.4s, v17.4s, v19.4s\n" + "srshl v16.4s, v16.4s, v29.4s\n" + "srshl v17.4s, v17.4s, v29.4s\n" + + "add v16.4s, v16.4s, v28.4s\n" + "add v17.4s, v17.4s, v28.4s\n" + "smax v16.4s, v16.4s, v30.4s\n" + "smax v17.4s, v17.4s, v30.4s\n" + "smin v16.4s, v16.4s, v31.4s\n" + "smin v17.4s, v17.4s, v31.4s\n" + "sqxtn v16.4h, v16.4s\n" + "sqxtn2 v16.8h, v17.4s\n" + "sqxtun v16.8b, v16.8h\n" + "st1 {v16.8b}, [%[output_ptr]]\n" + : + // Outputs. + [filter_ptr] "+r"(filter_ptr), [input_ptr] "+r"(input_ptr), + [output_ptr] "+r"(output_ptr), [bias_ptr] "+r"(bias_ptr) + : + // Inputs. + [params_ptr] "r"(params_ptr) + : + // Clobbers. + "cc", "memory", + // We use these NEON registers. + "v0", "v8", "v16", "v17", "v18", "v19", "v25", "v26", "v27", "v28", + "v29", "v30", "v31", + // We use these general-purpose registers. 
+ "x9", "x10", "x11"); +#undef DEPTHWISECONV_LABEL_DEPTH_8_LOOP +#undef DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP } }; template <> -struct ConvKernel3x3FilterDepth8<2, 1, 2, 2> { - static inline void Run(const uint8* input_ptr, int input_depth, - int32 input_offset, int input_row_size, - const uint8* filter_ptr, int32 filter_offset, - const int32* bias_ptr, int32 output_offset, - int32 output_multiplier, int output_shift, - int32 output_activation_min, - int32 output_activation_max, uint8* output_ptr, - int output_depth, int output_width) { - const int output_row_size = output_depth * output_width; - - Filter3x3x8 filter = Load3x3Filter(filter_ptr, filter_offset, output_depth); - - const int16x8_t input_offset_vec = vdupq_n_s16(input_offset); - int16x8_t input_0, input_1, input_2, input_3, input_4, input_5, input_6, - input_7, input_8; - uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5, temp_6, temp_7, - temp_8; - - const uint8* ptr = input_ptr; - - // Load all inputs for top output. - temp_0 = vld1_u8(ptr); - temp_1 = vld1_u8(ptr + input_depth); - temp_2 = vld1_u8(ptr + 2 * input_depth); - ptr += input_row_size; - temp_3 = vld1_u8(ptr); - temp_4 = vld1_u8(ptr + input_depth); - temp_5 = vld1_u8(ptr + 2 * input_depth); - ptr += input_row_size; - temp_6 = vld1_u8(ptr); - temp_7 = vld1_u8(ptr + input_depth); - temp_8 = vld1_u8(ptr + 2 * input_depth); - - input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0)); - input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1)); - input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2)); - input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3)); - input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4)); - input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5)); - input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_6)); - input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_7)); - input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_8)); - - input_0 = vaddq_s16(input_0, input_offset_vec); - input_1 = vaddq_s16(input_1, input_offset_vec); - input_2 = vaddq_s16(input_2, input_offset_vec); - input_3 = vaddq_s16(input_3, input_offset_vec); - input_4 = vaddq_s16(input_4, input_offset_vec); - input_5 = vaddq_s16(input_5, input_offset_vec); - input_6 = vaddq_s16(input_6, input_offset_vec); - input_7 = vaddq_s16(input_7, input_offset_vec); - input_8 = vaddq_s16(input_8, input_offset_vec); - - DotProductAndStore( - filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6, - input_7, input_8, bias_ptr, output_offset, output_multiplier, - output_shift, output_activation_min, output_activation_max, output_ptr); - - // Second output. 
- output_ptr += output_row_size; - - ptr += input_row_size; - temp_0 = vld1_u8(ptr); - temp_1 = vld1_u8(ptr + input_depth); - temp_2 = vld1_u8(ptr + 2 * input_depth); - ptr += input_row_size; - temp_3 = vld1_u8(ptr); - temp_4 = vld1_u8(ptr + input_depth); - temp_5 = vld1_u8(ptr + 2 * input_depth); - - input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0)); - input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1)); - input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2)); - input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3)); - input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4)); - input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5)); - - input_0 = vaddq_s16(input_0, input_offset_vec); - input_1 = vaddq_s16(input_1, input_offset_vec); - input_2 = vaddq_s16(input_2, input_offset_vec); - input_3 = vaddq_s16(input_3, input_offset_vec); - input_4 = vaddq_s16(input_4, input_offset_vec); - input_5 = vaddq_s16(input_5, input_offset_vec); - - DotProductAndStore( - filter, input_6, input_7, input_8, input_0, input_1, input_2, input_3, - input_4, input_5, bias_ptr, output_offset, output_multiplier, - output_shift, output_activation_min, output_activation_max, output_ptr); +struct DepthwiseConvPartial<EdgeType::kCorner, 1, 1> { + static inline void Run(const uint8* input_ptr, const uint8* filter_ptr, + const int32* bias_ptr, uint8* output_ptr, + const DepthwiseConvParams* params_ptr) { +#define DEPTHWISECONV_LABEL_DEPTH_8_LOOP "1" +#define DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP "2" + asm volatile( + // Performs depthwise convolutions for an input window of size 2x2 and + // padding of 1 across the full depth. Expects |input_ptr| and + // |filter_ptr| to be pointing to the beginning of the 2x2 input and + // filter values. + + // Load input and filter values. + "ldr x15, [%[params_ptr], #" STR(OFFSET_OUTPUT_DEPTH) "]\n" + "ldr x9, [%[params_ptr], #" STR(OFFSET_INPUT_ROW_SIZE) "]\n" + "cmp x15, #16\n" + "add x12, %[input_ptr], x15\n" + "add x13, %[input_ptr], x9\n" + "ld1 {v8.8b}, [%[input_ptr]], #8\n" + "add x14, x13, x15\n" + "ld1 {v9.8b}, [x12], #8\n" + "ldr x6, [%[params_ptr], #" STR(OFFSET_FILTER_ROW_SIZE) "]\n" + + "add x9, %[filter_ptr], x15\n" + "ld1 {v10.8b}, [x13], #8\n" + "add x10, %[filter_ptr], x6\n" + "ld1 {v11.8b}, [x14], #8\n" + "ld1 {v0.8b}, [%[filter_ptr]], #8\n" + "add x11, x10, x15\n" + "ld1 {v1.8b}, [x9], #8\n" + "ld1 {v2.8b}, [x10], #8\n" + "ld1 {v3.8b}, [x11], #8\n" + + // Load constants. + "ldr w6, [%[params_ptr], #" STR(OFFSET_INPUT_OFFSET) "]\n" + "ldr w7, [%[params_ptr], #" STR(OFFSET_OUTPUT_MULTIPLIER) "]\n" + "dup v26.8h, w6\n" + "ldr w6, [%[params_ptr], #" STR(OFFSET_OUTPUT_OFFSET) "]\n" + "dup v27.4s, w7\n" + "ldr w7, [%[params_ptr], #" STR(OFFSET_OUTPUT_RIGHT_SHIFT) "]\n" + "dup v28.4s, w6\n" + "ldr w6, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MIN) "]\n" + "neg w7, w7\n" + "dup v29.4s, w7\n" + "ldr w7, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MAX) "]\n" + "dup v30.4s, w6\n" + "ldr w6, [%[params_ptr], #" STR(OFFSET_FILTER_OFFSET) "]\n" + "dup v31.4s, w7\n" + "dup v25.8h, w6\n" + + // Add input and filter offsets. 
+ "uaddw v8.8h, v26.8h, v8.8b\n" + "ld1 {v16.4s}, [%[bias_ptr]], #16\n" + "uaddw v9.8h, v26.8h, v9.8b\n" + "ld1 {v17.4s}, [%[bias_ptr]], #16\n" + "uaddw v10.8h, v26.8h, v10.8b\n" + "uaddw v11.8h, v26.8h, v11.8b\n" + + "uaddw v0.8h, v25.8h, v0.8b\n" + "uaddw v1.8h, v25.8h, v1.8b\n" + "uaddw v2.8h, v25.8h, v2.8b\n" + "uaddw v3.8h, v25.8h, v3.8b\n" + + "blt " DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP "f\n" + + //"loop_%=:\n" + DEPTHWISECONV_LABEL_DEPTH_8_LOOP ":\n" + "smlal v16.4s, v0.4h, v8.4h\n" + "subs x15, x15, #8\n" + "smlal2 v17.4s, v0.8h, v8.8h\n" + "ld1 {v8.8b}, [%[input_ptr]], #8\n" + "cmp x15, #16\n" + "ld1 {v0.8b}, [%[filter_ptr]], #8\n" + "smlal v16.4s, v1.4h, v9.4h\n" + "smlal2 v17.4s, v1.8h, v9.8h\n" + "ld1 {v9.8b}, [x12], #8\n" + "smlal v16.4s, v2.4h, v10.4h\n" + "ld1 {v1.8b}, [x9], #8\n" + "smlal2 v17.4s, v2.8h, v10.8h\n" + "ld1 {v10.8b}, [x13], #8\n" + "smlal v16.4s, v3.4h, v11.4h\n" + "ld1 {v2.8b}, [x10], #8\n" + "smlal2 v17.4s, v3.8h, v11.8h\n" + "ld1 {v11.8b}, [x14], #8\n" + "ld1 {v3.8b}, [x11], #8\n" + + "sqrdmulh v16.4s, v16.4s, v27.4s\n" + "sqrdmulh v17.4s, v17.4s, v27.4s\n" + "and v18.16b, v16.16b, v29.16b\n" + "and v19.16b, v17.16b, v29.16b\n" + "sshr v18.4s, v18.4s, #31\n" + "sshr v19.4s, v19.4s, #31\n" + "sqadd v16.4s, v16.4s, v18.4s\n" + "sqadd v17.4s, v17.4s, v19.4s\n" + "srshl v16.4s, v16.4s, v29.4s\n" + "srshl v17.4s, v17.4s, v29.4s\n" + "add v16.4s, v16.4s, v28.4s\n" + "add v17.4s, v17.4s, v28.4s\n" + "smax v16.4s, v16.4s, v30.4s\n" + "smax v17.4s, v17.4s, v30.4s\n" + "smin v16.4s, v16.4s, v31.4s\n" + "smin v17.4s, v17.4s, v31.4s\n" + "sqxtn v16.4h, v16.4s\n" + "sqxtn2 v16.8h, v17.4s\n" + "sqxtun v16.8b, v16.8h\n" + "st1 {v16.8b}, [%[output_ptr]], #8\n" + "uaddw v8.8h, v26.8h, v8.8b\n" + "ld1 {v16.4s}, [%[bias_ptr]], #16\n" + "uaddw v9.8h, v26.8h, v9.8b\n" + "ld1 {v17.4s}, [%[bias_ptr]], #16\n" + "uaddw v10.8h, v26.8h, v10.8b\n" + "uaddw v11.8h, v26.8h, v11.8b\n" + "uaddw v0.8h, v25.8h, v0.8b\n" + "uaddw v1.8h, v25.8h, v1.8b\n" + "uaddw v2.8h, v25.8h, v2.8b\n" + "uaddw v3.8h, v25.8h, v3.8b\n" + + "bge " DEPTHWISECONV_LABEL_DEPTH_8_LOOP "b\n" + + DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP ":\n" + "smlal v16.4s, v0.4h, v8.4h\n" + "smlal2 v17.4s, v0.8h, v8.8h\n" + "smlal v16.4s, v1.4h, v9.4h\n" + "smlal2 v17.4s, v1.8h, v9.8h\n" + "smlal v16.4s, v2.4h, v10.4h\n" + "smlal2 v17.4s, v2.8h, v10.8h\n" + "smlal v16.4s, v3.4h, v11.4h\n" + "smlal2 v17.4s, v3.8h, v11.8h\n" + + "sqrdmulh v16.4s, v16.4s, v27.4s\n" + "sqrdmulh v17.4s, v17.4s, v27.4s\n" + "and v18.16b, v16.16b, v29.16b\n" + "and v19.16b, v17.16b, v29.16b\n" + "sshr v18.4s, v18.4s, #31\n" + "sshr v19.4s, v19.4s, #31\n" + "sqadd v16.4s, v16.4s, v18.4s\n" + "sqadd v17.4s, v17.4s, v19.4s\n" + "srshl v16.4s, v16.4s, v29.4s\n" + "srshl v17.4s, v17.4s, v29.4s\n" + + "add v16.4s, v16.4s, v28.4s\n" + "add v17.4s, v17.4s, v28.4s\n" + "smax v16.4s, v16.4s, v30.4s\n" + "smax v17.4s, v17.4s, v30.4s\n" + "smin v16.4s, v16.4s, v31.4s\n" + "smin v17.4s, v17.4s, v31.4s\n" + "sqxtn v16.4h, v16.4s\n" + "sqxtn2 v16.8h, v17.4s\n" + "sqxtun v16.8b, v16.8h\n" + "st1 {v16.8b}, [%[output_ptr]]\n" + : + // Outputs. + [filter_ptr] "+r"(filter_ptr), [input_ptr] "+r"(input_ptr), + [output_ptr] "+r"(output_ptr), [bias_ptr] "+r"(bias_ptr) + : + // Inputs. + [params_ptr] "r"(params_ptr) + : + // Clobbers. + "cc", "memory", + // We use these NEON registers. + "v0", "v1", "v2", "v3", "v8", "v9", "v10", "v11", "v16", "v17", "v18", + "v19", "v25", "v26", "v27", "v28", "v29", "v30", "v31", + // We use these general-purpose registers. 
+ "x6", "x7", "x9", "x10", "x11", "x12", "x13", "x14", "x15"); +#undef DEPTHWISECONV_LABEL_DEPTH_8_LOOP +#undef DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP } }; template <> -struct ConvKernel3x3FilterDepth8<1, 2, 2, 2> { - static inline void Run(const uint8* input_ptr, int input_depth, - int32 input_offset, int input_row_size, - const uint8* filter_ptr, int32 filter_offset, - const int32* bias_ptr, int32 output_offset, - int32 output_multiplier, int output_shift, - int32 output_activation_min, - int32 output_activation_max, uint8* output_ptr, - int output_depth, int output_width) { - Filter3x3x8 filter = Load3x3Filter(filter_ptr, filter_offset, output_depth); - - const int16x8_t input_offset_vec = vdupq_n_s16(input_offset); - int16x8_t input_0, input_1, input_2, input_3, input_4, input_5, input_6, - input_7, input_8; - uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5, temp_6, temp_7, - temp_8; - - const uint8* ptr = input_ptr; - - // Load all inputs for top output. - temp_0 = vld1_u8(ptr); - temp_1 = vld1_u8(ptr + input_depth); - temp_2 = vld1_u8(ptr + 2 * input_depth); - ptr += input_row_size; - temp_3 = vld1_u8(ptr); - temp_4 = vld1_u8(ptr + input_depth); - temp_5 = vld1_u8(ptr + 2 * input_depth); - ptr += input_row_size; - temp_6 = vld1_u8(ptr); - temp_7 = vld1_u8(ptr + input_depth); - temp_8 = vld1_u8(ptr + 2 * input_depth); - - input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0)); - input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1)); - input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2)); - input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3)); - input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4)); - input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5)); - input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_6)); - input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_7)); - input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_8)); - - input_0 = vaddq_s16(input_0, input_offset_vec); - input_1 = vaddq_s16(input_1, input_offset_vec); - input_2 = vaddq_s16(input_2, input_offset_vec); - input_3 = vaddq_s16(input_3, input_offset_vec); - input_4 = vaddq_s16(input_4, input_offset_vec); - input_5 = vaddq_s16(input_5, input_offset_vec); - input_6 = vaddq_s16(input_6, input_offset_vec); - input_7 = vaddq_s16(input_7, input_offset_vec); - input_8 = vaddq_s16(input_8, input_offset_vec); - - DotProductAndStore( - filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6, - input_7, input_8, bias_ptr, output_offset, output_multiplier, - output_shift, output_activation_min, output_activation_max, output_ptr); - - // Second output. 
- output_ptr += output_depth; - - ptr = input_ptr + 3 * input_depth; - temp_0 = vld1_u8(ptr); - temp_1 = vld1_u8(ptr + input_depth); - ptr += input_row_size; - temp_3 = vld1_u8(ptr); - temp_4 = vld1_u8(ptr + input_depth); - ptr += input_row_size; - temp_6 = vld1_u8(ptr); - temp_7 = vld1_u8(ptr + input_depth); - - input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0)); - input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1)); - input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3)); - input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4)); - input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_6)); - input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_7)); - - input_0 = vaddq_s16(input_0, input_offset_vec); - input_1 = vaddq_s16(input_1, input_offset_vec); - input_3 = vaddq_s16(input_3, input_offset_vec); - input_4 = vaddq_s16(input_4, input_offset_vec); - input_6 = vaddq_s16(input_6, input_offset_vec); - input_7 = vaddq_s16(input_7, input_offset_vec); - - DotProductAndStore( - filter, input_2, input_0, input_1, input_5, input_3, input_4, input_8, - input_6, input_7, bias_ptr, output_offset, output_multiplier, - output_shift, output_activation_min, output_activation_max, output_ptr); +struct DepthwiseConvPartial<EdgeType::kHorizontal, 1, 1> { + static inline void Run(const uint8* input_ptr, const uint8* filter_ptr, + const int32* bias_ptr, uint8* output_ptr, + const DepthwiseConvParams* params_ptr) { +#define DEPTHWISECONV_LABEL_DEPTH_8_LOOP "1" +#define DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP "2" + asm volatile( + // Performs depthwise convolutions for an input window of size 2x3 and + // padding of 1 across the full depth. Expects |input_ptr| and + // |filter_ptr| to be pointing to the beginning of the 2x3 input and + // filter values. + + // Load input and filter values. + "ldr x7, [%[params_ptr], #" STR(OFFSET_INPUT_DEPTH) "]\n" + "mov x12, %[input_ptr]\n" + "ldr x11, [%[params_ptr], #" STR(OFFSET_INPUT_ROW_SIZE) "]\n" + "mov x9, %[filter_ptr]\n" + "ldr x14, [%[params_ptr], #" STR(OFFSET_FILTER_ROW_SIZE) "]\n" + "add x13, x12, x11\n" + "ldr x15, [%[params_ptr], #" STR(OFFSET_OUTPUT_DEPTH) "]\n" + + "ld1 {v8.8b}, [x12], x7\n" + "add x10, x9, x14\n" + "ld1 {v9.8b}, [x12], x7\n" + "cmp x15, #16\n" + "ld1 {v10.8b}, [x12]\n" + "add %[input_ptr], %[input_ptr], #8\n" + "ld1 {v11.8b}, [x13], x7\n" + "add %[filter_ptr], %[filter_ptr], #8\n" + "ld1 {v12.8b}, [x13], x7\n" + "ld1 {v13.8b}, [x13]\n" + + "ld1 {v0.8b}, [x9], x7\n" + "ld1 {v1.8b}, [x9], x7\n" + "ld1 {v2.8b}, [x9]\n" + "ld1 {v3.8b}, [x10], x7\n" + "ld1 {v4.8b}, [x10], x7\n" + "ld1 {v5.8b}, [x10]\n" + + // Load constants. + "ldr w12, [%[params_ptr], #" STR(OFFSET_INPUT_OFFSET) "]\n" + "ldr w13, [%[params_ptr], #" STR(OFFSET_OUTPUT_MULTIPLIER) "]\n" + "dup v26.8h, w12\n" + "ldr w12, [%[params_ptr], #" STR(OFFSET_OUTPUT_OFFSET) "]\n" + "dup v27.4s, w13\n" + "ldr w13, [%[params_ptr], #" STR(OFFSET_OUTPUT_RIGHT_SHIFT) "]\n" + "dup v28.4s, w12\n" + "ldr w12, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MIN) "]\n" + "neg w13, w13\n" + "dup v29.4s, w13\n" + "ldr w13, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MAX) "]\n" + "dup v30.4s, w12\n" + "ldr w12, [%[params_ptr], #" STR(OFFSET_FILTER_OFFSET) "]\n" + "dup v31.4s, w13\n" + "dup v25.8h, w12\n" + + // Add input and filter offsets. 
+ "uaddw v8.8h, v26.8h, v8.8b\n" + "ld1 {v16.4s}, [%[bias_ptr]], #16\n" + "uaddw v9.8h, v26.8h, v9.8b\n" + "ld1 {v17.4s}, [%[bias_ptr]], #16\n" + "uaddw v10.8h, v26.8h, v10.8b\n" + "uaddw v11.8h, v26.8h, v11.8b\n" + "uaddw v12.8h, v26.8h, v12.8b\n" + "uaddw v13.8h, v26.8h, v13.8b\n" + + "uaddw v0.8h, v25.8h, v0.8b\n" + "uaddw v1.8h, v25.8h, v1.8b\n" + "uaddw v2.8h, v25.8h, v2.8b\n" + "uaddw v3.8h, v25.8h, v3.8b\n" + "uaddw v4.8h, v25.8h, v4.8b\n" + "uaddw v5.8h, v25.8h, v5.8b\n" + + "blt " DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP "f\n" + + //"loop_%=:\n" + DEPTHWISECONV_LABEL_DEPTH_8_LOOP ":\n" + "mov x12, %[input_ptr]\n" + "subs x15, x15, #8\n" + "add x13, x12, x11\n" + "cmp x15, #16\n" + "add %[input_ptr], %[input_ptr], #8\n" + + "smlal v16.4s, v0.4h, v8.4h\n" + "mov x9, %[filter_ptr]\n" + "smlal2 v17.4s, v0.8h, v8.8h\n" + "ld1 {v8.8b}, [x12], x7\n" + "smlal v16.4s, v1.4h, v9.4h\n" + "add x10, x9, x14\n" + "smlal2 v17.4s, v1.8h, v9.8h\n" + "ld1 {v9.8b}, [x12], x7\n" + "smlal v16.4s, v2.4h, v10.4h\n" + "add %[filter_ptr], %[filter_ptr], #8\n" + "smlal2 v17.4s, v2.8h, v10.8h\n" + "ld1 {v10.8b}, [x12]\n" + "smlal v16.4s, v3.4h, v11.4h\n" + "ld1 {v0.8b}, [x9], x7\n" + "smlal2 v17.4s, v3.8h, v11.8h\n" + "ld1 {v11.8b}, [x13], x7\n" + "smlal v16.4s, v4.4h, v12.4h\n" + "ld1 {v1.8b}, [x9], x7\n" + "smlal2 v17.4s, v4.8h, v12.8h\n" + "ld1 {v12.8b}, [x13], x7\n" + "smlal v16.4s, v5.4h, v13.4h\n" + "ld1 {v2.8b}, [x9]\n" + "smlal2 v17.4s, v5.8h, v13.8h\n" + "ld1 {v13.8b}, [x13]\n" + + "sqrdmulh v16.4s, v16.4s, v27.4s\n" + "ld1 {v3.8b}, [x10], x7\n" + "sqrdmulh v17.4s, v17.4s, v27.4s\n" + "ld1 {v4.8b}, [x10], x7\n" + "and v18.16b, v16.16b, v29.16b\n" + "ld1 {v5.8b}, [x10]\n" + "and v19.16b, v17.16b, v29.16b\n" + "sshr v18.4s, v18.4s, #31\n" + "sshr v19.4s, v19.4s, #31\n" + "sqadd v16.4s, v16.4s, v18.4s\n" + "sqadd v17.4s, v17.4s, v19.4s\n" + "srshl v16.4s, v16.4s, v29.4s\n" + "srshl v17.4s, v17.4s, v29.4s\n" + "add v16.4s, v16.4s, v28.4s\n" + "add v17.4s, v17.4s, v28.4s\n" + "smax v16.4s, v16.4s, v30.4s\n" + "smax v17.4s, v17.4s, v30.4s\n" + "smin v16.4s, v16.4s, v31.4s\n" + "smin v17.4s, v17.4s, v31.4s\n" + "sqxtn v16.4h, v16.4s\n" + "sqxtn2 v16.8h, v17.4s\n" + "sqxtun v16.8b, v16.8h\n" + "uaddw v8.8h, v26.8h, v8.8b\n" + "st1 {v16.8b}, [%[output_ptr]], #8\n" + "uaddw v9.8h, v26.8h, v9.8b\n" + "uaddw v10.8h, v26.8h, v10.8b\n" + "uaddw v11.8h, v26.8h, v11.8b\n" + "uaddw v12.8h, v26.8h, v12.8b\n" + "uaddw v13.8h, v26.8h, v13.8b\n" + + "uaddw v0.8h, v25.8h, v0.8b\n" + "uaddw v1.8h, v25.8h, v1.8b\n" + "uaddw v2.8h, v25.8h, v2.8b\n" + "ld1 {v16.4s}, [%[bias_ptr]], #16\n" + "uaddw v3.8h, v25.8h, v3.8b\n" + "ld1 {v17.4s}, [%[bias_ptr]], #16\n" + "uaddw v4.8h, v25.8h, v4.8b\n" + "uaddw v5.8h, v25.8h, v5.8b\n" + + "bge " DEPTHWISECONV_LABEL_DEPTH_8_LOOP "b\n" + + DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP ":\n" + "smlal v16.4s, v0.4h, v8.4h\n" + "smlal2 v17.4s, v0.8h, v8.8h\n" + "smlal v16.4s, v1.4h, v9.4h\n" + "smlal2 v17.4s, v1.8h, v9.8h\n" + "smlal v16.4s, v2.4h, v10.4h\n" + "smlal2 v17.4s, v2.8h, v10.8h\n" + "smlal v16.4s, v3.4h, v11.4h\n" + "smlal2 v17.4s, v3.8h, v11.8h\n" + "smlal v16.4s, v4.4h, v12.4h\n" + "smlal2 v17.4s, v4.8h, v12.8h\n" + "smlal v16.4s, v5.4h, v13.4h\n" + "smlal2 v17.4s, v5.8h, v13.8h\n" + + "sqrdmulh v16.4s, v16.4s, v27.4s\n" + "sqrdmulh v17.4s, v17.4s, v27.4s\n" + "and v18.16b, v16.16b, v29.16b\n" + "and v19.16b, v17.16b, v29.16b\n" + "sshr v18.4s, v18.4s, #31\n" + "sshr v19.4s, v19.4s, #31\n" + "sqadd v16.4s, v16.4s, v18.4s\n" + "sqadd v17.4s, v17.4s, v19.4s\n" + "srshl v16.4s, v16.4s, 
v29.4s\n" + "srshl v17.4s, v17.4s, v29.4s\n" + "add v16.4s, v16.4s, v28.4s\n" + "add v17.4s, v17.4s, v28.4s\n" + "smax v16.4s, v16.4s, v30.4s\n" + "smax v17.4s, v17.4s, v30.4s\n" + "smin v16.4s, v16.4s, v31.4s\n" + "smin v17.4s, v17.4s, v31.4s\n" + "sqxtn v16.4h, v16.4s\n" + "sqxtn2 v16.8h, v17.4s\n" + "sqxtun v16.8b, v16.8h\n" + "st1 {v16.8b}, [%[output_ptr]]\n" + : + // Outputs. + [filter_ptr] "+r"(filter_ptr), [input_ptr] "+r"(input_ptr), + [output_ptr] "+r"(output_ptr), [bias_ptr] "+r"(bias_ptr) + : + // Inputs. + [params_ptr] "r"(params_ptr) + : + // Clobbers. + "cc", "memory", + // We use these NEON registers. + "v0", "v1", "v2", "v3", "v4", "v5", "v8", "v9", "v10", "v11", "v12", + "v13", "v16", "v17", "v18", "v19", "v25", "v26", "v27", "v28", "v29", + "v30", "v31", + // We use these general-purpose registers. + "x7", "x9", "x10", "x11", "x12", "x13", "x14", "x15"); +#undef DEPTHWISECONV_LABEL_DEPTH_8_LOOP +#undef DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP } }; template <> -struct ConvKernel3x3FilterDepth8<1, 4, 2, 2> { - static inline void Run(const uint8* input_ptr, int input_depth, - int32 input_offset, int input_row_size, - const uint8* filter_ptr, int32 filter_offset, - const int32* bias_ptr, int32 output_offset, - int32 output_multiplier, int output_shift, - int32 output_activation_min, - int32 output_activation_max, uint8* output_ptr, - int output_depth, int output_width) { - Filter3x3x8 filter = Load3x3Filter(filter_ptr, filter_offset, output_depth); - - const int16x8_t input_offset_vec = vdupq_n_s16(input_offset); - int16x8_t input_0, input_1, input_2, input_3, input_4, input_5, input_6, - input_7, input_8; - uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5, temp_6, temp_7, - temp_8; - - const uint8* ptr = input_ptr; - - // Load all inputs for top output. - temp_0 = vld1_u8(ptr); - temp_1 = vld1_u8(ptr + input_depth); - temp_2 = vld1_u8(ptr + 2 * input_depth); - ptr += input_row_size; - temp_3 = vld1_u8(ptr); - temp_4 = vld1_u8(ptr + input_depth); - temp_5 = vld1_u8(ptr + 2 * input_depth); - ptr += input_row_size; - temp_6 = vld1_u8(ptr); - temp_7 = vld1_u8(ptr + input_depth); - temp_8 = vld1_u8(ptr + 2 * input_depth); - - input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0)); - input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1)); - input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2)); - input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3)); - input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4)); - input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5)); - input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_6)); - input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_7)); - input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_8)); - - input_0 = vaddq_s16(input_0, input_offset_vec); - input_1 = vaddq_s16(input_1, input_offset_vec); - input_2 = vaddq_s16(input_2, input_offset_vec); - input_3 = vaddq_s16(input_3, input_offset_vec); - input_4 = vaddq_s16(input_4, input_offset_vec); - input_5 = vaddq_s16(input_5, input_offset_vec); - input_6 = vaddq_s16(input_6, input_offset_vec); - input_7 = vaddq_s16(input_7, input_offset_vec); - input_8 = vaddq_s16(input_8, input_offset_vec); - - DotProductAndStore( - filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6, - input_7, input_8, bias_ptr, output_offset, output_multiplier, - output_shift, output_activation_min, output_activation_max, output_ptr); - - // Second output. 
- output_ptr += output_depth; - - ptr = input_ptr + 3 * input_depth; - temp_0 = vld1_u8(ptr); - temp_1 = vld1_u8(ptr + input_depth); - ptr += input_row_size; - temp_3 = vld1_u8(ptr); - temp_4 = vld1_u8(ptr + input_depth); - ptr += input_row_size; - temp_6 = vld1_u8(ptr); - temp_7 = vld1_u8(ptr + input_depth); - - input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0)); - input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1)); - input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3)); - input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4)); - input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_6)); - input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_7)); - - input_0 = vaddq_s16(input_0, input_offset_vec); - input_1 = vaddq_s16(input_1, input_offset_vec); - input_3 = vaddq_s16(input_3, input_offset_vec); - input_4 = vaddq_s16(input_4, input_offset_vec); - input_6 = vaddq_s16(input_6, input_offset_vec); - input_7 = vaddq_s16(input_7, input_offset_vec); - - DotProductAndStore( - filter, input_2, input_0, input_1, input_5, input_3, input_4, input_8, - input_6, input_7, bias_ptr, output_offset, output_multiplier, - output_shift, output_activation_min, output_activation_max, output_ptr); - - // Third output. - output_ptr += output_depth; - - ptr = input_ptr + 5 * input_depth; - temp_2 = vld1_u8(ptr); - temp_0 = vld1_u8(ptr + input_depth); - ptr += input_row_size; - temp_5 = vld1_u8(ptr); - temp_3 = vld1_u8(ptr + input_depth); - ptr += input_row_size; - temp_8 = vld1_u8(ptr); - temp_6 = vld1_u8(ptr + input_depth); - - input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2)); - input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0)); - input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5)); - input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3)); - input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_8)); - input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_6)); - - input_2 = vaddq_s16(input_2, input_offset_vec); - input_0 = vaddq_s16(input_0, input_offset_vec); - input_5 = vaddq_s16(input_5, input_offset_vec); - input_3 = vaddq_s16(input_3, input_offset_vec); - input_8 = vaddq_s16(input_8, input_offset_vec); - input_6 = vaddq_s16(input_6, input_offset_vec); - - DotProductAndStore( - filter, input_1, input_2, input_0, input_4, input_5, input_3, input_7, - input_8, input_6, bias_ptr, output_offset, output_multiplier, - output_shift, output_activation_min, output_activation_max, output_ptr); - - // Fourth output. 
- output_ptr += output_depth; - - ptr = input_ptr + 7 * input_depth; - temp_1 = vld1_u8(ptr); - temp_2 = vld1_u8(ptr + input_depth); - ptr += input_row_size; - temp_4 = vld1_u8(ptr); - temp_5 = vld1_u8(ptr + input_depth); - ptr += input_row_size; - temp_7 = vld1_u8(ptr); - temp_8 = vld1_u8(ptr + input_depth); - - input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1)); - input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2)); - input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4)); - input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5)); - input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_7)); - input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_8)); - - input_1 = vaddq_s16(input_1, input_offset_vec); - input_2 = vaddq_s16(input_2, input_offset_vec); - input_4 = vaddq_s16(input_4, input_offset_vec); - input_5 = vaddq_s16(input_5, input_offset_vec); - input_7 = vaddq_s16(input_7, input_offset_vec); - input_8 = vaddq_s16(input_8, input_offset_vec); - - DotProductAndStore( - filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6, - input_7, input_8, bias_ptr, output_offset, output_multiplier, - output_shift, output_activation_min, output_activation_max, output_ptr); +struct DepthwiseConvPartial<EdgeType::kVertical, 1, 1> { + static inline void Run(const uint8* input_ptr, const uint8* filter_ptr, + const int32* bias_ptr, uint8* output_ptr, + const DepthwiseConvParams* params_ptr) { +#define DEPTHWISECONV_LABEL_DEPTH_8_LOOP "1" +#define DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP "2" + asm volatile( + // Performs depthwise convolutions for an input window of size 3x2 and + // padding of 1 across the full depth. Expects |input_ptr| and + // |filter_ptr| to be pointing to the beginning of the 3x2 input and + // filter values. + + // Load input and filter values. + "ldr x6, [%[params_ptr], #" STR(OFFSET_INPUT_DEPTH) "]\n" + "mov x12, %[input_ptr]\n" + "ldr x11, [%[params_ptr], #" STR(OFFSET_INPUT_ROW_SIZE) "]\n" + "mov x7, %[filter_ptr]\n" + "ldr x5, [%[params_ptr], #" STR(OFFSET_FILTER_ROW_SIZE) "]\n" + "add x13, x12, x11\n" + "ldr x15, [%[params_ptr], #" STR(OFFSET_OUTPUT_DEPTH) "]\n" + "add x14, x13, x11\n" + + "ld1 {v8.8b}, [x12], x6\n" + "add x9, x7, x5\n" + "ld1 {v9.8b}, [x12]\n" + "cmp x15, #16\n" + "add x10, x9, x5\n" + "ld1 {v10.8b}, [x13], x6\n" + "add %[input_ptr], %[input_ptr], #8\n" + "ld1 {v11.8b}, [x13]\n" + "add %[filter_ptr], %[filter_ptr], #8\n" + "ld1 {v12.8b}, [x14], x6\n" + "ld1 {v13.8b}, [x14]\n" + + "ld1 {v0.8b}, [x7], x6\n" + "ld1 {v1.8b}, [x7]\n" + "ld1 {v2.8b}, [x9], x6\n" + "ld1 {v3.8b}, [x9]\n" + "ld1 {v4.8b}, [x10], x6\n" + "ld1 {v5.8b}, [x10]\n" + + // Load constants. + "ldr w12, [%[params_ptr], #" STR(OFFSET_INPUT_OFFSET) "]\n" + "ldr w13, [%[params_ptr], #" STR(OFFSET_OUTPUT_MULTIPLIER) "]\n" + "dup v26.8h, w12\n" + "ldr w12, [%[params_ptr], #" STR(OFFSET_OUTPUT_OFFSET) "]\n" + "dup v27.4s, w13\n" + "ldr w13, [%[params_ptr], #" STR(OFFSET_OUTPUT_RIGHT_SHIFT) "]\n" + "dup v28.4s, w12\n" + "ldr w12, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MIN) "]\n" + "neg w13, w13\n" + "dup v29.4s, w13\n" + "ldr w13, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MAX) "]\n" + "dup v30.4s, w12\n" + "ldr w12, [%[params_ptr], #" STR(OFFSET_FILTER_OFFSET) "]\n" + "dup v31.4s, w13\n" + "dup v25.8h, w12\n" + + // Add input and filter offsets. 
+ "uaddw v8.8h, v26.8h, v8.8b\n" + "ld1 {v16.4s}, [%[bias_ptr]], #16\n" + "uaddw v9.8h, v26.8h, v9.8b\n" + "ld1 {v17.4s}, [%[bias_ptr]], #16\n" + "uaddw v10.8h, v26.8h, v10.8b\n" + "uaddw v11.8h, v26.8h, v11.8b\n" + "uaddw v12.8h, v26.8h, v12.8b\n" + "uaddw v13.8h, v26.8h, v13.8b\n" + + "uaddw v0.8h, v25.8h, v0.8b\n" + "uaddw v1.8h, v25.8h, v1.8b\n" + "uaddw v2.8h, v25.8h, v2.8b\n" + "uaddw v3.8h, v25.8h, v3.8b\n" + "uaddw v4.8h, v25.8h, v4.8b\n" + "uaddw v5.8h, v25.8h, v5.8b\n" + + "blt " DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP "f\n" + + //"loop_%=:\n" + DEPTHWISECONV_LABEL_DEPTH_8_LOOP ":\n" + "mov x12, %[input_ptr]\n" + "subs x15, x15, #8\n" + "add x13, x12, x11\n" + "cmp x15, #16\n" + "add x14, x13, x11\n" + "add %[input_ptr], %[input_ptr], #8\n" + + "smlal v16.4s, v0.4h, v8.4h\n" + "mov x7, %[filter_ptr]\n" + "smlal2 v17.4s, v0.8h, v8.8h\n" + "ld1 {v8.8b}, [x12], x6\n" + "smlal v16.4s, v1.4h, v9.4h\n" + "add x9, x7, x5\n" + "smlal2 v17.4s, v1.8h, v9.8h\n" + "add x10, x9, x5\n" + "ld1 {v9.8b}, [x12]\n" + "smlal v16.4s, v2.4h, v10.4h\n" + "add %[filter_ptr], %[filter_ptr], #8\n" + "smlal2 v17.4s, v2.8h, v10.8h\n" + "ld1 {v10.8b}, [x13], x6\n" + "smlal v16.4s, v3.4h, v11.4h\n" + "ld1 {v0.8b}, [x7], x6\n" + "smlal2 v17.4s, v3.8h, v11.8h\n" + "ld1 {v11.8b}, [x13]\n" + "smlal v16.4s, v4.4h, v12.4h\n" + "ld1 {v1.8b}, [x7]\n" + "smlal2 v17.4s, v4.8h, v12.8h\n" + "ld1 {v12.8b}, [x14], x6\n" + "smlal v16.4s, v5.4h, v13.4h\n" + "ld1 {v2.8b}, [x9], x6\n" + "smlal2 v17.4s, v5.8h, v13.8h\n" + "ld1 {v13.8b}, [x14]\n" + + "sqrdmulh v16.4s, v16.4s, v27.4s\n" + "ld1 {v3.8b}, [x9]\n" + "sqrdmulh v17.4s, v17.4s, v27.4s\n" + "ld1 {v4.8b}, [x10], x6\n" + "and v18.16b, v16.16b, v29.16b\n" + "ld1 {v5.8b}, [x10]\n" + "and v19.16b, v17.16b, v29.16b\n" + "sshr v18.4s, v18.4s, #31\n" + "sshr v19.4s, v19.4s, #31\n" + "sqadd v16.4s, v16.4s, v18.4s\n" + "sqadd v17.4s, v17.4s, v19.4s\n" + "srshl v16.4s, v16.4s, v29.4s\n" + "srshl v17.4s, v17.4s, v29.4s\n" + "add v16.4s, v16.4s, v28.4s\n" + "add v17.4s, v17.4s, v28.4s\n" + "smax v16.4s, v16.4s, v30.4s\n" + "smax v17.4s, v17.4s, v30.4s\n" + "smin v16.4s, v16.4s, v31.4s\n" + "smin v17.4s, v17.4s, v31.4s\n" + "sqxtn v16.4h, v16.4s\n" + "sqxtn2 v16.8h, v17.4s\n" + "sqxtun v16.8b, v16.8h\n" + "uaddw v8.8h, v26.8h, v8.8b\n" + "st1 {v16.8b}, [%[output_ptr]], #8\n" + "uaddw v9.8h, v26.8h, v9.8b\n" + "uaddw v10.8h, v26.8h, v10.8b\n" + "uaddw v11.8h, v26.8h, v11.8b\n" + "uaddw v12.8h, v26.8h, v12.8b\n" + "uaddw v13.8h, v26.8h, v13.8b\n" + + "uaddw v0.8h, v25.8h, v0.8b\n" + "uaddw v1.8h, v25.8h, v1.8b\n" + "uaddw v2.8h, v25.8h, v2.8b\n" + "ld1 {v16.4s}, [%[bias_ptr]], #16\n" + "uaddw v3.8h, v25.8h, v3.8b\n" + "ld1 {v17.4s}, [%[bias_ptr]], #16\n" + "uaddw v4.8h, v25.8h, v4.8b\n" + "uaddw v5.8h, v25.8h, v5.8b\n" + + "bge " DEPTHWISECONV_LABEL_DEPTH_8_LOOP "b\n" + + DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP ":\n" + "smlal v16.4s, v0.4h, v8.4h\n" + "smlal2 v17.4s, v0.8h, v8.8h\n" + "smlal v16.4s, v1.4h, v9.4h\n" + "smlal2 v17.4s, v1.8h, v9.8h\n" + "smlal v16.4s, v2.4h, v10.4h\n" + "smlal2 v17.4s, v2.8h, v10.8h\n" + "smlal v16.4s, v3.4h, v11.4h\n" + "smlal2 v17.4s, v3.8h, v11.8h\n" + "smlal v16.4s, v4.4h, v12.4h\n" + "smlal2 v17.4s, v4.8h, v12.8h\n" + "smlal v16.4s, v5.4h, v13.4h\n" + "smlal2 v17.4s, v5.8h, v13.8h\n" + + "sqrdmulh v16.4s, v16.4s, v27.4s\n" + "sqrdmulh v17.4s, v17.4s, v27.4s\n" + "and v18.16b, v16.16b, v29.16b\n" + "and v19.16b, v17.16b, v29.16b\n" + "sshr v18.4s, v18.4s, #31\n" + "sshr v19.4s, v19.4s, #31\n" + "sqadd v16.4s, v16.4s, v18.4s\n" + "sqadd v17.4s, v17.4s, 
v19.4s\n" + "srshl v16.4s, v16.4s, v29.4s\n" + "srshl v17.4s, v17.4s, v29.4s\n" + "add v16.4s, v16.4s, v28.4s\n" + "add v17.4s, v17.4s, v28.4s\n" + "smax v16.4s, v16.4s, v30.4s\n" + "smax v17.4s, v17.4s, v30.4s\n" + "smin v16.4s, v16.4s, v31.4s\n" + "smin v17.4s, v17.4s, v31.4s\n" + "sqxtn v16.4h, v16.4s\n" + "sqxtn2 v16.8h, v17.4s\n" + "sqxtun v16.8b, v16.8h\n" + "st1 {v16.8b}, [%[output_ptr]]\n" + : + // Outputs. + [filter_ptr] "+r"(filter_ptr), [input_ptr] "+r"(input_ptr), + [output_ptr] "+r"(output_ptr), [bias_ptr] "+r"(bias_ptr) + : + // Inputs. + [params_ptr] "r"(params_ptr) + : + // Clobbers. + "cc", "memory", + // We use these NEON registers. + "v0", "v1", "v2", "v3", "v4", "v5", "v8", "v9", "v10", "v11", "v12", + "v13", "v16", "v17", "v18", "v19", "v25", "v26", "v27", "v28", "v29", + "v30", "v31", + // We use these general-purpose registers. + "x5", "x6", "x7", "x9", "x10", "x11", "x12", "x13", "x14", "x15"); +#undef DEPTHWISECONV_LABEL_DEPTH_8_LOOP +#undef DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP } }; -template <int kFixedStrideWidth, int kFixedStrideHeight> -struct ConvKernel3x3FilterDepth8<1, 1, kFixedStrideWidth, kFixedStrideHeight> { - static inline void Run(const uint8* input_ptr, int input_depth, - int32 input_offset, int input_row_size, - const uint8* filter_ptr, int32 filter_offset, - const int32* bias_ptr, int32 output_offset, - int32 output_multiplier, int output_shift, - int32 output_activation_min, - int32 output_activation_max, uint8* output_ptr, - int output_depth, int output_width) { - Filter3x3x8 filter = Load3x3Filter(filter_ptr, filter_offset, output_depth); - - int16x8_t input_0, input_1, input_2, input_3, input_4, input_5, input_6, - input_7, input_8; - - uint8x8_t temp_0 = vld1_u8(input_ptr); - uint8x8_t temp_1 = vld1_u8(input_ptr + input_depth); - uint8x8_t temp_2 = vld1_u8(input_ptr + 2 * input_depth); - - input_ptr += input_row_size; - uint8x8_t temp_3 = vld1_u8(input_ptr); - uint8x8_t temp_4 = vld1_u8(input_ptr + input_depth); - uint8x8_t temp_5 = vld1_u8(input_ptr + 2 * input_depth); - - input_ptr += input_row_size; - uint8x8_t temp_6 = vld1_u8(input_ptr); - uint8x8_t temp_7 = vld1_u8(input_ptr + input_depth); - uint8x8_t temp_8 = vld1_u8(input_ptr + 2 * input_depth); - - input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0)); - input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1)); - input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2)); - input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3)); - input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4)); - input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5)); - input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_6)); - input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_7)); - input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_8)); - - const int16x8_t input_offset_vec = vdupq_n_s16(input_offset); - input_0 = vaddq_s16(input_0, input_offset_vec); - input_1 = vaddq_s16(input_1, input_offset_vec); - input_2 = vaddq_s16(input_2, input_offset_vec); - input_3 = vaddq_s16(input_3, input_offset_vec); - input_4 = vaddq_s16(input_4, input_offset_vec); - input_5 = vaddq_s16(input_5, input_offset_vec); - input_6 = vaddq_s16(input_6, input_offset_vec); - input_7 = vaddq_s16(input_7, input_offset_vec); - input_8 = vaddq_s16(input_8, input_offset_vec); - - DotProductAndStore( - filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6, - input_7, input_8, bias_ptr, output_offset, output_multiplier, - output_shift, output_activation_min, output_activation_max, output_ptr); - } -}; - -inline void ShuffleInput(const uint8* input_ptr, int input_depth, - int 
input_width, int input_height, int output_depth, - int output_width, int output_height, - uint8* output_ptr) { - const int input_row_size = input_depth * input_width; - - for (int y = 0; y < output_height; y++) { +#undef OFFSET_INPUT_DEPTH +#undef OFFSET_INPUT_ROW_SIZE +#undef OFFSET_OUTPUT_DEPTH +#undef OFFSET_OUTPUT_ROW_SIZE +#undef OFFSET_INPUT_OFFSET +#undef OFFSET_OUTPUT_OFFSET +#undef OFFSET_FILTER_OFFSET +#undef OFFSET_OUTPUT_MULTIPLIER +#undef OFFSET_OUTPUT_ACTIVATION_MIN +#undef OFFSET_OUTPUT_ACTIVATION_MAX +#undef OFFSET_OUTPUT_RIGHT_SHIFT +#undef OFFSET_INPUT_WIDTH +#undef OFFSET_INPUT_HEIGHT +#undef OFFSET_OUTPUT_WIDTH +#undef OFFSET_OUTPUT_HEIGHT +#undef STR +#undef STR_UNEXPANDED + +// Copies a subset of the input designated by |input_ptr| into |output_ptr| +// with the specified output dimensions. Supports output depths of 64 only as +// this is the cache line size. +inline void ShuffleInput(const uint8* input_ptr, int64_t input_depth, + int32 input_width, int32 input_height, + int64_t output_depth, int32 output_width, + int32 output_height, uint8* output_ptr) { + const int64_t input_row_size = input_depth * input_width; + for (int32 y = 0; y < output_height; y++) { const uint8* ptr = input_ptr; - for (int x = 0; x < output_width; x++) { + for (int32 x = 0; x < output_width; x++) { memcpy(output_ptr, ptr, output_depth); output_ptr += output_depth; ptr += input_depth; @@ -3873,561 +2937,265 @@ inline void ShuffleInput(const uint8* input_ptr, int input_depth, } } -template <int kFixedHeight, int kFixedStrideWidth, int kFixedStrideHeight> -struct ConvRow3x3FilterDepth8 {}; - -template <int kFixedStrideWidth, int kFixedStrideHeight> -struct ConvRow3x3FilterDepth8<1, kFixedStrideWidth, kFixedStrideHeight> { - static inline void Run(const uint8* input_data, int start_x, int start_y, - int input_depth, int input_width, int input_height, - int input_row_size, int32 input_offset, - const uint8* filter_data, int32 filter_offset, - const int32* bias_data, int32 output_offset, - int32 output_multiplier, int output_shift, - int32 output_activation_min, - int32 output_activation_max, uint8* output_data, - int output_depth, int output_width, - uint8* shuffle_workspace) { - int out_x = start_x; - - // 1x4 at a time. - for (; out_x <= output_width - 4; out_x += 4) { - const int32* bias_ptr = bias_data; - const uint8* filter_ptr = filter_data; - - const uint8* input_ptr = input_data; - uint8* output_ptr = output_data; - - for (int depth = 0; depth <= output_depth - 8; depth += 8) { - ConvKernel3x3FilterDepth8<1, 4, kFixedStrideWidth, kFixedStrideHeight>:: - Run(input_ptr, input_depth, input_offset, input_row_size, - filter_ptr, filter_offset, bias_ptr, output_offset, - output_multiplier, output_shift, output_activation_min, - output_activation_max, output_ptr, output_depth, output_width); - - input_ptr += 8; - output_ptr += 8; - filter_ptr += 8; - bias_ptr += 8; - } - - input_data += 4 * kFixedStrideWidth * input_depth; - output_data += 4 * output_depth; - } - - // 1x1 at a time. 
- for (; out_x < output_width; out_x++) { - const int32* bias_ptr = bias_data; - const uint8* filter_ptr = filter_data; - - const uint8* input_ptr = input_data; - uint8* output_ptr = output_data; - - for (int depth = 0; depth <= output_depth - 8; depth += 8) { - ConvKernel3x3FilterDepth8<1, 1, kFixedStrideWidth, kFixedStrideHeight>:: - Run(input_ptr, input_depth, input_offset, input_row_size, - filter_ptr, filter_offset, bias_ptr, output_offset, - output_multiplier, output_shift, output_activation_min, - output_activation_max, output_ptr, output_depth, output_width); - - input_ptr += 8; - output_ptr += 8; - filter_ptr += 8; - bias_ptr += 8; - } +// Calculates the input size depending on stride and output. +inline int32 get_shuffle_input_size(int32 stride, int32 output) { + return stride * (output - 1) + 3; +} - input_data += kFixedStrideWidth * input_depth; - output_data += output_depth; - } +// Indicates the input and output dimensions used when shuffling input +// activations. +struct ShuffleParams { + int32 output_width; + int32 output_height; + int32 input_width; + int32 input_height; + + ShuffleParams() = default; + ShuffleParams(int32 output_width, int32 output_height, int32 stride_width, + int32 stride_height) + : output_width(output_width) + , output_height(output_height) + , input_width(get_shuffle_input_size(stride_width, output_width)) + , input_height(get_shuffle_input_size(stride_height, output_height)) { } }; -template <int kFixedStrideWidth, int kFixedStrideHeight> -struct ConvRow3x3FilterDepth8<2, kFixedStrideWidth, kFixedStrideHeight> { - static inline void Run(const uint8* input_data, int start_x, int start_y, - int input_depth, int input_width, int input_height, - int input_row_size, int32 input_offset, - const uint8* filter_data, int32 filter_offset, - const int32* bias_data, int32 output_offset, - int32 output_multiplier, int output_shift, - int32 output_activation_min, - int32 output_activation_max, uint8* output_data, - int output_depth, int output_width, - uint8* shuffle_workspace) { - int out_x = start_x; - - // 2x4 at a time. - for (; out_x <= output_width - 4; out_x += 4) { - const int32* bias_ptr = bias_data; - const uint8* filter_ptr = filter_data; - - const uint8* input_ptr = input_data; - uint8* output_ptr = output_data; - - for (int depth = 0; depth <= output_depth - 8; depth += 8) { - ConvKernel3x3FilterDepth8<2, 4, kFixedStrideWidth, kFixedStrideHeight>:: - Run(input_ptr, input_depth, input_offset, input_row_size, - filter_ptr, filter_offset, bias_ptr, output_offset, - output_multiplier, output_shift, output_activation_min, - output_activation_max, output_ptr, output_depth, output_width); - - input_ptr += 8; - output_ptr += 8; - filter_ptr += 8; - bias_ptr += 8; - } - - input_data += 4 * kFixedStrideWidth * input_depth; - output_data += 4 * output_depth; - } - - // 2x2 at a time. 
- for (; out_x <= output_width - 2; out_x += 2) { - const int32* bias_ptr = bias_data; - const uint8* filter_ptr = filter_data; - - const uint8* input_ptr = input_data; - uint8* output_ptr = output_data; - - for (int depth = 0; depth <= output_depth - 8; depth += 8) { - ConvKernel3x3FilterDepth8<2, 2, kFixedStrideWidth, kFixedStrideHeight>:: - Run(input_ptr, input_depth, input_offset, input_row_size, - filter_ptr, filter_offset, bias_ptr, output_offset, - output_multiplier, output_shift, output_activation_min, - output_activation_max, output_ptr, output_depth, output_width); - - input_ptr += 8; - output_ptr += 8; - filter_ptr += 8; - bias_ptr += 8; - } - - input_data += 2 * kFixedStrideWidth * input_depth; - output_data += 2 * output_depth; - } - - // 2x1 at a time. - for (; out_x < output_width; out_x++) { - const int32* bias_ptr = bias_data; - const uint8* filter_ptr = filter_data; - - const uint8* input_ptr = input_data; - uint8* output_ptr = output_data; - - for (int depth = 0; depth <= output_depth - 8; depth += 8) { - ConvKernel3x3FilterDepth8<2, 1, kFixedStrideWidth, kFixedStrideHeight>:: - Run(input_ptr, input_depth, input_offset, input_row_size, - filter_ptr, filter_offset, bias_ptr, output_offset, - output_multiplier, output_shift, output_activation_min, - output_activation_max, output_ptr, output_depth, output_width); - - input_ptr += 8; - output_ptr += 8; - filter_ptr += 8; - bias_ptr += 8; - } - - input_data += kFixedStrideWidth * input_depth; - output_data += output_depth; +template <int32 kStrideWidth, int32 kStrideHeight> +struct DepthwiseConvThroughDepth { + // Runs the DepthwiseConvWindow kernels through the depth dimension from + // |start_depth| to |end_depth|. Keep this not inlined to maintain a small + // binary size. We use a DepthwiseConvParams struct for read only params + // to minimize call overhead. + static __attribute__((noinline)) void Run(const uint8* input_ptr, + const uint8* filter_ptr, const int32* bias_ptr, uint8* output_ptr, + int64_t start_depth, int64_t end_depth, int64_t input_depth, + int64_t input_row_size, int32 output_window_height, + int32 output_window_width, const DepthwiseConvParams& params) { + for (; start_depth <= end_depth - 8; start_depth += 8) { + DepthwiseConvWindow<8, kStrideWidth, kStrideHeight>::Run( + input_ptr, filter_ptr, bias_ptr, output_ptr, input_depth, + input_row_size, output_window_height, output_window_width, ¶ms); + input_ptr += 8; + output_ptr += 8; + filter_ptr += 8; + bias_ptr += 8; } } }; -template <> -struct ConvRow3x3FilterDepth8<4, 1, 1> { - static inline void Run(const uint8* input_data, int start_x, int start_y, - int input_depth, int input_width, int input_height, - int input_row_size, int32 input_offset, - const uint8* filter_data, int32 filter_offset, - const int32* bias_data, int32 output_offset, - int32 output_multiplier, int output_shift, - int32 output_activation_min, - int32 output_activation_max, uint8* output_data, - int output_depth, int output_width, - uint8* shuffle_workspace) { - int out_x = start_x; - - // 4x4 at a time. 
- for (; out_x <= output_width - 4; out_x += 4) { - const int32* bias_ptr = bias_data; - const uint8* filter_ptr = filter_data; - - const uint8* input_ptr = input_data; - uint8* output_ptr = output_data; - - for (int depth = 0; depth <= output_depth - 8; depth += 8) { - ConvKernel3x3FilterDepth8<4, 4, 1, 1>::Run( - input_ptr, input_depth, input_offset, input_row_size, filter_ptr, - filter_offset, bias_ptr, output_offset, output_multiplier, - output_shift, output_activation_min, output_activation_max, - output_ptr, output_depth, output_width); - - input_ptr += 8; - output_ptr += 8; - filter_ptr += 8; - bias_ptr += 8; - } +template <int32 kStrideWidth, int32 kStrideHeight> +struct DepthwiseConvMultiRow { + using ConvKernel = DepthwiseConvThroughDepth<kStrideWidth, kStrideHeight>; - input_data += 4 * input_depth; - output_data += 4 * output_depth; - } - - // Handle the rest of the right side. - // 4x2 at a time. - for (; out_x <= output_width - 2; out_x += 2) { - const int32* bias_ptr = bias_data; - const uint8* filter_ptr = filter_data; - - const uint8* input_ptr = input_data; - uint8* output_ptr = output_data; - - for (int depth = 0; depth <= output_depth - 8; depth += 8) { - ConvKernel3x3FilterDepth8<4, 2, 1, 1>::Run( - input_ptr, input_depth, input_offset, input_row_size, filter_ptr, - filter_offset, bias_ptr, output_offset, output_multiplier, - output_shift, output_activation_min, output_activation_max, - output_ptr, output_depth, output_width); - - input_ptr += 8; - output_ptr += 8; - filter_ptr += 8; - bias_ptr += 8; - } - - input_data += 2 * input_depth; - output_data += 2 * output_depth; - } - - // 4x1 at a time. - for (; out_x < output_width; out_x++) { - const int32* bias_ptr = bias_data; - const uint8* filter_ptr = filter_data; - - const uint8* input_ptr = input_data; - uint8* output_ptr = output_data; - - for (int depth = 0; depth <= output_depth - 8; depth += 8) { - ConvKernel3x3FilterDepth8<4, 1, 1, 1>::Run( - input_ptr, input_depth, input_offset, input_row_size, filter_ptr, - filter_offset, bias_ptr, output_offset, output_multiplier, - output_shift, output_activation_min, output_activation_max, - output_ptr, output_depth, output_width); - - input_ptr += 8; - output_ptr += 8; - filter_ptr += 8; - bias_ptr += 8; - } - - input_data += input_depth; - output_data += output_depth; - } - } -}; - -template <> -struct ConvRow3x3FilterDepth8<4, 2, 2> { - // The buffer size of the shuffled input. - static inline constexpr int ShuffleWorkspaceSize() { return 64 * 9 * 9; } - - static inline void Run(const uint8* input_data, int start_x, int start_y, - int input_depth, int input_width, int input_height, - int input_row_size, int32 input_offset, - const uint8* filter_data, int32 filter_offset, - const int32* bias_data, int32 output_offset, - int32 output_multiplier, int output_shift, - int32 output_activation_min, - int32 output_activation_max, uint8* output_data, - int output_depth, int output_width, + static inline void Run(const uint8* input_data, int32 start_x, int32 end_x, + const uint8* filter_data, const int32* bias_data, + uint8* output_data, const DepthwiseConvParams& params, + const ShuffleParams& shuffle_params, uint8* shuffle_workspace) { - // Branch and cache misses increase substantially with stride 2 kernels. - // Adding prefetching reduces latency by as much as 2x. 
- const int i0 = 0; - const int i1 = input_depth; - const int i2 = 2 * input_depth; - const int i3 = 3 * input_depth; - const int i4 = 4 * input_depth; - const int i5 = 5 * input_depth; - const int i6 = 6 * input_depth; - const int i7 = 7 * input_depth; - const int i8 = 8 * input_depth; - -#define DEPTHWISECONV_PRELOAD_ROW(input_ptr, i) \ - preload_l1_keep(input_ptr + i * input_row_size + i0); \ - preload_l1_keep(input_ptr + i * input_row_size + i1); \ - preload_l1_keep(input_ptr + i * input_row_size + i2); \ - preload_l1_keep(input_ptr + i * input_row_size + i3); \ - preload_l1_keep(input_ptr + i * input_row_size + i4); \ - preload_l1_keep(input_ptr + i * input_row_size + i5); \ - preload_l1_keep(input_ptr + i * input_row_size + i6); \ - preload_l1_keep(input_ptr + i * input_row_size + i7); \ - preload_l1_keep(input_ptr + i * input_row_size + i8); - - int out_x = start_x; - // 4x4 at a time. - for (; out_x <= output_width - 4; out_x += 4) { - const int32* bias_ptr = bias_data; - const uint8* filter_ptr = filter_data; - - const uint8* input_ptr = input_data; - uint8* output_ptr = output_data; - - int depth = 0; - for (; depth <= output_depth - 64; depth += 64) { - // Preload 9x9 input. - DEPTHWISECONV_PRELOAD_ROW(input_ptr, 0); - DEPTHWISECONV_PRELOAD_ROW(input_ptr, 1); - DEPTHWISECONV_PRELOAD_ROW(input_ptr, 2); - DEPTHWISECONV_PRELOAD_ROW(input_ptr, 3); - DEPTHWISECONV_PRELOAD_ROW(input_ptr, 4); - DEPTHWISECONV_PRELOAD_ROW(input_ptr, 5); - DEPTHWISECONV_PRELOAD_ROW(input_ptr, 6); - DEPTHWISECONV_PRELOAD_ROW(input_ptr, 7); - DEPTHWISECONV_PRELOAD_ROW(input_ptr, 8); - - // For a large input window (64x9x9) that is small enough to fit in L1 - // cache, copy the input into a separate buffer and run the kernel on - // this new buffer. This reduces the likelihood of cache misses when - // the kernel is loading input data. If this size is ever changed, - // update the ShuffleWorkspaceSize() function to return the new size. - ShuffleInput(input_ptr, input_depth, input_width, input_height, 64, 9, - 9, shuffle_workspace); - const uint8* shuffled_ptr = &shuffle_workspace[0]; - - for (int micro_depth = 0; micro_depth <= 64 - 8; micro_depth += 8) { - ConvKernel3x3FilterDepth8<4, 4, 2, 2>::Run( - shuffled_ptr, 64, input_offset, 64 * 9, filter_ptr, filter_offset, - bias_ptr, output_offset, output_multiplier, output_shift, - output_activation_min, output_activation_max, output_ptr, - output_depth, output_width); - - shuffled_ptr += 8; - output_ptr += 8; - filter_ptr += 8; - bias_ptr += 8; + TFLITE_DCHECK(shuffle_params.input_height == + get_shuffle_input_size(kStrideHeight, shuffle_params.output_height)); + TFLITE_DCHECK(shuffle_params.input_width == + get_shuffle_input_size(kStrideWidth, shuffle_params.output_width)); + TFLITE_DCHECK(64 * shuffle_params.input_width * shuffle_params.input_height + <= DEPTHWISECONV_SHUFFLE_WORKSPACE_SIZE); + + int32 out_x = start_x; + + // Run shuffling on inputs with sufficiently large depth and width. When + // these parameters are large enough, more time is taken to load inputs + // from memory. At this point, it becomes useful to prefetch and + // preshuffle the input data to maximize locality. 
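// Editor's note (annotation, not part of the patch): the shuffle buffer
// holds 64 channels of a shuffle_params.input_width x
// shuffle_params.input_height input patch, which the TFLITE_DCHECKs above
// require to fit in DEPTHWISECONV_SHUFFLE_WORKSPACE_SIZE. For example, a
// stride-1 window producing an 8x8 output shuffles a 10x10 patch
// (get_shuffle_input_size(1, 8) == 10), i.e. 64 * 10 * 10 = 6400 bytes.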
+ if (params.output_depth > 64 || + (params.output_depth <= 64 && params.input_width > 150)) { + for (; out_x <= (end_x - shuffle_params.output_width); + out_x += shuffle_params.output_width) { + const uint8* input_ptr = input_data; + const int32* bias_ptr = bias_data; + const uint8* filter_ptr = filter_data; + uint8* output_ptr = output_data; + int64_t depth = 0; + const int64_t shuffle_row_size = 64 * shuffle_params.input_width; + + for (; depth <= params.output_depth - 64; depth += 64) { + // Preload. + const uint8* h_ptr = input_ptr; + for (int32 i = 0; i < shuffle_params.input_height; i++) { + const uint8* ptr = h_ptr; + for (int32 j = 0; j < shuffle_params.input_width; j++) { + asm volatile("prfm pldl1keep, [%[ptr]]\n" ::[ptr] "r"(ptr) :); + ptr += params.input_depth; + } + h_ptr += params.input_row_size; + } + + // For a large enough input, shuffle into buckets. + ShuffleInput(input_ptr, params.input_depth, params.input_width, + params.input_height, 64, shuffle_params.input_width, + shuffle_params.input_height, shuffle_workspace); + ConvKernel::Run(shuffle_workspace, filter_ptr, bias_ptr, output_ptr, + 0, 64, 64, shuffle_row_size, + shuffle_params.output_height, + shuffle_params.output_width, params); + input_ptr += 64; + output_ptr += 64; + filter_ptr += 64; + bias_ptr += 64; } - input_ptr += 64; - } - - // Preload 9x9 input one more time for the rest of the depth. - DEPTHWISECONV_PRELOAD_ROW(input_ptr, 0); - DEPTHWISECONV_PRELOAD_ROW(input_ptr, 1); - DEPTHWISECONV_PRELOAD_ROW(input_ptr, 2); - DEPTHWISECONV_PRELOAD_ROW(input_ptr, 3); - DEPTHWISECONV_PRELOAD_ROW(input_ptr, 4); - DEPTHWISECONV_PRELOAD_ROW(input_ptr, 5); - DEPTHWISECONV_PRELOAD_ROW(input_ptr, 6); - DEPTHWISECONV_PRELOAD_ROW(input_ptr, 7); - DEPTHWISECONV_PRELOAD_ROW(input_ptr, 8); - - for (; depth <= output_depth - 8; depth += 8) { - ConvKernel3x3FilterDepth8<4, 4, 2, 2>::Run( - input_ptr, input_depth, input_offset, input_row_size, filter_ptr, - filter_offset, bias_ptr, output_offset, output_multiplier, - output_shift, output_activation_min, output_activation_max, - output_ptr, output_depth, output_width); - - input_ptr += 8; - output_ptr += 8; - filter_ptr += 8; - bias_ptr += 8; - } - - input_data += 4 * 2 * input_depth; - output_data += 4 * output_depth; - } - -#undef DEPTHWISECONV_PRELOAD_ROW - - // Handle the rest of the right side. - // 4x2 at a time. - for (; out_x <= output_width - 2; out_x += 2) { - const int32* bias_ptr = bias_data; - const uint8* filter_ptr = filter_data; - const uint8* input_ptr = input_data; - uint8* output_ptr = output_data; + // Preload. + const uint8* h_ptr = input_ptr; + for (int32 i = 0; i < shuffle_params.input_height; i++) { + const uint8* ptr = h_ptr; + for (int32 j = 0; j < shuffle_params.input_width; j++) { + asm volatile("prfm pldl1keep, [%[ptr]]\n" ::[ptr] "r"(ptr) :); + ptr += params.input_depth; + } + h_ptr += params.input_row_size; + } - for (int depth = 0; depth <= output_depth - 8; depth += 8) { - ConvKernel3x3FilterDepth8<4, 2, 2, 2>::Run( - input_ptr, input_depth, input_offset, input_row_size, filter_ptr, - filter_offset, bias_ptr, output_offset, output_multiplier, - output_shift, output_activation_min, output_activation_max, - output_ptr, output_depth, output_width); + // Handle leftover depth. 
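// Editor's note (annotation, not part of the patch): output_depth is only
// guaranteed to be a multiple of 8 (input_depth % 8 == 0 with
// depth_multiplier == 1, per Fast3x3FilterKernelSupported below), so any
// channels left over after the 64-wide shuffled chunks are convolved
// directly from the unshuffled input.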
+ ConvKernel::Run(input_ptr, filter_ptr, bias_ptr, output_ptr, + depth, params.output_depth, params.input_depth, + params.input_row_size, shuffle_params.output_height, + shuffle_params.output_width, params); - input_ptr += 8; - output_ptr += 8; - filter_ptr += 8; - bias_ptr += 8; + input_data += + shuffle_params.output_width * kStrideWidth * params.input_depth; + output_data += shuffle_params.output_width * params.output_depth; } - - input_data += 2 * 2 * input_depth; - output_data += 2 * output_depth; } - // 4x1 at a time. - for (; out_x < output_width; out_x++) { - const int32* bias_ptr = bias_data; - const uint8* filter_ptr = filter_data; - - const uint8* input_ptr = input_data; - uint8* output_ptr = output_data; - - for (int depth = 0; depth <= output_depth - 8; depth += 8) { - ConvKernel3x3FilterDepth8<4, 1, 2, 2>::Run( - input_ptr, input_depth, input_offset, input_row_size, filter_ptr, - filter_offset, bias_ptr, output_offset, output_multiplier, - output_shift, output_activation_min, output_activation_max, - output_ptr, output_depth, output_width); - - input_ptr += 8; - output_ptr += 8; - filter_ptr += 8; - bias_ptr += 8; - } - - input_data += 2 * input_depth; - output_data += output_depth; + const int32 output_leftover_width = end_x - out_x; + if (output_leftover_width > 0) { + ConvKernel::Run(input_data, filter_data, bias_data, output_data, 0, + params.output_depth, params.input_depth, + params.input_row_size, shuffle_params.output_height, + output_leftover_width, params); } } }; -template <> -struct ConvRow3x3FilterDepth8<8, 2, 2> { - static inline void Run(const uint8* input_data, int start_x, int start_y, - int input_depth, int input_width, int input_height, - int input_row_size, int32 input_offset, - const uint8* filter_data, int32 filter_offset, - const int32* bias_data, int32 output_offset, - int32 output_multiplier, int output_shift, - int32 output_activation_min, - int32 output_activation_max, uint8* output_data, - int output_depth, int output_width, - uint8* shuffle_workspace) { - // Reuse 4 row kernels twice. - ConvRow3x3FilterDepth8<4, 2, 2>::Run( - input_data, start_x, start_y, input_depth, input_width, input_height, - input_row_size, input_offset, filter_data, filter_offset, bias_data, - output_offset, output_multiplier, output_shift, output_activation_min, - output_activation_max, output_data, output_depth, output_width, - shuffle_workspace); - - ConvRow3x3FilterDepth8<4, 2, 2>::Run( - input_data + 2 * 4 * input_row_size, start_x, start_y + 4, input_depth, - input_width, input_height, input_row_size, input_offset, filter_data, - filter_offset, bias_data, output_offset, output_multiplier, - output_shift, output_activation_min, output_activation_max, - output_data + 4 * output_depth * output_width, output_depth, - output_width, shuffle_workspace); +// Processes the borders of the input for pad_width and pad_height = 1. +// Calls 4 asm kernels: +// * 1x1 input shape. +// * Corner edges. +// * Horizontal edges. +// * Vertical edges. 
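// Editor's note (annotation, not part of the patch): the top row is handled
// first, then the left and right columns, then the bottom row, while the
// interior outputs are produced by the DepthwiseConvWindow /
// DepthwiseConvMultiRow kernels above:
//
//   C H H ... H C      C = kCorner     (2x2 input window)
//   V . . ... . V      H = kHorizontal (2x3 input window)
//   V . . ... . V      V = kVertical   (3x2 input window)
//   C H H ... H C      . = interior    (handled elsewhere)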
+inline void DepthwiseConvHandlePadding(const uint8* input_data, + const uint8* filter_data, const int32* bias_data, uint8* output_data, + const DepthwiseConvParams& params) { + if (params.input_width == 1 && params.input_height == 1) { + const uint8* filter_ptr = filter_data + params.filter_row_size + + params.output_depth; + DepthwiseConvPartial<EdgeType::kCenter, 1, 1>::Run(input_data, filter_ptr, + bias_data, output_data, ¶ms); + return; } -}; -template <> -struct ConvRow3x3FilterDepth8<8, 1, 1> { - // The buffer size of the shuffled input. - static inline constexpr int ShuffleWorkspaceSize() { return 64 * 10 * 10; } - - static inline void Run(const uint8* input_data, int start_x, int start_y, - int input_depth, int input_width, int input_height, - int input_row_size, int32 input_offset, - const uint8* filter_data, int32 filter_offset, - const int32* bias_data, int32 output_offset, - int32 output_multiplier, int output_shift, - int32 output_activation_min, - int32 output_activation_max, uint8* output_data, - int output_depth, int output_width, - uint8* shuffle_workspace) { - int out_x = start_x; - // 8x8 at a time. - for (; out_x <= output_width - 8; out_x += 8) { - const int32* bias_ptr = bias_data; - const uint8* filter_ptr = filter_data; - - const uint8* input_ptr = input_data; - uint8* output_ptr = output_data; - - int depth = 0; - for (; depth <= output_depth - 64; depth += 64) { - // For a large input window (64x10x10) that is small enough to fit in L1 - // cache, copy the input into a separate buffer and run the kernel on - // this new buffer. This reduces the likelihood of cache misses when - // the kernel is loading input data. If the size of the input window - // changes, update the function ShuffleWorkspaceSize() with the new - // size. - ShuffleInput(input_ptr, input_depth, input_width, input_height, 64, 10, - 10, shuffle_workspace); - const uint8* shuffled_ptr = shuffle_workspace; - - for (int micro_depth = 0; micro_depth <= 64 - 8; micro_depth += 8) { - ConvKernel3x3FilterDepth8<8, 8, 1, 1>::Run( - shuffled_ptr, 64, input_offset, 64 * 10, filter_ptr, - filter_offset, bias_ptr, output_offset, output_multiplier, - output_shift, output_activation_min, output_activation_max, - output_ptr, output_depth, output_width); - - shuffled_ptr += 8; - output_ptr += 8; - filter_ptr += 8; - bias_ptr += 8; - } - input_ptr += 64; - } + const int32 out_x_start_corner = 0; + const int32 out_x_end_corner = params.output_width - 1; + const int32 out_y_start_corner = 0; + const int32 out_y_end_corner = params.output_height - 1; + + // Handle top row. 
+ const uint8* input_ptr = input_data; + const uint8* filter_ptr = filter_data + params.filter_row_size + + params.output_depth; + uint8* output_ptr = output_data; + + DepthwiseConvPartial<EdgeType::kCorner, 1, 1>::Run(input_ptr, filter_ptr, + bias_data, output_ptr, &params); + + input_ptr += (params.stride_width - 1) * params.input_depth; + filter_ptr = filter_data + params.filter_row_size; + output_ptr += params.output_depth; + + for (int32 out_x = out_x_start_corner + 1; out_x < out_x_end_corner; + out_x++) { + DepthwiseConvPartial<EdgeType::kHorizontal, 1, 1>::Run( + input_ptr, filter_ptr, bias_data, output_ptr, &params); + input_ptr += params.stride_width * params.input_depth; + output_ptr += params.output_depth; + } - for (; depth <= output_depth - 8; depth += 8) { - ConvKernel3x3FilterDepth8<8, 8, 1, 1>::Run( - input_ptr, input_depth, input_offset, input_row_size, filter_ptr, - filter_offset, bias_ptr, output_offset, output_multiplier, - output_shift, output_activation_min, output_activation_max, - output_ptr, output_depth, output_width); - - input_ptr += 8; - output_ptr += 8; - filter_ptr += 8; - bias_ptr += 8; - } + DepthwiseConvPartial<EdgeType::kCorner, 1, 1>::Run(input_ptr, filter_ptr, + bias_data, output_ptr, &params); - input_data += 8 * input_depth; - output_data += 8 * output_depth; - } + // Handle left side. + input_ptr = input_data + (params.stride_width - 1) * params.input_row_size; + filter_ptr = filter_data + params.input_depth; + output_ptr = output_data + params.output_row_size; - // Handle the rest of the right side by re-using 4 row kernels twice. - ConvRow3x3FilterDepth8<4, 1, 1>::Run( - input_data, out_x, start_y, input_depth, input_width, input_height, - input_row_size, input_offset, filter_data, filter_offset, bias_data, - output_offset, output_multiplier, output_shift, output_activation_min, - output_activation_max, output_data, output_depth, output_width, - shuffle_workspace); - - ConvRow3x3FilterDepth8<4, 1, 1>::Run( - input_data + 4 * input_row_size, out_x, start_y + 4, input_depth, - input_width, input_height, input_row_size, input_offset, filter_data, - filter_offset, bias_data, output_offset, output_multiplier, - output_shift, output_activation_min, output_activation_max, - output_data + 4 * output_depth * output_width, output_depth, - output_width, shuffle_workspace); + for (int32 out_y = out_y_start_corner + 1; out_y < out_y_end_corner; + out_y++) { + DepthwiseConvPartial<EdgeType::kVertical, 1, 1>::Run( + input_ptr, filter_ptr, bias_data, output_ptr, &params); + input_ptr += params.stride_width * params.input_row_size; + output_ptr += params.output_row_size; } -}; -inline bool Fast3x3FilterKernelSupported(const Dims<4>& input_dims, - const Dims<4>& filter_dims, - int stride_width, int stride_height, - int pad_width, int pad_height, - int depth_multiplier, - const Dims<4>& output_dims) { - const int input_height = ArraySize(input_dims, 2); - const int input_width = ArraySize(input_dims, 1); - const int input_depth = ArraySize(input_dims, 0); - const int filter_height = ArraySize(filter_dims, 2); - const int filter_width = ArraySize(filter_dims, 1); - const int output_height = ArraySize(output_dims, 2); - const int output_width = ArraySize(output_dims, 1); - - bool supported = filter_width == 3 && filter_height == 3 && - depth_multiplier == 1 && - (stride_width == 1 || stride_width == 2) && - (stride_height == 1 || stride_height == 2) && - (stride_width == stride_height) && pad_width == 0 && - pad_height == 0 && (input_depth % 8) == 0; + // Handle right side.
+ input_ptr = input_data + (params.input_width - 2) * params.input_depth + + (params.stride_width - 1) * params.input_row_size; + filter_ptr = filter_data; + output_ptr = output_data + params.output_row_size + + (params.output_width - 1) * params.output_depth; + + for (int32 out_y = out_y_start_corner + 1; out_y < out_y_end_corner; + out_y++) { + DepthwiseConvPartial<EdgeType::kVertical, 1, 1>::Run( + input_ptr, filter_ptr, bias_data, output_ptr, &params); + input_ptr += params.stride_width * params.input_row_size; + output_ptr += params.output_row_size; + } + + // Handle bottom row. + input_ptr = input_data + (params.input_height - 2) * params.input_row_size; + filter_ptr = filter_data + params.output_depth; + output_ptr = output_data + + (params.output_height - 1) * params.output_row_size; + + DepthwiseConvPartial<EdgeType::kCorner, 1, 1>::Run(input_ptr, filter_ptr, + bias_data, output_ptr, &params); + + input_ptr += (params.stride_width == 1) ? 0 : params.input_depth; + filter_ptr = filter_data; + output_ptr += params.output_depth; + + for (int32 out_x = out_x_start_corner + 1; out_x < out_x_end_corner; + out_x++) { + DepthwiseConvPartial<EdgeType::kHorizontal, 1, 1>::Run( + input_ptr, filter_ptr, bias_data, output_ptr, &params); + input_ptr += params.stride_width * params.input_depth; + output_ptr += params.output_depth; + } + + DepthwiseConvPartial<EdgeType::kCorner, 1, 1>::Run(input_ptr, filter_ptr, + bias_data, output_ptr, &params); +} + +inline bool Fast3x3FilterKernelSupported( + const RuntimeShape& input_shape, const RuntimeShape& filter_shape, + int32 stride_width, int32 stride_height, int32 dilation_width_factor, + int32 dilation_height_factor, int32 pad_width, int32 pad_height, + int32 depth_multiplier, const RuntimeShape& output_shape, + int32 output_shift) { + const int32 input_height = input_shape.Dims(1); + const int32 input_width = input_shape.Dims(2); + const int32 input_depth = input_shape.Dims(3); + const int32 filter_height = filter_shape.Dims(1); + const int32 filter_width = filter_shape.Dims(2); + const int32 output_height = output_shape.Dims(1); + const int32 output_width = output_shape.Dims(2); + + bool supported = + filter_width == 3 && filter_height == 3 && depth_multiplier == 1 && + (stride_width == 1 || stride_width == 2) && + (stride_height == 1 || stride_height == 2) && + (stride_width == stride_height) && (pad_width == 0 || pad_width == 1) && + (pad_height == 0 || pad_height == 1) && (pad_width == pad_height) && + (input_depth % 8) == 0 && (output_shift <= 0) && + dilation_width_factor == 1 && dilation_height_factor == 1; if (!supported) { return false; } @@ -4436,145 +3204,205 @@ inline bool Fast3x3FilterKernelSupported(const Dims<4>& input_dims, // Handle case where padding is zero but padding type is not kValid. // This would require special boundary case handling that is not supported.
- const int out_x = output_width - 1; - const int out_y = output_height - 1; + const int32 out_x = output_width - 1; + const int32 out_y = output_height - 1; - const int in_x_origin = (out_x * stride_width) - pad_width; - const int in_y_origin = (out_y * stride_height) - pad_height; + const int32 in_x_origin = (out_x * stride_width) - pad_width; + const int32 in_y_origin = (out_y * stride_height) - pad_height; - const int in_x_end = in_x_origin + filter_width; - const int in_y_end = in_y_origin + filter_height; + const int32 in_x_end = in_x_origin + filter_width; + const int32 in_y_end = in_y_origin + filter_height; // Supported only if filter on the right and bottom boundary lies completely - // within the input. - return in_x_end <= input_width && in_y_end <= input_height; + // within the input if padding is zero. + if (pad_width == 0 && pad_height == 0) { + return in_x_end <= input_width && in_y_end <= input_height; + } + + // Else if padding is 1, supported if bottom right filter lies +1 past input + // width and height. + supported = in_x_end <= (input_width + 1) && in_y_end <= (input_height + 1); + + if (!supported) { + return false; + } + + // Shapes with width 1 and height > 1, and vice versa are not supported yet. + if (input_width == 1) { + supported = (input_width == input_height); + } else if (input_height == 1) { + supported = (input_width == input_height); + } + return supported; } inline void DepthwiseConv3x3Filter( - const uint8* input_data, const Dims<4>& input_dims, int32 input_offset, - const uint8* filter_data, const Dims<4>& filter_dims, int32 filter_offset, - const int32* bias_data, const Dims<4>& bias_dims, int stride_width, - int stride_height, int pad_width, int pad_height, int depth_multiplier, - int32 output_offset, int32 output_multiplier, int output_shift, - int32 output_activation_min, int32 output_activation_max, - uint8* output_data, const Dims<4>& output_dims) { - const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); - const int output_depth = MatchingArraySize(filter_dims, 0, output_dims, 0); - const int input_height = ArraySize(input_dims, 2); - const int input_width = ArraySize(input_dims, 1); - const int input_depth = ArraySize(input_dims, 0); - const int filter_height = ArraySize(filter_dims, 2); - const int filter_width = ArraySize(filter_dims, 1); - const int output_height = ArraySize(output_dims, 2); - const int output_width = ArraySize(output_dims, 1); - - // Algorithm assumes below constraints. It is optimized for depth multiplier - // of 1, 3x3 filter, no padding and strides 1 and 2. 
- TFLITE_DCHECK(output_depth == input_depth * depth_multiplier); + const DepthwiseParams& rt_params, const RuntimeShape& input_shape, + const uint8* input_data, const RuntimeShape& filter_shape, + const uint8* filter_data, const RuntimeShape& bias_shape, + const int32* bias_data, const RuntimeShape& output_shape, + uint8* output_data) { + gemmlowp::ScopedProfilingLabel label(__PRETTY_FUNCTION__); + DepthwiseConvParams params; + + const int32 stride_width = rt_params.stride_width; + const int32 stride_height = rt_params.stride_height; + const int32 pad_width = rt_params.padding_values.width; + const int32 pad_height = rt_params.padding_values.height; + const int32 depth_multiplier = rt_params.depth_multiplier; + const int32 output_activation_min = rt_params.quantized_activation_min; + const int32 output_activation_max = rt_params.quantized_activation_max; + const int32 input_offset = rt_params.input_offset; + const int32 filter_offset = rt_params.weights_offset; + const int32 output_offset = rt_params.output_offset; + const int32 output_multiplier = rt_params.output_multiplier; + const int32 output_shift = rt_params.output_shift; + + params.input_depth = input_shape.Dims(3); + params.input_width = input_shape.Dims(2); + params.input_height = input_shape.Dims(1); + params.input_row_size = params.input_depth * params.input_width; + params.input_offset = input_offset; + params.stride_width = stride_width; + params.stride_height = stride_height; + params.output_depth = MatchingDim(filter_shape, 3, output_shape, 3); + params.output_width = output_shape.Dims(2); + params.output_height = output_shape.Dims(1); + params.output_row_size = params.output_depth * params.output_width; + params.output_offset = output_offset; + params.filter_offset = filter_offset; + params.output_multiplier = output_multiplier; + params.output_right_shift = -output_shift; + params.output_activation_min = output_activation_min; + params.output_activation_max = output_activation_max; + + const int32 filter_height = filter_shape.Dims(1); + const int32 filter_width = filter_shape.Dims(2); + params.filter_row_size = params.output_depth * filter_width; + + // Algorithm assumes below constraints. It is optimized for depth + // multiplier of 1, 3x3 filter, no padding and strides 1 and 2. 
+ TFLITE_DCHECK(params.output_depth == params.input_depth * depth_multiplier); TFLITE_DCHECK(depth_multiplier == 1); TFLITE_DCHECK(filter_height == 3); TFLITE_DCHECK(filter_width == 3); - TFLITE_DCHECK(pad_height == 0); - TFLITE_DCHECK(pad_width == 0); TFLITE_DCHECK(stride_height == 1 || stride_height == 2); TFLITE_DCHECK(stride_width == 1 || stride_width == 2); TFLITE_DCHECK(stride_width == stride_height); + TFLITE_DCHECK(pad_height == 0 || pad_height == 1); + TFLITE_DCHECK(pad_width == 0 || pad_width == 1); + TFLITE_DCHECK(pad_width == pad_height); + + const int32 batches = MatchingDim(input_shape, 0, output_shape, 0); + const int64_t input_batch_size = params.input_row_size * params.input_height; + const int64_t output_batch_size = + params.output_row_size * params.output_height; + + ShuffleParams one_row_shuffle_params, two_row_shuffle_params, + four_row_shuffle_params, eight_row_shuffle_params; + if (stride_width == 1) { + one_row_shuffle_params = ShuffleParams(30, 1, 1, 1); + two_row_shuffle_params = ShuffleParams(22, 2, 1, 1); + four_row_shuffle_params = ShuffleParams(14, 4, 1, 1); + eight_row_shuffle_params = ShuffleParams(8, 8, 1, 1); + } else { + one_row_shuffle_params = ShuffleParams(14, 1, 2, 2); + two_row_shuffle_params = ShuffleParams(8, 2, 2, 2); + four_row_shuffle_params = ShuffleParams(4, 4, 2, 2); + eight_row_shuffle_params = ShuffleParams(2, 8, 2, 2); + } - const int input_row_size = input_depth * (input_width + 2 * pad_width); - const int output_row_size = output_depth * output_width; - const int input_batch_size = input_row_size * (input_height + 2 * pad_height); - const int output_batch_size = output_depth * output_width * output_height; - - using conv_row_func_t = decltype(&ConvRow3x3FilterDepth8<1, 1, 1>::Run); - conv_row_func_t conv_1_output_row = ConvRow3x3FilterDepth8<1, 1, 1>::Run; - conv_row_func_t conv_2_output_rows = ConvRow3x3FilterDepth8<2, 1, 1>::Run; - conv_row_func_t conv_4_output_rows = ConvRow3x3FilterDepth8<4, 1, 1>::Run; - conv_row_func_t conv_8_output_rows = ConvRow3x3FilterDepth8<8, 1, 1>::Run; - + using conv_multirow_func_t = decltype(&DepthwiseConvMultiRow<1, 1>::Run); + conv_multirow_func_t conv_multirow_func = DepthwiseConvMultiRow<1, 1>::Run; if (stride_width == 2) { - conv_1_output_row = ConvRow3x3FilterDepth8<1, 2, 2>::Run; - conv_2_output_rows = ConvRow3x3FilterDepth8<2, 2, 2>::Run; - conv_4_output_rows = ConvRow3x3FilterDepth8<4, 2, 2>::Run; - conv_8_output_rows = ConvRow3x3FilterDepth8<8, 2, 2>::Run; + conv_multirow_func = DepthwiseConvMultiRow<2, 2>::Run; } // Allocate maximum memory needed for shuffled input. // TODO(mariewhite): The size of this workspace is small enough to be // allocated on the stack. Eventually we will want to move it to the heap - // and have it allocated outside of this function, like the im2col_array used - // in gemmlowp. -#define DEPTHWISECONV_SHUFFLE_WORKSPACE_SIZE 10 * 10 * 64 + // and have it allocated outside of this function, like the im2col_array + // used in gemmlowp. uint8 shuffle_workspace[DEPTHWISECONV_SHUFFLE_WORKSPACE_SIZE]; - // Make sure the kernels using this buffer will not run out of bounds. 
- static_assert(ConvRow3x3FilterDepth8<8, 1, 1>::ShuffleWorkspaceSize() <= - DEPTHWISECONV_SHUFFLE_WORKSPACE_SIZE, - "Shuffle workspace size is too small."); - static_assert(ConvRow3x3FilterDepth8<4, 2, 2>::ShuffleWorkspaceSize() <= - DEPTHWISECONV_SHUFFLE_WORKSPACE_SIZE, - "Shuffle workspace size is too small."); - -#undef DEPTHWISECONV_SHUFFLE_WORKSPACE_SIZE - - for (int b = 0; b < batches; ++b) { + for (int32 b = 0; b < batches; ++b) { const uint8* input_ptr = input_data + b * input_batch_size; uint8* output_ptr = output_data + b * output_batch_size; - int out_y = 0; + int32 out_x = 0; + int32 out_y = 0; + int32 end_x = params.output_width; + int32 end_y = params.output_height; + + if (pad_width == 1 && pad_height == 1) { + DepthwiseConvHandlePadding(input_ptr, filter_data, bias_data, output_ptr, + params); + + // Update extents now that the edges have been handled. + out_x = 1; + end_x = params.output_width - 1; + out_y = 1; + end_y = params.output_height - 1; + const int in_x = (out_x * stride_width) - pad_width; + const int in_y = (out_y * stride_height) - pad_height; + input_ptr += in_y * params.input_row_size + in_x * params.input_depth; + output_ptr += out_y * params.output_row_size + + out_x * params.output_depth; + } + + // Shuffling shapes that maximize width over the shuffle workspace size + // perform better since the inputs are closer together, minimizing + // shuffling time. + // + // If the input shape has width large enough for the 2 row kernels, + // we prefer to use this. The innermost loop of the kernels handle + // 2 height x 2 width so this is the fastest path. + // + // If the input shape has smaller width but larger height, shuffling is + // still useful and can benefit from kernels 4 row and 8 row kernels. // Handle 8 rows at a time. - for (; out_y <= output_height - 8; out_y += 8) { - conv_8_output_rows(input_ptr, 0, out_y, input_depth, input_width, - input_height, input_row_size, input_offset, - filter_data, filter_offset, bias_data, output_offset, - output_multiplier, output_shift, output_activation_min, - output_activation_max, output_ptr, output_depth, - output_width, shuffle_workspace); - - input_ptr += 8 * stride_height * input_row_size; - output_ptr += 8 * output_row_size; + if (params.input_width < four_row_shuffle_params.input_width) { + for (; out_y <= end_y - 8; out_y += 8) { + conv_multirow_func(input_ptr, out_x, end_x, filter_data, bias_data, + output_ptr, params, eight_row_shuffle_params, + shuffle_workspace); + input_ptr += 8 * stride_height * params.input_row_size; + output_ptr += 8 * params.output_row_size; + } } // Handle 4 rows at a time. - for (; out_y <= output_height - 4; out_y += 4) { - conv_4_output_rows(input_ptr, 0, out_y, input_depth, input_width, - input_height, input_row_size, input_offset, - filter_data, filter_offset, bias_data, output_offset, - output_multiplier, output_shift, output_activation_min, - output_activation_max, output_ptr, output_depth, - output_width, shuffle_workspace); - - input_ptr += 4 * stride_height * input_row_size; - output_ptr += 4 * output_row_size; + if (params.input_width < two_row_shuffle_params.input_width) { + for (; out_y <= end_y - 4; out_y += 4) { + conv_multirow_func(input_ptr, out_x, end_x, filter_data, bias_data, + output_ptr, params, four_row_shuffle_params, + shuffle_workspace); + input_ptr += 4 * stride_height * params.input_row_size; + output_ptr += 4 * params.output_row_size; + } } // Handle 2 rows at a time. 
- for (; out_y <= output_height - 2; out_y += 2) { - conv_2_output_rows(input_ptr, 0, out_y, input_depth, input_width, - input_height, input_row_size, input_offset, - filter_data, filter_offset, bias_data, output_offset, - output_multiplier, output_shift, output_activation_min, - output_activation_max, output_ptr, output_depth, - output_width, shuffle_workspace); - - input_ptr += 2 * stride_height * input_row_size; - output_ptr += 2 * output_row_size; + for (; out_y <= end_y - 2; out_y += 2) { + conv_multirow_func(input_ptr, out_x, end_x, filter_data, bias_data, + output_ptr, params, two_row_shuffle_params, + shuffle_workspace); + input_ptr += 2 * stride_height * params.input_row_size; + output_ptr += 2 * params.output_row_size; } // Handle one row at a time. - for (; out_y < output_height; out_y++) { - conv_1_output_row(input_ptr, 0, out_y, input_depth, input_width, - input_height, input_row_size, input_offset, filter_data, - filter_offset, bias_data, output_offset, - output_multiplier, output_shift, output_activation_min, - output_activation_max, output_ptr, output_depth, - output_width, shuffle_workspace); - - input_ptr += stride_height * input_row_size; - output_ptr += output_row_size; + for (; out_y < end_y; out_y++) { + conv_multirow_func(input_ptr, out_x, end_x, filter_data, bias_data, + output_ptr, params, one_row_shuffle_params, + shuffle_workspace); + input_ptr += stride_height * params.input_row_size; + output_ptr += params.output_row_size; } } } +// clang-format on #endif // __aarch64__ diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/eigen_tensor_reduced_instantiations_google.h b/tensorflow/contrib/lite/kernels/internal/optimized/eigen_tensor_reduced_instantiations_google.h index d85e06a5d5..6443f425b7 100644 --- a/tensorflow/contrib/lite/kernels/internal/optimized/eigen_tensor_reduced_instantiations_google.h +++ b/tensorflow/contrib/lite/kernels/internal/optimized/eigen_tensor_reduced_instantiations_google.h @@ -33,7 +33,7 @@ limitations under the License. #include <functional> #ifdef _WIN32 -#include <winbase.h> +#include <windows.h> #elif defined(__APPLE__) #include <mach/mach_time.h> #else @@ -140,4 +140,4 @@ limitations under the License. #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h" #include "Eigen/src/Core/util/ReenableStupidWarnings.h" -#endif // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_EIGEN_TENSOR_REDUCED_INSTANTIATIONS_H +#endif // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_EIGEN_TENSOR_REDUCED_INSTANTIATIONS_GOOGLE_H_ diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/legacy_optimized_ops.h b/tensorflow/contrib/lite/kernels/internal/optimized/legacy_optimized_ops.h new file mode 100644 index 0000000000..4218be20a4 --- /dev/null +++ b/tensorflow/contrib/lite/kernels/internal/optimized/legacy_optimized_ops.h @@ -0,0 +1,1872 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_LEGACY_OPTIMIZED_OPS_H_ +#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_LEGACY_OPTIMIZED_OPS_H_ + +#include <stdint.h> +#include <sys/types.h> + +#include "tensorflow/contrib/lite/kernels/internal/common.h" +#include "tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_float.h" +#include "tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8.h" +#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h" +#include "tensorflow/contrib/lite/kernels/internal/reference/legacy_reference_ops.h" +#include "tensorflow/contrib/lite/kernels/internal/types.h" + +namespace tflite { +namespace optimized_ops { + +// Unoptimized reference ops: +using reference_ops::ArgMax; +using reference_ops::ArgMinMax; +using reference_ops::Broadcast4DSlowGreater; +using reference_ops::Broadcast4DSlowGreaterEqual; +using reference_ops::Broadcast4DSlowGreaterEqualWithScaling; +using reference_ops::Broadcast4DSlowGreaterWithScaling; +using reference_ops::Broadcast4DSlowLess; +using reference_ops::Broadcast4DSlowLessEqual; +using reference_ops::Broadcast4DSlowLessEqualWithScaling; +using reference_ops::Broadcast4DSlowLessWithScaling; +using reference_ops::BroadcastAdd4DSlow; +using reference_ops::BroadcastGreater; +using reference_ops::BroadcastGreaterEqual; +using reference_ops::BroadcastLess; +using reference_ops::BroadcastLessEqual; +using reference_ops::BroadcastMul4DSlow; +using reference_ops::BroadcastSub4DSlow; +using reference_ops::Concatenation; +using reference_ops::ConcatenationWithScaling; +using reference_ops::DepthConcatenation; +using reference_ops::Dequantize; +using reference_ops::Div; +using reference_ops::FakeQuant; +using reference_ops::Gather; +using reference_ops::Greater; +using reference_ops::GreaterEqual; +using reference_ops::GreaterEqualWithScaling; +using reference_ops::GreaterWithScaling; +using reference_ops::Less; +using reference_ops::LessEqual; +using reference_ops::LessEqualWithScaling; +using reference_ops::LessWithScaling; +using reference_ops::Mean; +using reference_ops::RankOneSelect; +using reference_ops::Relu1; +using reference_ops::Relu6; +using reference_ops::ReluX; +using reference_ops::Select; +using reference_ops::SpaceToBatchND; +using reference_ops::Split; +using reference_ops::StridedSlice; +using reference_ops::TensorFlowSplit; +using reference_ops::Transpose; + +static constexpr int kDepthwiseReverseShift = -1; + +template <typename Scalar, int N> +VectorMap<Scalar> MapAsVector(Scalar* data, const Dims<N>& dims) { + const int size = FlatSize(dims); + return VectorMap<Scalar>(data, size, 1); +} + +template <typename Scalar, int N> +MatrixMap<Scalar> MapAsMatrixWithFirstDimAsRows(Scalar* data, + const Dims<N>& dims) { + const int rows = dims.sizes[0]; + int cols = 1; + for (int d = 1; d < N; d++) { + cols *= dims.sizes[d]; + } + return MatrixMap<Scalar>(data, rows, cols); +} + +template <typename Scalar, int N> +MatrixMap<Scalar> MapAsMatrixWithLastDimAsCols(Scalar* data, + const Dims<N>& dims) { + const int cols = dims.sizes[N - 1]; + int rows = 1; + for (int d = 0; d < N - 1; d++) { + rows *= dims.sizes[d]; + } + return MatrixMap<Scalar>(data, rows, cols); +} + +template <typename Scalar, int N> +ArrayMap<Scalar> MapAsArrayWithFirstDimAsRows(Scalar* data, + const Dims<N>& dims) { + const int rows = dims.sizes[0]; + int cols = 1; + for (int d = 1; d < N; d++) { + cols *= 
dims.sizes[d]; + } + return ArrayMap<Scalar>(data, rows, cols); +} + +// TODO(b/62193649): this function is only needed as long +// as we have the --variable_batch hack. +template <typename Scalar, int N> +MatrixMap<Scalar> MapAsMatrixWithGivenNumberOfRows(Scalar* data, + const Dims<N>& dims, + int rows) { + const int flatsize = FlatSize(dims); + TFLITE_DCHECK((flatsize % rows) == 0); + const int cols = flatsize / rows; + return MatrixMap<Scalar>(data, rows, cols); +} + +inline bool AreSameDims(const Dims<4>& dims1, const Dims<4>& dims2) { + for (int i = 0; i < 4; i++) { + if (dims1.sizes[i] != dims2.sizes[i]) { + return false; + } + } + return true; +} + +inline void DepthwiseConv(const float* input_data, const Dims<4>& input_dims, + const float* filter_data, const Dims<4>& filter_dims, + const float* bias_data, const Dims<4>& bias_dims, + int stride_width, int stride_height, + int dilation_width_factor, int dilation_height_factor, + int pad_width, int pad_height, int depth_multiplier, + float output_activation_min, + float output_activation_max, float* output_data, + const Dims<4>& output_dims) { + tflite::DepthwiseParams op_params; + // Padding type is ignored, but still set. + op_params.padding_type = PaddingType::kSame; + op_params.padding_values.width = pad_width; + op_params.padding_values.height = pad_height; + op_params.stride_width = stride_width; + op_params.stride_height = stride_height; + op_params.dilation_width_factor = dilation_width_factor; + op_params.dilation_height_factor = dilation_height_factor; + op_params.depth_multiplier = depth_multiplier; + op_params.float_activation_min = output_activation_min; + op_params.float_activation_max = output_activation_max; + + DepthwiseConv(op_params, DimsToShape(input_dims), input_data, + DimsToShape(filter_dims), filter_data, DimsToShape(bias_dims), + bias_data, DimsToShape(output_dims), output_data); +} + +inline void DepthwiseConv(const float* input_data, const Dims<4>& input_dims, + const float* filter_data, const Dims<4>& filter_dims, + const float* bias_data, const Dims<4>& bias_dims, + int stride_width, int stride_height, int pad_width, + int pad_height, int depth_multiplier, + float output_activation_min, + float output_activation_max, float* output_data, + const Dims<4>& output_dims) { + DepthwiseConv(input_data, input_dims, filter_data, filter_dims, bias_data, + bias_dims, stride_width, stride_height, 1, 1, pad_width, + pad_height, depth_multiplier, output_activation_min, + output_activation_max, output_data, output_dims); +} + +// legacy, for compatibility with old checked-in code +template <FusedActivationFunctionType Ac> +void DepthwiseConv(const float* input_data, const Dims<4>& input_dims, + const float* filter_data, const Dims<4>& filter_dims, + const float* bias_data, const Dims<4>& bias_dims, + int stride_width, int stride_height, int pad_width, + int pad_height, int depth_multiplier, float* output_data, + const Dims<4>& output_dims) { + float output_activation_min, output_activation_max; + GetActivationMinMax(Ac, &output_activation_min, &output_activation_max); + DepthwiseConv(input_data, input_dims, filter_data, filter_dims, bias_data, + bias_dims, stride_width, stride_height, pad_width, pad_height, + depth_multiplier, output_activation_min, output_activation_max, + output_data, output_dims); +} + +// legacy, for compatibility with old checked-in code +template <FusedActivationFunctionType Ac> +void DepthwiseConv(const float* input_data, const Dims<4>& input_dims, + const float* filter_data, const Dims<4>& 
filter_dims, + const float* bias_data, const Dims<4>& bias_dims, int stride, + int pad_width, int pad_height, int depth_multiplier, + float* output_data, const Dims<4>& output_dims) { + DepthwiseConv<Ac>(input_data, input_dims, filter_data, filter_dims, bias_data, + bias_dims, stride, stride, pad_width, pad_height, + depth_multiplier, output_data, output_dims); +} + +inline void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims, + int32 input_offset, const uint8* filter_data, + const Dims<4>& filter_dims, int32 filter_offset, + const int32* bias_data, const Dims<4>& bias_dims, + int stride_width, int stride_height, + int dilation_width_factor, int dilation_height_factor, + int pad_width, int pad_height, int depth_multiplier, + int32 output_offset, int32 output_multiplier, + int output_shift, int32 output_activation_min, + int32 output_activation_max, uint8* output_data, + const Dims<4>& output_dims) { + tflite::DepthwiseParams op_params; + // Padding type is ignored, but still set. + op_params.padding_type = PaddingType::kSame; + op_params.padding_values.width = pad_width; + op_params.padding_values.height = pad_height; + op_params.stride_width = stride_width; + op_params.stride_height = stride_height; + op_params.dilation_width_factor = dilation_width_factor; + op_params.dilation_height_factor = dilation_height_factor; + op_params.depth_multiplier = depth_multiplier; + op_params.quantized_activation_min = output_activation_min; + op_params.quantized_activation_max = output_activation_max; + op_params.input_offset = input_offset; + op_params.weights_offset = filter_offset; + op_params.output_offset = output_offset; + op_params.output_multiplier = output_multiplier; + // Legacy ops used mixed left and right shifts. Now all are +ve-means-left. + op_params.output_shift = kDepthwiseReverseShift * output_shift; + + DepthwiseConv(op_params, DimsToShape(input_dims), input_data, + DimsToShape(filter_dims), filter_data, DimsToShape(bias_dims), + bias_data, DimsToShape(output_dims), output_data); +} + +inline void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims, + int32 input_offset, const uint8* filter_data, + const Dims<4>& filter_dims, int32 filter_offset, + const int32* bias_data, const Dims<4>& bias_dims, + int stride_width, int stride_height, int pad_width, + int pad_height, int depth_multiplier, + int32 output_offset, int32 output_multiplier, + int output_shift, int32 output_activation_min, + int32 output_activation_max, uint8* output_data, + const Dims<4>& output_dims) { + DepthwiseConv(input_data, input_dims, input_offset, filter_data, filter_dims, + filter_offset, bias_data, bias_dims, stride_width, + stride_height, 1, 1, pad_width, pad_height, depth_multiplier, + output_offset, output_multiplier, output_shift, + output_activation_min, output_activation_max, output_data, + output_dims); +} + +// Legacy, for compatibility with old checked-in code. 
+template <FusedActivationFunctionType Ac> +void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims, + int32 input_offset, const uint8* filter_data, + const Dims<4>& filter_dims, int32 filter_offset, + const int32* bias_data, const Dims<4>& bias_dims, + int stride_width, int stride_height, int pad_width, + int pad_height, int depth_multiplier, int32 output_offset, + int32 output_multiplier, int output_shift, + int32 output_activation_min, int32 output_activation_max, + uint8* output_data, const Dims<4>& output_dims) { + if (Ac == FusedActivationFunctionType::kNone) { + TFLITE_DCHECK_EQ(output_activation_min, 0); + TFLITE_DCHECK_EQ(output_activation_max, 255); + } + DepthwiseConv(input_data, input_dims, input_offset, filter_data, filter_dims, + filter_offset, bias_data, bias_dims, stride_width, + stride_height, pad_width, pad_height, depth_multiplier, + output_offset, output_multiplier, output_shift, + output_activation_min, output_activation_max, output_data, + output_dims); +} + +// Legacy, for compatibility with old checked-in code. +template <FusedActivationFunctionType Ac> +void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims, + int32 input_offset, const uint8* filter_data, + const Dims<4>& filter_dims, int32 filter_offset, + const int32* bias_data, const Dims<4>& bias_dims, int stride, + int pad_width, int pad_height, int depth_multiplier, + int32 output_offset, int32 output_multiplier, + int output_shift, int32 output_activation_min, + int32 output_activation_max, uint8* output_data, + const Dims<4>& output_dims) { + DepthwiseConv<Ac>(input_data, input_dims, input_offset, filter_data, + filter_dims, filter_offset, bias_data, bias_dims, stride, + stride, pad_width, pad_height, depth_multiplier, + output_offset, output_multiplier, output_shift, + output_activation_min, output_activation_max, output_data, + output_dims); +} + +inline void AddBiasAndEvalActivationFunction(const float* bias_data, + const Dims<4>& bias_dims, + float* array_data, + const Dims<4>& array_dims, + float output_activation_min, + float output_activation_max) { + AddBiasAndEvalActivationFunction(output_activation_min, output_activation_max, + DimsToShape(bias_dims), bias_data, + DimsToShape(array_dims), array_data); +} + +// legacy, for compatibility with old checked-in code +template <FusedActivationFunctionType Ac> +void AddBiasAndEvalActivationFunction(const float* bias_data, + const Dims<4>& bias_dims, + float* array_data, + const Dims<4>& array_dims) { + float output_activation_min, output_activation_max; + GetActivationMinMax(Ac, &output_activation_min, &output_activation_max); + AddBiasAndEvalActivationFunction(bias_data, bias_dims, array_data, array_dims, + output_activation_min, + output_activation_max); +} + +inline void FullyConnected(const float* input_data, const Dims<4>& input_dims, + const float* weights_data, + const Dims<4>& weights_dims, const float* bias_data, + const Dims<4>& bias_dims, + float output_activation_min, + float output_activation_max, float* output_data, + const Dims<4>& output_dims) { + tflite::FullyConnectedParams op_params; + op_params.float_activation_min = output_activation_min; + op_params.float_activation_max = output_activation_max; + + FullyConnected(op_params, DimsToShape(input_dims), input_data, + DimsToShape(weights_dims), weights_data, + DimsToShape(bias_dims), bias_data, DimsToShape(output_dims), + output_data); +} + +// legacy, for compatibility with old checked-in code +template <FusedActivationFunctionType Ac> +void 
FullyConnected(const float* input_data, const Dims<4>& input_dims, + const float* weights_data, const Dims<4>& weights_dims, + const float* bias_data, const Dims<4>& bias_dims, + float* output_data, const Dims<4>& output_dims) { + float output_activation_min, output_activation_max; + GetActivationMinMax(Ac, &output_activation_min, &output_activation_max); + FullyConnected(input_data, input_dims, weights_data, weights_dims, bias_data, + bias_dims, output_activation_min, output_activation_max, + output_data, output_dims); +} + +inline void FullyConnected(const uint8* input_data, const Dims<4>& input_dims, + int32 input_offset, const uint8* filter_data, + const Dims<4>& filter_dims, int32 filter_offset, + const int32* bias_data, const Dims<4>& bias_dims, + int32 output_offset, int32 output_multiplier, + int output_shift, int32 output_activation_min, + int32 output_activation_max, uint8* output_data, + const Dims<4>& output_dims, + gemmlowp::GemmContext* gemm_context) { + tflite::FullyConnectedParams op_params; + op_params.input_offset = input_offset; + op_params.weights_offset = filter_offset; + op_params.output_offset = output_offset; + op_params.output_multiplier = output_multiplier; + // Legacy ops used mixed left and right shifts. Now all are +ve-means-left. + op_params.output_shift = kReverseShift * output_shift; + op_params.quantized_activation_min = output_activation_min; + op_params.quantized_activation_max = output_activation_max; + + FullyConnected(op_params, DimsToShape(input_dims), input_data, + DimsToShape(filter_dims), filter_data, DimsToShape(bias_dims), + bias_data, DimsToShape(output_dims), output_data, + gemm_context); +} + +inline void FullyConnected( + const uint8* input_data, const Dims<4>& input_dims, int32 input_offset, + const uint8* filter_data, const Dims<4>& filter_dims, int32 filter_offset, + const int32* bias_data_int32, const Dims<4>& bias_dims, int32 output_offset, + int32 output_multiplier, int output_shift, int32 output_activation_min, + int32 output_activation_max, int16* output_data, const Dims<4>& output_dims, + gemmlowp::GemmContext* gemm_context) { + tflite::FullyConnectedParams op_params; + op_params.input_offset = input_offset; + op_params.weights_offset = filter_offset; + op_params.output_offset = output_offset; + op_params.output_multiplier = output_multiplier; + // Legacy ops used mixed left and right shifts. Now all are +ve-means-left. 
+ op_params.output_shift = kReverseShift * output_shift; + op_params.quantized_activation_min = output_activation_min; + op_params.quantized_activation_max = output_activation_max; + + FullyConnected(op_params, DimsToShape(input_dims), input_data, + DimsToShape(filter_dims), filter_data, DimsToShape(bias_dims), + bias_data_int32, DimsToShape(output_dims), output_data, + gemm_context); +} + +// legacy, for compatibility with old checked-in code +template <FusedActivationFunctionType Ac> +void FullyConnected(const uint8* input_data, const Dims<4>& input_dims, + int32 input_offset, const uint8* filter_data, + const Dims<4>& filter_dims, int32 filter_offset, + const int32* bias_data, const Dims<4>& bias_dims, + int32 output_offset, int32 output_multiplier, + int output_shift, int32 output_activation_min, + int32 output_activation_max, uint8* output_data, + const Dims<4>& output_dims, + gemmlowp::GemmContext* gemm_context) { + static_assert(Ac == FusedActivationFunctionType::kNone || + Ac == FusedActivationFunctionType::kRelu || + Ac == FusedActivationFunctionType::kRelu6 || + Ac == FusedActivationFunctionType::kRelu1, + ""); + FullyConnected(input_data, input_dims, input_offset, filter_data, filter_dims, + filter_offset, bias_data, bias_dims, output_offset, + output_multiplier, output_shift, output_activation_min, + output_activation_max, output_data, output_dims, gemm_context); +} + +inline void ShuffledFullyConnected( + const uint8* input_data, const Dims<4>& input_dims, + const uint8* shuffled_weights_data, const Dims<4>& weights_dims, + const int32* bias_data, const Dims<4>& bias_dims, int32 output_multiplier, + int output_shift, int32 output_activation_min, int32 output_activation_max, + int16* output_data, const Dims<4>& output_dims, + uint8* shuffled_input_workspace_data, gemmlowp::GemmContext* gemm_context) { + tflite::FullyConnectedParams op_params; + op_params.output_multiplier = output_multiplier; + // Legacy ops used mixed left and right shifts. Now all are +ve-means-left. + op_params.output_shift = kReverseShift * output_shift; + op_params.quantized_activation_min = output_activation_min; + op_params.quantized_activation_max = output_activation_max; + + ShuffledFullyConnected(op_params, DimsToShape(input_dims), input_data, + DimsToShape(weights_dims), shuffled_weights_data, + DimsToShape(bias_dims), bias_data, + DimsToShape(output_dims), output_data, + shuffled_input_workspace_data, gemm_context); +} + +template <typename T> +inline void ExtractPatchIntoBufferColumn( + const Dims<4>& input_dims, int w, int h, int b, int kheight, int kwidth, + int stride_width, int stride_height, int pad_width, int pad_height, + int in_width, int in_height, int in_depth, int single_buffer_length, + int buffer_id, const T* in_data, T* conv_buffer_data, uint8 zero_byte) { + ExtractPatchIntoBufferColumn( + DimsToShape(input_dims), w, h, b, kheight, kwidth, stride_width, + stride_height, pad_width, pad_height, in_width, in_height, in_depth, + single_buffer_length, buffer_id, in_data, conv_buffer_data, zero_byte); +} + +template <typename T> +void DilatedIm2col(const T* input_data, const Dims<4>& input_dims, + const Dims<4>& filter_dims, int stride_width, + int stride_height, int dilation_width_factor, + int dilation_height_factor, int pad_width, int pad_height, + const Dims<4>& output_dims, uint8 zero_byte, + T* im2col_data) { + tflite::ConvParams op_params; + // Padding type is ignored, but still set. 
+ op_params.padding_type = PaddingType::kSame; + op_params.padding_values.width = pad_width; + op_params.padding_values.height = pad_height; + op_params.stride_width = stride_width; + op_params.stride_height = stride_height; + op_params.dilation_width_factor = dilation_width_factor; + op_params.dilation_height_factor = dilation_height_factor; + + DilatedIm2col(op_params, zero_byte, DimsToShape(input_dims), input_data, + DimsToShape(filter_dims), DimsToShape(output_dims), + im2col_data); +} + +template <typename T> +void Im2col(const T* input_data, const Dims<4>& input_dims, int stride_width, + int stride_height, int pad_width, int pad_height, int kheight, + int kwidth, uint8 zero_byte, T* output_data, + const Dims<4>& output_dims) { + tflite::ConvParams op_params; + // Padding type is ignored, but still set. + op_params.padding_type = PaddingType::kSame; + op_params.padding_values.width = pad_width; + op_params.padding_values.height = pad_height; + op_params.stride_width = stride_width; + op_params.stride_height = stride_height; + op_params.dilation_width_factor = 1; + op_params.dilation_height_factor = 1; + + Im2col(op_params, kheight, kwidth, zero_byte, DimsToShape(input_dims), + input_data, DimsToShape(output_dims), output_data); +} + +// legacy, for compatibility with old checked-in code +template <typename T> +void Im2col(const T* input_data, const Dims<4>& input_dims, int stride, + int pad_width, int pad_height, int kheight, int kwidth, + uint8 zero_byte, T* output_data, const Dims<4>& output_dims) { + Im2col(input_data, input_dims, stride, stride, pad_width, pad_height, kheight, + kwidth, zero_byte, output_data, output_dims); +} + +inline void Conv(const float* input_data, const Dims<4>& input_dims, + const float* filter_data, const Dims<4>& filter_dims, + const float* bias_data, const Dims<4>& bias_dims, + int stride_width, int stride_height, int dilation_width_factor, + int dilation_height_factor, int pad_width, int pad_height, + float output_activation_min, float output_activation_max, + float* output_data, const Dims<4>& output_dims, + float* im2col_data, const Dims<4>& im2col_dims) { + tflite::ConvParams op_params; + // Padding type is ignored, but still set. + op_params.padding_type = PaddingType::kSame; + op_params.padding_values.width = pad_width; + op_params.padding_values.height = pad_height; + op_params.stride_width = stride_width; + op_params.stride_height = stride_height; + op_params.dilation_width_factor = dilation_width_factor; + op_params.dilation_height_factor = dilation_height_factor; + op_params.float_activation_min = output_activation_min; + op_params.float_activation_max = output_activation_max; + + Conv(op_params, DimsToShape(input_dims), input_data, DimsToShape(filter_dims), + filter_data, DimsToShape(bias_dims), bias_data, DimsToShape(output_dims), + output_data, DimsToShape(im2col_dims), im2col_data); +} + +inline void HybridConv(const int8_t* input_data, const Dims<4>& input_dims, + const int8_t* filter_data, const Dims<4>& filter_dims, + const float* bias_data, const Dims<4>& bias_dims, + int stride_width, int stride_height, int pad_width, + int pad_height, float* scaling_factors_ptr, + float output_activation_min, float output_activation_max, + float* output_data, const Dims<4>& output_dims, + int8_t* im2col_data, const Dims<4>& im2col_dims) { + tflite::ConvParams op_params; + // Padding type is ignored, but still set. 
+ op_params.padding_type = PaddingType::kSame; + op_params.padding_values.width = pad_width; + op_params.padding_values.height = pad_height; + op_params.stride_width = stride_width; + op_params.stride_height = stride_height; + op_params.float_activation_min = output_activation_min; + op_params.float_activation_max = output_activation_max; + + HybridConv(op_params, scaling_factors_ptr, DimsToShape(input_dims), + input_data, DimsToShape(filter_dims), filter_data, + DimsToShape(bias_dims), bias_data, DimsToShape(output_dims), + output_data, DimsToShape(im2col_dims), im2col_data); +} + +template <FusedActivationFunctionType Ac> +void Conv(const float* input_data, const Dims<4>& input_dims, + const float* filter_data, const Dims<4>& filter_dims, + const float* bias_data, const Dims<4>& bias_dims, int stride_width, + int stride_height, int dilation_width_factor, + int dilation_height_factor, int pad_width, int pad_height, + float* output_data, const Dims<4>& output_dims, float* im2col_data, + const Dims<4>& im2col_dims) { + float output_activation_min, output_activation_max; + GetActivationMinMax(Ac, &output_activation_min, &output_activation_max); + Conv(input_data, input_dims, filter_data, filter_dims, bias_data, bias_dims, + stride_width, stride_height, dilation_width_factor, + dilation_height_factor, pad_width, pad_height, output_activation_min, + output_activation_max, output_data, output_dims, im2col_data, + im2col_dims); +} + +// legacy, for compatibility with old checked-in code +template <FusedActivationFunctionType Ac> +void Conv(const float* input_data, const Dims<4>& input_dims, + const float* filter_data, const Dims<4>& filter_dims, + const float* bias_data, const Dims<4>& bias_dims, int stride_width, + int stride_height, int pad_width, int pad_height, float* output_data, + const Dims<4>& output_dims, float* im2col_data, + const Dims<4>& im2col_dims) { + float output_activation_min, output_activation_max; + GetActivationMinMax(Ac, &output_activation_min, &output_activation_max); + Conv(input_data, input_dims, filter_data, filter_dims, bias_data, bias_dims, + stride_width, stride_height, 1, 1, pad_width, pad_height, + output_activation_min, output_activation_max, output_data, output_dims, + im2col_data, im2col_dims); +} + +// legacy, for compatibility with old checked-in code +template <FusedActivationFunctionType Ac> +void Conv(const float* input_data, const Dims<4>& input_dims, + const float* filter_data, const Dims<4>& filter_dims, + const float* bias_data, const Dims<4>& bias_dims, int stride, + int pad_width, int pad_height, float* output_data, + const Dims<4>& output_dims, float* im2col_data, + const Dims<4>& im2col_dims) { + Conv<Ac>(input_data, input_dims, filter_data, filter_dims, bias_data, + bias_dims, stride, stride, 1, 1, pad_width, pad_height, output_data, + output_dims, im2col_data, im2col_dims); +} + +inline void Conv(const uint8* input_data, const Dims<4>& input_dims, + int32 input_offset, const uint8* filter_data, + const Dims<4>& filter_dims, int32 filter_offset, + const int32* bias_data, const Dims<4>& bias_dims, + int stride_width, int stride_height, int dilation_width_factor, + int dilation_height_factor, int pad_width, int pad_height, + int32 output_offset, int32 output_multiplier, int output_shift, + int32 output_activation_min, int32 output_activation_max, + uint8* output_data, const Dims<4>& output_dims, + uint8* im2col_data, const Dims<4>& im2col_dims, + gemmlowp::GemmContext* gemm_context) { + tflite::ConvParams op_params; + // Padding type is ignored, but 
still set. + op_params.padding_type = PaddingType::kSame; + op_params.padding_values.width = pad_width; + op_params.padding_values.height = pad_height; + op_params.stride_width = stride_width; + op_params.stride_height = stride_height; + op_params.dilation_width_factor = dilation_width_factor; + op_params.dilation_height_factor = dilation_height_factor; + op_params.input_offset = input_offset; + op_params.weights_offset = filter_offset; + op_params.output_offset = output_offset; + op_params.output_multiplier = output_multiplier; + // Legacy ops used mixed left and right shifts. Now all are +ve-means-left. + op_params.output_shift = kReverseShift * output_shift; + op_params.quantized_activation_min = output_activation_min; + op_params.quantized_activation_max = output_activation_max; + + Conv(op_params, DimsToShape(input_dims), input_data, DimsToShape(filter_dims), + filter_data, DimsToShape(bias_dims), bias_data, DimsToShape(output_dims), + output_data, DimsToShape(im2col_dims), im2col_data, gemm_context); +} + +inline void Conv(const uint8* input_data, const Dims<4>& input_dims, + int32 input_offset, const uint8* filter_data, + const Dims<4>& filter_dims, int32 filter_offset, + const int32* bias_data, const Dims<4>& bias_dims, + int stride_width, int stride_height, int pad_width, + int pad_height, int32 output_offset, int32 output_multiplier, + int output_shift, int32 output_activation_min, + int32 output_activation_max, uint8* output_data, + const Dims<4>& output_dims, uint8* im2col_data, + const Dims<4>& im2col_dims, + gemmlowp::GemmContext* gemm_context) { + Conv(input_data, input_dims, input_offset, filter_data, filter_dims, + filter_offset, bias_data, bias_dims, stride_width, stride_height, 1, 1, + pad_width, pad_height, output_offset, output_multiplier, output_shift, + output_activation_min, output_activation_max, output_data, output_dims, + im2col_data, im2col_dims, gemm_context); +} + +// legacy, for compatibility with old checked-in code +template <FusedActivationFunctionType Ac> +inline void Conv(const uint8* input_data, const Dims<4>& input_dims, + int32 input_offset, const uint8* filter_data, + const Dims<4>& filter_dims, int32 filter_offset, + const int32* bias_data, const Dims<4>& bias_dims, + int stride_width, int stride_height, int pad_width, + int pad_height, int32 output_offset, int32 output_multiplier, + int output_shift, int32 output_activation_min, + int32 output_activation_max, uint8* output_data, + const Dims<4>& output_dims, uint8* im2col_data, + const Dims<4>& im2col_dims, + gemmlowp::GemmContext* gemm_context) { + static_assert(Ac == FusedActivationFunctionType::kNone || + Ac == FusedActivationFunctionType::kRelu || + Ac == FusedActivationFunctionType::kRelu6 || + Ac == FusedActivationFunctionType::kRelu1, + ""); + if (Ac == FusedActivationFunctionType::kNone) { + TFLITE_DCHECK_EQ(output_activation_min, 0); + TFLITE_DCHECK_EQ(output_activation_max, 255); + } + Conv(input_data, input_dims, input_offset, filter_data, filter_dims, + filter_offset, bias_data, bias_dims, stride_width, stride_height, + pad_width, pad_height, output_offset, output_multiplier, output_shift, + output_activation_min, output_activation_max, output_data, output_dims, + im2col_data, im2col_dims, gemm_context); +} + +// legacy, for compatibility with old checked-in code +template <FusedActivationFunctionType Ac> +void Conv(const uint8* input_data, const Dims<4>& input_dims, + int32 input_offset, const uint8* filter_data, + const Dims<4>& filter_dims, int32 filter_offset, + const int32* 
bias_data, const Dims<4>& bias_dims, int stride, + int pad_width, int pad_height, int32 output_offset, + int32 output_multiplier, int output_shift, + int32 output_activation_min, int32 output_activation_max, + uint8* output_data, const Dims<4>& output_dims, uint8* im2col_data, + const Dims<4>& im2col_dims, gemmlowp::GemmContext* gemm_context) { + static_assert(Ac == FusedActivationFunctionType::kNone || + Ac == FusedActivationFunctionType::kRelu || + Ac == FusedActivationFunctionType::kRelu6 || + Ac == FusedActivationFunctionType::kRelu1, + ""); + Conv(input_data, input_dims, input_offset, filter_data, filter_dims, + filter_offset, bias_data, bias_dims, stride, stride, pad_width, + pad_height, output_offset, output_multiplier, output_shift, + output_activation_min, output_activation_max, output_data, output_dims, + im2col_data, im2col_dims, gemm_context); +} + +// legacy, for compatibility with old checked-in code +template <FusedActivationFunctionType Ac, typename T> +void Im2col(const T* input_data, const Dims<4>& input_dims, int stride, + int pad_width, int pad_height, int kheight, int kwidth, + uint8 zero_byte, T* output_data, const Dims<4>& output_dims) { + Im2col(input_data, input_dims, stride, stride, pad_width, pad_height, kheight, + kwidth, zero_byte, output_data, output_dims); +} + +// legacy, for compatibility with old checked-in code +template <FusedActivationFunctionType Ac> +void ConvAsGemm(const float* input_data, const Dims<4>& input_dims, + const float* filter_data, const Dims<4>& filter_dims, + const float* bias_data, const Dims<4>& bias_dims, + float* output_data, const Dims<4>& output_dims) { + gemmlowp::ScopedProfilingLabel label("ConvAsGemm"); + + const auto input_matrix_map = + MapAsMatrixWithFirstDimAsRows(input_data, input_dims); + const auto filter_matrix_map = + MapAsMatrixWithLastDimAsCols(filter_data, filter_dims); + auto output_matrix_map = + MapAsMatrixWithFirstDimAsRows(output_data, output_dims); + + Gemm(filter_matrix_map.transpose(), input_matrix_map, &output_matrix_map); + + AddBiasAndEvalActivationFunction<Ac>(bias_data, bias_dims, output_data, + output_dims); +} + +// legacy, for compatibility with old checked-in code +template <FusedActivationFunctionType Ac> +void ConvAsGemm(const uint8* input_data, const Dims<4>& input_dims, + int32 input_offset, const uint8* filter_data, + const Dims<4>& filter_dims, int32 filter_offset, + const int32* bias_data, const Dims<4>& bias_dims, + int32 output_offset, int32 output_multiplier, int output_shift, + int32 output_activation_min, int32 output_activation_max, + uint8* output_data, const Dims<4>& output_dims, + gemmlowp::GemmContext* gemm_context) { + gemmlowp::ScopedProfilingLabel label("ConvAsGemm/8bit"); + static_assert(Ac == FusedActivationFunctionType::kNone || + Ac == FusedActivationFunctionType::kRelu || + Ac == FusedActivationFunctionType::kRelu6 || + Ac == FusedActivationFunctionType::kRelu1, + ""); + const int input_rows = input_dims.sizes[0]; + const int input_cols = FlatSizeSkipDim(input_dims, 0); + const int filter_rows = filter_dims.sizes[3]; + const int filter_cols = FlatSizeSkipDim(filter_dims, 3); + const int output_rows = output_dims.sizes[0]; + const int output_cols = FlatSizeSkipDim(output_dims, 0); + TFLITE_DCHECK_EQ(output_rows, filter_rows); + TFLITE_DCHECK_EQ(output_cols, input_cols); + TFLITE_DCHECK_EQ(filter_cols, input_rows); + TFLITE_DCHECK_EQ(bias_dims.sizes[0], output_rows); + TFLITE_DCHECK_EQ(bias_dims.sizes[1], 1); + TFLITE_DCHECK_EQ(bias_dims.sizes[2], 1); + 
TFLITE_DCHECK_EQ(bias_dims.sizes[3], 1); + gemmlowp::MatrixMap<const uint8, gemmlowp::MapOrder::RowMajor> filter_matrix( + filter_data, output_rows, filter_cols, filter_cols); + gemmlowp::MatrixMap<const uint8, gemmlowp::MapOrder::ColMajor> input_matrix( + input_data, filter_cols, output_cols, filter_cols); + gemmlowp::MatrixMap<uint8, gemmlowp::MapOrder::ColMajor> output_matrix( + output_data, output_rows, output_cols, output_rows); + const auto& output_pipeline = GemmlowpOutputPipeline::MakeExp( + bias_data, output_rows, output_offset, output_multiplier, -output_shift, + output_activation_min, output_activation_max); + gemmlowp::GemmWithOutputPipeline<uint8, uint8, + gemmlowp::L8R8WithLhsNonzeroBitDepthParams>( + gemm_context, filter_matrix, input_matrix, &output_matrix, filter_offset, + input_offset, output_pipeline); +} + +inline void TransposeConv(const float* input_data, const Dims<4>& input_dims, + const float* filter_data, const Dims<4>& filter_dims, + int stride_width, int stride_height, int pad_width, + int pad_height, float* output_data, + const Dims<4>& output_dims, float* im2col_data, + const Dims<4>& im2col_dims) { + tflite::ConvParams op_params; + // Padding type is ignored, but still set. + op_params.padding_type = PaddingType::kSame; + op_params.padding_values.width = pad_width; + op_params.padding_values.height = pad_height; + op_params.stride_width = stride_width; + op_params.stride_height = stride_height; + + TransposeConv(op_params, DimsToShape(input_dims), input_data, + DimsToShape(filter_dims), filter_data, DimsToShape(output_dims), + output_data, DimsToShape(im2col_dims), im2col_data); +} + +template <typename T> +void TransposeIm2col(const T* input_data, const Dims<4>& input_dims, + const Dims<4>& filter_dims, int stride_width, + int stride_height, int pad_width, int pad_height, + const Dims<4>& output_dims, uint8 zero_byte, + T* im2col_data) { + tflite::ConvParams op_params; + // Padding type is ignored, but still set. + op_params.padding_type = PaddingType::kSame; + op_params.padding_values.width = pad_width; + op_params.padding_values.height = pad_height; + op_params.stride_width = stride_width; + op_params.stride_height = stride_height; + + TransposeIm2col(op_params, zero_byte, DimsToShape(input_dims), input_data, + DimsToShape(filter_dims), DimsToShape(output_dims), + im2col_data); +} + +inline void LstmCell(const float* input_data, const Dims<4>& input_dims, + const float* prev_activ_data, + const Dims<4>& prev_activ_dims, const float* weights_data, + const Dims<4>& weights_dims, const float* bias_data, + const Dims<4>& bias_dims, const float* prev_state_data, + const Dims<4>& prev_state_dims, float* output_state_data, + const Dims<4>& output_state_dims, float* output_activ_data, + const Dims<4>& output_activ_dims, float* concat_temp_data, + const Dims<4>& concat_temp_dims, float* activ_temp_data, + const Dims<4>& activ_temp_dims) { + tflite::LstmCellParams op_params; + // Float LSTM cell does not need parameters to be set: leave untouched. 
+ + LstmCell(op_params, DimsToShape(input_dims), input_data, + DimsToShape(prev_activ_dims), prev_activ_data, + DimsToShape(weights_dims), weights_data, DimsToShape(bias_dims), + bias_data, DimsToShape(prev_state_dims), prev_state_data, + DimsToShape(output_state_dims), output_state_data, + DimsToShape(output_activ_dims), output_activ_data, + DimsToShape(concat_temp_dims), concat_temp_data, + DimsToShape(activ_temp_dims), activ_temp_data); +} + +template <int StateIntegerBits> +void LstmCell(const uint8* input_data_uint8, const Dims<4>& input_dims, + const uint8* prev_activ_data_uint8, + const Dims<4>& prev_activ_dims, const uint8* weights_data_uint8, + const Dims<4>& weights_dims, const int32* bias_data_int32, + const Dims<4>& bias_dims, const int16* prev_state_data_int16, + const Dims<4>& prev_state_dims, int16* output_state_data_int16, + const Dims<4>& output_state_dims, uint8* output_activ_data_uint8, + const Dims<4>& output_activ_dims, uint8* concat_temp_data_uint8, + const Dims<4>& concat_temp_dims, int16* activ_temp_data_int16, + const Dims<4>& activ_temp_dims, int32 weights_zero_point, + int32 accum_multiplier, int accum_shift, + gemmlowp::GemmContext* gemm_context) { + tflite::LstmCellParams op_params; + op_params.weights_zero_point = weights_zero_point; + op_params.accum_multiplier = accum_multiplier; + op_params.accum_shift = accum_shift; + + LstmCell<StateIntegerBits>( + op_params, DimsToShape(input_dims), input_data_uint8, + DimsToShape(prev_activ_dims), prev_activ_data_uint8, + DimsToShape(weights_dims), weights_data_uint8, DimsToShape(bias_dims), + bias_data_int32, DimsToShape(prev_state_dims), prev_state_data_int16, + DimsToShape(output_state_dims), output_state_data_int16, + DimsToShape(output_activ_dims), output_activ_data_uint8, + DimsToShape(concat_temp_dims), concat_temp_data_uint8, + DimsToShape(activ_temp_dims), activ_temp_data_int16, gemm_context); +} + +template <typename T> +void BroadcastDiv(const T* input1_data, const Dims<4>& input1_dims, + const T* input2_data, const Dims<4>& input2_dims, + T output_activation_min, T output_activation_max, + T* output_data, const Dims<4>& output_dims) { + tflite::ArithmeticParams op_params; + SetActivationParams(output_activation_min, output_activation_max, &op_params); + + BroadcastDiv4DSlow(op_params, DimsToShape(input1_dims), input1_data, + DimsToShape(input2_dims), input2_data, + DimsToShape(output_dims), output_data); +} + +template <FusedActivationFunctionType Ac> +void L2Normalization(const float* input_data, const RuntimeShape& input_shape, + float* output_data, const RuntimeShape& output_shape) { + static_assert(Ac == FusedActivationFunctionType::kNone, ""); + tflite::L2NormalizationParams op_params; + // No params need to be set for float, but reserved in signature for future + // activations. 
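  // (The static_assert above limits this legacy entry point to
  // FusedActivationFunctionType::kNone; only the quantized overload below
  // needs a parameter, input_zero_point.)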
+ + L2Normalization(op_params, input_shape, input_data, output_shape, + output_data); +} + +inline void L2Normalization(const uint8* input_data, + const RuntimeShape& input_shape, + int32 input_zero_point, uint8* output_data, + const RuntimeShape& output_shape) { + tflite::L2NormalizationParams op_params; + op_params.input_zero_point = input_zero_point; + + L2Normalization(op_params, input_shape, input_data, output_shape, + output_data); +} + +template <FusedActivationFunctionType Ac> +void L2Normalization(const float* input_data, const Dims<4>& input_dims, + float* output_data, const Dims<4>& output_dims) { + L2Normalization<Ac>(input_data, DimsToShape(input_dims), output_data, + DimsToShape(output_dims)); +} + +inline void L2Normalization(const uint8* input_data, const Dims<4>& input_dims, + int32 input_zero_point, uint8* output_data, + const Dims<4>& output_dims) { + L2Normalization(input_data, DimsToShape(input_dims), input_zero_point, + output_data, DimsToShape(output_dims)); +} + +inline void Relu(const float* input_data, const Dims<4>& input_dims, + float* output_data, const Dims<4>& output_dims) { + Relu(DimsToShape(input_dims), input_data, DimsToShape(output_dims), + output_data); +} + +// legacy, for compatibility with old checked-in code +template <FusedActivationFunctionType Ac> +void Add(const float* input1_data, const Dims<4>& input1_dims, + const float* input2_data, const Dims<4>& input2_dims, + float* output_data, const Dims<4>& output_dims) { + float output_activation_min, output_activation_max; + GetActivationMinMax(Ac, &output_activation_min, &output_activation_max); + + tflite::ArithmeticParams op_params; + op_params.float_activation_min = output_activation_min; + op_params.float_activation_max = output_activation_max; + Add(op_params, DimsToShape(input1_dims), input1_data, + DimsToShape(input2_dims), input2_data, DimsToShape(output_dims), + output_data); +} + +template <FusedActivationFunctionType Ac> +inline void Add(int left_shift, const uint8* input1_data, + const Dims<4>& input1_dims, int32 input1_offset, + int32 input1_multiplier, int input1_shift, + const uint8* input2_data, const Dims<4>& input2_dims, + int32 input2_offset, int32 input2_multiplier, int input2_shift, + int32 output_offset, int32 output_multiplier, int output_shift, + int32 output_activation_min, int32 output_activation_max, + uint8* output_data, const Dims<4>& output_dims) { + constexpr int kReverseShift = -1; + static_assert(Ac == FusedActivationFunctionType::kNone || + Ac == FusedActivationFunctionType::kRelu || + Ac == FusedActivationFunctionType::kRelu6 || + Ac == FusedActivationFunctionType::kRelu1, + ""); + TFLITE_DCHECK_LE(output_activation_min, output_activation_max); + if (Ac == FusedActivationFunctionType::kNone) { + TFLITE_DCHECK_EQ(output_activation_min, 0); + TFLITE_DCHECK_EQ(output_activation_max, 255); + } + + tflite::ArithmeticParams op_params; + op_params.left_shift = left_shift; + op_params.input1_offset = input1_offset; + op_params.input1_multiplier = input1_multiplier; + op_params.input1_shift = kReverseShift * input1_shift; + op_params.input2_offset = input2_offset; + op_params.input2_multiplier = input2_multiplier; + op_params.input2_shift = kReverseShift * input2_shift; + op_params.output_offset = output_offset; + op_params.output_multiplier = output_multiplier; + op_params.output_shift = kReverseShift * output_shift; + op_params.quantized_activation_min = output_activation_min; + op_params.quantized_activation_max = output_activation_max; + Add(op_params, 
DimsToShape(input1_dims), input1_data, + DimsToShape(input2_dims), input2_data, DimsToShape(output_dims), + output_data); +} + +template <FusedActivationFunctionType Ac> +void Add(const int32* input1_data, const Dims<4>& input1_dims, + const int32* input2_data, const Dims<4>& input2_dims, + int32* output_data, const Dims<4>& output_dims) { + gemmlowp::ScopedProfilingLabel label("Add/int32"); + TFLITE_DCHECK(Ac == FusedActivationFunctionType::kNone); + + tflite::ArithmeticParams op_params; + op_params.quantized_activation_min = std::numeric_limits<int32>::min(); + op_params.quantized_activation_max = std::numeric_limits<int32>::max(); + Add(op_params, DimsToShape(input1_dims), input1_data, + DimsToShape(input2_dims), input2_data, DimsToShape(output_dims), + output_data); +} + +template <typename T> +void BroadcastAdd(const T* input1_data, const Dims<4>& input1_dims, + const T* input2_data, const Dims<4>& input2_dims, + T output_activation_min, T output_activation_max, + T* output_data, const Dims<4>& output_dims) { + tflite::ArithmeticParams op_params; + op_params.float_activation_min = output_activation_min; + op_params.float_activation_max = output_activation_max; + BroadcastAdd4DSlow(op_params, DimsToShape(input1_dims), input1_data, + DimsToShape(input2_dims), input2_data, + DimsToShape(output_dims), output_data); +} + +template <FusedActivationFunctionType Ac> +inline void BroadcastAdd(int left_shift, const uint8* input1_data, + const Dims<4>& input1_dims, int32 input1_offset, + int32 input1_multiplier, int input1_shift, + const uint8* input2_data, const Dims<4>& input2_dims, + int32 input2_offset, int32 input2_multiplier, + int input2_shift, int32 output_offset, + int32 output_multiplier, int output_shift, + int32 output_activation_min, + int32 output_activation_max, uint8* output_data, + const Dims<4>& output_dims) { + constexpr int kReverseShift = -1; + static_assert(Ac == FusedActivationFunctionType::kNone || + Ac == FusedActivationFunctionType::kRelu || + Ac == FusedActivationFunctionType::kRelu6 || + Ac == FusedActivationFunctionType::kRelu1, + ""); + TFLITE_DCHECK_LE(output_activation_min, output_activation_max); + if (Ac == FusedActivationFunctionType::kNone) { + TFLITE_DCHECK_EQ(output_activation_min, 0); + TFLITE_DCHECK_EQ(output_activation_max, 255); + } + + tflite::ArithmeticParams op_params; + op_params.left_shift = left_shift; + op_params.input1_offset = input1_offset; + op_params.input1_multiplier = input1_multiplier; + op_params.input1_shift = kReverseShift * input1_shift; + op_params.input2_offset = input2_offset; + op_params.input2_multiplier = input2_multiplier; + op_params.input2_shift = kReverseShift * input2_shift; + op_params.output_offset = output_offset; + op_params.output_multiplier = output_multiplier; + op_params.output_shift = kReverseShift * output_shift; + op_params.quantized_activation_min = output_activation_min; + op_params.quantized_activation_max = output_activation_max; + BroadcastAdd4DSlow(op_params, DimsToShape(input1_dims), input1_data, + DimsToShape(input2_dims), input2_data, + DimsToShape(output_dims), output_data); +} + +template <FusedActivationFunctionType Ac> +inline void BroadcastAddFivefold( + int y0, int y1, int y2, int y3, int y4, int left_shift, + const uint8* input1_data, const Dims<4>& input1_dims, int32 input1_offset, + int32 input1_multiplier, int input1_shift, const uint8* input2_data, + const Dims<4>& input2_dims, int32 input2_offset, int32 input2_multiplier, + int input2_shift, int32 output_offset, int32 output_multiplier, + int 
output_shift, int32 output_activation_min, int32 output_activation_max, + uint8* output_data, const Dims<4>& output_dims) { + constexpr int kReverseShift = -1; + static_assert(Ac == FusedActivationFunctionType::kNone || + Ac == FusedActivationFunctionType::kRelu || + Ac == FusedActivationFunctionType::kRelu6 || + Ac == FusedActivationFunctionType::kRelu1, + ""); + TFLITE_DCHECK_LE(output_activation_min, output_activation_max); + if (Ac == FusedActivationFunctionType::kNone) { + TFLITE_DCHECK_EQ(output_activation_min, 0); + TFLITE_DCHECK_EQ(output_activation_max, 255); + } + tflite::ArithmeticParams op_params; + op_params.broadcast_category = + tflite::BroadcastableOpCategory::kFirstInputBroadcastsFast; + op_params.left_shift = left_shift; + op_params.input1_offset = input1_offset; + op_params.input1_multiplier = input1_multiplier; + op_params.input1_shift = kReverseShift * input1_shift; + op_params.input2_offset = input2_offset; + op_params.input2_multiplier = input2_multiplier; + op_params.input2_shift = kReverseShift * input2_shift; + op_params.output_offset = output_offset; + op_params.output_multiplier = output_multiplier; + op_params.output_shift = kReverseShift * output_shift; + op_params.quantized_activation_min = output_activation_min; + op_params.quantized_activation_max = output_activation_max; + op_params.broadcast_shape[4] = y0; + op_params.broadcast_shape[3] = y1; + op_params.broadcast_shape[2] = y2; + op_params.broadcast_shape[1] = y3; + op_params.broadcast_shape[0] = y4; + BroadcastAddFivefold(op_params, DimsToShape(input1_dims), input1_data, + DimsToShape(input2_dims), input2_data, + DimsToShape(output_dims), output_data); +} + +// legacy, for compatibility with old checked-in code +template <FusedActivationFunctionType Ac, typename T> +void BroadcastAdd(const T* input1_data, const Dims<4>& input1_dims, + const T* input2_data, const Dims<4>& input2_dims, + T* output_data, const Dims<4>& output_dims) { + T output_activation_min, output_activation_max; + GetActivationMinMax(Ac, &output_activation_min, &output_activation_max); + + BroadcastAdd(input1_data, input1_dims, input2_data, input2_dims, + output_activation_min, output_activation_max, output_data, + output_dims); +} + +template <FusedActivationFunctionType Ac> +inline void Add(const int16* input1_data, const Dims<4>& input1_dims, + int input1_shift, const int16* input2_data, + const Dims<4>& input2_dims, int input2_shift, + int16 output_activation_min, int16 output_activation_max, + int16* output_data, const Dims<4>& output_dims) { + constexpr int kReverseShift = -1; + static_assert(Ac == FusedActivationFunctionType::kNone || + Ac == FusedActivationFunctionType::kRelu || + Ac == FusedActivationFunctionType::kRelu6 || + Ac == FusedActivationFunctionType::kRelu1, + ""); + TFLITE_DCHECK_LE(output_activation_min, output_activation_max); + if (Ac == FusedActivationFunctionType::kNone) { + TFLITE_DCHECK_EQ(output_activation_min, -32768); + TFLITE_DCHECK_EQ(output_activation_max, 32767); + } + + tflite::ArithmeticParams op_params; + op_params.input1_shift = kReverseShift * input1_shift; + op_params.input2_shift = kReverseShift * input2_shift; + op_params.quantized_activation_min = output_activation_min; + op_params.quantized_activation_max = output_activation_max; + Add(op_params, DimsToShape(input1_dims), input1_data, + DimsToShape(input2_dims), input2_data, DimsToShape(output_dims), + output_data); +} + +inline void Sub(const float* input1_data, const Dims<4>& input1_dims, + const float* input2_data, const Dims<4>& 
input2_dims, + float* output_data, const Dims<4>& output_dims) { + float output_activation_min, output_activation_max; + GetActivationMinMax(FusedActivationFunctionType::kNone, + &output_activation_min, &output_activation_max); + tflite::ArithmeticParams op_params; + op_params.float_activation_min = output_activation_min; + op_params.float_activation_max = output_activation_max; + Sub(op_params, DimsToShape(input1_dims), input1_data, + DimsToShape(input2_dims), input2_data, DimsToShape(output_dims), + output_data); +} + +template <typename T> +void Sub(const T* input1_data, const Dims<4>& input1_dims, const T* input2_data, + const Dims<4>& input2_dims, T* output_data, + const Dims<4>& output_dims) { + T output_activation_min, output_activation_max; + GetActivationMinMax(FusedActivationFunctionType::kNone, + &output_activation_min, &output_activation_max); + tflite::ArithmeticParams op_params; + op_params.quantized_activation_min = output_activation_min; + op_params.quantized_activation_max = output_activation_max; + Sub(op_params, DimsToShape(input1_dims), input1_data, + DimsToShape(input2_dims), input2_data, DimsToShape(output_dims), + output_data); +} + +inline void BroadcastMul(const uint8* input1_data, const Dims<4>& input1_dims, + int32 input1_offset, const uint8* input2_data, + const Dims<4>& input2_dims, int32 input2_offset, + int32 output_offset, int32 output_multiplier, + int output_shift, int32 output_activation_min, + int32 output_activation_max, uint8* output_data, + const Dims<4>& output_dims) { + tflite::ArithmeticParams op_params; + SetActivationParams(output_activation_min, output_activation_max, &op_params); + op_params.input1_offset = input1_offset; + op_params.input2_offset = input2_offset; + op_params.output_offset = output_offset; + op_params.output_multiplier = output_multiplier; + op_params.output_shift = kReverseShift * output_shift; + + BroadcastMul4DSlow(op_params, DimsToShape(input1_dims), input1_data, + DimsToShape(input2_dims), input2_data, + DimsToShape(output_dims), output_data); +} + +// legacy, for compatibility with old checked-in code +template <FusedActivationFunctionType Ac> +inline void BroadcastMul(const uint8* input1_data, const Dims<4>& input1_dims, + int32 input1_offset, const uint8* input2_data, + const Dims<4>& input2_dims, int32 input2_offset, + int32 output_offset, int32 output_multiplier, + int output_shift, int32 output_activation_min, + int32 output_activation_max, uint8* output_data, + const Dims<4>& output_dims) { + BroadcastMul(input1_data, input1_dims, input1_offset, input2_data, + input2_dims, input2_offset, output_offset, output_multiplier, + output_shift, output_activation_min, output_activation_max, + output_data, output_dims); +} + +inline void AveragePool(const float* input_data, const Dims<4>& input_dims, + int stride_width, int stride_height, int pad_width, + int pad_height, int kwidth, int kheight, + float output_activation_min, + float output_activation_max, float* output_data, + const Dims<4>& output_dims) { + tflite::PoolParams params; + params.stride_height = stride_height; + params.stride_width = stride_width; + params.filter_height = kheight; + params.filter_width = kwidth; + params.padding_values.height = pad_height; + params.padding_values.width = pad_width; + params.float_activation_min = output_activation_min; + params.float_activation_max = output_activation_max; + AveragePool(params, DimsToShape(input_dims), input_data, + DimsToShape(output_dims), output_data); +} + +// legacy, for compatibility with old checked-in 
code +template <FusedActivationFunctionType Ac> +void AveragePool(const float* input_data, const Dims<4>& input_dims, + int stride_width, int stride_height, int pad_width, + int pad_height, int kwidth, int kheight, float* output_data, + const Dims<4>& output_dims) { + float output_activation_min, output_activation_max; + GetActivationMinMax(Ac, &output_activation_min, &output_activation_max); + + AveragePool(input_data, input_dims, stride_width, stride_height, pad_width, + pad_height, kwidth, kheight, output_activation_min, + output_activation_max, output_data, output_dims); +} + +// legacy, for compatibility with old checked-in code +template <FusedActivationFunctionType Ac> +void AveragePool(const float* input_data, const Dims<4>& input_dims, int stride, + int pad_width, int pad_height, int filter_width, + int filter_height, float* output_data, + const Dims<4>& output_dims) { + AveragePool<Ac>(input_data, input_dims, stride, stride, pad_width, pad_height, + filter_width, filter_height, output_data, output_dims); +} + +inline void AveragePool(const uint8* input_data, const Dims<4>& input_dims, + int stride_width, int stride_height, int pad_width, + int pad_height, int filter_width, int filter_height, + int32 output_activation_min, + int32 output_activation_max, uint8* output_data, + const Dims<4>& output_dims) { + tflite::PoolParams params; + params.stride_height = stride_height; + params.stride_width = stride_width; + params.filter_height = filter_height; + params.filter_width = filter_width; + params.padding_values.height = pad_height; + params.padding_values.width = pad_width; + params.quantized_activation_min = output_activation_min; + params.quantized_activation_max = output_activation_max; + AveragePool(params, DimsToShape(input_dims), input_data, + DimsToShape(output_dims), output_data); +} + +// legacy, for compatibility with old checked-in code +template <FusedActivationFunctionType Ac> +void AveragePool(const uint8* input_data, const Dims<4>& input_dims, + int stride_width, int stride_height, int pad_width, + int pad_height, int filter_width, int filter_height, + int32 output_activation_min, int32 output_activation_max, + uint8* output_data, const Dims<4>& output_dims) { + static_assert(Ac == FusedActivationFunctionType::kNone || + Ac == FusedActivationFunctionType::kRelu || + Ac == FusedActivationFunctionType::kRelu6 || + Ac == FusedActivationFunctionType::kRelu1, + ""); + if (Ac == FusedActivationFunctionType::kNone) { + TFLITE_DCHECK_EQ(output_activation_min, 0); + TFLITE_DCHECK_EQ(output_activation_max, 255); + } + AveragePool(input_data, input_dims, stride_width, stride_height, pad_width, + pad_height, filter_width, filter_height, output_activation_min, + output_activation_max, output_data, output_dims); +} + +// legacy, for compatibility with old checked-in code +template <FusedActivationFunctionType Ac> +void AveragePool(const uint8* input_data, const Dims<4>& input_dims, int stride, + int pad_width, int pad_height, int filter_width, + int filter_height, int32 output_activation_min, + int32 output_activation_max, uint8* output_data, + const Dims<4>& output_dims) { + AveragePool<Ac>(input_data, input_dims, stride, stride, pad_width, pad_height, + filter_width, filter_height, output_activation_min, + output_activation_max, output_data, output_dims); +} + +inline void MaxPool(const float* input_data, const Dims<4>& input_dims, + int stride_width, int stride_height, int pad_width, + int pad_height, int kwidth, int kheight, + float output_activation_min, float 
output_activation_max, + float* output_data, const Dims<4>& output_dims) { + tflite::PoolParams params; + params.stride_height = stride_height; + params.stride_width = stride_width; + params.filter_height = kheight; + params.filter_width = kwidth; + params.padding_values.height = pad_height; + params.padding_values.width = pad_width; + params.float_activation_min = output_activation_min; + params.float_activation_max = output_activation_max; + MaxPool(params, DimsToShape(input_dims), input_data, DimsToShape(output_dims), + output_data); +} + +// legacy, for compatibility with old checked-in code +template <FusedActivationFunctionType Ac> +void MaxPool(const float* input_data, const Dims<4>& input_dims, + int stride_width, int stride_height, int pad_width, int pad_height, + int kwidth, int kheight, float* output_data, + const Dims<4>& output_dims) { + float output_activation_min, output_activation_max; + GetActivationMinMax(Ac, &output_activation_min, &output_activation_max); + MaxPool(input_data, input_dims, stride_width, stride_height, pad_width, + pad_height, kwidth, kheight, output_activation_min, + output_activation_max, output_data, output_dims); +} + +// legacy, for compatibility with old checked-in code +template <FusedActivationFunctionType Ac> +void MaxPool(const float* input_data, const Dims<4>& input_dims, int stride, + int pad_width, int pad_height, int filter_width, int filter_height, + float* output_data, const Dims<4>& output_dims) { + MaxPool<Ac>(input_data, input_dims, stride, stride, pad_width, pad_height, + filter_width, filter_height, output_data, output_dims); +} + +inline void MaxPool(const uint8* input_data, const Dims<4>& input_dims, + int stride_width, int stride_height, int pad_width, + int pad_height, int filter_width, int filter_height, + int32 output_activation_min, int32 output_activation_max, + uint8* output_data, const Dims<4>& output_dims) { + PoolParams params; + params.stride_height = stride_height; + params.stride_width = stride_width; + params.filter_height = filter_height; + params.filter_width = filter_width; + params.padding_values.height = pad_height; + params.padding_values.width = pad_width; + params.quantized_activation_min = output_activation_min; + params.quantized_activation_max = output_activation_max; + MaxPool(params, DimsToShape(input_dims), input_data, DimsToShape(output_dims), + output_data); +} + +// legacy, for compatibility with old checked-in code +template <FusedActivationFunctionType Ac> +void MaxPool(const uint8* input_data, const Dims<4>& input_dims, + int stride_width, int stride_height, int pad_width, int pad_height, + int filter_width, int filter_height, int32 output_activation_min, + int32 output_activation_max, uint8* output_data, + const Dims<4>& output_dims) { + static_assert(Ac == FusedActivationFunctionType::kNone || + Ac == FusedActivationFunctionType::kRelu || + Ac == FusedActivationFunctionType::kRelu6 || + Ac == FusedActivationFunctionType::kRelu1, + ""); + if (Ac == FusedActivationFunctionType::kNone) { + TFLITE_DCHECK_EQ(output_activation_min, 0); + TFLITE_DCHECK_EQ(output_activation_max, 255); + } + MaxPool(input_data, input_dims, stride_width, stride_height, pad_width, + pad_height, filter_width, filter_height, output_activation_min, + output_activation_max, output_data, output_dims); +} + +// legacy, for compatibility with old checked-in code +template <FusedActivationFunctionType Ac> +void MaxPool(const uint8* input_data, const Dims<4>& input_dims, int stride, + int pad_width, int pad_height, int filter_width, 
int filter_height, + int32 output_activation_min, int32 output_activation_max, + uint8* output_data, const Dims<4>& output_dims) { + MaxPool<Ac>(input_data, input_dims, stride, stride, pad_width, pad_height, + filter_width, filter_height, output_activation_min, + output_activation_max, output_data, output_dims); +} + +inline void L2Pool(const float* input_data, const Dims<4>& input_dims, + int stride_width, int stride_height, int pad_width, + int pad_height, int filter_width, int filter_height, + float output_activation_min, float output_activation_max, + float* output_data, const Dims<4>& output_dims) { + PoolParams params; + params.stride_height = stride_height; + params.stride_width = stride_width; + params.filter_height = filter_height; + params.filter_width = filter_width; + params.padding_values.height = pad_height; + params.padding_values.width = pad_width; + params.float_activation_min = output_activation_min; + params.float_activation_max = output_activation_max; + L2Pool(params, DimsToShape(input_dims), input_data, DimsToShape(output_dims), + output_data); +} + +// legacy, for compatibility with old checked-in code +template <FusedActivationFunctionType Ac> +void L2Pool(const float* input_data, const Dims<4>& input_dims, + int stride_width, int stride_height, int pad_width, int pad_height, + int filter_width, int filter_height, float* output_data, + const Dims<4>& output_dims) { + float output_activation_min, output_activation_max; + GetActivationMinMax(Ac, &output_activation_min, &output_activation_max); + L2Pool(input_data, input_dims, stride_width, stride_height, pad_width, + pad_height, filter_width, filter_height, output_activation_min, + output_activation_max, output_data, output_dims); +} + +// legacy, for compatibility with old checked-in code +template <FusedActivationFunctionType Ac> +void L2Pool(const float* input_data, const Dims<4>& input_dims, int stride, + int pad_width, int pad_height, int filter_width, int filter_height, + float* output_data, const Dims<4>& output_dims) { + L2Pool<Ac>(input_data, input_dims, stride, stride, pad_width, pad_height, + filter_width, filter_height, output_data, output_dims); +} + +inline void Softmax(const float* input_data, const RuntimeShape& input_shape, + float beta, float* output_data, + const RuntimeShape& output_shape) { + SoftmaxParams params; + params.beta = beta; + Softmax(params, input_shape, input_data, output_shape, output_data); +} + +inline void Softmax(const float* input_data, const Dims<4>& input_dims, + float beta, float* output_data, + const Dims<4>& output_dims) { + Softmax(input_data, DimsToShape(input_dims), beta, output_data, + DimsToShape(output_dims)); +} + +inline void Softmax(const uint8* input_data, const RuntimeShape& input_shape, + int32 input_beta_multiplier, int32 input_beta_left_shift, + int diff_min, uint8* output_data, + const RuntimeShape& output_shape) { + SoftmaxParams params; + params.input_multiplier = input_beta_multiplier; + params.input_left_shift = input_beta_left_shift; + params.diff_min = diff_min; + Softmax(params, input_shape, input_data, output_shape, output_data); +} +inline void Softmax(const uint8* input_data, const Dims<4>& input_dims, + int32 input_beta_multiplier, int32 input_beta_left_shift, + int diff_min, uint8* output_data, + const Dims<4>& output_dims) { + Softmax(input_data, DimsToShape(input_dims), input_beta_multiplier, + input_beta_left_shift, diff_min, output_data, + DimsToShape(output_dims)); +} + +inline void LogSoftmax(const float* input_data, const RuntimeShape& 
input_shape, + float* output_data, const RuntimeShape& output_shape) { + SoftmaxParams params; + // No params currently used for float LogSoftmax. + LogSoftmax(params, input_shape, input_data, output_shape, output_data); +} + +inline void LogSoftmax(const float* input_data, const Dims<4>& input_dims, + float* output_data, const Dims<4>& output_dims) { + LogSoftmax(input_data, DimsToShape(input_dims), output_data, + DimsToShape(output_dims)); +} + +inline void LogSoftmax(const uint8* input_data, const RuntimeShape& input_shape, + int32 input_multiplier, int32 input_left_shift, + int32 reverse_scaling_divisor, + int32 reverse_scaling_right_shift, int diff_min, + uint8* output_data, const RuntimeShape& output_shape) { + SoftmaxParams params; + params.input_multiplier = input_multiplier; + params.input_left_shift = input_left_shift; + params.reverse_scaling_divisor = reverse_scaling_divisor; + params.reverse_scaling_right_shift = reverse_scaling_right_shift; + params.diff_min = diff_min; + LogSoftmax(params, input_shape, input_data, output_shape, output_data); +} + +inline void LogSoftmax(const uint8* input_data, const Dims<4>& input_dims, + int32 input_multiplier, int32 input_left_shift, + int32 reverse_scaling_divisor, + int32 reverse_scaling_right_shift, int diff_min, + uint8* output_data, const Dims<4>& output_dims) { + LogSoftmax(input_data, DimsToShape(input_dims), input_multiplier, + input_left_shift, reverse_scaling_divisor, + reverse_scaling_right_shift, diff_min, output_data, + DimsToShape(output_dims)); +} + +inline void Logistic(const uint8* input_data, const RuntimeShape& input_shape, + int32 input_zero_point, int32 input_range_radius, + int32 input_multiplier, int input_left_shift, + uint8* output_data, const RuntimeShape& output_shape) { + LogisticParams params; + params.input_zero_point = input_zero_point; + params.input_range_radius = input_range_radius; + params.input_multiplier = input_multiplier; + params.input_left_shift = input_left_shift; + Logistic(params, input_shape, input_data, output_shape, output_data); +} + +inline void Logistic(const float* input_data, const Dims<4>& input_dims, + float* output_data, const Dims<4>& output_dims) { + Logistic(DimsToShape(input_dims), input_data, DimsToShape(output_dims), + output_data); +} + +inline void Logistic(const uint8* input_data, const Dims<4>& input_dims, + int32 input_zero_point, int32 input_range_radius, + int32 input_multiplier, int input_left_shift, + uint8* output_data, const Dims<4>& output_dims) { + Logistic(input_data, DimsToShape(input_dims), input_zero_point, + input_range_radius, input_multiplier, input_left_shift, output_data, + DimsToShape(output_dims)); +} + +inline void Logistic(const RuntimeShape& input_shape, const int16* input_data, + const RuntimeShape& output_shape, int16* output_data) { + LogisticParams params; + // No params currently needed by int16 Logistic. + Logistic(params, input_shape, input_data, output_shape, output_data); +} + +inline void Logistic(const int16* input_data, const RuntimeShape& input_shape, + int16* output_data, const RuntimeShape& output_shape) { + LogisticParams params; + // No params currently needed by int16 Logistic. 
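  // (This data-first overload, the shape-first overload above, and the
  // Dims<4> overload below all forward to the same params-based 16-bit
  // kernel; LogisticParams stays empty because the fixed-point formats are
  // implicit in that kernel rather than configured here.)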
+ Logistic(params, input_shape, input_data, output_shape, output_data); +} + +inline void Logistic(const int16* input_data, const Dims<4>& input_dims, + int16* output_data, const Dims<4>& output_dims) { + Logistic(input_data, DimsToShape(input_dims), output_data, + DimsToShape(output_dims)); +} + +inline void Tanh(const float* input_data, const Dims<4>& input_dims, + float* output_data, const Dims<4>& output_dims) { + Tanh(DimsToShape(input_dims), input_data, DimsToShape(output_dims), + output_data); +} + +inline void Tanh(const uint8* input_data, const RuntimeShape& input_shape, + int32 input_zero_point, int32 input_range_radius, + int32 input_multiplier, int input_left_shift, + uint8* output_data, const RuntimeShape& output_shape) { + TanhParams params; + params.input_zero_point = input_zero_point; + params.input_range_radius = input_range_radius; + params.input_multiplier = input_multiplier; + params.input_left_shift = input_left_shift; + Tanh(params, input_shape, input_data, output_shape, output_data); +} + +inline void Tanh(const uint8* input_data, const Dims<4>& input_dims, + int32 input_zero_point, int32 input_range_radius, + int32 input_multiplier, int input_left_shift, + uint8* output_data, const Dims<4>& output_dims) { + Tanh(input_data, DimsToShape(input_dims), input_zero_point, + input_range_radius, input_multiplier, input_left_shift, output_data, + DimsToShape(output_dims)); +} + +inline void Tanh(const int16* input_data, const RuntimeShape& input_shape, + int input_left_shift, int16* output_data, + const RuntimeShape& output_shape) { + TanhParams params; + params.input_left_shift = input_left_shift; + Tanh(params, input_shape, input_data, output_shape, output_data); +} + +inline void Tanh(const int16* input_data, const Dims<4>& input_dims, + int input_left_shift, int16* output_data, + const Dims<4>& output_dims) { + Tanh(input_data, DimsToShape(input_dims), input_left_shift, output_data, + DimsToShape(output_dims)); +} + +template <typename T> +inline void DepthToSpace(const T* input_data, const Dims<4>& input_dims, + int block_size, T* output_data, + const Dims<4>& output_dims) { + tflite::DepthToSpaceParams op_params; + op_params.block_size = block_size; + + DepthToSpace(op_params, DimsToShape(input_dims), input_data, + DimsToShape(output_dims), output_data); +} + +template <typename T> +inline void SpaceToDepth(const T* input_data, const Dims<4>& input_dims, + int block_size, T* output_data, + const Dims<4>& output_dims) { + tflite::SpaceToDepthParams op_params; + op_params.block_size = block_size; + + SpaceToDepth(op_params, DimsToShape(input_dims), input_data, + DimsToShape(output_dims), output_data); +} + +inline void Mul(const float* input1_data, const Dims<4>& input1_dims, + const float* input2_data, const Dims<4>& input2_dims, + float output_activation_min, float output_activation_max, + float* output_data, const Dims<4>& output_dims) { + tflite::ArithmeticParams op_params; + op_params.float_activation_min = output_activation_min; + op_params.float_activation_max = output_activation_max; + + Mul(op_params, DimsToShape(input1_dims), input1_data, + DimsToShape(input2_dims), input2_data, DimsToShape(output_dims), + output_data); +} + +template <FusedActivationFunctionType Ac> +void Mul(const float* input1_data, const Dims<4>& input1_dims, + const float* input2_data, const Dims<4>& input2_dims, + float* output_data, const Dims<4>& output_dims) { + float output_activation_min, output_activation_max; + GetActivationMinMax(Ac, &output_activation_min, 
&output_activation_max); + + Mul(input1_data, input1_dims, input2_data, input2_dims, output_activation_min, + output_activation_max, output_data, output_dims); +} + +inline void Mul(const int32* input1_data, const Dims<4>& input1_dims, + const int32* input2_data, const Dims<4>& input2_dims, + int32 output_activation_min, int32 output_activation_max, + int32* output_data, const Dims<4>& output_dims) { + tflite::ArithmeticParams op_params; + op_params.quantized_activation_min = output_activation_min; + op_params.quantized_activation_max = output_activation_max; + + Mul(op_params, DimsToShape(input1_dims), input1_data, + DimsToShape(input2_dims), input2_data, DimsToShape(output_dims), + output_data); +} + +template <FusedActivationFunctionType Ac> +void Mul(const int32* input1_data, const Dims<4>& input1_dims, + const int32* input2_data, const Dims<4>& input2_dims, + int32* output_data, const Dims<4>& output_dims) { + TFLITE_DCHECK(Ac == FusedActivationFunctionType::kNone); + tflite::ArithmeticParams op_params; + // No parameters needed. + + MulNoActivation(op_params, DimsToShape(input1_dims), input1_data, + DimsToShape(input2_dims), input2_data, + DimsToShape(output_dims), output_data); +} + +inline void Mul(const int16* input1_data, const Dims<4>& input1_dims, + const int16* input2_data, const Dims<4>& input2_dims, + int16* output_data, const Dims<4>& output_dims) { + tflite::ArithmeticParams op_params; + // No parameters needed. + + Mul(op_params, DimsToShape(input1_dims), input1_data, + DimsToShape(input2_dims), input2_data, DimsToShape(output_dims), + output_data); +} + +inline void Mul(const int16* input1_data, const Dims<4>& input1_dims, + const int16* input2_data, const Dims<4>& input2_dims, + int32 output_offset, int32 output_activation_min, + int32 output_activation_max, uint8* output_data, + const Dims<4>& output_dims) { + tflite::ArithmeticParams op_params; + op_params.output_offset = output_offset; + op_params.quantized_activation_min = output_activation_min; + op_params.quantized_activation_max = output_activation_max; + + Mul(op_params, DimsToShape(input1_dims), input1_data, + DimsToShape(input2_dims), input2_data, DimsToShape(output_dims), + output_data); +} + +template <typename T> +void BroadcastMul(const T* input1_data, const Dims<4>& input1_dims, + const T* input2_data, const Dims<4>& input2_dims, + T output_activation_min, T output_activation_max, + T* output_data, const Dims<4>& output_dims) { + tflite::ArithmeticParams op_params; + SetActivationParams(output_activation_min, output_activation_max, &op_params); + + BroadcastMul4DSlow(op_params, DimsToShape(input1_dims), input1_data, + DimsToShape(input2_dims), input2_data, + DimsToShape(output_dims), output_data); +} + +// For compatibility with old checked-in code +template <FusedActivationFunctionType Ac> +inline void BroadcastMul(const float* input1_data, const Dims<4>& input1_dims, + const float* input2_data, const Dims<4>& input2_dims, + float* output_data, const Dims<4>& output_dims) { + tflite::ArithmeticParams op_params; + float float_activation_min; + float float_activation_max; + GetActivationMinMax(Ac, &float_activation_min, &float_activation_max); + SetActivationParams(float_activation_min, float_activation_max, &op_params); + + BroadcastMul4DSlow(op_params, DimsToShape(input1_dims), input1_data, + DimsToShape(input2_dims), input2_data, + DimsToShape(output_dims), output_data); +} + +inline void LocalResponseNormalization(const float* input_data, + const Dims<4>& input_dims, int range, + float bias, float 
alpha, float beta, + float* output_data, + const Dims<4>& output_dims) { + tflite::LocalResponseNormalizationParams op_params; + op_params.range = range; + op_params.bias = bias; + op_params.alpha = alpha; + op_params.beta = beta; + + LocalResponseNormalization(op_params, DimsToShape(input_dims), input_data, + DimsToShape(output_dims), output_data); +} + +template <typename SrcT, typename DstT> +void Cast(const SrcT* input_data, const Dims<4>& input_dims, DstT* output_data, + const Dims<4>& output_dims) { + Cast(DimsToShape(input_dims), input_data, DimsToShape(output_dims), + output_data); +} + +inline void Floor(const float* input_data, const Dims<4>& input_dims, + float* output_data, const Dims<4>& output_dims) { + Floor(DimsToShape(input_dims), input_data, DimsToShape(output_dims), + output_data); +} + +inline void ResizeBilinear(const float* input_data, const Dims<4>& input_dims, + const int32* output_size_data, + const Dims<4>& output_size_dims, float* output_data, + const Dims<4>& output_dims, bool align_corners) { + tflite::ResizeBilinearParams op_params; + op_params.align_corners = align_corners; + ResizeBilinear(op_params, DimsToShape(input_dims), input_data, + DimsToShape(output_size_dims), output_size_data, + DimsToShape(output_dims), output_data); +} + +inline void ResizeBilinear(const uint8* input_data, const Dims<4>& input_dims, + const int32* output_size_data, + const Dims<4>& output_size_dims, uint8* output_data, + const Dims<4>& output_dims, bool align_corners) { + tflite::ResizeBilinearParams op_params; + op_params.align_corners = align_corners; + ResizeBilinear(op_params, DimsToShape(input_dims), input_data, + DimsToShape(output_size_dims), output_size_data, + DimsToShape(output_dims), output_data); +} + +// legacy, for compatibility with old checked-in code +inline void ResizeBilinear(const float* input_data, const Dims<4>& input_dims, + const int32* output_size_data, + const Dims<4>& output_size_dims, float* output_data, + const Dims<4>& output_dims) { + ResizeBilinear(input_data, input_dims, output_size_data, output_size_dims, + output_data, output_dims, /*align_corners=*/false); +} + +// legacy, for compatibility with old checked-in code +inline void ResizeBilinear(const uint8* input_data, const Dims<4>& input_dims, + const int32* output_size_data, + const Dims<4>& output_size_dims, uint8* output_data, + const Dims<4>& output_dims) { + ResizeBilinear(input_data, input_dims, output_size_data, output_size_dims, + output_data, output_dims, /*align_corners=*/false); +} + +template <typename T> +inline void BatchToSpaceND(const T* input_data, const Dims<4>& input_dims, + const int32* block_shape_data, + const Dims<4>& block_shape_dims, + const int32* crops_data, const Dims<4>& crops_dims, + T* output_data, const Dims<4>& output_dims) { + BatchToSpaceND(DimsToShape(input_dims), input_data, + DimsToShape(block_shape_dims), block_shape_data, + DimsToShape(crops_dims), crops_data, DimsToShape(output_dims), + output_data); +} + +// Legacy signature, function covered both Pad and PadV2. 
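// NOTE (editorial, illustrative only): the wrappers in this legacy header
// bridge the old Dims<4> convention, where sizes[0] holds the innermost
// (depth) extent, to the newer RuntimeShape convention, where index 0 is the
// outermost (batch) dimension. DimsToShape is assumed to perform that
// reversal, roughly as sketched below; the helper name is hypothetical and
// nothing in this file calls it. The same reversal is why the PadV2/Pad/Slice
// wrappers below index their std::vector arguments as [3 - i] when filling
// the new params structs.
inline RuntimeShape LegacyDims4ToShapeSketch(const Dims<4>& dims) {
  // Dims<4> stores the fastest-varying (depth) extent in sizes[0];
  // RuntimeShape expects {batch, height, width, depth}.
  return RuntimeShape(
      {dims.sizes[3], dims.sizes[2], dims.sizes[1], dims.sizes[0]});
}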
+template <typename T> +inline void PadV2(const T* input_data, const Dims<4>& input_dims, + const std::vector<int>& left_paddings, + const std::vector<int>& right_paddings, T* output_data, + const Dims<4>& output_dims, const T pad_value) { + TFLITE_DCHECK_EQ(left_paddings.size(), 4); + TFLITE_DCHECK_EQ(right_paddings.size(), 4); + tflite::PadParams op_params; + op_params.left_padding_count = 4; + op_params.right_padding_count = 4; + for (int i = 0; i < 4; ++i) { + op_params.left_padding[i] = left_paddings[3 - i]; + op_params.right_padding[i] = right_paddings[3 - i]; + } + const T pad_value_copy = pad_value; + + Pad(op_params, DimsToShape(input_dims), input_data, &pad_value_copy, + DimsToShape(output_dims), output_data); +} + +// Old Pad that calls legacy PadV2. +template <typename T> +inline void Pad(const T* input_data, const Dims<4>& input_dims, + const std::vector<int>& left_paddings, + const std::vector<int>& right_paddings, T* output_data, + const Dims<4>& output_dims, const int32_t pad_value) { + const T converted_pad_value = static_cast<T>(pad_value); + PadV2<T>(input_data, input_dims, left_paddings, right_paddings, output_data, + output_dims, converted_pad_value); +} + +// Old Pad that only padded with 0. +template <typename T> +inline void Pad(const T* input_data, const Dims<4>& input_dims, + const std::vector<int>& left_paddings, + const std::vector<int>& right_paddings, T* output_data, + const Dims<4>& output_dims) { + const T pad_value = static_cast<T>(0); + PadV2<T>(input_data, input_dims, left_paddings, right_paddings, output_data, + output_dims, pad_value); +} + +template <typename T> +inline void Slice(const T* input_data, const Dims<4>& input_dims, + const std::vector<int>& begin, const std::vector<int>& size, + T* output_data, const Dims<4>& output_dims) { + tflite::SliceParams op_params; + op_params.begin_count = 4; + op_params.size_count = 4; + for (int i = 0; i < 4; ++i) { + op_params.begin[i] = begin[3 - i]; + op_params.size[i] = size[3 - i]; + } + + Slice(op_params, DimsToShape(input_dims), input_data, + DimsToShape(output_dims), output_data); +} + +template <typename T> +void TensorFlowMinimum(const T* input1_data, const Dims<4>& input1_dims, + const T* input2_data, T* output_data, + const Dims<4>& output_dims) { + Minimum(DimsToShape(input1_dims), input1_data, input2_data, + DimsToShape(output_dims), output_data); +} + +template <typename T> +void TensorFlowMaximum(const T* input1_data, const Dims<4>& input1_dims, + const T* input2_data, T* output_data, + const Dims<4>& output_dims) { + Maximum(DimsToShape(input1_dims), input1_data, input2_data, + DimsToShape(output_dims), output_data); +} + +} // namespace optimized_ops +} // namespace tflite +#endif // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_LEGACY_OPTIMIZED_OPS_H_ diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/multithreaded_conv.h b/tensorflow/contrib/lite/kernels/internal/optimized/multithreaded_conv.h index 27d9224512..4139cf4eba 100644 --- a/tensorflow/contrib/lite/kernels/internal/optimized/multithreaded_conv.h +++ b/tensorflow/contrib/lite/kernels/internal/optimized/multithreaded_conv.h @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_MULTITHREAD_CONV -#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_MULTITHREAD_CONV +#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_MULTITHREADED_CONV_H_ +#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_MULTITHREADED_CONV_H_ #include <assert.h> #include <stdint.h> @@ -26,7 +26,7 @@ limitations under the License. #include <tuple> #include <type_traits> -#include "tensorflow/contrib/lite/builtin_op_data.h" +#include "tensorflow/contrib/lite/c/builtin_op_data.h" #include "tensorflow/contrib/lite/kernels/internal/common.h" #include "tensorflow/contrib/lite/kernels/internal/optimized/eigen_spatial_convolutions.h" #include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h" @@ -35,35 +35,6 @@ limitations under the License. namespace tflite { namespace multithreaded_ops { -class EigenThreadPoolWrapper : public Eigen::ThreadPoolInterface { - public: - explicit EigenThreadPoolWrapper(Eigen::ThreadPool* pool) : pool_(pool) {} - ~EigenThreadPoolWrapper() override {} - - void Schedule(std::function<void()> fn) override { - pool_->Schedule(std::move(fn)); - } - int NumThreads() const override { return pool_->NumThreads(); } - int CurrentThreadId() const override { return pool_->CurrentThreadId(); } - - private: - Eigen::ThreadPool* pool_ = nullptr; -}; - -// We have a single global threadpool for all convolution operations. This means -// that inferences started from different threads may block each other, but -// since the underlying resource of CPU cores should be consumed by the -// operations anyway, it shouldn't affect overall performance. -const Eigen::ThreadPoolDevice& GetThreadPoolDevice() { - const int thread_count = 4; - static Eigen::ThreadPool* tp = new Eigen::ThreadPool(thread_count); - static EigenThreadPoolWrapper* thread_pool_wrapper = - new EigenThreadPoolWrapper(tp); - static Eigen::ThreadPoolDevice* device = - new Eigen::ThreadPoolDevice(thread_pool_wrapper, thread_count); - return *device; -} - // Shorthands for the types we need when interfacing with the EigenTensor // library. typedef Eigen::TensorMap< @@ -98,13 +69,13 @@ struct MatMulConvFunctor { template <class T> class EigenTensorConvFunctor { private: - Eigen::PaddingType TfLitePadding2EigenPadding(TfLitePadding padding) { + Eigen::PaddingType RuntimePadding2EigenPadding(PaddingType padding) { switch (padding) { - case kTfLitePaddingValid: + case PaddingType::kValid: return Eigen::PADDING_VALID; - case kTfLitePaddingSame: + case PaddingType::kSame: return Eigen::PADDING_SAME; - case kTfLitePaddingUnknown: + case PaddingType::kNone: assert(false); // should never get here. 
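        // Fall through to a defined value so release (NDEBUG) builds, where
        // the assert above compiles away, still return something sensible.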
return Eigen::PADDING_VALID; } @@ -113,14 +84,13 @@ class EigenTensorConvFunctor { } public: - void operator()(const T* input_data, T* im2col_buffer, int input_batches, - int input_height, int input_width, int input_depth, - const T* filter_data, int filter_height, int filter_width, - int filter_count, int stride_rows, int stride_cols, - int pad_width, int pad_height, TfLitePadding padding, - T* output_data, int output_height, int output_width) { - const Eigen::ThreadPoolDevice& device = GetThreadPoolDevice(); - + void operator()(const Eigen::ThreadPoolDevice& device, const T* input_data, + T* im2col_buffer, int input_batches, int input_height, + int input_width, int input_depth, const T* filter_data, + int filter_height, int filter_width, int filter_count, + int stride_rows, int stride_cols, int pad_width, + int pad_height, PaddingType padding, T* output_data, + int output_height, int output_width) { const bool is_1x1_kernel = (filter_height == 1 && filter_width == 1 && stride_rows == 1 && stride_cols == 1); if (is_1x1_kernel) { @@ -143,8 +113,8 @@ class EigenTensorConvFunctor { filter_width * filter_height * input_depth; Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1> dim_pair; dim_pair[0] = Eigen::IndexPair<Eigen::DenseIndex>(1, 0); - EigenMatrix output(output_data, 1, filter_count); - ConstEigenMatrix input(input_data, 1, k); + EigenMatrix output(output_data, input_batches, filter_count); + ConstEigenMatrix input(input_data, input_batches, k); ConstEigenMatrix filter(filter_data, k, filter_count); MatMulConvFunctor<Eigen::ThreadPoolDevice, T>()(device, output, input, filter, dim_pair); @@ -157,40 +127,51 @@ class EigenTensorConvFunctor { input_depth, filter_count); output.device(device) = Eigen::SpatialConvolution(input, filter, stride_cols, stride_rows, - TfLitePadding2EigenPadding(padding)); + RuntimePadding2EigenPadding(padding)); } } }; -inline void Conv(const float* input_data, const Dims<4>& input_dims, - const float* filter_data, const Dims<4>& filter_dims, - const float* bias_data, const Dims<4>& bias_dims, - int stride_width, int stride_height, int pad_width, - int pad_height, TfLitePadding padding, - float output_activation_min, float output_activation_max, - float* output_data, const Dims<4>& output_dims, - float* im2col_data, const Dims<4>& im2col_dims) { - const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); - const int input_depth = MatchingArraySize(input_dims, 0, filter_dims, 0); - const int output_depth = MatchingArraySize(filter_dims, 3, output_dims, 0); - const int input_height = ArraySize(input_dims, 2); - const int input_width = ArraySize(input_dims, 1); - const int filter_height = ArraySize(filter_dims, 2); - const int filter_width = ArraySize(filter_dims, 1); - const int output_height = ArraySize(output_dims, 2); - const int output_width = ArraySize(output_dims, 1); +inline void Conv(const Eigen::ThreadPoolDevice& device, + const ConvParams& params, const RuntimeShape& input_shape, + const float* input_data, const RuntimeShape& filter_shape, + const float* filter_data, const RuntimeShape& bias_shape, + const float* bias_data, const RuntimeShape& output_shape, + float* output_data, const RuntimeShape& im2col_shape, + float* im2col_data) { + const int stride_width = params.stride_width; + const int stride_height = params.stride_height; + const PaddingType padding = params.padding_type; + const int pad_width = params.padding_values.width; + const int pad_height = params.padding_values.height; + const float output_activation_min = 
params.float_activation_min; + const float output_activation_max = params.float_activation_max; + TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4); + TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4); + TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4); + + const int batches = MatchingDim(input_shape, 0, output_shape, 0); + const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3); + const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3); + const int input_height = input_shape.Dims(1); + const int input_width = input_shape.Dims(2); + const int filter_height = filter_shape.Dims(1); + const int filter_width = filter_shape.Dims(2); + const int output_height = output_shape.Dims(1); + const int output_width = output_shape.Dims(2); EigenTensorConvFunctor<float> conv_functor; - conv_functor(input_data, im2col_data, batches, input_height, input_width, - input_depth, filter_data, filter_height, filter_width, - output_depth, stride_height, stride_width, pad_height, pad_width, - padding, output_data, output_height, output_width); + conv_functor(device, input_data, im2col_data, batches, input_height, + input_width, input_depth, filter_data, filter_height, + filter_width, output_depth, stride_height, stride_width, + pad_height, pad_width, padding, output_data, output_height, + output_width); optimized_ops::AddBiasAndEvalActivationFunction( - bias_data, bias_dims, output_data, output_dims, output_activation_min, - output_activation_max); + output_activation_min, output_activation_max, bias_shape, bias_data, + output_shape, output_data); } } // namespace multithreaded_ops } // namespace tflite -#endif // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_MULTITHREAD_CONV +#endif // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_MULTITHREADED_CONV_H_ diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/neon_tensor_utils.cc b/tensorflow/contrib/lite/kernels/internal/optimized/neon_tensor_utils.cc index 08f7cfa5a5..36c15dbc57 100644 --- a/tensorflow/contrib/lite/kernels/internal/optimized/neon_tensor_utils.cc +++ b/tensorflow/contrib/lite/kernels/internal/optimized/neon_tensor_utils.cc @@ -15,7 +15,7 @@ limitations under the License. #include <stdlib.h> #include <string.h> -#include "tensorflow/contrib/lite/builtin_op_data.h" +#include "tensorflow/contrib/lite/c/builtin_op_data.h" #include "tensorflow/contrib/lite/kernels/activation_functor.h" #include "tensorflow/contrib/lite/kernels/internal/common.h" #include "tensorflow/contrib/lite/kernels/internal/compatibility.h" @@ -55,83 +55,33 @@ void NeonMatrixBatchVectorMultiplyAccumulate(const float* matrix, int m_rows, const int postamble_start = m_cols - (m_cols & (kFloatWeightsPerNeonLane - 1)); - // The arrays used to cache the vector. - void* aligned_vector_cache_free = nullptr; - float32x4_t* vector_cache_float32x4 = - reinterpret_cast<float32x4_t*>(aligned_alloc( - sizeof(float32x4_t), (postamble_start >> 2) * sizeof(float32x4_t), - &aligned_vector_cache_free)); - - const int kUnrollSize = 2; for (int b = 0; b < n_batch; b++) { float* result_in_batch = result + b * m_rows * result_stride; const float* vector_in_batch = vector + b * m_cols; + const float* matrix_row = matrix; - const float* matrix_ptr0 = matrix; - // If there is only 1 row, we don't want to assign an illegal pointer. - const float* matrix_ptr1 = nullptr; - if (m_rows > 1) { - matrix_ptr1 = matrix + m_cols; - } - - // Cache the vector. 
- for (int c = 0; c < postamble_start; c += kFloatWeightsPerNeonLane) { - vector_cache_float32x4[c >> 2] = vld1q_f32(vector_in_batch + c); - } - - // Main matrix by vector multiplication loop, which handles two rows of - // matrix by vector multiplication. - for (int r = 0; r < (m_rows & ~(kUnrollSize - 1)); r += kUnrollSize) { - float32x4_t acc0_32x4 = vmovq_n_f32(0.0); - float32x4_t acc1_32x4 = vmovq_n_f32(0.0); + // Main matrix by vector multiplication loop + for (int r = 0; r < m_rows; r++) { + float32x4_t acc_32x4 = vmovq_n_f32(0.0); for (int c = 0; c < postamble_start; c += kFloatWeightsPerNeonLane) { - float32x4_t temp = vector_cache_float32x4[c >> 2]; - // Load 4 float values from vector1 and vector2 and accumulator. - float32x4_t v0_f32x4 = vld1q_f32(matrix_ptr0 + c); - float32x4_t v1_f32x4 = vld1q_f32(matrix_ptr1 + c); - // Vector multiply-accumulate 4 float - acc0_32x4 = vmlaq_f32(acc0_32x4, v0_f32x4, temp); - acc1_32x4 = vmlaq_f32(acc1_32x4, v1_f32x4, temp); + // Load 4 float values from vector and matrix row. + float32x4_t vector_f32x4 = vld1q_f32(vector_in_batch + c); + float32x4_t matrix_f32x4 = vld1q_f32(matrix_row + c); + // Multiply the vector and matrix row and add to accumulator. + acc_32x4 = vmlaq_f32(acc_32x4, matrix_f32x4, vector_f32x4); } // Add the 4 intermediate sum values to get the final dot-prod value for // this column. *result_in_batch += - (vgetq_lane_f32(acc0_32x4, 0) + vgetq_lane_f32(acc0_32x4, 1) + - vgetq_lane_f32(acc0_32x4, 2) + vgetq_lane_f32(acc0_32x4, 3)); - *(result_in_batch + result_stride) += - (vgetq_lane_f32(acc1_32x4, 0) + vgetq_lane_f32(acc1_32x4, 1) + - vgetq_lane_f32(acc1_32x4, 2) + vgetq_lane_f32(acc1_32x4, 3)); + (vgetq_lane_f32(acc_32x4, 0) + vgetq_lane_f32(acc_32x4, 1) + + vgetq_lane_f32(acc_32x4, 2) + vgetq_lane_f32(acc_32x4, 3)); for (int c = postamble_start; c < m_cols; c++) { - *result_in_batch += matrix_ptr0[c] * vector_in_batch[c]; - *(result_in_batch + result_stride) += - matrix_ptr1[c] * vector_in_batch[c]; - } - matrix_ptr0 += kUnrollSize * m_cols; - matrix_ptr1 += kUnrollSize * m_cols; - result_in_batch += kUnrollSize * result_stride; - } - for (int r = (m_rows & ~(kUnrollSize - 1)); r < m_rows; r++) { - float32x4_t acc0_32x4 = vmovq_n_f32(0.0); - for (int c = 0; c < postamble_start; c += kFloatWeightsPerNeonLane) { - float32x4_t temp = vector_cache_float32x4[c >> 2]; - // Load 4 float values from vector1 and vector2 and accumulator. - float32x4_t v0_f32x4 = vld1q_f32(matrix_ptr0 + c); - // Vector multiply-accumulate 4 float - acc0_32x4 = vmlaq_f32(acc0_32x4, v0_f32x4, temp); + *result_in_batch += matrix_row[c] * vector_in_batch[c]; } - // Add the 4 intermediate sum values to get the final dot-prod value for - // this column. - *result_in_batch += - (vgetq_lane_f32(acc0_32x4, 0) + vgetq_lane_f32(acc0_32x4, 1) + - vgetq_lane_f32(acc0_32x4, 2) + vgetq_lane_f32(acc0_32x4, 3)); - for (int c = postamble_start; c < m_cols; c++) { - *result_in_batch += matrix_ptr0[c] * vector_in_batch[c]; - } - matrix_ptr0 += m_cols; + matrix_row += m_cols; result_in_batch += result_stride; } } - free(aligned_vector_cache_free); } void NeonMatrixBatchVectorMultiplyAccumulate( @@ -162,7 +112,7 @@ void NeonMatrixBatchVectorMultiplyAccumulate( int batch, row, col; for (batch = 0; batch < n_batch; ++batch) { - const float batch_scaling_factor_inv = 1.0 / scaling_factors[batch]; + const float batch_scaling_factor = scaling_factors[batch]; // Copy the vector data to an aligned vector. 
memcpy(aligned_vec, vectors + batch * m_cols, sizeof(int8) * m_cols); // Compute dot-product for every column. @@ -232,7 +182,7 @@ void NeonMatrixBatchVectorMultiplyAccumulate( int32 neon_sum = vgetq_lane_s64(pairwiseAdded, 0) + vgetq_lane_s64(pairwiseAdded, 1); - *result += ((neon_sum + postable_sum) * batch_scaling_factor_inv); + *result += ((neon_sum + postable_sum) * batch_scaling_factor); } // for row } // for batch @@ -286,6 +236,35 @@ void NeonVectorVectorCwiseProductAccumulate(const float* vector1, } } +void NeonVectorBatchVectorCwiseProduct(const float* vector, int v_size, + const float* batch_vector, int n_batch, + float* result) { + // If v_size is not divisible by kWeightsPerNeonLane, we cannot use the main + // vectorized loop, and we need to process sequentially. postamble_start shows + // the start index where this should happen. + const int postamble_start = + v_size - (v_size & (kFloatWeightsPerNeonLane - 1)); + + for (int b = 0; b < n_batch; b++) { + for (int v = 0; v < postamble_start; v += kFloatWeightsPerNeonLane) { + // Load from memory to vectors. + float32x4_t batch_vector_f32x4 = vld1q_f32(batch_vector + v); + float32x4_t vector_f32x4 = vld1q_f32(vector + v); + // Multiply. + float32x4_t result_f32x4 = vmulq_f32(batch_vector_f32x4, vector_f32x4); + // Store. + vst1q_f32(result + v, result_f32x4); + } + // Postamble loop + for (int v = postamble_start; v < v_size; v++) { + result[v] = vector[v] * batch_vector[v]; + } + // Update the pointers. + result += v_size; + batch_vector += v_size; + } +} + void NeonVectorBatchVectorCwiseProductAccumulate(const float* vector, int v_size, const float* batch_vector, @@ -296,17 +275,6 @@ void NeonVectorBatchVectorCwiseProductAccumulate(const float* vector, const int postamble_start = v_size - (v_size & (kFloatWeightsPerNeonLane - 1)); - // The arrays used to cache the vector. - void* aligned_vector_cache_free = nullptr; - float32x4_t* vector_cache_float32x4 = - reinterpret_cast<float32x4_t*>(aligned_alloc( - sizeof(float32x4_t), (postamble_start >> 2) * sizeof(float32x4_t), - &aligned_vector_cache_free)); - - for (int v = 0; v < postamble_start; v += kFloatWeightsPerNeonLane) { - vector_cache_float32x4[v >> 2] = vld1q_f32(vector + v); - } - float* result_ptr = result; const float* batch_vector_ptr = batch_vector; for (int b = 0; b < n_batch; b++) { @@ -314,9 +282,9 @@ void NeonVectorBatchVectorCwiseProductAccumulate(const float* vector, // Load from memory to vectors. float32x4_t result_f32x4 = vld1q_f32(result_ptr + v); float32x4_t batch_vector_f32x4 = vld1q_f32(batch_vector_ptr + v); + float32x4_t vector_f32x4 = vld1q_f32(vector + v); // Multiply-accumulate. - result_f32x4 = vmlaq_f32(result_f32x4, batch_vector_f32x4, - vector_cache_float32x4[v >> 2]); + result_f32x4 = vmlaq_f32(result_f32x4, batch_vector_f32x4, vector_f32x4); // Store. vst1q_f32(result_ptr + v, result_f32x4); } @@ -328,7 +296,6 @@ void NeonVectorBatchVectorCwiseProductAccumulate(const float* vector, result_ptr += v_size; batch_vector_ptr += v_size; } - free(aligned_vector_cache_free); } void NeonSub1Vector(const float* vector, int v_size, float* result) { @@ -352,6 +319,30 @@ void NeonSub1Vector(const float* vector, int v_size, float* result) { } } +bool NeonIsZeroVector(const float* vector, int v_size) { + // If v_size is not divisible by kFloatWeightsPerNeonLane, we cannot + // use the main vectorized loop, and we need to process sequentially. + // postamble_start shows the start index where this should happen. 
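The NeonVectorBatchVectorCwiseProduct kernel added above has this scalar shape (a minimal sketch, assuming the same layout of n_batch stacked batch vectors; the name is illustrative):

void ScalarVectorBatchVectorCwiseProduct(const float* vector, int v_size,
                                         const float* batch_vector,
                                         int n_batch, float* result) {
  for (int b = 0; b < n_batch; b++) {
    for (int v = 0; v < v_size; v++) {
      // Elementwise product of the shared vector with this batch's vector.
      result[v] = vector[v] * batch_vector[v];
    }
    // Advance to the next batch.
    result += v_size;
    batch_vector += v_size;
  }
}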
+ const int postamble_start = + v_size - (v_size & (kFloatWeightsPerNeonLane - 1)); + + const float32x4_t zero_x4_float = vmovq_n_f32(0.0f); + for (int v = 0; v < postamble_start; v += kFloatWeightsPerNeonLane) { + const float32x4_t i_x4_float = vld1q_f32(vector + v); + uint32x4_t cmp_result = vceqq_f32(i_x4_float, zero_x4_float); + if (vgetq_lane_u32(cmp_result, 0) == 0) return false; + if (vgetq_lane_u32(cmp_result, 1) == 0) return false; + if (vgetq_lane_u32(cmp_result, 2) == 0) return false; + if (vgetq_lane_u32(cmp_result, 3) == 0) return false; + } + + // Postamble loop + for (int v = postamble_start; v < v_size; ++v) { + if (vector[v] != 0.0) return false; + } + return true; +} + void NeonClipVector(const float* vector, int v_size, float abs_limit, float* result) { // If v_size is not divisible by kWeightsPerNeonLane, we cannot use the main @@ -380,6 +371,77 @@ void NeonClipVector(const float* vector, int v_size, float abs_limit, } } +void NeonVectorScalarMultiply(const int8_t* vector, const int v_size, + const float scale, float* result) { + // Here the assumption is that each buffer is 4-byte aligned. + const int kWeightsPerUint32 = 4; + TFLITE_CHECK_EQ((intptr_t)(&vector[0]) & (kWeightsPerUint32 - 1), 0); + // If v_size is not divisible by kWeightsPerNeonLane, we cannot use the main + // vectorized loop, and we need to process sequentially. postamble_start shows + // the start index where this should happen. + const int kWeightsPerNeonLane = 16; + const int postamble_start = v_size - (v_size & (kWeightsPerNeonLane - 1)); + + // Create a vector of 4 floats with the scale value. + const float32x4_t scale_f32x4 = vdupq_n_f32(scale); + int v = 0; + for (; v < postamble_start; v += kWeightsPerNeonLane) { + // Load int8 values, sixteen at a time. + const int8x16_t v_i8x16 = vld1q_s8(vector + v); + // Split it into two components of size eight. + const int8x8_t v0_i8x8 = vget_low_s8(v_i8x16); + const int8x8_t v1_i8x8 = vget_high_s8(v_i8x16); + // Convert both components to int16 first. + const int16x8_t v0_i16x8 = vmovl_s8(v0_i8x8); + const int16x8_t v1_i16x8 = vmovl_s8(v1_i8x8); + // Split each of them into two components each. + const int16x4_t v0_i16x4 = vget_low_s16(v0_i16x8); + const int16x4_t v1_i16x4 = vget_high_s16(v0_i16x8); + const int16x4_t v2_i16x4 = vget_low_s16(v1_i16x8); + const int16x4_t v3_i16x4 = vget_high_s16(v1_i16x8); + // Convert these to int32 and then to float. + float32x4_t v0_f32x4 = vcvtq_f32_s32(vmovl_s16(v0_i16x4)); + float32x4_t v1_f32x4 = vcvtq_f32_s32(vmovl_s16(v1_i16x4)); + float32x4_t v2_f32x4 = vcvtq_f32_s32(vmovl_s16(v2_i16x4)); + float32x4_t v3_f32x4 = vcvtq_f32_s32(vmovl_s16(v3_i16x4)); + // Vector multiply four floats at a time. + v0_f32x4 = vmulq_f32(v0_f32x4, scale_f32x4); + v1_f32x4 = vmulq_f32(v1_f32x4, scale_f32x4); + v2_f32x4 = vmulq_f32(v2_f32x4, scale_f32x4); + v3_f32x4 = vmulq_f32(v3_f32x4, scale_f32x4); + // Store the results. + vst1q_f32(result + v, v0_f32x4); + vst1q_f32(result + v + 4, v1_f32x4); + vst1q_f32(result + v + 8, v2_f32x4); + vst1q_f32(result + v + 12, v3_f32x4); + } + + if (v_size - postamble_start >= (kWeightsPerNeonLane >> 1)) { + // Load eight int8 values, if there is at least eight remaining. + const int8x8_t v_i8x8 = vld1_s8(vector + v); + // Convert them to int16 first. + const int16x8_t v_i16x8 = vmovl_s8(v_i8x8); + // Split it into two components. + const int16x4_t v0_i16x4 = vget_low_s16(v_i16x8); + const int16x4_t v1_i16x4 = vget_high_s16(v_i16x8); + // Convert the components two floats. 
+ float32x4_t v0_f32x4 = vcvtq_f32_s32(vmovl_s16(v0_i16x4)); + float32x4_t v1_f32x4 = vcvtq_f32_s32(vmovl_s16(v1_i16x4)); + // Vector multiply four floats at a time. + v0_f32x4 = vmulq_f32(v0_f32x4, scale_f32x4); + v1_f32x4 = vmulq_f32(v1_f32x4, scale_f32x4); + // Store the results. + vst1q_f32(result + v, v0_f32x4); + vst1q_f32(result + v + 4, v1_f32x4); + v += (kWeightsPerNeonLane >> 1); + } + + // Postamble loop. + for (; v < v_size; v++) { + result[v] = scale * vector[v]; + } +} + void NeonSymmetricQuantizeFloats(const float* values, const int size, int8_t* quantized_values, float* min, float* max, float* scaling_factor) { @@ -394,13 +456,14 @@ void NeonSymmetricQuantizeFloats(const float* values, const int size, *scaling_factor = 1; return; } - *scaling_factor = kScale / range; + *scaling_factor = range / kScale; + const float scaling_factor_inv = kScale / range; const int postamble_start = size - (size & (2 * kFloatWeightsPerNeonLane - 1)); // Vectorized constants. - const float32x4_t q_factor_f32x4 = vmovq_n_f32(*scaling_factor); + const float32x4_t q_factor_f32x4 = vmovq_n_f32(scaling_factor_inv); const float32x4_t point5_f32x4 = vmovq_n_f32(0.5); const float32x4_t zero_f32x4 = vmovq_n_f32(0.0); const int32x4_t scale_i32x4 = vmovq_n_s32(kScale); @@ -452,7 +515,7 @@ void NeonSymmetricQuantizeFloats(const float* values, const int size, for (int i = postamble_start; i < size; ++i) { const int32 quantized_value = - static_cast<int32>(TfLiteRound(*scaling_factor * values[i])); + static_cast<int32>(TfLiteRound(scaling_factor_inv * values[i])); quantized_values[i] = std::min(kScale, std::max(-kScale, quantized_value)); } } diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/neon_tensor_utils.h b/tensorflow/contrib/lite/kernels/internal/optimized/neon_tensor_utils.h index 9e60d0657b..630a6bbf29 100644 --- a/tensorflow/contrib/lite/kernels/internal/optimized/neon_tensor_utils.h +++ b/tensorflow/contrib/lite/kernels/internal/optimized/neon_tensor_utils.h @@ -17,7 +17,7 @@ limitations under the License. // TODO(ghodrat): Remove this header file and the dependency to internal data // structure. -#include "tensorflow/contrib/lite/builtin_op_data.h" +#include "tensorflow/contrib/lite/c/builtin_op_data.h" #include "tensorflow/contrib/lite/kernels/internal/optimized/cpu_check.h" #include "tensorflow/contrib/lite/kernels/internal/optimized/tensor_utils_impl.h" @@ -52,6 +52,13 @@ void VectorVectorCwiseProductAccumulate(const float* vector1, result); } +void VectorBatchVectorCwiseProduct(const float* vector, int v_size, + const float* batch_vector, int n_batch, + float* result) { + NEON_OR_PORTABLE(VectorBatchVectorCwiseProduct, vector, v_size, batch_vector, + n_batch, result); +} + void VectorBatchVectorCwiseProductAccumulate(const float* vector, int v_size, const float* batch_vector, int n_batch, float* result) { @@ -72,6 +79,11 @@ void BatchVectorBatchVectorDotProduct(const float* vector1, n_batch, result, result_stride); } +void VectorBatchVectorAdd(const float* vector, int v_size, int n_batch, + float* batch_vector) { + PortableVectorBatchVectorAdd(vector, v_size, n_batch, batch_vector); +} + void VectorBatchVectorAssign(const float* vector, int v_size, int n_batch, float* batch_vector) { PortableVectorBatchVectorAssign(vector, v_size, n_batch, batch_vector); @@ -100,16 +112,25 @@ void ZeroVector(float* vector, int v_size) { float Clip(float f, float abs_limit) { return PortableClip(f, abs_limit); } +// Check if all entries of a vector are zero. 
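On non-NEON builds the NEON_OR_PORTABLE dispatch below selects the portable fallback (presumably PortableIsZeroVector from portable_tensor_utils); a minimal scalar sketch of the assumed behavior:

inline bool ScalarIsZeroVector(const float* vector, int v_size) {
  for (int v = 0; v < v_size; v++) {
    // Any non-zero entry means the vector is not all-zero.
    if (vector[v] != 0.0f) return false;
  }
  return true;
}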
+bool IsZeroVector(const float* vector, int v_size) { + return NEON_OR_PORTABLE(IsZeroVector, vector, v_size); +} + +void VectorScalarMultiply(const int8_t* vector, int v_size, float scale, + float* result) { + NEON_OR_PORTABLE(VectorScalarMultiply, vector, v_size, scale, result); +} void ClipVector(const float* vector, int v_size, float abs_limit, float* result) { NEON_OR_PORTABLE(ClipVector, vector, v_size, abs_limit, result); } void SymmetricQuantizeFloats(const float* values, const int size, - int8_t* quantized_values, float* min, float* max, - float* scaling_factor) { - NEON_OR_PORTABLE(SymmetricQuantizeFloats, values, size, quantized_values, min, - max, scaling_factor); + int8_t* quantized_values, float* min_value, + float* max_value, float* scaling_factor) { + NEON_OR_PORTABLE(SymmetricQuantizeFloats, values, size, quantized_values, + min_value, max_value, scaling_factor); } void VectorShiftLeft(float* vector, int v_size, float shift_value) { @@ -122,6 +143,13 @@ void ReductionSumVector(const float* input_vector, float* output_vector, reduction_size); } +void MeanStddevNormalization(const float* input_vector, float* output_vector, + int v_size, int n_batch, + float normalization_epsilon) { + PortableMeanStddevNormalization(input_vector, output_vector, v_size, n_batch, + normalization_epsilon); +} + } // namespace tensor_utils } // namespace tflite diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h index 64ba5e62f6..77f84e0c1c 100644 --- a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h +++ b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h @@ -1,4 +1,4 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_OPS_H_ -#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_OPS_H_ +#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_OPTIMIZED_OPS_H_ +#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_OPTIMIZED_OPS_H_ #include <assert.h> #include <stdint.h> @@ -34,22 +34,58 @@ limitations under the License. 
#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h" #include "tensorflow/contrib/lite/kernels/internal/round.h" #include "tensorflow/contrib/lite/kernels/internal/strided_slice_logic.h" +#include "tensorflow/contrib/lite/kernels/internal/tensor_utils.h" #include "tensorflow/contrib/lite/kernels/internal/types.h" namespace tflite { namespace optimized_ops { // Unoptimized reference ops: -using reference_ops::BroadcastGreater; -using reference_ops::BroadcastGreaterEqual; -using reference_ops::BroadcastLess; -using reference_ops::BroadcastLessEqual; +using reference_ops::ArgMax; +using reference_ops::ArgMinMax; +using reference_ops::Broadcast4DSlowGreater; +using reference_ops::Broadcast4DSlowGreaterEqual; +using reference_ops::Broadcast4DSlowGreaterEqualWithScaling; +using reference_ops::Broadcast4DSlowGreaterWithScaling; +using reference_ops::Broadcast4DSlowLess; +using reference_ops::Broadcast4DSlowLessEqual; +using reference_ops::Broadcast4DSlowLessEqualWithScaling; +using reference_ops::Broadcast4DSlowLessWithScaling; +using reference_ops::BroadcastAdd4DSlow; +using reference_ops::BroadcastMul4DSlow; +using reference_ops::BroadcastSub4DSlow; +using reference_ops::Concatenation; +using reference_ops::ConcatenationWithScaling; +using reference_ops::DepthConcatenation; +using reference_ops::Dequantize; +using reference_ops::Div; +using reference_ops::FakeQuant; +using reference_ops::Gather; using reference_ops::Greater; using reference_ops::GreaterEqual; +using reference_ops::GreaterEqualWithScaling; +using reference_ops::GreaterWithScaling; using reference_ops::Less; using reference_ops::LessEqual; +using reference_ops::LessEqualWithScaling; +using reference_ops::LessWithScaling; +using reference_ops::Mean; using reference_ops::RankOneSelect; +using reference_ops::Relu1; +using reference_ops::Relu6; +using reference_ops::ReluX; using reference_ops::Select; +using reference_ops::SpaceToBatchND; +using reference_ops::Split; +using reference_ops::StridedSlice; +using reference_ops::Transpose; + +// TODO(b/80247582) Remove this constant. +// This will be phased out as the shifts are revised with more thought. Use of a +// constant enables us to track progress on this work. +// +// Used to convert from old-style shifts (right) to new-style (left). +static constexpr int kReverseShift = -1; // Make a local VectorMap typedef allowing to map a float array // as a Eigen vector expression. 
The std::conditional here is to @@ -65,9 +101,9 @@ using VectorMap = typename std::conditional< Eigen::Dynamic, 1>>, Eigen::Map<Eigen::Matrix<Scalar, Eigen::Dynamic, 1>>>::type; -template <typename Scalar, int N> -VectorMap<Scalar> MapAsVector(Scalar* data, const Dims<N>& dims) { - const int size = FlatSize(dims); +template <typename Scalar> +VectorMap<Scalar> MapAsVector(Scalar* data, const RuntimeShape& shape) { + const int size = shape.FlatSize(); return VectorMap<Scalar>(data, size, 1); } @@ -81,25 +117,20 @@ using MatrixMap = typename std::conditional< Eigen::Dynamic, Eigen::Dynamic>>, Eigen::Map<Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic>>>::type; -template <typename Scalar, int N> -MatrixMap<Scalar> MapAsMatrixWithFirstDimAsRows(Scalar* data, - const Dims<N>& dims) { - const int rows = dims.sizes[0]; - int cols = 1; - for (int d = 1; d < N; d++) { - cols *= dims.sizes[d]; - } +template <typename Scalar> +MatrixMap<Scalar> MapAsMatrixWithLastDimAsRows(Scalar* data, + const RuntimeShape& shape) { + const int dims_count = shape.DimensionsCount(); + const int rows = shape.Dims(dims_count - 1); + const int cols = FlatSizeSkipDim(shape, dims_count - 1); return MatrixMap<Scalar>(data, rows, cols); } -template <typename Scalar, int N> -MatrixMap<Scalar> MapAsMatrixWithLastDimAsCols(Scalar* data, - const Dims<N>& dims) { - const int cols = dims.sizes[N - 1]; - int rows = 1; - for (int d = 0; d < N - 1; d++) { - rows *= dims.sizes[d]; - } +template <typename Scalar> +MatrixMap<Scalar> MapAsMatrixWithFirstDimAsCols(Scalar* data, + const RuntimeShape& shape) { + const int cols = shape.Dims(0); + const int rows = FlatSizeSkipDim(shape, 0); return MatrixMap<Scalar>(data, rows, cols); } @@ -110,147 +141,88 @@ using ArrayMap = typename std::conditional< Eigen::Dynamic, Eigen::Dynamic>>, Eigen::Map<Eigen::Array<Scalar, Eigen::Dynamic, Eigen::Dynamic>>>::type; -template <typename Scalar, int N> -ArrayMap<Scalar> MapAsArrayWithFirstDimAsRows(Scalar* data, - const Dims<N>& dims) { - const int rows = dims.sizes[0]; - int cols = 1; - for (int d = 1; d < N; d++) { - cols *= dims.sizes[d]; - } +template <typename Scalar> +ArrayMap<Scalar> MapAsArrayWithLastDimAsRows(Scalar* data, + const RuntimeShape& shape) { + const int dims_count = shape.DimensionsCount(); + const int rows = shape.Dims(dims_count - 1); + const int cols = FlatSizeSkipDim(shape, dims_count - 1); return ArrayMap<Scalar>(data, rows, cols); } +// Copied from tensorflow/core/framework/tensor_types.h +template <typename T, int NDIMS = 1, typename IndexType = Eigen::DenseIndex> +struct TTypes { + // Rank-1 tensor (vector) of scalar type T. + typedef Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, IndexType>, + Eigen::Aligned> + Flat; + typedef Eigen::TensorMap< + Eigen::Tensor<const T, 2, Eigen::RowMajor, IndexType>> + UnalignedConstMatrix; +}; + // TODO(b/62193649): this function is only needed as long // as we have the --variable_batch hack. 
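As a usage sketch of the new RuntimeShape-based mapping helpers above (values are illustrative; `data` is assumed to hold at least 24 floats):

void MapAsMatrixExample(float* data) {
  // A {2, 3, 4} shape mapped with the last dimension as rows yields a 4 x 6
  // Eigen matrix view: rows = Dims(2) = 4, cols = FlatSizeSkipDim(shape, 2) = 6.
  const RuntimeShape shape({2, 3, 4});
  auto matrix = MapAsMatrixWithLastDimAsRows(data, shape);
  TFLITE_DCHECK_EQ(matrix.rows(), 4);
  TFLITE_DCHECK_EQ(matrix.cols(), 6);
}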
-template <typename Scalar, int N> +template <typename Scalar> MatrixMap<Scalar> MapAsMatrixWithGivenNumberOfRows(Scalar* data, - const Dims<N>& dims, + const RuntimeShape& shape, int rows) { - int cols = 1; - bool matched_rows = false; - for (int d = 0; d < N; d++) { - cols *= dims.sizes[d]; - if (cols == rows) { - matched_rows = true; - cols = 1; - } - } - TFLITE_DCHECK(matched_rows); + const int flatsize = shape.FlatSize(); + TFLITE_DCHECK_EQ(flatsize % rows, 0); + const int cols = flatsize / rows; return MatrixMap<Scalar>(data, rows, cols); } -// DO NOT USE THIS STRUCT FOR NEW FUNCTIONALITY BEYOND IMPLEMENTING ELEMENT-WISE -// BROADCASTING. -// -// NdArrayDesc<N> describes the shape and memory layout of an N-dimensional -// rectangular array of numbers. -// -// NdArrayDesc<N> is basically identical to Dims<N> defined in types.h. -// However, as Dims<N> is to be deprecated, this class exists as an adaptor -// to enable simple unoptimized implementations of element-wise broadcasting -// operations. -template <int N> -struct NdArrayDesc { - // The "extent" of each dimension. Indices along dimension d must be in the - // half-open interval [0, extents[d]). - int extents[N]; - - // The number of *elements* (not bytes) between consecutive indices of each - // dimension. - int strides[N]; -}; - -// DO NOT USE THIS FUNCTION FOR NEW FUNCTIONALITY BEYOND IMPLEMENTING -// ELEMENT-WISE BROADCASTING. -// -// Same as Offset(), except takes as NdArrayDesc<N> instead of Dims<N>. -inline int SubscriptToIndex(const NdArrayDesc<4>& desc, int i0, int i1, int i2, - int i3) { - TFLITE_DCHECK(i0 >= 0 && i0 < desc.extents[0]); - TFLITE_DCHECK(i1 >= 0 && i1 < desc.extents[1]); - TFLITE_DCHECK(i2 >= 0 && i2 < desc.extents[2]); - TFLITE_DCHECK(i3 >= 0 && i3 < desc.extents[3]); - return i0 * desc.strides[0] + i1 * desc.strides[1] + i2 * desc.strides[2] + - i3 * desc.strides[3]; -} - -// Given the dimensions of the operands for an element-wise binary broadcast, -// adjusts them so that they can be directly iterated over with simple loops. -// Returns the adjusted dims as instances of NdArrayDesc in 'desc0_out' and -// 'desc1_out'. 'desc0_out' and 'desc1_out' cannot be nullptr. -// -// This function assumes that the two input shapes are compatible up to -// broadcasting and the shorter one has already been prepended with 1s to be the -// same length. E.g., if shape0 is (1, 16, 16, 64) and shape1 is (1, 64), -// shape1 must already have been prepended to be (1, 1, 1, 64). Recall that -// Dims<N> refer to shapes in reverse order. In this case, input0_dims will be -// (64, 16, 16, 1) and input1_dims will be (64, 1, 1, 1). -// -// When two shapes are compatible up to broadcasting, for each dimension d, -// the input extents are either equal, or one of them is 1. -// -// This function performs the following for each dimension d: -// - If the extents are equal, then do nothing since the loop that walks over -// both of the input arrays is correct. -// - Otherwise, one (and only one) of the extents must be 1. Say extent0 is 1 -// and extent1 is e1. Then set extent0 to e1 and stride0 *to 0*. This allows -// array0 to be referenced *at any index* in dimension d and still access the -// same slice. -template <int N> -inline void NdArrayDescsForElementwiseBroadcast(const Dims<N>& input0_dims, - const Dims<N>& input1_dims, - NdArrayDesc<N>* desc0_out, - NdArrayDesc<N>* desc1_out) { - TFLITE_DCHECK(desc0_out != nullptr); - TFLITE_DCHECK(desc1_out != nullptr); - - // Copy dims to desc. 
- for (int i = 0; i < N; ++i) { - desc0_out->extents[i] = input0_dims.sizes[i]; - desc0_out->strides[i] = input0_dims.strides[i]; - desc1_out->extents[i] = input1_dims.sizes[i]; - desc1_out->strides[i] = input1_dims.strides[i]; - } - - // Walk over each dimension. If the extents are equal do nothing. - // Otherwise, set the desc with extent 1 to have extent equal to the other and - // stride 0. - for (int i = 0; i < N; ++i) { - const int extent0 = ArraySize(input0_dims, i); - const int extent1 = ArraySize(input1_dims, i); - if (extent0 != extent1) { - if (extent0 == 1) { - desc0_out->strides[i] = 0; - desc0_out->extents[i] = extent1; - } else { - TFLITE_DCHECK_EQ(extent1, 1); - desc1_out->strides[i] = 0; - desc1_out->extents[i] = extent0; - } - } - } -} - -inline bool AreSameDims(const Dims<4>& dims1, const Dims<4>& dims2) { - for (int i = 0; i < 4; i++) { - if (dims1.sizes[i] != dims2.sizes[i]) { - return false; - } - } - return true; -} - -inline void AddBiasAndEvalActivationFunction(const float* bias_data, - const Dims<4>& bias_dims, - float* array_data, - const Dims<4>& array_dims, - float output_activation_min, - float output_activation_max) { +// This is like the template-parameter version, except that the power-of-two is +// passed as a function parameter. The template version is to be preferred, +// since some target hardware optimizations depend on the range of the exponent. +template <typename IntegerType> +IntegerType SaturatingRoundingMultiplyByPOTParam(IntegerType x, int exponent) { + if (exponent == 0) { + return x; + } + using ScalarIntegerType = + typename gemmlowp::FixedPointRawTypeTraits<IntegerType>::ScalarRawType; + const IntegerType min = + gemmlowp::Dup<IntegerType>(std::numeric_limits<ScalarIntegerType>::min()); + const IntegerType max = + gemmlowp::Dup<IntegerType>(std::numeric_limits<ScalarIntegerType>::max()); + const int ScalarIntegerTypeBits = 8 * sizeof(ScalarIntegerType); + + const std::int32_t threshold = + ((1 << (ScalarIntegerTypeBits - 1 - exponent)) - 1); + const IntegerType positive_mask = + gemmlowp::MaskIfGreaterThan(x, gemmlowp::Dup<IntegerType>(threshold)); + const IntegerType negative_mask = + gemmlowp::MaskIfLessThan(x, gemmlowp::Dup<IntegerType>(-threshold)); + + IntegerType result = gemmlowp::ShiftLeft(x, exponent); + result = gemmlowp::SelectUsingMask(positive_mask, max, result); + result = gemmlowp::SelectUsingMask(negative_mask, min, result); + return result; +} + +// This is like the template-parameter version, except that the power-of-two is +// passed as a function parameter. See raw-integer version for further comments. 
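A hypothetical usage of the raw-integer overload above, only to illustrate the saturating behavior (example values are not taken from the patch, and this assumes gemmlowp's scalar std::int32_t specializations):

void SaturatingPOTExample() {
  // 3 << 4 == 48 is well inside the int32 range, so it is returned exactly.
  const std::int32_t in_range =
      SaturatingRoundingMultiplyByPOTParam(std::int32_t{3}, 4);
  // (1 << 28) exceeds the threshold (1 << (31 - 4)) - 1, so the result
  // saturates to the int32 maximum instead of overflowing.
  const std::int32_t saturated =
      SaturatingRoundingMultiplyByPOTParam(std::int32_t{1} << 28, 4);
  (void)in_range;
  (void)saturated;
}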
+template <typename tRawType, int tIntegerBits> +gemmlowp::FixedPoint<tRawType, tIntegerBits> +SaturatingRoundingMultiplyByPOTParam( + gemmlowp::FixedPoint<tRawType, tIntegerBits> a, int exponent) { + return gemmlowp::FixedPoint<tRawType, tIntegerBits>::FromRaw( + SaturatingRoundingMultiplyByPOTParam(a.raw(), exponent)); +} + +inline void AddBiasAndEvalActivationFunction(float output_activation_min, + float output_activation_max, + const RuntimeShape& bias_shape, + const float* bias_data, + const RuntimeShape& array_shape, + float* array_data) { #ifdef USE_NEON gemmlowp::ScopedProfilingLabel label("AddBiasAndEvalActivationFunction"); - const int bias_size = FlatSize(bias_dims); - const int array_size = FlatSize(array_dims); + const int bias_size = bias_shape.FlatSize(); + const int array_size = array_shape.FlatSize(); TFLITE_DCHECK_EQ((array_size % bias_size), 0); float* array_ptr = array_data; float* array_end_ptr = array_ptr + array_size; @@ -300,8 +272,8 @@ inline void AddBiasAndEvalActivationFunction(const float* bias_data, } #else // not NEON gemmlowp::ScopedProfilingLabel label("AddBiasAndEvalActivationFunction"); - const int bias_size = FlatSize(bias_dims); - const int array_size = FlatSize(array_dims); + const int bias_size = bias_shape.FlatSize(); + const int array_size = array_shape.FlatSize(); TFLITE_DCHECK_EQ((array_size % bias_size), 0); for (int array_offset = 0; array_offset < array_size; array_offset += bias_size) { @@ -314,19 +286,6 @@ inline void AddBiasAndEvalActivationFunction(const float* bias_data, #endif } -// legacy, for compatibility with old checked-in code -template <FusedActivationFunctionType Ac> -void AddBiasAndEvalActivationFunction(const float* bias_data, - const Dims<4>& bias_dims, - float* array_data, - const Dims<4>& array_dims) { - float output_activation_min, output_activation_max; - GetActivationMinMax(Ac, &output_activation_min, &output_activation_max); - AddBiasAndEvalActivationFunction(bias_data, bias_dims, array_data, array_dims, - output_activation_min, - output_activation_max); -} - template <typename Lhs, typename Rhs, typename Result> void Gemm(const Eigen::MatrixBase<Lhs>& lhs, const Eigen::MatrixBase<Rhs>& rhs, Eigen::MatrixBase<Result>* result) { @@ -360,21 +319,24 @@ inline void optimized_ops_preload_l1_keep(const uint8* ptr) { // to a matrix*vector product. LSTM cells contain a fully-connected node; // when quantized, this becomes a special type of GEMV operation where // the output is 16bit-quantized, thus needs its own special path. 
-inline void GEMVForLstmCell(const uint8* input_data, const Dims<4>& input_dims, - const uint8* weights_data, - const Dims<4>& weights_dims, - uint8 weights_zero_point, const int32* bias_data, - const Dims<4>& bias_dims, int32 accum_multiplier, - int accum_shift, int16* output_data, - const Dims<4>& output_dims) { +inline void GEMVForLstmCell(const RuntimeShape& input_shape, + const uint8* input_data, + const RuntimeShape& weights_shape, + const uint8* weights_data, uint8 weights_zero_point, + const RuntimeShape& bias_shape, + const int32* bias_data, int32 accum_multiplier, + int accum_shift, const RuntimeShape& output_shape, + int16* output_data) { gemmlowp::ScopedProfilingLabel label("GEMVForLstmCell"); - TFLITE_DCHECK(IsPackedWithoutStrides(input_dims)); - TFLITE_DCHECK(IsPackedWithoutStrides(weights_dims)); - TFLITE_DCHECK(IsPackedWithoutStrides(bias_dims)); - TFLITE_DCHECK(IsPackedWithoutStrides(output_dims)); - TFLITE_DCHECK_EQ(FlatSizeSkipDim(output_dims, 0), 1); - const int input_size = FlatSizeSkipDim(input_dims, 3); - const int output_size = MatchingArraySize(weights_dims, 1, output_dims, 0); + TFLITE_DCHECK_GE(input_shape.DimensionsCount(), 1); + TFLITE_DCHECK_GE(weights_shape.DimensionsCount(), 2); + TFLITE_DCHECK_GE(output_shape.DimensionsCount(), 1); + const int output_dim_count = output_shape.DimensionsCount(); + const int weights_dim_count = weights_shape.DimensionsCount(); + TFLITE_DCHECK_EQ(FlatSizeSkipDim(output_shape, output_dim_count - 1), 1); + const int input_size = FlatSizeSkipDim(input_shape, 0); + const int output_size = MatchingDim(weights_shape, weights_dim_count - 2, + output_shape, output_dim_count - 1); // This special fast path for quantized LSTM cells does not try to support // odd sizes that we haven't encountered in any LSTM cell, that would // require special code (that would go untested until any LSTM cell @@ -547,18 +509,21 @@ inline void GEMVForLstmCell(const uint8* input_data, const Dims<4>& input_dims, #ifdef GEMMLOWP_NEON inline void GEMVForLstmCellWithSymmetricRange( - const uint8* input_data, const Dims<4>& input_dims, - const uint8* weights_data, const Dims<4>& weights_dims, - const int32* bias_data, const Dims<4>& bias_dims, int32 accum_multiplier, - int accum_shift, int16* output_data, const Dims<4>& output_dims) { + const RuntimeShape& input_shape, const uint8* input_data, + const RuntimeShape& weights_shape, const uint8* weights_data, + const RuntimeShape& bias_shape, const int32* bias_data, + int32 accum_multiplier, int accum_shift, const RuntimeShape& output_shape, + int16* output_data) { gemmlowp::ScopedProfilingLabel label("GEMVForLstmCellWithSymmetricRange"); - TFLITE_DCHECK(IsPackedWithoutStrides(input_dims)); - TFLITE_DCHECK(IsPackedWithoutStrides(weights_dims)); - TFLITE_DCHECK(IsPackedWithoutStrides(bias_dims)); - TFLITE_DCHECK(IsPackedWithoutStrides(output_dims)); - TFLITE_DCHECK_EQ(FlatSizeSkipDim(output_dims, 0), 1); - const int input_size = FlatSizeSkipDim(input_dims, 3); - const int output_size = MatchingArraySize(weights_dims, 1, output_dims, 0); + TFLITE_DCHECK_GE(input_shape.DimensionsCount(), 1); + TFLITE_DCHECK_GE(weights_shape.DimensionsCount(), 2); + TFLITE_DCHECK_GE(output_shape.DimensionsCount(), 1); + const int output_dim_count = output_shape.DimensionsCount(); + const int weights_dim_count = weights_shape.DimensionsCount(); + TFLITE_DCHECK_EQ(FlatSizeSkipDim(output_shape, output_dim_count - 1), 1); + const int input_size = FlatSizeSkipDim(input_shape, 0); + const int output_size = MatchingDim(weights_shape, 
weights_dim_count - 2, + output_shape, output_dim_count - 1); // This special fast path for quantized LSTM cells does not try to support // odd sizes that we haven't encountered in any LSTM cell, that would // require special code (that would go untested until any LSTM cell @@ -834,14 +799,16 @@ inline void GEMVForLstmCellWithSymmetricRange( } #endif -inline void FullyConnected(const float* input_data, const Dims<4>& input_dims, - const float* weights_data, - const Dims<4>& weights_dims, const float* bias_data, - const Dims<4>& bias_dims, - float output_activation_min, - float output_activation_max, float* output_data, - const Dims<4>& output_dims) { +inline void FullyConnected( + const FullyConnectedParams& params, const RuntimeShape& input_shape, + const float* input_data, const RuntimeShape& weights_shape, + const float* weights_data, const RuntimeShape& bias_shape, + const float* bias_data, const RuntimeShape& output_shape, + float* output_data) { gemmlowp::ScopedProfilingLabel label("FullyConnected"); + const float output_activation_min = params.float_activation_min; + const float output_activation_max = params.float_activation_max; + // TODO(b/62193649): this convoluted shape computation (determining // input_rows from the weights_dims, then MapAsMatrixWithGivenNumberOfRows) // is because the current --variable_batch hack consists in overwriting the @@ -850,50 +817,42 @@ inline void FullyConnected(const float* input_data, const Dims<4>& input_dims, // When that is fixed, this should become: // const auto input_matrix_map = // MapAsMatrixWithFirstDimAsRows(input_data, input_dims); - const int input_rows = ArraySize(weights_dims, 0); + const int dims_count = weights_shape.DimensionsCount(); + const int input_rows = weights_shape.Dims(dims_count - 1); const auto input_matrix_map = - MapAsMatrixWithGivenNumberOfRows(input_data, input_dims, input_rows); + MapAsMatrixWithGivenNumberOfRows(input_data, input_shape, input_rows); const auto filter_matrix_map = - MapAsMatrixWithFirstDimAsRows(weights_data, weights_dims); + MapAsMatrixWithLastDimAsRows(weights_data, weights_shape); auto output_matrix_map = - MapAsMatrixWithFirstDimAsRows(output_data, output_dims); + MapAsMatrixWithLastDimAsRows(output_data, output_shape); Gemm(filter_matrix_map.transpose(), input_matrix_map, &output_matrix_map); - AddBiasAndEvalActivationFunction(bias_data, bias_dims, output_data, - output_dims, output_activation_min, - output_activation_max); -} - -// legacy, for compatibility with old checked-in code -template <FusedActivationFunctionType Ac> -void FullyConnected(const float* input_data, const Dims<4>& input_dims, - const float* weights_data, const Dims<4>& weights_dims, - const float* bias_data, const Dims<4>& bias_dims, - float* output_data, const Dims<4>& output_dims) { - float output_activation_min, output_activation_max; - GetActivationMinMax(Ac, &output_activation_min, &output_activation_max); - FullyConnected(input_data, input_dims, weights_data, weights_dims, bias_data, - bias_dims, output_activation_min, output_activation_max, - output_data, output_dims); + AddBiasAndEvalActivationFunction(output_activation_min, output_activation_max, + bias_shape, bias_data, output_shape, + output_data); } #ifdef USE_NEON inline void FullyConnectedAsGEMV( - const uint8* input_data, const Dims<4>& input_dims, int32 input_offset, - const uint8* filter_data, const Dims<4>& filter_dims, int32 filter_offset, - const int32* bias_data, const Dims<4>& bias_dims, int32 output_offset, + const RuntimeShape& input_shape, const 
uint8* input_data, + int32 input_offset, const RuntimeShape& filter_shape, + const uint8* filter_data, int32 filter_offset, + const RuntimeShape& bias_shape, const int32* bias_data, int32 output_offset, int32 output_multiplier, int output_shift, int32 output_activation_min, - int32 output_activation_max, uint8* output_data, - const Dims<4>& output_dims) { + int32 output_activation_max, const RuntimeShape& output_shape, + uint8* output_data) { gemmlowp::ScopedProfilingLabel label("FullyConnectedAsGEMV/8bit"); - TFLITE_DCHECK(IsPackedWithoutStrides(input_dims)); - TFLITE_DCHECK(IsPackedWithoutStrides(filter_dims)); - TFLITE_DCHECK(IsPackedWithoutStrides(bias_dims)); - TFLITE_DCHECK(IsPackedWithoutStrides(output_dims)); - TFLITE_DCHECK_EQ(FlatSizeSkipDim(output_dims, 0), 1); - const int input_size = FlatSizeSkipDim(input_dims, 3); - const int output_size = MatchingArraySize(filter_dims, 1, output_dims, 0); + TFLITE_DCHECK_GE(input_shape.DimensionsCount(), 1); + TFLITE_DCHECK_GE(filter_shape.DimensionsCount(), 2); + TFLITE_DCHECK_GE(output_shape.DimensionsCount(), 1); + const int output_dim_count = output_shape.DimensionsCount(); + const int filter_dim_count = filter_shape.DimensionsCount(); + TFLITE_DCHECK_EQ(FlatSizeSkipDim(output_shape, output_dim_count - 1), 1); + const int input_size = FlatSizeSkipDim(input_shape, 0); + const int output_size = MatchingDim(filter_shape, filter_dim_count - 2, + output_shape, output_dim_count - 1); static constexpr int kPeel = 4; + const bool shift_left = (output_shift > 0); for (int k = 0; k < input_size; k += 64) { optimized_ops_preload_l1_stream(input_data + k); } @@ -1005,11 +964,17 @@ inline void FullyConnectedAsGEMV( int32x4_t bias_vec = vld1q_s32(bias_ptr); bias_ptr += 4; reduced = vaddq_s32(reduced, bias_vec); - // Multiply by the fixed-point multiplier. - reduced = vqrdmulhq_n_s32(reduced, output_multiplier); - // Rounding-shift-right. - using gemmlowp::RoundingDivideByPOT; - reduced = RoundingDivideByPOT(reduced, output_shift); + if (shift_left) { + const int32 multiplier_power_of_two = 1 << output_shift; + reduced = vmulq_n_s32(reduced, multiplier_power_of_two); + reduced = vqrdmulhq_n_s32(reduced, output_multiplier); + } else { + // Multiply by the fixed-point multiplier. + reduced = vqrdmulhq_n_s32(reduced, output_multiplier); + // Rounding-shift-right. + using gemmlowp::RoundingDivideByPOT; + reduced = RoundingDivideByPOT(reduced, -output_shift); + } // Add the output offset. 
const int32x4_t output_offset_vec = vdupq_n_s32(output_offset); reduced = vaddq_s32(reduced, output_offset_vec); @@ -1031,23 +996,22 @@ inline void FullyConnectedAsGEMV( struct GemmlowpOutputPipeline { typedef gemmlowp::VectorMap<const int32, gemmlowp::VectorShape::Col> ColVectorMap; - typedef std::tuple< - gemmlowp::OutputStageBiasAddition<ColVectorMap>, - gemmlowp::OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint, - gemmlowp::OutputStageClamp, gemmlowp::OutputStageSaturatingCastToUint8> + typedef std::tuple<gemmlowp::OutputStageBiasAddition<ColVectorMap>, + gemmlowp::OutputStageScaleInt32ByFixedPointAndExponent, + gemmlowp::OutputStageClamp, + gemmlowp::OutputStageSaturatingCastToUint8> Pipeline; - static Pipeline Make(const int32* bias_data, int output_rows, - int32 output_offset, int32 output_multiplier, - int output_shift, int32 output_activation_min, - int32 output_activation_max) { + static Pipeline MakeExp(const int32* bias_data, int output_rows, + int32 output_offset, int32 output_multiplier, + int output_left_shift, int32 output_activation_min, + int32 output_activation_max) { ColVectorMap bias_vector(bias_data, output_rows); gemmlowp::OutputStageBiasAddition<ColVectorMap> bias_addition_stage; bias_addition_stage.bias_vector = bias_vector; - gemmlowp::OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint - quantize_down_stage; + gemmlowp::OutputStageScaleInt32ByFixedPointAndExponent quantize_down_stage; quantize_down_stage.result_offset_after_shift = output_offset; quantize_down_stage.result_fixedpoint_multiplier = output_multiplier; - quantize_down_stage.result_shift = output_shift; + quantize_down_stage.result_exponent = output_left_shift; gemmlowp::OutputStageClamp clamp_stage; clamp_stage.min = output_activation_min; clamp_stage.max = output_activation_max; @@ -1057,42 +1021,47 @@ struct GemmlowpOutputPipeline { } }; -inline void FullyConnected(const uint8* input_data, const Dims<4>& input_dims, - int32 input_offset, const uint8* filter_data, - const Dims<4>& filter_dims, int32 filter_offset, - const int32* bias_data, const Dims<4>& bias_dims, - int32 output_offset, int32 output_multiplier, - int output_shift, int32 output_activation_min, - int32 output_activation_max, uint8* output_data, - const Dims<4>& output_dims, - gemmlowp::GemmContext* gemm_context) { +inline void FullyConnected( + const FullyConnectedParams& params, const RuntimeShape& input_shape, + const uint8* input_data, const RuntimeShape& filter_shape, + const uint8* filter_data, const RuntimeShape& bias_shape, + const int32* bias_data, const RuntimeShape& output_shape, + uint8* output_data, gemmlowp::GemmContext* gemm_context) { gemmlowp::ScopedProfilingLabel label("FullyConnected/8bit"); + const int32 input_offset = params.input_offset; + const int32 filter_offset = params.weights_offset; + const int32 output_offset = params.output_offset; + const int32 output_multiplier = params.output_multiplier; + const int output_shift = params.output_shift; + const int32 output_activation_min = params.quantized_activation_min; + const int32 output_activation_max = params.quantized_activation_max; + TFLITE_DCHECK_GE(filter_shape.DimensionsCount(), 2); + TFLITE_DCHECK_GE(output_shape.DimensionsCount(), 1); // TODO(benoitjacob): This really should be: // const int batches = ArraySize(output_dims, 1); // but the current --variable_batch hack consists in overwriting the 3rd // dimension with the runtime batch size, as we don't keep track for each // array of which dimension is the batch dimension in it. 
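For orientation, a hypothetical call site for the quantized FullyConnected above, showing how the former long argument list now travels in FullyConnectedParams (all field values are placeholders):

void CallQuantizedFullyConnectedSketch(
    const RuntimeShape& input_shape, const uint8* input_data,
    const RuntimeShape& filter_shape, const uint8* filter_data,
    const RuntimeShape& bias_shape, const int32* bias_data,
    const RuntimeShape& output_shape, uint8* output_data,
    gemmlowp::GemmContext* gemm_context) {
  FullyConnectedParams op_params;
  op_params.input_offset = -128;             // placeholder zero point
  op_params.weights_offset = -128;           // placeholder zero point
  op_params.output_offset = 127;             // placeholder zero point
  op_params.output_multiplier = 1073741824;  // placeholder fixed-point multiplier
  // New convention: output_shift is a left shift, so a negative value denotes
  // a right shift by its magnitude.
  op_params.output_shift = -8;
  op_params.quantized_activation_min = 0;
  op_params.quantized_activation_max = 255;
  FullyConnected(op_params, input_shape, input_data, filter_shape, filter_data,
                 bias_shape, bias_data, output_shape, output_data, gemm_context);
}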
- const int batches = FlatSizeSkipDim(output_dims, 0); + const int output_dim_count = output_shape.DimensionsCount(); + const int filter_dim_count = filter_shape.DimensionsCount(); + const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1); #ifdef USE_NEON - const int output_size = MatchingArraySize(filter_dims, 1, output_dims, 0); + const int output_size = MatchingDim(filter_shape, filter_dim_count - 2, + output_shape, output_dim_count - 1); if (batches == 1 && !(output_size % 4)) { return FullyConnectedAsGEMV( - input_data, input_dims, input_offset, filter_data, filter_dims, - filter_offset, bias_data, bias_dims, output_offset, output_multiplier, - output_shift, output_activation_min, output_activation_max, output_data, - output_dims); + input_shape, input_data, input_offset, filter_shape, filter_data, + filter_offset, bias_shape, bias_data, output_offset, output_multiplier, + output_shift, output_activation_min, output_activation_max, + output_shape, output_data); } #endif // USE_NEON - const int filter_rows = filter_dims.sizes[1]; - const int filter_cols = filter_dims.sizes[0]; - TFLITE_DCHECK_EQ(filter_dims.sizes[2], 1); - TFLITE_DCHECK_EQ(filter_dims.sizes[3], 1); - const int output_rows = output_dims.sizes[0]; + const int filter_rows = filter_shape.Dims(filter_dim_count - 2); + const int filter_cols = filter_shape.Dims(filter_dim_count - 1); + TFLITE_DCHECK_EQ(filter_shape.FlatSize(), filter_rows * filter_cols); + const int output_rows = output_shape.Dims(output_dim_count - 1); TFLITE_DCHECK_EQ(output_rows, filter_rows); - TFLITE_DCHECK_EQ(bias_dims.sizes[0], output_rows); - TFLITE_DCHECK_EQ(bias_dims.sizes[1], 1); - TFLITE_DCHECK_EQ(bias_dims.sizes[2], 1); - TFLITE_DCHECK_EQ(bias_dims.sizes[3], 1); + TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_rows); gemmlowp::MatrixMap<const uint8, gemmlowp::MapOrder::RowMajor> filter_matrix( filter_data, output_rows, filter_cols, filter_cols); @@ -1100,7 +1069,7 @@ inline void FullyConnected(const uint8* input_data, const Dims<4>& input_dims, input_data, filter_cols, batches, filter_cols); gemmlowp::MatrixMap<uint8, gemmlowp::MapOrder::ColMajor> output_matrix( output_data, output_rows, batches, output_rows); - const auto& output_pipeline = GemmlowpOutputPipeline::Make( + const auto& output_pipeline = GemmlowpOutputPipeline::MakeExp( bias_data, output_rows, output_offset, output_multiplier, output_shift, output_activation_min, output_activation_max); gemmlowp::GemmWithOutputPipeline<uint8, uint8, @@ -1110,29 +1079,38 @@ inline void FullyConnected(const uint8* input_data, const Dims<4>& input_dims, } inline void FullyConnected( - const uint8* input_data, const Dims<4>& input_dims, int32 input_offset, - const uint8* filter_data, const Dims<4>& filter_dims, int32 filter_offset, - const int32* bias_data_int32, const Dims<4>& bias_dims, int32 output_offset, - int32 output_multiplier, int output_shift, int32 output_activation_min, - int32 output_activation_max, int16* output_data, const Dims<4>& output_dims, - gemmlowp::GemmContext* gemm_context) { + const FullyConnectedParams& params, const RuntimeShape& input_shape, + const uint8* input_data, const RuntimeShape& filter_shape, + const uint8* filter_data, const RuntimeShape& bias_shape, + const int32* bias_data_int32, const RuntimeShape& output_shape, + int16* output_data, gemmlowp::GemmContext* gemm_context) { gemmlowp::ScopedProfilingLabel label("FullyConnected/Uint8Int16"); + const int32 input_offset = params.input_offset; + const int32 filter_offset = params.weights_offset; + const 
int32 output_offset = params.output_offset; + const int32 output_multiplier = params.output_multiplier; + const int output_shift = params.output_shift; + const int32 output_activation_min = params.quantized_activation_min; + const int32 output_activation_max = params.quantized_activation_max; // This is a copy of the reference implementation. We do not currently have a // properly optimized version. (void)gemm_context; // only used in properly optimized code. TFLITE_DCHECK_LE(output_activation_min, output_activation_max); TFLITE_DCHECK_EQ(output_offset, 0); + TFLITE_DCHECK_GE(filter_shape.DimensionsCount(), 2); + TFLITE_DCHECK_GE(output_shape.DimensionsCount(), 1); // TODO(benoitjacob): This really should be: // const int batches = ArraySize(output_dims, 1); // but the current --variable_batch hack consists in overwriting the 3rd // dimension with the runtime batch size, as we don't keep track for each // array of which dimension is the batch dimension in it. - const int batches = FlatSizeSkipDim(output_dims, 0); - const int output_depth = MatchingArraySize(filter_dims, 1, output_dims, 0); - const int accum_depth = ArraySize(filter_dims, 0); - TFLITE_DCHECK(IsPackedWithoutStrides(input_dims)); - TFLITE_DCHECK(IsPackedWithoutStrides(filter_dims)); + const int output_dim_count = output_shape.DimensionsCount(); + const int filter_dim_count = filter_shape.DimensionsCount(); + const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1); + const int output_depth = MatchingDim(filter_shape, filter_dim_count - 2, + output_shape, output_dim_count - 1); + const int accum_depth = filter_shape.Dims(filter_dim_count - 1); // Implementation of the fully connected node suited to the inside of an LSTM // cell. The operands are 8-bit integers, the accumulators are internally @@ -1143,17 +1121,17 @@ inline void FullyConnected( if (batches == 1 && input_offset == -128 && output_activation_min == -32768 && output_activation_max == 32767) { if (filter_offset == -128 && !(output_depth % 4) && !(accum_depth % 64)) { - GEMVForLstmCellWithSymmetricRange(input_data, input_dims, filter_data, - filter_dims, bias_data_int32, bias_dims, - output_multiplier, -output_shift, - output_data, output_dims); + GEMVForLstmCellWithSymmetricRange( + input_shape, input_data, filter_shape, filter_data, bias_shape, + bias_data_int32, output_multiplier, output_shift, output_shape, + output_data); return; } if (!(output_depth % 4) && !(accum_depth % 8)) { - GEMVForLstmCell(input_data, input_dims, filter_data, filter_dims, - filter_offset, bias_data_int32, bias_dims, - output_multiplier, -output_shift, output_data, - output_dims); + GEMVForLstmCell(input_shape, input_data, filter_shape, filter_data, + filter_offset, bias_shape, bias_data_int32, + output_multiplier, output_shift, output_shape, + output_data); return; } } @@ -1173,7 +1151,7 @@ inline void FullyConnected( scale_stage.result_offset_after_shift = 0; scale_stage.result_fixedpoint_multiplier = output_multiplier; // Note that this shift is negated wrt ordinary FC. 
- scale_stage.result_exponent = -output_shift; + scale_stage.result_exponent = output_shift; gemmlowp::OutputStageClamp clamp_stage; clamp_stage.min = output_activation_min; clamp_stage.max = output_activation_max; @@ -1187,34 +1165,12 @@ inline void FullyConnected( input_offset, output_pipeline); } -// legacy, for compatibility with old checked-in code -template <FusedActivationFunctionType Ac> -void FullyConnected(const uint8* input_data, const Dims<4>& input_dims, - int32 input_offset, const uint8* filter_data, - const Dims<4>& filter_dims, int32 filter_offset, - const int32* bias_data, const Dims<4>& bias_dims, - int32 output_offset, int32 output_multiplier, - int output_shift, int32 output_activation_min, - int32 output_activation_max, uint8* output_data, - const Dims<4>& output_dims, - gemmlowp::GemmContext* gemm_context) { - static_assert(Ac == FusedActivationFunctionType::kNone || - Ac == FusedActivationFunctionType::kRelu || - Ac == FusedActivationFunctionType::kRelu6 || - Ac == FusedActivationFunctionType::kRelu1, - ""); - FullyConnected(input_data, input_dims, input_offset, filter_data, filter_dims, - filter_offset, bias_data, bias_dims, output_offset, - output_multiplier, output_shift, output_activation_min, - output_activation_max, output_data, output_dims, gemm_context); -} - // Internal function doing the actual arithmetic work for -// ExperimentalShuffledFullyConnected. +// ShuffledFullyConnected. // May be called either directly by it (single-threaded case) or may be used // as the 'task' for worker threads to run (multi-threaded case, see -// ExperimentalShuffledFullyConnectedWorkerTask below). -inline void ExperimentalShuffledFullyConnectedWorkerImpl( +// ShuffledFullyConnectedWorkerTask below). +inline void ShuffledFullyConnectedWorkerImpl( const uint8* shuffled_input_workspace_data, const int8* shuffled_weights_data, int batches, int output_depth, int output_stride, int accum_depth, const int32* bias_data, @@ -1222,8 +1178,8 @@ inline void ExperimentalShuffledFullyConnectedWorkerImpl( #if defined USE_NEON const int8* shuffled_weights_ptr = shuffled_weights_data; if (batches == 1) { - const int right_shift = output_shift > 0 ? output_shift : 0; - const int left_shift = output_shift > 0 ? 0 : -output_shift; + const int right_shift = output_shift > 0 ? 0 : -output_shift; + const int left_shift = output_shift > 0 ? output_shift : 0; for (int c = 0; c < output_depth; c += 4) { // Accumulation loop. int32x4_t row_accum0 = vdupq_n_s32(0); @@ -1289,8 +1245,8 @@ inline void ExperimentalShuffledFullyConnectedWorkerImpl( vst1_s16(output_data + c, res16); } } else if (batches == 4) { - const int right_shift = output_shift > 0 ? output_shift : 0; - const int left_shift = output_shift > 0 ? 0 : -output_shift; + const int right_shift = output_shift > 0 ? 0 : -output_shift; + const int left_shift = output_shift > 0 ? output_shift : 0; for (int c = 0; c < output_depth; c += 4) { const int8* shuffled_input_ptr = reinterpret_cast<const int8*>(shuffled_input_workspace_data); @@ -1421,8 +1377,8 @@ inline void ExperimentalShuffledFullyConnectedWorkerImpl( // (16-bit, typically 3 integer bits) fixed-point format. The quantized // multiplier and shift here have been pre-computed offline // (e.g. by toco). - acc = MultiplyByQuantizedMultiplier(acc, output_multiplier, - -output_shift); + acc = + MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift); // Saturate, cast to int16, and store to output array. 
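The sign flips above all follow the same new convention: output_shift is now a left-shift exponent, so positive values multiply by a power of two and negative values trigger a rounding right shift. A scalar sketch of that requantization step, consistent with how MultiplyByQuantizedMultiplier is typically defined (an assumption for illustration, not a quote from this patch):

inline int32 RequantizeSketch(int32 acc, int32 output_multiplier,
                              int output_shift) {
  const int left_shift = output_shift > 0 ? output_shift : 0;
  const int right_shift = output_shift > 0 ? 0 : -output_shift;
  // Fixed-point multiply after the optional power-of-two scaling...
  const int32 scaled = gemmlowp::SaturatingRoundingDoublingHighMul(
      acc * (1 << left_shift), output_multiplier);
  // ...then the optional rounding right shift.
  return gemmlowp::RoundingDivideByPOT(scaled, right_shift);
}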
acc = std::max(acc, -32768); acc = std::min(acc, 32767); @@ -1473,7 +1429,7 @@ inline void ExperimentalShuffledFullyConnectedWorkerImpl( // quantized multiplier and shift here have been pre-computed offline // (e.g. by toco). acc = MultiplyByQuantizedMultiplier(acc, output_multiplier, - -output_shift); + output_shift); // Saturate, cast to int16, and store to output array. acc = std::max(acc, -32768); acc = std::min(acc, 32767); @@ -1488,14 +1444,16 @@ inline void ExperimentalShuffledFullyConnectedWorkerImpl( #endif } -// Wraps ExperimentalShuffledFullyConnectedWorkerImpl into a Task class +// Wraps ShuffledFullyConnectedWorkerImpl into a Task class // to allow using gemmlowp's threadpool. -struct ExperimentalShuffledFullyConnectedWorkerTask : gemmlowp::Task { - ExperimentalShuffledFullyConnectedWorkerTask( - const uint8* input_data, const int8* shuffled_weights_data, int batches, - int output_depth, int output_stride, int accum_depth, - const int32* bias_data, int32 output_multiplier, int output_shift, - int16* output_data) +struct ShuffledFullyConnectedWorkerTask : gemmlowp::Task { + ShuffledFullyConnectedWorkerTask(const uint8* input_data, + const int8* shuffled_weights_data, + int batches, int output_depth, + int output_stride, int accum_depth, + const int32* bias_data, + int32 output_multiplier, int output_shift, + int16* output_data) : input_data_(input_data), shuffled_weights_data_(shuffled_weights_data), batches_(batches), @@ -1508,7 +1466,7 @@ struct ExperimentalShuffledFullyConnectedWorkerTask : gemmlowp::Task { output_data_(output_data) {} void Run() override { - ExperimentalShuffledFullyConnectedWorkerImpl( + ShuffledFullyConnectedWorkerImpl( input_data_, shuffled_weights_data_, batches_, output_depth_, output_stride_, accum_depth_, bias_data_, output_multiplier_, output_shift_, output_data_); @@ -1526,28 +1484,35 @@ struct ExperimentalShuffledFullyConnectedWorkerTask : gemmlowp::Task { int16* output_data_; }; -inline void ExperimentalShuffledFullyConnected( - const uint8* input_data, const Dims<4>& input_dims, - const uint8* shuffled_weights_data, const Dims<4>& weights_dims, - const int32* bias_data, const Dims<4>& bias_dims, int32 output_multiplier, - int output_shift, int32 output_activation_min, int32 output_activation_max, - int16* output_data, const Dims<4>& output_dims, - uint8* shuffled_input_workspace_data, gemmlowp::GemmContext* gemm_context) { - gemmlowp::ScopedProfilingLabel label( - "ExperimentalShuffledFullyConnected/8bit"); +inline void ShuffledFullyConnected( + const FullyConnectedParams& params, const RuntimeShape& input_shape, + const uint8* input_data, const RuntimeShape& weights_shape, + const uint8* shuffled_weights_data, const RuntimeShape& bias_shape, + const int32* bias_data, const RuntimeShape& output_shape, + int16* output_data, uint8* shuffled_input_workspace_data, + gemmlowp::GemmContext* gemm_context) { + gemmlowp::ScopedProfilingLabel label("ShuffledFullyConnected/8bit"); + const int32 output_multiplier = params.output_multiplier; + const int output_shift = params.output_shift; + const int32 output_activation_min = params.quantized_activation_min; + const int32 output_activation_max = params.quantized_activation_max; (void)gemm_context; // only used in optimized code. 
TFLITE_DCHECK_EQ(output_activation_min, -32768); TFLITE_DCHECK_EQ(output_activation_max, 32767); + TFLITE_DCHECK_GE(input_shape.DimensionsCount(), 1); + TFLITE_DCHECK_GE(weights_shape.DimensionsCount(), 2); + TFLITE_DCHECK_GE(output_shape.DimensionsCount(), 1); // TODO(benoitjacob): This really should be: // const int batches = ArraySize(output_dims, 1); // but the current --variable_batch hack consists in overwriting the 3rd // dimension with the runtime batch size, as we don't keep track for each // array of which dimension is the batch dimension in it. - const int batches = FlatSizeSkipDim(output_dims, 0); - const int output_depth = MatchingArraySize(weights_dims, 1, output_dims, 0); - const int accum_depth = ArraySize(weights_dims, 0); - TFLITE_DCHECK(IsPackedWithoutStrides(input_dims)); - TFLITE_DCHECK(IsPackedWithoutStrides(weights_dims)); + const int output_dim_count = output_shape.DimensionsCount(); + const int weights_dim_count = weights_shape.DimensionsCount(); + const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1); + const int output_depth = MatchingDim(weights_shape, weights_dim_count - 2, + output_shape, output_dim_count - 1); + const int accum_depth = weights_shape.Dims(weights_dim_count - 1); TFLITE_DCHECK((accum_depth % 16) == 0); TFLITE_DCHECK((output_depth % 4) == 0); // Shuffled weights have had their sign bit (0x80) pre-flipped (xor'd) @@ -1618,7 +1583,7 @@ inline void ExperimentalShuffledFullyConnected( if (thread_count == 1) { // Single-thread case: do the computation on the current thread, don't // use a threadpool - ExperimentalShuffledFullyConnectedWorkerImpl( + ShuffledFullyConnectedWorkerImpl( shuffled_input_workspace_data, int8_shuffled_weights_data, batches, output_depth, output_depth, accum_depth, bias_data, output_multiplier, output_shift, output_data); @@ -1633,7 +1598,7 @@ inline void ExperimentalShuffledFullyConnected( int row_start = 0; for (int i = 0; i < thread_count; i++) { int row_end = std::min(output_depth, row_start + kRowsPerWorker); - tasks[i] = new ExperimentalShuffledFullyConnectedWorkerTask( + tasks[i] = new ShuffledFullyConnectedWorkerTask( shuffled_input_workspace_data, int8_shuffled_weights_data + row_start * accum_depth, batches, row_end - row_start, output_depth, accum_depth, bias_data + row_start, @@ -1645,12 +1610,16 @@ inline void ExperimentalShuffledFullyConnected( } template <typename T> -inline void ExtractPatchIntoBufferColumn( - const Dims<4>& input_dims, int w, int h, int b, int kheight, int kwidth, - int stride_width, int stride_height, int pad_width, int pad_height, - int in_width, int in_height, int in_depth, int single_buffer_length, - int buffer_id, const T* in_data, T* conv_buffer_data, uint8 byte_zero) { +inline void ExtractPatchIntoBufferColumn(const RuntimeShape& input_shape, int w, + int h, int b, int kheight, int kwidth, + int stride_width, int stride_height, + int pad_width, int pad_height, + int in_width, int in_height, + int in_depth, int single_buffer_length, + int buffer_id, const T* in_data, + T* conv_buffer_data, uint8 zero_byte) { gemmlowp::ScopedProfilingLabel label("ExtractPatchIntoBufferColumn"); + TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4); // This chunk of code reshapes all the inputs corresponding to // output (b, h, w) to a column vector in conv_buffer(:, buffer_id). 
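Several call sites below switch Offset() to the RuntimeShape overload, whose arguments follow the NHWC dimension order. A small sketch of the assumed indexing convention (the actual helper lives in types.h):

inline int OffsetNHWCSketch(const RuntimeShape& shape, int b, int h, int w,
                            int c) {
  // For a 4-D {N, H, W, C} shape, element (b, h, w, c) lives at this flat index.
  return ((b * shape.Dims(1) + h) * shape.Dims(2) + w) * shape.Dims(3) + c;
}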
const int kwidth_times_indepth = kwidth * in_depth; @@ -1672,7 +1641,7 @@ inline void ExtractPatchIntoBufferColumn( const int output_row_offset = (buffer_id * single_buffer_length); int out_offset = output_row_offset + (h_offset * kwidth + w_offset) * in_depth; - int in_offset = Offset(input_dims, 0, iw_start, ih_start, b); + int in_offset = Offset(input_shape, b, ih_start, iw_start, 0); // Express all of the calculations as padding around the input patch. const int top_padding = h_offset; @@ -1686,7 +1655,7 @@ inline void ExtractPatchIntoBufferColumn( // patch that are off the edge of the input image. if (top_padding > 0) { const int top_row_elements = (top_padding * kwidth * in_depth); - memset(conv_buffer_data + output_row_offset, byte_zero, + memset(conv_buffer_data + output_row_offset, zero_byte, (top_row_elements * sizeof(T))); } @@ -1703,14 +1672,14 @@ inline void ExtractPatchIntoBufferColumn( for (int ih = ih_start; ih < ih_end; ++ih) { if (left_padding > 0) { const int left_start = (out_offset - (left_padding * in_depth)); - memset(conv_buffer_data + left_start, byte_zero, + memset(conv_buffer_data + left_start, zero_byte, (left_padding * in_depth * sizeof(T))); } memcpy(conv_buffer_data + out_offset, in_data + in_offset, single_row_num * sizeof(T)); if (right_padding > 0) { const int right_start = (out_offset + single_row_num); - memset(conv_buffer_data + right_start, byte_zero, + memset(conv_buffer_data + right_start, zero_byte, (right_padding * in_depth * sizeof(T))); } out_offset += kwidth_times_indepth; @@ -1725,26 +1694,113 @@ inline void ExtractPatchIntoBufferColumn( const int bottom_start = output_row_offset + ((top_padding + (ih_end - ih_start)) * kwidth * in_depth); - memset(conv_buffer_data + bottom_start, byte_zero, + memset(conv_buffer_data + bottom_start, zero_byte, (bottom_row_elements * sizeof(T))); } } template <typename T> -void Im2col(const T* input_data, const Dims<4>& input_dims, int stride_width, - int stride_height, int pad_width, int pad_height, int kheight, - int kwidth, uint8 byte_zero, T* output_data, - const Dims<4>& output_dims) { +void DilatedIm2col(const ConvParams& params, uint8 zero_byte, + const RuntimeShape& input_shape, const T* input_data, + const RuntimeShape& filter_shape, + const RuntimeShape& output_shape, T* im2col_data) { + const int stride_width = params.stride_width; + const int stride_height = params.stride_height; + const int dilation_width_factor = params.dilation_width_factor; + const int dilation_height_factor = params.dilation_height_factor; + const int pad_width = params.padding_values.width; + const int pad_height = params.padding_values.height; + TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4); + TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4); + TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4); + + // For dilated convolution, the input pixels are not contiguous therefore we + // can't use the same opitimizations as Im2Col(). Though note this code would + // work fine for the non-dilated case too (though likely a bit slower). 
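Background for the loops that follow: with dilation, the filter taps are spaced dilation_factor elements apart in the input, so the effective kernel extent along an axis grows to (k - 1) * dilation + 1. Below is a small sketch of the usual output-size arithmetic this implies; it is included as an assumption for illustration, since the kernel itself simply trusts the sizes already present in output_shape.

// Output extent of a dilated convolution along one axis, with an explicit
// symmetric pad amount.
inline int DilatedConvOutputSize(int input_size, int filter_size, int stride,
                                 int dilation, int pad) {
  const int effective_filter = (filter_size - 1) * dilation + 1;
  return (input_size + 2 * pad - effective_filter) / stride + 1;
}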
+ gemmlowp::ScopedProfilingLabel label("DilatedIm2col"); + TFLITE_DCHECK(dilation_width_factor != 1 || dilation_height_factor != 1); + TFLITE_DCHECK(im2col_data); + const int batches = MatchingDim(input_shape, 0, output_shape, 0); + const int input_height = input_shape.Dims(1); + const int input_width = input_shape.Dims(2); + const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3); + const int filter_height = filter_shape.Dims(1); + const int filter_width = filter_shape.Dims(2); + const int output_height = output_shape.Dims(1); + const int output_width = output_shape.Dims(2); + MatchingDim(output_shape, 3, filter_shape, 0); + + // Construct the MxN sized im2col matrix. + // The rows M, are sub-ordered B x H x W + const RuntimeShape row_shape({1, batches, output_height, output_width}); + // The columns, N, are sub-ordered Kh x Kw x Din + const RuntimeShape col_shape({1, filter_height, filter_width, input_depth}); + // Use dimensions M and N to construct dims for indexing directly into im2col + const RuntimeShape im2col_shape( + {1, 1, row_shape.FlatSize(), col_shape.FlatSize()}); + + // Loop through the output rows (B x H x W) + for (int batch = 0; batch < batches; ++batch) { + for (int out_y = 0; out_y < output_height; ++out_y) { + for (int out_x = 0; out_x < output_width; ++out_x) { + // Each im2col row is an output pixel. Arrange the input data in this + // row in an order we can conveniently multiply with the filter data. + int row_offset = Offset(row_shape, 0, batch, out_y, out_x); + const int in_x_origin = (out_x * stride_width) - pad_width; + const int in_y_origin = (out_y * stride_height) - pad_height; + // Loop through all the pixels of the filter (Kh x Kw) + for (int filter_y = 0; filter_y < filter_height; ++filter_y) { + const int in_y = in_y_origin + dilation_height_factor * filter_y; + if ((in_y >= 0) && (in_y < input_height)) { + // Filter row is within the input data. + // Loop through all the filter pixels in this row. + for (int filter_x = 0; filter_x < filter_width; ++filter_x) { + const int in_x = in_x_origin + dilation_width_factor * filter_x; + int col_offset = Offset(col_shape, 0, filter_y, filter_x, 0); + T* dst = im2col_data + + Offset(im2col_shape, 0, 0, row_offset, col_offset); + if ((in_x >= 0) && (in_x < input_width)) { + // Filter pixel is within the input, copy the input data. + T const* src = + input_data + Offset(input_shape, batch, in_y, in_x, 0); + memcpy(dst, src, input_depth * sizeof(T)); + } else { + // Filter pixel is outside the input, zero it out. + memset(dst, zero_byte, input_depth * sizeof(T)); + } + } + } else { + // Filter row is outside the input, zero out the entire filter row. 
+ int col_offset = Offset(col_shape, 0, filter_y, 0, 0); + T* dst = im2col_data + + Offset(im2col_shape, 0, 0, row_offset, col_offset); + memset(dst, zero_byte, filter_width * input_depth * sizeof(T)); + } + } + } + } + } +} + +template <typename T> +void Im2col(const ConvParams& params, int kheight, int kwidth, uint8 zero_byte, + const RuntimeShape& input_shape, const T* input_data, + const RuntimeShape& output_shape, T* output_data) { gemmlowp::ScopedProfilingLabel label("Im2col"); - TFLITE_DCHECK(IsPackedWithoutStrides(input_dims)); - TFLITE_DCHECK(IsPackedWithoutStrides(output_dims)); - const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); - const int input_depth = ArraySize(input_dims, 0); - const int input_width = ArraySize(input_dims, 1); - const int input_height = ArraySize(input_dims, 2); - const int output_depth = ArraySize(output_dims, 0); - const int output_width = ArraySize(output_dims, 1); - const int output_height = ArraySize(output_dims, 2); + const int stride_width = params.stride_width; + const int stride_height = params.stride_height; + const int pad_width = params.padding_values.width; + const int pad_height = params.padding_values.height; + TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4); + TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4); + + const int batches = MatchingDim(input_shape, 0, output_shape, 0); + const int input_depth = input_shape.Dims(3); + const int input_width = input_shape.Dims(2); + const int input_height = input_shape.Dims(1); + const int output_depth = output_shape.Dims(3); + const int output_width = output_shape.Dims(2); + const int output_height = output_shape.Dims(1); int buffer_id = 0; // Loop over the output nodes. @@ -1752,252 +1808,241 @@ void Im2col(const T* input_data, const Dims<4>& input_dims, int stride_width, for (int h = 0; h < output_height; ++h) { for (int w = 0; w < output_width; ++w) { ExtractPatchIntoBufferColumn( - input_dims, w, h, b, kheight, kwidth, stride_width, stride_height, + input_shape, w, h, b, kheight, kwidth, stride_width, stride_height, pad_width, pad_height, input_width, input_height, input_depth, - output_depth, buffer_id, input_data, output_data, byte_zero); + output_depth, buffer_id, input_data, output_data, zero_byte); ++buffer_id; } } } } -// legacy, for compatibility with old checked-in code -template <typename T> -void Im2col(const T* input_data, const Dims<4>& input_dims, int stride, - int pad_width, int pad_height, int kheight, int kwidth, - uint8 byte_zero, T* output_data, const Dims<4>& output_dims) { - Im2col(input_data, input_dims, stride, stride, pad_width, pad_height, kheight, - kwidth, byte_zero, output_data, output_dims); -} - -inline void DilatedConv(const float* input_data, const Dims<4>& input_dims, - const float* filter_data, const Dims<4>& filter_dims, - const float* bias_data, const Dims<4>& bias_dims, - int stride_width, int stride_height, - int dilation_width_factor, int dilation_height_factor, - int pad_width, int pad_height, - float output_activation_min, - float output_activation_max, float* output_data, - const Dims<4>& output_dims, float* im2col_data, - const Dims<4>& im2col_dims) { - gemmlowp::ScopedProfilingLabel label("DilatedConv"); - // This is a copy of the reference Conv implementation. We do not currently - // have an optimized path for dilation. - (void)im2col_data; // only used in optimized code. - (void)im2col_dims; // only used in optimized code. 
- const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); - const int input_depth = MatchingArraySize(input_dims, 0, filter_dims, 0); - const int output_depth = MatchingArraySize(filter_dims, 3, output_dims, 0); - if (bias_data) { - TFLITE_DCHECK_EQ(ArraySize(filter_dims, 3), ArraySize(bias_dims, 0)); - } - const int input_height = ArraySize(input_dims, 2); - const int input_width = ArraySize(input_dims, 1); - const int filter_height = ArraySize(filter_dims, 2); - const int filter_width = ArraySize(filter_dims, 1); - const int output_height = ArraySize(output_dims, 2); - const int output_width = ArraySize(output_dims, 1); - for (int batch = 0; batch < batches; ++batch) { - for (int out_y = 0; out_y < output_height; ++out_y) { - for (int out_x = 0; out_x < output_width; ++out_x) { - for (int out_channel = 0; out_channel < output_depth; ++out_channel) { - const int in_x_origin = (out_x * stride_width) - pad_width; - const int in_y_origin = (out_y * stride_height) - pad_height; - float total = 0.f; - for (int filter_y = 0; filter_y < filter_height; ++filter_y) { - for (int filter_x = 0; filter_x < filter_width; ++filter_x) { - for (int in_channel = 0; in_channel < input_depth; ++in_channel) { - const int in_x = in_x_origin + dilation_width_factor * filter_x; - const int in_y = - in_y_origin + dilation_height_factor * filter_y; - // If the location is outside the bounds of the input image, - // use zero as a default value. - if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) && - (in_y < input_height)) { - float input_value = input_data[Offset(input_dims, in_channel, - in_x, in_y, batch)]; - float filter_value = - filter_data[Offset(filter_dims, in_channel, filter_x, - filter_y, out_channel)]; - total += (input_value * filter_value); - } - } - } - } - float bias_value = 0.0f; - if (bias_data) { - bias_value = bias_data[Offset(bias_dims, out_channel, 0, 0, 0)]; - } - output_data[Offset(output_dims, out_channel, out_x, out_y, batch)] = - ActivationFunctionWithMinMax(total + bias_value, - output_activation_min, - output_activation_max); - } - } - } - } -} - -inline void Conv(const float* input_data, const Dims<4>& input_dims, - const float* filter_data, const Dims<4>& filter_dims, - const float* bias_data, const Dims<4>& bias_dims, - int stride_width, int stride_height, int dilation_width_factor, - int dilation_height_factor, int pad_width, int pad_height, - float output_activation_min, float output_activation_max, - float* output_data, const Dims<4>& output_dims, - float* im2col_data, const Dims<4>& im2col_dims) { - if ((dilation_width_factor != 1) || (dilation_height_factor != 1)) { - return DilatedConv(input_data, input_dims, filter_data, filter_dims, - bias_data, bias_dims, stride_width, stride_height, - dilation_width_factor, dilation_height_factor, pad_width, - pad_height, output_activation_min, output_activation_max, - output_data, output_dims, im2col_data, im2col_dims); - } +inline void Conv(const ConvParams& params, const RuntimeShape& input_shape, + const float* input_data, const RuntimeShape& filter_shape, + const float* filter_data, const RuntimeShape& bias_shape, + const float* bias_data, const RuntimeShape& output_shape, + float* output_data, const RuntimeShape& im2col_shape, + float* im2col_data) { + const int stride_width = params.stride_width; + const int stride_height = params.stride_height; + const int dilation_width_factor = params.dilation_width_factor; + const int dilation_height_factor = params.dilation_height_factor; + const float output_activation_min = 
params.float_activation_min; + const float output_activation_max = params.float_activation_max; + TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4); + TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4); + TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4); (void)im2col_data; - (void)im2col_dims; + (void)im2col_shape; gemmlowp::ScopedProfilingLabel label("Conv"); + // NB: static_cast<float>(0x00000000h) == 0.0f + const uint8 float_zero_byte = 0x00; const float* gemm_input_data = nullptr; - const Dims<4>* gemm_input_dims = nullptr; - const int filter_width = ArraySize(filter_dims, 1); - const int filter_height = ArraySize(filter_dims, 2); + const RuntimeShape* gemm_input_shape = nullptr; + const int filter_width = filter_shape.Dims(2); + const int filter_height = filter_shape.Dims(1); + const bool need_dilated_im2col = + dilation_width_factor != 1 || dilation_height_factor != 1; const bool need_im2col = stride_width != 1 || stride_height != 1 || filter_width != 1 || filter_height != 1; - if (need_im2col) { + if (need_dilated_im2col) { + DilatedIm2col(params, float_zero_byte, input_shape, input_data, + filter_shape, output_shape, im2col_data); + gemm_input_data = im2col_data; + gemm_input_shape = &im2col_shape; + } else if (need_im2col) { TFLITE_DCHECK(im2col_data); - Im2col(input_data, input_dims, stride_width, stride_height, pad_width, - pad_height, filter_height, filter_width, 0, im2col_data, - im2col_dims); + Im2col(params, filter_height, filter_width, float_zero_byte, input_shape, + input_data, im2col_shape, im2col_data); gemm_input_data = im2col_data; - gemm_input_dims = &im2col_dims; + gemm_input_shape = &im2col_shape; } else { // TODO(aselle): We need to make sure to not send im2col if it is not // needed. TFLITE_DCHECK(!im2col_data); gemm_input_data = input_data; - gemm_input_dims = &input_dims; + gemm_input_shape = &input_shape; } const auto im2col_matrix_map = - MapAsMatrixWithFirstDimAsRows(gemm_input_data, *gemm_input_dims); + MapAsMatrixWithLastDimAsRows(gemm_input_data, *gemm_input_shape); const auto filter_matrix_map = - MapAsMatrixWithLastDimAsCols(filter_data, filter_dims); + MapAsMatrixWithFirstDimAsCols(filter_data, filter_shape); auto output_matrix_map = - MapAsMatrixWithFirstDimAsRows(output_data, output_dims); + MapAsMatrixWithLastDimAsRows(output_data, output_shape); Gemm(filter_matrix_map.transpose(), im2col_matrix_map, &output_matrix_map); - AddBiasAndEvalActivationFunction(bias_data, bias_dims, output_data, - output_dims, output_activation_min, - output_activation_max); -} + AddBiasAndEvalActivationFunction(output_activation_min, output_activation_max, + bias_shape, bias_data, output_shape, + output_data); +} + +inline void HybridConv(const ConvParams& params, float* scaling_factors_ptr, + const RuntimeShape& input_shape, + const int8_t* input_data, + const RuntimeShape& filter_shape, + const int8_t* filter_data, + const RuntimeShape& bias_shape, const float* bias_data, + const RuntimeShape& output_shape, float* output_data, + const RuntimeShape& im2col_shape, int8_t* im2col_data) { + const int stride_width = params.stride_width; + const int stride_height = params.stride_height; + const float output_activation_min = params.float_activation_min; + const float output_activation_max = params.float_activation_max; + TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4); + TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4); + TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4); + + const int batch_size = input_shape.Dims(0); + const int filter_width = 
filter_shape.Dims(2); + const int filter_height = filter_shape.Dims(1); + + const int8_t* gemm_input_data = nullptr; + int num_input; + const bool need_im2col = stride_width != 1 || stride_height != 1 || + filter_width != 1 || filter_height != 1; -template <FusedActivationFunctionType Ac> -void Conv(const float* input_data, const Dims<4>& input_dims, - const float* filter_data, const Dims<4>& filter_dims, - const float* bias_data, const Dims<4>& bias_dims, int stride_width, - int stride_height, int dilation_width_factor, - int dilation_height_factor, int pad_width, int pad_height, - float* output_data, const Dims<4>& output_dims, float* im2col_data, - const Dims<4>& im2col_dims) { - float output_activation_min, output_activation_max; - GetActivationMinMax(Ac, &output_activation_min, &output_activation_max); - Conv(input_data, input_dims, filter_data, filter_dims, bias_data, bias_dims, - stride_width, stride_height, dilation_width_factor, - dilation_height_factor, pad_width, pad_height, output_activation_min, - output_activation_max, output_data, output_dims, im2col_data, - im2col_dims); -} + if (need_im2col) { + TFLITE_DCHECK(im2col_data); + // symmetric quantization assumes zero point of 0. + const int input_zero_point = 0; -// legacy, for compatibility with old checked-in code -template <FusedActivationFunctionType Ac> -void Conv(const float* input_data, const Dims<4>& input_dims, - const float* filter_data, const Dims<4>& filter_dims, - const float* bias_data, const Dims<4>& bias_dims, int stride_width, - int stride_height, int pad_width, int pad_height, float* output_data, - const Dims<4>& output_dims, float* im2col_data, - const Dims<4>& im2col_dims) { - float output_activation_min, output_activation_max; - GetActivationMinMax(Ac, &output_activation_min, &output_activation_max); - Conv(input_data, input_dims, filter_data, filter_dims, bias_data, bias_dims, - stride_width, stride_height, 1, 1, pad_width, pad_height, - output_activation_min, output_activation_max, output_data, output_dims, - im2col_data, im2col_dims); -} + Im2col(params, filter_height, filter_width, input_zero_point, input_shape, + input_data, im2col_shape, im2col_data); + gemm_input_data = im2col_data; + num_input = im2col_shape.FlatSize(); + } else { + TFLITE_DCHECK(!im2col_data); + gemm_input_data = input_data; + num_input = input_shape.FlatSize(); + } -// legacy, for compatibility with old checked-in code -template <FusedActivationFunctionType Ac> -void Conv(const float* input_data, const Dims<4>& input_dims, - const float* filter_data, const Dims<4>& filter_dims, - const float* bias_data, const Dims<4>& bias_dims, int stride, - int pad_width, int pad_height, float* output_data, - const Dims<4>& output_dims, float* im2col_data, - const Dims<4>& im2col_dims) { - Conv<Ac>(input_data, input_dims, filter_data, filter_dims, bias_data, - bias_dims, stride, stride, 1, 1, pad_width, pad_height, output_data, - output_dims, im2col_data, im2col_dims); -} + // Flatten 4D matrices into 2D matrices for matrix multiplication. 
-inline void Conv(const uint8* input_data, const Dims<4>& input_dims, - int32 input_offset, const uint8* filter_data, - const Dims<4>& filter_dims, int32 filter_offset, - const int32* bias_data, const Dims<4>& bias_dims, - int stride_width, int stride_height, int pad_width, - int pad_height, int32 output_offset, int32 output_multiplier, - int output_shift, int32 output_activation_min, - int32 output_activation_max, uint8* output_data, - const Dims<4>& output_dims, uint8* im2col_data, - const Dims<4>& im2col_dims, - gemmlowp::GemmContext* gemm_context) { - gemmlowp::ScopedProfilingLabel label("Conv/8bit"); + // Flatten so that each filter has its own row. + const int filter_rows = filter_shape.Dims(0); + const int filter_cols = FlatSizeSkipDim(filter_shape, 0); - TFLITE_DCHECK(IsPackedWithoutStrides(input_dims)); - TFLITE_DCHECK(IsPackedWithoutStrides(filter_dims)); - TFLITE_DCHECK(IsPackedWithoutStrides(output_dims)); + // In MatrixBatchVectorMultiplyAccumulate, each output value is the + // dot product of one row of the first matrix with one row of the second + // matrix. Therefore, the number of cols in each matrix are equivalent. + // + // After Im2Col, each input patch becomes a row. + const int gemm_input_cols = filter_cols; + const int gemm_input_rows = num_input / gemm_input_cols; + + const int output_cols = output_shape.Dims(3); + const int output_rows = FlatSizeSkipDim(output_shape, 3); + TFLITE_DCHECK_EQ(output_cols, filter_rows); + TFLITE_DCHECK_EQ(output_rows, gemm_input_rows); + TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_cols); + + // MatrixBatchVectorMultiplyAccumulate assumes that each row of the second + // input matrix has its own scale factor. This code duplicates the scale + // factors for each row in the same batch. + const int rows_per_batch = gemm_input_rows / batch_size; + for (int i = gemm_input_rows - 1; i >= 0; --i) { + scaling_factors_ptr[i] = scaling_factors_ptr[i / rows_per_batch]; + } + + tensor_utils::ZeroVector(output_data, output_rows * output_cols); + + tensor_utils::MatrixBatchVectorMultiplyAccumulate( + filter_data, filter_rows, filter_cols, gemm_input_data, + scaling_factors_ptr, /*n_batch=*/gemm_input_rows, output_data, + /*result_stride=*/1); + + AddBiasAndEvalActivationFunction(output_activation_min, output_activation_max, + bias_shape, bias_data, output_shape, + output_data); +} + +inline void Conv(const ConvParams& params, const RuntimeShape& input_shape, + const uint8* input_data, const RuntimeShape& filter_shape, + const uint8* filter_data, const RuntimeShape& bias_shape, + const int32* bias_data, const RuntimeShape& output_shape, + uint8* output_data, const RuntimeShape& im2col_shape, + uint8* im2col_data, gemmlowp::GemmContext* gemm_context) { + gemmlowp::ScopedProfilingLabel label("Conv/8bit"); + const int stride_width = params.stride_width; + const int stride_height = params.stride_height; + const int dilation_width_factor = params.dilation_width_factor; + const int dilation_height_factor = params.dilation_height_factor; + const int32 input_offset = params.input_offset; + const int32 filter_offset = params.weights_offset; + const int32 output_offset = params.output_offset; + const int32 output_multiplier = params.output_multiplier; + const int output_shift = params.output_shift; + const int32 output_activation_min = params.quantized_activation_min; + const int32 output_activation_max = params.quantized_activation_max; + TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4); + TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4); + 
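For reference, here is a naive sketch of the contract HybridConv leans on above, as described by its comments: int8 inputs and filters are accumulated as int32 dot products, then scaled back to float with one scaling factor per im2col row before bias and activation are applied. This is an assumption-level illustration, not the optimized MatrixBatchVectorMultiplyAccumulate used in the code.

#include <cstdint>

// out holds input_rows * filter_rows floats, with the filter index varying
// fastest (matching an NHWC output whose depth equals filter_rows).
void HybridMatmulSketch(const int8_t* filters, int filter_rows, int filter_cols,
                        const int8_t* patches, int input_rows,
                        const float* per_row_scales, float* out) {
  for (int r = 0; r < input_rows; ++r) {
    for (int f = 0; f < filter_rows; ++f) {
      int32_t acc = 0;
      for (int c = 0; c < filter_cols; ++c) {
        acc += static_cast<int32_t>(filters[f * filter_cols + c]) *
               static_cast<int32_t>(patches[r * filter_cols + c]);
      }
      out[r * filter_rows + f] += per_row_scales[r] * static_cast<float>(acc);
    }
  }
}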
TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4); const uint8* gemm_input_data = nullptr; - const Dims<4>* gemm_input_dims = nullptr; - const int filter_width = ArraySize(filter_dims, 1); - const int filter_height = ArraySize(filter_dims, 2); + const RuntimeShape* gemm_input_shape = nullptr; + const int filter_width = filter_shape.Dims(2); + const int filter_height = filter_shape.Dims(1); + const bool need_dilated_im2col = + dilation_width_factor != 1 || dilation_height_factor != 1; const bool need_im2col = stride_width != 1 || stride_height != 1 || filter_width != 1 || filter_height != 1; - if (need_im2col) { + if (need_dilated_im2col) { TFLITE_DCHECK(im2col_data); const int input_zero_point = -input_offset; TFLITE_DCHECK_GE(input_zero_point, 0); TFLITE_DCHECK_LE(input_zero_point, 255); - Im2col(input_data, input_dims, stride_width, stride_height, pad_width, - pad_height, filter_height, filter_width, input_zero_point, - im2col_data, im2col_dims); + DilatedIm2col(params, input_zero_point, input_shape, input_data, + filter_shape, output_shape, im2col_data); gemm_input_data = im2col_data; - gemm_input_dims = &im2col_dims; + gemm_input_shape = &im2col_shape; + } else if (need_im2col) { + TFLITE_DCHECK(im2col_data); + const int input_zero_point = -input_offset; + TFLITE_DCHECK_GE(input_zero_point, 0); + TFLITE_DCHECK_LE(input_zero_point, 255); + Im2col(params, filter_height, filter_width, input_zero_point, input_shape, + input_data, im2col_shape, im2col_data); + gemm_input_data = im2col_data; + gemm_input_shape = &im2col_shape; } else { TFLITE_DCHECK(!im2col_data); gemm_input_data = input_data; - gemm_input_dims = &input_dims; - } - - const int gemm_input_rows = gemm_input_dims->sizes[0]; - const int gemm_input_cols = FlatSizeSkipDim(*gemm_input_dims, 0); - const int filter_rows = filter_dims.sizes[3]; - const int filter_cols = FlatSizeSkipDim(filter_dims, 3); - const int output_rows = output_dims.sizes[0]; - const int output_cols = FlatSizeSkipDim(output_dims, 0); + gemm_input_shape = &input_shape; + } + + const int gemm_input_rows = gemm_input_shape->Dims(3); + // Using FlatSizeSkipDim causes segfault in some contexts (see b/79927784). + // The root cause has not yet been identified though. Same applies below for + // the other calls commented out. This is a partial rollback of cl/196819423. + // const int gemm_input_cols = FlatSizeSkipDim(*gemm_input_shape, 3); + const int gemm_input_cols = gemm_input_shape->Dims(0) * + gemm_input_shape->Dims(1) * + gemm_input_shape->Dims(2); + const int filter_rows = filter_shape.Dims(0); + // See b/79927784. + // const int filter_cols = FlatSizeSkipDim(filter_shape, 0); + const int filter_cols = + filter_shape.Dims(1) * filter_shape.Dims(2) * filter_shape.Dims(3); + const int output_rows = output_shape.Dims(3); + // See b/79927784. 
+ // const int output_cols = FlatSizeSkipDim(output_shape, 3); + const int output_cols = + output_shape.Dims(0) * output_shape.Dims(1) * output_shape.Dims(2); TFLITE_DCHECK_EQ(output_rows, filter_rows); TFLITE_DCHECK_EQ(output_cols, gemm_input_cols); TFLITE_DCHECK_EQ(filter_cols, gemm_input_rows); - TFLITE_DCHECK_EQ(bias_dims.sizes[0], output_rows); - TFLITE_DCHECK_EQ(bias_dims.sizes[1], 1); - TFLITE_DCHECK_EQ(bias_dims.sizes[2], 1); - TFLITE_DCHECK_EQ(bias_dims.sizes[3], 1); + TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_rows); gemmlowp::MatrixMap<const uint8, gemmlowp::MapOrder::RowMajor> filter_matrix( filter_data, filter_rows, filter_cols); gemmlowp::MatrixMap<const uint8, gemmlowp::MapOrder::ColMajor> input_matrix( gemm_input_data, gemm_input_rows, gemm_input_cols); gemmlowp::MatrixMap<uint8, gemmlowp::MapOrder::ColMajor> output_matrix( output_data, output_rows, output_cols); - const auto& output_pipeline = GemmlowpOutputPipeline::Make( + const auto& output_pipeline = GemmlowpOutputPipeline::MakeExp( bias_data, output_rows, output_offset, output_multiplier, output_shift, output_activation_min, output_activation_max); gemmlowp::GemmWithOutputPipeline<uint8, uint8, @@ -2006,78 +2051,35 @@ inline void Conv(const uint8* input_data, const Dims<4>& input_dims, input_offset, output_pipeline); } -// legacy, for compatibility with old checked-in code -template <FusedActivationFunctionType Ac> -inline void Conv(const uint8* input_data, const Dims<4>& input_dims, - int32 input_offset, const uint8* filter_data, - const Dims<4>& filter_dims, int32 filter_offset, - const int32* bias_data, const Dims<4>& bias_dims, - int stride_width, int stride_height, int pad_width, - int pad_height, int32 output_offset, int32 output_multiplier, - int output_shift, int32 output_activation_min, - int32 output_activation_max, uint8* output_data, - const Dims<4>& output_dims, uint8* im2col_data, - const Dims<4>& im2col_dims, - gemmlowp::GemmContext* gemm_context) { - static_assert(Ac == FusedActivationFunctionType::kNone || - Ac == FusedActivationFunctionType::kRelu || - Ac == FusedActivationFunctionType::kRelu6 || - Ac == FusedActivationFunctionType::kRelu1, - ""); - if (Ac == FusedActivationFunctionType::kNone) { - TFLITE_DCHECK_EQ(output_activation_min, 0); - TFLITE_DCHECK_EQ(output_activation_max, 255); - } - Conv(input_data, input_dims, input_offset, filter_data, filter_dims, - filter_offset, bias_data, bias_dims, stride_width, stride_height, - pad_width, pad_height, output_offset, output_multiplier, output_shift, - output_activation_min, output_activation_max, output_data, output_dims, - im2col_data, im2col_dims, gemm_context); -} - -// legacy, for compatibility with old checked-in code -template <FusedActivationFunctionType Ac> -void Conv(const uint8* input_data, const Dims<4>& input_dims, - int32 input_offset, const uint8* filter_data, - const Dims<4>& filter_dims, int32 filter_offset, - const int32* bias_data, const Dims<4>& bias_dims, int stride, - int pad_width, int pad_height, int32 output_offset, - int32 output_multiplier, int output_shift, - int32 output_activation_min, int32 output_activation_max, - uint8* output_data, const Dims<4>& output_dims, uint8* im2col_data, - const Dims<4>& im2col_dims, gemmlowp::GemmContext* gemm_context) { - static_assert(Ac == FusedActivationFunctionType::kNone || - Ac == FusedActivationFunctionType::kRelu || - Ac == FusedActivationFunctionType::kRelu6 || - Ac == FusedActivationFunctionType::kRelu1, - ""); - Conv(input_data, input_dims, input_offset, filter_data, 
filter_dims, - filter_offset, bias_data, bias_dims, stride, stride, pad_width, - pad_height, output_offset, output_multiplier, output_shift, - output_activation_min, output_activation_max, output_data, output_dims, - im2col_data, im2col_dims, gemm_context); -} - template <typename T> -inline void DepthToSpace(const T* input_data, const Dims<4>& input_dims, - int block_size, T* output_data, - const Dims<4>& output_dims) { +inline void DepthToSpace(const tflite::DepthToSpaceParams& op_params, + const RuntimeShape& unextended_input_shape, + const T* input_data, + const RuntimeShape& unextended_output_shape, + T* output_data) { gemmlowp::ScopedProfilingLabel label("DepthToSpace"); - const int input_depth = ArraySize(input_dims, 0); - const int input_width = ArraySize(input_dims, 1); - const int input_height = ArraySize(input_dims, 2); + TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4); + TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4); + const RuntimeShape input_shape = + RuntimeShape::ExtendedShape(4, unextended_input_shape); + const RuntimeShape output_shape = + RuntimeShape::ExtendedShape(4, unextended_output_shape); - const int output_depth = ArraySize(output_dims, 0); - const int batch_size = ArraySize(output_dims, 3); + const int input_depth = input_shape.Dims(3); + const int input_width = input_shape.Dims(2); + const int input_height = input_shape.Dims(1); + + const int output_depth = output_shape.Dims(3); + const int batch_size = output_shape.Dims(0); // Number of continuous values that we can copy in one interation. - const int stride = block_size * output_depth; + const int stride = op_params.block_size * output_depth; for (int batch = 0; batch < batch_size; ++batch) { for (int in_h = 0; in_h < input_height; ++in_h) { - const T* input_ptr = input_data + Offset(input_dims, 0, 0, in_h, batch); - for (int offset_h = 0; offset_h < block_size; ++offset_h) { + const T* input_ptr = input_data + Offset(input_shape, batch, in_h, 0, 0); + for (int offset_h = 0; offset_h < op_params.block_size; ++offset_h) { const T* src = input_ptr; for (int in_w = 0; in_w < input_width; ++in_w) { memcpy(output_data, src, stride * sizeof(T)); @@ -2090,100 +2092,35 @@ inline void DepthToSpace(const T* input_data, const Dims<4>& input_dims, } } -// legacy, for compatibility with old checked-in code -template <FusedActivationFunctionType Ac, typename T> -void Im2col(const T* input_data, const Dims<4>& input_dims, int stride, - int pad_width, int pad_height, int kheight, int kwidth, - uint8 byte_zero, T* output_data, const Dims<4>& output_dims) { - Im2col(input_data, input_dims, stride, stride, pad_width, pad_height, kheight, - kwidth, byte_zero, output_data, output_dims); -} - -// legacy, for compatibility with old checked-in code -template <FusedActivationFunctionType Ac> -void ConvAsGemm(const float* input_data, const Dims<4>& input_dims, - const float* filter_data, const Dims<4>& filter_dims, - const float* bias_data, const Dims<4>& bias_dims, - float* output_data, const Dims<4>& output_dims) { - gemmlowp::ScopedProfilingLabel label("ConvAsGemm"); - - const auto input_matrix_map = - MapAsMatrixWithFirstDimAsRows(input_data, input_dims); - const auto filter_matrix_map = - MapAsMatrixWithLastDimAsCols(filter_data, filter_dims); - auto output_matrix_map = - MapAsMatrixWithFirstDimAsRows(output_data, output_dims); - - Gemm(filter_matrix_map.transpose(), input_matrix_map, &output_matrix_map); - - AddBiasAndEvalActivationFunction<Ac>(bias_data, bias_dims, output_data, - output_dims); -} - 
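As a readability aid for the block-copy loop in the new DepthToSpace above, here is the per-element index mapping it implements, written out naively (a sketch assuming NHWC layout and block_size * block_size dividing the input depth evenly). The optimized version reaches the same result by memcpy-ing block_size * output_depth contiguous values per step.

// Reference DepthToSpace index mapping, NHWC layout.
template <typename T>
void DepthToSpaceReference(const T* in, T* out, int batches, int in_height,
                           int in_width, int in_depth, int block) {
  const int out_depth = in_depth / (block * block);
  const int out_height = in_height * block;
  const int out_width = in_width * block;
  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < out_height; ++y) {
      for (int x = 0; x < out_width; ++x) {
        for (int c = 0; c < out_depth; ++c) {
          const int in_c = ((y % block) * block + (x % block)) * out_depth + c;
          const int in_idx =
              ((b * in_height + y / block) * in_width + x / block) * in_depth +
              in_c;
          const int out_idx =
              ((b * out_height + y) * out_width + x) * out_depth + c;
          out[out_idx] = in[in_idx];
        }
      }
    }
  }
}

SpaceToDepth, a little further down, is the inverse of this mapping.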
-// legacy, for compatibility with old checked-in code -template <FusedActivationFunctionType Ac> -void ConvAsGemm(const uint8* input_data, const Dims<4>& input_dims, - int32 input_offset, const uint8* filter_data, - const Dims<4>& filter_dims, int32 filter_offset, - const int32* bias_data, const Dims<4>& bias_dims, - int32 output_offset, int32 output_multiplier, int output_shift, - int32 output_activation_min, int32 output_activation_max, - uint8* output_data, const Dims<4>& output_dims, - gemmlowp::GemmContext* gemm_context) { - gemmlowp::ScopedProfilingLabel label("ConvAsGemm/8bit"); - static_assert(Ac == FusedActivationFunctionType::kNone || - Ac == FusedActivationFunctionType::kRelu || - Ac == FusedActivationFunctionType::kRelu6 || - Ac == FusedActivationFunctionType::kRelu1, - ""); - const int input_rows = input_dims.sizes[0]; - const int input_cols = FlatSizeSkipDim(input_dims, 0); - const int filter_rows = filter_dims.sizes[3]; - const int filter_cols = FlatSizeSkipDim(filter_dims, 3); - const int output_rows = output_dims.sizes[0]; - const int output_cols = FlatSizeSkipDim(output_dims, 0); - TFLITE_DCHECK_EQ(output_rows, filter_rows); - TFLITE_DCHECK_EQ(output_cols, input_cols); - TFLITE_DCHECK_EQ(filter_cols, input_rows); - TFLITE_DCHECK_EQ(bias_dims.sizes[0], output_rows); - TFLITE_DCHECK_EQ(bias_dims.sizes[1], 1); - TFLITE_DCHECK_EQ(bias_dims.sizes[2], 1); - TFLITE_DCHECK_EQ(bias_dims.sizes[3], 1); - gemmlowp::MatrixMap<const uint8, gemmlowp::MapOrder::RowMajor> filter_matrix( - filter_data, output_rows, filter_cols, filter_cols); - gemmlowp::MatrixMap<const uint8, gemmlowp::MapOrder::ColMajor> input_matrix( - input_data, filter_cols, output_cols, filter_cols); - gemmlowp::MatrixMap<uint8, gemmlowp::MapOrder::ColMajor> output_matrix( - output_data, output_rows, output_cols, output_rows); - const auto& output_pipeline = GemmlowpOutputPipeline::Make( - bias_data, output_rows, output_offset, output_multiplier, output_shift, - output_activation_min, output_activation_max); - gemmlowp::GemmWithOutputPipeline<uint8, uint8, - gemmlowp::L8R8WithLhsNonzeroBitDepthParams>( - gemm_context, filter_matrix, input_matrix, &output_matrix, filter_offset, - input_offset, output_pipeline); -} - template <typename T> -inline void SpaceToDepth(const T* input_data, const Dims<4>& input_dims, - int block_size, T* output_data, - const Dims<4>& output_dims) { +inline void SpaceToDepth(const tflite::SpaceToDepthParams& op_params, + const RuntimeShape& unextended_input_shape, + const T* input_data, + const RuntimeShape& unextended_output_shape, + T* output_data) { gemmlowp::ScopedProfilingLabel label("SpaceToDepth"); - const int output_depth = ArraySize(output_dims, 0); - const int output_width = ArraySize(output_dims, 1); - const int output_height = ArraySize(output_dims, 2); + TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4); + TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4); + const RuntimeShape input_shape = + RuntimeShape::ExtendedShape(4, unextended_input_shape); + const RuntimeShape output_shape = + RuntimeShape::ExtendedShape(4, unextended_output_shape); + + const int output_depth = output_shape.Dims(3); + const int output_width = output_shape.Dims(2); + const int output_height = output_shape.Dims(1); - const int input_depth = ArraySize(input_dims, 0); - const int batch_size = ArraySize(input_dims, 3); + const int input_depth = input_shape.Dims(3); + const int batch_size = input_shape.Dims(0); // Number of continuous values that we can copy in one interation. 
- const int stride = block_size * input_depth; + const int stride = op_params.block_size * input_depth; for (int batch = 0; batch < batch_size; ++batch) { for (int out_h = 0; out_h < output_height; ++out_h) { - T* output_ptr = output_data + Offset(output_dims, 0, 0, out_h, batch); - for (int offset_h = 0; offset_h < block_size; ++offset_h) { + T* output_ptr = output_data + Offset(output_shape, batch, out_h, 0, 0); + for (int offset_h = 0; offset_h < op_params.block_size; ++offset_h) { T* dst = output_ptr; for (int out_w = 0; out_w < output_width; ++out_w) { memcpy(dst, input_data, stride * sizeof(T)); @@ -2196,95 +2133,26 @@ inline void SpaceToDepth(const T* input_data, const Dims<4>& input_dims, } } -template <FusedActivationFunctionType Ac> -void NonGlobalBatchNormalization( - const float* input_data, const Dims<4>& input_dims, const float* mean_data, - const Dims<4>& mean_dims, const float* multiplier_data, - const Dims<4>& multiplier_dims, const float* offset_data, - const Dims<4>& offset_dims, float* output_data, - const Dims<4>& output_dims) { - gemmlowp::ScopedProfilingLabel label("NonGlobalBatchNormalization"); - const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); - const int inner_size = MatchingFlatSizeSkipDim( - input_dims, 3, mean_dims, multiplier_dims, offset_dims, output_dims); - - for (int b = 0; b < batches; ++b) { - for (int i = 0; i < inner_size; ++i) { - *output_data = ActivationFunction<Ac>( - (*input_data - mean_data[i]) * multiplier_data[i] + offset_data[i]); - ++output_data; - ++input_data; - } - } -} - -template <FusedActivationFunctionType Ac> -void GlobalBatchNormalization(const float* input_data, - const Dims<4>& input_dims, const float* mean_data, - const Dims<4>& mean_dims, - const float* multiplier_data, - const Dims<4>& multiplier_dims, - const float* offset_data, - const Dims<4>& offset_dims, float* output_data, - const Dims<4>& output_dims) { - gemmlowp::ScopedProfilingLabel label("GlobalBatchNormalization"); - const int outer_size = MatchingFlatSizeSkipDim(input_dims, 0, output_dims); - const int depth = - MatchingArraySize(input_dims, 0, mean_dims, 0, multiplier_dims, 0, - offset_dims, 0, output_dims, 0); - - for (int i = 0; i < outer_size; ++i) { - for (int c = 0; c < depth; ++c) { - *output_data = ActivationFunction<Ac>( - (*input_data - mean_data[c]) * multiplier_data[c] + offset_data[c]); - ++output_data; - ++input_data; - } - } -} - -inline void Relu(const float* input_data, const Dims<4>& input_dims, - float* output_data, const Dims<4>& output_dims) { +inline void Relu(const RuntimeShape& input_shape, const float* input_data, + const RuntimeShape& output_shape, float* output_data) { gemmlowp::ScopedProfilingLabel label("Relu (not fused)"); - const auto input = MapAsVector(input_data, input_dims); - auto output = MapAsVector(output_data, output_dims); + const auto input = MapAsVector(input_data, input_shape); + auto output = MapAsVector(output_data, output_shape); output = input.cwiseMax(0.0f); } -inline void Relu1(const float* input_data, const Dims<4>& input_dims, - float* output_data, const Dims<4>& output_dims) { - gemmlowp::ScopedProfilingLabel label("Relu1 (not fused)"); - const int flat_size = MatchingFlatSize(input_dims, output_dims); - for (int i = 0; i < flat_size; ++i) { - const float val = input_data[i]; - const float upper = 1; - const float lower = -1; - const float clamped = val > upper ? upper : val < lower ? 
lower : val; - output_data[i] = clamped; - } -} - -inline void Relu6(const float* input_data, const Dims<4>& input_dims, - float* output_data, const Dims<4>& output_dims) { - gemmlowp::ScopedProfilingLabel label("Relu6 (not fused)"); - const int flat_size = MatchingFlatSize(input_dims, output_dims); - for (int i = 0; i < flat_size; ++i) { - const float val = input_data[i]; - const float upper = 6; - const float lower = 0; - const float clamped = val > upper ? upper : val < lower ? lower : val; - output_data[i] = clamped; - } -} - -template <FusedActivationFunctionType Ac> -void L2Normalization(const float* input_data, const Dims<4>& input_dims, - float* output_data, const Dims<4>& output_dims) { +inline void L2Normalization(const tflite::L2NormalizationParams& op_params, + const RuntimeShape& input_shape, + const float* input_data, + const RuntimeShape& output_shape, + float* output_data) { gemmlowp::ScopedProfilingLabel label("L2Normalization"); - static_assert(Ac == FusedActivationFunctionType::kNone, ""); - const int outer_size = MatchingFlatSizeSkipDim(input_dims, 0, output_dims); - const int depth = MatchingArraySize(input_dims, 0, output_dims, 0); + const int trailing_dim = input_shape.DimensionsCount() - 1; + const int outer_size = + MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape); + const int depth = + MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim); for (int i = 0; i < outer_size; ++i) { float squared_l2_norm = 0; for (int c = 0; c < depth; ++c) { @@ -2300,15 +2168,17 @@ void L2Normalization(const float* input_data, const Dims<4>& input_dims, } } -inline void GetInvSqrtQuantizedMultiplier(int32 input, int32* output_inv_sqrt, - int* output_shift) { +inline void GetInvSqrtQuantizedMultiplierExp(int32 input, + int32* output_inv_sqrt, + int* output_shift) { *output_shift = 11; while (input >= (1 << 29)) { input /= 4; ++*output_shift; } TFLITE_DCHECK_GT(input, 0); - const unsigned max_left_shift_bits = __builtin_clz(input) - 1; + const unsigned max_left_shift_bits = + CountLeadingZeros(static_cast<uint32>(input)) - 1; const unsigned max_left_shift_bit_pairs = max_left_shift_bits / 2; const unsigned left_shift_bit_pairs = max_left_shift_bit_pairs - 1; *output_shift -= left_shift_bit_pairs; @@ -2343,51 +2213,58 @@ inline void GetInvSqrtQuantizedMultiplier(int32 input, int32* output_inv_sqrt, *output_inv_sqrt <<= -*output_shift; *output_shift = 0; } + // Convert right shift (right is positive) to left shift. 
+ *output_shift *= kReverseShift; } -inline void L2Normalization(const uint8* input_data, const Dims<4>& input_dims, - int32 input_zero_point, uint8* output_data, - const Dims<4>& output_dims) { +inline void L2Normalization(const tflite::L2NormalizationParams& op_params, + const RuntimeShape& input_shape, + const uint8* input_data, + const RuntimeShape& output_shape, + uint8* output_data) { gemmlowp::ScopedProfilingLabel label("L2Normalization/8bit"); - TFLITE_DCHECK(IsPackedWithoutStrides(input_dims)); - TFLITE_DCHECK(IsPackedWithoutStrides(output_dims)); - const int depth = MatchingArraySize(input_dims, 0, output_dims, 0); - const int outer_size = MatchingFlatSizeSkipDim(input_dims, 0, output_dims); - TFLITE_DCHECK_EQ(outer_size, 1); - int32 square_l2_norm = 0; - for (int i = 0; i < depth; i++) { - int32 diff = input_data[i] - input_zero_point; - square_l2_norm += diff * diff; - } - int32 inv_l2norm_multiplier; - int inv_l2norm_shift; - GetInvSqrtQuantizedMultiplier(square_l2_norm, &inv_l2norm_multiplier, - &inv_l2norm_shift); - - for (int i = 0; i < depth; i++) { - int32 diff = input_data[i] - input_zero_point; - int32 rescaled_diff = MultiplyByQuantizedMultiplierSmallerThanOne( - 128 * diff, inv_l2norm_multiplier, inv_l2norm_shift); - int32 unclamped_output_val = 128 + rescaled_diff; - int32 output_val = std::min(255, std::max(0, unclamped_output_val)); - output_data[i] = static_cast<uint8>(output_val); + const int trailing_dim = input_shape.DimensionsCount() - 1; + const int depth = + MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim); + const int outer_size = + MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape); + const int32 input_zero_point = op_params.input_zero_point; + for (int i = 0; i < outer_size; ++i) { + int32 square_l2_norm = 0; + for (int c = 0; c < depth; c++) { + // Note that input_data advances by depth in the second pass below. 
+ int32 diff = input_data[c] - input_zero_point; + square_l2_norm += diff * diff; + } + int32 inv_l2norm_multiplier; + int inv_l2norm_shift; + GetInvSqrtQuantizedMultiplierExp(square_l2_norm, &inv_l2norm_multiplier, + &inv_l2norm_shift); + + for (int c = 0; c < depth; c++) { + int32 diff = *input_data - input_zero_point; + int32 rescaled_diff = MultiplyByQuantizedMultiplierSmallerThanOneExp( + 128 * diff, inv_l2norm_multiplier, inv_l2norm_shift); + int32 unclamped_output_val = 128 + rescaled_diff; + int32 output_val = std::min(255, std::max(0, unclamped_output_val)); + *output_data = static_cast<uint8>(output_val); + ++input_data; + ++output_data; + } } } -inline void Add(const float* input1_data, const Dims<4>& input1_dims, - const float* input2_data, const Dims<4>& input2_dims, - float output_activation_min, float output_activation_max, - float* output_data, const Dims<4>& output_dims) { +inline void Add(const ArithmeticParams& params, + const RuntimeShape& input1_shape, const float* input1_data, + const RuntimeShape& input2_shape, const float* input2_data, + const RuntimeShape& output_shape, float* output_data) { gemmlowp::ScopedProfilingLabel label("Add"); - TFLITE_DCHECK(IsPackedWithoutStrides(input1_dims)); - TFLITE_DCHECK(IsPackedWithoutStrides(input2_dims)); - TFLITE_DCHECK(IsPackedWithoutStrides(output_dims)); int i = 0; - const int size = MatchingFlatSize(input1_dims, input2_dims, output_dims); + const int size = MatchingFlatSize(input1_shape, input2_shape, output_shape); #ifdef USE_NEON - const auto activation_min = vdupq_n_f32(output_activation_min); - const auto activation_max = vdupq_n_f32(output_activation_max); + const auto activation_min = vdupq_n_f32(params.float_activation_min); + const auto activation_max = vdupq_n_f32(params.float_activation_max); for (; i <= size - 16; i += 16) { auto a10 = vld1q_f32(input1_data + i); auto a11 = vld1q_f32(input1_data + i + 4); @@ -2426,29 +2303,26 @@ inline void Add(const float* input1_data, const Dims<4>& input1_dims, for (; i < size; i++) { auto x = input1_data[i] + input2_data[i]; - output_data[i] = ActivationFunctionWithMinMax(x, output_activation_min, - output_activation_max); + output_data[i] = ActivationFunctionWithMinMax( + x, params.float_activation_min, params.float_activation_max); } } // Element-wise add that can often be used for inner loop of broadcast add as // well as the non-broadcast add. 
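Before the integer-only AddElementwise below, it may help to state the real-number arithmetic it approximates: with the usual affine uint8 quantization value = scale * (q - zero_point), the op computes out_zp + round((scale1 * (q1 - zp1) + scale2 * (q2 - zp2)) / out_scale), clamped to the activation range; the left_shift, per-input multipliers, and output multiplier are the fixed-point stand-ins for those scales. A floating-point sketch for reference only (not a code path used at runtime):

#include <algorithm>
#include <cmath>
#include <cstdint>

inline uint8_t QuantizedAddSketch(uint8_t q1, int zp1, float scale1,
                                  uint8_t q2, int zp2, float scale2,
                                  int out_zp, float out_scale,
                                  int act_min, int act_max) {
  const float real_sum = scale1 * (q1 - zp1) + scale2 * (q2 - zp2);
  int q = out_zp + static_cast<int>(std::round(real_sum / out_scale));
  q = std::min(act_max, std::max(act_min, q));
  return static_cast<uint8_t>(q);
}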
-inline void AddElementwise(int size, int left_shift, const uint8* input1_data, - int32 input1_offset, int32 input1_multiplier, - int input1_shift, const uint8* input2_data, - int32 input2_offset, int32 input2_multiplier, - int input2_shift, int32 output_offset, - int32 output_multiplier, int output_shift, - int32 output_activation_min, - int32 output_activation_max, uint8* output_data) { +inline void AddElementwise(int size, const ArithmeticParams& params, + const uint8* input1_data, const uint8* input2_data, + uint8* output_data) { int i = 0; - TFLITE_DCHECK_GT(input1_offset, -256); - TFLITE_DCHECK_GT(input2_offset, -256); - TFLITE_DCHECK_LT(input1_offset, 256); - TFLITE_DCHECK_LT(input2_offset, 256); + TFLITE_DCHECK_GT(params.input1_offset, -256); + TFLITE_DCHECK_GT(params.input2_offset, -256); + TFLITE_DCHECK_LT(params.input1_offset, 256); + TFLITE_DCHECK_LT(params.input2_offset, 256); #ifdef USE_NEON - const auto output_activation_min_vector = vdup_n_u8(output_activation_min); - const auto output_activation_max_vector = vdup_n_u8(output_activation_max); + const auto output_activation_min_vector = + vdup_n_u8(params.quantized_activation_min); + const auto output_activation_max_vector = + vdup_n_u8(params.quantized_activation_max); for (; i <= size - 8; i += 8) { const auto input1_val_original = vld1_u8(input1_data + i); const auto input2_val_original = vld1_u8(input2_data + i); @@ -2457,9 +2331,9 @@ inline void AddElementwise(int size, int left_shift, const uint8* input1_data, const auto input2_val_s16 = vreinterpretq_s16_u16(vmovl_u8(input2_val_original)); const auto input1_val = - vaddq_s16(input1_val_s16, vdupq_n_s16(input1_offset)); + vaddq_s16(input1_val_s16, vdupq_n_s16(params.input1_offset)); const auto input2_val = - vaddq_s16(input2_val_s16, vdupq_n_s16(input2_offset)); + vaddq_s16(input2_val_s16, vdupq_n_s16(params.input2_offset)); const auto input1_val_high = vget_high_s16(input1_val); const auto input1_val_low = vget_low_s16(input1_val); const auto input2_val_high = vget_high_s16(input2_val); @@ -2468,32 +2342,32 @@ inline void AddElementwise(int size, int left_shift, const uint8* input1_data, auto x12 = vmovl_s16(input1_val_high); auto x21 = vmovl_s16(input2_val_low); auto x22 = vmovl_s16(input2_val_high); - const auto left_shift_dup = vdupq_n_s32(left_shift); + const auto left_shift_dup = vdupq_n_s32(params.left_shift); x11 = vshlq_s32(x11, left_shift_dup); x12 = vshlq_s32(x12, left_shift_dup); x21 = vshlq_s32(x21, left_shift_dup); x22 = vshlq_s32(x22, left_shift_dup); - x11 = vqrdmulhq_n_s32(x11, input1_multiplier); - x12 = vqrdmulhq_n_s32(x12, input1_multiplier); - x21 = vqrdmulhq_n_s32(x21, input2_multiplier); - x22 = vqrdmulhq_n_s32(x22, input2_multiplier); - const auto input1_shift_dup = vdupq_n_s32(-input1_shift); - const auto input2_shift_dup = vdupq_n_s32(-input2_shift); + x11 = vqrdmulhq_n_s32(x11, params.input1_multiplier); + x12 = vqrdmulhq_n_s32(x12, params.input1_multiplier); + x21 = vqrdmulhq_n_s32(x21, params.input2_multiplier); + x22 = vqrdmulhq_n_s32(x22, params.input2_multiplier); + const auto input1_shift_dup = vdupq_n_s32(params.input1_shift); + const auto input2_shift_dup = vdupq_n_s32(params.input2_shift); x11 = vshlq_s32(x11, input1_shift_dup); x12 = vshlq_s32(x12, input1_shift_dup); x21 = vshlq_s32(x21, input2_shift_dup); x22 = vshlq_s32(x22, input2_shift_dup); auto s1 = vaddq_s32(x11, x21); auto s2 = vaddq_s32(x12, x22); - s1 = vqrdmulhq_n_s32(s1, output_multiplier); - s2 = vqrdmulhq_n_s32(s2, output_multiplier); + s1 = vqrdmulhq_n_s32(s1, 
params.output_multiplier); + s2 = vqrdmulhq_n_s32(s2, params.output_multiplier); using gemmlowp::RoundingDivideByPOT; - s1 = RoundingDivideByPOT(s1, output_shift); - s2 = RoundingDivideByPOT(s2, output_shift); + s1 = RoundingDivideByPOT(s1, -params.output_shift); + s2 = RoundingDivideByPOT(s2, -params.output_shift); const auto s1_narrowed = vmovn_s32(s1); const auto s2_narrowed = vmovn_s32(s2); const auto s = vaddq_s16(vcombine_s16(s1_narrowed, s2_narrowed), - vdupq_n_s16(output_offset)); + vdupq_n_s16(params.output_offset)); const auto clamped = vmax_u8(output_activation_min_vector, vmin_u8(output_activation_max_vector, vqmovun_s16(s))); @@ -2502,108 +2376,74 @@ inline void AddElementwise(int size, int left_shift, const uint8* input1_data, #endif // NEON for (; i < size; ++i) { - const int32 input1_val = input1_offset + input1_data[i]; - const int32 input2_val = input2_offset + input2_data[i]; - const int32 shifted_input1_val = input1_val * (1 << left_shift); - const int32 shifted_input2_val = input2_val * (1 << left_shift); - const int32 scaled_input1_val = MultiplyByQuantizedMultiplierSmallerThanOne( - shifted_input1_val, input1_multiplier, input1_shift); - const int32 scaled_input2_val = MultiplyByQuantizedMultiplierSmallerThanOne( - shifted_input2_val, input2_multiplier, input2_shift); + const int32 input1_val = params.input1_offset + input1_data[i]; + const int32 input2_val = params.input2_offset + input2_data[i]; + const int32 shifted_input1_val = input1_val * (1 << params.left_shift); + const int32 shifted_input2_val = input2_val * (1 << params.left_shift); + const int32 scaled_input1_val = + MultiplyByQuantizedMultiplierSmallerThanOneExp( + shifted_input1_val, params.input1_multiplier, params.input1_shift); + const int32 scaled_input2_val = + MultiplyByQuantizedMultiplierSmallerThanOneExp( + shifted_input2_val, params.input2_multiplier, params.input2_shift); const int32 raw_sum = scaled_input1_val + scaled_input2_val; - const int32 raw_output = MultiplyByQuantizedMultiplierSmallerThanOne( - raw_sum, output_multiplier, output_shift) + - output_offset; - const int32 clamped_output = std::min( - output_activation_max, std::max(output_activation_min, raw_output)); + const int32 raw_output = + MultiplyByQuantizedMultiplierSmallerThanOneExp( + raw_sum, params.output_multiplier, params.output_shift) + + params.output_offset; + const int32 clamped_output = + std::min(params.quantized_activation_max, + std::max(params.quantized_activation_min, raw_output)); output_data[i] = static_cast<uint8>(clamped_output); } } -// legacy, for compatibility with old checked-in code -template <FusedActivationFunctionType Ac> -void Add(const float* input1_data, const Dims<4>& input1_dims, - const float* input2_data, const Dims<4>& input2_dims, - float* output_data, const Dims<4>& output_dims) { - float output_activation_min, output_activation_max; - GetActivationMinMax(Ac, &output_activation_min, &output_activation_max); - - Add(input1_data, input1_dims, input2_data, input2_dims, output_activation_min, - output_activation_max, output_data, output_dims); -} - -template <FusedActivationFunctionType Ac> -inline void Add(int left_shift, const uint8* input1_data, - const Dims<4>& input1_dims, int32 input1_offset, - int32 input1_multiplier, int input1_shift, - const uint8* input2_data, const Dims<4>& input2_dims, - int32 input2_offset, int32 input2_multiplier, int input2_shift, - int32 output_offset, int32 output_multiplier, int output_shift, - int32 output_activation_min, int32 output_activation_max, - uint8* 
output_data, const Dims<4>& output_dims) { - static_assert(Ac == FusedActivationFunctionType::kNone || - Ac == FusedActivationFunctionType::kRelu || - Ac == FusedActivationFunctionType::kRelu6 || - Ac == FusedActivationFunctionType::kRelu1, - ""); - TFLITE_DCHECK_LE(output_activation_min, output_activation_max); - if (Ac == FusedActivationFunctionType::kNone) { - TFLITE_DCHECK_EQ(output_activation_min, 0); - TFLITE_DCHECK_EQ(output_activation_max, 255); - } +inline void Add(const ArithmeticParams& params, + const RuntimeShape& input1_shape, const uint8* input1_data, + const RuntimeShape& input2_shape, const uint8* input2_data, + const RuntimeShape& output_shape, uint8* output_data) { + TFLITE_DCHECK_LE(params.quantized_activation_min, + params.quantized_activation_max); gemmlowp::ScopedProfilingLabel label("Add/8bit"); - const int flat_size = MatchingFlatSize(input1_dims, input2_dims, output_dims); - TFLITE_DCHECK(IsPackedWithoutStrides(input1_dims)); - TFLITE_DCHECK(IsPackedWithoutStrides(input2_dims)); - TFLITE_DCHECK(IsPackedWithoutStrides(output_dims)); - - TFLITE_DCHECK_GT(input1_offset, -256); - TFLITE_DCHECK_GT(input2_offset, -256); - TFLITE_DCHECK_LT(input1_offset, 256); - TFLITE_DCHECK_LT(input2_offset, 256); - AddElementwise(flat_size, left_shift, input1_data, input1_offset, - input1_multiplier, input1_shift, input2_data, input2_offset, - input2_multiplier, input2_shift, output_offset, - output_multiplier, output_shift, output_activation_min, - output_activation_max, output_data); + const int flat_size = + MatchingFlatSize(input1_shape, input2_shape, output_shape); + + TFLITE_DCHECK_GT(params.input1_offset, -256); + TFLITE_DCHECK_GT(params.input2_offset, -256); + TFLITE_DCHECK_LT(params.input1_offset, 256); + TFLITE_DCHECK_LT(params.input2_offset, 256); + AddElementwise(flat_size, params, input1_data, input2_data, output_data); } -template <FusedActivationFunctionType Ac> -inline void Add(const int16* input1_data, const Dims<4>& input1_dims, - int input1_shift, const int16* input2_data, - const Dims<4>& input2_dims, int input2_shift, - int16 output_activation_min, int16 output_activation_max, - int16* output_data, const Dims<4>& output_dims) { +inline void Add(const ArithmeticParams& params, + const RuntimeShape& input1_shape, const int16* input1_data, + const RuntimeShape& input2_shape, const int16* input2_data, + const RuntimeShape& output_shape, int16* output_data) { gemmlowp::ScopedProfilingLabel label("Add/Int16"); - // This is a copy of the reference implementation. We do not currently have a - // properly optimized version. 
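The int16 path above treats both operands as Q0.15 fixed point (gemmlowp's F0, covering [-1, 1)): the operand with a non-zero shift is scaled down by a rounding right shift first, then the pair is saturating-added. Two stand-alone sketches of those steps, offered as assumptions for illustration rather than the gemmlowp primitives actually used:

#include <algorithm>
#include <cstdint>

// Rounding right shift of a Q0.15 value (right_shift >= 0).
inline int16_t RoundingShiftRightQ15(int16_t x, int right_shift) {
  if (right_shift == 0) return x;
  const int32_t rounding = 1 << (right_shift - 1);
  return static_cast<int16_t>(
      (static_cast<int32_t>(x) + rounding) >> right_shift);
}

// Saturating add of two Q0.15 values.
inline int16_t SaturatingAddQ15(int16_t a, int16_t b) {
  const int32_t sum = static_cast<int32_t>(a) + static_cast<int32_t>(b);
  return static_cast<int16_t>(
      std::min<int32_t>(32767, std::max<int32_t>(-32768, sum)));
}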
- static_assert(Ac == FusedActivationFunctionType::kNone || - Ac == FusedActivationFunctionType::kRelu || - Ac == FusedActivationFunctionType::kRelu6 || - Ac == FusedActivationFunctionType::kRelu1, - ""); - TFLITE_DCHECK_LE(output_activation_min, output_activation_max); - if (Ac == FusedActivationFunctionType::kNone) { - TFLITE_DCHECK_EQ(output_activation_min, -32768); - TFLITE_DCHECK_EQ(output_activation_max, 32767); - } - - const int flat_size = MatchingFlatSize(output_dims, input1_dims, input2_dims); - - TFLITE_DCHECK(input1_shift == 0 || input2_shift == 0); - TFLITE_DCHECK_GE(input1_shift, 0); - TFLITE_DCHECK_GE(input2_shift, 0); + TFLITE_DCHECK_LE(params.quantized_activation_min, + params.quantized_activation_max); + + const int input1_shift = params.input1_shift; + const int flat_size = + MatchingFlatSize(output_shape, input1_shape, input2_shape); + const int16 output_activation_min = params.quantized_activation_min; + const int16 output_activation_max = params.quantized_activation_max; + + TFLITE_DCHECK(input1_shift == 0 || params.input2_shift == 0); + TFLITE_DCHECK_LE(input1_shift, 0); + TFLITE_DCHECK_LE(params.input2_shift, 0); const int16* not_shift_input = input1_shift == 0 ? input1_data : input2_data; const int16* shift_input = input1_shift == 0 ? input2_data : input1_data; - const int input_shift = input1_shift == 0 ? input2_shift : input1_shift; + const int input_right_shift = + input1_shift == 0 ? -params.input2_shift : -input1_shift; for (int i = 0; i < flat_size; i++) { // F0 uses 0 integer bits, range [-1, 1]. using F0 = gemmlowp::FixedPoint<std::int16_t, 0>; F0 input_ready_scaled = F0::FromRaw(not_shift_input[i]); - F0 scaled_input = - F0::FromRaw(gemmlowp::RoundingDivideByPOT(shift_input[i], input_shift)); + F0 scaled_input = F0::FromRaw( + gemmlowp::RoundingDivideByPOT(shift_input[i], input_right_shift)); F0 result = gemmlowp::SaturatingAdd(scaled_input, input_ready_scaled); const int16 raw_output = result.raw(); const int16 clamped_output = std::min( @@ -2612,157 +2452,59 @@ inline void Add(const int16* input1_data, const Dims<4>& input1_dims, } } -template <FusedActivationFunctionType Ac> -void Add(const int32* input1_data, const Dims<4>& input1_dims, - const int32* input2_data, const Dims<4>& input2_dims, - int32* output_data, const Dims<4>& output_dims) { +inline void Add(const ArithmeticParams& params, + const RuntimeShape& input1_shape, const int32* input1_data, + const RuntimeShape& input2_shape, const int32* input2_data, + const RuntimeShape& output_shape, int32* output_data) { gemmlowp::ScopedProfilingLabel label("Add/int32"); - TFLITE_DCHECK(Ac == FusedActivationFunctionType::kNone); - auto input1_map = MapAsVector(input1_data, input1_dims); - auto input2_map = MapAsVector(input2_data, input2_dims); - auto output_map = MapAsVector(output_data, output_dims); - if (AreSameDims(input1_dims, input2_dims)) { + auto input1_map = MapAsVector(input1_data, input1_shape); + auto input2_map = MapAsVector(input2_data, input2_shape); + auto output_map = MapAsVector(output_data, output_shape); + if (input1_shape == input2_shape) { output_map.array() = input1_map.array() + input2_map.array(); - } else if (FlatSize(input2_dims) == 1) { + } else if (input2_shape.FlatSize() == 1) { auto scalar = input2_data[0]; output_map.array() = input1_map.array() + scalar; - } else if (FlatSize(input1_dims) == 1) { + } else if (input1_shape.FlatSize() == 1) { auto scalar = input1_data[0]; output_map.array() = scalar + input2_map.array(); } else { // Should not come here. 
TFLITE_DCHECK(false); } + output_map = output_map.cwiseMax(params.quantized_activation_min); + output_map = output_map.cwiseMin(params.quantized_activation_max); } -// TODO(jiawen): We can implement BroadcastAdd on buffers of arbitrary -// dimensionality if the runtime code does a single loop over one dimension -// that handles broadcasting as the base case. The code generator would then -// generate max(D1, D2) nested for loops. -// TODO(benoitjacob): BroadcastAdd is intentionally duplicated from -// reference_ops.h. Once an optimized version is implemented and NdArrayDesc<T> -// is no longer referenced in this file, move NdArrayDesc<T> from types.h to -// reference_ops.h. -template <typename T> -void BroadcastAdd(const T* input1_data, const Dims<4>& input1_dims, - const T* input2_data, const Dims<4>& input2_dims, - T output_activation_min, T output_activation_max, - T* output_data, const Dims<4>& output_dims) { - gemmlowp::ScopedProfilingLabel label("BroadcastAdd"); - - NdArrayDesc<4> desc1; - NdArrayDesc<4> desc2; - NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2); - - // In Tensorflow, the dimensions are canonically named (batch_number, row, - // col, channel), with extents (batches, height, width, depth), with the - // trailing dimension changing most rapidly (channels has the smallest stride, - // typically 1 element). - // - // In generated C code, we store arrays with the dimensions reversed. The - // first dimension has smallest stride. - // - // We name our variables by their Tensorflow convention, but generate C code - // nesting loops such that the innermost loop has the smallest stride for the - // best cache behavior. - for (int b = 0; b < ArraySize(output_dims, 3); ++b) { - for (int y = 0; y < ArraySize(output_dims, 2); ++y) { - for (int x = 0; x < ArraySize(output_dims, 1); ++x) { - for (int c = 0; c < ArraySize(output_dims, 0); ++c) { - output_data[Offset(output_dims, c, x, y, b)] = - ActivationFunctionWithMinMax( - input1_data[SubscriptToIndex(desc1, c, x, y, b)] + - input2_data[SubscriptToIndex(desc2, c, x, y, b)], - output_activation_min, output_activation_max); - } - } - } - } -} - -// legacy, for compatibility with old checked-in code -template <FusedActivationFunctionType Ac, typename T> -void BroadcastAdd(const T* input1_data, const Dims<4>& input1_dims, - const T* input2_data, const Dims<4>& input2_dims, - T* output_data, const Dims<4>& output_dims) { - T output_activation_min, output_activation_max; - GetActivationMinMax(Ac, &output_activation_min, &output_activation_max); - - BroadcastAdd(input1_data, input1_dims, input2_data, input2_dims, - output_activation_min, output_activation_max, output_data, - output_dims); -} - -inline void BroadcastAdd(int left_shift, const uint8* input1_data, - const Dims<4>& input1_dims, int32 input1_offset, - int32 input1_multiplier, int input1_shift, - const uint8* input2_data, const Dims<4>& input2_dims, - int32 input2_offset, int32 input2_multiplier, - int input2_shift, int32 output_offset, - int32 output_multiplier, int output_shift, - int32 output_activation_min, - int32 output_activation_max, uint8* output_data, - const Dims<4>& output_dims) { - gemmlowp::ScopedProfilingLabel label("BroadcastAddGeneric/8bit"); - - NdArrayDesc<4> desc1; - NdArrayDesc<4> desc2; - NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2); - - // In Tensorflow, the dimensions are canonically named (batch_number, row, - // col, channel), with extents (batches, height, width, depth), with the - // 
trailing dimension changing most rapidly (channels has the smallest stride, - // typically 1 element). - // - // In generated C code, we store arrays with the dimensions reversed. The - // first dimension has smallest stride. - // - // We name our variables by their Tensorflow convention, but generate C code - // nesting loops such that the innermost loop has the smallest stride for the - // best cache behavior. - for (int b = 0; b < ArraySize(output_dims, 3); ++b) { - for (int y = 0; y < ArraySize(output_dims, 2); ++y) { - for (int x = 0; x < ArraySize(output_dims, 1); ++x) { - for (int c = 0; c < ArraySize(output_dims, 0); ++c) { - const int32 input1_val = - input1_offset + input1_data[SubscriptToIndex(desc1, c, x, y, b)]; - const int32 input2_val = - input2_offset + input2_data[SubscriptToIndex(desc2, c, x, y, b)]; - const int32 shifted_input1_val = input1_val * (1 << left_shift); - const int32 shifted_input2_val = input2_val * (1 << left_shift); - const int32 scaled_input1_val = - MultiplyByQuantizedMultiplierSmallerThanOne( - shifted_input1_val, input1_multiplier, input1_shift); - const int32 scaled_input2_val = - MultiplyByQuantizedMultiplierSmallerThanOne( - shifted_input2_val, input2_multiplier, input2_shift); - const int32 raw_sum = scaled_input1_val + scaled_input2_val; - const int32 raw_output = - MultiplyByQuantizedMultiplierSmallerThanOne( - raw_sum, output_multiplier, output_shift) + - output_offset; - const int32 clamped_output = - std::min(output_activation_max, - std::max(output_activation_min, raw_output)); - output_data[Offset(output_dims, c, x, y, b)] = - static_cast<uint8>(clamped_output); - } - } - } - } -} - -inline void BroadcastAddFivefold( - int y0, int y1, int y2, int y3, int y4, int left_shift, - const uint8* input1_data, const Dims<4>& input1_dims, int32 input1_offset, - int32 input1_multiplier, int input1_shift, const uint8* input2_data, - const Dims<4>& input2_dims, int32 input2_offset, int32 input2_multiplier, - int input2_shift, int32 output_offset, int32 output_multiplier, - int output_shift, int32 output_activation_min, int32 output_activation_max, - uint8* output_data, const Dims<4>& output_dims) { +inline void BroadcastAddFivefold(const ArithmeticParams& unswitched_params, + const RuntimeShape& unswitched_input1_shape, + const uint8* unswitched_input1_data, + const RuntimeShape& unswitched_input2_shape, + const uint8* unswitched_input2_data, + const RuntimeShape& output_shape, + uint8* output_data) { gemmlowp::ScopedProfilingLabel label("BroadcastAddFivefold/8bit"); + ArithmeticParams switched_params = unswitched_params; + switched_params.input1_offset = unswitched_params.input2_offset; + switched_params.input1_multiplier = unswitched_params.input2_multiplier; + switched_params.input1_shift = unswitched_params.input2_shift; + switched_params.input2_offset = unswitched_params.input1_offset; + switched_params.input2_multiplier = unswitched_params.input1_multiplier; + switched_params.input2_shift = unswitched_params.input1_shift; + + const bool use_unswitched = + unswitched_params.broadcast_category == + tflite::BroadcastableOpCategory::kFirstInputBroadcastsFast; + + const ArithmeticParams& params = + use_unswitched ? unswitched_params : switched_params; + const uint8* input1_data = + use_unswitched ? unswitched_input1_data : unswitched_input2_data; + const uint8* input2_data = + use_unswitched ? unswitched_input2_data : unswitched_input1_data; + // Fivefold nested loops. The second input resets its position for each // iteration of the second loop. 
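// [Editor's note] A standalone float sketch of the fivefold loop structure
// that BroadcastAddFivefold uses (the comment above and the uint8 loop nest
// that follows describe the same pattern). It assumes the operand shapes have
// already been collapsed into five run lengths y0..y4 by the kernel's
// parameter-setup step; under the pointer walk below, input1 supplies
// y0*y1*y2 runs of y4 values (each reused y3 times) and input2 supplies
// y0*y2*y3 runs of y4 values (each block of y2*y3 runs reused y1 times).
// AddRun stands in for the AddElementwise inner loop.
namespace sketch {

inline void AddRun(int size, const float* a, const float* b, float* out) {
  for (int i = 0; i < size; ++i) out[i] = a[i] + b[i];
}

inline void BroadcastAddFivefoldSketch(const int y[5], const float* input1,
                                       const float* input2, float* output) {
  const float* input1_ptr = input1;
  const float* input2_reset = input2;
  float* out_ptr = output;
  for (int i0 = 0; i0 < y[0]; ++i0) {
    const float* input2_ptr = input2_reset;
    for (int i1 = 0; i1 < y[1]; ++i1) {
      input2_ptr = input2_reset;       // the second input rewinds every i1.
      for (int i2 = 0; i2 < y[2]; ++i2) {
        for (int i3 = 0; i3 < y[3]; ++i3) {
          AddRun(y[4], input1_ptr, input2_ptr, out_ptr);
          input2_ptr += y[4];
          out_ptr += y[4];
        }
        input1_ptr += y[4];            // the first input advances once per i2.
      }
    }
    input2_reset = input2_ptr;         // the second input advances once per i0.
  }
}

}  // namespace sketch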
The first input resets its position at the // beginning of the fourth loop. The innermost loop is an elementwise add of @@ -2770,93 +2512,39 @@ inline void BroadcastAddFivefold( uint8* output_data_ptr = output_data; const uint8* input1_data_ptr = input1_data; const uint8* input2_data_reset = input2_data; - for (int i4 = 0; i4 < y4; ++i4) { + int y0 = params.broadcast_shape[0]; + int y1 = params.broadcast_shape[1]; + int y2 = params.broadcast_shape[2]; + int y3 = params.broadcast_shape[3]; + int y4 = params.broadcast_shape[4]; + for (int i0 = 0; i0 < y0; ++i0) { const uint8* input2_data_ptr; - for (int i3 = 0; i3 < y3; ++i3) { + for (int i1 = 0; i1 < y1; ++i1) { input2_data_ptr = input2_data_reset; for (int i2 = 0; i2 < y2; ++i2) { - for (int i1 = 0; i1 < y1; ++i1) { - AddElementwise( - y0, left_shift, input1_data_ptr, input1_offset, input1_multiplier, - input1_shift, input2_data_ptr, input2_offset, input2_multiplier, - input2_shift, output_offset, output_multiplier, output_shift, - output_activation_min, output_activation_max, output_data_ptr); - input2_data_ptr += y0; - output_data_ptr += y0; + for (int i3 = 0; i3 < y3; ++i3) { + AddElementwise(y4, params, input1_data_ptr, input2_data_ptr, + output_data_ptr); + input2_data_ptr += y4; + output_data_ptr += y4; } - input1_data_ptr += y0; + input1_data_ptr += y4; } } input2_data_reset = input2_data_ptr; } } -template <FusedActivationFunctionType Ac> -inline void BroadcastAdd(int left_shift, const uint8* input1_data, - const Dims<4>& input1_dims, int32 input1_offset, - int32 input1_multiplier, int input1_shift, - const uint8* input2_data, const Dims<4>& input2_dims, - int32 input2_offset, int32 input2_multiplier, - int input2_shift, int32 output_offset, - int32 output_multiplier, int output_shift, - int32 output_activation_min, - int32 output_activation_max, uint8* output_data, - const Dims<4>& output_dims) { - static_assert(Ac == FusedActivationFunctionType::kNone || - Ac == FusedActivationFunctionType::kRelu || - Ac == FusedActivationFunctionType::kRelu6 || - Ac == FusedActivationFunctionType::kRelu1, - ""); - TFLITE_DCHECK_LE(output_activation_min, output_activation_max); - if (Ac == FusedActivationFunctionType::kNone) { - TFLITE_DCHECK_EQ(output_activation_min, 0); - TFLITE_DCHECK_EQ(output_activation_max, 255); - } - BroadcastAdd(left_shift, input1_data, input1_dims, input1_offset, - input1_multiplier, input1_shift, input2_data, input2_dims, - input2_offset, input2_multiplier, input2_shift, output_offset, - output_multiplier, output_shift, output_activation_min, - output_activation_max, output_data, output_dims); -} - -template <FusedActivationFunctionType Ac> -inline void BroadcastAddFivefold( - int y0, int y1, int y2, int y3, int y4, int left_shift, - const uint8* input1_data, const Dims<4>& input1_dims, int32 input1_offset, - int32 input1_multiplier, int input1_shift, const uint8* input2_data, - const Dims<4>& input2_dims, int32 input2_offset, int32 input2_multiplier, - int input2_shift, int32 output_offset, int32 output_multiplier, - int output_shift, int32 output_activation_min, int32 output_activation_max, - uint8* output_data, const Dims<4>& output_dims) { - static_assert(Ac == FusedActivationFunctionType::kNone || - Ac == FusedActivationFunctionType::kRelu || - Ac == FusedActivationFunctionType::kRelu6 || - Ac == FusedActivationFunctionType::kRelu1, - ""); - TFLITE_DCHECK_LE(output_activation_min, output_activation_max); - if (Ac == FusedActivationFunctionType::kNone) { - TFLITE_DCHECK_EQ(output_activation_min, 0); - 
TFLITE_DCHECK_EQ(output_activation_max, 255); - } - BroadcastAddFivefold(y0, y1, y2, y3, y4, left_shift, input1_data, input1_dims, - input1_offset, input1_multiplier, input1_shift, - input2_data, input2_dims, input2_offset, - input2_multiplier, input2_shift, output_offset, - output_multiplier, output_shift, output_activation_min, - output_activation_max, output_data, output_dims); -} - -inline void Mul(const float* input1_data, const Dims<4>& input1_dims, - const float* input2_data, const Dims<4>& input2_dims, - float output_activation_min, float output_activation_max, - float* output_data, const Dims<4>& output_dims) { +inline void Mul(const ArithmeticParams& params, + const RuntimeShape& input1_shape, const float* input1_data, + const RuntimeShape& input2_shape, const float* input2_data, + const RuntimeShape& output_shape, float* output_data) { gemmlowp::ScopedProfilingLabel label("Mul"); - TFLITE_DCHECK(IsPackedWithoutStrides(input1_dims)); - TFLITE_DCHECK(IsPackedWithoutStrides(input2_dims)); - TFLITE_DCHECK(IsPackedWithoutStrides(output_dims)); + const float output_activation_min = params.float_activation_min; + const float output_activation_max = params.float_activation_max; int i = 0; - const int size = MatchingFlatSize(input1_dims, input2_dims, output_dims); + const int size = MatchingFlatSize(input1_shape, input2_shape, output_shape); #ifdef USE_NEON const auto activation_min = vdupq_n_f32(output_activation_min); const auto activation_max = vdupq_n_f32(output_activation_max); @@ -2907,34 +2595,41 @@ inline void Mul(const float* input1_data, const Dims<4>& input1_dims, } } -// legacy, for compatibility with old checked-in code -template <FusedActivationFunctionType Ac> -void Mul(const float* input1_data, const Dims<4>& input1_dims, - const float* input2_data, const Dims<4>& input2_dims, - float* output_data, const Dims<4>& output_dims) { - float output_activation_min, output_activation_max; - GetActivationMinMax(Ac, &output_activation_min, &output_activation_max); +inline void Mul(const ArithmeticParams& params, + const RuntimeShape& input1_shape, const int32* input1_data, + const RuntimeShape& input2_shape, const int32* input2_data, + const RuntimeShape& output_shape, int32* output_data) { + gemmlowp::ScopedProfilingLabel label("Mul/int32/activation"); - Mul(input1_data, input1_dims, input2_data, input2_dims, output_activation_min, - output_activation_max, output_data, output_dims); + const int flat_size = + MatchingFlatSize(input1_shape, input2_shape, output_shape); + const int32 output_activation_min = params.quantized_activation_min; + const int32 output_activation_max = params.quantized_activation_max; + for (int i = 0; i < flat_size; ++i) { + output_data[i] = ActivationFunctionWithMinMax( + input1_data[i] * input2_data[i], output_activation_min, + output_activation_max); + } } -template <FusedActivationFunctionType Ac> -void Mul(const int32* input1_data, const Dims<4>& input1_dims, - const int32* input2_data, const Dims<4>& input2_dims, - int32* output_data, const Dims<4>& output_dims) { +inline void MulNoActivation(const ArithmeticParams& params, + const RuntimeShape& input1_shape, + const int32* input1_data, + const RuntimeShape& input2_shape, + const int32* input2_data, + const RuntimeShape& output_shape, + int32* output_data) { gemmlowp::ScopedProfilingLabel label("Mul/int32"); - TFLITE_DCHECK(Ac == FusedActivationFunctionType::kNone); - auto input1_map = MapAsVector(input1_data, input1_dims); - auto input2_map = MapAsVector(input2_data, input2_dims); - auto output_map = 
MapAsVector(output_data, output_dims); - if (AreSameDims(input1_dims, input2_dims)) { + auto input1_map = MapAsVector(input1_data, input1_shape); + auto input2_map = MapAsVector(input2_data, input2_shape); + auto output_map = MapAsVector(output_data, output_shape); + if (input1_shape == input2_shape) { output_map.array() = input1_map.array() * input2_map.array(); - } else if (FlatSize(input2_dims) == 1) { + } else if (input2_shape.FlatSize() == 1) { auto scalar = input2_data[0]; output_map.array() = input1_map.array() * scalar; - } else if (FlatSize(input1_dims) == 1) { + } else if (input1_shape.FlatSize() == 1) { auto scalar = input1_data[0]; output_map.array() = scalar * input2_map.array(); } else { @@ -2943,14 +2638,16 @@ void Mul(const int32* input1_data, const Dims<4>& input1_dims, } } -inline void Mul(const int16* input1_data, const Dims<4>& input1_dims, - const int16* input2_data, const Dims<4>& input2_dims, - int16* output_data, const Dims<4>& output_dims) { - gemmlowp::ScopedProfilingLabel label("Mul/Int16"); +inline void Mul(const ArithmeticParams& params, + const RuntimeShape& input1_shape, const int16* input1_data, + const RuntimeShape& input2_shape, const int16* input2_data, + const RuntimeShape& output_shape, int16* output_data) { + gemmlowp::ScopedProfilingLabel label("Mul/Int16/NoActivation"); // This is a copy of the reference implementation. We do not currently have a // properly optimized version. - const int flat_size = MatchingFlatSize(output_dims, input1_dims, input2_dims); + const int flat_size = + MatchingFlatSize(input1_shape, input2_shape, output_shape); for (int i = 0; i < flat_size; i++) { // F0 uses 0 integer bits, range [-1, 1]. @@ -2962,17 +2659,20 @@ inline void Mul(const int16* input1_data, const Dims<4>& input1_dims, } } -inline void Mul(const int16* input1_data, const Dims<4>& input1_dims, - const int16* input2_data, const Dims<4>& input2_dims, - int32 output_offset, int32 output_activation_min, - int32 output_activation_max, uint8* output_data, - const Dims<4>& output_dims) { +inline void Mul(const ArithmeticParams& params, + const RuntimeShape& input1_shape, const int16* input1_data, + const RuntimeShape& input2_shape, const int16* input2_data, + const RuntimeShape& output_shape, uint8* output_data) { gemmlowp::ScopedProfilingLabel label("Mul/Int16Uint8"); // This is a copy of the reference implementation. We do not currently have a // properly optimized version. + const int32 output_activation_min = params.quantized_activation_min; + const int32 output_activation_max = params.quantized_activation_max; + const int32 output_offset = params.output_offset; TFLITE_DCHECK_LE(output_activation_min, output_activation_max); - const int flat_size = MatchingFlatSize(output_dims, input1_dims, input2_dims); + const int flat_size = + MatchingFlatSize(input1_shape, input2_shape, output_shape); for (int i = 0; i < flat_size; i++) { // F0 uses 0 integer bits, range [-1, 1]. @@ -2990,216 +2690,256 @@ inline void Mul(const int16* input1_data, const Dims<4>& input1_dims, } } -// TODO(jiawen): We can implement BroadcastMul on buffers of arbitrary -// dimensionality if the runtime code does a single loop over one dimension -// that handles broadcasting as the base case. The code generator would then -// generate max(D1, D2) nested for loops. -// TODO(benoitjacob): BroadcastMul is intentionally duplicated from -// reference_ops.h. 
Once an optimized version is implemented and NdArrayDesc<T> -// is no longer referenced in this file, move NdArrayDesc<T> from types.h to -// reference_ops.h. -template <typename T> -void BroadcastMul(const T* input1_data, const Dims<4>& input1_dims, - const T* input2_data, const Dims<4>& input2_dims, - T output_activation_min, T output_activation_max, - T* output_data, const Dims<4>& output_dims) { - gemmlowp::ScopedProfilingLabel label("BroadcastMul"); +// Element-wise mul that can often be used for inner loop of broadcast Mul as +// well as the non-broadcast Mul. +inline void MulElementwise(int size, const ArithmeticParams& params, + const uint8* input1_data, const uint8* input2_data, + uint8* output_data) { + int i = 0; + TFLITE_DCHECK_GT(params.input1_offset, -256); + TFLITE_DCHECK_LT(params.input1_offset, 256); + TFLITE_DCHECK_GT(params.input2_offset, -256); + TFLITE_DCHECK_LT(params.input2_offset, 256); + TFLITE_DCHECK_GT(params.output_offset, -256); + TFLITE_DCHECK_LT(params.output_offset, 256); +#ifdef USE_NEON + const auto input1_offset_vector = vdupq_n_s16(params.input1_offset); + const auto input2_offset_vector = vdupq_n_s16(params.input2_offset); + const auto output_offset_vector = vdupq_n_s16(params.output_offset); + const auto output_activation_min_vector = + vdup_n_u8(params.quantized_activation_min); + const auto output_activation_max_vector = + vdup_n_u8(params.quantized_activation_max); + for (; i <= size - 8; i += 8) { + // We load / store 8 at a time, multiplying as two sets of 4 int32s. + const auto input1_val_original = vld1_u8(input1_data + i); + const auto input2_val_original = vld1_u8(input2_data + i); + const auto input1_val_s16 = + vreinterpretq_s16_u16(vmovl_u8(input1_val_original)); + const auto input2_val_s16 = + vreinterpretq_s16_u16(vmovl_u8(input2_val_original)); + const auto input1_val = vaddq_s16(input1_val_s16, input1_offset_vector); + const auto input2_val = vaddq_s16(input2_val_s16, input2_offset_vector); - NdArrayDesc<4> desc1; - NdArrayDesc<4> desc2; - NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2); + const auto input1_val_low = vget_low_s16(input1_val); + const auto input1_val_high = vget_high_s16(input1_val); + const auto input2_val_low = vget_low_s16(input2_val); + const auto input2_val_high = vget_high_s16(input2_val); - // In Tensorflow, the dimensions are canonically named (batch_number, row, - // col, channel), with extents (batches, height, width, depth), with the - // trailing dimension changing most rapidly (channels has the smallest stride, - // typically 1 element). - // - // In generated C code, we store arrays with the dimensions reversed. The - // first dimension has smallest stride. - // - // We name our variables by their Tensorflow convention, but generate C code - // nesting loops such that the innermost loop has the smallest stride for the - // best cache behavior. 
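// [Editor's note] The params-based MulElementwise above consumes a fixed-point
// (output_multiplier, output_shift) pair rather than float scales. This is a
// hedged sketch of how such a pair is typically derived from the tensor scales
// at prepare time; the function names and exact rounding are illustrative, not
// the TFLite helpers.
#include <cmath>
#include <cstdint>

namespace sketch {

// Splits real_multiplier (assumed 0 < real_multiplier < 1 for quantized Mul)
// into a Q0.31 significand and a non-positive power-of-two exponent so that
//   real_multiplier ~= quantized_multiplier * 2^shift / 2^31.
inline void QuantizeMultiplierSmallerThanOne(double real_multiplier,
                                             std::int32_t* quantized_multiplier,
                                             int* shift) {
  if (real_multiplier == 0.0) {
    *quantized_multiplier = 0;
    *shift = 0;
    return;
  }
  const double significand = std::frexp(real_multiplier, shift);  // [0.5, 1)
  std::int64_t q =
      static_cast<std::int64_t>(std::round(significand * (1ll << 31)));
  if (q == (1ll << 31)) {  // rounding can push the significand up to 1.0
    q /= 2;
    ++*shift;
  }
  *quantized_multiplier = static_cast<std::int32_t>(q);
}

// For quantized Mul, the real multiplier applied to the int32 product of the
// offset-corrected inputs is input1_scale * input2_scale / output_scale.
inline void MulOutputMultiplier(double input1_scale, double input2_scale,
                                double output_scale,
                                std::int32_t* quantized_multiplier,
                                int* shift) {
  QuantizeMultiplierSmallerThanOne(input1_scale * input2_scale / output_scale,
                                   quantized_multiplier, shift);
}

}  // namespace sketch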
- for (int b = 0; b < ArraySize(output_dims, 3); ++b) { - for (int y = 0; y < ArraySize(output_dims, 2); ++y) { - for (int x = 0; x < ArraySize(output_dims, 1); ++x) { - for (int c = 0; c < ArraySize(output_dims, 0); ++c) { - output_data[Offset(output_dims, c, x, y, b)] = - ActivationFunctionWithMinMax( - input1_data[SubscriptToIndex(desc1, c, x, y, b)] * - input2_data[SubscriptToIndex(desc2, c, x, y, b)], - output_activation_min, output_activation_max); - } - } - } + auto p1 = vmull_s16(input2_val_low, input1_val_low); + auto p2 = vmull_s16(input2_val_high, input1_val_high); + + p1 = vqrdmulhq_n_s32(p1, params.output_multiplier); + p2 = vqrdmulhq_n_s32(p2, params.output_multiplier); + using gemmlowp::RoundingDivideByPOT; + p1 = RoundingDivideByPOT(p1, -params.output_shift); + p2 = RoundingDivideByPOT(p2, -params.output_shift); + + const auto p1_narrowed = vmovn_s32(p1); + const auto p2_narrowed = vmovn_s32(p2); + const auto p = + vaddq_s16(vcombine_s16(p1_narrowed, p2_narrowed), output_offset_vector); + const auto clamped = + vmax_u8(output_activation_min_vector, + vmin_u8(output_activation_max_vector, vqmovun_s16(p))); + vst1_u8(output_data + i, clamped); } -} +#endif // NEON -// legacy, for compatibility with old checked-in code -template <FusedActivationFunctionType Ac, typename T> -void BroadcastMul(const T* input1_data, const Dims<4>& input1_dims, - const T* input2_data, const Dims<4>& input2_dims, - T* output_data, const Dims<4>& output_dims) { - T output_activation_min, output_activation_max; - GetActivationMinMax(Ac, &output_activation_min, &output_activation_max); - - BroadcastMul(input1_data, input1_dims, input2_data, input2_dims, - output_activation_min, output_activation_max, output_data, - output_dims); + for (; i < size; ++i) { + const int32 input1_val = params.input1_offset + input1_data[i]; + const int32 input2_val = params.input2_offset + input2_data[i]; + const int32 unclamped_result = + params.output_offset + + MultiplyByQuantizedMultiplierSmallerThanOneExp(input1_val * input2_val, + params.output_multiplier, + params.output_shift); + const int32 clamped_output = + std::min(params.quantized_activation_max, + std::max(params.quantized_activation_min, unclamped_result)); + output_data[i] = static_cast<uint8>(clamped_output); + } } -inline void BroadcastMul(const uint8* input1_data, const Dims<4>& input1_dims, - int32 input1_offset, const uint8* input2_data, - const Dims<4>& input2_dims, int32 input2_offset, - int32 output_offset, int32 output_multiplier, - int output_shift, int32 output_activation_min, - int32 output_activation_max, uint8* output_data, - const Dims<4>& output_dims) { - gemmlowp::ScopedProfilingLabel label("BroadcastMul/8bit"); +// Broadcast mul that can often be used for inner loop of broadcast Mul. 
+inline void MulSimpleBroadcast(int size, const ArithmeticParams& params, + const uint8 broadcast_value, + const uint8* input2_data, uint8* output_data) { + const int16 input1_val = params.input1_offset + broadcast_value; - NdArrayDesc<4> desc1; - NdArrayDesc<4> desc2; - NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2); + int i = 0; + TFLITE_DCHECK_GT(params.input1_offset, -256); + TFLITE_DCHECK_LT(params.input1_offset, 256); + TFLITE_DCHECK_GT(params.input2_offset, -256); + TFLITE_DCHECK_LT(params.input2_offset, 256); + TFLITE_DCHECK_GT(params.output_offset, -256); + TFLITE_DCHECK_LT(params.output_offset, 256); +#ifdef USE_NEON + const auto input2_offset_vector = vdupq_n_s16(params.input2_offset); + const auto output_offset_vector = vdupq_n_s16(params.output_offset); + const auto output_activation_min_vector = + vdup_n_u8(params.quantized_activation_min); + const auto output_activation_max_vector = + vdup_n_u8(params.quantized_activation_max); + for (; i <= size - 8; i += 8) { + // We load / store 8 at a time, multiplying as two sets of 4 int32s. + const auto input2_val_original = vld1_u8(input2_data + i); + const auto input2_val_s16 = + vreinterpretq_s16_u16(vmovl_u8(input2_val_original)); + const auto input2_val = vaddq_s16(input2_val_s16, input2_offset_vector); - // In Tensorflow, the dimensions are canonically named (batch_number, row, - // col, channel), with extents (batches, height, width, depth), with the - // trailing dimension changing most rapidly (channels has the smallest stride, - // typically 1 element). - // - // In generated C code, we store arrays with the dimensions reversed. The - // first dimension has smallest stride. - // - // We name our variables by their Tensorflow convention, but generate C code - // nesting loops such that the innermost loop has the smallest stride for the - // best cache behavior. 
- for (int b = 0; b < ArraySize(output_dims, 3); ++b) { - for (int y = 0; y < ArraySize(output_dims, 2); ++y) { - for (int x = 0; x < ArraySize(output_dims, 1); ++x) { - for (int c = 0; c < ArraySize(output_dims, 0); ++c) { - const int32 input1_val = - input1_offset + input1_data[SubscriptToIndex(desc1, c, x, y, b)]; - const int32 input2_val = - input2_offset + input2_data[SubscriptToIndex(desc2, c, x, y, b)]; - const int32 unclamped_result = - output_offset + - MultiplyByQuantizedMultiplierSmallerThanOne( - input1_val * input2_val, output_multiplier, output_shift); - const int32 clamped_output = - std::min(output_activation_max, - std::max(output_activation_min, unclamped_result)); - output_data[Offset(output_dims, c, x, y, b)] = - static_cast<uint8>(clamped_output); - } - } - } + const auto input2_val_low = vget_low_s16(input2_val); + const auto input2_val_high = vget_high_s16(input2_val); + + auto p1 = vmull_n_s16(input2_val_low, input1_val); + auto p2 = vmull_n_s16(input2_val_high, input1_val); + + p1 = vqrdmulhq_n_s32(p1, params.output_multiplier); + p2 = vqrdmulhq_n_s32(p2, params.output_multiplier); + using gemmlowp::RoundingDivideByPOT; + p1 = RoundingDivideByPOT(p1, -params.output_shift); + p2 = RoundingDivideByPOT(p2, -params.output_shift); + + const auto p1_narrowed = vmovn_s32(p1); + const auto p2_narrowed = vmovn_s32(p2); + const auto p = + vaddq_s16(vcombine_s16(p1_narrowed, p2_narrowed), output_offset_vector); + const auto clamped = + vmax_u8(output_activation_min_vector, + vmin_u8(output_activation_max_vector, vqmovun_s16(p))); + vst1_u8(output_data + i, clamped); } -} +#endif // NEON -// legacy, for compatibility with old checked-in code -template <FusedActivationFunctionType Ac> -inline void BroadcastMul(const uint8* input1_data, const Dims<4>& input1_dims, - int32 input1_offset, const uint8* input2_data, - const Dims<4>& input2_dims, int32 input2_offset, - int32 output_offset, int32 output_multiplier, - int output_shift, int32 output_activation_min, - int32 output_activation_max, uint8* output_data, - const Dims<4>& output_dims) { - BroadcastMul(input1_data, input1_dims, input1_offset, input2_data, - input2_dims, input2_offset, output_offset, output_multiplier, - output_shift, output_activation_min, output_activation_max, - output_data, output_dims); + for (; i < size; ++i) { + const int32 input2_val = params.input2_offset + input2_data[i]; + const int32 unclamped_result = + params.output_offset + + MultiplyByQuantizedMultiplierSmallerThanOneExp(input1_val * input2_val, + params.output_multiplier, + params.output_shift); + const int32 clamped_output = + std::min(params.quantized_activation_max, + std::max(params.quantized_activation_min, unclamped_result)); + output_data[i] = static_cast<uint8>(clamped_output); + } } -// TODO(aselle): This is not actually optimized yet. 
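// [Editor's note] For reference, this is the real-valued computation the
// quantized Mul paths above approximate, under standard affine quantization
// (real_value = scale * (quantized_value - zero_point)). It is a semantic
// model useful for checking the fixed-point code, not an implementation used
// by the kernels; all names are illustrative.
#include <algorithm>
#include <cmath>
#include <cstdint>

namespace sketch {

struct QuantizationParams {
  float scale;
  std::int32_t zero_point;
};

inline float Dequantize(std::uint8_t q, const QuantizationParams& qp) {
  return qp.scale * (static_cast<std::int32_t>(q) - qp.zero_point);
}

inline std::uint8_t Quantize(float real, const QuantizationParams& qp) {
  const std::int32_t q =
      static_cast<std::int32_t>(std::round(real / qp.scale)) + qp.zero_point;
  return static_cast<std::uint8_t>(std::min(255, std::max(0, q)));
}

// The "slow way": dequantize both operands, multiply in float, requantize
// into the output's scale and zero point.
inline std::uint8_t MulReference(std::uint8_t a, const QuantizationParams& a_qp,
                                 std::uint8_t b, const QuantizationParams& b_qp,
                                 const QuantizationParams& out_qp) {
  return Quantize(Dequantize(a, a_qp) * Dequantize(b, b_qp), out_qp);
}

}  // namespace sketch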
-inline void Div(const float* input1_data, const Dims<4>& input1_dims, - const float* input2_data, const Dims<4>& input2_dims, - float output_activation_min, float output_activation_max, - float* output_data, const Dims<4>& output_dims) { - const int flat_size = MatchingFlatSize(output_dims, input1_dims, input2_dims); - for (int i = 0; i < flat_size; i++) { - output_data[i] = ActivationFunctionWithMinMax( - input1_data[i] / input2_data[i], output_activation_min, - output_activation_max); - } +inline void Mul(const ArithmeticParams& params, + const RuntimeShape& input1_shape, const uint8* input1_data, + const RuntimeShape& input2_shape, const uint8* input2_data, + const RuntimeShape& output_shape, uint8* output_data) { + TFLITE_DCHECK_LE(params.quantized_activation_min, + params.quantized_activation_max); + gemmlowp::ScopedProfilingLabel label("Mul/8bit"); + const int flat_size = + MatchingFlatSize(input1_shape, input2_shape, output_shape); + + MulElementwise(flat_size, params, input1_data, input2_data, output_data); } -// TODO(jiawen): We can implement BroadcastDiv on buffers of arbitrary -// dimensionality if the runtime code does a single loop over one dimension -// that handles broadcasting as the base case. The code generator would then -// generate max(D1, D2) nested for loops. -// TODO(benoitjacob): BroadcastDiv is intentionally duplicated from -// reference_ops.h. Once an optimized version is implemented and NdArrayDesc<T> -// is no longer referenced in this file, move NdArrayDesc<T> from types.h to -// reference_ops.h. -template <typename T> -void BroadcastDiv(const T* input1_data, const Dims<4>& input1_dims, - const T* input2_data, const Dims<4>& input2_dims, - T output_activation_min, T output_activation_max, - T* output_data, const Dims<4>& output_dims) { - gemmlowp::ScopedProfilingLabel label("BroadcastDiv"); +inline void BroadcastMulFivefold(const ArithmeticParams& unswitched_params, + const RuntimeShape& unswitched_input1_shape, + const uint8* unswitched_input1_data, + const RuntimeShape& unswitched_input2_shape, + const uint8* unswitched_input2_data, + const RuntimeShape& output_shape, + uint8* output_data) { + gemmlowp::ScopedProfilingLabel label("BroadcastMulFivefold/8bit"); - NdArrayDesc<4> desc1; - NdArrayDesc<4> desc2; - NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2); + ArithmeticParams switched_params = unswitched_params; + switched_params.input1_offset = unswitched_params.input2_offset; + switched_params.input2_offset = unswitched_params.input1_offset; - // In Tensorflow, the dimensions are canonically named (batch_number, row, - // col, channel), with extents (batches, height, width, depth), with the - // trailing dimension changing most rapidly (channels has the smallest stride, - // typically 1 element). - // - // In generated C code, we store arrays with the dimensions reversed. The - // first dimension has smallest stride. - // - // We name our variables by their Tensorflow convention, but generate C code - // nesting loops such that the innermost loop has the smallest stride for the - // best cache behavior. 
- for (int b = 0; b < ArraySize(output_dims, 3); ++b) { - for (int y = 0; y < ArraySize(output_dims, 2); ++y) { - for (int x = 0; x < ArraySize(output_dims, 1); ++x) { - for (int c = 0; c < ArraySize(output_dims, 0); ++c) { - output_data[Offset(output_dims, c, x, y, b)] = - ActivationFunctionWithMinMax( - input1_data[SubscriptToIndex(desc1, c, x, y, b)] / - input2_data[SubscriptToIndex(desc2, c, x, y, b)], - output_activation_min, output_activation_max); + const bool use_unswitched = + unswitched_params.broadcast_category == + tflite::BroadcastableOpCategory::kFirstInputBroadcastsFast; + + const ArithmeticParams& params = + use_unswitched ? unswitched_params : switched_params; + const uint8* input1_data = + use_unswitched ? unswitched_input1_data : unswitched_input2_data; + const uint8* input2_data = + use_unswitched ? unswitched_input2_data : unswitched_input1_data; + + // Fivefold nested loops. The second input resets its position for each + // iteration of the second loop. The first input resets its position at the + // beginning of the fourth loop. The innermost loop is an elementwise Mul of + // sections of the arrays. + uint8* output_data_ptr = output_data; + const uint8* input1_data_ptr = input1_data; + const uint8* input2_data_reset = input2_data; + int y0 = params.broadcast_shape[0]; + int y1 = params.broadcast_shape[1]; + int y2 = params.broadcast_shape[2]; + int y3 = params.broadcast_shape[3]; + int y4 = params.broadcast_shape[4]; + if (y4 > 1) { + for (int i0 = 0; i0 < y0; ++i0) { + const uint8* input2_data_ptr; + for (int i1 = 0; i1 < y1; ++i1) { + input2_data_ptr = input2_data_reset; + for (int i2 = 0; i2 < y2; ++i2) { + for (int i3 = 0; i3 < y3; ++i3) { + MulElementwise(y4, params, input1_data_ptr, input2_data_ptr, + output_data_ptr); + input2_data_ptr += y4; + output_data_ptr += y4; + } + input1_data_ptr += y4; } } + input2_data_reset = input2_data_ptr; + } + } else { + for (int i0 = 0; i0 < y0; ++i0) { + const uint8* input2_data_ptr; + for (int i1 = 0; i1 < y1; ++i1) { + input2_data_ptr = input2_data_reset; + for (int i2 = 0; i2 < y2; ++i2) { + MulSimpleBroadcast(y3, params, *input1_data_ptr, input2_data_ptr, + output_data_ptr); + input2_data_ptr += y3; + output_data_ptr += y3; + ++input1_data_ptr; + } + } + input2_data_reset = input2_data_ptr; } } } -// TODO(aselle): This is not actually optimized yet. -inline void Sub(const float* input1_data, const Dims<4>& input1_dims, - const float* input2_data, const Dims<4>& input2_dims, - float output_activation_min, float output_activation_max, - float* output_data, const Dims<4>& output_dims) { - gemmlowp::ScopedProfilingLabel label("Sub"); - const int flat_size = MatchingFlatSize(input1_dims, input2_dims, output_dims); - for (int i = 0; i < flat_size; ++i) { - output_data[i] = ActivationFunctionWithMinMax( - input1_data[i] - input2_data[i], output_activation_min, - output_activation_max); - } -} - -// TODO(jiawen): We can implement BroadcastSub on buffers of arbitrary +// TODO(jiawen): We can implement BroadcastDiv on buffers of arbitrary // dimensionality if the runtime code does a single loop over one dimension // that handles broadcasting as the base case. The code generator would then // generate max(D1, D2) nested for loops. -// TODO(benoitjacob): BroadcastSub is intentionally duplicated from +// TODO(benoitjacob): BroadcastDiv is intentionally duplicated from // reference_ops.h. 
Once an optimized version is implemented and NdArrayDesc<T> // is no longer referenced in this file, move NdArrayDesc<T> from types.h to // reference_ops.h. template <typename T> -void BroadcastSub(const T* input1_data, const Dims<4>& input1_dims, - const T* input2_data, const Dims<4>& input2_dims, - T output_activation_min, T output_activation_max, - T* output_data, const Dims<4>& output_dims) { - gemmlowp::ScopedProfilingLabel label("BroadcastSub"); +void BroadcastDiv4DSlow(const ArithmeticParams& params, + const RuntimeShape& unextended_input1_shape, + const T* input1_data, + const RuntimeShape& unextended_input2_shape, + const T* input2_data, + const RuntimeShape& unextended_output_shape, + T* output_data) { + gemmlowp::ScopedProfilingLabel label("BroadcastDiv4DSlow"); + T output_activation_min; + T output_activation_max; + GetActivationParams(params, &output_activation_min, &output_activation_max); + + TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 4); + TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), 4); + TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4); + const RuntimeShape output_shape = + RuntimeShape::ExtendedShape(4, unextended_output_shape); NdArrayDesc<4> desc1; NdArrayDesc<4> desc2; - NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2); + NdArrayDescsForElementwiseBroadcast(unextended_input1_shape, + unextended_input2_shape, &desc1, &desc2); // In Tensorflow, the dimensions are canonically named (batch_number, row, // col, channel), with extents (batches, height, width, depth), with the @@ -3212,14 +2952,14 @@ void BroadcastSub(const T* input1_data, const Dims<4>& input1_dims, // We name our variables by their Tensorflow convention, but generate C code // nesting loops such that the innermost loop has the smallest stride for the // best cache behavior. 
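// [Editor's note] A compact sketch of the indexing trick behind the
// NdArrayDesc / SubscriptToIndex pair that BroadcastDiv4DSlow sets up above
// and applies in the loop nest that follows: after both operands are extended
// to four dimensions, any dimension of extent 1 is given stride 0, so walking
// the output coordinates re-reads the same operand element along broadcast
// axes. The type and helper names here are illustrative, not the TFLite ones.
#include <cassert>

namespace sketch {

struct BroadcastDesc4D {
  int extent[4];
  int stride[4];  // 0 on broadcast (extent-1) dimensions.
};

// dims are listed outermost first, e.g. {batch, height, width, channel}.
inline BroadcastDesc4D MakeBroadcastDesc(const int input_dims[4],
                                         const int output_dims[4]) {
  BroadcastDesc4D desc;
  int stride = 1;
  for (int i = 3; i >= 0; --i) {
    desc.extent[i] = input_dims[i];
    desc.stride[i] = stride;
    stride *= input_dims[i];
  }
  for (int i = 0; i < 4; ++i) {
    assert(input_dims[i] == output_dims[i] || input_dims[i] == 1);
    if (input_dims[i] == 1) desc.stride[i] = 0;  // broadcast: stay in place.
  }
  return desc;
}

inline int SubscriptToIndexSketch(const BroadcastDesc4D& d, int i0, int i1,
                                  int i2, int i3) {
  return i0 * d.stride[0] + i1 * d.stride[1] + i2 * d.stride[2] +
         i3 * d.stride[3];
}

// out[b,y,x,c] = in1[...] / in2[...] with either operand broadcast as needed.
inline void BroadcastDivSketch(const int out_dims[4], const float* in1,
                               const BroadcastDesc4D& d1, const float* in2,
                               const BroadcastDesc4D& d2, float* out) {
  int o = 0;
  for (int b = 0; b < out_dims[0]; ++b)
    for (int y = 0; y < out_dims[1]; ++y)
      for (int x = 0; x < out_dims[2]; ++x)
        for (int c = 0; c < out_dims[3]; ++c)
          out[o++] = in1[SubscriptToIndexSketch(d1, b, y, x, c)] /
                     in2[SubscriptToIndexSketch(d2, b, y, x, c)];
}

}  // namespace sketch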
- for (int b = 0; b < ArraySize(output_dims, 3); ++b) { - for (int y = 0; y < ArraySize(output_dims, 2); ++y) { - for (int x = 0; x < ArraySize(output_dims, 1); ++x) { - for (int c = 0; c < ArraySize(output_dims, 0); ++c) { - output_data[Offset(output_dims, c, x, y, b)] = + for (int b = 0; b < output_shape.Dims(0); ++b) { + for (int y = 0; y < output_shape.Dims(1); ++y) { + for (int x = 0; x < output_shape.Dims(2); ++x) { + for (int c = 0; c < output_shape.Dims(3); ++c) { + output_data[Offset(output_shape, b, y, x, c)] = ActivationFunctionWithMinMax( - input1_data[SubscriptToIndex(desc1, c, x, y, b)] - - input2_data[SubscriptToIndex(desc2, c, x, y, b)], + input1_data[SubscriptToIndex(desc1, b, y, x, c)] / + input2_data[SubscriptToIndex(desc2, b, y, x, c)], output_activation_min, output_activation_max); } } @@ -3227,220 +2967,172 @@ void BroadcastSub(const T* input1_data, const Dims<4>& input1_dims, } } -inline void BroadcastSub(int left_shift, const uint8* input1_data, - const Dims<4>& input1_dims, int32 input1_offset, - int32 input1_multiplier, int input1_shift, - const uint8* input2_data, const Dims<4>& input2_dims, - int32 input2_offset, int32 input2_multiplier, - int input2_shift, int32 output_offset, - int32 output_multiplier, int output_shift, - int32 output_activation_min, - int32 output_activation_max, uint8* output_data, - const Dims<4>& output_dims) { - gemmlowp::ScopedProfilingLabel label("BroadcastSub/8bit"); - - NdArrayDesc<4> desc1; - NdArrayDesc<4> desc2; - NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2); - - // In Tensorflow, the dimensions are canonically named (batch_number, row, - // col, channel), with extents (batches, height, width, depth), with the - // trailing dimension changing most rapidly (channels has the smallest stride, - // typically 1 element). - // - // In generated C code, we store arrays with the dimensions reversed. The - // first dimension has smallest stride. - // - // We name our variables by their Tensorflow convention, but generate C code - // nesting loops such that the innermost loop has the smallest stride for the - // best cache behavior. - for (int b = 0; b < ArraySize(output_dims, 3); ++b) { - for (int y = 0; y < ArraySize(output_dims, 2); ++y) { - for (int x = 0; x < ArraySize(output_dims, 1); ++x) { - for (int c = 0; c < ArraySize(output_dims, 0); ++c) { - const int32 input1_val = - input1_offset + input1_data[SubscriptToIndex(desc1, c, x, y, b)]; - const int32 input2_val = - input2_offset + input2_data[SubscriptToIndex(desc2, c, x, y, b)]; - const int32 shifted_input1_val = input1_val * (1 << left_shift); - const int32 shifted_input2_val = input2_val * (1 << left_shift); - const int32 scaled_input1_val = - MultiplyByQuantizedMultiplierSmallerThanOne( - shifted_input1_val, input1_multiplier, input1_shift); - const int32 scaled_input2_val = - MultiplyByQuantizedMultiplierSmallerThanOne( - shifted_input2_val, input2_multiplier, input2_shift); - const int32 raw_sub = scaled_input1_val - scaled_input2_val; - const int32 raw_output = - MultiplyByQuantizedMultiplierSmallerThanOne( - raw_sub, output_multiplier, output_shift) + - output_offset; - const int32 clamped_output = - std::min(output_activation_max, - std::max(output_activation_min, raw_output)); - output_data[Offset(output_dims, c, x, y, b)] = - static_cast<uint8>(clamped_output); - } - } - } +// TODO(aselle): This is not actually optimized yet. 
+inline void SubNonBroadcast(const ArithmeticParams& params, + const RuntimeShape& input1_shape, + const float* input1_data, + const RuntimeShape& input2_shape, + const float* input2_data, + const RuntimeShape& output_shape, + float* output_data) { + gemmlowp::ScopedProfilingLabel label("SubNonBroadcast"); + const int flat_size = + MatchingFlatSize(input1_shape, input2_shape, output_shape); + for (int i = 0; i < flat_size; ++i) { + output_data[i] = ActivationFunctionWithMinMax( + input1_data[i] - input2_data[i], params.float_activation_min, + params.float_activation_max); } } -template <FusedActivationFunctionType Ac, typename Scalar> -void Concatenation(int concat_dim, const Scalar* const* input_data, - const Dims<4>* const* input_dims, int inputs_count, - Scalar* output_data, const Dims<4>& output_dims) { - gemmlowp::ScopedProfilingLabel label("Concatenation"); - int concat_size = 0; - for (int i = 0; i < inputs_count; i++) { - for (int j = 0; j < 4; j++) { - if (j != concat_dim) { - MatchingArraySize(*input_dims[i], j, output_dims, j); - } - } - concat_size += ArraySize(*input_dims[i], concat_dim); - } - TFLITE_DCHECK_EQ(concat_size, ArraySize(output_dims, concat_dim)); - TFLITE_DCHECK(IsPackedWithoutStrides(output_dims)); - // for now we dont have a model with a Concatenation - // with fused activation function. - TFLITE_DCHECK(Ac == FusedActivationFunctionType::kNone); - int outer_size = 1; - for (int i = concat_dim + 1; i < 4; i++) { - outer_size *= output_dims.sizes[i]; - } - Scalar* output_ptr = output_data; - for (int k = 0; k < outer_size; k++) { - for (int i = 0; i < inputs_count; ++i) { - const int copy_size = - input_dims[i]->sizes[concat_dim] * input_dims[i]->strides[concat_dim]; - memcpy(output_ptr, input_data[i] + k * copy_size, - copy_size * sizeof(Scalar)); - output_ptr += copy_size; - } +inline void SubWithActivation(const ArithmeticParams& params, + const RuntimeShape& input1_shape, + const int32* input1_data, + const RuntimeShape& input2_shape, + const int32* input2_data, + const RuntimeShape& output_shape, + int32* output_data) { + gemmlowp::ScopedProfilingLabel label("SubWithActivation/int32"); + const int flat_size = + MatchingFlatSize(input1_shape, input2_shape, input2_shape); + for (int i = 0; i < flat_size; ++i) { + output_data[i] = ActivationFunctionWithMinMax( + input1_data[i] - input2_data[i], params.quantized_activation_min, + params.quantized_activation_max); } } -// TODO(prabhumk): This is the same as the reference implementation. -// TODO(prabhumk): The quantized implementation of concatentation isn't fully -// quantized as it takes scale as a floating point value. This should be fixed -// when optimizng this routine further. -inline void Concatenation(int concat_dim, const uint8* const* input_data, - const Dims<4>* const* input_dims, - const int32* input_zeropoint, - const float* input_scale, int inputs_count, - uint8* output_data, const Dims<4>& output_dims, - const int32 output_zeropoint, - const float output_scale) { - // The arguments input_zeropoint and input_scale are expected to be an array - // that have the quantization parameters for all the inputs to the concat - // operator. 
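// [Editor's note] The quantized Concatenation whose body follows rescales any
// input whose scale or zero point differs from the output's instead of
// memcpy-ing it. This restates that per-element requantization compactly; the
// function name is illustrative.
#include <algorithm>
#include <cmath>
#include <cstdint>

namespace sketch {

inline std::uint8_t RequantizeForConcat(std::uint8_t in, float input_scale,
                                        std::int32_t input_zero_point,
                                        float output_scale,
                                        std::int32_t output_zero_point) {
  // The real value of 'in' is input_scale * (in - input_zero_point);
  // re-express it against the output's scale and zero point, then clamp.
  const float scale = input_scale / output_scale;
  const float bias = -input_zero_point * scale;
  const std::int32_t value =
      static_cast<std::int32_t>(std::round(in * scale + bias)) +
      output_zero_point;
  return static_cast<std::uint8_t>(std::max(0, std::min(255, value)));
}

}  // namespace sketch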
- gemmlowp::ScopedProfilingLabel label("Concatenation"); - TFLITE_DCHECK_GT(inputs_count, 1); - int concat_size = 0; - for (int i = 0; i < inputs_count; i++) { - for (int j = 0; j < 4; j++) { - if (j != concat_dim) { - MatchingArraySize(*input_dims[i], j, output_dims, j); - } - } - concat_size += ArraySize(*input_dims[i], concat_dim); - } - TFLITE_DCHECK_EQ(concat_size, ArraySize(output_dims, concat_dim)); - int outer_size = 1; - for (int i = concat_dim + 1; i < 4; i++) { - outer_size *= output_dims.sizes[i]; - } - const float inverse_output_scale = 1.f / output_scale; - uint8* output_ptr = output_data; - for (int k = 0; k < outer_size; k++) { - for (int i = 0; i < inputs_count; ++i) { - const int copy_size = - input_dims[i]->sizes[concat_dim] * input_dims[i]->strides[concat_dim]; - const uint8* input_ptr = input_data[i] + k * copy_size; - if (input_zeropoint[i] == output_zeropoint && - input_scale[i] == output_scale) { - memcpy(output_ptr, input_ptr, copy_size); - } else { - const float scale = input_scale[i] * inverse_output_scale; - const float bias = -input_zeropoint[i] * scale; - for (int j = 0; j < copy_size; ++j) { - const int32_t value = - static_cast<int32_t>(round(input_ptr[j] * scale + bias)) + - output_zeropoint; - output_ptr[j] = - static_cast<uint8_t>(std::max(std::min(255, value), 0)); - } - } - output_ptr += copy_size; - } +inline void SubWithActivation(const ArithmeticParams& params, + const RuntimeShape& input1_shape, + const float* input1_data, + const RuntimeShape& input2_shape, + const float* input2_data, + const RuntimeShape& output_shape, + float* output_data) { + gemmlowp::ScopedProfilingLabel label("SubWithActivation/float"); + const int flat_size = + MatchingFlatSize(input1_shape, input2_shape, input2_shape); + for (int i = 0; i < flat_size; ++i) { + output_data[i] = ActivationFunctionWithMinMax( + input1_data[i] - input2_data[i], params.float_activation_min, + params.float_activation_max); } } -template <FusedActivationFunctionType Ac, typename Scalar> -void DepthConcatenation(const Scalar* const* input_data, - const Dims<4>* const* input_dims, int inputs_count, - Scalar* output_data, const Dims<4>& output_dims) { - Concatenation<Ac, Scalar>(0, input_data, input_dims, inputs_count, - output_data, output_dims); -} +template <typename T> +void Sub(const ArithmeticParams& params, const RuntimeShape& input1_shape, + const T* input1_data, const RuntimeShape& input2_shape, + const T* input2_data, const RuntimeShape& output_shape, + T* output_data) { + gemmlowp::ScopedProfilingLabel label("Sub"); -inline void LstmCell(const float* input_data, const Dims<4>& input_dims, - const float* prev_activ_data, - const Dims<4>& prev_activ_dims, const float* weights_data, - const Dims<4>& weights_dims, const float* bias_data, - const Dims<4>& bias_dims, const float* prev_state_data, - const Dims<4>& prev_state_dims, float* output_state_data, - const Dims<4>& output_state_dims, float* output_activ_data, - const Dims<4>& output_activ_dims, float* concat_temp_data, - const Dims<4>& concat_temp_dims, float* activ_temp_data, - const Dims<4>& activ_temp_dims) { + auto input1_map = MapAsVector(input1_data, input1_shape); + auto input2_map = MapAsVector(input2_data, input2_shape); + auto output_map = MapAsVector(output_data, output_shape); + if (input1_shape == input2_shape) { + output_map.array() = input1_map.array() - input2_map.array(); + } else if (input1_shape.FlatSize() == 1) { + auto scalar = input1_data[0]; + output_map.array() = scalar - input2_map.array(); + } else if 
(input2_shape.FlatSize() == 1) { + auto scalar = input2_data[0]; + output_map.array() = input1_map.array() - scalar; + } else { + BroadcastSub4DSlow(params, input1_shape, input1_data, input2_shape, + input2_data, output_shape, output_data); + } +} + +inline void LstmCell( + const LstmCellParams& params, const RuntimeShape& unextended_input_shape, + const float* input_data, const RuntimeShape& unextended_prev_activ_shape, + const float* prev_activ_data, const RuntimeShape& weights_shape, + const float* weights_data, const RuntimeShape& unextended_bias_shape, + const float* bias_data, const RuntimeShape& unextended_prev_state_shape, + const float* prev_state_data, + const RuntimeShape& unextended_output_state_shape, float* output_state_data, + const RuntimeShape& unextended_output_activ_shape, float* output_activ_data, + const RuntimeShape& unextended_concat_temp_shape, float* concat_temp_data, + const RuntimeShape& unextended_activ_temp_shape, float* activ_temp_data) { gemmlowp::ScopedProfilingLabel label("LstmCell"); - MatchingArraySize( // batches - input_dims, 3, prev_activ_dims, 3, prev_state_dims, 3, output_state_dims, - 3, output_activ_dims, 3); - MatchingArraySize( // height - input_dims, 2, prev_activ_dims, 2, prev_state_dims, 2, output_state_dims, - 2, output_activ_dims, 2); - MatchingArraySize( // width - input_dims, 1, prev_activ_dims, 1, prev_state_dims, 1, output_state_dims, - 1, output_activ_dims, 1); - TFLITE_CHECK_EQ(ArraySize(weights_dims, 2), 1); - TFLITE_CHECK_EQ(ArraySize(weights_dims, 3), 1); - const int input_depth = ArraySize(input_dims, 0); - const int prev_activ_depth = ArraySize(prev_activ_dims, 0); + TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4); + TFLITE_DCHECK_LE(unextended_prev_activ_shape.DimensionsCount(), 4); + TFLITE_DCHECK_LE(unextended_bias_shape.DimensionsCount(), 4); + TFLITE_DCHECK_LE(unextended_prev_state_shape.DimensionsCount(), 4); + TFLITE_DCHECK_LE(unextended_output_state_shape.DimensionsCount(), 4); + TFLITE_DCHECK_LE(unextended_output_activ_shape.DimensionsCount(), 4); + TFLITE_DCHECK_LE(unextended_concat_temp_shape.DimensionsCount(), 4); + TFLITE_DCHECK_LE(unextended_activ_temp_shape.DimensionsCount(), 4); + const RuntimeShape input_shape = + RuntimeShape::ExtendedShape(4, unextended_input_shape); + const RuntimeShape prev_activ_shape = + RuntimeShape::ExtendedShape(4, unextended_prev_activ_shape); + const RuntimeShape bias_shape = + RuntimeShape::ExtendedShape(4, unextended_bias_shape); + const RuntimeShape prev_state_shape = + RuntimeShape::ExtendedShape(4, unextended_prev_state_shape); + const RuntimeShape output_state_shape = + RuntimeShape::ExtendedShape(4, unextended_output_state_shape); + const RuntimeShape output_activ_shape = + RuntimeShape::ExtendedShape(4, unextended_output_activ_shape); + const RuntimeShape concat_temp_shape = + RuntimeShape::ExtendedShape(4, unextended_concat_temp_shape); + const RuntimeShape activ_temp_shape = + RuntimeShape::ExtendedShape(4, unextended_activ_temp_shape); + TFLITE_DCHECK_GE(weights_shape.DimensionsCount(), 2); + + const int weights_dim_count = weights_shape.DimensionsCount(); + MatchingDim( // batches + input_shape, 0, prev_activ_shape, 0, prev_state_shape, 0, + output_state_shape, 0, output_activ_shape, 0); + MatchingDim( // height + input_shape, 1, prev_activ_shape, 1, prev_state_shape, 1, + output_state_shape, 1, output_activ_shape, 1); + MatchingDim( // width + input_shape, 2, prev_activ_shape, 2, prev_state_shape, 2, + output_state_shape, 2, output_activ_shape, 2); + const int 
input_depth = input_shape.Dims(3); + const int prev_activ_depth = prev_activ_shape.Dims(3); const int total_input_depth = prev_activ_depth + input_depth; - TFLITE_CHECK_EQ(ArraySize(weights_dims, 0), total_input_depth); - TFLITE_CHECK_EQ(MatchingArraySize(bias_dims, 1, bias_dims, 2, bias_dims, 3), - 1); + TFLITE_DCHECK_EQ(weights_shape.Dims(weights_dim_count - 1), + total_input_depth); + TFLITE_DCHECK_EQ(FlatSizeSkipDim(bias_shape, 3), 1); const int intern_activ_depth = - MatchingArraySize(weights_dims, 1, bias_dims, 0); - TFLITE_CHECK_EQ(intern_activ_depth % 4, 0); + MatchingDim(weights_shape, weights_dim_count - 2, bias_shape, 3); + TFLITE_DCHECK_EQ(weights_shape.FlatSize(), + intern_activ_depth * total_input_depth); + TFLITE_DCHECK_EQ(intern_activ_depth % 4, 0); const int output_depth = - MatchingArraySize(prev_state_dims, 0, prev_activ_dims, 0, - output_state_dims, 0, output_activ_dims, 0); - TFLITE_CHECK_EQ(output_depth, intern_activ_depth / 4); + MatchingDim(prev_state_shape, 3, prev_activ_shape, 3, output_state_shape, + 3, output_activ_shape, 3); + TFLITE_DCHECK_EQ(output_depth, intern_activ_depth / 4); // Concatenate prev_activ and input data together std::vector<float const*> concat_input_arrays_data; - std::vector<Dims<4> const*> concat_input_arrays_dims; + std::vector<RuntimeShape const*> concat_input_arrays_shapes; concat_input_arrays_data.push_back(input_data); concat_input_arrays_data.push_back(prev_activ_data); - concat_input_arrays_dims.push_back(&input_dims); - concat_input_arrays_dims.push_back(&prev_activ_dims); - Concatenation<FusedActivationFunctionType::kNone, float>( - 0, &(concat_input_arrays_data[0]), &(concat_input_arrays_dims[0]), - concat_input_arrays_data.size(), concat_temp_data, concat_temp_dims); + concat_input_arrays_shapes.push_back(&input_shape); + concat_input_arrays_shapes.push_back(&prev_activ_shape); + tflite::ConcatenationParams concat_params; + concat_params.axis = 3; + concat_params.inputs_count = concat_input_arrays_data.size(); + Concatenation(concat_params, &(concat_input_arrays_shapes[0]), + &(concat_input_arrays_data[0]), concat_temp_shape, + concat_temp_data); // Fully connected - FullyConnected<FusedActivationFunctionType::kNone>( - concat_temp_data, concat_temp_dims, weights_data, weights_dims, bias_data, - bias_dims, activ_temp_data, activ_temp_dims); + tflite::FullyConnectedParams fc_params; + fc_params.float_activation_min = std::numeric_limits<float>::lowest(); + fc_params.float_activation_max = std::numeric_limits<float>::max(); + FullyConnected(fc_params, concat_temp_shape, concat_temp_data, weights_shape, + weights_data, bias_shape, bias_data, activ_temp_shape, + activ_temp_data); // Map raw arrays to Eigen arrays so we can use Eigen's optimized array // operations. 
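// [Editor's note] After the concatenation and fully connected steps above,
// activ_temp holds 4 * output_depth pre-activations per cell, in the block
// order used here: input gate, new input, forget gate, output gate. This is a
// scalar sketch of how those blocks are combined with the previous state,
// assuming the usual logistic/tanh gating with no peephole connections or
// cell clipping; names are illustrative.
#include <cmath>

namespace sketch {

struct LstmGatePreactivations {
  float input_gate;   // block 0
  float new_input;    // block 1
  float forget_gate;  // block 2
  float output_gate;  // block 3
};

inline float Sigmoid(float x) { return 1.0f / (1.0f + std::exp(-x)); }

// Returns the new cell state and writes the new activation (cell output).
inline float LstmCellStep(const LstmGatePreactivations& g, float prev_state,
                          float* output_activ) {
  const float new_state = Sigmoid(g.input_gate) * std::tanh(g.new_input) +
                          Sigmoid(g.forget_gate) * prev_state;
  *output_activ = Sigmoid(g.output_gate) * std::tanh(new_state);
  return new_state;
}

}  // namespace sketch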
ArrayMap<float> activ_temp_map = - MapAsArrayWithFirstDimAsRows(activ_temp_data, activ_temp_dims); + MapAsArrayWithLastDimAsRows(activ_temp_data, activ_temp_shape); auto input_gate_sm = activ_temp_map.block(0 * output_depth, 0, output_depth, activ_temp_map.cols()); auto new_input_sm = activ_temp_map.block(1 * output_depth, 0, output_depth, @@ -3450,11 +3142,11 @@ inline void LstmCell(const float* input_data, const Dims<4>& input_dims, auto output_gate_sm = activ_temp_map.block(3 * output_depth, 0, output_depth, activ_temp_map.cols()); ArrayMap<const float> prev_state_map = - MapAsArrayWithFirstDimAsRows(prev_state_data, prev_state_dims); + MapAsArrayWithLastDimAsRows(prev_state_data, prev_state_shape); ArrayMap<float> output_state_map = - MapAsArrayWithFirstDimAsRows(output_state_data, output_state_dims); + MapAsArrayWithLastDimAsRows(output_state_data, output_state_shape); ArrayMap<float> output_activ_map = - MapAsArrayWithFirstDimAsRows(output_activ_data, output_activ_dims); + MapAsArrayWithLastDimAsRows(output_activ_data, output_activ_shape); // Combined memory state and final output calculation gemmlowp::ScopedProfilingLabel label2("MemoryStateAndFinalOutput"); @@ -3472,52 +3164,91 @@ inline void LstmCell(const float* input_data, const Dims<4>& input_dims, // reference_ops.h. See the big function comment there, not replicating it // here. template <int StateIntegerBits> -void LstmCell(const uint8* input_data_uint8, const Dims<4>& input_dims, - const uint8* prev_activ_data_uint8, - const Dims<4>& prev_activ_dims, const uint8* weights_data_uint8, - const Dims<4>& weights_dims, const int32* bias_data_int32, - const Dims<4>& bias_dims, const int16* prev_state_data_int16, - const Dims<4>& prev_state_dims, int16* output_state_data_int16, - const Dims<4>& output_state_dims, uint8* output_activ_data_uint8, - const Dims<4>& output_activ_dims, uint8* concat_temp_data_uint8, - const Dims<4>& concat_temp_dims, int16* activ_temp_data_int16, - const Dims<4>& activ_temp_dims, int32 weights_zero_point, - int32 accum_multiplier, int accum_shift, - gemmlowp::GemmContext* gemm_context) { +inline void LstmCell( + const LstmCellParams& params, const RuntimeShape& unextended_input_shape, + const uint8* input_data_uint8, + const RuntimeShape& unextended_prev_activ_shape, + const uint8* prev_activ_data_uint8, const RuntimeShape& weights_shape, + const uint8* weights_data_uint8, const RuntimeShape& unextended_bias_shape, + const int32* bias_data_int32, + const RuntimeShape& unextended_prev_state_shape, + const int16* prev_state_data_int16, + const RuntimeShape& unextended_output_state_shape, + int16* output_state_data_int16, + const RuntimeShape& unextended_output_activ_shape, + uint8* output_activ_data_uint8, + const RuntimeShape& unextended_concat_temp_shape, + uint8* concat_temp_data_uint8, + const RuntimeShape& unextended_activ_temp_shape, + int16* activ_temp_data_int16, gemmlowp::GemmContext* gemm_context) { gemmlowp::ScopedProfilingLabel label( "LstmCell/quantized (8bit external, 16bit internal)"); + int32 weights_zero_point = params.weights_zero_point; + int32 accum_multiplier = params.accum_multiplier; + int accum_shift = params.accum_shift; + TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4); + TFLITE_DCHECK_LE(unextended_prev_activ_shape.DimensionsCount(), 4); + TFLITE_DCHECK_LE(unextended_bias_shape.DimensionsCount(), 4); + TFLITE_DCHECK_LE(unextended_prev_state_shape.DimensionsCount(), 4); + TFLITE_DCHECK_LE(unextended_output_state_shape.DimensionsCount(), 4); + 
TFLITE_DCHECK_LE(unextended_output_activ_shape.DimensionsCount(), 4); + TFLITE_DCHECK_LE(unextended_concat_temp_shape.DimensionsCount(), 4); + TFLITE_DCHECK_LE(unextended_activ_temp_shape.DimensionsCount(), 4); + const RuntimeShape input_shape = + RuntimeShape::ExtendedShape(4, unextended_input_shape); + const RuntimeShape prev_activ_shape = + RuntimeShape::ExtendedShape(4, unextended_prev_activ_shape); + const RuntimeShape bias_shape = + RuntimeShape::ExtendedShape(4, unextended_bias_shape); + const RuntimeShape prev_state_shape = + RuntimeShape::ExtendedShape(4, unextended_prev_state_shape); + const RuntimeShape output_state_shape = + RuntimeShape::ExtendedShape(4, unextended_output_state_shape); + const RuntimeShape output_activ_shape = + RuntimeShape::ExtendedShape(4, unextended_output_activ_shape); + const RuntimeShape concat_temp_shape = + RuntimeShape::ExtendedShape(4, unextended_concat_temp_shape); + const RuntimeShape activ_temp_shape = + RuntimeShape::ExtendedShape(4, unextended_activ_temp_shape); + TFLITE_DCHECK_GE(weights_shape.DimensionsCount(), 2); + // Gather dimensions information, and perform consistency checks. - const int outer_size = - MatchingFlatSizeSkipDim(input_dims, 0, prev_activ_dims, prev_state_dims, - output_state_dims, output_activ_dims); - TFLITE_CHECK_EQ(ArraySize(weights_dims, 2), 1); - TFLITE_CHECK_EQ(ArraySize(weights_dims, 3), 1); - const int input_depth = ArraySize(input_dims, 0); - const int prev_activ_depth = ArraySize(prev_activ_dims, 0); + const int weights_dim_count = weights_shape.DimensionsCount(); + const int outer_size = MatchingFlatSizeSkipDim( + input_shape, 3, prev_activ_shape, prev_state_shape, output_state_shape, + output_activ_shape); + const int input_depth = input_shape.Dims(3); + const int prev_activ_depth = prev_activ_shape.Dims(3); const int total_input_depth = prev_activ_depth + input_depth; - TFLITE_CHECK_EQ(ArraySize(weights_dims, 0), total_input_depth); - TFLITE_CHECK_EQ(MatchingArraySize(bias_dims, 1, bias_dims, 2, bias_dims, 3), - 1); + TFLITE_DCHECK_EQ(weights_shape.Dims(weights_dim_count - 1), + total_input_depth); const int intern_activ_depth = - MatchingArraySize(weights_dims, 1, bias_dims, 0); - TFLITE_CHECK_EQ(intern_activ_depth % 4, 0); + MatchingDim(weights_shape, weights_dim_count - 2, bias_shape, 3); + TFLITE_DCHECK_EQ(weights_shape.FlatSize(), + intern_activ_depth * total_input_depth); + TFLITE_DCHECK_EQ(FlatSizeSkipDim(bias_shape, 3), 1); + TFLITE_DCHECK_EQ(intern_activ_depth % 4, 0); const int output_depth = - MatchingArraySize(prev_state_dims, 0, prev_activ_dims, 0, - output_state_dims, 0, output_activ_dims, 0); - TFLITE_CHECK_EQ(output_depth, intern_activ_depth / 4); - const int fc_batches = FlatSizeSkipDim(activ_temp_dims, 0); + MatchingDim(prev_state_shape, 3, prev_activ_shape, 3, output_state_shape, + 3, output_activ_shape, 3); + TFLITE_DCHECK_EQ(output_depth, intern_activ_depth / 4); + const int fc_batches = FlatSizeSkipDim(activ_temp_shape, 3); const int fc_output_depth = - MatchingArraySize(weights_dims, 1, activ_temp_dims, 0); - const int fc_accum_depth = ArraySize(weights_dims, 0); - TFLITE_CHECK_EQ(fc_output_depth, 4 * output_depth); + MatchingDim(weights_shape, weights_dim_count - 2, activ_temp_shape, 3); + const int fc_accum_depth = total_input_depth; + TFLITE_DCHECK_EQ(fc_output_depth, 4 * output_depth); // Depth-concatenate prev_activ and input data together. 
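// [Editor's note] A minimal sketch (shown for float) of the depth
// concatenation performed here: two NHWC tensors that agree on
// batch/height/width are joined along the channel axis, which for the last,
// fastest-varying axis reduces to copying one channel run from each input per
// outer position. The kernel itself calls the generic Concatenation with
// axis = 3; this helper is illustrative only.
#include <cstring>

namespace sketch {

// outer_size = batch * height * width, identical for both inputs.
inline void ConcatDepth(const float* a, int a_depth, const float* b,
                        int b_depth, int outer_size, float* out) {
  for (int k = 0; k < outer_size; ++k) {
    std::memcpy(out, a + k * a_depth, a_depth * sizeof(float));
    out += a_depth;
    std::memcpy(out, b + k * b_depth, b_depth * sizeof(float));
    out += b_depth;
  }
}

}  // namespace sketch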
uint8 const* concat_input_arrays_data[2] = {input_data_uint8, prev_activ_data_uint8}; - Dims<4> const* concat_input_arrays_dims[2] = {&input_dims, &prev_activ_dims}; - Concatenation<FusedActivationFunctionType::kNone, uint8>( - 0, concat_input_arrays_data, concat_input_arrays_dims, 2, - concat_temp_data_uint8, concat_temp_dims); + const RuntimeShape* concat_input_arrays_shapes[2] = {&input_shape, + &prev_activ_shape}; + tflite::ConcatenationParams concat_params; + concat_params.axis = 3; + concat_params.inputs_count = 2; + Concatenation(concat_params, concat_input_arrays_shapes, + concat_input_arrays_data, concat_temp_shape, + concat_temp_data_uint8); // Implementation of the fully connected node inside the LSTM cell. // The operands are 8-bit integers, the accumulators are internally 32bit @@ -3527,10 +3258,10 @@ void LstmCell(const uint8* input_data_uint8, const Dims<4>& input_dims, bool gemm_already_performed = false; #ifdef GEMMLOWP_NEON if (fc_batches == 1 && !(fc_output_depth % 4) && !(fc_accum_depth % 8)) { - GEMVForLstmCell(concat_temp_data_uint8, concat_temp_dims, - weights_data_uint8, weights_dims, weights_zero_point, - bias_data_int32, bias_dims, accum_multiplier, accum_shift, - activ_temp_data_int16, activ_temp_dims); + GEMVForLstmCell(concat_temp_shape, concat_temp_data_uint8, weights_shape, + weights_data_uint8, weights_zero_point, bias_shape, + bias_data_int32, accum_multiplier, accum_shift, + activ_temp_shape, activ_temp_data_int16); gemm_already_performed = true; } #endif @@ -3719,51 +3450,28 @@ void LstmCell(const uint8* input_data_uint8, const Dims<4>& input_dims, } } -template <FusedActivationFunctionType Ac, typename Scalar> -void TensorFlowSplit(const Scalar* input_data, const Dims<4>& input_dims, - int outputs_count, Scalar* const* output_data, - const Dims<4>* const* output_dims) { - gemmlowp::ScopedProfilingLabel label("TensorFlowSplit"); - TFLITE_DCHECK_GE(outputs_count, 1); - for (int i = 0; i < outputs_count; i++) { - MatchingFlatSizeSkipDim(*output_dims[i], 0, input_dims); - } - const int outer_size = FlatSizeSkipDim(input_dims, 0); - TFLITE_DCHECK(IsPackedWithoutStrides(input_dims)); - // For now we don't have a model with a TensorFlowSplit - // with fused activation function. 
- TFLITE_DCHECK(Ac == FusedActivationFunctionType::kNone); - const Scalar* input_ptr = input_data; - for (int k = 0; k < outer_size; k++) { - for (int i = 0; i < outputs_count; ++i) { - memcpy(output_data[i] + k * output_dims[i]->sizes[0], input_ptr, - output_dims[i]->sizes[0] * sizeof(Scalar)); - input_ptr += output_dims[i]->sizes[0]; - } - } -} - inline int NodeOffset(int b, int h, int w, int height, int width) { return (b * height + h) * width + w; } -inline void AveragePool(const float* input_data, const Dims<4>& input_dims, - int stride_width, int stride_height, int pad_width, - int pad_height, int kwidth, int kheight, - float output_activation_min, - float output_activation_max, float* output_data, - const Dims<4>& output_dims) { +inline void AveragePool(const PoolParams& params, + const RuntimeShape& input_shape, + const float* input_data, + const RuntimeShape& output_shape, float* output_data) { gemmlowp::ScopedProfilingLabel label("AveragePool"); - const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); - const int input_height = ArraySize(input_dims, 2); - const int input_width = ArraySize(input_dims, 1); - const int output_height = ArraySize(output_dims, 2); - const int output_width = ArraySize(output_dims, 1); - const int depth = MatchingArraySize(input_dims, 0, output_dims, 0); + TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4); + TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4); + const int batches = MatchingDim(input_shape, 0, output_shape, 0); + const int input_height = input_shape.Dims(1); + const int input_width = input_shape.Dims(2); + const int output_height = output_shape.Dims(1); + const int output_width = output_shape.Dims(2); + const int stride_height = params.stride_height; + const int stride_width = params.stride_width; // TODO(benoitjacob) make this a proper reference impl without Eigen! - const auto in_mat = MapAsMatrixWithFirstDimAsRows(input_data, input_dims); - auto out_mat = MapAsMatrixWithFirstDimAsRows(output_data, output_dims); + const auto in_mat = MapAsMatrixWithLastDimAsRows(input_data, input_shape); + auto out_mat = MapAsMatrixWithLastDimAsRows(output_data, output_shape); // TODO(benoitjacob) get rid of the dynamic memory allocation here! Eigen::VectorXf out_count(out_mat.cols()); out_count.setZero(); @@ -3774,12 +3482,15 @@ inline void AveragePool(const float* input_data, const Dims<4>& input_dims, for (int w = 0; w < input_width; ++w) { // (h_start, h_end) * (w_start, w_end) is the range that the input // vector projects to. - int hpad = h + pad_height; - int wpad = w + pad_width; - int h_start = - (hpad < kheight) ? 0 : (hpad - kheight) / stride_height + 1; + int hpad = h + params.padding_values.height; + int wpad = w + params.padding_values.width; + int h_start = (hpad < params.filter_height) + ? 0 + : (hpad - params.filter_height) / stride_height + 1; int h_end = std::min(hpad / stride_height + 1, output_height); - int w_start = (wpad < kwidth) ? 0 : (wpad - kwidth) / stride_width + 1; + int w_start = (wpad < params.filter_width) + ? 
0 + : (wpad - params.filter_width) / stride_width + 1; int w_end = std::min(wpad / stride_width + 1, output_width); // compute elementwise sum for (int ph = h_start; ph < h_end; ++ph) { @@ -3797,69 +3508,44 @@ inline void AveragePool(const float* input_data, const Dims<4>& input_dims, TFLITE_DCHECK_GT(out_count.minCoeff(), 0); out_mat.array().rowwise() /= out_count.transpose().array(); - for (int b = 0; b < batches; ++b) { - for (int y = 0; y < output_height; ++y) { - for (int x = 0; x < output_width; ++x) { - for (int c = 0; c < depth; ++c) { - output_data[Offset(output_dims, c, x, y, b)] = - ActivationFunctionWithMinMax( - output_data[Offset(output_dims, c, x, y, b)], - output_activation_min, output_activation_max); - } - } - } + const int flat_size = output_shape.FlatSize(); + for (int i = 0; i < flat_size; ++i) { + output_data[i] = ActivationFunctionWithMinMax(output_data[i], + params.float_activation_min, + params.float_activation_max); } } -// legacy, for compatibility with old checked-in code -template <FusedActivationFunctionType Ac> -void AveragePool(const float* input_data, const Dims<4>& input_dims, - int stride_width, int stride_height, int pad_width, - int pad_height, int kwidth, int kheight, float* output_data, - const Dims<4>& output_dims) { - float output_activation_min, output_activation_max; - GetActivationMinMax(Ac, &output_activation_min, &output_activation_max); - - AveragePool(input_data, input_dims, stride_width, stride_height, pad_width, - pad_height, kwidth, kheight, output_activation_min, - output_activation_max, output_data, output_dims); -} - -// legacy, for compatibility with old checked-in code -template <FusedActivationFunctionType Ac> -void AveragePool(const float* input_data, const Dims<4>& input_dims, int stride, - int pad_width, int pad_height, int filter_width, - int filter_height, float* output_data, - const Dims<4>& output_dims) { - AveragePool<Ac>(input_data, input_dims, stride, stride, pad_width, pad_height, - filter_width, filter_height, output_data, output_dims); -} - -inline void AveragePool(const uint8* input_data, const Dims<4>& input_dims, - int stride_width, int stride_height, int pad_width, - int pad_height, int filter_width, int filter_height, - int32 output_activation_min, - int32 output_activation_max, uint8* output_data, - const Dims<4>& output_dims) { +inline void AveragePool(const PoolParams& params, + const RuntimeShape& input_shape, + const uint8* input_data, + const RuntimeShape& output_shape, uint8* output_data) { gemmlowp::ScopedProfilingLabel label("AveragePool/8bit"); - TFLITE_DCHECK_LE(output_activation_min, output_activation_max); - const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); - const int depth = MatchingArraySize(input_dims, 0, output_dims, 0); - const int input_height = ArraySize(input_dims, 2); - const int input_width = ArraySize(input_dims, 1); - const int output_height = ArraySize(output_dims, 2); - const int output_width = ArraySize(output_dims, 1); + TFLITE_DCHECK_LE(params.quantized_activation_min, + params.quantized_activation_max); + TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4); + TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4); + const int batches = MatchingDim(input_shape, 0, output_shape, 0); + const int depth = MatchingDim(input_shape, 3, output_shape, 3); + const int input_height = input_shape.Dims(1); + const int input_width = input_shape.Dims(2); + const int output_height = output_shape.Dims(1); + const int output_width = output_shape.Dims(2); + const int stride_height = 
params.stride_height; + const int stride_width = params.stride_width; for (int batch = 0; batch < batches; ++batch) { for (int out_y = 0; out_y < output_height; ++out_y) { for (int out_x = 0; out_x < output_width; ++out_x) { - const int in_x_origin = (out_x * stride_width) - pad_width; - const int in_y_origin = (out_y * stride_height) - pad_height; + const int in_x_origin = + (out_x * stride_width) - params.padding_values.width; + const int in_y_origin = + (out_y * stride_height) - params.padding_values.height; const int filter_x_start = std::max(0, -in_x_origin); const int filter_x_end = - std::min(filter_width, input_width - in_x_origin); + std::min(params.filter_width, input_width - in_x_origin); const int filter_y_start = std::max(0, -in_y_origin); const int filter_y_end = - std::min(filter_height, input_height - in_y_origin); + std::min(params.filter_height, input_height - in_y_origin); const int filter_count = (filter_x_end - filter_x_start) * (filter_y_end - filter_y_start); // 1280 required by Inception v3 @@ -3868,11 +3554,12 @@ inline void AveragePool(const uint8* input_data, const Dims<4>& input_dims, uint16 acc[kAccBufferMaxSize]; memset(acc, 0, depth * sizeof(acc[0])); const uint8* input_ptr = - input_data + input_dims.strides[1] * in_x_origin + - input_dims.strides[2] * in_y_origin + input_dims.strides[3] * batch; + input_data + + depth * (in_x_origin + + input_width * (in_y_origin + input_height * batch)); for (int fy = filter_y_start; fy < filter_y_end; fy++) { - const uint8* input_row_ptr = input_ptr + fy * input_dims.strides[2] + - filter_x_start * input_dims.strides[1]; + const uint8* input_row_ptr = + input_ptr + depth * (fy * input_width + filter_x_start); for (int fx = filter_x_start; fx < filter_x_end; fx++) { int channel = 0; #ifdef USE_NEON @@ -3903,21 +3590,21 @@ inline void AveragePool(const uint8* input_data, const Dims<4>& input_dims, } } uint8* output_ptr = - output_data + Offset(output_dims, 0, out_x, out_y, batch); + output_data + Offset(output_shape, batch, out_y, out_x, 0); int channel = 0; #ifdef USE_NEON -#define AVGPOOL_DIVIDING_BY(FILTER_COUNT) \ - if (filter_count == FILTER_COUNT) { \ - for (; channel <= depth - 8; channel += 8) { \ - uint16 buf[8]; \ - for (int i = 0; i < 8; i++) { \ - buf[i] = (acc[channel + i] + FILTER_COUNT / 2) / FILTER_COUNT; \ - } \ - uint8x8_t buf8 = vqmovn_u16(vld1q_u16(buf)); \ - buf8 = vmin_u8(buf8, vdup_n_u8(output_activation_max)); \ - buf8 = vmax_u8(buf8, vdup_n_u8(output_activation_min)); \ - vst1_u8(output_ptr + channel, buf8); \ - } \ +#define AVGPOOL_DIVIDING_BY(FILTER_COUNT) \ + if (filter_count == FILTER_COUNT) { \ + for (; channel <= depth - 8; channel += 8) { \ + uint16 buf[8]; \ + for (int i = 0; i < 8; i++) { \ + buf[i] = (acc[channel + i] + FILTER_COUNT / 2) / FILTER_COUNT; \ + } \ + uint8x8_t buf8 = vqmovn_u16(vld1q_u16(buf)); \ + buf8 = vmin_u8(buf8, vdup_n_u8(params.quantized_activation_max)); \ + buf8 = vmax_u8(buf8, vdup_n_u8(params.quantized_activation_min)); \ + vst1_u8(output_ptr + channel, buf8); \ + } \ } AVGPOOL_DIVIDING_BY(9) AVGPOOL_DIVIDING_BY(15) @@ -3928,15 +3615,15 @@ inline void AveragePool(const uint8* input_data, const Dims<4>& input_dims, buf[i] = (acc[channel + i] + filter_count / 2) / filter_count; } uint8x8_t buf8 = vqmovn_u16(vld1q_u16(buf)); - buf8 = vmin_u8(buf8, vdup_n_u8(output_activation_max)); - buf8 = vmax_u8(buf8, vdup_n_u8(output_activation_min)); + buf8 = vmin_u8(buf8, vdup_n_u8(params.quantized_activation_max)); + buf8 = vmax_u8(buf8, 
vdup_n_u8(params.quantized_activation_min)); vst1_u8(output_ptr + channel, buf8); } #endif for (; channel < depth; ++channel) { uint16 a = (acc[channel] + filter_count / 2) / filter_count; - a = std::max<uint16>(a, output_activation_min); - a = std::min<uint16>(a, output_activation_max); + a = std::max<uint16>(a, params.quantized_activation_min); + a = std::min<uint16>(a, params.quantized_activation_max); output_ptr[channel] = static_cast<uint8>(a); } } @@ -3944,54 +3631,22 @@ inline void AveragePool(const uint8* input_data, const Dims<4>& input_dims, } } -// legacy, for compatibility with old checked-in code -template <FusedActivationFunctionType Ac> -void AveragePool(const uint8* input_data, const Dims<4>& input_dims, - int stride_width, int stride_height, int pad_width, - int pad_height, int filter_width, int filter_height, - int32 output_activation_min, int32 output_activation_max, - uint8* output_data, const Dims<4>& output_dims) { - static_assert(Ac == FusedActivationFunctionType::kNone || - Ac == FusedActivationFunctionType::kRelu || - Ac == FusedActivationFunctionType::kRelu6 || - Ac == FusedActivationFunctionType::kRelu1, - ""); - if (Ac == FusedActivationFunctionType::kNone) { - TFLITE_DCHECK_EQ(output_activation_min, 0); - TFLITE_DCHECK_EQ(output_activation_max, 255); - } - AveragePool(input_data, input_dims, stride_width, stride_height, pad_width, - pad_height, filter_width, filter_height, output_activation_min, - output_activation_max, output_data, output_dims); -} - -// legacy, for compatibility with old checked-in code -template <FusedActivationFunctionType Ac> -void AveragePool(const uint8* input_data, const Dims<4>& input_dims, int stride, - int pad_width, int pad_height, int filter_width, - int filter_height, int32 output_activation_min, - int32 output_activation_max, uint8* output_data, - const Dims<4>& output_dims) { - AveragePool<Ac>(input_data, input_dims, stride, stride, pad_width, pad_height, - filter_width, filter_height, output_activation_min, - output_activation_max, output_data, output_dims); -} - -inline void MaxPool(const float* input_data, const Dims<4>& input_dims, - int stride_width, int stride_height, int pad_width, - int pad_height, int kwidth, int kheight, - float output_activation_min, float output_activation_max, - float* output_data, const Dims<4>& output_dims) { +inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape, + const float* input_data, const RuntimeShape& output_shape, + float* output_data) { gemmlowp::ScopedProfilingLabel label("MaxPool"); - const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); - const int input_height = ArraySize(input_dims, 2); - const int input_width = ArraySize(input_dims, 1); - const int output_height = ArraySize(output_dims, 2); - const int output_width = ArraySize(output_dims, 1); - const int depth = MatchingArraySize(input_dims, 0, output_dims, 0); - - const auto in_mat = MapAsMatrixWithFirstDimAsRows(input_data, input_dims); - auto out_mat = MapAsMatrixWithFirstDimAsRows(output_data, output_dims); + TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4); + TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4); + const int batches = MatchingDim(input_shape, 0, output_shape, 0); + const int input_height = input_shape.Dims(1); + const int input_width = input_shape.Dims(2); + const int output_height = output_shape.Dims(1); + const int output_width = output_shape.Dims(2); + const int stride_height = params.stride_height; + const int stride_width = params.stride_width; + + const auto 
in_mat = MapAsMatrixWithLastDimAsRows(input_data, input_shape); + auto out_mat = MapAsMatrixWithLastDimAsRows(output_data, output_shape); // Prefill the output to minimum representable float value out_mat.setConstant(std::numeric_limits<float>::lowest()); for (int b = 0; b < batches; ++b) { @@ -3999,12 +3654,15 @@ inline void MaxPool(const float* input_data, const Dims<4>& input_dims, for (int w = 0; w < input_width; ++w) { // (h_start, h_end) * (w_start, w_end) is the range that the input // vector projects to. - int hpad = h + pad_height; - int wpad = w + pad_width; - int h_start = - (hpad < kheight) ? 0 : (hpad - kheight) / stride_height + 1; + int hpad = h + params.padding_values.height; + int wpad = w + params.padding_values.width; + int h_start = (hpad < params.filter_height) + ? 0 + : (hpad - params.filter_height) / stride_height + 1; int h_end = std::min(hpad / stride_height + 1, output_height); - int w_start = (wpad < kwidth) ? 0 : (wpad - kwidth) / stride_width + 1; + int w_start = (wpad < params.filter_width) + ? 0 + : (wpad - params.filter_width) / stride_width + 1; int w_end = std::min(wpad / stride_width + 1, output_width); // compute elementwise sum for (int ph = h_start; ph < h_end; ++ph) { @@ -4019,78 +3677,55 @@ inline void MaxPool(const float* input_data, const Dims<4>& input_dims, } } } - - for (int b = 0; b < batches; ++b) { - for (int y = 0; y < output_height; ++y) { - for (int x = 0; x < output_width; ++x) { - for (int c = 0; c < depth; ++c) { - output_data[Offset(output_dims, c, x, y, b)] = - ActivationFunctionWithMinMax( - output_data[Offset(output_dims, c, x, y, b)], - output_activation_min, output_activation_max); - } - } - } + const int flat_size = output_shape.FlatSize(); + for (int i = 0; i < flat_size; ++i) { + output_data[i] = ActivationFunctionWithMinMax(output_data[i], + params.float_activation_min, + params.float_activation_max); } } -// legacy, for compatibility with old checked-in code -template <FusedActivationFunctionType Ac> -void MaxPool(const float* input_data, const Dims<4>& input_dims, - int stride_width, int stride_height, int pad_width, int pad_height, - int kwidth, int kheight, float* output_data, - const Dims<4>& output_dims) { - float output_activation_min, output_activation_max; - GetActivationMinMax(Ac, &output_activation_min, &output_activation_max); - MaxPool(input_data, input_dims, stride_width, stride_height, pad_width, - pad_height, kwidth, kheight, output_activation_min, - output_activation_max, output_data, output_dims); -} - -// legacy, for compatibility with old checked-in code -template <FusedActivationFunctionType Ac> -void MaxPool(const float* input_data, const Dims<4>& input_dims, int stride, - int pad_width, int pad_height, int filter_width, int filter_height, - float* output_data, const Dims<4>& output_dims) { - MaxPool<Ac>(input_data, input_dims, stride, stride, pad_width, pad_height, - filter_width, filter_height, output_data, output_dims); -} - -inline void MaxPool(const uint8* input_data, const Dims<4>& input_dims, - int stride_width, int stride_height, int pad_width, - int pad_height, int filter_width, int filter_height, - int32 output_activation_min, int32 output_activation_max, - uint8* output_data, const Dims<4>& output_dims) { +inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape, + const uint8* input_data, const RuntimeShape& output_shape, + uint8* output_data) { gemmlowp::ScopedProfilingLabel label("MaxPool/8bit"); - TFLITE_DCHECK_LE(output_activation_min, output_activation_max); - const 
int batches = MatchingArraySize(input_dims, 3, output_dims, 3); - const int depth = MatchingArraySize(input_dims, 0, output_dims, 0); - const int input_height = ArraySize(input_dims, 2); - const int input_width = ArraySize(input_dims, 1); - const int output_height = ArraySize(output_dims, 2); - const int output_width = ArraySize(output_dims, 1); + TFLITE_DCHECK_LE(params.quantized_activation_min, + params.quantized_activation_max); + TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4); + TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4); + const int batches = MatchingDim(input_shape, 0, output_shape, 0); + const int depth = MatchingDim(input_shape, 3, output_shape, 3); + const int input_height = input_shape.Dims(1); + const int input_width = input_shape.Dims(2); + const int output_height = output_shape.Dims(1); + const int output_width = output_shape.Dims(2); + const int stride_height = params.stride_height; + const int stride_width = params.stride_width; for (int batch = 0; batch < batches; ++batch) { for (int out_y = 0; out_y < output_height; ++out_y) { for (int out_x = 0; out_x < output_width; ++out_x) { - const int in_x_origin = (out_x * stride_width) - pad_width; - const int in_y_origin = (out_y * stride_height) - pad_height; + const int in_x_origin = + (out_x * stride_width) - params.padding_values.width; + const int in_y_origin = + (out_y * stride_height) - params.padding_values.height; const int filter_x_start = std::max(0, -in_x_origin); const int filter_x_end = - std::min(filter_width, input_width - in_x_origin); + std::min(params.filter_width, input_width - in_x_origin); const int filter_y_start = std::max(0, -in_y_origin); const int filter_y_end = - std::min(filter_height, input_height - in_y_origin); + std::min(params.filter_height, input_height - in_y_origin); // 2048 required by Inception v3 static constexpr int kAccBufferMaxSize = 2048; TFLITE_DCHECK_LE(depth, kAccBufferMaxSize); uint8 acc[kAccBufferMaxSize]; memset(acc, 0, depth * sizeof(acc[0])); const uint8* input_ptr = - input_data + input_dims.strides[1] * in_x_origin + - input_dims.strides[2] * in_y_origin + input_dims.strides[3] * batch; + input_data + + depth * (in_x_origin + + input_width * (in_y_origin + input_height * batch)); for (int fy = filter_y_start; fy < filter_y_end; fy++) { - const uint8* input_row_ptr = input_ptr + fy * input_dims.strides[2] + - filter_x_start * input_dims.strides[1]; + const uint8* input_row_ptr = + input_ptr + depth * (fy * input_width + filter_x_start); for (int fx = filter_x_start; fx < filter_x_end; fx++) { int channel = 0; #ifdef USE_NEON @@ -4116,26 +3751,26 @@ inline void MaxPool(const uint8* input_data, const Dims<4>& input_dims, } } uint8* output_ptr = - output_data + Offset(output_dims, 0, out_x, out_y, batch); + output_data + Offset(output_shape, batch, out_y, out_x, 0); int channel = 0; #ifdef USE_NEON for (; channel <= depth - 16; channel += 16) { uint8x16_t a = vld1q_u8(acc + channel); - a = vminq_u8(a, vdupq_n_u8(output_activation_max)); - a = vmaxq_u8(a, vdupq_n_u8(output_activation_min)); + a = vminq_u8(a, vdupq_n_u8(params.quantized_activation_max)); + a = vmaxq_u8(a, vdupq_n_u8(params.quantized_activation_min)); vst1q_u8(output_ptr + channel, a); } for (; channel <= depth - 8; channel += 8) { uint8x8_t a = vld1_u8(acc + channel); - a = vmin_u8(a, vdup_n_u8(output_activation_max)); - a = vmax_u8(a, vdup_n_u8(output_activation_min)); + a = vmin_u8(a, vdup_n_u8(params.quantized_activation_max)); + a = vmax_u8(a, vdup_n_u8(params.quantized_activation_min)); 
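The quantized AveragePool path above accumulates per-channel sums, divides with rounding as (acc + filter_count / 2) / filter_count, and, like the quantized MaxPool here, clamps to params.quantized_activation_min/max. A standalone scalar sketch of that divide-and-clamp step (illustrative only, not the NEON kernel):

#include <algorithm>
#include <cstdint>
#include <cstdio>

uint8_t AverageAndClamp(uint32_t acc, int filter_count, uint8_t act_min,
                        uint8_t act_max) {
  // Round-to-nearest integer division, as in (acc + filter_count / 2) / count.
  uint32_t a = (acc + filter_count / 2) / filter_count;
  a = std::max<uint32_t>(a, act_min);
  a = std::min<uint32_t>(a, act_max);
  return static_cast<uint8_t>(a);
}

int main() {
  // A 3x3 window summing to 1130 averages to 126 (1130 / 9 = 125.6).
  std::printf("%d\n", AverageAndClamp(1130, 9, 0, 255));
  return 0;
}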
vst1_u8(output_ptr + channel, a); } #endif for (; channel < depth; ++channel) { uint8 a = acc[channel]; - a = std::max<uint8>(a, output_activation_min); - a = std::min<uint8>(a, output_activation_max); + a = std::max<uint8>(a, params.quantized_activation_min); + a = std::min<uint8>(a, params.quantized_activation_max); output_ptr[channel] = static_cast<uint8>(a); } } @@ -4143,53 +3778,23 @@ inline void MaxPool(const uint8* input_data, const Dims<4>& input_dims, } } -// legacy, for compatibility with old checked-in code -template <FusedActivationFunctionType Ac> -void MaxPool(const uint8* input_data, const Dims<4>& input_dims, - int stride_width, int stride_height, int pad_width, int pad_height, - int filter_width, int filter_height, int32 output_activation_min, - int32 output_activation_max, uint8* output_data, - const Dims<4>& output_dims) { - static_assert(Ac == FusedActivationFunctionType::kNone || - Ac == FusedActivationFunctionType::kRelu || - Ac == FusedActivationFunctionType::kRelu6 || - Ac == FusedActivationFunctionType::kRelu1, - ""); - if (Ac == FusedActivationFunctionType::kNone) { - TFLITE_DCHECK_EQ(output_activation_min, 0); - TFLITE_DCHECK_EQ(output_activation_max, 255); - } - MaxPool(input_data, input_dims, stride_width, stride_height, pad_width, - pad_height, filter_width, filter_height, output_activation_min, - output_activation_max, output_data, output_dims); -} - -// legacy, for compatibility with old checked-in code -template <FusedActivationFunctionType Ac> -void MaxPool(const uint8* input_data, const Dims<4>& input_dims, int stride, - int pad_width, int pad_height, int filter_width, int filter_height, - int32 output_activation_min, int32 output_activation_max, - uint8* output_data, const Dims<4>& output_dims) { - MaxPool<Ac>(input_data, input_dims, stride, stride, pad_width, pad_height, - filter_width, filter_height, output_activation_min, - output_activation_max, output_data, output_dims); -} - -inline void L2Pool(const float* input_data, const Dims<4>& input_dims, - int stride_width, int stride_height, int pad_width, - int pad_height, int filter_width, int filter_height, - float output_activation_min, float output_activation_max, - float* output_data, const Dims<4>& output_dims) { +inline void L2Pool(const PoolParams& params, const RuntimeShape& input_shape, + const float* input_data, const RuntimeShape& output_shape, + float* output_data) { gemmlowp::ScopedProfilingLabel label("L2Pool"); - const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); - const int input_height = ArraySize(input_dims, 2); - const int input_width = ArraySize(input_dims, 1); - const int output_height = ArraySize(output_dims, 2); - const int output_width = ArraySize(output_dims, 1); + TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4); + TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4); + const int batches = MatchingDim(input_shape, 0, output_shape, 0); + const int input_height = input_shape.Dims(1); + const int input_width = input_shape.Dims(2); + const int output_height = output_shape.Dims(1); + const int output_width = output_shape.Dims(2); + const int stride_height = params.stride_height; + const int stride_width = params.stride_width; // Actually carry out L2 Pool. Code is written in forward mode: we go through // the input values once, and write to all the pooled regions that it maps to. 
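The L2Pool body that follows accumulates squared inputs per pooling window, divides by the number of contributing elements, and takes the square root. A scalar sketch of what one output cell computes (illustrative, not the Eigen-mapped kernel):

#include <cmath>
#include <cstdio>
#include <vector>

float L2PoolCell(const std::vector<float>& window) {
  float sum_of_squares = 0.f;
  for (float v : window) sum_of_squares += v * v;
  // Root of the mean of squares over the window.
  return std::sqrt(sum_of_squares / window.size());
}

int main() {
  // For the window {3, 4}: sqrt((9 + 16) / 2) = sqrt(12.5) ~= 3.5355.
  std::printf("%f\n", L2PoolCell({3.f, 4.f}));
  return 0;
}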
- const auto in_mat = MapAsMatrixWithFirstDimAsRows(input_data, input_dims); - auto out_mat = MapAsMatrixWithFirstDimAsRows(output_data, output_dims); + const auto in_mat = MapAsMatrixWithLastDimAsRows(input_data, input_shape); + auto out_mat = MapAsMatrixWithLastDimAsRows(output_data, output_shape); Eigen::VectorXf in_square(in_mat.rows()); Eigen::VectorXf out_count(out_mat.cols()); out_count.setZero(); @@ -4200,15 +3805,17 @@ inline void L2Pool(const float* input_data, const Dims<4>& input_dims, for (int w = 0; w < input_width; ++w) { // (h_start, h_end) * (w_start, w_end) is the range that the input // vector projects to. - const int hpad = h + pad_height; - const int wpad = w + pad_width; - const int h_start = (hpad < filter_height) - ? 0 - : (hpad - filter_height) / stride_height + 1; + const int hpad = h + params.padding_values.height; + const int wpad = w + params.padding_values.width; + const int h_start = + (hpad < params.filter_height) + ? 0 + : (hpad - params.filter_height) / stride_height + 1; const int h_end = std::min(hpad / stride_height + 1, output_height); - const int w_start = (wpad < filter_width) - ? 0 - : (wpad - filter_width) / stride_width + 1; + const int w_start = + (wpad < params.filter_width) + ? 0 + : (wpad - params.filter_width) / stride_width + 1; const int w_end = std::min(wpad / stride_width + 1, output_width); // pre-compute square const int in_offset = w + input_width * (h + input_height * b); @@ -4229,53 +3836,37 @@ inline void L2Pool(const float* input_data, const Dims<4>& input_dims, out_count = out_count.array().inverse(); out_mat = (out_mat.array().rowwise() * out_count.transpose().array()).cwiseSqrt(); -} -// legacy, for compatibility with old checked-in code -template <FusedActivationFunctionType Ac> -void L2Pool(const float* input_data, const Dims<4>& input_dims, - int stride_width, int stride_height, int pad_width, int pad_height, - int filter_width, int filter_height, float* output_data, - const Dims<4>& output_dims) { - float output_activation_min, output_activation_max; - GetActivationMinMax(Ac, &output_activation_min, &output_activation_max); - L2Pool(input_data, input_dims, stride_width, stride_height, pad_width, - pad_height, filter_width, filter_height, output_activation_min, - output_activation_max, output_data, output_dims); -} - -// legacy, for compatibility with old checked-in code -template <FusedActivationFunctionType Ac> -void L2Pool(const float* input_data, const Dims<4>& input_dims, int stride, - int pad_width, int pad_height, int filter_width, int filter_height, - float* output_data, const Dims<4>& output_dims) { - L2Pool<Ac>(input_data, input_dims, stride, stride, pad_width, pad_height, - filter_width, filter_height, output_data, output_dims); + const int flat_size = output_shape.FlatSize(); + for (int i = 0; i < flat_size; ++i) { + output_data[i] = ActivationFunctionWithMinMax(output_data[i], + params.float_activation_min, + params.float_activation_max); + } } -inline void LocalResponseNormalization(const float* input_data, - const Dims<4>& input_dims, int range, - float bias, float alpha, float beta, - float* output_data, - const Dims<4>& output_dims) { +inline void LocalResponseNormalization( + const tflite::LocalResponseNormalizationParams& op_params, + const RuntimeShape& input_shape, const float* input_data, + const RuntimeShape& output_shape, float* output_data) { gemmlowp::ScopedProfilingLabel label("LocalResponseNormalization"); - MatchingFlatSize(input_dims, output_dims); + MatchingFlatSize(input_shape, output_shape); - 
const auto data_in = MapAsMatrixWithFirstDimAsRows(input_data, input_dims); - auto data_out = MapAsMatrixWithFirstDimAsRows(output_data, output_dims); + const auto data_in = MapAsMatrixWithLastDimAsRows(input_data, input_shape); + auto data_out = MapAsMatrixWithLastDimAsRows(output_data, output_shape); // Carry out local response normalization, vector by vector. // Since the data are stored column major, making row-wise operation // probably not memory efficient anyway, we do an explicit for loop over // the columns. - const int double_range = range * 2; + const int double_range = op_params.range * 2; Eigen::VectorXf padded_square(data_in.rows() + double_range); padded_square.setZero(); for (int r = 0; r < data_in.cols(); ++r) { // Do local response normalization for data_in(:, r) // first, compute the square and store them in buffer for repeated use - padded_square.block(range, 0, data_in.rows(), 1) = - data_in.col(r).cwiseProduct(data_in.col(r)) * alpha; + padded_square.block(op_params.range, 0, data_in.rows(), 1) = + data_in.col(r).cwiseProduct(data_in.col(r)) * op_params.alpha; // Then, compute the scale and writes them to data_out float accumulated_scale = 0; for (int i = 0; i < double_range; ++i) { @@ -4283,32 +3874,33 @@ inline void LocalResponseNormalization(const float* input_data, } for (int i = 0; i < data_in.rows(); ++i) { accumulated_scale += padded_square(i + double_range); - data_out(i, r) = bias + accumulated_scale; + data_out(i, r) = op_params.bias + accumulated_scale; accumulated_scale -= padded_square(i); } } // In a few cases, the pow computation could benefit from speedups. - if (beta == 1) { + if (op_params.beta == 1) { data_out.array() = data_in.array() * data_out.array().inverse(); - } else if (beta == 0.5) { + } else if (op_params.beta == 0.5) { data_out.array() = data_in.array() * data_out.array().sqrt().inverse(); } else { - data_out.array() = data_in.array() * data_out.array().pow(-beta); + data_out.array() = data_in.array() * data_out.array().pow(-op_params.beta); } } -inline void Softmax(const float* input_data, const Dims<4>& input_dims, - float beta, float* output_data, - const Dims<4>& output_dims) { +inline void Softmax(const SoftmaxParams& params, + const RuntimeShape& input_shape, const float* input_data, + const RuntimeShape& output_shape, float* output_data) { gemmlowp::ScopedProfilingLabel label("Softmax"); - MatchingFlatSize(input_dims, output_dims); + MatchingFlatSize(input_shape, output_shape); - const auto in_mat = MapAsMatrixWithFirstDimAsRows(input_data, input_dims); - auto out_mat = MapAsMatrixWithFirstDimAsRows(output_data, output_dims); + const auto in_mat = MapAsMatrixWithLastDimAsRows(input_data, input_shape); + auto out_mat = MapAsMatrixWithLastDimAsRows(output_data, output_shape); // Compute the exponential first, removing the max coefficient for numerical // stability. - out_mat = (in_mat.rowwise() - in_mat.colwise().maxCoeff()).array() * beta; + out_mat = + (in_mat.rowwise() - in_mat.colwise().maxCoeff()).array() * params.beta; // We are separating out the exp function so that exp can be vectorized. out_mat = out_mat.array().exp(); // Normalize to get the activations. 
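The float Softmax above subtracts the per-column maximum before exponentiating (for numerical stability), scales the shifted logits by beta, and then normalizes by the column sum. A scalar sketch of those same steps (illustrative only; the kernel works column-wise on Eigen maps):

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

std::vector<float> Softmax(const std::vector<float>& in, float beta) {
  float max_val = in[0];
  for (float v : in) max_val = std::max(max_val, v);
  std::vector<float> out(in.size());
  float sum = 0.f;
  for (size_t i = 0; i < in.size(); ++i) {
    // Subtracting the max keeps exp() from overflowing for large logits.
    out[i] = std::exp((in[i] - max_val) * beta);
    sum += out[i];
  }
  for (float& v : out) v /= sum;  // Normalize to get the activations.
  return out;
}

int main() {
  for (float v : Softmax({1.f, 2.f, 3.f}, 1.f)) std::printf("%f ", v);
  std::printf("\n");  // ~0.0900 0.2447 0.6652
  return 0;
}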
@@ -4317,10 +3909,12 @@ inline void Softmax(const float* input_data, const Dims<4>& input_dims, out_mat.array().rowwise() *= scale; } -inline void Softmax(const uint8* input_data, const Dims<4>& input_dims, - int32 input_beta_multiplier, int32 input_beta_left_shift, - int diff_min, uint8* output_data, - const Dims<4>& output_dims) { +inline void Softmax(const SoftmaxParams& params, + const RuntimeShape& input_shape, const uint8* input_data, + const RuntimeShape& output_shape, uint8* output_data) { + const int32 input_beta_multiplier = params.input_multiplier; + const int32 input_beta_left_shift = params.input_left_shift; + const int diff_min = params.diff_min; // The representation chosen for the input to the exp() function is Q5.26. // We need to leave extra space since values that we skip might be as large as // -32 before multiplying by input_beta_multiplier, and therefore as large as @@ -4334,8 +3928,11 @@ inline void Softmax(const uint8* input_data, const Dims<4>& input_dims, using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>; gemmlowp::ScopedProfilingLabel label("Softmax/8bit"); - const int outer_size = MatchingFlatSizeSkipDim(input_dims, 0, output_dims); - const int depth = MatchingArraySize(input_dims, 0, output_dims, 0); + const int trailing_dim = input_shape.DimensionsCount() - 1; + const int outer_size = + MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape); + const int depth = + MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim); for (int b = 0; b < outer_size; ++b) { const uint8* input_data_ptr = input_data + b * depth; @@ -4452,7 +4049,7 @@ inline void Softmax(const uint8* input_data, const Dims<4>& input_dims, // perform a division by the above-computed sum-of-exponentials. int32 fixed_sum_of_exps = sum_of_exps.raw(); int headroom_plus_one = - __builtin_clz(static_cast<uint32>(fixed_sum_of_exps)); + CountLeadingZeros(static_cast<uint32>(fixed_sum_of_exps)); // This is the number of bits to the left of the binary point above 1.0. // Consider fixed_sum_of_exps=1.25. In that case shifted_scale=0.8 and // no later adjustment will be needed. @@ -4525,11 +4122,15 @@ inline void Softmax(const uint8* input_data, const Dims<4>& input_dims, // TODO(myenik): This is the same as the reference implementation, not actually // optimized yet. 
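The quantized Softmax hunk above swaps __builtin_clz for a CountLeadingZeros helper when measuring the headroom of the fixed-point sum of exponentials. The function below is a portable stand-in for such a helper, not the TFLite implementation:

#include <cstdint>
#include <cstdio>

int CountLeadingZeros(uint32_t x) {
  if (x == 0) return 32;  // __builtin_clz(0) would be undefined behavior.
  int count = 0;
  while ((x & 0x80000000u) == 0) {
    x <<= 1;
    ++count;
  }
  return count;
}

int main() {
  std::printf("%d %d %d\n", CountLeadingZeros(1u),
              CountLeadingZeros(0x80000000u),
              CountLeadingZeros(0x00FF0000u));  // 31 0 8
  return 0;
}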
-inline void LogSoftmax(const float* input_data, const Dims<4>& input_dims, - float* output_data, const Dims<4>& output_dims) { +inline void LogSoftmax(const SoftmaxParams& params, + const RuntimeShape& input_shape, const float* input_data, + const RuntimeShape& output_shape, float* output_data) { gemmlowp::ScopedProfilingLabel label("LogSoftmax"); - const int outer_size = MatchingFlatSizeSkipDim(input_dims, 0, output_dims); - const int depth = MatchingArraySize(input_dims, 0, output_dims, 0); + const int trailing_dim = input_shape.DimensionsCount() - 1; + const int outer_size = + MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape); + const int depth = + MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim); for (int i = 0; i < outer_size; ++i) { const float* block_input_data = input_data + i * depth; @@ -4556,13 +4157,129 @@ inline void LogSoftmax(const float* input_data, const Dims<4>& input_dims, } } +template <int OutputIntegerBits, int InputIntegerBits> +inline gemmlowp::FixedPoint<int32, OutputIntegerBits> +log_x_for_x_greater_than_or_equal_to_1_impl( + gemmlowp::FixedPoint<int32, InputIntegerBits> input_val) { + // assert(__builtin_clz(0u) >= std::numeric_limits<uint32>::digits - 1); + // assert(__builtin_clz(0u) <= std::numeric_limits<uint32>::digits); + using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>; + // The reason for accumulating the result with an extra bit of headroom is + // that z_pow_2_adj * log_2 might be saturated, and adding num_scaled * + // recip_denom will otherwise introduce an error. + static constexpr int kAccumIntegerBits = OutputIntegerBits + 1; + using FixedPointAccum = gemmlowp::FixedPoint<int32, kAccumIntegerBits>; + + const FixedPoint0 log_2 = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT( + FixedPoint0, 1488522236, std::log(2.0)); + const FixedPoint0 sqrt_sqrt_half = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT( + FixedPoint0, 1805811301, std::sqrt(std::sqrt(0.5))); + const FixedPoint0 sqrt_half = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT( + FixedPoint0, 1518500250, std::sqrt(0.5)); + const FixedPoint0 one_quarter = + GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(FixedPoint0, 536870912, 1.0 / 4.0); + + const FixedPoint0 alpha_n = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT( + FixedPoint0, 117049297, 11.0 / 240.0 * std::sqrt(std::sqrt(2.0))); + const FixedPoint0 alpha_d = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT( + FixedPoint0, 127690142, 1.0 / 20.0 * std::sqrt(std::sqrt(2.0))); + const FixedPoint0 alpha_i = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT( + FixedPoint0, 1057819769, + 2.0 / std::sqrt(std::sqrt(2.0)) - std::sqrt(std::sqrt(2.0))); + const FixedPoint0 alpha_f = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT( + FixedPoint0, 638450708, 1.0 / 4.0 * std::sqrt(std::sqrt(2.0))); + + const FixedPointAccum shifted_quarter = + gemmlowp::Rescale<kAccumIntegerBits>(one_quarter); + + // Reinterpret the input value as Q0.31, because we will figure out the + // required shift "ourselves" instead of using, say, Rescale. 
+ FixedPoint0 z_a = FixedPoint0::FromRaw(input_val.raw()); + // z_a_pow_2 = input_integer_bits - z_a_headroom; + int z_a_headroom_plus_1 = CountLeadingZeros(static_cast<uint32>(z_a.raw())); + FixedPoint0 r_a_tmp = + SaturatingRoundingMultiplyByPOTParam(z_a, (z_a_headroom_plus_1 - 1)); + const int32 r_a_raw = + SaturatingRoundingMultiplyByPOTParam((r_a_tmp * sqrt_half).raw(), 1); + // z_pow_2_adj = max(z_pow_2_a - 0.75, z_pow_2_b - 0.25); + // z_pow_2_adj = max(InputIntegerBits - z_a_headroom_plus_1 + 0.25, + // InputIntegerBits - z_b_headroom - 0.25); + const FixedPointAccum z_a_pow_2_adj = SaturatingAddNonGemmlowp( + FixedPointAccum::FromRaw(SaturatingRoundingMultiplyByPOTParam( + InputIntegerBits - z_a_headroom_plus_1, 31 - kAccumIntegerBits)), + shifted_quarter); + + // z_b is treated like z_a, but premultiplying by sqrt(0.5). + FixedPoint0 z_b = z_a * sqrt_half; + int z_b_headroom = CountLeadingZeros(static_cast<uint32>(z_b.raw())) - 1; + const int32 r_b_raw = + SaturatingRoundingMultiplyByPOTParam(z_a.raw(), z_b_headroom); + const FixedPointAccum z_b_pow_2_adj = SaturatingSub( + FixedPointAccum::FromRaw(SaturatingRoundingMultiplyByPOTParam( + InputIntegerBits - z_b_headroom, 31 - kAccumIntegerBits)), + shifted_quarter); + + const FixedPoint0 r = FixedPoint0::FromRaw(std::min(r_a_raw, r_b_raw)); + const FixedPointAccum z_pow_2_adj = FixedPointAccum::FromRaw( + std::max(z_a_pow_2_adj.raw(), z_b_pow_2_adj.raw())); + + const FixedPoint0 p = gemmlowp::RoundingHalfSum(r, sqrt_sqrt_half); + FixedPoint0 q = r - sqrt_sqrt_half; + q = q + q; + + const FixedPoint0 common_sq = q * q; + const FixedPoint0 num = q * r + q * common_sq * alpha_n; + const FixedPoint0 denom_minus_one_0 = + p * (alpha_i + q + alpha_d * common_sq) + alpha_f * q; + const FixedPoint0 recip_denom = + one_over_one_plus_x_for_x_in_0_1(denom_minus_one_0); + + const FixedPointAccum num_scaled = gemmlowp::Rescale<kAccumIntegerBits>(num); + return gemmlowp::Rescale<OutputIntegerBits>(z_pow_2_adj * log_2 + + num_scaled * recip_denom); +} + +// Minimum output bits to accommodate log of maximum input range. It actually +// does not matter if one considers, say, [-64,64] or [-64,64). +// +// For example, run this through Octave: +// [0:127; ... +// ceil(log(abs( log(2.^(0:127))+1 ))/log(2)); ... +// ceil(log(abs( log(2.^(0:127))+1 ))/log(2))] +constexpr int min_log_x_output_bits(int input_bits) { + return input_bits > 90 + ? 7 + : input_bits > 44 + ? 6 + : input_bits > 21 + ? 5 + : input_bits > 10 + ? 4 + : input_bits > 4 ? 3 : input_bits > 1 ? 2 : 1; +} + +template <int OutputIntegerBits, int InputIntegerBits> +inline gemmlowp::FixedPoint<int32, OutputIntegerBits> +log_x_for_x_greater_than_or_equal_to_1( + gemmlowp::FixedPoint<int32, InputIntegerBits> input_val) { + static_assert( + OutputIntegerBits >= min_log_x_output_bits(InputIntegerBits), + "Output integer bits must be sufficent to accommodate logs of inputs."); + return log_x_for_x_greater_than_or_equal_to_1_impl<OutputIntegerBits, + InputIntegerBits>( + input_val); +} + // Currently just a copy of the reference code. 
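The min_log_x_output_bits ladder above can be sanity-checked the same way the Octave one-liner in its comment suggests: an input with b integer bits has magnitude up to about 2^b, so its log needs roughly ceil(log2(b*ln2 + 1)) integer bits. A small verification sketch (illustrative only; the constexpr is copied from the hunk):

#include <cmath>
#include <cstdio>

constexpr int min_log_x_output_bits(int input_bits) {
  return input_bits > 90
             ? 7
             : input_bits > 44
                   ? 6
                   : input_bits > 21
                         ? 5
                         : input_bits > 10
                               ? 4
                               : input_bits > 4 ? 3 : input_bits > 1 ? 2 : 1;
}

int main() {
  for (int bits = 1; bits <= 127; ++bits) {
    const int needed =
        static_cast<int>(std::ceil(std::log2(bits * std::log(2.0) + 1.0)));
    if (min_log_x_output_bits(bits) < needed) {
      std::printf("table too small at %d bits\n", bits);
      return 1;
    }
  }
  std::printf("table covers 1..127 input bits\n");
  return 0;
}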
-inline void LogSoftmax(const uint8* input_data, const Dims<4>& input_dims, - int32 input_multiplier, int32 input_left_shift, - int32 reverse_scaling_divisor, - int32 reverse_scaling_right_shift, int diff_min, - uint8* output_data, const Dims<4>& output_dims) { +inline void LogSoftmax(const SoftmaxParams& params, + const RuntimeShape& input_shape, const uint8* input_data, + const RuntimeShape& output_shape, uint8* output_data) { gemmlowp::ScopedProfilingLabel label("LogSoftmax/Uint8"); + const int32 input_multiplier = params.input_multiplier; + const int32 input_left_shift = params.input_left_shift; + const int32 reverse_scaling_divisor = params.reverse_scaling_divisor; + const int32 reverse_scaling_right_shift = params.reverse_scaling_right_shift; + const int diff_min = params.diff_min; // The representation chosen for the input to the exp() function is Q5.26. // We need to leave extra space since values that we skip might be as large as // -32 before multiplying by input_beta_multiplier, and therefore as large as @@ -4576,8 +4293,11 @@ inline void LogSoftmax(const uint8* input_data, const Dims<4>& input_dims, using FixedPointAccum = gemmlowp::FixedPoint<int32, kAccumulationIntegerBits>; using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>; - const int outer_size = MatchingFlatSizeSkipDim(input_dims, 0, output_dims); - const int depth = MatchingArraySize(input_dims, 0, output_dims, 0); + const int trailing_dim = input_shape.DimensionsCount() - 1; + const int outer_size = + MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape); + const int depth = + MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim); for (int i = 0; i < outer_size; ++i) { const uint8* block_input_data = input_data + i * depth; @@ -4601,13 +4321,10 @@ inline void LogSoftmax(const uint8* input_data, const Dims<4>& input_dims, } } - // TODO(b/77858996): Implement fixed-point log(). - // Not a fully-quantized implementation: floating-point log(). - const float float_log_sum_of_exps = - std::log(static_cast<float>(sum_of_exps.raw()) / - (1 << (31 - kAccumulationIntegerBits))); - const int32 fixed_log_sum_of_exps = static_cast<int32>(TfLiteRound( - float_log_sum_of_exps * (1 << (31 - kScaledDiffIntegerBits)))); + const int32 fixed_log_sum_of_exps = + log_x_for_x_greater_than_or_equal_to_1<kScaledDiffIntegerBits>( + sum_of_exps) + .raw(); // rescaled_diff_min is smallest representable in // Q(kScaledDiffIntegerBits).(31-kScaledDiffIntegerBits) plus the @@ -4618,9 +4335,9 @@ inline void LogSoftmax(const uint8* input_data, const Dims<4>& input_dims, fixed_log_sum_of_exps + std::numeric_limits<int32>::lowest(); const int adjusted_diff_min = std::max(diff_min - 1, // Note use of > below instead of >= above. 
- MultiplyByQuantizedMultiplierSmallerThanOne( + MultiplyByQuantizedMultiplierSmallerThanOneExp( rescaled_diff_min, reverse_scaling_divisor, - reverse_scaling_right_shift)); + -reverse_scaling_right_shift)); for (int c = 0; c < depth; ++c) { int32 input_diff = static_cast<int32>(block_input_data[c]) - max_in_row; @@ -4644,21 +4361,33 @@ inline void LogSoftmax(const uint8* input_data, const Dims<4>& input_dims, } } -inline void Logistic(const float* input_data, const Dims<4>& input_dims, - float* output_data, const Dims<4>& output_dims) { +inline void Logistic(const RuntimeShape& input_shape, const float* input_data, + const RuntimeShape& output_shape, float* output_data) { gemmlowp::ScopedProfilingLabel label("Logistic"); - auto input_map = MapAsVector(input_data, input_dims); - auto output_map = MapAsVector(output_data, output_dims); + auto input_map = MapAsVector(input_data, input_shape); + auto output_map = MapAsVector(output_data, output_shape); output_map.array() = input_map.array().unaryExpr(Eigen::internal::scalar_sigmoid_op<float>()); } -inline void Logistic(const uint8* input_data, const Dims<4>& input_dims, - int32 input_zero_point, int32 input_range_radius, - int32 input_multiplier, int input_left_shift, - uint8* output_data, const Dims<4>& output_dims) { +// Convenience version that allows, for example, generated-code calls to be +// uniform between data types. +inline void Logistic(const LogisticParams&, const RuntimeShape& input_shape, + const float* input_data, const RuntimeShape& output_shape, + float* output_data) { + // Drop params: not needed. + Logistic(input_shape, input_data, output_shape, output_data); +} + +inline void Logistic(const LogisticParams& params, + const RuntimeShape& input_shape, const uint8* input_data, + const RuntimeShape& output_shape, uint8* output_data) { gemmlowp::ScopedProfilingLabel label("Logistic/Uint8"); - const int size = MatchingFlatSize(input_dims, output_dims); + const int32 input_zero_point = params.input_zero_point; + const int32 input_range_radius = params.input_range_radius; + const int32 input_multiplier = params.input_multiplier; + const int input_left_shift = params.input_left_shift; + const int size = MatchingFlatSize(input_shape, output_shape); int c = 0; #ifdef USE_NEON @@ -4790,10 +4519,11 @@ inline void Logistic(const uint8* input_data, const Dims<4>& input_dims, } } -inline void Logistic(const int16* input_data, const Dims<4>& input_dims, - int16* output_data, const Dims<4>& output_dims) { +inline void Logistic(const LogisticParams& params, + const RuntimeShape& input_shape, const int16* input_data, + const RuntimeShape& output_shape, int16* output_data) { gemmlowp::ScopedProfilingLabel label("Logistic/Int16"); - const int flat_size = MatchingFlatSize(output_dims, input_dims); + const int flat_size = MatchingFlatSize(input_shape, output_shape); for (int i = 0; i < flat_size; i++) { } @@ -4850,21 +4580,33 @@ inline void Logistic(const int16* input_data, const Dims<4>& input_dims, } } -inline void Tanh(const float* input_data, const Dims<4>& input_dims, - float* output_data, const Dims<4>& output_dims) { +inline void Tanh(const RuntimeShape& input_shape, const float* input_data, + const RuntimeShape& output_shape, float* output_data) { gemmlowp::ScopedProfilingLabel label("Tanh"); - auto input_map = MapAsVector(input_data, input_dims); - auto output_map = MapAsVector(output_data, output_dims); + auto input_map = MapAsVector(input_data, input_shape); + auto output_map = MapAsVector(output_data, output_shape); output_map.array() 
= input_map.array().tanh(); } -inline void Tanh(const uint8* input_data, const Dims<4>& input_dims, - int32 input_zero_point, int32 input_range_radius, - int32 input_multiplier, int input_left_shift, - uint8* output_data, const Dims<4>& output_dims) { +// Convenience version that allows, for example, generated-code calls to be +// uniform between data types. +inline void Tanh(const TanhParams&, const RuntimeShape& input_shape, + const float* input_data, const RuntimeShape& output_shape, + float* output_data) { + // Drop params: not needed. + Tanh(input_shape, input_data, output_shape, output_data); +} + +inline void Tanh(const TanhParams& params, const RuntimeShape& input_shape, + const uint8* input_data, const RuntimeShape& output_shape, + uint8* output_data) { // Note that this is almost the exact same code as in Logistic(). gemmlowp::ScopedProfilingLabel label("Tanh"); - const int size = MatchingFlatSize(input_dims, output_dims); + const int32 input_zero_point = params.input_zero_point; + const int32 input_range_radius = params.input_range_radius; + const int32 input_multiplier = params.input_multiplier; + const int input_left_shift = params.input_left_shift; + const int size = MatchingFlatSize(input_shape, output_shape); int c = 0; int32_t output_zero_point = 128; @@ -5005,16 +4747,17 @@ inline void Tanh(const uint8* input_data, const Dims<4>& input_dims, } } -inline void Tanh(const int16* input_data, const Dims<4>& input_dims, - int input_left_shift, int16* output_data, - const Dims<4>& output_dims) { +inline void Tanh(const TanhParams& params, const RuntimeShape& input_shape, + const int16* input_data, const RuntimeShape& output_shape, + int16* output_data) { gemmlowp::ScopedProfilingLabel label("Tanh/Int16"); + const int input_left_shift = params.input_left_shift; // Support for shifts is limited until we have a parameterized version of // SaturatingRoundingMultiplyByPOT(). TFLITE_DCHECK_GE(input_left_shift, 0); TFLITE_DCHECK_LE(input_left_shift, 1); - const int flat_size = MatchingFlatSize(output_dims, input_dims); + const int flat_size = MatchingFlatSize(input_shape, output_shape); int c = 0; const int16* input_data_ptr = input_data; @@ -5105,86 +4848,23 @@ inline void Tanh(const int16* input_data, const Dims<4>& input_dims, } } -inline void Dequantize(const uint8* input_data, const Dims<4>& input_dims, - int32 zero_point, double scale, float* output_data, - const Dims<4>& output_dims) { - gemmlowp::ScopedProfilingLabel label("Dequantize"); - const int flat_size = MatchingFlatSize(output_dims, input_dims); - for (int i = 0; i < flat_size; ++i) { - int32 val = input_data[i]; - float result = static_cast<float>(scale * (val - zero_point)); - output_data[i] = result; - } -} - -inline void FakeQuant(const float* input_data, const Dims<4>& input_dims, - float rmin, float rmax, int num_bits, float* output_data, - const Dims<4>& output_dims) { - gemmlowp::ScopedProfilingLabel label("FakeQuant"); - - // 0 should always be a representable value. Let's assume that the initial - // min,max range contains 0. - TFLITE_DCHECK_LE(rmin, 0.0f); - TFLITE_DCHECK_GE(rmax, 0.0f); - TFLITE_DCHECK_LT(rmin, rmax); - - // Code matches tensorflow's FakeQuantWithMinMaxArgsFunctor. 
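The Dequantize overload removed above maps a quantized value back to float as real = scale * (quantized - zero_point). A minimal standalone sketch of that affine mapping:

#include <cstdint>
#include <cstdio>

float Dequantize(uint8_t q, int32_t zero_point, double scale) {
  return static_cast<float>(scale * (q - zero_point));
}

int main() {
  // With zero_point 128 and scale 0.5, the quantized value 200 maps to 36.0.
  std::printf("%f\n", Dequantize(200, 128, 0.5));
  return 0;
}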
- int quant_min = 0; - int quant_max = (1 << num_bits) - 1; - float nudged_min, nudged_max, nudged_scale; - NudgeQuantizationRange(rmin, rmax, quant_min, quant_max, &nudged_min, - &nudged_max, &nudged_scale); - const float inv_nudged_scale = 1.0f / nudged_scale; - - const int flat_size = MatchingFlatSize(output_dims, input_dims); - for (int i = 0; i < flat_size; ++i) { - const float src_val = input_data[i]; - const float clamped = std::min(nudged_max, std::max(nudged_min, src_val)); - const float clamped_shifted = clamped - nudged_min; - const float dst_val = - TfLiteRound(clamped_shifted * inv_nudged_scale) * nudged_scale + - nudged_min; - output_data[i] = dst_val; - } -} - template <typename SrcT, typename DstT> -inline void Cast(const SrcT* input_data, const Dims<4>& input_dims, - DstT* output_data, const Dims<4>& output_dims) { +inline void Cast(const RuntimeShape& input_shape, const SrcT* input_data, + const RuntimeShape& output_shape, DstT* output_data) { gemmlowp::ScopedProfilingLabel label("Cast"); - auto input_map = MapAsVector(input_data, input_dims); - auto output_map = MapAsVector(output_data, output_dims); + auto input_map = MapAsVector(input_data, input_shape); + auto output_map = MapAsVector(output_data, output_shape); output_map.array() = input_map.array().template cast<DstT>(); } -inline void Floor(const float* input_data, const Dims<4>& input_dims, - float* output_data, const Dims<4>& output_dims) { +inline void Floor(const RuntimeShape& input_shape, const float* input_data, + const RuntimeShape& output_shape, float* output_data) { gemmlowp::ScopedProfilingLabel label("Floor"); - auto input_map = MapAsVector(input_data, input_dims); - auto output_map = MapAsVector(output_data, output_dims); + auto input_map = MapAsVector(input_data, input_shape); + auto output_map = MapAsVector(output_data, output_shape); output_map.array() = Eigen::floor(input_map.array()); } -template <typename T> -inline void Gather(const T* input_data, const Dims<4>& input_dims, - int input_rank, const int32* coords_data, - const Dims<4>& coords_dims, T* output_data, - const Dims<4>& output_dims) { - gemmlowp::ScopedProfilingLabel label("Gather"); - - TFLITE_DCHECK(coords_dims.sizes[0] == output_dims.sizes[input_rank - 1]); - int stride = input_dims.strides[input_rank - 1]; - T* out = output_data; - - for (int i = 0; i < coords_dims.sizes[0]; i++) { - TFLITE_DCHECK_GE(coords_data[i], 0); - TFLITE_DCHECK_LT(coords_data[i], input_dims.sizes[input_rank - 1]); - const T* in = input_data + coords_data[i] * stride; - memcpy(out, in, sizeof(T) * stride); - out += stride; - } -} - #ifdef USE_NEON inline void ResizeBilinearKernel(const float* input_ptr, int32 depth, float scale, float* output_ptr) { @@ -5284,12 +4964,14 @@ inline void ResizeBilinearKernel(const float* input_ptr, int32 depth, inline void ResizeBilinearKernel2x2(int32 x0, int32 x1, int32 y0, int32 y1, int32 x, int32 y, int32 depth, int32 batch, + const RuntimeShape& input_shape, const float* input_data, - const Dims<4>& input_dims, - float* output_data, - const Dims<4>& output_dims) { - const int32 input_width = ArraySize(input_dims, 1); - const int32 output_width = ArraySize(output_dims, 1); + const RuntimeShape& output_shape, + float* output_data) { + TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4); + TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4); + const int32 input_width = input_shape.Dims(2); + const int32 output_width = output_shape.Dims(2); const int32 input_x_offset = (x1 - x0) * depth; const int32 input_y_offset = (y1 - y0) * 
depth * input_width; @@ -5297,7 +4979,6 @@ inline void ResizeBilinearKernel2x2(int32 x0, int32 x1, int32 y0, int32 y1, const int32 output_y_offset = depth * output_width; #ifdef USE_NEON - TFLITE_DCHECK(IsPackedWithoutStrides(input_dims)); TFLITE_DCHECK(x1 >= x0); TFLITE_DCHECK(y1 >= y0); @@ -5307,7 +4988,7 @@ inline void ResizeBilinearKernel2x2(int32 x0, int32 x1, int32 y0, int32 y1, const float* input_ptr = nullptr; float32x4x2_t x0y0; - input_ptr = &input_data[Offset(input_dims, ic, x0, y0, batch)]; + input_ptr = &input_data[Offset(input_shape, batch, y0, x0, ic)]; x0y0.val[0] = vld1q_f32(input_ptr); x0y0.val[1] = vld1q_f32(input_ptr + 4); @@ -5327,7 +5008,7 @@ inline void ResizeBilinearKernel2x2(int32 x0, int32 x1, int32 y0, int32 y1, x1y1.val[1] = vld1q_f32(input_ptr + 4); // Top left corner. - float* output_ptr = &output_data[Offset(output_dims, ic, x, y, batch)]; + float* output_ptr = &output_data[Offset(output_shape, batch, y, x, ic)]; vst1q_f32(output_ptr, x0y0.val[0]); vst1q_f32(output_ptr + 4, x0y0.val[1]); @@ -5366,14 +5047,15 @@ inline void ResizeBilinearKernel2x2(int32 x0, int32 x1, int32 y0, int32 y1, } // Handle 4 input channels at a time. for (; ic <= depth - 4; ic += 4) { - const float* input_ptr = &input_data[Offset(input_dims, ic, x0, y0, batch)]; + const float* input_ptr = + &input_data[Offset(input_shape, batch, y0, x0, ic)]; float32x4_t x0y0 = vld1q_f32(input_ptr); float32x4_t x1y0 = vld1q_f32(input_ptr + input_x_offset); float32x4_t x0y1 = vld1q_f32(input_ptr + input_y_offset); float32x4_t x1y1 = vld1q_f32(input_ptr + input_x_offset + input_y_offset); // Top left corner. - float* output_ptr = &output_data[Offset(output_dims, ic, x, y, batch)]; + float* output_ptr = &output_data[Offset(output_shape, batch, y, x, ic)]; vst1q_f32(output_ptr, x0y0); // Top right corner. @@ -5397,7 +5079,7 @@ inline void ResizeBilinearKernel2x2(int32 x0, int32 x1, int32 y0, int32 y1, } // Handle one input channel at a time. for (; ic < depth; ic++) { - const int32 input_offset = Offset(input_dims, ic, x0, y0, batch); + const int32 input_offset = Offset(input_shape, batch, y0, x0, ic); float x0y0 = input_data[input_offset]; float x1y0 = input_data[input_offset + input_x_offset]; @@ -5405,7 +5087,7 @@ inline void ResizeBilinearKernel2x2(int32 x0, int32 x1, int32 y0, int32 y1, float x1y1 = input_data[input_offset + input_x_offset + input_y_offset]; // Top left corner. - const int32 output_offset = Offset(output_dims, ic, x, y, batch); + const int32 output_offset = Offset(output_shape, batch, y, x, ic); output_data[output_offset] = x0y0; // Top right corner. @@ -5421,7 +5103,7 @@ inline void ResizeBilinearKernel2x2(int32 x0, int32 x1, int32 y0, int32 y1, } #else for (int ch = 0; ch < depth; ch++) { - const int32 input_offset = Offset(input_dims, ch, x0, y0, batch); + const int32 input_offset = Offset(input_shape, batch, y0, x0, ch); float x0y0 = input_data[input_offset]; float x1y0 = input_data[input_offset + input_x_offset]; @@ -5429,7 +5111,7 @@ inline void ResizeBilinearKernel2x2(int32 x0, int32 x1, int32 y0, int32 y1, float x1y1 = input_data[input_offset + input_x_offset + input_y_offset]; // Top left corner. - const int32 output_offset = Offset(output_dims, ch, x, y, batch); + const int32 output_offset = Offset(output_shape, batch, y, x, ch); output_data[output_offset] = x0y0; // Top right corner. 
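The rewritten calls above index NHWC data as Offset(shape, b, y, x, c), i.e. a flat index of ((b*H + y)*W + x)*D + c, replacing the old strides-based Dims<4> arithmetic with Offset(dims, c, x, y, b). A standalone sketch of that flat index:

#include <cstdio>

int OffsetNHWC(int height, int width, int depth, int b, int y, int x, int c) {
  // Row-major NHWC layout: depth is innermost, batch is outermost.
  return ((b * height + y) * width + x) * depth + c;
}

int main() {
  // For shape [2, 4, 5, 3]: element (b=1, y=2, x=3, c=1) lives at index 100.
  std::printf("%d\n", OffsetNHWC(4, 5, 3, 1, 2, 3, 1));
  return 0;
}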
@@ -5446,31 +5128,30 @@ inline void ResizeBilinearKernel2x2(int32 x0, int32 x1, int32 y0, int32 y1, #endif } -inline void ResizeBilinear2x2(const float* input_data, - const Dims<4>& input_dims, float* output_data, - const Dims<4>& output_dims, int32 batches, - int32 input_height, int32 input_width, - int32 depth, int32 output_height, - int32 output_width) { +inline void ResizeBilinear2x2(int32 batches, int32 input_height, + int32 input_width, int32 depth, + int32 output_height, int32 output_width, + const RuntimeShape& input_shape, + const float* input_data, + const RuntimeShape& output_shape, + float* output_data) { for (int b = 0; b < batches; b++) { for (int y0 = 0, y = 0; y <= output_height - 2; y += 2, y0++) { for (int x0 = 0, x = 0; x <= output_width - 2; x += 2, x0++) { int32 x1 = std::min(x0 + 1, input_width - 1); int32 y1 = std::min(y0 + 1, input_height - 1); - ResizeBilinearKernel2x2(x0, x1, y0, y1, x, y, depth, b, input_data, - input_dims, output_data, output_dims); + ResizeBilinearKernel2x2(x0, x1, y0, y1, x, y, depth, b, input_shape, + input_data, output_shape, output_data); } } } } -inline void ResizeBilinearGeneric(const float* input_data, - const Dims<4>& input_dims, float* output_data, - const Dims<4>& output_dims, int32 batches, - int32 input_height, int32 input_width, - int32 depth, int32 output_height, - int32 output_width, float height_scale, - float width_scale) { +inline void ResizeBilinearGeneric( + int32 batches, int32 input_height, int32 input_width, int32 depth, + int32 output_height, int32 output_width, float height_scale, + float width_scale, const RuntimeShape& input_shape, const float* input_data, + const RuntimeShape& output_shape, float* output_data) { memset(output_data, 0, batches * output_height * output_width * depth * sizeof(float)); @@ -5487,22 +5168,22 @@ inline void ResizeBilinearGeneric(const float* input_data, float* output_ptr = &output_data[output_offset]; // Run kernel on the 4 corners of the bilinear resize algorithm. 
- int32 input_offset = Offset(input_dims, 0, x0, y0, b); + int32 input_offset = Offset(input_shape, b, y0, x0, 0); float scale = (1 - (input_y - y0)) * (1 - (input_x - x0)); const float* input_ptr = &input_data[input_offset]; ResizeBilinearKernel(input_ptr, depth, scale, output_ptr); - input_offset = Offset(input_dims, 0, x1, y0, b); + input_offset = Offset(input_shape, b, y0, x1, 0); scale = (1 - (input_y - y0)) * (input_x - x0); input_ptr = &input_data[input_offset]; ResizeBilinearKernel(input_ptr, depth, scale, output_ptr); - input_offset = Offset(input_dims, 0, x0, y1, b); + input_offset = Offset(input_shape, b, y1, x0, 0); scale = (input_y - y0) * (1 - (input_x - x0)); input_ptr = &input_data[input_offset]; ResizeBilinearKernel(input_ptr, depth, scale, output_ptr); - input_offset = Offset(input_dims, 0, x1, y1, b); + input_offset = Offset(input_shape, b, y1, x1, 0); scale = (input_y - y0) * (input_x - x0); input_ptr = &input_data[input_offset]; ResizeBilinearKernel(input_ptr, depth, scale, output_ptr); @@ -5513,102 +5194,134 @@ inline void ResizeBilinearGeneric(const float* input_data, } } -inline void ResizeBilinear(const float* input_data, const Dims<4>& input_dims, +template <typename T> +inline void ResizeBilinearGenericSmallChannel( + int32 batches, int32 input_height, int32 input_width, int32 depth, + int32 output_height, int32 output_width, float height_scale, + float width_scale, const RuntimeShape& input_shape, const T* input_data, + const RuntimeShape& output_shape, T* output_data) { + memset(output_data, 0, + batches * output_height * output_width * depth * sizeof(T)); + + T* output_ptr = &output_data[0]; + for (int b = 0; b < batches; ++b) { + for (int y = 0; y < output_height; ++y) { + float input_y = y * height_scale; + int32 y0 = static_cast<int32>(std::floor(input_y)); + int32 y1 = std::min(y0 + 1, input_height - 1); + for (int x = 0; x < output_width; ++x) { + float input_x = x * width_scale; + int32 x0 = static_cast<int32>(input_x); + int32 x1 = std::min(x0 + 1, input_width - 1); + + int32 input_offset[4] = {Offset(input_shape, b, y0, x0, 0), + Offset(input_shape, b, y0, x1, 0), + Offset(input_shape, b, y1, x0, 0), + Offset(input_shape, b, y1, x1, 0)}; + float scale[4] = {(1 - (input_y - y0)) * (1 - (input_x - x0)), + (1 - (input_y - y0)) * (input_x - x0), + (input_y - y0) * (1 - (input_x - x0)), + (input_y - y0) * (input_x - x0)}; + + for (int d = 0; d < depth; d++) { + const T* input_ptr = &input_data[d]; + *output_ptr++ = static_cast<T>(input_ptr[input_offset[0]] * scale[0] + + input_ptr[input_offset[1]] * scale[1] + + input_ptr[input_offset[2]] * scale[2] + + input_ptr[input_offset[3]] * scale[3]); + } + } + } + } +} + +inline void ResizeBilinear(const tflite::ResizeBilinearParams& op_params, + const RuntimeShape& unextended_input_shape, + const float* input_data, + const RuntimeShape& output_size_shape, const int32* output_size_data, - const Dims<4>& output_size_dims, float* output_data, - const Dims<4>& output_dims, bool align_corners) { + const RuntimeShape& unextended_output_shape, + float* output_data) { gemmlowp::ScopedProfilingLabel label("ResizeBilinear"); - int32 batches = MatchingArraySize(input_dims, 3, output_dims, 3); - int32 input_height = ArraySize(input_dims, 2); - int32 input_width = ArraySize(input_dims, 1); - int32 depth = MatchingArraySize(input_dims, 0, output_dims, 0); - - TFLITE_DCHECK_EQ(ArraySize(output_size_dims, 3), 1); - TFLITE_DCHECK_EQ(ArraySize(output_size_dims, 2), 1); - TFLITE_DCHECK_EQ(ArraySize(output_size_dims, 1), 1); - 
TFLITE_DCHECK_EQ(ArraySize(output_size_dims, 0), 2); - int32 output_height = output_size_data[Offset(output_size_dims, 0, 0, 0, 0)]; - int32 output_width = output_size_data[Offset(output_size_dims, 1, 0, 0, 0)]; + TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4); + TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4); + const RuntimeShape input_shape = + RuntimeShape::ExtendedShape(4, unextended_input_shape); + const RuntimeShape output_shape = + RuntimeShape::ExtendedShape(4, unextended_output_shape); + + int32 batches = MatchingDim(input_shape, 0, output_shape, 0); + int32 input_height = input_shape.Dims(1); + int32 input_width = input_shape.Dims(2); + int32 depth = MatchingDim(input_shape, 3, output_shape, 3); + + TFLITE_DCHECK_EQ(output_size_shape.FlatSize(), 2); + int32 output_height = output_size_data[0]; + int32 output_width = output_size_data[1]; // Specialize for 2x2 upsample. - if (!align_corners && output_height == 2 * input_height && + if (!op_params.align_corners && output_height == 2 * input_height && output_width == 2 * input_width) { - ResizeBilinear2x2(input_data, input_dims, output_data, output_dims, batches, - input_height, input_width, depth, output_height, - output_width); + ResizeBilinear2x2(batches, input_height, input_width, depth, output_height, + output_width, input_shape, input_data, output_shape, + output_data); } else { float height_scale = static_cast<float>(input_height) / output_height; float width_scale = static_cast<float>(input_width) / output_width; - if (align_corners && output_height > 1) { + if (op_params.align_corners && output_height > 1) { height_scale = static_cast<float>(input_height - 1) / (output_height - 1); } - if (align_corners && output_width > 1) { + if (op_params.align_corners && output_width > 1) { width_scale = static_cast<float>(input_width - 1) / (output_width - 1); } - ResizeBilinearGeneric(input_data, input_dims, output_data, output_dims, - batches, input_height, input_width, depth, + ResizeBilinearGeneric(batches, input_height, input_width, depth, output_height, output_width, height_scale, - width_scale); + width_scale, input_shape, input_data, output_shape, + output_data); } } -// legacy, for compatibility with old checked-in code -inline void ResizeBilinear(const float* input_data, const Dims<4>& input_dims, +// TODO(prabhumk): This is not a real quantized bilinear. It does not use int8 +// or int16 arithmetic. +inline void ResizeBilinear(const tflite::ResizeBilinearParams& op_params, + const RuntimeShape& unextended_input_shape, + const uint8* input_data, + const RuntimeShape& output_size_shape, const int32* output_size_data, - const Dims<4>& output_size_dims, float* output_data, - const Dims<4>& output_dims) { - ResizeBilinear(input_data, input_dims, output_size_data, output_size_dims, - output_data, output_dims, /*align_corners=*/false); -} - -template <typename T> -inline void SpaceToBatchND(const T* input_data, const Dims<4>& input_dims, - const int32* block_shape_data, - const Dims<4>& block_shape_dims, - const int32* paddings_data, - const Dims<4>& paddings_dims, T* output_data, - const Dims<4>& output_dims) { - // Unoptimized - Straight copy from reference ops. 
- gemmlowp::ScopedProfilingLabel label("SpaceToBatchND"); - - const int output_batch_size = ArraySize(output_dims, 3); - const int output_height = ArraySize(output_dims, 2); - const int output_width = ArraySize(output_dims, 1); - const int input_batch_size = ArraySize(input_dims, 3); - const int input_height = ArraySize(input_dims, 2); - const int input_width = ArraySize(input_dims, 1); - const int depth = ArraySize(input_dims, 0); - const int block_shape_height = block_shape_data[0]; - const int block_shape_width = block_shape_data[1]; - const int padding_top = paddings_data[0]; - const int padding_left = paddings_data[2]; - - for (int out_b = 0; out_b < output_batch_size; ++out_b) { - int input_batch = out_b % input_batch_size; - int shift_w = (out_b / input_batch_size) % block_shape_width; - int shift_h = (out_b / input_batch_size) / block_shape_width; - for (int out_h = 0; out_h < output_height; ++out_h) { - for (int out_w = 0; out_w < output_width; ++out_w) { - T* out = output_data + Offset(output_dims, 0, out_w, out_h, out_b); - if (out_h * block_shape_height + shift_h < padding_top || - out_h * block_shape_height + shift_h >= - padding_top + input_height || - out_w * block_shape_width + shift_w < padding_left || - out_w * block_shape_width + shift_w >= padding_left + input_width) { - memset(out, 0, depth * sizeof(T)); - } else { - const T* in = - input_data + - Offset(input_dims, 0, - (out_w * block_shape_width + shift_w) - padding_left, - (out_h * block_shape_height + shift_h) - padding_top, - input_batch); - memcpy(out, in, depth * sizeof(T)); - } - } - } - } + const RuntimeShape& unextended_output_shape, + uint8* output_data) { + gemmlowp::ScopedProfilingLabel label("ResizeBilinear"); + TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4); + TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4); + const RuntimeShape input_shape = + RuntimeShape::ExtendedShape(4, unextended_input_shape); + const RuntimeShape output_shape = + RuntimeShape::ExtendedShape(4, unextended_output_shape); + + int32 batches = MatchingDim(input_shape, 0, output_shape, 0); + int32 input_height = input_shape.Dims(1); + int32 input_width = input_shape.Dims(2); + int32 depth = MatchingDim(input_shape, 3, output_shape, 3); + + TFLITE_DCHECK_EQ(output_size_shape.FlatSize(), 2); + int32 output_height = output_size_data[0]; + int32 output_width = output_size_data[1]; + + float height_scale = + (op_params.align_corners && output_height > 1) + ? (static_cast<float>(input_height - 1) / (output_height - 1)) + : (static_cast<float>(input_height) / output_height); + + float width_scale = + (op_params.align_corners && output_width > 1) + ? (static_cast<float>(input_width - 1) / (output_width - 1)) + : (static_cast<float>(input_width) / output_width); + + ResizeBilinearGenericSmallChannel<uint8>( + batches, input_height, input_width, depth, output_height, output_width, + height_scale, width_scale, input_shape, input_data, output_shape, + output_data); } // Helper methods for BatchToSpaceND. 
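Both ResizeBilinear overloads above derive their scales the same way, and only the float overload takes the ResizeBilinear2x2 fast path, which requires an exact 2x upsample without align_corners. A minimal sketch of that scale selection, reusing the op_params.align_corners field shown in the diff (BilinearScales and ComputeBilinearScales are illustrative names, not library symbols):

// Scale selection shared by the float and uint8 ResizeBilinear overloads.
struct BilinearScales {
  float height_scale;
  float width_scale;
};

inline BilinearScales ComputeBilinearScales(bool align_corners,
                                            int input_height, int input_width,
                                            int output_height,
                                            int output_width) {
  BilinearScales s;
  s.height_scale =
      (align_corners && output_height > 1)
          ? static_cast<float>(input_height - 1) / (output_height - 1)
          : static_cast<float>(input_height) / output_height;
  s.width_scale =
      (align_corners && output_width > 1)
          ? static_cast<float>(input_width - 1) / (output_width - 1)
          : static_cast<float>(input_width) / output_width;
  return s;
}

For example, a 4x4 -> 8x8 resize with align_corners == false yields height_scale == width_scale == 0.5 and, in the float overload, dispatches to ResizeBilinear2x2; with align_corners == true both scales become 3/7 and the generic kernel runs instead.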
@@ -5633,20 +5346,29 @@ inline void GetIndexRange(int spatial_index_dim, int block_shape_dim, } template <typename T> -inline void BatchToSpaceND(const T* input_data, const Dims<4>& input_dims, - const int32* block_shape_data, - const Dims<4>& block_shape_dims, - const int32* crops_data, const Dims<4>& crops_dims, - T* output_data, const Dims<4>& output_dims) { +inline void BatchToSpaceND( + const RuntimeShape& unextended_input1_shape, const T* input1_data, + const RuntimeShape& unextended_input2_shape, const int32* block_shape_data, + const RuntimeShape& unextended_input3_shape, const int32* crops_data, + const RuntimeShape& unextended_output_shape, T* output_data) { gemmlowp::ScopedProfilingLabel label("BatchToSpaceND"); - const int output_batch_size = ArraySize(output_dims, 3); - const int output_height = ArraySize(output_dims, 2); - const int output_width = ArraySize(output_dims, 1); - const int input_batch_size = ArraySize(input_dims, 3); - const int input_height = ArraySize(input_dims, 2); - const int input_width = ArraySize(input_dims, 1); - const int depth = ArraySize(input_dims, 0); + TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 4); + TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4); + const RuntimeShape input1_shape = + RuntimeShape::ExtendedShape(4, unextended_input1_shape); + const RuntimeShape output_shape = + RuntimeShape::ExtendedShape(4, unextended_output_shape); + + const int output_width = output_shape.Dims(2); + const int output_height = output_shape.Dims(1); + const int output_batch_size = output_shape.Dims(0); + + const int depth = input1_shape.Dims(3); + const int input_width = input1_shape.Dims(2); + const int input_height = input1_shape.Dims(1); + const int input_batch_size = input1_shape.Dims(0); + const int block_shape_width = block_shape_data[1]; const int block_shape_height = block_shape_data[0]; const int crops_top = crops_data[0]; @@ -5681,8 +5403,9 @@ inline void BatchToSpaceND(const T* input_data, const Dims<4>& input_dims, spatial_offset % block_shape_width - crops_left; TFLITE_DCHECK_GE(out_w, 0); TFLITE_DCHECK_LT(out_w, output_width); - T* out = output_data + Offset(output_dims, 0, out_w, out_h, out_batch); - const T* in = input_data + Offset(input_dims, 0, in_w, in_h, in_batch); + T* out = output_data + Offset(output_shape, out_batch, out_h, out_w, 0); + const T* in = + input1_data + Offset(input1_shape, in_batch, in_h, in_w, 0); memcpy(out, in, depth * sizeof(T)); } } @@ -5705,31 +5428,56 @@ void TypedMemset(void* ptr, T value, size_t num) { } } -template <typename T> -inline void PadV2(const T* input_data, const Dims<4>& input_dims, - const std::vector<int>& left_paddings, - const std::vector<int>& right_paddings, T* output_data, - const Dims<4>& output_dims, const T pad_value) { +// There are two versions of pad: Pad and PadV2. In PadV2 there is a second +// scalar input that provides the padding value. Therefore pad_value_ptr can be +// equivalent to a simple input1_data. For Pad, it should point to a zero +// value. +// +// Note that two typenames are required, so that T=P=int32 is considered a +// specialization distinct from P=int32. 
+template <typename T, typename P> +inline void PadImpl(const tflite::PadParams& op_params, + const RuntimeShape& input_shape, const T* input_data, + const P* pad_value_ptr, const RuntimeShape& output_shape, + T* output_data) { gemmlowp::ScopedProfilingLabel label("Pad"); - TFLITE_DCHECK_EQ(left_paddings.size(), 4); - TFLITE_DCHECK_EQ(right_paddings.size(), 4); - - const int output_batch = ArraySize(output_dims, 3); - const int output_height = ArraySize(output_dims, 2); - const int output_width = ArraySize(output_dims, 1); - const int output_depth = ArraySize(output_dims, 0); - - const int left_b_padding = left_paddings[3]; - const int left_h_padding = left_paddings[2]; - const int left_w_padding = left_paddings[1]; - const int left_d_padding = left_paddings[0]; - - const int right_b_padding = right_paddings[3]; - const int right_h_padding = right_paddings[2]; - const int right_w_padding = right_paddings[1]; - const int right_d_padding = right_paddings[0]; - - const int input_depth = ArraySize(input_dims, 0); + const RuntimeShape ext_input_shape = + RuntimeShape::ExtendedShape(4, input_shape); + const RuntimeShape ext_output_shape = + RuntimeShape::ExtendedShape(4, output_shape); + TFLITE_DCHECK_LE(op_params.left_padding_count, 4); + TFLITE_DCHECK_LE(op_params.right_padding_count, 4); + + // Runtime calls are currently fixed at 4 dimensions. Copy inputs so + // we can pad them to 4 dims (yes, we are "padding the padding"). + std::vector<int> left_padding_copy(4, 0); + const int left_padding_extend = 4 - op_params.left_padding_count; + for (int i = 0; i < op_params.left_padding_count; ++i) { + left_padding_copy[left_padding_extend + i] = op_params.left_padding[i]; + } + std::vector<int> right_padding_copy(4, 0); + const int right_padding_extend = 4 - op_params.right_padding_count; + for (int i = 0; i < op_params.right_padding_count; ++i) { + right_padding_copy[right_padding_extend + i] = op_params.right_padding[i]; + } + + const int output_batch = ext_output_shape.Dims(0); + const int output_height = ext_output_shape.Dims(1); + const int output_width = ext_output_shape.Dims(2); + const int output_depth = ext_output_shape.Dims(3); + + const int left_b_padding = left_padding_copy[0]; + const int left_h_padding = left_padding_copy[1]; + const int left_w_padding = left_padding_copy[2]; + const int left_d_padding = left_padding_copy[3]; + + const int right_b_padding = right_padding_copy[0]; + const int right_h_padding = right_padding_copy[1]; + const int right_w_padding = right_padding_copy[2]; + const int right_d_padding = right_padding_copy[3]; + + const int input_depth = ext_input_shape.Dims(3); + const T pad_value = *pad_value_ptr; if (left_b_padding != 0) { TypedMemset<T>( @@ -5739,147 +5487,118 @@ inline void PadV2(const T* input_data, const Dims<4>& input_dims, for (int out_b = left_b_padding; out_b < output_batch - right_b_padding; ++out_b) { if (left_h_padding != 0) { - TypedMemset<T>(output_data + Offset(output_dims, 0, 0, 0, out_b), + TypedMemset<T>(output_data + Offset(ext_output_shape, out_b, 0, 0, 0), pad_value, left_h_padding * output_width * output_depth); } for (int out_h = left_h_padding; out_h < output_height - right_h_padding; ++out_h) { if (left_w_padding != 0) { - TypedMemset<T>(output_data + Offset(output_dims, 0, 0, out_h, out_b), - pad_value, left_w_padding * output_depth); + TypedMemset<T>( + output_data + Offset(ext_output_shape, out_b, out_h, 0, 0), + pad_value, left_w_padding * output_depth); } for (int out_w = left_w_padding; out_w < output_width - right_w_padding; 
++out_w) { if (left_d_padding != 0) { TypedMemset<T>( - output_data + Offset(output_dims, 0, out_w, out_h, out_b), + output_data + Offset(ext_output_shape, out_b, out_h, out_w, 0), pad_value, left_d_padding); } T* out = output_data + - Offset(output_dims, left_d_padding, out_w, out_h, out_b); - const T* in = - input_data + Offset(input_dims, 0, out_w - left_w_padding, - out_h - left_h_padding, out_b - left_b_padding); + Offset(ext_output_shape, out_b, out_h, out_w, left_d_padding); + const T* in = input_data + + Offset(ext_input_shape, out_b - left_b_padding, + out_h - left_h_padding, out_w - left_w_padding, 0); memcpy(out, in, input_depth * sizeof(T)); if (right_d_padding != 0) { TypedMemset<T>( - output_data + Offset(output_dims, output_depth - right_d_padding, - out_w, out_h, out_b), + output_data + Offset(ext_output_shape, out_b, out_h, out_w, + output_depth - right_d_padding), pad_value, right_d_padding); } } if (right_w_padding != 0) { - TypedMemset<T>( - output_data + Offset(output_dims, 0, output_width - right_w_padding, - out_h, out_b), - pad_value, right_w_padding * output_depth); + TypedMemset<T>(output_data + Offset(ext_output_shape, out_b, out_h, + output_width - right_w_padding, 0), + pad_value, right_w_padding * output_depth); } } if (right_h_padding != 0) { TypedMemset<T>( - output_data + - Offset(output_dims, 0, 0, output_height - right_h_padding, out_b), + output_data + Offset(ext_output_shape, out_b, + output_height - right_h_padding, 0, 0), pad_value, right_h_padding * output_width * output_depth); } } if (right_b_padding != 0) { TypedMemset<T>( output_data + - Offset(output_dims, 0, 0, 0, output_batch - right_b_padding), + Offset(ext_output_shape, output_batch - right_b_padding, 0, 0, 0), pad_value, right_b_padding * output_height * output_width * output_depth); } } -// Legacy Pad() method that casts an int32_t to T before padding. -template <typename T> -inline void Pad(const T* input_data, const Dims<4>& input_dims, - const std::vector<int>& left_paddings, - const std::vector<int>& right_paddings, T* output_data, - const Dims<4>& output_dims, const int32_t pad_value) { - const T converted_pad_value = static_cast<T>(pad_value); - PadV2<T>(input_data, input_dims, left_paddings, right_paddings, output_data, - output_dims, converted_pad_value); +template <typename T, typename P> +inline void Pad(const tflite::PadParams& op_params, + const RuntimeShape& input_shape, const T* input_data, + const P* pad_value_ptr, const RuntimeShape& output_shape, + T* output_data) { + PadImpl(op_params, input_shape, input_data, pad_value_ptr, output_shape, + output_data); } +// The second (pad-value) input can be int32 when, say, the first is uint8. template <typename T> -inline void Pad(const T* input_data, const Dims<4>& input_dims, - const std::vector<int>& left_paddings, - const std::vector<int>& right_paddings, T* output_data, - const Dims<4>& output_dims) { - Pad(input_data, input_dims, left_paddings, right_paddings, output_data, - output_dims, 0); +inline void Pad(const tflite::PadParams& op_params, + const RuntimeShape& input_shape, const T* input_data, + const int32* pad_value_ptr, const RuntimeShape& output_shape, + T* output_data) { + const T converted_pad_value = static_cast<T>(*pad_value_ptr); + PadImpl(op_params, input_shape, input_data, &converted_pad_value, + output_shape, output_data); } -// UNOPTIMIZED COPY of StridedSlice from reference_ops.h. 
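The PadImpl/Pad family above right-aligns padding specs that have fewer than four entries before running its fixed 4-D loops. The helper below is a stand-alone sketch of that front-padding step; FrontPadTo4 is an illustrative name, not part of the library.

#include <cstdint>
#include <vector>

// "Padding the padding": a spec with count < 4 entries is right-aligned into
// a 4-element array, so the missing leading dimensions (typically batch) get
// zero padding.
inline std::vector<int> FrontPadTo4(const int32_t* padding, int count) {
  std::vector<int> copy(4, 0);
  const int extend = 4 - count;
  for (int i = 0; i < count; ++i) {
    copy[extend + i] = padding[i];
  }
  return copy;
}

For example, left_padding = {1, 2, 0} with left_padding_count == 3 becomes {0, 1, 2, 0}: no batch padding, one leading row, two leading columns, and no leading channels.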
-template <typename T> -inline void StridedSlice(const T* input_data, const Dims<4>& input_dims, - int begin_mask, int end_mask, - const std::vector<int>& start_indices, - const std::vector<int>& stop_indices, - const std::vector<int>& strides, T* output_data, - const Dims<4>& output_dims) { - TFLITE_DCHECK_EQ(start_indices.size(), 4); - TFLITE_DCHECK_EQ(stop_indices.size(), 4); - TFLITE_DCHECK_EQ(strides.size(), 4); - const int start_b = strided_slice::StartForAxis(begin_mask, start_indices, - strides, input_dims.sizes, 3); - const int stop_b = strided_slice::StopForAxis(end_mask, stop_indices, strides, - input_dims.sizes, 3); - const int start_h = strided_slice::StartForAxis(begin_mask, start_indices, - strides, input_dims.sizes, 2); - const int stop_h = strided_slice::StopForAxis(end_mask, stop_indices, strides, - input_dims.sizes, 2); - const int start_w = strided_slice::StartForAxis(begin_mask, start_indices, - strides, input_dims.sizes, 1); - const int stop_w = strided_slice::StopForAxis(end_mask, stop_indices, strides, - input_dims.sizes, 1); - const int start_d = strided_slice::StartForAxis(begin_mask, start_indices, - strides, input_dims.sizes, 0); - const int stop_d = strided_slice::StopForAxis(end_mask, stop_indices, strides, - input_dims.sizes, 0); - - T* out_ptr = output_data; - for (int in_b = start_b; - !strided_slice::LoopCondition(in_b, stop_b, strides[3]); - in_b += strides[3]) { - for (int in_h = start_h; - !strided_slice::LoopCondition(in_h, stop_h, strides[2]); - in_h += strides[2]) { - for (int in_w = start_w; - !strided_slice::LoopCondition(in_w, stop_w, strides[1]); - in_w += strides[1]) { - for (int in_d = start_d; - !strided_slice::LoopCondition(in_d, stop_d, strides[0]); - in_d += strides[0]) { - *out_ptr++ = input_data[Offset(input_dims, in_d, in_w, in_h, in_b)]; - } - } - } - } +// This version avoids conflicting template matching. +template <> +inline void Pad(const tflite::PadParams& op_params, + const RuntimeShape& input_shape, const int32* input_data, + const int32* pad_value_ptr, const RuntimeShape& output_shape, + int32* output_data) { + PadImpl(op_params, input_shape, input_data, pad_value_ptr, output_shape, + output_data); } template <typename T> -inline void Slice(const T* input_data, const Dims<4>& input_dims, - const std::vector<int>& begin, const std::vector<int>& size, - T* output_data, const Dims<4>& output_dims) { - // TODO(dkalenichenko): This op only supports 4D tensors. - TFLITE_DCHECK_EQ(begin.size(), 4); - TFLITE_DCHECK_EQ(size.size(), 4); - const int start_b = begin[3]; - const int stop_b = - size[3] == -1 ? input_dims.sizes[3] - start_b : start_b + size[3]; - const int start_h = begin[2]; - const int stop_h = - size[2] == -1 ? input_dims.sizes[2] - start_h : start_h + size[2]; - const int start_w = begin[1]; - const int stop_w = - size[1] == -1 ? input_dims.sizes[1] - start_w : start_w + size[1]; - const int start_d = begin[0]; - const int stop_d = - size[0] == -1 ? input_dims.sizes[0] - start_d : start_d + size[0]; +inline void Slice(const tflite::SliceParams& op_params, + const RuntimeShape& input_shape, const T* input_data, + const RuntimeShape& output_shape, T* output_data) { + gemmlowp::ScopedProfilingLabel label("Slice"); + const RuntimeShape ext_shape = RuntimeShape::ExtendedShape(4, input_shape); + // TODO(dkalenichenko): This op only supports 4D tensors or smaller. 
+ TFLITE_DCHECK_LE(op_params.begin_count, 4); + TFLITE_DCHECK_LE(op_params.size_count, 4); + const int begin_count = op_params.begin_count; + const int size_count = op_params.size_count; + // We front-pad the begin and size vectors. + const int start_b = 4 - begin_count > 0 ? 0 : op_params.begin[0]; + const int stop_b = (4 - size_count > 0 || op_params.size[0] == -1) + ? ext_shape.Dims(0) - start_b + : start_b + op_params.size[0]; + const int start_h = begin_count < 3 ? 0 : op_params.begin[begin_count - 3]; + const int stop_h = (size_count < 3 || op_params.size[size_count - 3] == -1) + ? ext_shape.Dims(1) - start_h + : start_h + op_params.size[size_count - 3]; + const int start_w = begin_count < 2 ? 0 : op_params.begin[begin_count - 2]; + const int stop_w = (size_count < 2 || op_params.size[size_count - 2] == -1) + ? ext_shape.Dims(2) - start_w + : start_w + op_params.size[size_count - 2]; + const int start_d = begin_count < 1 ? 0 : op_params.begin[begin_count - 1]; + const int stop_d = (size_count < 1 || op_params.size[size_count - 1] == -1) + ? ext_shape.Dims(3) - start_d + : start_d + op_params.size[size_count - 1]; T* out_ptr = output_data; for (int in_b = start_b; in_b < stop_b; ++in_b) { @@ -5887,7 +5606,7 @@ inline void Slice(const T* input_data, const Dims<4>& input_dims, for (int in_w = start_w; in_w < stop_w; ++in_w) { const int len = stop_d - start_d; memcpy(out_ptr, - input_data + Offset(input_dims, start_d, in_w, in_h, in_b), + input_data + Offset(ext_shape, in_b, in_h, in_w, start_d), len * sizeof(T)); out_ptr += len; } @@ -5896,243 +5615,112 @@ inline void Slice(const T* input_data, const Dims<4>& input_dims, } template <typename T> -inline void Mean(const T* input_data, const Dims<4>& input_dims, - const std::vector<int>& reduction_indices, T* output_data, - const Dims<4>& output_dims) { - gemmlowp::ScopedProfilingLabel label("Mean"); - const int output_batch = ArraySize(output_dims, 3); - const int output_height = ArraySize(output_dims, 2); - const int output_width = ArraySize(output_dims, 1); - const int output_depth = ArraySize(output_dims, 0); - - const int input_height = ArraySize(input_dims, 2); - const int input_width = ArraySize(input_dims, 1); - - // The current implementation only supports simultaneous reduction over - // width and height. 
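The rewritten Slice above applies the same right-alignment idea to op_params.begin and op_params.size over a shape extended to four dimensions, with -1 in size meaning "take the remaining extent". A worked trace with hypothetical values (not taken from a real model):

// ext_shape = {3, 10, 10, 4}, begin_count = size_count = 2
// op_params.begin = {0, 1}, op_params.size = {-1, 2}
//   batch : start 0, stop 3    (no spec for this axis -> full extent)
//   height: start 0, stop 10   (no spec for this axis -> full extent)
//   width : start 0, stop 10   (size -1 -> remaining extent from start 0)
//   depth : start 1, stop 3    (start_d + size == 1 + 2)
// The innermost loop then memcpy's stop_d - start_d == 2 channels for every
// (batch, height, width) position it visits.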
- TFLITE_DCHECK_EQ(reduction_indices.size(), 2); - TFLITE_DCHECK((reduction_indices[0] == 1 && reduction_indices[1] == 2) || - (reduction_indices[0] == 2 && reduction_indices[1] == 1)); - TFLITE_DCHECK_EQ(output_height, 1); - TFLITE_DCHECK_EQ(output_width, 1); - - for (int out_b = 0; out_b < output_batch; ++out_b) { - for (int out_d = 0; out_d < output_depth; ++out_d) { - float value = 0; - for (int in_h = 0; in_h < input_height; ++in_h) { - for (int in_w = 0; in_w < input_width; ++in_w) { - value += input_data[Offset(input_dims, out_d, in_w, in_h, out_b)]; - } - } - output_data[Offset(output_dims, out_d, 0, 0, out_b)] = - value / (input_width * input_height); - } - } -} - -template <typename T> -void GenericBroadcastSub(const T* input1_data, const Dims<4>& input1_dims, - const T* input2_data, const Dims<4>& input2_dims, - T* output_data, const Dims<4>& output_dims) { - gemmlowp::ScopedProfilingLabel label("GenericBroadcastSub"); - - NdArrayDesc<4> desc1; - NdArrayDesc<4> desc2; - NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2); - - // In Tensorflow, the dimensions are canonically named (batch_number, row, - // col, channel), with extents (batches, height, width, depth), with the - // trailing dimension changing most rapidly (channels has the smallest stride, - // typically 1 element). - // - // In generated C code, we store arrays with the dimensions reversed. The - // first dimension has smallest stride. - // - // We name our variables by their Tensorflow convention, but generate C code - // nesting loops such that the innermost loop has the smallest stride for the - // best cache behavior. - for (int b = 0; b < ArraySize(output_dims, 3); ++b) { - for (int y = 0; y < ArraySize(output_dims, 2); ++y) { - for (int x = 0; x < ArraySize(output_dims, 1); ++x) { - for (int c = 0; c < ArraySize(output_dims, 0); ++c) { - output_data[Offset(output_dims, c, x, y, b)] = - input1_data[SubscriptToIndex(desc1, c, x, y, b)] - - input2_data[SubscriptToIndex(desc2, c, x, y, b)]; - } - } - } - } -} - -template <typename T> -void Sub(const T* input1_data, const Dims<4>& input1_dims, const T* input2_data, - const Dims<4>& input2_dims, T* output_data, - const Dims<4>& output_dims) { - gemmlowp::ScopedProfilingLabel label("Sub"); - - auto input1_map = MapAsVector(input1_data, input1_dims); - auto input2_map = MapAsVector(input2_data, input2_dims); - auto output_map = MapAsVector(output_data, output_dims); - if (AreSameDims(input1_dims, input2_dims)) { - output_map.array() = input1_map.array() - input2_map.array(); - } else if (FlatSize(input1_dims) == 1) { - auto scalar = input1_data[0]; - output_map.array() = scalar - input2_map.array(); - } else if (FlatSize(input2_dims) == 1) { - auto scalar = input2_data[0]; - output_map.array() = input1_map.array() - scalar; - } else { - GenericBroadcastSub(input1_data, input1_dims, input2_data, input2_dims, - output_data, output_dims); - } -} - -template <typename T> -void TensorFlowMinimum(const T* input1_data, const Dims<4>& input1_dims, - const T* input2_data, T* output_data, - const Dims<4>& output_dims) { +void Minimum(const RuntimeShape& input1_shape, const T* input1_data, + const T* input2_data, const RuntimeShape& output_shape, + T* output_data) { gemmlowp::ScopedProfilingLabel label("TensorFlowMinimum"); - auto input1_map = MapAsVector(input1_data, input1_dims); - auto output_map = MapAsVector(output_data, output_dims); + auto input1_map = MapAsVector(input1_data, input1_shape); + auto output_map = MapAsVector(output_data, 
output_shape); auto min_value = input2_data[0]; output_map.array() = input1_map.array().min(min_value); } +// Convenience version that allows, for example, generated-code calls to be +// the same as other binary ops. template <typename T> -void TensorFlowMaximum(const T* input1_data, const Dims<4>& input1_dims, - const T* input2_data, T* output_data, - const Dims<4>& output_dims) { +inline void Minimum(const RuntimeShape& input1_shape, const T* input1_data, + const RuntimeShape&, const T* input2_data, + const RuntimeShape& output_shape, T* output_data) { + // Drop shape of second input: not needed. + Minimum(input1_shape, input1_data, input2_data, output_shape, output_data); +} + +template <typename T> +void Maximum(const RuntimeShape& input1_shape, const T* input1_data, + const T* input2_data, const RuntimeShape& output_shape, + T* output_data) { gemmlowp::ScopedProfilingLabel label("TensorFlowMaximum"); - auto input1_map = MapAsVector(input1_data, input1_dims); - auto output_map = MapAsVector(output_data, output_dims); + auto input1_map = MapAsVector(input1_data, input1_shape); + auto output_map = MapAsVector(output_data, output_shape); auto max_value = input2_data[0]; output_map.array() = input1_map.array().max(max_value); } -template <typename T1, typename T2, typename T3> -void ArgMax(const T3* axis, const T1* input_data, const Dims<4>& input_dims, - T2* output_data, const Dims<4>& output_dims) { - gemmlowp::ScopedProfilingLabel label("ArgMax"); - - // The current ArgMax implemention can only determine the index of the maximum - // value in the last dimension. So the axis argument is ignored. - - // For ArgMax, the number of output dimensions = (number of input dimensions - - // 1). For the sake of simplicity, the output dimensions are equal to the - // input dimensions here. We enforce the constraint that the last dimension - // must always be 1. - TFLITE_DCHECK_EQ(ArraySize(output_dims, 0), 1); - const int outer_size = MatchingFlatSizeSkipDim(input_dims, 0, output_dims); - const int depth = ArraySize(input_dims, 0); - for (int i = 0; i < outer_size; ++i) { - auto max_value = *input_data; - ++input_data; - int max_index = 0; - for (int d = 1; d < depth; ++d) { - const auto& curr_value = *input_data; - if (curr_value > max_value) { - max_value = curr_value; - max_index = d; - } - ++input_data; - } - *output_data = max_index; - ++output_data; - } -} - +// Convenience version that allows, for example, generated-code calls to be +// the same as other binary ops. template <typename T> -void Transpose(const T* input, const Dims<4>& input_dims, T* output, - const Dims<4>& output_dims, const int* permuted_axes) { - int out_sizes[4]; - // Compute the inverse permutation array so we can do an output centered - // transpose. Also, check to make sure output_dims is matching input_dims. - for (int k = 0; k < 4; k++) { - out_sizes[k] = - MatchingArraySize(input_dims, permuted_axes[k], output_dims, k); - } - - // Naive transpose loop (iterate on output index and compute input index). - int o[4]; // loop index (on output). 
- int i[4]; - for (o[3] = 0; o[3] < out_sizes[3]; o[3]++) { - i[permuted_axes[3]] = o[3]; - for (o[2] = 0; o[2] < out_sizes[2]; o[2]++) { - i[permuted_axes[2]] = o[2]; - for (o[1] = 0; o[1] < out_sizes[1]; o[1]++) { - i[permuted_axes[1]] = o[1]; - for (o[0] = 0; o[0] < out_sizes[0]; o[0]++) { - i[permuted_axes[0]] = o[0]; - output[Offset(output_dims, o)] = input[Offset(input_dims, i)]; - } - } - } - } +inline void Maximum(const RuntimeShape& input1_shape, const T* input1_data, + const RuntimeShape&, const T* input2_data, + const RuntimeShape& output_shape, T* output_data) { + // Drop shape of second input: not needed. + Maximum(input1_shape, input1_data, input2_data, output_shape, output_data); } -inline void TransposeConv(const float* input_data, const Dims<4>& input_dims, - const float* filter_data, const Dims<4>& filter_dims, - int stride_width, int stride_height, int pad_width, - int pad_height, float* output_data, - const Dims<4>& output_dims) { - gemmlowp::ScopedProfilingLabel label("TransposeConv"); - // THIS FUNCTION IS A COPY FROM reference_ops.h. - // To optimize, start by using the conv code with transposed weights for the - // case of stride_height = stride_width = 1. - const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); - const int input_depth = MatchingArraySize(input_dims, 0, filter_dims, 3); - const int output_depth = MatchingArraySize(filter_dims, 0, output_dims, 0); - const int input_height = ArraySize(input_dims, 2); - const int input_width = ArraySize(input_dims, 1); - const int filter_height = ArraySize(filter_dims, 2); - const int filter_width = ArraySize(filter_dims, 1); - const int output_height = ArraySize(output_dims, 2); - const int output_width = ArraySize(output_dims, 1); - - // Although transpose convolution simplifies to convolution with transposed - // weights for strides of 1, non-unitary striding complicates matters. To - // keep this reference implementation as clear as possible, we use a "scatter" - // access pattern, where we loop through all the input elements, computing - // their influence on the output, rather than looping through the output - // elements in the typical "gather" access pattern of a conv. We therefore - // must initialize the output array to zero. - for (int batch = 0; batch < batches; ++batch) { - for (int out_y = 0; out_y < output_height; ++out_y) { - for (int out_x = 0; out_x < output_width; ++out_x) { - for (int out_channel = 0; out_channel < output_depth; ++out_channel) { - output_data[Offset(output_dims, out_channel, out_x, out_y, batch)] = - 0.0f; - } - } - } - } - - // Loop through input elements one at a time. 
+template <typename T> +void TransposeIm2col(const ConvParams& params, uint8 zero_byte, + const RuntimeShape& input_shape, const T* input_data, + const RuntimeShape& filter_shape, + const RuntimeShape& output_shape, T* im2col_data) { + gemmlowp::ScopedProfilingLabel label("TransposeIm2col"); + const int stride_width = params.stride_width; + const int stride_height = params.stride_height; + const int pad_width = params.padding_values.width; + const int pad_height = params.padding_values.height; + TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4); + TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4); + TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4); + TFLITE_DCHECK(im2col_data); + + const int batches = MatchingDim(input_shape, 0, output_shape, 0); + const int input_height = input_shape.Dims(1); + const int input_width = input_shape.Dims(2); + const int input_depth = MatchingDim(input_shape, 3, filter_shape, 0); + const int filter_height = filter_shape.Dims(1); + const int filter_width = filter_shape.Dims(2); + const int output_height = output_shape.Dims(1); + const int output_width = output_shape.Dims(2); + MatchingDim(output_shape, 3, filter_shape, 3); // output_depth + + // Construct the MxN sized im2col matrix. + // The rows M, are sub-ordered B x H x W + const RuntimeShape row_shape({1, batches, output_height, output_width}); + // The columns, N, are sub-ordered Kh x Kw x Din + const RuntimeShape col_shape({1, filter_height, filter_width, input_depth}); + // Use dimensions M and N to construct dims for indexing directly into im2col + const RuntimeShape im2col_shape( + {1, 1, row_shape.FlatSize(), col_shape.FlatSize()}); + + // Build the im2col matrix by looping through all the input pixels, + // computing their influence on the output, rather than looping through all + // the output pixels. We therefore must initialize the im2col array to zero. + // This is potentially inefficient because we subsequently overwrite bytes + // set here. However, in practice memset is very fast and costs negligible. + memset(im2col_data, zero_byte, im2col_shape.FlatSize() * sizeof(T)); + + // Loop through the output batches for (int batch = 0; batch < batches; ++batch) { + // Loop through input pixels one at a time. for (int in_y = 0; in_y < input_height; ++in_y) { for (int in_x = 0; in_x < input_width; ++in_x) { - for (int in_channel = 0; in_channel < input_depth; ++in_channel) { - // Loop through the output elements it will influence - const int out_x_origin = (in_x * stride_width) - pad_width; - const int out_y_origin = (in_y * stride_height) - pad_height; - for (int filter_y = 0; filter_y < filter_height; ++filter_y) { + // Loop through the output pixels it will influence + const int out_x_origin = (in_x * stride_width) - pad_width; + const int out_y_origin = (in_y * stride_height) - pad_height; + for (int filter_y = 0; filter_y < filter_height; ++filter_y) { + const int out_y = out_y_origin + filter_y; + // Is output pixel within height bounds? 
+ if ((out_y >= 0) && (out_y < output_height)) { for (int filter_x = 0; filter_x < filter_width; ++filter_x) { - for (int out_channel = 0; out_channel < output_depth; - ++out_channel) { - // Compute output element location - const int out_x = out_x_origin + filter_x; - const int out_y = out_y_origin + filter_y; - // We cannot accumulate out of bounds - if ((out_x >= 0) && (out_x < output_width) && (out_y >= 0) && - (out_y < output_height)) { - float input_value = input_data[Offset(input_dims, in_channel, - in_x, in_y, batch)]; - float filter_value = - filter_data[Offset(filter_dims, out_channel, filter_x, - filter_y, in_channel)]; - output_data[Offset(output_dims, out_channel, out_x, out_y, - batch)] += input_value * filter_value; - } + const int out_x = out_x_origin + filter_x; + // Is output pixel within width bounds? + if ((out_x >= 0) && (out_x < output_width)) { + // Copy the input elements of this pixel + T const* src = + input_data + Offset(input_shape, batch, in_y, in_x, 0); + int row_offset = Offset(row_shape, 0, batch, out_y, out_x); + int col_offset = Offset(col_shape, 0, filter_y, filter_x, 0); + T* dst = im2col_data + + Offset(im2col_shape, 0, 0, row_offset, col_offset); + memcpy(dst, src, input_depth * sizeof(T)); } } } @@ -6142,6 +5730,29 @@ inline void TransposeConv(const float* input_data, const Dims<4>& input_dims, } } +inline void TransposeConv( + const ConvParams& params, const RuntimeShape& input_shape, + const float* input_data, const RuntimeShape& filter_shape, + const float* filter_data, const RuntimeShape& output_shape, + float* output_data, const RuntimeShape& im2col_shape, float* im2col_data) { + gemmlowp::ScopedProfilingLabel label("TransposeConv"); + + // Note we could use transposed weights with forward conv for unstrided + // cases. But we are already getting good performance with this code as-is. + TFLITE_DCHECK(im2col_data); + TransposeIm2col(params, 0, input_shape, input_data, filter_shape, + output_shape, im2col_data); + + const auto im2col_matrix_map = + MapAsMatrixWithLastDimAsRows(im2col_data, im2col_shape); + const auto filter_matrix_map = + MapAsMatrixWithFirstDimAsCols(filter_data, filter_shape); + auto output_matrix_map = + MapAsMatrixWithLastDimAsRows(output_data, output_shape); + + Gemm(filter_matrix_map.transpose(), im2col_matrix_map, &output_matrix_map); +} + } // namespace optimized_ops } // namespace tflite @@ -6150,4 +5761,4 @@ inline void TransposeConv(const float* input_data, const Dims<4>& input_dims, #pragma GCC diagnostic pop #endif -#endif // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_OPS_H_ +#endif // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_OPTIMIZED_OPS_H_ diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/tensor_utils_impl.h b/tensorflow/contrib/lite/kernels/internal/optimized/tensor_utils_impl.h index d570dadd86..f87760a6c3 100644 --- a/tensorflow/contrib/lite/kernels/internal/optimized/tensor_utils_impl.h +++ b/tensorflow/contrib/lite/kernels/internal/optimized/tensor_utils_impl.h @@ -17,7 +17,11 @@ limitations under the License. // TODO(ghodrat): Remove this header file and the dependency to internal data // structure. 
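The new TransposeConv above replaces the old scatter-style loop with an im2col build followed by a single GEMM. Assuming the usual Eigen map conventions in this header (MapAsMatrixWithLastDimAsRows maps the last dimension to matrix rows, MapAsMatrixWithFirstDimAsCols maps the first dimension to columns), the shapes involved are:

//   im2col  : (Kh*Kw*Din) x (B*Hout*Wout)   columns sub-ordered B, Hout, Wout
//   filter^T: Dout        x (Kh*Kw*Din)
//   output  : Dout        x (B*Hout*Wout)
// Gemm(filter^T, im2col, &output) therefore produces every output pixel in
// one matrix multiply, and TransposeIm2col only has to scatter each input
// pixel into the (Kh*Kw*Din)-wide columns it influences, leaving the rest of
// the zero-initialized im2col buffer untouched.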
-#include "tensorflow/contrib/lite/builtin_op_data.h" +#include "tensorflow/contrib/lite/c/builtin_op_data.h" + +#if defined(_MSC_VER) +#define __restrict__ __restrict +#endif #ifndef USE_NEON #if defined(__ARM_NEON__) || defined(__ARM_NEON) @@ -82,6 +86,14 @@ void NeonBatchVectorBatchVectorDotProduct(const float* vector1, int n_batch, float* result, int result_stride); +// Cwise product of a vector and a batch-vector. +void PortableVectorBatchVectorCwiseProduct(const float* vector, int v_size, + const float* batch_vector, + int n_batch, float* result); +void NeonVectorBatchVectorCwiseProduct(const float* vector, int v_size, + const float* batch_vector, int n_batch, + float* result); + // Cwise product and accumulate of a vector and a batch-vector. Since it's a MAC // operation, the assumption here is that result array is initialized to valid // values. @@ -105,6 +117,10 @@ void PortableClipVector(const float* vector, int v_size, float abs_limit, void NeonClipVector(const float* vector, int v_size, float abs_limit, float* result); +// Add another vector for each batch in the batch vector. +void PortableVectorBatchVectorAdd(const float* vector, int v_size, int n_batch, + float* batch_vector); + // Batch vector initialization with another vector. void PortableVectorBatchVectorAssign(const float* vector, int v_size, int n_batch, float* batch_vector); @@ -124,9 +140,19 @@ void PortableCopyVector(const float* vector, int v_size, float* result); // Fill vector with 0.f. void PortableZeroVector(float* vector, int v_size); +// Multiply all elements of vector with a scalar. +void PortableVectorScalarMultiply(const int8_t* vector, int v_size, float scale, + float* result); +void NeonVectorScalarMultiply(const int8_t* vector, int v_size, float scale, + float* result); + // Limit a float input f between +abs_limit and -abs_limit. float PortableClip(float f, float abs_limit); +// Check if all entries of a vector are zero. +bool PortableIsZeroVector(const float* vector, int v_size); +bool NeonIsZeroVector(const float* vector, int v_size); + // Symmetric quantizer. void PortableSymmetricQuantizeFloats(const float* values, const int size, int8_t* quantized_values, float* min, @@ -150,6 +176,10 @@ void PortableReductionSumVector(const float* input_vector, float* output_vector, void NeonReductionSumVector(const float* input_vector, float* output_vector, int output_size, int reduction_size); +void PortableMeanStddevNormalization(const float* input_vector, + float* output_vector, int v_size, + int n_batch, float normalization_epsilon); + } // namespace tensor_utils } // namespace tflite diff --git a/tensorflow/contrib/lite/kernels/internal/quantization_util.cc b/tensorflow/contrib/lite/kernels/internal/quantization_util.cc index b0951aac8c..544ef16ce1 100644 --- a/tensorflow/contrib/lite/kernels/internal/quantization_util.cc +++ b/tensorflow/contrib/lite/kernels/internal/quantization_util.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ + #include <algorithm> #include <cmath> #include <limits> @@ -22,6 +23,32 @@ limitations under the License. namespace tflite { +namespace { +// These constants are used to manipulate the binary representation of doubles. 
+// Double-precision binary64 floating point format is: +// Bit | 63 | 62-52 | 51-0 | +// | Sign | Exponent | Fraction | +// To avoid 64-bit integers as much as possible, I break this into high and +// low 32-bit chunks. High is: +// Bit | 31 | 30-20 | 19-0 | +// | Sign | Exponent | High Fraction | +// Low is: +// Bit | 31-0 | +// | Low Fraction | +// We then access the components through logical bit-wise operations to +// extract the parts needed, with the positions and masks derived from the +// layout shown above. +constexpr uint64_t kSignMask = 0x8000000000000000LL; +constexpr uint64_t kExponentMask = 0x7ff0000000000000LL; +constexpr int32_t kExponentShift = 52; +constexpr int32_t kExponentBias = 1023; +constexpr uint32_t kExponentIsBadNum = 0x7ff; +constexpr uint64_t kFractionMask = 0x000fffffffc00000LL; +constexpr uint32_t kFractionShift = 22; +constexpr uint32_t kFractionRoundingMask = 0x003fffff; +constexpr uint32_t kFractionRoundingThreshold = 0x00200000; +} // namespace + void QuantizeMultiplier(double double_multiplier, int32_t* quantized_multiplier, int* shift) { if (double_multiplier == 0.) { @@ -29,8 +56,16 @@ void QuantizeMultiplier(double double_multiplier, int32_t* quantized_multiplier, *shift = 0; return; } +#ifdef TFLITE_EMULATE_FLOAT + // If we're trying to avoid the use of floating-point instructions (for + // example on microcontrollers) then use an alternative implementation + // that only requires integer and bitwise operations. To enable this, you + // need to set the define during the build process for your platform. + int64_t q_fixed = IntegerFrExp(double_multiplier, shift); +#else // TFLITE_EMULATE_FLOAT const double q = std::frexp(double_multiplier, shift); auto q_fixed = static_cast<int64_t>(TfLiteRound(q * (1ll << 31))); +#endif // TFLITE_EMULATE_FLOAT TFLITE_CHECK(q_fixed <= (1ll << 31)); if (q_fixed == (1ll << 31)) { q_fixed /= 2; @@ -48,15 +83,172 @@ void QuantizeMultiplierGreaterThanOne(double double_multiplier, TFLITE_CHECK_GE(*left_shift, 0); } -void QuantizeMultiplierSmallerThanOne(double double_multiplier, - int32_t* quantized_multiplier, - int* right_shift) { +void QuantizeMultiplierSmallerThanOneExp(double double_multiplier, + int32_t* quantized_multiplier, + int* left_shift) { TFLITE_CHECK_LT(double_multiplier, 1.); TFLITE_CHECK_GT(double_multiplier, 0.); int shift; QuantizeMultiplier(double_multiplier, quantized_multiplier, &shift); TFLITE_CHECK_LE(shift, 0); - *right_shift = -shift; + *left_shift = shift; +} + +int64_t IntegerFrExp(double input, int* shift) { + // Make sure our assumptions about the double layout hold. + TFLITE_CHECK_EQ(8, sizeof(double)); + + // We want to access the bits of the input double value directly, which is + // tricky to do safely, so use a union to handle the casting. + union { + double double_value; + uint64_t double_as_uint; + } cast_union; + cast_union.double_value = input; + const uint64_t u = cast_union.double_as_uint; + + // If the bitfield is all zeros apart from the sign bit, this is a normalized + // zero value, so return standard values for this special case. + if ((u & ~kSignMask) == 0) { + *shift = 0; + return 0; + } + + // Deal with NaNs and Infs, which are always indicated with a fixed pattern in + // the exponent, and distinguished by whether the fractions are zero or + // non-zero. 
+ const uint32_t exponent_part = ((u & kExponentMask) >> kExponentShift); + if (exponent_part == kExponentIsBadNum) { + *shift = std::numeric_limits<int>::max(); + if (u & kFractionMask) { + // NaN, so just return zero (with the exponent set to INT_MAX). + return 0; + } else { + // Infinity, so return +/- INT_MAX. + if (u & kSignMask) { + return std::numeric_limits<int64_t>::min(); + } else { + return std::numeric_limits<int64_t>::max(); + } + } + } + + // The shift is fairly easy to extract from the high bits of the double value, + // just by masking it out and applying a bias. The std::frexp() implementation + // always returns values between 0.5 and 1.0 though, whereas the exponent + // assumes 1.0 to 2.0 is the standard range, so I add on one to match that + // interface. + *shift = (exponent_part - kExponentBias) + 1; + + // There's an implicit high bit in the double format definition, so make sure + // we include that at the top, and then reconstruct the rest of the fractional + // value from the remaining fragments. + int64_t fraction = 0x40000000 + ((u & kFractionMask) >> kFractionShift); + + // We're cutting off some bits at the bottom, so to exactly match the standard + // frexp implementation here we'll apply rounding by adding one to the least + // significant bit of the result if the discarded portion is over half of the + // maximum. + if ((u & kFractionRoundingMask) > kFractionRoundingThreshold) { + fraction += 1; + } + // Negate the fraction if the sign bit was set. + if (u & kSignMask) { + fraction *= -1; + } + + return fraction; +} + +double DoubleFromFractionAndShift(int64_t fraction, int shift) { + union { + double double_value; + uint64_t double_as_uint; + } result; + + // Detect NaNs and infinities. + if (shift == std::numeric_limits<int>::max()) { + if (fraction == 0) { + return NAN; + } else if (fraction > 0) { + return INFINITY; + } else { + return -INFINITY; + } + } + + // Return a normalized zero for a zero fraction. + if (fraction == 0) { + result.double_as_uint = 0; + return result.double_value; + } + + bool is_negative = (fraction < 0); + int64_t encoded_fraction = is_negative ? -fraction : fraction; + int64_t encoded_shift = (shift - 1); + while (encoded_fraction < 0x40000000) { + encoded_fraction *= 2; + encoded_shift -= 1; + } + while (encoded_fraction > 0x80000000) { + encoded_fraction /= 2; + encoded_shift += 1; + } + encoded_fraction -= 0x40000000; + if (encoded_shift < -1022) { + encoded_shift = -1023; + } else if (encoded_shift > 1022) { + encoded_shift = 1023; + } + encoded_shift += kExponentBias; + uint64_t encoded_sign = is_negative ? kSignMask : 0; + result.double_as_uint = encoded_sign | (encoded_shift << kExponentShift) | + (encoded_fraction << kFractionShift); + return result.double_value; +} + +double IntegerDoubleMultiply(double a, double b) { + int a_shift; + const int64_t a_fraction = IntegerFrExp(a, &a_shift); + int b_shift; + const int64_t b_fraction = IntegerFrExp(b, &b_shift); + // Detect NaNs and infinities. + if (a_shift == std::numeric_limits<int>::max() || + (b_shift == std::numeric_limits<int>::max())) { + return NAN; + } + const int result_shift = a_shift + b_shift + 1; + const int64_t result_fraction = (a_fraction * b_fraction) >> 32; + return DoubleFromFractionAndShift(result_fraction, result_shift); +} + +int IntegerDoubleCompare(double a, double b) { + int a_shift; + const int64_t a_fraction = IntegerFrExp(a, &a_shift); + int b_shift; + const int64_t b_fraction = IntegerFrExp(b, &b_shift); + + // Detect NaNs and infinities. 
+ if (a_shift == std::numeric_limits<int>::max() || + (b_shift == std::numeric_limits<int>::max())) { + return 1; + } + + if ((a_fraction == 0) && (b_fraction < 0)) { + return 1; + } else if ((a_fraction < 0) && (b_fraction == 0)) { + return -1; + } else if (a_shift < b_shift) { + return -1; + } else if (a_shift > b_shift) { + return 1; + } else if (a_fraction < b_fraction) { + return -1; + } else if (a_fraction > b_fraction) { + return 1; + } else { + return 0; + } } void PreprocessSoftmaxScaling(double beta, double input_scale, @@ -71,30 +263,49 @@ void PreprocessSoftmaxScaling(double beta, double input_scale, // result is double equivalent of Q0.31 (actually with more precision). Thus // this generates a Q(input_integer_bits).(31-input_integer_bits) // representation. +#ifdef TFLITE_EMULATE_FLOAT + const double input_beta = IntegerDoubleMultiply(beta, input_scale); + int shift; + int64_t fraction = IntegerFrExp(input_beta, &shift); + shift += (31 - input_integer_bits); + double input_beta_real_multiplier = + DoubleFromFractionAndShift(fraction, shift); + if (IntegerDoubleCompare(input_beta_real_multiplier, (1ll << 31) - 1.0) > 0) { + input_beta_real_multiplier = (1ll << 31) - 1.0; + } +#else // TFLITE_EMULATE_FLOAT const double input_beta_real_multiplier = std::min( beta * input_scale * (1 << (31 - input_integer_bits)), (1ll << 31) - 1.0); +#endif // TFLITE_EMULATE_FLOAT QuantizeMultiplierGreaterThanOne(input_beta_real_multiplier, quantized_multiplier, left_shift); } -void PreprocessLogSoftmaxScaling(double beta, double input_scale, - int input_integer_bits, - int32_t* quantized_multiplier, int* left_shift, - int32_t* reverse_scaling_divisor, - int* reverse_scaling_right_shift) { +void PreprocessLogSoftmaxScalingExp(double beta, double input_scale, + int input_integer_bits, + int32_t* quantized_multiplier, + int* left_shift, + int32_t* reverse_scaling_divisor, + int* reverse_scaling_left_shift) { PreprocessSoftmaxScaling(beta, input_scale, input_integer_bits, quantized_multiplier, left_shift); // Also calculate what amounts to the inverse scaling factor for the input. const double real_reverse_scaling_divisor = (1 << (31 - *left_shift)) / static_cast<double>(*quantized_multiplier); - tflite::QuantizeMultiplierSmallerThanOne(real_reverse_scaling_divisor, - reverse_scaling_divisor, - reverse_scaling_right_shift); + tflite::QuantizeMultiplierSmallerThanOneExp(real_reverse_scaling_divisor, + reverse_scaling_divisor, + reverse_scaling_left_shift); } int CalculateInputRadius(int input_integer_bits, int input_left_shift) { +#ifdef TFLITE_EMULATE_FLOAT + int64_t result = (1 << input_integer_bits) - 1; + result <<= (31 - input_integer_bits); + result >>= input_left_shift; + return result; +#else // TFLITE_EMULATE_FLOAT const double max_input_rescaled = 1.0 * ((1 << input_integer_bits) - 1) * (1ll << (31 - input_integer_bits)) / (1ll << input_left_shift); @@ -102,17 +313,18 @@ int CalculateInputRadius(int input_integer_bits, int input_left_shift) { // After scaling the difference, the result would be at the maximum. Thus we // must ensure that our value has lower magnitude. return static_cast<int>(std::floor(max_input_rescaled)); +#endif // TFLITE_EMULATE_FLOAT } void NudgeQuantizationRange(const float min, const float max, const int quant_min, const int quant_max, float* nudged_min, float* nudged_max, - float* scale) { + float* nudged_scale) { // This code originates from tensorflow/core/kernels/fake_quant_ops_functor.h. 
const float quant_min_float = static_cast<float>(quant_min); const float quant_max_float = static_cast<float>(quant_max); - *scale = (max - min) / (quant_max_float - quant_min_float); - const float zero_point_from_min = quant_min_float - min / *scale; + *nudged_scale = (max - min) / (quant_max_float - quant_min_float); + const float zero_point_from_min = quant_min_float - min / *nudged_scale; uint16 nudged_zero_point; if (zero_point_from_min < quant_min_float) { nudged_zero_point = static_cast<uint16>(quant_min); @@ -121,8 +333,37 @@ void NudgeQuantizationRange(const float min, const float max, } else { nudged_zero_point = static_cast<uint16>(TfLiteRound(zero_point_from_min)); } - *nudged_min = (quant_min_float - nudged_zero_point) * (*scale); - *nudged_max = (quant_max_float - nudged_zero_point) * (*scale); + *nudged_min = (quant_min_float - nudged_zero_point) * (*nudged_scale); + *nudged_max = (quant_max_float - nudged_zero_point) * (*nudged_scale); +} + +void FakeQuantizeArray(const float nudged_scale, const float nudged_min, + const float nudged_max, const float* input_data, + float* output_data, const float size) { + // This code originates from tensorflow/core/kernels/fake_quant_ops_functor.h. + const float inv_nudged_scale = 1.0f / nudged_scale; + + for (int i = 0; i < size; i++) { + const float src_val = input_data[i]; + const float clamped = std::min(nudged_max, std::max(nudged_min, src_val)); + const float clamped_shifted = clamped - nudged_min; + const float dst_val = + TfLiteRound(clamped_shifted * inv_nudged_scale) * nudged_scale + + nudged_min; + output_data[i] = dst_val; + } +} + +bool CheckedLog2(const float x, int* log2_result) { + // Using TfLiteRound instead of std::round and std::log instead of + // std::log2 to work around these fuctions being missing in a toolchain + // used in some TensorFlow tests as of May 2018. + const float x_log2 = std::log(x) * (1.0f / std::log(2.0f)); + const float x_log2_rounded = TfLiteRound(x_log2); + const float x_log2_fracpart = x_log2 - x_log2_rounded; + + *log2_result = static_cast<int>(x_log2_rounded); + return std::abs(x_log2_fracpart) < 1e-3; } } // namespace tflite diff --git a/tensorflow/contrib/lite/kernels/internal/quantization_util.h b/tensorflow/contrib/lite/kernels/internal/quantization_util.h index 4a217515f1..d74a1bac97 100644 --- a/tensorflow/contrib/lite/kernels/internal/quantization_util.h +++ b/tensorflow/contrib/lite/kernels/internal/quantization_util.h @@ -28,8 +28,9 @@ namespace tflite { // Given the min and max values of a float array, return // reasonable quantization parameters to use for this array. template <typename T> -QuantizationParams ChooseQuantizationParams(double rmin, double rmax) { - const T qmin = std::numeric_limits<T>::min(); +QuantizationParams ChooseQuantizationParams(double rmin, double rmax, + bool narrow_range) { + const T qmin = std::numeric_limits<T>::min() + (narrow_range ? 1 : 0); const T qmax = std::numeric_limits<T>::max(); const double qmin_double = qmin; const double qmax_double = qmax; @@ -97,6 +98,11 @@ QuantizationParams ChooseQuantizationParams(double rmin, double rmax) { return quantization_params; } +template <typename T> +QuantizationParams ChooseQuantizationParams(double rmin, double rmax) { + return ChooseQuantizationParams<T>(rmin, rmax, false); +} + // Converts a floating-point number to an integer. For all inputs x where // static_cast<IntOut>(x) is legal according to the C++ standard, the result // is identical to that cast (i.e. 
the result is x with its fractional part @@ -167,9 +173,9 @@ IntOut SafeCast(FloatIn x) { // this is intended as a RIGHT-shift. // // Restricted to the case where the multiplier < 1 (and non-negative). -void QuantizeMultiplierSmallerThanOne(double double_multiplier, - int32_t* quantized_multiplier, - int* right_shift); +void QuantizeMultiplierSmallerThanOneExp(double double_multiplier, + int32_t* quantized_multiplier, + int* left_shift); // Decompose a double multiplier into a Q0.31 int32 representation of its // significand, and shift representation of its exponent. @@ -189,6 +195,44 @@ void QuantizeMultiplierGreaterThanOne(double double_multiplier, void QuantizeMultiplier(double double_multiplier, int32_t* quantized_multiplier, int* shift); +// Splits a double input value into a returned fraction, and a shift value from +// the exponent, using only bitwise and integer operations to support +// microcontrollers and other environments without floating-point support. +// +// This is designed to be a replacement for how std::frexp() is used within the +// QuantizeMultiplier() function, and so has a different signature than the +// standard version, returning a 64-bit integer rather than a double. This +// result has a maximum value of 1<<31, with the fraction expressed as a +// proportion of that maximum. +// +// std::frexp() returns NaNs and infinities unmodified, but since we're +// returning integers that can't represent those values, instead we return +// a shift of std::numeric_limits<int>::max() for all bad numbers, with an int64 +// result of 0 for NaNs, std:numeric_limits<int64_t>::max() for +INFINITY, and +// std::numeric_limits<int64_t>::min() for -INFINITY. Denormalized inputs will +// result in return values that end up truncating some bits at the end, +// reflecting the loss of precision inherent in denormalization. +int64_t IntegerFrExp(double input, int* shift); + +// Converts an integer fraction in the format produced by IntegerFrExp (where +// 0x40000000 is 1.0) and an exponent shift (between -1022 and +1022) into an +// IEEE binary64 double format result. The implementation uses only integer and +// bitwise operators, so no floating point hardware support or emulation is +// needed. This is here so quantized operations can run non-time-critical +// preparation calculations on microcontrollers and other platforms without +// float support. +double DoubleFromFractionAndShift(int64_t fraction, int shift); + +// Performs a multiplication of two numbers in double format, using only integer +// and bitwise instructions. This is aimed at supporting housekeeping functions +// for quantized operations on microcontrollers without floating-point hardware. +double IntegerDoubleMultiply(double a, double b); + +// Returns -1 if a is less than b, 0 if a and b are equal, and +1 if a is +// greater than b. It is implemented using only integer and logical instructions +// so that it can be easily run on microcontrollers for quantized operations. +int IntegerDoubleCompare(double a, double b); + // This first creates a multiplier in a double equivalent of // Q(input_integer_bits).(31-input_integer_bits) representation, with extra // precision in the double's fractional bits. It then splits the result into @@ -197,11 +241,12 @@ void PreprocessSoftmaxScaling(double beta, double input_scale, int input_integer_bits, int32_t* quantized_multiplier, int* left_shift); // Like PreprocessSoftmaxScaling, but inverse scaling factors also calculated. 
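// A minimal usage sketch under assumed values (beta, input scale and integer
// bits below are hypothetical); the pointees receive the forward and reverse
// scaling parameters described above:
//
//   int32_t multiplier, reverse_divisor;
//   int left_shift, reverse_left_shift;
//   PreprocessLogSoftmaxScalingExp(/*beta=*/1.0, /*input_scale=*/0.05,
//                                  /*input_integer_bits=*/5, &multiplier,
//                                  &left_shift, &reverse_divisor,
//                                  &reverse_left_shift);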
-void PreprocessLogSoftmaxScaling(double beta, double input_scale, - int input_integer_bits, - int32_t* quantized_multiplier, int* left_shift, - int32_t* reverse_scaling_divisor, - int* reverse_scaling_right_shift); +void PreprocessLogSoftmaxScalingExp(double beta, double input_scale, + int input_integer_bits, + int32_t* quantized_multiplier, + int* left_shift, + int32_t* reverse_scaling_divisor, + int* reverse_scaling_left_shift); // Calculate the largest input that will result in a within-bounds intermediate // result within MultiplyByQuantizedMultiplierGreaterThanOne. In other words, // it must not overflow before we reduce the value by multiplication by the @@ -215,7 +260,20 @@ int CalculateInputRadius(int input_integer_bits, int input_left_shift); // Outputs nudged_min, nudged_max, nudged_scale. void NudgeQuantizationRange(const float min, const float max, const int quant_min, const int quant_max, - float* nudged_min, float* nudged_max, float* scale); + float* nudged_min, float* nudged_max, + float* nudged_scale); + +// Fake quantizes (quantizes and dequantizes) input_data using the scale, +// nudged_min, and nudged_max from NudgeQuantizationRange. This matches the code +// in TensorFlow's FakeQuantizeWithMinMaxVarsFunctor. +void FakeQuantizeArray(const float nudged_scale, const float nudged_min, + const float nudged_max, const float* input_data, + float* output_data, const float size); + +// If x is approximately a power of two (with any positive or negative +// exponent), stores that exponent (i.e. log2(x)) in *log2_result, otherwise +// returns false. +bool CheckedLog2(const float x, int* log2_result); } // namespace tflite diff --git a/tensorflow/contrib/lite/kernels/internal/quantization_util_test.cc b/tensorflow/contrib/lite/kernels/internal/quantization_util_test.cc index 2d74b3d384..25ea72b886 100644 --- a/tensorflow/contrib/lite/kernels/internal/quantization_util_test.cc +++ b/tensorflow/contrib/lite/kernels/internal/quantization_util_test.cc @@ -130,22 +130,22 @@ void RunSafeCastTests() { } TEST(QuantizationUtilTest, SafeCast) { - RunSafeCastTests<float, int8>(); - RunSafeCastTests<double, int8>(); - RunSafeCastTests<float, int16>(); - RunSafeCastTests<double, int16>(); - RunSafeCastTests<float, int32>(); - RunSafeCastTests<double, int32>(); - RunSafeCastTests<float, int64>(); - RunSafeCastTests<double, int64>(); - RunSafeCastTests<float, uint8>(); - RunSafeCastTests<double, uint8>(); - RunSafeCastTests<float, uint16>(); - RunSafeCastTests<double, uint16>(); - RunSafeCastTests<float, uint32>(); - RunSafeCastTests<double, uint32>(); - RunSafeCastTests<float, uint64>(); - RunSafeCastTests<double, uint64>(); + RunSafeCastTests<float, int8_t>(); + RunSafeCastTests<double, int8_t>(); + RunSafeCastTests<float, int16_t>(); + RunSafeCastTests<double, int16_t>(); + RunSafeCastTests<float, int32_t>(); + RunSafeCastTests<double, int32_t>(); + RunSafeCastTests<float, int64_t>(); + RunSafeCastTests<double, int64_t>(); + RunSafeCastTests<float, uint8_t>(); + RunSafeCastTests<double, uint8_t>(); + RunSafeCastTests<float, uint16_t>(); + RunSafeCastTests<double, uint16_t>(); + RunSafeCastTests<float, uint32_t>(); + RunSafeCastTests<double, uint32_t>(); + RunSafeCastTests<float, uint64_t>(); + RunSafeCastTests<double, uint64_t>(); } // Example taken from http://www.tensorflow.org/performance/quantization @@ -191,26 +191,159 @@ TEST(QuantizationUtilTest, ChooseQuantizationParamsZeroPointOnMaxBoundary) { EXPECT_EQ(qp.zero_point, 255); } +TEST(QuantizationUtilTest, IntegerFrExp) { + int shift; + 
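+  // IntegerFrExp mirrors std::frexp(), but expresses the fraction as a
+  // proportion of 1 << 31, so (as checked below) 1.0 decomposes into a
+  // fraction of 0x40000000 (0.5 * 2^31) with a shift of 1, i.e. 0.5 * 2^1.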
int64_t result = IntegerFrExp(0.0, &shift); + EXPECT_EQ(0, result); + EXPECT_EQ(0, shift); + + result = IntegerFrExp(1.0, &shift); + EXPECT_NEAR(0x40000000, result, 1); + EXPECT_EQ(1, shift); + + result = IntegerFrExp(0.25, &shift); + EXPECT_NEAR(0x40000000, result, 1); + EXPECT_EQ(-1, shift); + + result = IntegerFrExp(-1.0, &shift); + EXPECT_NEAR(-(1 << 30), result, 1); + EXPECT_EQ(1, shift); + + result = IntegerFrExp(123.45, &shift); + EXPECT_NEAR(2071147315, result, 1); + EXPECT_EQ(7, shift); + + result = IntegerFrExp(NAN, &shift); + EXPECT_NEAR(0, result, 1); + EXPECT_EQ(0x7fffffff, shift); + + result = IntegerFrExp(INFINITY, &shift); + EXPECT_NEAR(std::numeric_limits<int64_t>::max(), result, 1); + EXPECT_EQ(0x7fffffff, shift); + + result = IntegerFrExp(-INFINITY, &shift); + EXPECT_NEAR(std::numeric_limits<int64_t>::min(), result, 1); + EXPECT_EQ(0x7fffffff, shift); +} + +TEST(QuantizationUtilTest, IntegerFrExpVersusDouble) { + int shift; + int32_t result = IntegerFrExp(0.0, &shift); + EXPECT_EQ(result, 0); + EXPECT_EQ(shift, 0); + + int double_shift; + double double_result = std::frexp(0.0, &double_shift); + EXPECT_EQ(double_result, 0); + EXPECT_EQ(double_shift, 0); + + result = IntegerFrExp(1.0, &shift); + EXPECT_NEAR(result, 0x40000000, 1); + EXPECT_EQ(shift, 1); + double_result = std::frexp(1.0, &double_shift); + EXPECT_NEAR(double_result, 0.5, 1e-5); + EXPECT_EQ(double_shift, 1); + + result = IntegerFrExp(0.25, &shift); + EXPECT_NEAR(result, 0x40000000, 1); + EXPECT_EQ(shift, -1); + double_result = std::frexp(0.25, &double_shift); + EXPECT_NEAR(double_result, 0.5, 1e-5); + EXPECT_EQ(double_shift, -1); + + result = IntegerFrExp(-1.0, &shift); + EXPECT_NEAR(result, -(1 << 30), 1); + EXPECT_EQ(shift, 1); + double_result = std::frexp(-1.0, &double_shift); + EXPECT_NEAR(double_result, -0.5, 1e-5); + EXPECT_EQ(double_shift, 1); + + result = IntegerFrExp(123.45, &shift); + EXPECT_NEAR(result, (0.964453 * (1LL << 31)), 1000); + EXPECT_EQ(shift, 7); + double_result = std::frexp(123.45, &double_shift); + EXPECT_NEAR(double_result, 0.964453, 1e-5); + EXPECT_EQ(double_shift, 7); +} + +TEST(QuantizationUtilTest, DoubleFromFractionAndShift) { + double result = DoubleFromFractionAndShift(0, 0); + EXPECT_EQ(0, result); + + result = DoubleFromFractionAndShift(0x40000000, 1); + EXPECT_NEAR(1.0, result, 1e-5); + + result = DoubleFromFractionAndShift(0x40000000, 2); + EXPECT_NEAR(2.0, result, 1e-5); + + int shift; + int64_t fraction = IntegerFrExp(3.0, &shift); + result = DoubleFromFractionAndShift(fraction, shift); + EXPECT_NEAR(3.0, result, 1e-5); + + fraction = IntegerFrExp(123.45, &shift); + result = DoubleFromFractionAndShift(fraction, shift); + EXPECT_NEAR(123.45, result, 1e-5); + + fraction = IntegerFrExp(-23.232323, &shift); + result = DoubleFromFractionAndShift(fraction, shift); + EXPECT_NEAR(-23.232323, result, 1e-5); + + fraction = IntegerFrExp(NAN, &shift); + result = DoubleFromFractionAndShift(fraction, shift); + EXPECT_TRUE(std::isnan(result)); + + fraction = IntegerFrExp(INFINITY, &shift); + result = DoubleFromFractionAndShift(fraction, shift); + EXPECT_FALSE(std::isfinite(result)); +} + +TEST(QuantizationUtilTest, IntegerDoubleMultiply) { + EXPECT_NEAR(1.0, IntegerDoubleMultiply(1.0, 1.0), 1e-5); + EXPECT_NEAR(2.0, IntegerDoubleMultiply(1.0, 2.0), 1e-5); + EXPECT_NEAR(2.0, IntegerDoubleMultiply(2.0, 1.0), 1e-5); + EXPECT_NEAR(4.0, IntegerDoubleMultiply(2.0, 2.0), 1e-5); + EXPECT_NEAR(0.5, IntegerDoubleMultiply(1.0, 0.5), 1e-5); + EXPECT_NEAR(0.25, IntegerDoubleMultiply(0.5, 0.5), 
1e-5); + EXPECT_NEAR(-1.0, IntegerDoubleMultiply(1.0, -1.0), 1e-5); + EXPECT_NEAR(-1.0, IntegerDoubleMultiply(-1.0, 1.0), 1e-5); + EXPECT_NEAR(1.0, IntegerDoubleMultiply(-1.0, -1.0), 1e-5); + EXPECT_NEAR(15000000.0, IntegerDoubleMultiply(3000.0, 5000.0), 1e-5); + EXPECT_TRUE(std::isnan(IntegerDoubleMultiply(NAN, 5000.0))); + EXPECT_TRUE(std::isnan(IntegerDoubleMultiply(3000.0, NAN))); +} + +TEST(QuantizationUtilTest, IntegerDoubleCompare) { + EXPECT_EQ(-1, IntegerDoubleCompare(0.0, 1.0)); + EXPECT_EQ(1, IntegerDoubleCompare(1.0, 0.0)); + EXPECT_EQ(0, IntegerDoubleCompare(1.0, 1.0)); + EXPECT_EQ(0, IntegerDoubleCompare(0.0, 0.0)); + EXPECT_EQ(-1, IntegerDoubleCompare(-10.0, 10.0)); + EXPECT_EQ(1, IntegerDoubleCompare(123.45, 10.0)); + EXPECT_EQ(1, IntegerDoubleCompare(NAN, INFINITY)); + EXPECT_EQ(1, IntegerDoubleCompare(INFINITY, NAN)); +} + #ifdef GTEST_HAS_DEATH_TEST TEST(QuantizationUtilTest, ChooseQuantizationParamsInvalidRange) { EXPECT_DEATH(ChooseQuantizationParams<uint8>(10.0, -30.0), ""); } -TEST(QuantizationUtilTest, QuantizeMultiplierSmallerThanOne) { +TEST(QuantizationUtilTest, QuantizeMultiplierSmallerThanOneExp) { auto quantize = [](double d) { int32_t q; int s; - QuantizeMultiplierSmallerThanOne(d, &q, &s); + QuantizeMultiplierSmallerThanOneExp(d, &q, &s); return std::pair<int32_t, int>{q, s}; }; EXPECT_DEATH(quantize(-0.1), ""); EXPECT_DEATH(quantize(0.0), ""); - EXPECT_THAT(quantize(0.25), Pair(1073741824, 1)); + EXPECT_THAT(quantize(0.25), Pair(1073741824, -1)); // Around 0.5 we can see the change in exponent and how we try hard to // void hitting max int32. - EXPECT_THAT(quantize(0.50 - 5e-9), Pair(2147483627, 1)); + EXPECT_THAT(quantize(0.50 - 5e-9), Pair(2147483627, -1)); EXPECT_THAT(quantize(0.50 - 1e-10), Pair(1073741824, 0)); EXPECT_THAT(quantize(0.50), Pair(1073741824, 0)); diff --git a/tensorflow/contrib/lite/kernels/internal/reference/depthwiseconv_float.h b/tensorflow/contrib/lite/kernels/internal/reference/depthwiseconv_float.h index 9aabee5000..11224270a4 100644 --- a/tensorflow/contrib/lite/kernels/internal/reference/depthwiseconv_float.h +++ b/tensorflow/contrib/lite/kernels/internal/reference/depthwiseconv_float.h @@ -22,24 +22,36 @@ limitations under the License. 
namespace tflite { namespace reference_ops { -inline void DepthwiseConv(const float* input_data, const Dims<4>& input_dims, - const float* filter_data, const Dims<4>& filter_dims, - const float* bias_data, const Dims<4>& bias_dims, - int stride_width, int stride_height, int pad_width, - int pad_height, int depth_multiplier, - float output_activation_min, - float output_activation_max, float* output_data, - const Dims<4>& output_dims) { - const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); - const int output_depth = MatchingArraySize(filter_dims, 0, output_dims, 0); - const int input_height = ArraySize(input_dims, 2); - const int input_width = ArraySize(input_dims, 1); - const int input_depth = ArraySize(input_dims, 0); - const int filter_height = ArraySize(filter_dims, 2); - const int filter_width = ArraySize(filter_dims, 1); - const int output_height = ArraySize(output_dims, 2); - const int output_width = ArraySize(output_dims, 1); - TFLITE_DCHECK(output_depth == input_depth * depth_multiplier); +inline void DepthwiseConv( + const DepthwiseParams& params, const RuntimeShape& input_shape, + const float* input_data, const RuntimeShape& filter_shape, + const float* filter_data, const RuntimeShape& bias_shape, + const float* bias_data, const RuntimeShape& output_shape, + float* output_data) { + const int stride_width = params.stride_width; + const int stride_height = params.stride_height; + const int dilation_width_factor = params.dilation_width_factor; + const int dilation_height_factor = params.dilation_height_factor; + const int pad_width = params.padding_values.width; + const int pad_height = params.padding_values.height; + const int depth_multiplier = params.depth_multiplier; + const float output_activation_min = params.float_activation_min; + const float output_activation_max = params.float_activation_max; + TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4); + TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4); + TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4); + + const int batches = MatchingDim(input_shape, 0, output_shape, 0); + const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3); + const int input_height = input_shape.Dims(1); + const int input_width = input_shape.Dims(2); + const int input_depth = input_shape.Dims(3); + const int filter_height = filter_shape.Dims(1); + const int filter_width = filter_shape.Dims(2); + const int output_height = output_shape.Dims(1); + const int output_width = output_shape.Dims(2); + TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier); + TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth); for (int b = 0; b < batches; ++b) { for (int out_y = 0; out_y < output_height; ++out_y) { @@ -52,25 +64,26 @@ inline void DepthwiseConv(const float* input_data, const Dims<4>& input_dims, float total = 0.f; for (int filter_y = 0; filter_y < filter_height; ++filter_y) { for (int filter_x = 0; filter_x < filter_width; ++filter_x) { - const int in_x = in_x_origin + filter_x; - const int in_y = in_y_origin + filter_y; + const int in_x = in_x_origin + dilation_width_factor * filter_x; + const int in_y = + in_y_origin + dilation_height_factor * filter_y; // If the location is outside the bounds of the input image, // use zero as a default value. 
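              // Illustrative example (hypothetical values): with
              // dilation_height_factor == 2 and filter_y == 3 the sampled row
              // is in_y_origin + 6, so dilated taps can land outside the
              // image and then simply contribute nothing to the sum.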
if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) && (in_y < input_height)) { float input_value = - input_data[Offset(input_dims, ic, in_x, in_y, b)]; + input_data[Offset(input_shape, b, in_y, in_x, ic)]; float filter_value = filter_data[Offset( - filter_dims, oc, filter_x, filter_y, 0)]; + filter_shape, 0, filter_y, filter_x, oc)]; total += (input_value * filter_value); } } } float bias_value = 0.0f; if (bias_data) { - bias_value = bias_data[Offset(bias_dims, oc, 0, 0, 0)]; + bias_value = bias_data[oc]; } - output_data[Offset(output_dims, oc, out_x, out_y, b)] = + output_data[Offset(output_shape, b, out_y, out_x, oc)] = ActivationFunctionWithMinMax(total + bias_value, output_activation_min, output_activation_max); @@ -81,34 +94,6 @@ inline void DepthwiseConv(const float* input_data, const Dims<4>& input_dims, } } -// Legacy, for compatibility with old checked-in code. -template <FusedActivationFunctionType Ac> -void DepthwiseConv(const float* input_data, const Dims<4>& input_dims, - const float* filter_data, const Dims<4>& filter_dims, - const float* bias_data, const Dims<4>& bias_dims, - int stride_width, int stride_height, int pad_width, - int pad_height, int depth_multiplier, float* output_data, - const Dims<4>& output_dims) { - float output_activation_min, output_activation_max; - GetActivationMinMax(Ac, &output_activation_min, &output_activation_max); - DepthwiseConv(input_data, input_dims, filter_data, filter_dims, bias_data, - bias_dims, stride_width, stride_height, pad_width, pad_height, - depth_multiplier, output_activation_min, output_activation_max, - output_data, output_dims); -} - -// Legacy, for compatibility with old checked-in code. -template <FusedActivationFunctionType Ac> -void DepthwiseConv(const float* input_data, const Dims<4>& input_dims, - const float* filter_data, const Dims<4>& filter_dims, - const float* bias_data, const Dims<4>& bias_dims, int stride, - int pad_width, int pad_height, int depth_multiplier, - float* output_data, const Dims<4>& output_dims) { - DepthwiseConv<Ac>(input_data, input_dims, filter_data, filter_dims, bias_data, - bias_dims, stride, stride, pad_width, pad_height, - depth_multiplier, output_data, output_dims); -} - } // end namespace reference_ops } // end namespace tflite diff --git a/tensorflow/contrib/lite/kernels/internal/reference/depthwiseconv_uint8.h b/tensorflow/contrib/lite/kernels/internal/reference/depthwiseconv_uint8.h index e9b6baeaee..eab28e6c84 100644 --- a/tensorflow/contrib/lite/kernels/internal/reference/depthwiseconv_uint8.h +++ b/tensorflow/contrib/lite/kernels/internal/reference/depthwiseconv_uint8.h @@ -18,7 +18,6 @@ limitations under the License. #include <algorithm> #include "fixedpoint/fixedpoint.h" -#include "public/gemmlowp.h" #include "tensorflow/contrib/lite/kernels/internal/common.h" #include "tensorflow/contrib/lite/kernels/internal/compatibility.h" #include "tensorflow/contrib/lite/kernels/internal/types.h" @@ -26,26 +25,42 @@ limitations under the License. 
namespace tflite { namespace reference_ops { -inline void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims, - int32 input_offset, const uint8* filter_data, - const Dims<4>& filter_dims, int32 filter_offset, - const int32* bias_data, const Dims<4>& bias_dims, - int stride_width, int stride_height, int pad_width, - int pad_height, int depth_multiplier, - int32 output_offset, int32 output_multiplier, - int output_shift, int32 output_activation_min, - int32 output_activation_max, uint8* output_data, - const Dims<4>& output_dims) { - const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); - const int output_depth = MatchingArraySize(filter_dims, 0, output_dims, 0); - const int input_height = ArraySize(input_dims, 2); - const int input_width = ArraySize(input_dims, 1); - const int input_depth = ArraySize(input_dims, 0); - const int filter_height = ArraySize(filter_dims, 2); - const int filter_width = ArraySize(filter_dims, 1); - const int output_height = ArraySize(output_dims, 2); - const int output_width = ArraySize(output_dims, 1); - TFLITE_DCHECK(output_depth == input_depth * depth_multiplier); +inline void DepthwiseConv( + const DepthwiseParams& params, const RuntimeShape& input_shape, + const uint8* input_data, const RuntimeShape& filter_shape, + const uint8* filter_data, const RuntimeShape& bias_shape, + const int32* bias_data, const RuntimeShape& output_shape, + uint8* output_data) { + const int stride_width = params.stride_width; + const int stride_height = params.stride_height; + const int dilation_width_factor = params.dilation_width_factor; + const int dilation_height_factor = params.dilation_height_factor; + const int pad_width = params.padding_values.width; + const int pad_height = params.padding_values.height; + const int depth_multiplier = params.depth_multiplier; + const int32 output_activation_min = params.quantized_activation_min; + const int32 output_activation_max = params.quantized_activation_max; + const int32 input_offset = params.input_offset; + const int32 filter_offset = params.weights_offset; + const int32 output_offset = params.output_offset; + const int32 output_multiplier = params.output_multiplier; + const int output_shift = params.output_shift; + TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4); + TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4); + TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4); + + TFLITE_DCHECK_LE(output_activation_min, output_activation_max); + const int batches = MatchingDim(input_shape, 0, output_shape, 0); + const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3); + const int input_height = input_shape.Dims(1); + const int input_width = input_shape.Dims(2); + const int input_depth = input_shape.Dims(3); + const int filter_height = filter_shape.Dims(1); + const int filter_width = filter_shape.Dims(2); + const int output_height = output_shape.Dims(1); + const int output_width = output_shape.Dims(2); + TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier); + TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth); for (int b = 0; b < batches; ++b) { for (int out_y = 0; out_y < output_height; ++out_y) { @@ -58,30 +73,31 @@ inline void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims, int32 acc = 0; for (int filter_y = 0; filter_y < filter_height; ++filter_y) { for (int filter_x = 0; filter_x < filter_width; ++filter_x) { - const int in_x = in_x_origin + filter_x; - const int in_y = in_y_origin + filter_y; + const int in_x = in_x_origin + dilation_width_factor * 
filter_x; + const int in_y = + in_y_origin + dilation_height_factor * filter_y; // If the location is outside the bounds of the input image, // use zero as a default value. if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) && (in_y < input_height)) { int32 input_val = - input_data[Offset(input_dims, ic, in_x, in_y, b)]; - int32 filter_val = filter_data[Offset(filter_dims, oc, - filter_x, filter_y, 0)]; + input_data[Offset(input_shape, b, in_y, in_x, ic)]; + int32 filter_val = filter_data[Offset( + filter_shape, 0, filter_y, filter_x, oc)]; acc += (filter_val + filter_offset) * (input_val + input_offset); } } } if (bias_data) { - acc += bias_data[Offset(bias_dims, oc, 0, 0, 0)]; + acc += bias_data[oc]; } - acc = MultiplyByQuantizedMultiplierSmallerThanOne( - acc, output_multiplier, output_shift); + acc = MultiplyByQuantizedMultiplier(acc, output_multiplier, + output_shift); acc += output_offset; acc = std::max(acc, output_activation_min); acc = std::min(acc, output_activation_max); - output_data[Offset(output_dims, oc, out_x, out_y, b)] = + output_data[Offset(output_shape, b, out_y, out_x, oc)] = static_cast<uint8>(acc); } } @@ -90,48 +106,6 @@ inline void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims, } } -// Legacy, for compatibility with old checked-in code. -template <FusedActivationFunctionType Ac> -void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims, - int32 input_offset, const uint8* filter_data, - const Dims<4>& filter_dims, int32 filter_offset, - const int32* bias_data, const Dims<4>& bias_dims, - int stride_width, int stride_height, int pad_width, - int pad_height, int depth_multiplier, int32 output_offset, - int32 output_multiplier, int output_shift, - int32 output_activation_min, int32 output_activation_max, - uint8* output_data, const Dims<4>& output_dims) { - if (Ac == FusedActivationFunctionType::kNone) { - TFLITE_DCHECK_EQ(output_activation_min, 0); - TFLITE_DCHECK_EQ(output_activation_max, 255); - } - DepthwiseConv(input_data, input_dims, input_offset, filter_data, filter_dims, - filter_offset, bias_data, bias_dims, stride_width, - stride_height, pad_width, pad_height, depth_multiplier, - output_offset, output_multiplier, output_shift, - output_activation_min, output_activation_max, output_data, - output_dims); -} - -// Legacy, for compatibility with old checked-in code. 
-template <FusedActivationFunctionType Ac> -void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims, - int32 input_offset, const uint8* filter_data, - const Dims<4>& filter_dims, int32 filter_offset, - const int32* bias_data, const Dims<4>& bias_dims, int stride, - int pad_width, int pad_height, int depth_multiplier, - int32 output_offset, int32 output_multiplier, - int output_shift, int32 output_activation_min, - int32 output_activation_max, uint8* output_data, - const Dims<4>& output_dims) { - DepthwiseConv<Ac>(input_data, input_dims, input_offset, filter_data, - filter_dims, filter_offset, bias_data, bias_dims, stride, - stride, pad_width, pad_height, depth_multiplier, - output_offset, output_multiplier, output_shift, - output_activation_min, output_activation_max, output_data, - output_dims); -} - } // end namespace reference_ops } // end namespace tflite diff --git a/tensorflow/contrib/lite/kernels/internal/reference/fully_connected.h b/tensorflow/contrib/lite/kernels/internal/reference/fully_connected.h new file mode 100644 index 0000000000..3c7fd29256 --- /dev/null +++ b/tensorflow/contrib/lite/kernels/internal/reference/fully_connected.h @@ -0,0 +1,326 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_FULLY_CONNECTED_H_ +#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_FULLY_CONNECTED_H_ + +#include "fixedpoint/fixedpoint.h" +#include "tensorflow/contrib/lite/kernels/internal/common.h" +#include "tensorflow/contrib/lite/kernels/internal/quantization_util.h" +#include "tensorflow/contrib/lite/kernels/internal/round.h" +#include "tensorflow/contrib/lite/kernels/internal/types.h" + +namespace tflite { +namespace reference_ops { + +const int kReverseShift = -1; + +inline void FullyConnected( + const FullyConnectedParams& params, const RuntimeShape& input_shape, + const float* input_data, const RuntimeShape& weights_shape, + const float* weights_data, const RuntimeShape& bias_shape, + const float* bias_data, const RuntimeShape& output_shape, + float* output_data) { + const float output_activation_min = params.float_activation_min; + const float output_activation_max = params.float_activation_max; + // TODO(benoitjacob): This really should be: + // const int batches = ArraySize(output_dims, 1); + // but the current --variable_batch hack consists in overwriting the 3rd + // dimension with the runtime batch size, as we don't keep track for each + // array of which dimension is the batch dimension in it. 
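+  // Rough shape example (hypothetical): a weights shape of {16, 8} with an
+  // output shape of {2, 16} gives output_depth == 16, accum_depth == 8 and
+  // batches == 2, i.e. each output value is a dot product over 8 inputs plus
+  // an optional bias.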
+ const int output_dims_count = output_shape.DimensionsCount(); + const int weights_dims_count = weights_shape.DimensionsCount(); + const int batches = FlatSizeSkipDim(output_shape, output_dims_count - 1); + const int output_depth = MatchingDim(weights_shape, weights_dims_count - 2, + output_shape, output_dims_count - 1); + const int accum_depth = weights_shape.Dims(weights_dims_count - 1); + for (int b = 0; b < batches; ++b) { + for (int out_c = 0; out_c < output_depth; ++out_c) { + float total = 0.f; + for (int d = 0; d < accum_depth; ++d) { + total += input_data[b * accum_depth + d] * + weights_data[out_c * accum_depth + d]; + } + float bias_value = 0.0f; + if (bias_data) { + bias_value = bias_data[out_c]; + } + output_data[out_c + output_depth * b] = ActivationFunctionWithMinMax( + total + bias_value, output_activation_min, output_activation_max); + } + } +} + +inline void FullyConnected( + const FullyConnectedParams& params, const RuntimeShape& input_shape, + const uint8* input_data, const RuntimeShape& filter_shape, + const uint8* filter_data, const RuntimeShape& bias_shape, + const int32* bias_data, const RuntimeShape& output_shape, + uint8* output_data, void* gemm_context) { + (void)gemm_context; // only used in optimized code. + const int32 input_offset = params.input_offset; + const int32 filter_offset = params.weights_offset; + const int32 output_offset = params.output_offset; + const int32 output_multiplier = params.output_multiplier; + const int output_shift = params.output_shift; + const int32 output_activation_min = params.quantized_activation_min; + const int32 output_activation_max = params.quantized_activation_max; + TFLITE_DCHECK_GE(filter_shape.DimensionsCount(), 2); + TFLITE_DCHECK_GE(output_shape.DimensionsCount(), 1); + + TFLITE_DCHECK_LE(output_activation_min, output_activation_max); + // TODO(benoitjacob): This really should be: + // const int batches = ArraySize(output_dims, 1); + // but the current --variable_batch hack consists in overwriting the 3rd + // dimension with the runtime batch size, as we don't keep track for each + // array of which dimension is the batch dimension in it. 
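+  // Sketch of the quantized arithmetic below (offset values hypothetical):
+  // with input_offset == -128 and filter_offset == -128 every term is
+  // (filter_val - 128) * (input_val - 128) accumulated in int32; the sum is
+  // then rescaled by MultiplyByQuantizedMultiplier(acc, output_multiplier,
+  // output_shift), offset by output_offset and clamped to the quantized
+  // activation range before the uint8 store.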
+ const int output_dim_count = output_shape.DimensionsCount(); + const int filter_dim_count = filter_shape.DimensionsCount(); + const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1); + const int output_depth = MatchingDim(filter_shape, filter_dim_count - 2, + output_shape, output_dim_count - 1); + const int accum_depth = filter_shape.Dims(filter_dim_count - 1); + for (int b = 0; b < batches; ++b) { + for (int out_c = 0; out_c < output_depth; ++out_c) { + int32 acc = 0; + for (int d = 0; d < accum_depth; ++d) { + int32 input_val = input_data[b * accum_depth + d]; + int32 filter_val = filter_data[out_c * accum_depth + d]; + acc += (filter_val + filter_offset) * (input_val + input_offset); + } + if (bias_data) { + acc += bias_data[out_c]; + } + acc = MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift); + acc += output_offset; + acc = std::max(acc, output_activation_min); + acc = std::min(acc, output_activation_max); + output_data[out_c + output_depth * b] = static_cast<uint8>(acc); + } + } +} + +inline void FullyConnected( + const FullyConnectedParams& params, const RuntimeShape& input_shape, + const uint8* input_data, const RuntimeShape& filter_shape, + const uint8* filter_data, const RuntimeShape& bias_shape, + const int32* bias_data, const RuntimeShape& output_shape, + int16* output_data, void* gemm_context) { + (void)gemm_context; // only used in optimized code. + const int32 input_offset = params.input_offset; + const int32 filter_offset = params.weights_offset; + const int32 output_offset = params.output_offset; + const int32 output_multiplier = params.output_multiplier; + const int output_shift = params.output_shift; + const int32 output_activation_min = params.quantized_activation_min; + const int32 output_activation_max = params.quantized_activation_max; + + TFLITE_DCHECK_LE(output_activation_min, output_activation_max); + TFLITE_DCHECK_EQ(output_offset, 0); + // TODO(benoitjacob): This really should be: + // const int batches = ArraySize(output_dims, 1); + // but the current --variable_batch hack consists in overwriting the 3rd + // dimension with the runtime batch size, as we don't keep track for each + // array of which dimension is the batch dimension in it. + const int output_dim_count = output_shape.DimensionsCount(); + const int filter_dim_count = filter_shape.DimensionsCount(); + const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1); + const int output_depth = MatchingDim(filter_shape, filter_dim_count - 2, + output_shape, output_dim_count - 1); + const int accum_depth = filter_shape.Dims(filter_dim_count - 1); + for (int b = 0; b < batches; ++b) { + for (int out_c = 0; out_c < output_depth; ++out_c) { + // Internal accumulation. + // Initialize accumulator with the bias-value. + int32 accum = bias_data[out_c]; + // Accumulation loop. + for (int d = 0; d < accum_depth; ++d) { + int16 input_val = input_data[b * accum_depth + d] + input_offset; + int16 filter_val = filter_data[out_c * accum_depth + d] + filter_offset; + accum += filter_val * input_val; + } + // Down-scale the final int32 accumulator to the scale used by our + // (16-bit, typically 3 integer bits) fixed-point format. The quantized + // multiplier and shift here have been pre-computed offline + // (e.g. by toco). + accum = + MultiplyByQuantizedMultiplier(accum, output_multiplier, output_shift); + // Saturate, cast to int16, and store to output array. 
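+      // Since output_offset is checked to be 0 above, the clamp below
+      // effectively reduces to [output_activation_min, output_activation_max]
+      // before the int16 store.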
+ accum = std::max(accum, output_activation_min - output_offset); + accum = std::min(accum, output_activation_max - output_offset); + accum += output_offset; + output_data[out_c + output_depth * b] = accum; + } + } +} + +inline void ShuffledFullyConnected( + const FullyConnectedParams& params, const RuntimeShape& input_shape, + const uint8* input_data, const RuntimeShape& weights_shape, + const uint8* shuffled_weights_data, const RuntimeShape& bias_shape, + const int32* bias_data, const RuntimeShape& output_shape, + int16* output_data, uint8* shuffled_input_workspace_data, + void* gemm_context) { + (void)gemm_context; // only used in optimized code. + const int32 output_multiplier = params.output_multiplier; + const int output_shift = params.output_shift; + const int32 output_activation_min = params.quantized_activation_min; + const int32 output_activation_max = params.quantized_activation_max; + TFLITE_DCHECK_LE(output_activation_min, output_activation_max); + + TFLITE_DCHECK_GE(input_shape.DimensionsCount(), 1); + TFLITE_DCHECK_GE(weights_shape.DimensionsCount(), 2); + TFLITE_DCHECK_GE(output_shape.DimensionsCount(), 1); + // TODO(benoitjacob): This really should be: + // const int batches = ArraySize(output_dims, 1); + // but the current --variable_batch hack consists in overwriting the 3rd + // dimension with the runtime batch size, as we don't keep track for each + // array of which dimension is the batch dimension in it. + const int output_dim_count = output_shape.DimensionsCount(); + const int weights_dim_count = weights_shape.DimensionsCount(); + const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1); + const int output_depth = MatchingDim(weights_shape, weights_dim_count - 2, + output_shape, output_dim_count - 1); + const int accum_depth = weights_shape.Dims(weights_dim_count - 1); + TFLITE_DCHECK((accum_depth % 16) == 0); + TFLITE_DCHECK((output_depth % 4) == 0); + + // Shuffling and xoring of input activations into the workspace buffer + uint8* shuffled_input_workspace_ptr = shuffled_input_workspace_data; + if (batches == 1) { + for (int i = 0; i < accum_depth; i++) { + shuffled_input_workspace_data[i] = input_data[i] ^ 0x80; + } + } else if (batches == 4) { + for (int c = 0; c < accum_depth; c += 16) { + for (int b = 0; b < 4; b++) { + const uint8* src_data_ptr = input_data + b * accum_depth + c; + for (int j = 0; j < 16; j++) { + uint8 src_val = *src_data_ptr++; + // Flip the sign bit, so that the kernel will only need to + // reinterpret these uint8 values as int8, getting for free the + // subtraction of the zero_point value 128. + uint8 dst_val = src_val ^ 0x80; + *shuffled_input_workspace_ptr++ = dst_val; + } + } + } + } else { + TFLITE_DCHECK(false); + return; + } + + // Actual computation + if (batches == 1) { + int16* output_ptr = output_data; + // Shuffled weights have had their sign bit (0x80) pre-flipped (xor'd) + // so that just reinterpreting them as int8 values is equivalent to + // subtracting 128 from them, thus implementing for free the subtraction of + // the zero_point value 128. + const int8* shuffled_weights_ptr = + reinterpret_cast<const int8*>(shuffled_weights_data); + // Likewise, we preshuffled and pre-xored the input data above. + const int8* shuffled_input_data = + reinterpret_cast<const int8*>(shuffled_input_workspace_data); + for (int c = 0; c < output_depth; c += 4) { + // Internal accumulation. + // Initialize accumulator with the bias-value. + int32 accum[4] = {0}; + // Accumulation loop. 
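+      // Numeric illustration of the sign-bit trick described above: a uint8
+      // activation of 200 was stored as 200 ^ 0x80 == 72, and reading 72 back
+      // as int8 yields exactly 200 - 128, so no explicit zero-point
+      // subtraction is needed inside this loop.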
+ for (int d = 0; d < accum_depth; d += 16) { + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 16; j++) { + int8 input_val = shuffled_input_data[d + j]; + int8 weights_val = *shuffled_weights_ptr++; + accum[i] += weights_val * input_val; + } + } + } + for (int i = 0; i < 4; i++) { + // Add bias value + int32 acc = accum[i] + bias_data[c + i]; + // Down-scale the final int32 accumulator to the scale used by our + // (16-bit, typically 3 integer bits) fixed-point format. The quantized + // multiplier and shift here have been pre-computed offline + // (e.g. by toco). + acc = + MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift); + // Saturate, cast to int16, and store to output array. + acc = std::max(acc, output_activation_min); + acc = std::min(acc, output_activation_max); + output_ptr[c + i] = acc; + } + } + } else if (batches == 4) { + int16* output_ptr = output_data; + // Shuffled weights have had their sign bit (0x80) pre-flipped (xor'd) + // so that just reinterpreting them as int8 values is equivalent to + // subtracting 128 from them, thus implementing for free the subtraction of + // the zero_point value 128. + const int8* shuffled_weights_ptr = + reinterpret_cast<const int8*>(shuffled_weights_data); + // Likewise, we preshuffled and pre-xored the input data above. + const int8* shuffled_input_data = + reinterpret_cast<const int8*>(shuffled_input_workspace_data); + for (int c = 0; c < output_depth; c += 4) { + const int8* shuffled_input_ptr = shuffled_input_data; + // Accumulation loop. + // Internal accumulation. + // Initialize accumulator with the bias-value. + int32 accum[4][4]; + for (int i = 0; i < 4; i++) { + for (int b = 0; b < 4; b++) { + accum[i][b] = 0; + } + } + for (int d = 0; d < accum_depth; d += 16) { + for (int i = 0; i < 4; i++) { + for (int b = 0; b < 4; b++) { + for (int j = 0; j < 16; j++) { + int8 input_val = shuffled_input_ptr[16 * b + j]; + int8 weights_val = shuffled_weights_ptr[16 * i + j]; + accum[i][b] += weights_val * input_val; + } + } + } + shuffled_input_ptr += 64; + shuffled_weights_ptr += 64; + } + for (int i = 0; i < 4; i++) { + for (int b = 0; b < 4; b++) { + // Add bias value + int32 acc = accum[i][b] + bias_data[c + i]; + // Down-scale the final int32 accumulator to the scale used by our + // (16-bit, typically 3 integer bits) fixed-point format. The + // quantized multiplier and shift here have been pre-computed offline + // (e.g. by toco). + acc = MultiplyByQuantizedMultiplier(acc, output_multiplier, + output_shift); + // Saturate, cast to int16, and store to output array. + acc = std::max(acc, output_activation_min); + acc = std::min(acc, output_activation_max); + output_ptr[b * output_depth + c + i] = acc; + } + } + } + } else { + TFLITE_DCHECK(false); + return; + } +} + +} // namespace reference_ops +} // namespace tflite + +#endif // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_FULLY_CONNECTED_H_ diff --git a/tensorflow/contrib/lite/kernels/internal/reference/legacy_reference_ops.h b/tensorflow/contrib/lite/kernels/internal/reference/legacy_reference_ops.h new file mode 100644 index 0000000000..be99240b1f --- /dev/null +++ b/tensorflow/contrib/lite/kernels/internal/reference/legacy_reference_ops.h @@ -0,0 +1,2120 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_LEGACY_REFERENCE_OPS_H_ +#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_LEGACY_REFERENCE_OPS_H_ + +#include <stdint.h> +#include <sys/types.h> + +#include "tensorflow/contrib/lite/kernels/internal/common.h" +#include "tensorflow/contrib/lite/kernels/internal/reference/depthwiseconv_float.h" +#include "tensorflow/contrib/lite/kernels/internal/reference/depthwiseconv_uint8.h" +#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h" +#include "tensorflow/contrib/lite/kernels/internal/types.h" + +namespace tflite { + +namespace reference_ops { + +static constexpr int kDepthwiseReverseShift = -1; + +inline void DepthwiseConv(const float* input_data, const Dims<4>& input_dims, + const float* filter_data, const Dims<4>& filter_dims, + const float* bias_data, const Dims<4>& bias_dims, + int stride_width, int stride_height, + int dilation_width_factor, int dilation_height_factor, + int pad_width, int pad_height, int depth_multiplier, + float output_activation_min, + float output_activation_max, float* output_data, + const Dims<4>& output_dims) { + tflite::DepthwiseParams op_params; + // Padding type is ignored, but still set. + op_params.padding_type = PaddingType::kSame; + op_params.padding_values.width = pad_width; + op_params.padding_values.height = pad_height; + op_params.stride_width = stride_width; + op_params.stride_height = stride_height; + op_params.dilation_width_factor = dilation_width_factor; + op_params.dilation_height_factor = dilation_height_factor; + op_params.depth_multiplier = depth_multiplier; + op_params.float_activation_min = output_activation_min; + op_params.float_activation_max = output_activation_max; + + DepthwiseConv(op_params, DimsToShape(input_dims), input_data, + DimsToShape(filter_dims), filter_data, DimsToShape(bias_dims), + bias_data, DimsToShape(output_dims), output_data); +} + +inline void DepthwiseConv(const float* input_data, const Dims<4>& input_dims, + const float* filter_data, const Dims<4>& filter_dims, + const float* bias_data, const Dims<4>& bias_dims, + int stride_width, int stride_height, int pad_width, + int pad_height, int depth_multiplier, + float output_activation_min, + float output_activation_max, float* output_data, + const Dims<4>& output_dims) { + DepthwiseConv(input_data, input_dims, filter_data, filter_dims, bias_data, + bias_dims, stride_width, stride_height, 1, 1, pad_width, + pad_height, depth_multiplier, output_activation_min, + output_activation_max, output_data, output_dims); +} + +// Legacy, for compatibility with old checked-in code. 
+template <FusedActivationFunctionType Ac> +void DepthwiseConv(const float* input_data, const Dims<4>& input_dims, + const float* filter_data, const Dims<4>& filter_dims, + const float* bias_data, const Dims<4>& bias_dims, + int stride_width, int stride_height, int pad_width, + int pad_height, int depth_multiplier, float* output_data, + const Dims<4>& output_dims) { + float output_activation_min, output_activation_max; + GetActivationMinMax(Ac, &output_activation_min, &output_activation_max); + DepthwiseConv(input_data, input_dims, filter_data, filter_dims, bias_data, + bias_dims, stride_width, stride_height, pad_width, pad_height, + depth_multiplier, output_activation_min, output_activation_max, + output_data, output_dims); +} + +// Legacy, for compatibility with old checked-in code. +template <FusedActivationFunctionType Ac> +void DepthwiseConv(const float* input_data, const Dims<4>& input_dims, + const float* filter_data, const Dims<4>& filter_dims, + const float* bias_data, const Dims<4>& bias_dims, int stride, + int pad_width, int pad_height, int depth_multiplier, + float* output_data, const Dims<4>& output_dims) { + DepthwiseConv<Ac>(input_data, input_dims, filter_data, filter_dims, bias_data, + bias_dims, stride, stride, pad_width, pad_height, + depth_multiplier, output_data, output_dims); +} + +inline void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims, + int32 input_offset, const uint8* filter_data, + const Dims<4>& filter_dims, int32 filter_offset, + const int32* bias_data, const Dims<4>& bias_dims, + int stride_width, int stride_height, + int dilation_width_factor, int dilation_height_factor, + int pad_width, int pad_height, int depth_multiplier, + int32 output_offset, int32 output_multiplier, + int output_shift, int32 output_activation_min, + int32 output_activation_max, uint8* output_data, + const Dims<4>& output_dims) { + tflite::DepthwiseParams op_params; + // Padding type is ignored, but still set. + op_params.padding_type = PaddingType::kSame; + op_params.padding_values.width = pad_width; + op_params.padding_values.height = pad_height; + op_params.stride_width = stride_width; + op_params.stride_height = stride_height; + op_params.dilation_width_factor = dilation_width_factor; + op_params.dilation_height_factor = dilation_height_factor; + op_params.depth_multiplier = depth_multiplier; + op_params.quantized_activation_min = output_activation_min; + op_params.quantized_activation_max = output_activation_max; + op_params.input_offset = input_offset; + op_params.weights_offset = filter_offset; + op_params.output_offset = output_offset; + op_params.output_multiplier = output_multiplier; + // Legacy ops used mixed left and right shifts. Now all are +ve-means-left. 
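+  // For example, a legacy output_shift of 3 (an implicit right shift) becomes
+  // kDepthwiseReverseShift * 3 == -3 under the new positive-means-left
+  // convention expected by MultiplyByQuantizedMultiplier.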
+ op_params.output_shift = kDepthwiseReverseShift * output_shift; + + DepthwiseConv(op_params, DimsToShape(input_dims), input_data, + DimsToShape(filter_dims), filter_data, DimsToShape(bias_dims), + bias_data, DimsToShape(output_dims), output_data); +} + +inline void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims, + int32 input_offset, const uint8* filter_data, + const Dims<4>& filter_dims, int32 filter_offset, + const int32* bias_data, const Dims<4>& bias_dims, + int stride_width, int stride_height, int pad_width, + int pad_height, int depth_multiplier, + int32 output_offset, int32 output_multiplier, + int output_shift, int32 output_activation_min, + int32 output_activation_max, uint8* output_data, + const Dims<4>& output_dims) { + DepthwiseConv(input_data, input_dims, input_offset, filter_data, filter_dims, + filter_offset, bias_data, bias_dims, stride_width, + stride_height, 1, 1, pad_width, pad_height, depth_multiplier, + output_offset, output_multiplier, output_shift, + output_activation_min, output_activation_max, output_data, + output_dims); +} + +// Legacy, for compatibility with old checked-in code. +template <FusedActivationFunctionType Ac> +void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims, + int32 input_offset, const uint8* filter_data, + const Dims<4>& filter_dims, int32 filter_offset, + const int32* bias_data, const Dims<4>& bias_dims, + int stride_width, int stride_height, int pad_width, + int pad_height, int depth_multiplier, int32 output_offset, + int32 output_multiplier, int output_shift, + int32 output_activation_min, int32 output_activation_max, + uint8* output_data, const Dims<4>& output_dims) { + if (Ac == FusedActivationFunctionType::kNone) { + TFLITE_DCHECK_EQ(output_activation_min, 0); + TFLITE_DCHECK_EQ(output_activation_max, 255); + } + DepthwiseConv(input_data, input_dims, input_offset, filter_data, filter_dims, + filter_offset, bias_data, bias_dims, stride_width, + stride_height, pad_width, pad_height, depth_multiplier, + output_offset, output_multiplier, output_shift, + output_activation_min, output_activation_max, output_data, + output_dims); +} + +// Legacy, for compatibility with old checked-in code. 
+template <FusedActivationFunctionType Ac> +void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims, + int32 input_offset, const uint8* filter_data, + const Dims<4>& filter_dims, int32 filter_offset, + const int32* bias_data, const Dims<4>& bias_dims, int stride, + int pad_width, int pad_height, int depth_multiplier, + int32 output_offset, int32 output_multiplier, + int output_shift, int32 output_activation_min, + int32 output_activation_max, uint8* output_data, + const Dims<4>& output_dims) { + DepthwiseConv<Ac>(input_data, input_dims, input_offset, filter_data, + filter_dims, filter_offset, bias_data, bias_dims, stride, + stride, pad_width, pad_height, depth_multiplier, + output_offset, output_multiplier, output_shift, + output_activation_min, output_activation_max, output_data, + output_dims); +} + +inline void Conv(const float* input_data, const Dims<4>& input_dims, + const float* filter_data, const Dims<4>& filter_dims, + const float* bias_data, const Dims<4>& bias_dims, + int stride_width, int stride_height, int dilation_width_factor, + int dilation_height_factor, int pad_width, int pad_height, + float output_activation_min, float output_activation_max, + float* output_data, const Dims<4>& output_dims, + float* im2col_data, const Dims<4>& im2col_dims) { + tflite::ConvParams op_params; + // Padding type is ignored, but still set. + op_params.padding_type = PaddingType::kSame; + op_params.padding_values.width = pad_width; + op_params.padding_values.height = pad_height; + op_params.stride_width = stride_width; + op_params.stride_height = stride_height; + op_params.dilation_width_factor = dilation_width_factor; + op_params.dilation_height_factor = dilation_height_factor; + op_params.float_activation_min = output_activation_min; + op_params.float_activation_max = output_activation_max; + + Conv(op_params, DimsToShape(input_dims), input_data, DimsToShape(filter_dims), + filter_data, DimsToShape(bias_dims), bias_data, DimsToShape(output_dims), + output_data, DimsToShape(im2col_dims), im2col_data); +} + +template <FusedActivationFunctionType Ac> +void Conv(const float* input_data, const Dims<4>& input_dims, + const float* filter_data, const Dims<4>& filter_dims, + const float* bias_data, const Dims<4>& bias_dims, int stride_width, + int stride_height, int dilation_width_factor, + int dilation_height_factor, int pad_width, int pad_height, + float* output_data, const Dims<4>& output_dims, float* im2col_data, + const Dims<4>& im2col_dims) { + float output_activation_min, output_activation_max; + GetActivationMinMax(Ac, &output_activation_min, &output_activation_max); + Conv(input_data, input_dims, filter_data, filter_dims, bias_data, bias_dims, + stride_width, stride_height, dilation_width_factor, + dilation_height_factor, pad_width, pad_height, output_activation_min, + output_activation_max, output_data, output_dims, im2col_data, + im2col_dims); +} + +// legacy, for compatibility with old checked-in code +template <FusedActivationFunctionType Ac> +void Conv(const float* input_data, const Dims<4>& input_dims, + const float* filter_data, const Dims<4>& filter_dims, + const float* bias_data, const Dims<4>& bias_dims, int stride_width, + int stride_height, int pad_width, int pad_height, float* output_data, + const Dims<4>& output_dims, float* im2col_data, + const Dims<4>& im2col_dims) { + float output_activation_min, output_activation_max; + GetActivationMinMax(Ac, &output_activation_min, &output_activation_max); + Conv(input_data, input_dims, filter_data, filter_dims, bias_data, 
bias_dims, + stride_width, stride_height, 1, 1, pad_width, pad_height, + output_activation_min, output_activation_max, output_data, output_dims, + im2col_data, im2col_dims); +} + +// legacy, for compatibility with old checked-in code +template <FusedActivationFunctionType Ac> +void Conv(const float* input_data, const Dims<4>& input_dims, + const float* filter_data, const Dims<4>& filter_dims, + const float* bias_data, const Dims<4>& bias_dims, int stride, + int pad_width, int pad_height, float* output_data, + const Dims<4>& output_dims, float* im2col_data, + const Dims<4>& im2col_dims) { + Conv<Ac>(input_data, input_dims, filter_data, filter_dims, bias_data, + bias_dims, stride, stride, 1, 1, pad_width, pad_height, output_data, + output_dims, im2col_data, im2col_dims); +} + +inline void Conv(const uint8* input_data, const Dims<4>& input_dims, + int32 input_offset, const uint8* filter_data, + const Dims<4>& filter_dims, int32 filter_offset, + const int32* bias_data, const Dims<4>& bias_dims, + int stride_width, int stride_height, int dilation_width_factor, + int dilation_height_factor, int pad_width, int pad_height, + int32 output_offset, int32 output_multiplier, int output_shift, + int32 output_activation_min, int32 output_activation_max, + uint8* output_data, const Dims<4>& output_dims, + uint8* im2col_data, const Dims<4>& im2col_dims, + gemmlowp::GemmContext* gemm_context) { + tflite::ConvParams op_params; + // Padding type is ignored, but still set. + op_params.padding_type = PaddingType::kSame; + op_params.padding_values.width = pad_width; + op_params.padding_values.height = pad_height; + op_params.stride_width = stride_width; + op_params.stride_height = stride_height; + op_params.dilation_width_factor = dilation_width_factor; + op_params.dilation_height_factor = dilation_height_factor; + op_params.input_offset = input_offset; + op_params.weights_offset = filter_offset; + op_params.output_offset = output_offset; + op_params.output_multiplier = output_multiplier; + // Legacy ops used mixed left and right shifts. Now all are +ve-means-left. 
+ op_params.output_shift = kReverseShift * output_shift; + op_params.quantized_activation_min = output_activation_min; + op_params.quantized_activation_max = output_activation_max; + + Conv(op_params, DimsToShape(input_dims), input_data, DimsToShape(filter_dims), + filter_data, DimsToShape(bias_dims), bias_data, DimsToShape(output_dims), + output_data, DimsToShape(im2col_dims), im2col_data, gemm_context); +} + +inline void Conv(const uint8* input_data, const Dims<4>& input_dims, + int32 input_offset, const uint8* filter_data, + const Dims<4>& filter_dims, int32 filter_offset, + const int32* bias_data, const Dims<4>& bias_dims, + int stride_width, int stride_height, int pad_width, + int pad_height, int32 output_offset, int32 output_multiplier, + int output_shift, int32 output_activation_min, + int32 output_activation_max, uint8* output_data, + const Dims<4>& output_dims, uint8* im2col_data, + const Dims<4>& im2col_dims, + gemmlowp::GemmContext* gemm_context) { + Conv(input_data, input_dims, input_offset, filter_data, filter_dims, + filter_offset, bias_data, bias_dims, stride_width, stride_height, 1, 1, + pad_width, pad_height, output_offset, output_multiplier, output_shift, + output_activation_min, output_activation_max, output_data, output_dims, + im2col_data, im2col_dims, gemm_context); +} + +// legacy, for compatibility with old checked-in code +template <FusedActivationFunctionType Ac> +inline void Conv(const uint8* input_data, const Dims<4>& input_dims, + int32 input_offset, const uint8* filter_data, + const Dims<4>& filter_dims, int32 filter_offset, + const int32* bias_data, const Dims<4>& bias_dims, + int stride_width, int stride_height, int pad_width, + int pad_height, int32 output_offset, int32 output_multiplier, + int output_shift, int32 output_activation_min, + int32 output_activation_max, uint8* output_data, + const Dims<4>& output_dims, uint8* im2col_data, + const Dims<4>& im2col_dims, + gemmlowp::GemmContext* gemm_context) { + static_assert(Ac == FusedActivationFunctionType::kNone || + Ac == FusedActivationFunctionType::kRelu || + Ac == FusedActivationFunctionType::kRelu6 || + Ac == FusedActivationFunctionType::kRelu1, + ""); + if (Ac == FusedActivationFunctionType::kNone) { + TFLITE_DCHECK_EQ(output_activation_min, 0); + TFLITE_DCHECK_EQ(output_activation_max, 255); + } + Conv(input_data, input_dims, input_offset, filter_data, filter_dims, + filter_offset, bias_data, bias_dims, stride_width, stride_height, + pad_width, pad_height, output_offset, output_multiplier, output_shift, + output_activation_min, output_activation_max, output_data, output_dims, + im2col_data, im2col_dims, gemm_context); +} + +// legacy, for compatibility with old checked-in code +template <FusedActivationFunctionType Ac> +void Conv(const uint8* input_data, const Dims<4>& input_dims, + int32 input_offset, const uint8* filter_data, + const Dims<4>& filter_dims, int32 filter_offset, + const int32* bias_data, const Dims<4>& bias_dims, int stride, + int pad_width, int pad_height, int32 output_offset, + int32 output_multiplier, int output_shift, + int32 output_activation_min, int32 output_activation_max, + uint8* output_data, const Dims<4>& output_dims, uint8* im2col_data, + const Dims<4>& im2col_dims, gemmlowp::GemmContext* gemm_context) { + Conv<Ac>(input_data, input_dims, input_offset, filter_data, filter_dims, + filter_offset, bias_data, bias_dims, stride, stride, pad_width, + pad_height, output_offset, output_multiplier, output_shift, + output_activation_min, output_activation_max, output_data, + 
output_dims, im2col_data, im2col_dims, gemm_context); +} + +inline void TransposeConv(const float* input_data, const Dims<4>& input_dims, + const float* filter_data, const Dims<4>& filter_dims, + int stride_width, int stride_height, int pad_width, + int pad_height, float* output_data, + const Dims<4>& output_dims, float* im2col_data, + const Dims<4>& im2col_dims) { + tflite::ConvParams op_params; + // Padding type is ignored, but still set. + op_params.padding_type = PaddingType::kSame; + op_params.padding_values.width = pad_width; + op_params.padding_values.height = pad_height; + op_params.stride_width = stride_width; + op_params.stride_height = stride_height; + + TransposeConv(op_params, DimsToShape(input_dims), input_data, + DimsToShape(filter_dims), filter_data, DimsToShape(output_dims), + output_data, DimsToShape(im2col_dims), im2col_data); +} + +inline void FullyConnected(const float* input_data, const Dims<4>& input_dims, + const float* weights_data, + const Dims<4>& weights_dims, const float* bias_data, + const Dims<4>& bias_dims, + float output_activation_min, + float output_activation_max, float* output_data, + const Dims<4>& output_dims) { + tflite::FullyConnectedParams op_params; + op_params.float_activation_min = output_activation_min; + op_params.float_activation_max = output_activation_max; + + FullyConnected(op_params, DimsToShape(input_dims), input_data, + DimsToShape(weights_dims), weights_data, + DimsToShape(bias_dims), bias_data, DimsToShape(output_dims), + output_data); +} + +// legacy, for compatibility with old checked-in code +template <FusedActivationFunctionType Ac> +void FullyConnected(const float* input_data, const Dims<4>& input_dims, + const float* weights_data, const Dims<4>& weights_dims, + const float* bias_data, const Dims<4>& bias_dims, + float* output_data, const Dims<4>& output_dims) { + float output_activation_min, output_activation_max; + GetActivationMinMax(Ac, &output_activation_min, &output_activation_max); + FullyConnected(input_data, input_dims, weights_data, weights_dims, bias_data, + bias_dims, output_activation_min, output_activation_max, + output_data, output_dims); +} + +inline void FullyConnected(const uint8* input_data, const Dims<4>& input_dims, + int32 input_offset, const uint8* filter_data, + const Dims<4>& filter_dims, int32 filter_offset, + const int32* bias_data, const Dims<4>& bias_dims, + int32 output_offset, int32 output_multiplier, + int output_shift, int32 output_activation_min, + int32 output_activation_max, uint8* output_data, + const Dims<4>& output_dims, + gemmlowp::GemmContext* gemm_context) { + tflite::FullyConnectedParams op_params; + op_params.input_offset = input_offset; + op_params.weights_offset = filter_offset; + op_params.output_offset = output_offset; + op_params.output_multiplier = output_multiplier; + // Legacy ops used mixed left and right shifts. Now all are +ve-means-left. 
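// These offset/multiplier/shift fields feed the usual requantization pipeline
// downstream; roughly (a sketch, not the authoritative code - see the
// non-legacy FullyConnected for that):
//   acc = sum_i (input[i] + input_offset) * (filter[i] + filter_offset) + bias
//   out = clamp(output_offset + MultiplyByQuantizedMultiplier(
//                                   acc, output_multiplier, output_shift),
//               output_activation_min, output_activation_max)
// with output_multiplier/output_shift encoding the real-valued factor
// input_scale * filter_scale / output_scale as a fixed-point multiply.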
+ op_params.output_shift = kReverseShift * output_shift; + op_params.quantized_activation_min = output_activation_min; + op_params.quantized_activation_max = output_activation_max; + + FullyConnected(op_params, DimsToShape(input_dims), input_data, + DimsToShape(filter_dims), filter_data, DimsToShape(bias_dims), + bias_data, DimsToShape(output_dims), output_data, + gemm_context); +} + +inline void FullyConnected(const uint8* input_data, const Dims<4>& input_dims, + int32 input_offset, const uint8* filter_data, + const Dims<4>& filter_dims, int32 filter_offset, + const int32* bias_data, const Dims<4>& bias_dims, + int32 output_offset, int32 output_multiplier, + int output_shift, int32 output_activation_min, + int32 output_activation_max, int16* output_data, + const Dims<4>& output_dims, + gemmlowp::GemmContext* gemm_context) { + tflite::FullyConnectedParams op_params; + op_params.input_offset = input_offset; + op_params.weights_offset = filter_offset; + op_params.output_offset = output_offset; + op_params.output_multiplier = output_multiplier; + // Legacy ops used mixed left and right shifts. Now all are +ve-means-left. + op_params.output_shift = kReverseShift * output_shift; + op_params.quantized_activation_min = output_activation_min; + op_params.quantized_activation_max = output_activation_max; + + FullyConnected(op_params, DimsToShape(input_dims), input_data, + DimsToShape(filter_dims), filter_data, DimsToShape(bias_dims), + bias_data, DimsToShape(output_dims), output_data, + gemm_context); +} + +inline void ShuffledFullyConnected( + const uint8* input_data, const Dims<4>& input_dims, + const uint8* shuffled_weights_data, const Dims<4>& weights_dims, + const int32* bias_data, const Dims<4>& bias_dims, int32 output_multiplier, + int output_shift, int32 output_activation_min, int32 output_activation_max, + int16* output_data, const Dims<4>& output_dims, + uint8* shuffled_input_workspace_data, gemmlowp::GemmContext* gemm_context) { + tflite::FullyConnectedParams op_params; + op_params.output_multiplier = output_multiplier; + // Legacy ops used mixed left and right shifts. Now all are +ve-means-left. 
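// Beyond the shift sign flip below, the shuffled variant differs from plain
// FullyConnected in that (as the parameter names suggest) the filter data is
// expected pre-reordered into the kernel's blocked layout,
// shuffled_input_workspace_data is scratch space used to put the input into
// the matching order, and the output is int16 rather than uint8.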
+ op_params.output_shift = kReverseShift * output_shift; + op_params.quantized_activation_min = output_activation_min; + op_params.quantized_activation_max = output_activation_max; + + ShuffledFullyConnected(op_params, DimsToShape(input_dims), input_data, + DimsToShape(weights_dims), shuffled_weights_data, + DimsToShape(bias_dims), bias_data, + DimsToShape(output_dims), output_data, + shuffled_input_workspace_data, gemm_context); +} + +// legacy, for compatibility with old checked-in code +template <FusedActivationFunctionType Ac> +void FullyConnected(const uint8* input_data, const Dims<4>& input_dims, + int32 input_offset, const uint8* filter_data, + const Dims<4>& filter_dims, int32 filter_offset, + const int32* bias_data, const Dims<4>& bias_dims, + int32 output_offset, int32 output_multiplier, + int output_shift, int32 output_activation_min, + int32 output_activation_max, uint8* output_data, + const Dims<4>& output_dims, + gemmlowp::GemmContext* gemm_context) { + static_assert(Ac == FusedActivationFunctionType::kNone || + Ac == FusedActivationFunctionType::kRelu || + Ac == FusedActivationFunctionType::kRelu6 || + Ac == FusedActivationFunctionType::kRelu1, + ""); + if (Ac == FusedActivationFunctionType::kNone) { + TFLITE_DCHECK_EQ(output_activation_min, 0); + TFLITE_DCHECK_EQ(output_activation_max, 255); + } + FullyConnected(input_data, input_dims, input_offset, filter_data, filter_dims, + filter_offset, bias_data, bias_dims, output_offset, + output_multiplier, output_shift, output_activation_min, + output_activation_max, output_data, output_dims, gemm_context); +} + +inline void LstmCell(const float* input_data, const Dims<4>& input_dims, + const float* prev_activ_data, + const Dims<4>& prev_activ_dims, const float* weights_data, + const Dims<4>& weights_dims, const float* bias_data, + const Dims<4>& bias_dims, const float* prev_state_data, + const Dims<4>& prev_state_dims, float* output_state_data, + const Dims<4>& output_state_dims, float* output_activ_data, + const Dims<4>& output_activ_dims, float* concat_temp_data, + const Dims<4>& concat_temp_dims, float* activ_temp_data, + const Dims<4>& activ_temp_dims) { + tflite::LstmCellParams op_params; + // Float LSTM cell does not need parameters to be set: leave untouched. 
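// Only the tensor shapes matter on the float path. The quantized
// LstmCell<StateIntegerBits> wrapper below is the one that needs
// weights_zero_point and accum_multiplier/accum_shift, and its template
// argument fixes the fixed-point format of the int16 state (roughly:
// StateIntegerBits integer bits, the remaining bits fractional).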
+ + LstmCell(op_params, DimsToShape(input_dims), input_data, + DimsToShape(prev_activ_dims), prev_activ_data, + DimsToShape(weights_dims), weights_data, DimsToShape(bias_dims), + bias_data, DimsToShape(prev_state_dims), prev_state_data, + DimsToShape(output_state_dims), output_state_data, + DimsToShape(output_activ_dims), output_activ_data, + DimsToShape(concat_temp_dims), concat_temp_data, + DimsToShape(activ_temp_dims), activ_temp_data); +} + +template <int StateIntegerBits> +void LstmCell(const uint8* input_data_uint8, const Dims<4>& input_dims, + const uint8* prev_activ_data_uint8, + const Dims<4>& prev_activ_dims, const uint8* weights_data_uint8, + const Dims<4>& weights_dims, const int32* bias_data_int32, + const Dims<4>& bias_dims, const int16* prev_state_data_int16, + const Dims<4>& prev_state_dims, int16* output_state_data_int16, + const Dims<4>& output_state_dims, uint8* output_activ_data_uint8, + const Dims<4>& output_activ_dims, uint8* concat_temp_data_uint8, + const Dims<4>& concat_temp_dims, int16* activ_temp_data_int16, + const Dims<4>& activ_temp_dims, int32 weights_zero_point, + int32 accum_multiplier, int accum_shift, + gemmlowp::GemmContext* gemm_context) { + tflite::LstmCellParams op_params; + op_params.weights_zero_point = weights_zero_point; + op_params.accum_multiplier = accum_multiplier; + op_params.accum_shift = accum_shift; + + LstmCell<StateIntegerBits>( + op_params, DimsToShape(input_dims), input_data_uint8, + DimsToShape(prev_activ_dims), prev_activ_data_uint8, + DimsToShape(weights_dims), weights_data_uint8, DimsToShape(bias_dims), + bias_data_int32, DimsToShape(prev_state_dims), prev_state_data_int16, + DimsToShape(output_state_dims), output_state_data_int16, + DimsToShape(output_activ_dims), output_activ_data_uint8, + DimsToShape(concat_temp_dims), concat_temp_data_uint8, + DimsToShape(activ_temp_dims), activ_temp_data_int16, gemm_context); +} + +template <typename T> +void BroadcastDiv(const T* input1_data, const Dims<4>& input1_dims, + const T* input2_data, const Dims<4>& input2_dims, + T output_activation_min, T output_activation_max, + T* output_data, const Dims<4>& output_dims) { + tflite::ArithmeticParams op_params; + SetActivationParams(output_activation_min, output_activation_max, &op_params); + + BroadcastDiv4DSlow(op_params, DimsToShape(input1_dims), input1_data, + DimsToShape(input2_dims), input2_data, + DimsToShape(output_dims), output_data); +} + +template <typename T> +inline void Div(const T* input1_data, const Dims<4>& input1_dims, + const T* input2_data, const Dims<4>& input2_dims, + T output_activation_min, T output_activation_max, + T* output_data, const Dims<4>& output_dims) { + tflite::ArithmeticParams op_params; + SetActivationParams(output_activation_min, output_activation_max, &op_params); + + Div(op_params, DimsToShape(input1_dims), input1_data, + DimsToShape(input2_dims), input2_data, DimsToShape(output_dims), + output_data); +} + +template <FusedActivationFunctionType Ac, typename Scalar> +inline void Concatenation(int concat_dim, const Scalar* const* input_data, + const Dims<4>* const* input_dims, int inputs_count, + Scalar* output_data, const Dims<4>& output_dims) { + // For now we don't have a model with a Concatenation with fused activation. 
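// The axis conversion below (op_params.axis = 3 - concat_dim) follows from
// the two indexing conventions involved: Dims<4> numbers dimensions from
// fastest-varying to slowest (0 = depth/channels, 3 = batch), while
// RuntimeShape uses NHWC order (0 = batch, 3 = channels). For example, a
// legacy concatenation along concat_dim == 0 (depth) becomes a concatenation
// along RuntimeShape axis 3.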
+ TFLITE_DCHECK_EQ(Ac, FusedActivationFunctionType::kNone); + + std::vector<RuntimeShape> input_shapes(inputs_count); + std::vector<const RuntimeShape*> input_shapes_indirect(inputs_count); + for (int i = 0; i < inputs_count; ++i) { + ShapeFromDims(*input_dims[i], &input_shapes[i]); + input_shapes_indirect[i] = &input_shapes[i]; + } + tflite::ConcatenationParams op_params; + op_params.axis = 3 - concat_dim; + op_params.inputs_count = inputs_count; + + Concatenation(op_params, input_shapes_indirect.data(), input_data, + DimsToShape(output_dims), output_data); +} + +inline void Concatenation(int concat_dim, const uint8* const* input_data, + const Dims<4>* const* input_dims, + const int32* input_zeropoint, + const float* input_scale, int inputs_count, + uint8* output_data, const Dims<4>& output_dims, + const int32 output_zeropoint, + const float output_scale) { + std::vector<RuntimeShape> input_shapes(inputs_count); + std::vector<const RuntimeShape*> input_shapes_indirect(inputs_count); + for (int i = 0; i < inputs_count; ++i) { + ShapeFromDims(*input_dims[i], &input_shapes[i]); + input_shapes_indirect[i] = &input_shapes[i]; + } + tflite::ConcatenationParams op_params; + op_params.axis = 3 - concat_dim; + op_params.input_zeropoint = input_zeropoint; + op_params.input_scale = input_scale; + op_params.inputs_count = inputs_count; + op_params.output_zeropoint = output_zeropoint; + op_params.output_scale = output_scale; + + ConcatenationWithScaling(op_params, input_shapes_indirect.data(), input_data, + DimsToShape(output_dims), output_data); +} + +template <FusedActivationFunctionType Ac, typename Scalar> +void DepthConcatenation(const Scalar* const* input_data, + const Dims<4>* const* input_dims, int inputs_count, + Scalar* output_data, const Dims<4>& output_dims) { + // For now we don't have a model with a Concatenation with fused activation. 
+ TFLITE_DCHECK_EQ(Ac, FusedActivationFunctionType::kNone); + + std::vector<RuntimeShape> input_shapes(inputs_count); + std::vector<const RuntimeShape*> input_shapes_indirect(inputs_count); + for (int i = 0; i < inputs_count; ++i) { + ShapeFromDims(*input_dims[i], &input_shapes[i]); + input_shapes_indirect[i] = &input_shapes[i]; + } + tflite::ConcatenationParams op_params; + op_params.inputs_count = inputs_count; + + DepthConcatenation(op_params, input_shapes_indirect.data(), input_data, + DimsToShape(output_dims), output_data); +} + +template <typename Scalar> +void TensorFlowSplit(const Scalar* input_data, const Dims<4>& input_dims, + int axis, int outputs_count, Scalar* const* output_data, + const Dims<4>* const* output_dims) { + std::vector<RuntimeShape> output_shapes(outputs_count); + std::vector<const RuntimeShape*> output_shapes_indirect(outputs_count); + for (int i = 0; i < outputs_count; ++i) { + ShapeFromDims(*output_dims[i], &output_shapes[i]); + output_shapes_indirect[i] = &output_shapes[i]; + } + tflite::SplitParams op_params; + op_params.axis = 3 - axis; + op_params.num_split = outputs_count; + + Split(op_params, DimsToShape(input_dims), input_data, + output_shapes_indirect.data(), output_data); +} + +template <FusedActivationFunctionType Ac, typename Scalar> +void TensorFlowSplit(const Scalar* input_data, const Dims<4>& input_dims, + int outputs_count, Scalar* const* output_data, + const Dims<4>* const* output_dims) { + TFLITE_DCHECK_GE(outputs_count, 1); + for (int i = 0; i < outputs_count; i++) { + /* batches = */ MatchingArraySize(*output_dims[i], 3, input_dims, 3); + /* height = */ MatchingArraySize(*output_dims[i], 2, input_dims, 2); + /* width = */ MatchingArraySize(*output_dims[i], 1, input_dims, 1); + } + // For now we don't have a model with a Split with fused activation. + TFLITE_DCHECK_EQ(Ac, FusedActivationFunctionType::kNone); + + TensorFlowSplit(input_data, input_dims, /*axis=*/0, outputs_count, + output_data, output_dims); +} + +inline void Softmax(const float* input_data, const RuntimeShape& input_shape, + float beta, float* output_data, + const RuntimeShape& output_shape) { + SoftmaxParams params; + params.beta = beta; + Softmax(params, input_shape, input_data, output_shape, output_data); +} + +inline void Softmax(const uint8* input_data, const RuntimeShape& input_shape, + int32 input_beta_multiplier, int32 input_beta_left_shift, + int diff_min, uint8* output_data, + const RuntimeShape& output_shape) { + SoftmaxParams params; + params.input_multiplier = input_beta_multiplier; + params.input_left_shift = input_beta_left_shift; + params.diff_min = diff_min; + Softmax(params, input_shape, input_data, output_shape, output_data); +} + +inline void LogSoftmax(const float* input_data, const RuntimeShape& input_shape, + float* output_data, const RuntimeShape& output_shape) { + SoftmaxParams params; + // No params currently used for float LogSoftmax. 
+ LogSoftmax(params, input_shape, input_data, output_shape, output_data); +} + +inline void LogSoftmax(const uint8* input_data, const RuntimeShape& input_shape, + int32 input_multiplier, int32 input_left_shift, + int32 reverse_scaling_divisor, + int32 reverse_scaling_right_shift, int diff_min, + uint8* output_data, const RuntimeShape& output_shape) { + SoftmaxParams params; + params.input_multiplier = input_multiplier; + params.input_left_shift = input_left_shift; + params.reverse_scaling_divisor = reverse_scaling_divisor; + params.reverse_scaling_right_shift = reverse_scaling_right_shift; + params.diff_min = diff_min; + LogSoftmax(params, input_shape, input_data, output_shape, output_data); +} + +inline void Logistic(const uint8* input_data, const RuntimeShape& input_shape, + int32 input_zero_point, int32 input_range_radius, + int32 input_multiplier, int input_left_shift, + uint8* output_data, const RuntimeShape& output_shape) { + LogisticParams params; + params.input_zero_point = input_zero_point; + params.input_range_radius = input_range_radius; + params.input_multiplier = input_multiplier; + params.input_left_shift = input_left_shift; + Logistic(params, input_shape, input_data, output_shape, output_data); +} + +inline void Logistic(const RuntimeShape& input_shape, const int16* input_data, + const RuntimeShape& output_shape, int16* output_data) { + LogisticParams params; + // No params currently needed by int16 Logistic. + Logistic(params, input_shape, input_data, output_shape, output_data); +} + +inline void Tanh(const uint8* input_data, const RuntimeShape& input_shape, + int32 input_zero_point, int32 input_range_radius, + int32 input_multiplier, int input_left_shift, + uint8* output_data, const RuntimeShape& output_shape) { + TanhParams params; + params.input_zero_point = input_zero_point; + params.input_range_radius = input_range_radius; + params.input_multiplier = input_multiplier; + params.input_left_shift = input_left_shift; + Tanh(params, input_shape, input_data, output_shape, output_data); +} + +inline void Tanh(const int16* input_data, const RuntimeShape& input_shape, + int input_left_shift, int16* output_data, + const RuntimeShape& output_shape) { + TanhParams params; + params.input_left_shift = input_left_shift; + Tanh(params, input_shape, input_data, output_shape, output_data); +} + +inline void Dequantize(const uint8* input_data, const Dims<4>& input_dims, + int32 zero_point, double scale, float* output_data, + const Dims<4>& output_dims) { + tflite::DequantizationParams op_params; + op_params.zero_point = zero_point; + op_params.scale = scale; + + Dequantize(op_params, DimsToShape(input_dims), input_data, + DimsToShape(output_dims), output_data); +} + +inline void FakeQuant(const float* input_data, const Dims<4>& input_dims, + float rmin, float rmax, int num_bits, float* output_data, + const Dims<4>& output_dims) { + tflite::FakeQuantParams op_params; + op_params.num_bits = num_bits; + op_params.minmax.min = rmin; + op_params.minmax.max = rmax; + + FakeQuant(op_params, DimsToShape(input_dims), input_data, + DimsToShape(output_dims), output_data); +} + +template <typename T> +inline void Gather(const T* input_data, const Dims<4>& input_dims, + int input_rank, const int32* coords_data, + const Dims<4>& coords_dims, T* output_data, + const Dims<4>& output_dims) { + tflite::GatherParams op_params; + op_params.input_rank = input_rank; + + Gather(op_params, DimsToShape(input_dims), input_data, + DimsToShape(coords_dims), coords_data, DimsToShape(output_dims), + output_data); +} 
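// The two helpers that follow exist for the same reversed-axis reason as the
// "3 - axis" conversions above: legacy StridedSlice callers express the
// start/stop/stride arrays and the begin/end/ellipsis/new_axis/shrink_axis
// masks in Dims<4> axis order, so the index arrays get reversed and each mask
// has its low start_indices_count bits mirrored. Worked example (4 axes):
// begin_mask = 0b0011 (legacy axes 0 and 1) becomes 0b1100, i.e. new axes 3
// and 2, consistent with the d -> 3 - d mapping. The Mean and Transpose
// wrappers further down apply the same rule to reduction indices and
// permutations.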
+ +inline uint32 LegacyReverseBits32(uint32 n) { + n = ((n >> 1) & 0x55555555) | ((n & 0x55555555) << 1); + n = ((n >> 2) & 0x33333333) | ((n & 0x33333333) << 2); + n = ((n >> 4) & 0x0F0F0F0F) | ((n & 0x0F0F0F0F) << 4); + return (((n & 0xFF) << 24) | ((n & 0xFF00) << 8) | ((n & 0xFF0000) >> 8) | + ((n & 0xFF000000) >> 24)); +} + +inline void StridedSliceReverseIndices(tflite::StridedSliceParams* p) { + TFLITE_CHECK_EQ(p->start_indices_count, p->stop_indices_count); + TFLITE_CHECK_EQ(p->stop_indices_count, p->strides_count); + + std::reverse(p->start_indices, p->start_indices + p->start_indices_count); + std::reverse(p->stop_indices, p->stop_indices + p->stop_indices_count); + std::reverse(p->strides, p->strides + p->strides_count); + + p->begin_mask = LegacyReverseBits32(static_cast<uint32>(p->begin_mask)) >> + (32 - p->start_indices_count); + p->ellipsis_mask = + LegacyReverseBits32(static_cast<uint32>(p->ellipsis_mask)) >> + (32 - p->start_indices_count); + p->end_mask = LegacyReverseBits32(static_cast<uint32>(p->end_mask)) >> + (32 - p->start_indices_count); + p->new_axis_mask = + LegacyReverseBits32(static_cast<uint32>(p->new_axis_mask)) >> + (32 - p->start_indices_count); + p->shrink_axis_mask = + LegacyReverseBits32(static_cast<uint32>(p->shrink_axis_mask)) >> + (32 - p->start_indices_count); +} + +template <typename T> +inline void StridedSlice(const T* input_data, const Dims<4>& input_dims, + int begin_mask, int end_mask, int shrink_axis_mask, + const std::vector<int>& start_indices, + const std::vector<int>& stop_indices, + const std::vector<int>& strides, T* output_data, + const Dims<4>& output_dims) { + TFLITE_DCHECK_EQ(start_indices.size(), 4); + auto op_params = strided_slice::BuildStridedSliceParams( + begin_mask, end_mask, shrink_axis_mask, start_indices, stop_indices, + strides); + StridedSliceReverseIndices(&op_params); + + StridedSlice(op_params, DimsToShape(input_dims), input_data, + DimsToShape(output_dims), output_data); +} + +template <typename T> +inline void Mean(const T* input_data, const Dims<4>& input_dims, + const std::vector<int>& reduction_indices, T* output_data, + const Dims<4>& output_dims) { + tflite::MeanParams op_params; + op_params.axis_count = reduction_indices.size(); + for (int i = 0; i < op_params.axis_count; ++i) { + op_params.axis[i] = reduction_indices[op_params.axis_count - 1 - i]; + } + + Mean(op_params, DimsToShape(input_dims), input_data, DimsToShape(output_dims), + output_data); +} + +template <typename T> +void Transpose(const T* input, const Dims<4>& input_dims, T* output, + const Dims<4>& output_dims, const int* permuted_axes) { + TransposeParams params; + params.perm_count = 4; + for (int i = 0; i < 4; ++i) { + params.perm[i] = 3 - permuted_axes[3 - i]; + } + Transpose(params, DimsToShape(input_dims), input, DimsToShape(output_dims), + output); +} + +template <typename T, ComparisonFn<T> F> +inline void Comparison(const T* input1_data, const Dims<4>& input1_dims, + const T* input2_data, const Dims<4>& input2_dims, + bool* output_data, const Dims<4>& output_dims) { + ComparisonParams op_params; + // No parameters needed. 
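// This overload compares raw values as-is; the quantized overload below first
// rescales both inputs into a common domain using the per-input offsets,
// multipliers and shifts, which is why it dispatches to ComparisonWithScaling
// rather than ComparisonImpl.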
+ ComparisonImpl<T, F>(op_params, DimsToShape(input1_dims), input1_data, + DimsToShape(input2_dims), input2_data, + DimsToShape(output_dims), output_data); +} + +template <typename T, ComparisonFn<int32> F> +inline void Comparison(int left_shift, const T* input1_data, + const Dims<4>& input1_dims, int32 input1_offset, + int32 input1_multiplier, int input1_shift, + const T* input2_data, const Dims<4>& input2_dims, + int32 input2_offset, int32 input2_multiplier, + int input2_shift, bool* output_data, + const Dims<4>& output_dims) { + tflite::ComparisonParams op_params; + op_params.left_shift = left_shift; + op_params.input1_offset = input1_offset; + op_params.input1_multiplier = input1_multiplier; + // Legacy ops used mixed left and right shifts. Now all are +ve-means-left. + op_params.input1_shift = kReverseShift * input1_shift; + op_params.input2_offset = input2_offset; + op_params.input2_multiplier = input2_multiplier; + // Legacy ops used mixed left and right shifts. Now all are +ve-means-left. + op_params.input2_shift = kReverseShift * input2_shift; + + ComparisonWithScaling<T, F>(op_params, DimsToShape(input1_dims), input1_data, + DimsToShape(input2_dims), input2_data, + DimsToShape(output_dims), output_data); +} + +template <typename T, ComparisonFn<T> F> +inline void BroadcastComparison(const T* input1_data, + const Dims<4>& input1_dims, + const T* input2_data, + const Dims<4>& input2_dims, bool* output_data, + const Dims<4>& output_dims) { + ComparisonParams op_params; + // No parameters needed. + BroadcastComparison4DSlowImpl<T, F>(op_params, DimsToShape(input1_dims), + input1_data, DimsToShape(input2_dims), + input2_data, DimsToShape(output_dims), + output_data); +} + +template <typename T, ComparisonFn<int32> F> +inline void BroadcastComparison(int left_shift, const T* input1_data, + const Dims<4>& input1_dims, int32 input1_offset, + int32 input1_multiplier, int input1_shift, + const T* input2_data, + const Dims<4>& input2_dims, int32 input2_offset, + int32 input2_multiplier, int input2_shift, + bool* output_data, const Dims<4>& output_dims) { + ComparisonParams op_params; + + op_params.left_shift = left_shift; + op_params.input1_offset = input1_offset; + op_params.input1_multiplier = input1_multiplier; + // Legacy ops used mixed left and right shifts. Now all are +ve-means-left. + op_params.input1_shift = kReverseShift * input1_shift; + op_params.input2_offset = input2_offset; + op_params.input2_multiplier = input2_multiplier; + // Legacy ops used mixed left and right shifts. Now all are +ve-means-left. 
+ op_params.input2_shift = kReverseShift * input2_shift; + + BroadcastComparison4DSlowWithScaling<T, F>( + op_params, DimsToShape(input1_dims), input1_data, + DimsToShape(input2_dims), input2_data, DimsToShape(output_dims), + output_data); +} + +#define TFLITE_LEGACY_COMPARISON_OP(name) \ + template <typename T> \ + inline void name(const T* input1_data, const Dims<4>& input1_dims, \ + const T* input2_data, const Dims<4>& input2_dims, \ + bool* output_data, const Dims<4>& output_dims) { \ + gemmlowp::ScopedProfilingLabel label(#name); \ + Comparison<T, name##Fn>(input1_data, input1_dims, input2_data, \ + input2_dims, output_data, output_dims); \ + } \ + template <typename T> \ + inline void name( \ + int left_shift, const T* input1_data, const Dims<4>& input1_dims, \ + int32 input1_offset, int32 input1_multiplier, int input1_shift, \ + const T* input2_data, const Dims<4>& input2_dims, int32 input2_offset, \ + int32 input2_multiplier, int input2_shift, bool* output_data, \ + const Dims<4>& output_dims) { \ + gemmlowp::ScopedProfilingLabel label(#name "/8bit"); \ + Comparison<T, name##Fn>(left_shift, input1_data, input1_dims, \ + input1_offset, input1_multiplier, input1_shift, \ + input2_data, input2_dims, input2_offset, \ + input2_multiplier, input2_shift, output_data, \ + output_dims); \ + } \ + template <typename T> \ + inline void Broadcast##name( \ + const T* input1_data, const Dims<4>& input1_dims, const T* input2_data, \ + const Dims<4>& input2_dims, bool* output_data, \ + const Dims<4>& output_dims) { \ + gemmlowp::ScopedProfilingLabel label("Broadcast" #name); \ + BroadcastComparison<T, name##Fn>(input1_data, input1_dims, input2_data, \ + input2_dims, output_data, output_dims); \ + } \ + template <typename T> \ + inline void Broadcast##name( \ + int left_shift, const T* input1_data, const Dims<4>& input1_dims, \ + int32 input1_offset, int32 input1_multiplier, int input1_shift, \ + const T* input2_data, const Dims<4>& input2_dims, int32 input2_offset, \ + int32 input2_multiplier, int input2_shift, bool* output_data, \ + const Dims<4>& output_dims) { \ + gemmlowp::ScopedProfilingLabel label("Broadcast" #name "/8bit"); \ + BroadcastComparison<T, name##Fn>(left_shift, input1_data, input1_dims, \ + input1_offset, input1_multiplier, \ + input1_shift, input2_data, input2_dims, \ + input2_offset, input2_multiplier, \ + input2_shift, output_data, output_dims); \ + } +TFLITE_LEGACY_COMPARISON_OP(Equal); +TFLITE_LEGACY_COMPARISON_OP(NotEqual); +TFLITE_LEGACY_COMPARISON_OP(Greater); +TFLITE_LEGACY_COMPARISON_OP(GreaterEqual); +TFLITE_LEGACY_COMPARISON_OP(Less); +TFLITE_LEGACY_COMPARISON_OP(LessEqual); +#undef TFLITE_LEGACY_COMPARISON_OP + +template <typename D, typename T> +inline void Select(const D* input_condition_data, + const Dims<4>& input_condition_dims, const T* input_x_data, + const Dims<4>& input_x_dims, const T* input_y_data, + const Dims<4>& input_y_dims, T* output_data, + const Dims<4>& output_dims) { + Select(DimsToShape(input_condition_dims), input_condition_data, + DimsToShape(input_x_dims), input_x_data, DimsToShape(input_y_dims), + input_y_data, DimsToShape(output_dims), output_data); +} + +template <typename D, typename T> +inline void RankOneSelect(const D* input_condition_data, + const Dims<4>& input_condition_dims, + const T* input_x_data, const Dims<4>& input_x_dims, + const T* input_y_data, const Dims<4>& input_y_dims, + T* output_data, const Dims<4>& output_dims) { + RankOneSelect(DimsToShape(input_condition_dims), input_condition_data, + DimsToShape(input_x_dims), 
input_x_data, + DimsToShape(input_y_dims), input_y_data, + DimsToShape(output_dims), output_data); +} + +template <typename T, typename TI> +inline void SparseToDense(const std::vector<std::vector<TI>>& indices, + const T* values, T default_value, T* output_data, + const Dims<4>& output_dims, bool value_is_scalar) { + SparseToDense(indices, values, default_value, value_is_scalar, + DimsToShape(output_dims), output_data); +} + +template <typename Scalar> +void Pack(int dim, const Scalar* const* input_data, + const Dims<4>* const* input_dims, int inputs_count, + Scalar* output_data, const Dims<4>& output_dims) { + std::vector<RuntimeShape> input_shapes(inputs_count); + std::vector<const RuntimeShape*> input_shapes_indirect(inputs_count); + for (int i = 0; i < inputs_count; ++i) { + ShapeFromDims(*input_dims[i], &input_shapes[i]); + input_shapes_indirect[i] = &input_shapes[i]; + } + tflite::PackParams op_params; + op_params.axis = 3 - dim; + op_params.inputs_count = inputs_count; + + Pack(op_params, input_shapes_indirect.data(), input_data, + DimsToShape(output_dims), output_data); +} + +template <typename Scalar> +void Unpack(int axis, const Scalar* input_data, const Dims<4>& input_dims, + int dimensions, int outputs_count, Scalar* const* output_datas, + const Dims<4>& output_dims) { + tflite::UnpackParams op_params; + op_params.axis = 3 - axis; + op_params.num_split = outputs_count; + + Unpack(op_params, DimsToShape(input_dims), input_data, + DimsToShape(output_dims), output_datas); +} + +template <typename Scalar> +void Pack(int dim, const Scalar* const* input_data, + const Dims<4>* const* input_dims, const int32* input_zeropoint, + const float* input_scale, int inputs_count, Scalar* output_data, + const Dims<4>& output_dims, const int32 output_zeropoint, + const float output_scale) { + std::vector<RuntimeShape> input_shapes(inputs_count); + std::vector<const RuntimeShape*> input_shapes_indirect(inputs_count); + for (int i = 0; i < inputs_count; ++i) { + ShapeFromDims(*input_dims[i], &input_shapes[i]); + input_shapes_indirect[i] = &input_shapes[i]; + } + tflite::PackParams op_params; + op_params.axis = 3 - dim; + op_params.input_zeropoint = input_zeropoint; + op_params.input_scale = input_scale; + op_params.inputs_count = inputs_count; + op_params.output_zeropoint = output_zeropoint; + op_params.output_scale = output_scale; + + PackWithScaling(op_params, input_shapes_indirect.data(), input_data, + DimsToShape(output_dims), output_data); +} + +template <FusedActivationFunctionType Ac> +void L2Normalization(const float* input_data, const RuntimeShape& input_shape, + float* output_data, const RuntimeShape& output_shape) { + static_assert(Ac == FusedActivationFunctionType::kNone, ""); + tflite::L2NormalizationParams op_params; + // No params need to be set for float. 
+ + L2Normalization(op_params, input_shape, input_data, output_shape, + output_data); +} + +inline void L2Normalization(const uint8* input_data, + const RuntimeShape& input_shape, + int32 input_zero_point, uint8* output_data, + const RuntimeShape& output_shape) { + tflite::L2NormalizationParams op_params; + op_params.input_zero_point = input_zero_point; + + L2Normalization(op_params, input_shape, input_data, output_shape, + output_data); +} + +template <FusedActivationFunctionType Ac> +void L2Normalization(const float* input_data, const Dims<4>& input_dims, + float* output_data, const Dims<4>& output_dims) { + L2Normalization<Ac>(input_data, DimsToShape(input_dims), output_data, + DimsToShape(output_dims)); +} + +inline void L2Normalization(const uint8* input_data, const Dims<4>& input_dims, + int32 input_zero_point, uint8* output_data, + const Dims<4>& output_dims) { + L2Normalization(input_data, DimsToShape(input_dims), input_zero_point, + output_data, DimsToShape(output_dims)); +} + +inline void Relu(const float* input_data, const Dims<4>& input_dims, + float* output_data, const Dims<4>& output_dims) { + Relu(DimsToShape(input_dims), input_data, DimsToShape(output_dims), + output_data); +} + +inline void Relu1(const float* input_data, const Dims<4>& input_dims, + float* output_data, const Dims<4>& output_dims) { + Relu1(DimsToShape(input_dims), input_data, DimsToShape(output_dims), + output_data); +} + +inline void Relu6(const float* input_data, const Dims<4>& input_dims, + float* output_data, const Dims<4>& output_dims) { + Relu6(DimsToShape(input_dims), input_data, DimsToShape(output_dims), + output_data); +} + +inline void ReluX(uint8 min_value, uint8 max_value, const uint8* input_data, + const RuntimeShape& input_shape, uint8* output_data, + const RuntimeShape& output_shape) { + tflite::ActivationParams params; + params.quantized_activation_max = max_value; + params.quantized_activation_min = min_value; + ReluX(params, input_shape, input_data, output_shape, output_data); +} + +template <FusedActivationFunctionType Ac> +inline void Add(int left_shift, const uint8* input1_data, + const Dims<4>& input1_dims, int32 input1_offset, + int32 input1_multiplier, int input1_shift, + const uint8* input2_data, const Dims<4>& input2_dims, + int32 input2_offset, int32 input2_multiplier, int input2_shift, + int32 output_offset, int32 output_multiplier, int output_shift, + int32 output_activation_min, int32 output_activation_max, + uint8* output_data, const Dims<4>& output_dims) { + constexpr int kReverseShift = -1; + static_assert(Ac == FusedActivationFunctionType::kNone || + Ac == FusedActivationFunctionType::kRelu || + Ac == FusedActivationFunctionType::kRelu6 || + Ac == FusedActivationFunctionType::kRelu1, + ""); + TFLITE_DCHECK_LE(output_activation_min, output_activation_max); + if (Ac == FusedActivationFunctionType::kNone) { + TFLITE_DCHECK_EQ(output_activation_min, 0); + TFLITE_DCHECK_EQ(output_activation_max, 255); + } + + tflite::ArithmeticParams op_params; + op_params.left_shift = left_shift; + op_params.input1_offset = input1_offset; + op_params.input1_multiplier = input1_multiplier; + op_params.input1_shift = kReverseShift * input1_shift; + op_params.input2_offset = input2_offset; + op_params.input2_multiplier = input2_multiplier; + op_params.input2_shift = kReverseShift * input2_shift; + op_params.output_offset = output_offset; + op_params.output_multiplier = output_multiplier; + op_params.output_shift = kReverseShift * output_shift; + op_params.quantized_activation_min = 
output_activation_min; + op_params.quantized_activation_max = output_activation_max; + Add(op_params, DimsToShape(input1_dims), input1_data, + DimsToShape(input2_dims), input2_data, DimsToShape(output_dims), + output_data); +} + +template <FusedActivationFunctionType Ac> +void Add(const int32* input1_data, const Dims<4>& input1_dims, + const int32* input2_data, const Dims<4>& input2_dims, + int32* output_data, const Dims<4>& output_dims) { + gemmlowp::ScopedProfilingLabel label("Add/int32"); + TFLITE_DCHECK(Ac == FusedActivationFunctionType::kNone); + + tflite::ArithmeticParams op_params; + op_params.quantized_activation_min = std::numeric_limits<int32>::min(); + op_params.quantized_activation_max = std::numeric_limits<int32>::max(); + Add(op_params, DimsToShape(input1_dims), input1_data, + DimsToShape(input2_dims), input2_data, DimsToShape(output_dims), + output_data); +} + +template <FusedActivationFunctionType Ac> +inline void BroadcastAdd(int left_shift, const uint8* input1_data, + const Dims<4>& input1_dims, int32 input1_offset, + int32 input1_multiplier, int input1_shift, + const uint8* input2_data, const Dims<4>& input2_dims, + int32 input2_offset, int32 input2_multiplier, + int input2_shift, int32 output_offset, + int32 output_multiplier, int output_shift, + int32 output_activation_min, + int32 output_activation_max, uint8* output_data, + const Dims<4>& output_dims) { + constexpr int kReverseShift = -1; + static_assert(Ac == FusedActivationFunctionType::kNone || + Ac == FusedActivationFunctionType::kRelu || + Ac == FusedActivationFunctionType::kRelu6 || + Ac == FusedActivationFunctionType::kRelu1, + ""); + TFLITE_DCHECK_LE(output_activation_min, output_activation_max); + if (Ac == FusedActivationFunctionType::kNone) { + TFLITE_DCHECK_EQ(output_activation_min, 0); + TFLITE_DCHECK_EQ(output_activation_max, 255); + } + + tflite::ArithmeticParams op_params; + op_params.left_shift = left_shift; + op_params.input1_offset = input1_offset; + op_params.input1_multiplier = input1_multiplier; + op_params.input1_shift = kReverseShift * input1_shift; + op_params.input2_offset = input2_offset; + op_params.input2_multiplier = input2_multiplier; + op_params.input2_shift = kReverseShift * input2_shift; + op_params.output_offset = output_offset; + op_params.output_multiplier = output_multiplier; + op_params.output_shift = kReverseShift * output_shift; + op_params.quantized_activation_min = output_activation_min; + op_params.quantized_activation_max = output_activation_max; + BroadcastAdd4DSlow(op_params, DimsToShape(input1_dims), input1_data, + DimsToShape(input2_dims), input2_data, + DimsToShape(output_dims), output_data); +} + +template <FusedActivationFunctionType Ac> +void Add(const float* input1_data, const Dims<4>& input1_dims, + const float* input2_data, const Dims<4>& input2_dims, + float* output_data, const Dims<4>& output_dims) { + float output_activation_min, output_activation_max; + GetActivationMinMax(Ac, &output_activation_min, &output_activation_max); + + tflite::ArithmeticParams op_params; + op_params.float_activation_min = output_activation_min; + op_params.float_activation_max = output_activation_max; + Add(op_params, DimsToShape(input1_dims), input1_data, + DimsToShape(input2_dims), input2_data, DimsToShape(output_dims), + output_data); +} + +template <typename T> +void BroadcastAdd(const T* input1_data, const Dims<4>& input1_dims, + const T* input2_data, const Dims<4>& input2_dims, + T output_activation_min, T output_activation_max, + T* output_data, const Dims<4>& output_dims) { 
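// Like the quantized BroadcastAdd above, this funnels into
// BroadcastAdd4DSlow, the generic broadcast path. The BroadcastAddFivefold
// wrapper further down covers the special case where the broadcast pattern is
// described by the five fold sizes y0..y4; note those land in
// broadcast_shape[4]..broadcast_shape[0], reversed like the other
// Dims<4>-era indices in this file.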
+ tflite::ArithmeticParams op_params; + op_params.float_activation_min = output_activation_min; + op_params.float_activation_max = output_activation_max; + BroadcastAdd4DSlow(op_params, DimsToShape(input1_dims), input1_data, + DimsToShape(input2_dims), input2_data, + DimsToShape(output_dims), output_data); +} + +template <FusedActivationFunctionType Ac> +inline void BroadcastAddFivefold( + int y0, int y1, int y2, int y3, int y4, int left_shift, + const uint8* input1_data, const Dims<4>& input1_dims, int32 input1_offset, + int32 input1_multiplier, int input1_shift, const uint8* input2_data, + const Dims<4>& input2_dims, int32 input2_offset, int32 input2_multiplier, + int input2_shift, int32 output_offset, int32 output_multiplier, + int output_shift, int32 output_activation_min, int32 output_activation_max, + uint8* output_data, const Dims<4>& output_dims) { + constexpr int kReverseShift = -1; + static_assert(Ac == FusedActivationFunctionType::kNone || + Ac == FusedActivationFunctionType::kRelu || + Ac == FusedActivationFunctionType::kRelu6 || + Ac == FusedActivationFunctionType::kRelu1, + ""); + TFLITE_DCHECK_LE(output_activation_min, output_activation_max); + if (Ac == FusedActivationFunctionType::kNone) { + TFLITE_DCHECK_EQ(output_activation_min, 0); + TFLITE_DCHECK_EQ(output_activation_max, 255); + } + tflite::ArithmeticParams op_params; + op_params.broadcast_category = + tflite::BroadcastableOpCategory::kFirstInputBroadcastsFast; + op_params.left_shift = left_shift; + op_params.input1_offset = input1_offset; + op_params.input1_multiplier = input1_multiplier; + op_params.input1_shift = kReverseShift * input1_shift; + op_params.input2_offset = input2_offset; + op_params.input2_multiplier = input2_multiplier; + op_params.input2_shift = kReverseShift * input2_shift; + op_params.output_offset = output_offset; + op_params.output_multiplier = output_multiplier; + op_params.output_shift = kReverseShift * output_shift; + op_params.quantized_activation_min = output_activation_min; + op_params.quantized_activation_max = output_activation_max; + op_params.broadcast_shape[4] = y0; + op_params.broadcast_shape[3] = y1; + op_params.broadcast_shape[2] = y2; + op_params.broadcast_shape[1] = y3; + op_params.broadcast_shape[0] = y4; + BroadcastAddFivefold(op_params, DimsToShape(input1_dims), input1_data, + DimsToShape(input2_dims), input2_data, + DimsToShape(output_dims), output_data); +} + +// legacy, for compatibility with old checked-in code +template <FusedActivationFunctionType Ac, typename T> +void BroadcastAdd(const T* input1_data, const Dims<4>& input1_dims, + const T* input2_data, const Dims<4>& input2_dims, + T* output_data, const Dims<4>& output_dims) { + T output_activation_min, output_activation_max; + GetActivationMinMax(Ac, &output_activation_min, &output_activation_max); + + BroadcastAdd(input1_data, input1_dims, input2_data, input2_dims, + output_activation_min, output_activation_max, output_data, + output_dims); +} + +template <FusedActivationFunctionType Ac> +inline void Add(const int16* input1_data, const Dims<4>& input1_dims, + int input1_shift, const int16* input2_data, + const Dims<4>& input2_dims, int input2_shift, + int16 output_activation_min, int16 output_activation_max, + int16* output_data, const Dims<4>& output_dims) { + static_assert(Ac == FusedActivationFunctionType::kNone || + Ac == FusedActivationFunctionType::kRelu || + Ac == FusedActivationFunctionType::kRelu6 || + Ac == FusedActivationFunctionType::kRelu1, + ""); + TFLITE_DCHECK_LE(output_activation_min, 
output_activation_max); + if (Ac == FusedActivationFunctionType::kNone) { + TFLITE_DCHECK_EQ(output_activation_min, -32768); + TFLITE_DCHECK_EQ(output_activation_max, 32767); + } + + tflite::ArithmeticParams op_params; + op_params.input1_shift = kReverseShift * input1_shift; + op_params.input2_shift = kReverseShift * input2_shift; + op_params.quantized_activation_min = output_activation_min; + op_params.quantized_activation_max = output_activation_max; + Add(op_params, DimsToShape(input1_dims), input1_data, + DimsToShape(input2_dims), input2_data, DimsToShape(output_dims), + output_data); +} + +inline void Sub(const float* input1_data, const Dims<4>& input1_dims, + const float* input2_data, const Dims<4>& input2_dims, + float* output_data, const Dims<4>& output_dims) { + float output_activation_min, output_activation_max; + GetActivationMinMax(FusedActivationFunctionType::kNone, + &output_activation_min, &output_activation_max); + tflite::ArithmeticParams op_params; + op_params.float_activation_min = output_activation_min; + op_params.float_activation_max = output_activation_max; + Sub(op_params, DimsToShape(input1_dims), input1_data, + DimsToShape(input2_dims), input2_data, DimsToShape(output_dims), + output_data); +} + +template <typename T> +void Sub(const T* input1_data, const Dims<4>& input1_dims, const T* input2_data, + const Dims<4>& input2_dims, T* output_data, + const Dims<4>& output_dims) { + tflite::ArithmeticParams op_params; + op_params.quantized_activation_min = std::numeric_limits<T>::min(); + op_params.quantized_activation_max = std::numeric_limits<T>::max(); + Sub(op_params, DimsToShape(input1_dims), input1_data, + DimsToShape(input2_dims), input2_data, DimsToShape(output_dims), + output_data); +} + +inline void AveragePool(const float* input_data, const Dims<4>& input_dims, + int stride_width, int stride_height, int pad_width, + int pad_height, int kwidth, int kheight, + float output_activation_min, + float output_activation_max, float* output_data, + const Dims<4>& output_dims) { + tflite::PoolParams params; + params.stride_height = stride_height; + params.stride_width = stride_width; + params.filter_height = kheight; + params.filter_width = kwidth; + params.padding_values.height = pad_height; + params.padding_values.width = pad_width; + params.float_activation_min = output_activation_min; + params.float_activation_max = output_activation_max; + AveragePool(params, DimsToShape(input_dims), input_data, + DimsToShape(output_dims), output_data); +} + +// Transitional version that will be moved shortly to legacy_reference_ops, as +// part of RuntimeShape revisions. 
+inline void BroadcastMul4DSlow(const uint8* input1_data, + const Dims<4>& input1_dims, int32 input1_offset, + const uint8* input2_data, + const Dims<4>& input2_dims, int32 input2_offset, + int32 output_offset, int32 output_multiplier, + int output_shift, int32 output_activation_min, + int32 output_activation_max, uint8* output_data, + const Dims<4>& output_dims) { + tflite::ArithmeticParams op_params; + SetActivationParams(output_activation_min, output_activation_max, &op_params); + op_params.input1_offset = input1_offset; + op_params.input2_offset = input2_offset; + op_params.output_offset = output_offset; + op_params.output_multiplier = output_multiplier; + op_params.output_shift = output_shift; + + BroadcastMul4DSlow(op_params, DimsToShape(input1_dims), input1_data, + DimsToShape(input2_dims), input2_data, + DimsToShape(output_dims), output_data); +} + +inline void BroadcastMul(const uint8* input1_data, const Dims<4>& input1_dims, + int32 input1_offset, const uint8* input2_data, + const Dims<4>& input2_dims, int32 input2_offset, + int32 output_offset, int32 output_multiplier, + int output_shift, int32 output_activation_min, + int32 output_activation_max, uint8* output_data, + const Dims<4>& output_dims) { + BroadcastMul4DSlow( + input1_data, input1_dims, input1_offset, input2_data, input2_dims, + input2_offset, output_offset, output_multiplier, + // + kReverseShift * output_shift, + // + output_activation_min, output_activation_max, output_data, output_dims); +} + +// legacy, for compatibility with old checked-in code +template <FusedActivationFunctionType Ac> +inline void BroadcastMul(const uint8* input1_data, const Dims<4>& input1_dims, + int32 input1_offset, const uint8* input2_data, + const Dims<4>& input2_dims, int32 input2_offset, + int32 output_offset, int32 output_multiplier, + int output_shift, int32 output_activation_min, + int32 output_activation_max, uint8* output_data, + const Dims<4>& output_dims) { + BroadcastMul(input1_data, input1_dims, input1_offset, input2_data, + input2_dims, input2_offset, output_offset, output_multiplier, + output_shift, output_activation_min, output_activation_max, + output_data, output_dims); +} + +// legacy, for compatibility with old checked-in code +template <FusedActivationFunctionType Ac> +void AveragePool(const float* input_data, const Dims<4>& input_dims, + int stride_width, int stride_height, int pad_width, + int pad_height, int kwidth, int kheight, float* output_data, + const Dims<4>& output_dims) { + float output_activation_min, output_activation_max; + GetActivationMinMax(Ac, &output_activation_min, &output_activation_max); + + AveragePool(input_data, input_dims, stride_width, stride_height, pad_width, + pad_height, kwidth, kheight, output_activation_min, + output_activation_max, output_data, output_dims); +} + +// legacy, for compatibility with old checked-in code +template <FusedActivationFunctionType Ac> +void AveragePool(const float* input_data, const Dims<4>& input_dims, int stride, + int pad_width, int pad_height, int filter_width, + int filter_height, float* output_data, + const Dims<4>& output_dims) { + AveragePool<Ac>(input_data, input_dims, stride, stride, pad_width, pad_height, + filter_width, filter_height, output_data, output_dims); +} + +inline void AveragePool(const uint8* input_data, const Dims<4>& input_dims, + int stride_width, int stride_height, int pad_width, + int pad_height, int filter_width, int filter_height, + int32 output_activation_min, + int32 output_activation_max, uint8* output_data, + const Dims<4>& 
output_dims) { + tflite::PoolParams params; + params.stride_height = stride_height; + params.stride_width = stride_width; + params.filter_height = filter_height; + params.filter_width = filter_width; + params.padding_values.height = pad_height; + params.padding_values.width = pad_width; + params.quantized_activation_min = output_activation_min; + params.quantized_activation_max = output_activation_max; + AveragePool(params, DimsToShape(input_dims), input_data, + DimsToShape(output_dims), output_data); +} + +// legacy, for compatibility with old checked-in code +template <FusedActivationFunctionType Ac> +void AveragePool(const uint8* input_data, const Dims<4>& input_dims, + int stride_width, int stride_height, int pad_width, + int pad_height, int filter_width, int filter_height, + int32 output_activation_min, int32 output_activation_max, + uint8* output_data, const Dims<4>& output_dims) { + static_assert(Ac == FusedActivationFunctionType::kNone || + Ac == FusedActivationFunctionType::kRelu || + Ac == FusedActivationFunctionType::kRelu6 || + Ac == FusedActivationFunctionType::kRelu1, + ""); + if (Ac == FusedActivationFunctionType::kNone) { + TFLITE_DCHECK_EQ(output_activation_min, 0); + TFLITE_DCHECK_EQ(output_activation_max, 255); + } + AveragePool(input_data, input_dims, stride_width, stride_height, pad_width, + pad_height, filter_width, filter_height, output_activation_min, + output_activation_max, output_data, output_dims); +} + +// legacy, for compatibility with old checked-in code +template <FusedActivationFunctionType Ac> +void AveragePool(const uint8* input_data, const Dims<4>& input_dims, int stride, + int pad_width, int pad_height, int filter_width, + int filter_height, int32 output_activation_min, + int32 output_activation_max, uint8* output_data, + const Dims<4>& output_dims) { + AveragePool<Ac>(input_data, input_dims, stride, stride, pad_width, pad_height, + filter_width, filter_height, output_activation_min, + output_activation_max, output_data, output_dims); +} + +inline void MaxPool(const float* input_data, const Dims<4>& input_dims, + int stride_width, int stride_height, int pad_width, + int pad_height, int kwidth, int kheight, + float output_activation_min, float output_activation_max, + float* output_data, const Dims<4>& output_dims) { + tflite::PoolParams params; + params.stride_height = stride_height; + params.stride_width = stride_width; + params.filter_height = kheight; + params.filter_width = kwidth; + params.padding_values.height = pad_height; + params.padding_values.width = pad_width; + params.float_activation_min = output_activation_min; + params.float_activation_max = output_activation_max; + MaxPool(params, DimsToShape(input_dims), input_data, DimsToShape(output_dims), + output_data); +} + +// legacy, for compatibility with old checked-in code +template <FusedActivationFunctionType Ac> +void MaxPool(const float* input_data, const Dims<4>& input_dims, + int stride_width, int stride_height, int pad_width, int pad_height, + int kwidth, int kheight, float* output_data, + const Dims<4>& output_dims) { + float output_activation_min, output_activation_max; + GetActivationMinMax(Ac, &output_activation_min, &output_activation_max); + MaxPool(input_data, input_dims, stride_width, stride_height, pad_width, + pad_height, kwidth, kheight, output_activation_min, + output_activation_max, output_data, output_dims); +} + +// legacy, for compatibility with old checked-in code +template <FusedActivationFunctionType Ac> +void MaxPool(const float* input_data, const Dims<4>& 
input_dims, int stride, + int pad_width, int pad_height, int filter_width, int filter_height, + float* output_data, const Dims<4>& output_dims) { + MaxPool<Ac>(input_data, input_dims, stride, stride, pad_width, pad_height, + filter_width, filter_height, output_data, output_dims); +} + +inline void MaxPool(const uint8* input_data, const Dims<4>& input_dims, + int stride_width, int stride_height, int pad_width, + int pad_height, int filter_width, int filter_height, + int32 output_activation_min, int32 output_activation_max, + uint8* output_data, const Dims<4>& output_dims) { + PoolParams params; + params.stride_height = stride_height; + params.stride_width = stride_width; + params.filter_height = filter_height; + params.filter_width = filter_width; + params.padding_values.height = pad_height; + params.padding_values.width = pad_width; + params.quantized_activation_min = output_activation_min; + params.quantized_activation_max = output_activation_max; + MaxPool(params, DimsToShape(input_dims), input_data, DimsToShape(output_dims), + output_data); +} + +// legacy, for compatibility with old checked-in code +template <FusedActivationFunctionType Ac> +void MaxPool(const uint8* input_data, const Dims<4>& input_dims, + int stride_width, int stride_height, int pad_width, int pad_height, + int filter_width, int filter_height, int32 output_activation_min, + int32 output_activation_max, uint8* output_data, + const Dims<4>& output_dims) { + static_assert(Ac == FusedActivationFunctionType::kNone || + Ac == FusedActivationFunctionType::kRelu || + Ac == FusedActivationFunctionType::kRelu6 || + Ac == FusedActivationFunctionType::kRelu1, + ""); + if (Ac == FusedActivationFunctionType::kNone) { + TFLITE_DCHECK_EQ(output_activation_min, 0); + TFLITE_DCHECK_EQ(output_activation_max, 255); + } + MaxPool(input_data, input_dims, stride_width, stride_height, pad_width, + pad_height, filter_width, filter_height, output_activation_min, + output_activation_max, output_data, output_dims); +} + +// legacy, for compatibility with old checked-in code +template <FusedActivationFunctionType Ac> +void MaxPool(const uint8* input_data, const Dims<4>& input_dims, int stride, + int pad_width, int pad_height, int filter_width, int filter_height, + int32 output_activation_min, int32 output_activation_max, + uint8* output_data, const Dims<4>& output_dims) { + MaxPool<Ac>(input_data, input_dims, stride, stride, pad_width, pad_height, + filter_width, filter_height, output_activation_min, + output_activation_max, output_data, output_dims); +} + +inline void L2Pool(const float* input_data, const Dims<4>& input_dims, + int stride_width, int stride_height, int pad_width, + int pad_height, int filter_width, int filter_height, + float output_activation_min, float output_activation_max, + float* output_data, const Dims<4>& output_dims) { + PoolParams params; + params.stride_height = stride_height; + params.stride_width = stride_width; + params.filter_height = filter_height; + params.filter_width = filter_width; + params.padding_values.height = pad_height; + params.padding_values.width = pad_width; + params.float_activation_min = output_activation_min; + params.float_activation_max = output_activation_max; + L2Pool(params, DimsToShape(input_dims), input_data, DimsToShape(output_dims), + output_data); +} + +// legacy, for compatibility with old checked-in code +template <FusedActivationFunctionType Ac> +void L2Pool(const float* input_data, const Dims<4>& input_dims, + int stride_width, int stride_height, int pad_width, int pad_height, + int 
filter_width, int filter_height, float* output_data, + const Dims<4>& output_dims) { + float output_activation_min, output_activation_max; + GetActivationMinMax(Ac, &output_activation_min, &output_activation_max); + L2Pool(input_data, input_dims, stride_width, stride_height, pad_width, + pad_height, filter_width, filter_height, output_activation_min, + output_activation_max, output_data, output_dims); +} + +// legacy, for compatibility with old checked-in code +template <FusedActivationFunctionType Ac> +void L2Pool(const float* input_data, const Dims<4>& input_dims, int stride, + int pad_width, int pad_height, int filter_width, int filter_height, + float* output_data, const Dims<4>& output_dims) { + L2Pool<Ac>(input_data, input_dims, stride, stride, pad_width, pad_height, + filter_width, filter_height, output_data, output_dims); +} + +inline void Softmax(const float* input_data, const Dims<4>& input_dims, + float beta, float* output_data, + const Dims<4>& output_dims) { + Softmax(input_data, DimsToShape(input_dims), beta, output_data, + DimsToShape(output_dims)); +} + +inline void Softmax(const uint8* input_data, const Dims<4>& input_dims, + int32 input_beta_multiplier, int32 input_beta_left_shift, + int diff_min, uint8* output_data, + const Dims<4>& output_dims) { + Softmax(input_data, DimsToShape(input_dims), input_beta_multiplier, + input_beta_left_shift, diff_min, output_data, + DimsToShape(output_dims)); +} + +inline void LogSoftmax(const float* input_data, const Dims<4>& input_dims, + float* output_data, const Dims<4>& output_dims) { + LogSoftmax(input_data, DimsToShape(input_dims), output_data, + DimsToShape(output_dims)); +} + +inline void LogSoftmax(const uint8* input_data, const Dims<4>& input_dims, + int32 input_multiplier, int32 input_left_shift, + int32 reverse_scaling_divisor, + int32 reverse_scaling_right_shift, int diff_min, + uint8* output_data, const Dims<4>& output_dims) { + LogSoftmax(input_data, DimsToShape(input_dims), input_multiplier, + input_left_shift, reverse_scaling_divisor, + reverse_scaling_right_shift, diff_min, output_data, + DimsToShape(output_dims)); +} + +inline void Logistic(const float* input_data, const Dims<4>& input_dims, + float* output_data, const Dims<4>& output_dims) { + Logistic(DimsToShape(input_dims), input_data, DimsToShape(output_dims), + output_data); +} + +inline void Logistic(const uint8* input_data, const Dims<4>& input_dims, + int32 input_zero_point, int32 input_range_radius, + int32 input_multiplier, int input_left_shift, + uint8* output_data, const Dims<4>& output_dims) { + Logistic(input_data, DimsToShape(input_dims), input_zero_point, + input_range_radius, input_multiplier, input_left_shift, output_data, + DimsToShape(output_dims)); +} + +inline void Logistic(const int16* input_data, const Dims<4>& input_dims, + int16* output_data, const Dims<4>& output_dims) { + Logistic(DimsToShape(input_dims), input_data, DimsToShape(output_dims), + output_data); +} + +inline void Tanh(const float* input_data, const Dims<4>& input_dims, + float* output_data, const Dims<4>& output_dims) { + Tanh(DimsToShape(input_dims), input_data, DimsToShape(output_dims), + output_data); +} + +inline void Tanh(const uint8* input_data, const Dims<4>& input_dims, + int32 input_zero_point, int32 input_range_radius, + int32 input_multiplier, int input_left_shift, + uint8* output_data, const Dims<4>& output_dims) { + Tanh(input_data, DimsToShape(input_dims), input_zero_point, + input_range_radius, input_multiplier, input_left_shift, output_data, + 
DimsToShape(output_dims)); +} + +inline void Tanh(const int16* input_data, const Dims<4>& input_dims, + int input_left_shift, int16* output_data, + const Dims<4>& output_dims) { + Tanh(input_data, DimsToShape(input_dims), input_left_shift, output_data, + DimsToShape(output_dims)); +} + +template <typename T> +inline void DepthToSpace(const T* input_data, const Dims<4>& input_dims, + int block_size, T* output_data, + const Dims<4>& output_dims) { + tflite::DepthToSpaceParams op_params; + op_params.block_size = block_size; + + DepthToSpace(op_params, DimsToShape(input_dims), input_data, + DimsToShape(output_dims), output_data); +} + +template <typename T> +inline void SpaceToDepth(const T* input_data, const Dims<4>& input_dims, + int block_size, T* output_data, + const Dims<4>& output_dims) { + tflite::SpaceToDepthParams op_params; + op_params.block_size = block_size; + + SpaceToDepth(op_params, DimsToShape(input_dims), input_data, + DimsToShape(output_dims), output_data); +} + +template <typename T> +inline void Mul(const T* input1_data, const Dims<4>& input1_dims, + const T* input2_data, const Dims<4>& input2_dims, + T output_activation_min, T output_activation_max, + T* output_data, const Dims<4>& output_dims) { + tflite::ArithmeticParams op_params; + SetActivationParams(output_activation_min, output_activation_max, &op_params); + + Mul(op_params, DimsToShape(input1_dims), input1_data, + DimsToShape(input2_dims), input2_data, DimsToShape(output_dims), + output_data); +} + +// legacy, for compatibility with old checked-in code +template <FusedActivationFunctionType Ac> +void Mul(const float* input1_data, const Dims<4>& input1_dims, + const float* input2_data, const Dims<4>& input2_dims, + float* output_data, const Dims<4>& output_dims) { + float output_activation_min, output_activation_max; + GetActivationMinMax(Ac, &output_activation_min, &output_activation_max); + + tflite::ArithmeticParams op_params; + SetActivationParams(output_activation_min, output_activation_max, &op_params); + + Mul(op_params, DimsToShape(input1_dims), input1_data, + DimsToShape(input2_dims), input2_data, DimsToShape(output_dims), + output_data); +} + +template <typename T> +void BroadcastMul(const T* input1_data, const Dims<4>& input1_dims, + const T* input2_data, const Dims<4>& input2_dims, + T output_activation_min, T output_activation_max, + T* output_data, const Dims<4>& output_dims) { + tflite::ArithmeticParams op_params; + SetActivationParams(output_activation_min, output_activation_max, &op_params); + + BroadcastMul4DSlow(op_params, DimsToShape(input1_dims), input1_data, + DimsToShape(input2_dims), input2_data, + DimsToShape(output_dims), output_data); +} + +// legacy, for compatibility with old checked-in code +template <FusedActivationFunctionType Ac, typename T> +void BroadcastMul(const T* input1_data, const Dims<4>& input1_dims, + const T* input2_data, const Dims<4>& input2_dims, + T* output_data, const Dims<4>& output_dims) { + T output_activation_min, output_activation_max; + GetActivationMinMax(Ac, &output_activation_min, &output_activation_max); + + tflite::ArithmeticParams op_params; + SetActivationParams(output_activation_min, output_activation_max, &op_params); + + BroadcastMul4DSlow(op_params, DimsToShape(input1_dims), input1_data, + DimsToShape(input2_dims), input2_data, + DimsToShape(output_dims), output_data); +} + +inline void Mul(const int16* input1_data, const Dims<4>& input1_dims, + const int16* input2_data, const Dims<4>& input2_dims, + int16* output_data, const Dims<4>& output_dims) { + 
tflite::ArithmeticParams op_params; + // No params in this version. + + Mul(op_params, DimsToShape(input1_dims), input1_data, + DimsToShape(input2_dims), input2_data, DimsToShape(output_dims), + output_data); +} + +inline void Mul(const int16* input1_data, const Dims<4>& input1_dims, + const int16* input2_data, const Dims<4>& input2_dims, + int32 output_offset, int32 output_activation_min, + int32 output_activation_max, uint8* output_data, + const Dims<4>& output_dims) { + tflite::ArithmeticParams op_params; + op_params.quantized_activation_min = output_activation_min; + op_params.quantized_activation_max = output_activation_max; + op_params.output_offset = output_offset; + + Mul(op_params, DimsToShape(input1_dims), input1_data, + DimsToShape(input2_dims), input2_data, DimsToShape(output_dims), + output_data); +} + +inline void LocalResponseNormalization(const float* input_data, + const Dims<4>& input_dims, int range, + float bias, float alpha, float beta, + float* output_data, + const Dims<4>& output_dims) { + tflite::LocalResponseNormalizationParams op_params; + op_params.range = range; + op_params.bias = bias; + op_params.alpha = alpha; + op_params.beta = beta; + + LocalResponseNormalization(op_params, DimsToShape(input_dims), input_data, + DimsToShape(output_dims), output_data); +} + +template <typename SrcT, typename DstT> +void Cast(const SrcT* input_data, const Dims<4>& input_dims, DstT* output_data, + const Dims<4>& output_dims) { + Cast(DimsToShape(input_dims), input_data, DimsToShape(output_dims), + output_data); +} + +inline void Floor(const float* input_data, const Dims<4>& input_dims, + float* output_data, const Dims<4>& output_dims) { + Floor(DimsToShape(input_dims), input_data, DimsToShape(output_dims), + output_data); +} + +template <typename T> +inline void ResizeBilinear(const T* input_data, const Dims<4>& input_dims, + const int32* output_size_data, + const Dims<4>& output_size_dims, T* output_data, + const Dims<4>& output_dims, bool align_corners) { + tflite::ResizeBilinearParams op_params; + op_params.align_corners = align_corners; + ResizeBilinear(op_params, DimsToShape(input_dims), input_data, + DimsToShape(output_size_dims), output_size_data, + DimsToShape(output_dims), output_data); +} + +// legacy, for compatibility with old checked-in code +inline void ResizeBilinear(const float* input_data, const Dims<4>& input_dims, + const int32* output_size_data, + const Dims<4>& output_size_dims, float* output_data, + const Dims<4>& output_dims) { + ResizeBilinear<float>(input_data, input_dims, output_size_data, + output_size_dims, output_data, output_dims, + /*align_corners=*/false); +} + +inline void ResizeBilinear(const uint8* input_data, const Dims<4>& input_dims, + const int32* output_size_data, + const Dims<4>& output_size_dims, uint8* output_data, + const Dims<4>& output_dims) { + ResizeBilinear<uint8>(input_data, input_dims, output_size_data, + output_size_dims, output_data, output_dims, + /*align_corners=*/false); +} + +template <typename T> +inline void SpaceToBatchND(const T* input_data, const Dims<4>& input_dims, + const int32* block_shape_data, + const Dims<4>& block_shape_dims, + const int32* paddings_data, + const Dims<4>& paddings_dims, T* output_data, + const Dims<4>& output_dims, + const int32_t pad_value) { + tflite::SpaceToBatchParams op_params; + op_params.output_offset = pad_value; + + SpaceToBatchND(op_params, DimsToShape(input_dims), input_data, + DimsToShape(block_shape_dims), block_shape_data, + DimsToShape(paddings_dims), paddings_data, + 
DimsToShape(output_dims), output_data); +} + +template <typename T> +inline void SpaceToBatchND(const T* input_data, const Dims<4>& input_dims, + const int32* block_shape_data, + const Dims<4>& block_shape_dims, + const int32* paddings_data, + const Dims<4>& paddings_dims, T* output_data, + const Dims<4>& output_dims) { + tflite::SpaceToBatchParams op_params; + op_params.output_offset = 0; + + SpaceToBatchND(op_params, DimsToShape(input_dims), input_data, + DimsToShape(block_shape_dims), block_shape_data, + DimsToShape(paddings_dims), paddings_data, + DimsToShape(output_dims), output_data); +} + +template <typename T> +inline void BatchToSpaceND(const T* input_data, const Dims<4>& input_dims, + const int32* block_shape_data, + const Dims<4>& block_shape_dims, + const int32* crops_data, const Dims<4>& crops_dims, + T* output_data, const Dims<4>& output_dims) { + BatchToSpaceND(DimsToShape(input_dims), input_data, + DimsToShape(block_shape_dims), block_shape_data, + DimsToShape(crops_dims), crops_data, DimsToShape(output_dims), + output_data); +} + +// Legacy signature, function covered both Pad and PadV2. +template <typename T> +inline void PadV2(const T* input_data, const Dims<4>& input_dims, + const std::vector<int>& left_paddings, + const std::vector<int>& right_paddings, T* output_data, + const Dims<4>& output_dims, const T pad_value) { + TFLITE_DCHECK_EQ(left_paddings.size(), 4); + TFLITE_DCHECK_EQ(right_paddings.size(), 4); + tflite::PadParams op_params; + op_params.left_padding_count = 4; + op_params.right_padding_count = 4; + for (int i = 0; i < 4; ++i) { + op_params.left_padding[i] = left_paddings[3 - i]; + op_params.right_padding[i] = right_paddings[3 - i]; + } + // SetFloatOrInt(pad_value, &op_params.pad_value); + const T pad_value_copy = pad_value; + + Pad(op_params, DimsToShape(input_dims), input_data, &pad_value_copy, + DimsToShape(output_dims), output_data); +} + +// Old Pad that calls legacy PadV2. +template <typename T> +inline void Pad(const T* input_data, const Dims<4>& input_dims, + const std::vector<int>& left_paddings, + const std::vector<int>& right_paddings, T* output_data, + const Dims<4>& output_dims, const int32_t pad_value) { + const T converted_pad_value = static_cast<T>(pad_value); + PadV2<T>(input_data, input_dims, left_paddings, right_paddings, output_data, + output_dims, converted_pad_value); +} + +// Old Pad that only padded with 0. 
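// Illustrative note (editor's sketch, not part of this diff): the legacy
// wrappers above take paddings in the old Dims<4> order, where index 0 is the
// innermost (depth) dimension, while PadParams stores them in NHWC order with
// index 0 as the batch dimension; hence left_padding[i] = left_paddings[3 - i].
// E.g. with hypothetical legacy paddings {2, 1, 0, 0} (depth 2, width 1,
// height 0, batch 0), the reversed PadParams arrays become {0, 0, 1, 2},
// i.e. {batch, height, width, depth}.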
+template <typename T> +inline void Pad(const T* input_data, const Dims<4>& input_dims, + const std::vector<int>& left_paddings, + const std::vector<int>& right_paddings, T* output_data, + const Dims<4>& output_dims) { + const T pad_value = static_cast<T>(0); + PadV2<T>(input_data, input_dims, left_paddings, right_paddings, output_data, + output_dims, pad_value); +} + +template <typename T> +void TensorFlowMinimum(const T* input1_data, const Dims<4>& input1_dims, + const T* input2_data, T* output_data, + const Dims<4>& output_dims) { + Minimum(DimsToShape(input1_dims), input1_data, input2_data, + DimsToShape(output_dims), output_data); +} + +template <typename T> +void TensorFlowMaximum(const T* input1_data, const Dims<4>& input1_dims, + const T* input2_data, T* output_data, + const Dims<4>& output_dims) { + Maximum(DimsToShape(input1_dims), input1_data, input2_data, + DimsToShape(output_dims), output_data); +} + +template <typename T, typename Op> +void TensorFlowMaximumMinimum(const T* input1_data, const Dims<4>& input1_dims, + const T* input2_data, const Dims<4>& input2_dims, + T* output_data, const Dims<4>& output_dims, + Op op) { + MaximumMinimumBroadcast4DSlow(DimsToShape(input1_dims), input1_data, + DimsToShape(input2_dims), input2_data, + DimsToShape(output_dims), output_data, op); +} + +template <typename T1, typename T2, typename T3> +void ArgMax(const T3* axis, const T1* input_data, + const tflite::Dims<4>& input_dims, T2* output_data, + const tflite::Dims<4>& output_dims) { + ArgMinMax(DimsToShape(input_dims), input_data, axis, DimsToShape(output_dims), + output_data, std::greater<T1>()); +} + +template <typename T1, typename T2, typename T3, typename Cmp> +void ArgMinMax(const T3* axis, const T1* input_data, const Dims<4>& input_dims, + T2* output_data, const Dims<4>& output_dims, const Cmp& cmp) { + ArgMinMax(axis, DimsToShape(input_dims), input_data, DimsToShape(output_dims), + output_data, cmp); +} + +template <typename T> +inline void Pow(const T* input1_data, const Dims<4>& input1_dims, + const T* input2_data, const Dims<4>& input2_dims, + T* output_data, const Dims<4>& output_dims) { + Pow(DimsToShape(input1_dims), input1_data, DimsToShape(input2_dims), + input2_data, DimsToShape(output_dims), output_data); +} + +template <typename T> +inline void BroadcastPow(const T* input1_data, const Dims<4>& input1_dims, + const T* input2_data, const Dims<4>& input2_dims, + T* output_data, const Dims<4>& output_dims) { + BroadcastPow4DSlow(DimsToShape(input1_dims), input1_data, + DimsToShape(input2_dims), input2_data, + DimsToShape(output_dims), output_data); +} + +inline void Logical(const bool* input1_data, const Dims<4>& input1_dims, + const bool* input2_data, const Dims<4>& input2_dims, + bool* output_data, const Dims<4>& output_dims, + const std::function<bool(bool, bool)>& func) { + Logical(DimsToShape(input1_dims), input1_data, DimsToShape(input2_dims), + input2_data, DimsToShape(output_dims), output_data, func); +} + +inline void BroadcastLogical(const bool* input1_data, + const Dims<4>& input1_dims, + const bool* input2_data, + const Dims<4>& input2_dims, bool* output_data, + const Dims<4>& output_dims, + const std::function<bool(bool, bool)>& func) { + BroadcastLogical4DSlow(DimsToShape(input1_dims), input1_data, + DimsToShape(input2_dims), input2_data, + DimsToShape(output_dims), output_data, func); +} + +// R: Result type. T1: Input 1 type. T2: Input 2 type. 
+template <typename R, typename T1, typename T2> +inline void BroadcastBinaryFunction(const T1* input1_data, + const Dims<4>& input1_dims, + const T2* input2_data, + const Dims<4>& input2_dims, R* output_data, + const Dims<4>& output_dims, + R (*func)(T1, T2)) { + BroadcastBinaryFunction(DimsToShape(input1_dims), input1_data, + DimsToShape(input2_dims), input2_data, + DimsToShape(output_dims), output_data, func); +} + +// R: Result type. T1: Input 1 type. T2: Input 2 type. +template <typename R, typename T1, typename T2> +inline void BinaryFunction(const T1* input1_data, const Dims<4>& input1_dims, + const T2* input2_data, const Dims<4>& input2_dims, + R* output_data, const Dims<4>& output_dims, + R (*func)(T1, T2)) { + BinaryFunction(DimsToShape(input1_dims), input1_data, + DimsToShape(input2_dims), input2_data, + DimsToShape(output_dims), output_data, func); +} + +template <typename T> +inline void Slice(const T* input_data, const Dims<4>& input_dims, + const std::vector<int>& begin, const std::vector<int>& size, + T* output_data, const Dims<4>& output_dims) { + tflite::SliceParams op_params; + op_params.begin_count = 4; + op_params.size_count = 4; + for (int i = 0; i < 4; ++i) { + op_params.begin[i] = begin[3 - i]; + op_params.size[i] = size[3 - i]; + } + + Slice(op_params, DimsToShape(input_dims), input_data, + DimsToShape(output_dims), output_data); +} + +} // namespace reference_ops +} // namespace tflite +#endif // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_LEGACY_REFERENCE_OPS_H_ diff --git a/tensorflow/contrib/lite/kernels/internal/reference/portable_tensor_utils.cc b/tensorflow/contrib/lite/kernels/internal/reference/portable_tensor_utils.cc index 2607adc0c1..70d25c4bd9 100644 --- a/tensorflow/contrib/lite/kernels/internal/reference/portable_tensor_utils.cc +++ b/tensorflow/contrib/lite/kernels/internal/reference/portable_tensor_utils.cc @@ -14,12 +14,17 @@ limitations under the License. 
==============================================================================*/ #include <stdlib.h> #include <string.h> +#include <algorithm> -#include "tensorflow/contrib/lite/builtin_op_data.h" +#include "tensorflow/contrib/lite/c/builtin_op_data.h" #include "tensorflow/contrib/lite/kernels/activation_functor.h" #include "tensorflow/contrib/lite/kernels/internal/round.h" #include "tensorflow/contrib/lite/kernels/op_macros.h" +#if defined(_MSC_VER) +#define __restrict__ __restrict +#endif + namespace tflite { namespace tensor_utils { @@ -29,23 +34,31 @@ float PortableClip(float f, float abs_limit) { return result; } +bool PortableIsZeroVector(const float* vector, int v_size) { + for (int i = 0; i < v_size; ++i) { + if (*vector++ != 0.0f) return false; + } + return true; +} + void PortableSymmetricQuantizeFloats(const float* values, const int size, - int8_t* quantized_values, float* min, - float* max, float* scaling_factor) { + int8_t* quantized_values, float* min_value, + float* max_value, float* scaling_factor) { auto minmax = std::minmax_element(values, values + size); - *min = *minmax.first; - *max = *minmax.second; + *min_value = *minmax.first; + *max_value = *minmax.second; const int kScale = 127; - const float range = std::max(std::abs(*min), std::abs(*max)); + const float range = std::max(std::abs(*min_value), std::abs(*max_value)); if (range == 0) { memset(quantized_values, 0, size * sizeof(int8_t)); *scaling_factor = 1; return; } - *scaling_factor = kScale / range; + *scaling_factor = range / kScale; + const float scaling_factor_inv = kScale / range; for (int i = 0; i < size; ++i) { const int32_t quantized_value = - static_cast<int32_t>(TfLiteRound(*scaling_factor * values[i])); + static_cast<int32_t>(TfLiteRound(values[i] * scaling_factor_inv)); // Clamp: just in case some odd numeric offset. quantized_values[i] = std::min(kScale, std::max(-kScale, quantized_value)); } @@ -60,10 +73,12 @@ void PortableMatrixBatchVectorMultiplyAccumulate(const float* matrix, for (int b = 0; b < n_batch; b++) { const float* matrix_ptr = matrix; for (int r = 0; r < m_rows; r++) { + float dot_prod = 0.0f; const float* vector_in_batch = vector + b * m_cols; for (int c = 0; c < m_cols; c++) { - *result_in_batch += *matrix_ptr++ * *vector_in_batch++; + dot_prod += *matrix_ptr++ * *vector_in_batch++; } + *result_in_batch += dot_prod; result_in_batch += result_stride; } } @@ -75,20 +90,22 @@ void PortableMatrixBatchVectorMultiplyAccumulate( int n_batch, float* __restrict__ result, int result_stride) { int batch, row, col; for (batch = 0; batch < n_batch; ++batch, vectors += m_cols) { - const float batch_scaling_factor_inv = 1.0 / scaling_factors[batch]; + const float batch_scaling_factor = scaling_factors[batch]; // Get the address of the first row. - int8_t* row_ptr = (int8_t*)matrix; // NOLINT + const int8_t* row_ptr = matrix; for (row = 0; row < m_rows; ++row, result += result_stride) { // Initialize the dot product sum for the row to 0. int32_t dotprod = 0; +#if defined(__GNUC__) // Prefetch the row to cache. __builtin_prefetch(row_ptr, 0 /* prefetch for read */, 3 /* temporal locality */); +#endif // For every block of 16 8-bit elements (128-bit register) from each row. 
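// Illustrative note (editor's sketch, not from this commit): with the revised
// convention above, scaling_factors[batch] is expected to be the
// dequantization scale itself (e.g. range / 127 as now produced by
// PortableSymmetricQuantizeFloats) rather than its inverse, so the int32 dot
// product is mapped back to float by a single multiply below. Hypothetical
// example: with scaling_factor = 2.54f / 127 = 0.02f, an int32 dotprod of
// 5000 contributes 5000 * 0.02f = 100.0f to the accumulator.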
for (col = 0; col < m_cols; ++col, ++row_ptr) { dotprod += (*row_ptr) * (vectors[col]); } // for col - *result += (dotprod * batch_scaling_factor_inv); + *result += (dotprod * batch_scaling_factor); } // for row } // for batch } @@ -134,6 +151,16 @@ void PortableVectorVectorCwiseProductAccumulate(const float* vector1, } } +void PortableVectorBatchVectorCwiseProduct(const float* vector, int v_size, + const float* batch_vector, + int n_batch, float* result) { + for (int b = 0; b < n_batch; b++) { + for (int v = 0; v < v_size; v++) { + *result++ = vector[v] * *batch_vector++; + } + } +} + void PortableVectorBatchVectorCwiseProductAccumulate(const float* vector, int v_size, const float* batch_vector, @@ -146,6 +173,16 @@ void PortableVectorBatchVectorCwiseProductAccumulate(const float* vector, } } +void PortableVectorBatchVectorAdd(const float* vector, int v_size, int n_batch, + float* batch_vector) { + for (int b = 0; b < n_batch; b++) { + for (int i = 0; i < v_size; ++i) { + batch_vector[i] += vector[i]; + } + batch_vector += v_size; + } +} + void PortableVectorBatchVectorAssign(const float* vector, int v_size, int n_batch, float* batch_vector) { for (int b = 0; b < n_batch; b++) { @@ -184,6 +221,13 @@ void PortableZeroVector(float* vector, int v_size) { memset(vector, 0, v_size * sizeof(float)); } +void PortableVectorScalarMultiply(const int8_t* vector, const int v_size, + const float scale, float* result) { + for (int v = 0; v < v_size; ++v) { + *result++ = scale * *vector++; + } +} + void PortableClipVector(const float* vector, int v_size, float abs_limit, float* result) { for (int v = 0; v < v_size; v++) { @@ -209,5 +253,31 @@ void PortableReductionSumVector(const float* input_vector, float* output_vector, } } +void PortableMeanStddevNormalization(const float* input_vector, + float* output_vector, int v_size, + int n_batch, float normalization_epsilon) { + for (int batch = 0; batch < n_batch; ++batch) { + float sum = 0.0f; + float sum_sq = 0.0f; + for (int i = 0; i < v_size; ++i) { + sum += input_vector[i]; + sum_sq += input_vector[i] * input_vector[i]; + } + const float mean = sum / v_size; + float stddev_inv = 0.0f; + const float variance = sum_sq / v_size - mean * mean; + if (variance == 0) { + stddev_inv = 1.0f / sqrt(normalization_epsilon); + } else { + stddev_inv = 1.0f / sqrt(variance); + } + for (int i = 0; i < v_size; ++i) { + output_vector[i] = (input_vector[i] - mean) * stddev_inv; + } + input_vector += v_size; + output_vector += v_size; + } +} + } // namespace tensor_utils } // namespace tflite diff --git a/tensorflow/contrib/lite/kernels/internal/reference/portable_tensor_utils.h b/tensorflow/contrib/lite/kernels/internal/reference/portable_tensor_utils.h index 1757a9f5e5..714b1164ee 100644 --- a/tensorflow/contrib/lite/kernels/internal/reference/portable_tensor_utils.h +++ b/tensorflow/contrib/lite/kernels/internal/reference/portable_tensor_utils.h @@ -17,7 +17,11 @@ limitations under the License. // TODO(ghodrat): Remove this header file and the dependency to internal data // structure. -#include "tensorflow/contrib/lite/builtin_op_data.h" +#include "tensorflow/contrib/lite/c/builtin_op_data.h" + +#if defined(_MSC_VER) +#define __restrict__ __restrict +#endif namespace tflite { namespace tensor_utils { @@ -25,9 +29,11 @@ namespace tensor_utils { // Limit a float input f between +abs_limit and -abs_limit. 
float PortableClip(float f, float abs_limit); +bool PortableIsZeroVector(const float* vector, int v_size); + void PortableSymmetricQuantizeFloats(const float* values, const int size, - int8_t* quantized_values, float* min, - float* max, float* scaling_factor); + int8_t* quantized_values, float* min_value, + float* max_value, float* scaling_factor); // Multiply a matrix by a batch vector, and store results in a batch-size // vector. @@ -63,6 +69,11 @@ void PortableBatchVectorBatchVectorDotProduct(const float* vector1, int n_batch, float* result, int result_stride); +// Cwise product of a vector and a batch-vector. +void PortableVectorBatchVectorCwiseProduct(const float* vector, int v_size, + const float* batch_vector, + int n_batch, float* result); + // Cwise product and accumulate of a vector and a batch-vector. Since it's a MAC // operation, the assumption here is that result array is initialized to valid // values. @@ -76,6 +87,10 @@ void PortableVectorBatchVectorCwiseProductAccumulate(const float* vector, void PortableVectorBatchVectorAssign(const float* vector, int v_size, int n_batch, float* batch_vector); +// Add another vector for each batch in the batch vector. +void PortableVectorBatchVectorAdd(const float* vector, int v_size, int n_batch, + float* batch_vector); + // Apply sigmoid to elements of a vector. void PortableApplySigmoidToVector(const float* vector, int v_size, float* result); @@ -94,6 +109,10 @@ void PortableSub1Vector(const float* vector, int v_size, float* result); // Fill vector with 0.f. void PortableZeroVector(float* vector, int v_size); +// Multiply all elements of vector with a scalar. +void PortableVectorScalarMultiply(const int8_t* vector, int v_size, float scale, + float* result); + // Clip elements of a vector using a abs_limit value. void PortableClipVector(const float* vector, int v_size, float abs_limit, float* result); @@ -110,8 +129,18 @@ void PortableVectorShiftLeft(float* vector, int v_size, float shift_value); void PortableReductionSumVector(const float* input_vector, float* output_vector, int output_size, int reduction_size); +// Layer norm for each batch. +// normalization_epsilon is added to avoid divergence. 
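// Illustrative note (editor's sketch, not from this commit): per batch this
// computes
//   output[i] = (input[i] - mean) / sqrt(variance),
// with variance = E[x^2] - mean^2, and falls back to normalization_epsilon as
// the variance when the input is constant (variance == 0). Hypothetical
// example: input {1.0f, 3.0f} has mean 2.0f and variance 1.0f, so the
// normalized output is {-1.0f, 1.0f}.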
+void PortableMeanStddevNormalization(const float* input_vector, + float* output_vector, int v_size, + int n_batch, float normalization_epsilon); + float Clip(float f, float abs_limit) { return PortableClip(f, abs_limit); } +bool IsZeroVector(const float* vector, int v_size) { + return PortableIsZeroVector(vector, v_size); +} + void SymmetricQuantizeFloats(const float* values, const int size, int8_t* quantized_values, float* min, float* max, float* scaling_factor) { @@ -147,6 +176,13 @@ void VectorVectorCwiseProductAccumulate(const float* vector1, PortableVectorVectorCwiseProductAccumulate(vector1, vector2, v_size, result); } +void VectorBatchVectorCwiseProduct(const float* vector, int v_size, + const float* batch_vector, int n_batch, + float* result) { + PortableVectorBatchVectorCwiseProduct(vector, v_size, batch_vector, n_batch, + result); +} + void VectorBatchVectorCwiseProductAccumulate(const float* vector, int v_size, const float* batch_vector, int n_batch, float* result) { @@ -167,6 +203,11 @@ void BatchVectorBatchVectorDotProduct(const float* vector1, result, result_stride); } +void VectorBatchVectorAdd(const float* vector, int v_size, int n_batch, + float* batch_vector) { + PortableVectorBatchVectorAdd(vector, v_size, n_batch, batch_vector); +} + void VectorBatchVectorAssign(const float* vector, int v_size, int n_batch, float* batch_vector) { PortableVectorBatchVectorAssign(vector, v_size, n_batch, batch_vector); @@ -193,6 +234,12 @@ void ZeroVector(float* vector, int v_size) { PortableZeroVector(vector, v_size); } +// Multiply all elements of vector with a scalar. +void VectorScalarMultiply(const int8_t* vector, int v_size, float scale, + float* result) { + PortableVectorScalarMultiply(vector, v_size, scale, result); +} + void ClipVector(const float* vector, int v_size, float abs_limit, float* result) { PortableClipVector(vector, v_size, abs_limit, result); @@ -208,6 +255,13 @@ void ReductionSumVector(const float* input_vector, float* output_vector, reduction_size); } +void MeanStddevNormalization(const float* input_vector, float* output_vector, + int v_size, int n_batch, + float normalization_epsilon) { + PortableMeanStddevNormalization(input_vector, output_vector, v_size, n_batch, + normalization_epsilon); +} + } // namespace tensor_utils } // namespace tflite diff --git a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h index e70d8e5454..59f17ae854 100644 --- a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h +++ b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h @@ -19,136 +19,177 @@ limitations under the License. #include <sys/types.h> #include <algorithm> #include <cmath> +#include <functional> #include <limits> #include <memory> #include <type_traits> -#include "third_party/eigen3/Eigen/Core" #include "fixedpoint/fixedpoint.h" #include "public/gemmlowp.h" #include "tensorflow/contrib/lite/kernels/internal/common.h" #include "tensorflow/contrib/lite/kernels/internal/quantization_util.h" +#include "tensorflow/contrib/lite/kernels/internal/reference/fully_connected.h" +#include "tensorflow/contrib/lite/kernels/internal/reference/softmax.h" #include "tensorflow/contrib/lite/kernels/internal/round.h" #include "tensorflow/contrib/lite/kernels/internal/strided_slice_logic.h" #include "tensorflow/contrib/lite/kernels/internal/types.h" namespace tflite { + +// TODO(b/77858996): Add these to gemmlowp. 
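// Illustrative note (editor's sketch, not from this commit): the helpers below
// widen to a larger integer type, perform the add/subtract there, and clamp
// the result back to the narrow type's range instead of wrapping. Hypothetical
// examples:
//   SaturatingAddNonGemmlowp(2000000000, 2000000000) == 2147483647  // INT32_MAX
//   SaturatingSub(int16_t(-30000), int16_t(10000))   == -32768      // INT16_MIN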
+template <typename IntegerType> +IntegerType SaturatingAddNonGemmlowp(IntegerType a, IntegerType b) { + static_assert(std::is_same<IntegerType, void>::value, "unimplemented"); + return a; +} + +template <> +inline std::int32_t SaturatingAddNonGemmlowp(std::int32_t a, std::int32_t b) { + std::int64_t a64 = a; + std::int64_t b64 = b; + std::int64_t sum = a64 + b64; + return static_cast<std::int32_t>(std::min( + static_cast<std::int64_t>(std::numeric_limits<std::int32_t>::max()), + std::max( + static_cast<std::int64_t>(std::numeric_limits<std::int32_t>::min()), + sum))); +} + +template <typename tRawType, int tIntegerBits> +gemmlowp::FixedPoint<tRawType, tIntegerBits> SaturatingAddNonGemmlowp( + gemmlowp::FixedPoint<tRawType, tIntegerBits> a, + gemmlowp::FixedPoint<tRawType, tIntegerBits> b) { + return gemmlowp::FixedPoint<tRawType, tIntegerBits>::FromRaw( + SaturatingAddNonGemmlowp(a.raw(), b.raw())); +} + +template <typename IntegerType> +IntegerType SaturatingSub(IntegerType a, IntegerType b) { + static_assert(std::is_same<IntegerType, void>::value, "unimplemented"); + return a; +} + +template <> +inline std::int16_t SaturatingSub(std::int16_t a, std::int16_t b) { + std::int32_t a32 = a; + std::int32_t b32 = b; + std::int32_t diff = a32 - b32; + return static_cast<std::int16_t>(std::min(32767, std::max(-32768, diff))); +} + +template <> +inline std::int32_t SaturatingSub(std::int32_t a, std::int32_t b) { + std::int64_t a64 = a; + std::int64_t b64 = b; + std::int64_t diff = a64 - b64; + return static_cast<std::int32_t>(std::min( + static_cast<std::int64_t>(std::numeric_limits<std::int32_t>::max()), + std::max( + static_cast<std::int64_t>(std::numeric_limits<std::int32_t>::min()), + diff))); +} + +template <typename tRawType, int tIntegerBits> +gemmlowp::FixedPoint<tRawType, tIntegerBits> SaturatingSub( + gemmlowp::FixedPoint<tRawType, tIntegerBits> a, + gemmlowp::FixedPoint<tRawType, tIntegerBits> b) { + return gemmlowp::FixedPoint<tRawType, tIntegerBits>::FromRaw( + SaturatingSub(a.raw(), b.raw())); +} +// End section to be moved to gemmlowp. + namespace reference_ops { -// DO NOT USE THIS STRUCT FOR NEW FUNCTIONALITY BEYOND IMPLEMENTING ELEMENT-WISE -// BROADCASTING. -// -// NdArrayDesc<N> describes the shape and memory layout of an N-dimensional -// rectangular array of numbers. -// -// NdArrayDesc<N> is basically identical to Dims<N> defined in types.h. -// However, as Dims<N> is to be deprecated, this class exists as an adaptor -// to enable simple unoptimized implementations of element-wise broadcasting -// operations. -template <int N> -struct NdArrayDesc { - // The "extent" of each dimension. Indices along dimension d must be in the - // half-open interval [0, extents[d]). - int extents[N]; - - // The number of *elements* (not bytes) between consecutive indices of each - // dimension. - int strides[N]; -}; - -// DO NOT USE THIS FUNCTION FOR NEW FUNCTIONALITY BEYOND IMPLEMENTING -// ELEMENT-WISE BROADCASTING. -// -// Same as Offset(), except takes as NdArrayDesc<N> instead of Dims<N>. 
-inline int SubscriptToIndex(const NdArrayDesc<4>& desc, int i0, int i1, int i2, - int i3) { - TFLITE_DCHECK(i0 >= 0 && i0 < desc.extents[0]); - TFLITE_DCHECK(i1 >= 0 && i1 < desc.extents[1]); - TFLITE_DCHECK(i2 >= 0 && i2 < desc.extents[2]); - TFLITE_DCHECK(i3 >= 0 && i3 < desc.extents[3]); - return i0 * desc.strides[0] + i1 * desc.strides[1] + i2 * desc.strides[2] + - i3 * desc.strides[3]; -} - -// Given the dimensions of the operands for an element-wise binary broadcast, -// adjusts them so that they can be directly iterated over with simple loops. -// Returns the adjusted dims as instances of NdArrayDesc in 'desc0_out' and -// 'desc1_out'. 'desc0_out' and 'desc1_out' cannot be nullptr. -// -// This function assumes that the two input shapes are compatible up to -// broadcasting and the shorter one has already been prepended with 1s to be the -// same length. E.g., if shape0 is (1, 16, 16, 64) and shape1 is (1, 64), -// shape1 must already have been prepended to be (1, 1, 1, 64). Recall that -// Dims<N> refer to shapes in reverse order. In this case, input0_dims will be -// (64, 16, 16, 1) and input1_dims will be (64, 1, 1, 1). -// -// When two shapes are compatible up to broadcasting, for each dimension d, -// the input extents are either equal, or one of them is 1. -// -// This function performs the following for each dimension d: -// - If the extents are equal, then do nothing since the loop that walks over -// both of the input arrays is correct. -// - Otherwise, one (and only one) of the extents must be 1. Say extent0 is 1 -// and extent1 is e1. Then set extent0 to e1 and stride0 *to 0*. This allows -// array0 to be referenced *at any index* in dimension d and still access the -// same slice. -template <int N> -inline void NdArrayDescsForElementwiseBroadcast(const Dims<N>& input0_dims, - const Dims<N>& input1_dims, - NdArrayDesc<N>* desc0_out, - NdArrayDesc<N>* desc1_out) { - TFLITE_DCHECK(desc0_out != nullptr); - TFLITE_DCHECK(desc1_out != nullptr); - - // Copy dims to desc. - for (int i = 0; i < N; ++i) { - desc0_out->extents[i] = input0_dims.sizes[i]; - desc0_out->strides[i] = input0_dims.strides[i]; - desc1_out->extents[i] = input1_dims.sizes[i]; - desc1_out->strides[i] = input1_dims.strides[i]; - } - - // Walk over each dimension. If the extents are equal do nothing. - // Otherwise, set the desc with extent 1 to have extent equal to the other and - // stride 0. 
- for (int i = 0; i < N; ++i) { - const int extent0 = ArraySize(input0_dims, i); - const int extent1 = ArraySize(input1_dims, i); - if (extent0 != extent1) { - if (extent0 == 1) { - desc0_out->strides[i] = 0; - desc0_out->extents[i] = extent1; - } else { - TFLITE_DCHECK_EQ(extent1, 1); - desc1_out->strides[i] = 0; - desc1_out->extents[i] = extent0; - } - } +inline void ShapeFromDims(const tflite::Dims<4>& dims, RuntimeShape* shape) { + shape->BuildFrom( + {dims.sizes[3], dims.sizes[2], dims.sizes[1], dims.sizes[0]}); +} + +template <typename T> +int CountLeadingZeros(T integer_input) { + static_assert(std::is_unsigned<T>::value, + "Only unsigned integer types handled."); + if (integer_input == 0) { + return std::numeric_limits<T>::digits; + } + const T one_in_leading_positive = static_cast<T>(1) + << (std::numeric_limits<T>::digits - 1); + int leading_zeros = 0; + while (integer_input < one_in_leading_positive) { + integer_input <<= 1; + ++leading_zeros; + } + return leading_zeros; +} + +template <typename IntegerType> +IntegerType SaturatingRoundingMultiplyByPOTParam(IntegerType x, int exponent) { + if (exponent == 0) { + return x; } + using ScalarIntegerType = + typename gemmlowp::FixedPointRawTypeTraits<IntegerType>::ScalarRawType; + const IntegerType min = + gemmlowp::Dup<IntegerType>(std::numeric_limits<ScalarIntegerType>::min()); + const IntegerType max = + gemmlowp::Dup<IntegerType>(std::numeric_limits<ScalarIntegerType>::max()); + const int ScalarIntegerTypeBits = 8 * sizeof(ScalarIntegerType); + + const std::int32_t threshold = + ((1 << (ScalarIntegerTypeBits - 1 - exponent)) - 1); + const IntegerType positive_mask = + gemmlowp::MaskIfGreaterThan(x, gemmlowp::Dup<IntegerType>(threshold)); + const IntegerType negative_mask = + gemmlowp::MaskIfLessThan(x, gemmlowp::Dup<IntegerType>(-threshold)); + + IntegerType result = gemmlowp::ShiftLeft(x, exponent); + result = gemmlowp::SelectUsingMask(positive_mask, max, result); + result = gemmlowp::SelectUsingMask(negative_mask, min, result); + return result; +} + +// If we want to leave IntegerBits fixed, then multiplication +// by a power of two has to be saturating/rounding, not exact anymore. +template <typename tRawType, int tIntegerBits> +gemmlowp::FixedPoint<tRawType, tIntegerBits> +SaturatingRoundingMultiplyByPOTParam( + gemmlowp::FixedPoint<tRawType, tIntegerBits> a, int exponent) { + return gemmlowp::FixedPoint<tRawType, tIntegerBits>::FromRaw( + SaturatingRoundingMultiplyByPOTParam(a.raw(), exponent)); } -inline void Conv(const float* input_data, const Dims<4>& input_dims, - const float* filter_data, const Dims<4>& filter_dims, - const float* bias_data, const Dims<4>& bias_dims, - int stride_width, int stride_height, int dilation_width_factor, - int dilation_height_factor, int pad_width, int pad_height, - float output_activation_min, float output_activation_max, - float* output_data, const Dims<4>& output_dims, - float* im2col_data, const Dims<4>& im2col_dims) { - (void)im2col_data; // only used in optimized code. - (void)im2col_dims; // only used in optimized code. 
- const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); - const int input_depth = MatchingArraySize(input_dims, 0, filter_dims, 0); - const int output_depth = MatchingArraySize(filter_dims, 3, output_dims, 0); +inline void Conv(const ConvParams& params, const RuntimeShape& input_shape, + const float* input_data, const RuntimeShape& filter_shape, + const float* filter_data, const RuntimeShape& bias_shape, + const float* bias_data, const RuntimeShape& output_shape, + float* output_data, const RuntimeShape& im2col_shape, + float* im2col_data) { + const int stride_width = params.stride_width; + const int stride_height = params.stride_height; + const int dilation_width_factor = params.dilation_width_factor; + const int dilation_height_factor = params.dilation_height_factor; + const int pad_width = params.padding_values.width; + const int pad_height = params.padding_values.height; + const float output_activation_min = params.float_activation_min; + const float output_activation_max = params.float_activation_max; + TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4); + TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4); + TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4); + + (void)im2col_data; // only used in optimized code. + (void)im2col_shape; // only used in optimized code. + const int batches = MatchingDim(input_shape, 0, output_shape, 0); + const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3); + const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3); if (bias_data) { - TFLITE_DCHECK_EQ(ArraySize(filter_dims, 3), ArraySize(bias_dims, 0)); - } - const int input_height = ArraySize(input_dims, 2); - const int input_width = ArraySize(input_dims, 1); - const int filter_height = ArraySize(filter_dims, 2); - const int filter_width = ArraySize(filter_dims, 1); - const int output_height = ArraySize(output_dims, 2); - const int output_width = ArraySize(output_dims, 1); + TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth); + } + const int input_height = input_shape.Dims(1); + const int input_width = input_shape.Dims(2); + const int filter_height = filter_shape.Dims(1); + const int filter_width = filter_shape.Dims(2); + const int output_height = output_shape.Dims(1); + const int output_width = output_shape.Dims(2); for (int batch = 0; batch < batches; ++batch) { for (int out_y = 0; out_y < output_height; ++out_y) { for (int out_x = 0; out_x < output_width; ++out_x) { @@ -166,11 +207,11 @@ inline void Conv(const float* input_data, const Dims<4>& input_dims, // use zero as a default value. 
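// Illustrative note (editor's sketch, not from this commit): only the indexing
// convention changes here, not the memory layout. The old Dims<4>-based Offset
// listed the innermost dimension first, Offset(dims, channel, x, y, batch);
// the RuntimeShape-based Offset below lists NHWC order,
// Offset(shape, batch, y, x, channel). For a hypothetical packed float input
// of shape {1, 8, 8, 3}, both forms address element c + 3 * (x + 8 * y) for
// batch 0.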
if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) && (in_y < input_height)) { - float input_value = input_data[Offset(input_dims, in_channel, - in_x, in_y, batch)]; + float input_value = input_data[Offset( + input_shape, batch, in_y, in_x, in_channel)]; float filter_value = - filter_data[Offset(filter_dims, in_channel, filter_x, - filter_y, out_channel)]; + filter_data[Offset(filter_shape, out_channel, filter_y, + filter_x, in_channel)]; total += (input_value * filter_value); } } @@ -178,9 +219,9 @@ inline void Conv(const float* input_data, const Dims<4>& input_dims, } float bias_value = 0.0f; if (bias_data) { - bias_value = bias_data[Offset(bias_dims, out_channel, 0, 0, 0)]; + bias_value = bias_data[out_channel]; } - output_data[Offset(output_dims, out_channel, out_x, out_y, batch)] = + output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] = ActivationFunctionWithMinMax(total + bias_value, output_activation_min, output_activation_max); @@ -190,77 +231,45 @@ inline void Conv(const float* input_data, const Dims<4>& input_dims, } } -template <FusedActivationFunctionType Ac> -void Conv(const float* input_data, const Dims<4>& input_dims, - const float* filter_data, const Dims<4>& filter_dims, - const float* bias_data, const Dims<4>& bias_dims, int stride_width, - int stride_height, int dilation_width_factor, - int dilation_height_factor, int pad_width, int pad_height, - float* output_data, const Dims<4>& output_dims, float* im2col_data, - const Dims<4>& im2col_dims) { - float output_activation_min, output_activation_max; - GetActivationMinMax(Ac, &output_activation_min, &output_activation_max); - Conv(input_data, input_dims, filter_data, filter_dims, bias_data, bias_dims, - stride_width, stride_height, dilation_width_factor, - dilation_height_factor, pad_width, pad_height, output_activation_min, - output_activation_max, output_data, output_dims, im2col_data, - im2col_dims); -} - -// legacy, for compatibility with old checked-in code -template <FusedActivationFunctionType Ac> -void Conv(const float* input_data, const Dims<4>& input_dims, - const float* filter_data, const Dims<4>& filter_dims, - const float* bias_data, const Dims<4>& bias_dims, int stride_width, - int stride_height, int pad_width, int pad_height, float* output_data, - const Dims<4>& output_dims, float* im2col_data, - const Dims<4>& im2col_dims) { - float output_activation_min, output_activation_max; - GetActivationMinMax(Ac, &output_activation_min, &output_activation_max); - Conv(input_data, input_dims, filter_data, filter_dims, bias_data, bias_dims, - stride_width, stride_height, 1, 1, pad_width, pad_height, - output_activation_min, output_activation_max, output_data, output_dims, - im2col_data, im2col_dims); -} - -// legacy, for compatibility with old checked-in code -template <FusedActivationFunctionType Ac> -void Conv(const float* input_data, const Dims<4>& input_dims, - const float* filter_data, const Dims<4>& filter_dims, - const float* bias_data, const Dims<4>& bias_dims, int stride, - int pad_width, int pad_height, float* output_data, - const Dims<4>& output_dims, float* im2col_data, - const Dims<4>& im2col_dims) { - Conv<Ac>(input_data, input_dims, filter_data, filter_dims, bias_data, - bias_dims, stride, stride, 1, 1, pad_width, pad_height, output_data, - output_dims, im2col_data, im2col_dims); -} - -inline void Conv(const uint8* input_data, const Dims<4>& input_dims, - int32 input_offset, const uint8* filter_data, - const Dims<4>& filter_dims, int32 filter_offset, - const int32* bias_data, const 
Dims<4>& bias_dims, - int stride_width, int stride_height, int pad_width, - int pad_height, int32 output_offset, int32 output_multiplier, - int output_shift, int32 output_activation_min, - int32 output_activation_max, uint8* output_data, - const Dims<4>& output_dims, uint8* im2col_data, - const Dims<4>& im2col_dims, - gemmlowp::GemmContext* gemm_context) { +inline void Conv(const ConvParams& params, const RuntimeShape& input_shape, + const uint8* input_data, const RuntimeShape& filter_shape, + const uint8* filter_data, const RuntimeShape& bias_shape, + const int32* bias_data, const RuntimeShape& output_shape, + uint8* output_data, const RuntimeShape& im2col_shape, + uint8* im2col_data, gemmlowp::GemmContext* gemm_context) { (void)im2col_data; // only used in optimized code. - (void)im2col_dims; // only used in optimized code. + (void)im2col_shape; // only used in optimized code. (void)gemm_context; // only used in optimized code. + const int stride_width = params.stride_width; + const int stride_height = params.stride_height; + const int dilation_width_factor = params.dilation_width_factor; + const int dilation_height_factor = params.dilation_height_factor; + const int pad_width = params.padding_values.width; + const int pad_height = params.padding_values.height; + const int32 input_offset = params.input_offset; + const int32 filter_offset = params.weights_offset; + const int32 output_offset = params.output_offset; + const int32 output_multiplier = params.output_multiplier; + const int output_shift = params.output_shift; + const int32 output_activation_min = params.quantized_activation_min; + const int32 output_activation_max = params.quantized_activation_max; TFLITE_DCHECK_LE(output_activation_min, output_activation_max); - const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); - const int input_depth = MatchingArraySize(input_dims, 0, filter_dims, 0); - const int output_depth = - MatchingArraySize(filter_dims, 3, bias_dims, 0, output_dims, 0); - const int input_height = ArraySize(input_dims, 2); - const int input_width = ArraySize(input_dims, 1); - const int filter_height = ArraySize(filter_dims, 2); - const int filter_width = ArraySize(filter_dims, 1); - const int output_height = ArraySize(output_dims, 2); - const int output_width = ArraySize(output_dims, 1); + + TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4); + TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4); + TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4); + const int batches = MatchingDim(input_shape, 0, output_shape, 0); + const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3); + const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3); + if (bias_data) { + TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth); + } + const int input_height = input_shape.Dims(1); + const int input_width = input_shape.Dims(2); + const int filter_height = filter_shape.Dims(1); + const int filter_width = filter_shape.Dims(2); + const int output_height = output_shape.Dims(1); + const int output_width = output_shape.Dims(2); for (int batch = 0; batch < batches; ++batch) { for (int out_y = 0; out_y < output_height; ++out_y) { for (int out_x = 0; out_x < output_width; ++out_x) { @@ -271,17 +280,18 @@ inline void Conv(const uint8* input_data, const Dims<4>& input_dims, for (int filter_y = 0; filter_y < filter_height; ++filter_y) { for (int filter_x = 0; filter_x < filter_width; ++filter_x) { for (int in_channel = 0; in_channel < input_depth; ++in_channel) { - const int in_x = in_x_origin + filter_x; - 
const int in_y = in_y_origin + filter_y; + const int in_x = in_x_origin + dilation_width_factor * filter_x; + const int in_y = + in_y_origin + dilation_height_factor * filter_y; // If the location is outside the bounds of the input image, // use zero as a default value. if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) && (in_y < input_height)) { - int32 input_val = input_data[Offset(input_dims, in_channel, - in_x, in_y, batch)]; + int32 input_val = input_data[Offset(input_shape, batch, in_y, + in_x, in_channel)]; int32 filter_val = - filter_data[Offset(filter_dims, in_channel, filter_x, - filter_y, out_channel)]; + filter_data[Offset(filter_shape, out_channel, filter_y, + filter_x, in_channel)]; acc += (filter_val + filter_offset) * (input_val + input_offset); } @@ -289,14 +299,14 @@ inline void Conv(const uint8* input_data, const Dims<4>& input_dims, } } if (bias_data) { - acc += bias_data[Offset(bias_dims, out_channel, 0, 0, 0)]; + acc += bias_data[out_channel]; } - acc = MultiplyByQuantizedMultiplierSmallerThanOne( - acc, output_multiplier, output_shift); + acc = MultiplyByQuantizedMultiplier(acc, output_multiplier, + output_shift); acc += output_offset; acc = std::max(acc, output_activation_min); acc = std::min(acc, output_activation_max); - output_data[Offset(output_dims, out_channel, out_x, out_y, batch)] = + output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] = static_cast<uint8>(acc); } } @@ -304,66 +314,30 @@ inline void Conv(const uint8* input_data, const Dims<4>& input_dims, } } -// legacy, for compatibility with old checked-in code -template <FusedActivationFunctionType Ac> -inline void Conv(const uint8* input_data, const Dims<4>& input_dims, - int32 input_offset, const uint8* filter_data, - const Dims<4>& filter_dims, int32 filter_offset, - const int32* bias_data, const Dims<4>& bias_dims, - int stride_width, int stride_height, int pad_width, - int pad_height, int32 output_offset, int32 output_multiplier, - int output_shift, int32 output_activation_min, - int32 output_activation_max, uint8* output_data, - const Dims<4>& output_dims, uint8* im2col_data, - const Dims<4>& im2col_dims, - gemmlowp::GemmContext* gemm_context) { - static_assert(Ac == FusedActivationFunctionType::kNone || - Ac == FusedActivationFunctionType::kRelu || - Ac == FusedActivationFunctionType::kRelu6 || - Ac == FusedActivationFunctionType::kRelu1, - ""); - if (Ac == FusedActivationFunctionType::kNone) { - TFLITE_DCHECK_EQ(output_activation_min, 0); - TFLITE_DCHECK_EQ(output_activation_max, 255); - } - Conv(input_data, input_dims, input_offset, filter_data, filter_dims, - filter_offset, bias_data, bias_dims, stride_width, stride_height, - pad_width, pad_height, output_offset, output_multiplier, output_shift, - output_activation_min, output_activation_max, output_data, output_dims, - im2col_data, im2col_dims, gemm_context); -} - -// legacy, for compatibility with old checked-in code -template <FusedActivationFunctionType Ac> -void Conv(const uint8* input_data, const Dims<4>& input_dims, - int32 input_offset, const uint8* filter_data, - const Dims<4>& filter_dims, int32 filter_offset, - const int32* bias_data, const Dims<4>& bias_dims, int stride, - int pad_width, int pad_height, int32 output_offset, - int32 output_multiplier, int output_shift, - int32 output_activation_min, int32 output_activation_max, - uint8* output_data, const Dims<4>& output_dims, uint8* im2col_data, - const Dims<4>& im2col_dims, gemmlowp::GemmContext* gemm_context) { - Conv<Ac>(input_data, input_dims, input_offset, 
filter_data, filter_dims, - filter_offset, bias_data, bias_dims, stride, stride, pad_width, - pad_height, output_offset, output_multiplier, output_shift, - output_activation_min, output_activation_max, output_data, - output_dims, im2col_data, im2col_dims, gemm_context); -} - template <typename T> -inline void DepthToSpace(const T* input_data, const Dims<4>& input_dims, - int block_size, T* output_data, - const Dims<4>& output_dims) { - const int input_depth = ArraySize(input_dims, 0); - const int input_width = ArraySize(input_dims, 1); - const int input_height = ArraySize(input_dims, 2); - const int input_batch = ArraySize(input_dims, 3); - - const int output_depth = ArraySize(output_dims, 0); - const int output_width = ArraySize(output_dims, 1); - const int output_height = ArraySize(output_dims, 2); - const int output_batch = ArraySize(output_dims, 3); +inline void DepthToSpace(const tflite::DepthToSpaceParams& op_params, + const RuntimeShape& unextended_input_shape, + const T* input_data, + const RuntimeShape& unextended_output_shape, + T* output_data) { + TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4); + TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4); + const RuntimeShape input_shape = + RuntimeShape::ExtendedShape(4, unextended_input_shape); + const RuntimeShape output_shape = + RuntimeShape::ExtendedShape(4, unextended_output_shape); + + const int input_depth = input_shape.Dims(3); + const int input_width = input_shape.Dims(2); + const int input_height = input_shape.Dims(1); + const int input_batch = input_shape.Dims(0); + + const int output_depth = output_shape.Dims(3); + const int output_width = output_shape.Dims(2); + const int output_height = output_shape.Dims(1); + const int output_batch = output_shape.Dims(0); + + const int32 block_size = op_params.block_size; TFLITE_DCHECK_EQ(input_width * block_size, output_width); TFLITE_DCHECK_EQ(input_height * block_size, output_height); @@ -382,9 +356,9 @@ inline void DepthToSpace(const T* input_data, const Dims<4>& input_dims, const int in_h = out_h / block_size; const int in_b = out_b; + const int input_index = Offset(input_shape, in_b, in_h, in_w, in_d); const int output_index = - Offset(output_dims, out_d, out_w, out_h, out_b); - const int input_index = Offset(input_dims, in_d, in_w, in_h, in_b); + Offset(output_shape, out_b, out_h, out_w, out_d); output_data[output_index] = input_data[input_index]; } @@ -394,18 +368,29 @@ inline void DepthToSpace(const T* input_data, const Dims<4>& input_dims, } template <typename T> -inline void SpaceToDepth(const T* input_data, const Dims<4>& input_dims, - int block_size, T* output_data, - const Dims<4>& output_dims) { - const int input_depth = ArraySize(input_dims, 0); - const int input_width = ArraySize(input_dims, 1); - const int input_height = ArraySize(input_dims, 2); - const int input_batch = ArraySize(input_dims, 3); - - const int output_depth = ArraySize(output_dims, 0); - const int output_width = ArraySize(output_dims, 1); - const int output_height = ArraySize(output_dims, 2); - const int output_batch = ArraySize(output_dims, 3); +inline void SpaceToDepth(const tflite::SpaceToDepthParams& op_params, + const RuntimeShape& unextended_input_shape, + const T* input_data, + const RuntimeShape& unextended_output_shape, + T* output_data) { + TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4); + TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4); + const RuntimeShape input_shape = + RuntimeShape::ExtendedShape(4, unextended_input_shape); + const 
RuntimeShape output_shape = + RuntimeShape::ExtendedShape(4, unextended_output_shape); + + const int input_depth = input_shape.Dims(3); + const int input_width = input_shape.Dims(2); + const int input_height = input_shape.Dims(1); + const int input_batch = input_shape.Dims(0); + + const int output_depth = output_shape.Dims(3); + const int output_width = output_shape.Dims(2); + const int output_height = output_shape.Dims(1); + const int output_batch = output_shape.Dims(0); + + const int32 block_size = op_params.block_size; TFLITE_DCHECK_EQ(input_width, output_width * block_size); TFLITE_DCHECK_EQ(input_height, output_height * block_size); @@ -423,9 +408,9 @@ inline void SpaceToDepth(const T* input_data, const Dims<4>& input_dims, const int out_h = in_h / block_size; const int out_b = in_b; + const int input_index = Offset(input_shape, in_b, in_h, in_w, in_d); const int output_index = - Offset(output_dims, out_d, out_w, out_h, out_b); - const int input_index = Offset(input_dims, in_d, in_w, in_h, in_b); + Offset(output_shape, out_b, out_h, out_w, out_d); output_data[output_index] = input_data[input_index]; } @@ -434,366 +419,9 @@ inline void SpaceToDepth(const T* input_data, const Dims<4>& input_dims, } } -inline void FullyConnected(const float* input_data, const Dims<4>& input_dims, - const float* weights_data, - const Dims<4>& weights_dims, const float* bias_data, - const Dims<4>& bias_dims, - float output_activation_min, - float output_activation_max, float* output_data, - const Dims<4>& output_dims) { - // TODO(benoitjacob): This really should be: - // const int batches = ArraySize(output_dims, 1); - // but the current --variable_batch hack consists in overwriting the 3rd - // dimension with the runtime batch size, as we don't keep track for each - // array of which dimension is the batch dimension in it. 
- const int batches = ArraySize(output_dims, 1) * ArraySize(output_dims, 2) * - ArraySize(output_dims, 3); - const int output_depth = MatchingArraySize(weights_dims, 1, output_dims, 0); - const int accum_depth = ArraySize(weights_dims, 0); - TFLITE_DCHECK(IsPackedWithoutStrides(input_dims)); - TFLITE_DCHECK(IsPackedWithoutStrides(weights_dims)); - for (int b = 0; b < batches; ++b) { - for (int out_c = 0; out_c < output_depth; ++out_c) { - float total = 0.f; - for (int d = 0; d < accum_depth; ++d) { - total += input_data[b * accum_depth + d] * - weights_data[out_c * accum_depth + d]; - } - float bias_value = 0.0f; - if (bias_data) { - bias_value = bias_data[Offset(bias_dims, out_c, 0, 0, 0)]; - } - output_data[out_c + output_depth * b] = ActivationFunctionWithMinMax( - total + bias_value, output_activation_min, output_activation_max); - } - } -} - -// legacy, for compatibility with old checked-in code -template <FusedActivationFunctionType Ac> -void FullyConnected(const float* input_data, const Dims<4>& input_dims, - const float* weights_data, const Dims<4>& weights_dims, - const float* bias_data, const Dims<4>& bias_dims, - float* output_data, const Dims<4>& output_dims) { - float output_activation_min, output_activation_max; - GetActivationMinMax(Ac, &output_activation_min, &output_activation_max); - FullyConnected(input_data, input_dims, weights_data, weights_dims, bias_data, - bias_dims, output_activation_min, output_activation_max, - output_data, output_dims); -} - -inline void FullyConnected(const uint8* input_data, const Dims<4>& input_dims, - int32 input_offset, const uint8* filter_data, - const Dims<4>& filter_dims, int32 filter_offset, - const int32* bias_data, const Dims<4>& bias_dims, - int32 output_offset, int32 output_multiplier, - int output_shift, int32 output_activation_min, - int32 output_activation_max, uint8* output_data, - const Dims<4>& output_dims, - gemmlowp::GemmContext* gemm_context) { - (void)gemm_context; // only used in optimized code. - TFLITE_DCHECK_LE(output_activation_min, output_activation_max); - // TODO(benoitjacob): This really should be: - // const int batches = ArraySize(output_dims, 1); - // but the current --variable_batch hack consists in overwriting the 3rd - // dimension with the runtime batch size, as we don't keep track for each - // array of which dimension is the batch dimension in it. 
- const int batches = ArraySize(output_dims, 1) * ArraySize(output_dims, 2) * - ArraySize(output_dims, 3); - const int output_depth = MatchingArraySize(filter_dims, 1, output_dims, 0); - const int accum_depth = ArraySize(filter_dims, 0); - TFLITE_DCHECK(IsPackedWithoutStrides(input_dims)); - TFLITE_DCHECK(IsPackedWithoutStrides(filter_dims)); - for (int b = 0; b < batches; ++b) { - for (int out_c = 0; out_c < output_depth; ++out_c) { - int32 acc = 0; - for (int d = 0; d < accum_depth; ++d) { - int32 input_val = input_data[b * accum_depth + d]; - int32 filter_val = filter_data[out_c * accum_depth + d]; - acc += (filter_val + filter_offset) * (input_val + input_offset); - } - if (bias_data) { - acc += bias_data[Offset(bias_dims, out_c, 0, 0, 0)]; - } - acc = MultiplyByQuantizedMultiplierSmallerThanOne(acc, output_multiplier, - output_shift); - acc += output_offset; - acc = std::max(acc, output_activation_min); - acc = std::min(acc, output_activation_max); - output_data[out_c + output_depth * b] = static_cast<uint8>(acc); - } - } -} - -inline void FullyConnected(const uint8* input_data, const Dims<4>& input_dims, - int32 input_offset, const uint8* filter_data, - const Dims<4>& filter_dims, int32 filter_offset, - const int32* bias_data, const Dims<4>& bias_dims, - int32 output_offset, int32 output_multiplier, - int output_shift, int32 output_activation_min, - int32 output_activation_max, int16* output_data, - const Dims<4>& output_dims, - gemmlowp::GemmContext* gemm_context) { - (void)gemm_context; // only used in optimized code. - TFLITE_DCHECK_LE(output_activation_min, output_activation_max); - TFLITE_DCHECK_EQ(output_offset, 0); - // TODO(benoitjacob): This really should be: - // const int batches = ArraySize(output_dims, 1); - // but the current --variable_batch hack consists in overwriting the 3rd - // dimension with the runtime batch size, as we don't keep track for each - // array of which dimension is the batch dimension in it. - const int batches = ArraySize(output_dims, 1) * ArraySize(output_dims, 2) * - ArraySize(output_dims, 3); - const int output_depth = MatchingArraySize(filter_dims, 1, output_dims, 0); - const int accum_depth = ArraySize(filter_dims, 0); - TFLITE_DCHECK(IsPackedWithoutStrides(input_dims)); - TFLITE_DCHECK(IsPackedWithoutStrides(filter_dims)); - for (int b = 0; b < batches; ++b) { - for (int out_c = 0; out_c < output_depth; ++out_c) { - // Internal accumulation. - // Initialize accumulator with the bias-value. - int32 accum = bias_data[out_c]; - // Accumulation loop. - for (int d = 0; d < accum_depth; ++d) { - int16 input_val = input_data[b * accum_depth + d] + input_offset; - int16 filter_val = filter_data[out_c * accum_depth + d] + filter_offset; - accum += filter_val * input_val; - } - // Down-scale the final int32 accumulator to the scale used by our - // (16-bit, typically 3 integer bits) fixed-point format. The quantized - // multiplier and shift here have been pre-computed offline - // (e.g. by toco). - accum = MultiplyByQuantizedMultiplier(accum, output_multiplier, - -output_shift); - // Saturate, cast to int16, and store to output array. 
- accum = std::max(accum, output_activation_min - output_offset); - accum = std::min(accum, output_activation_max - output_offset); - accum += output_offset; - output_data[out_c + output_depth * b] = accum; - } - } -} - -inline void ExperimentalShuffledFullyConnected( - const uint8* input_data, const Dims<4>& input_dims, - const uint8* shuffled_weights_data, const Dims<4>& weights_dims, - const int32* bias_data, const Dims<4>& bias_dims, int32 output_multiplier, - int output_shift, int32 output_activation_min, int32 output_activation_max, - int16* output_data, const Dims<4>& output_dims, - uint8* shuffled_input_workspace_data, gemmlowp::GemmContext* gemm_context) { - (void)gemm_context; // only used in optimized code. - - TFLITE_DCHECK_LE(output_activation_min, output_activation_max); - // TODO(benoitjacob): This really should be: - // const int batches = ArraySize(output_dims, 1); - // but the current --variable_batch hack consists in overwriting the 3rd - // dimension with the runtime batch size, as we don't keep track for each - // array of which dimension is the batch dimension in it. - const int batches = ArraySize(output_dims, 1) * ArraySize(output_dims, 2) * - ArraySize(output_dims, 3); - const int output_depth = MatchingArraySize(weights_dims, 1, output_dims, 0); - const int accum_depth = ArraySize(weights_dims, 0); - TFLITE_DCHECK(IsPackedWithoutStrides(input_dims)); - TFLITE_DCHECK(IsPackedWithoutStrides(weights_dims)); - TFLITE_DCHECK((accum_depth % 16) == 0); - TFLITE_DCHECK((output_depth % 4) == 0); - - // Shuffling and xoring of input activations into the workspace buffer - uint8* shuffled_input_workspace_ptr = shuffled_input_workspace_data; - if (batches == 1) { - for (int i = 0; i < accum_depth; i++) { - shuffled_input_workspace_data[i] = input_data[i] ^ 0x80; - } - } else if (batches == 4) { - for (int c = 0; c < accum_depth; c += 16) { - for (int b = 0; b < 4; b++) { - const uint8* src_data_ptr = input_data + b * accum_depth + c; - for (int j = 0; j < 16; j++) { - uint8 src_val = *src_data_ptr++; - // Flip the sign bit, so that the kernel will only need to - // reinterpret these uint8 values as int8, getting for free the - // subtraction of the zero_point value 128. - uint8 dst_val = src_val ^ 0x80; - *shuffled_input_workspace_ptr++ = dst_val; - } - } - } - } else { - TFLITE_DCHECK(false); - return; - } - - // Actual computation - if (batches == 1) { - int16* output_ptr = output_data; - // Shuffled weights have had their sign bit (0x80) pre-flipped (xor'd) - // so that just reinterpreting them as int8 values is equivalent to - // subtracting 128 from them, thus implementing for free the subtraction of - // the zero_point value 128. - const int8* shuffled_weights_ptr = - reinterpret_cast<const int8*>(shuffled_weights_data); - // Likewise, we preshuffled and pre-xored the input data above. - const int8* shuffled_input_data = - reinterpret_cast<const int8*>(shuffled_input_workspace_data); - for (int c = 0; c < output_depth; c += 4) { - // Internal accumulation. - // Initialize accumulator with the bias-value. - int32 accum[4] = {0}; - // Accumulation loop. 
- for (int d = 0; d < accum_depth; d += 16) { - for (int i = 0; i < 4; i++) { - for (int j = 0; j < 16; j++) { - int8 input_val = shuffled_input_data[d + j]; - int8 weights_val = *shuffled_weights_ptr++; - accum[i] += weights_val * input_val; - } - } - } - for (int i = 0; i < 4; i++) { - // Add bias value - int acc = accum[i] + bias_data[c + i]; - // Down-scale the final int32 accumulator to the scale used by our - // (16-bit, typically 3 integer bits) fixed-point format. The quantized - // multiplier and shift here have been pre-computed offline - // (e.g. by toco). - acc = MultiplyByQuantizedMultiplier(acc, output_multiplier, - -output_shift); - // Saturate, cast to int16, and store to output array. - acc = std::max(acc, output_activation_min); - acc = std::min(acc, output_activation_max); - output_ptr[c + i] = acc; - } - } - } else if (batches == 4) { - int16* output_ptr = output_data; - // Shuffled weights have had their sign bit (0x80) pre-flipped (xor'd) - // so that just reinterpreting them as int8 values is equivalent to - // subtracting 128 from them, thus implementing for free the subtraction of - // the zero_point value 128. - const int8* shuffled_weights_ptr = - reinterpret_cast<const int8*>(shuffled_weights_data); - // Likewise, we preshuffled and pre-xored the input data above. - const int8* shuffled_input_data = - reinterpret_cast<const int8*>(shuffled_input_workspace_data); - for (int c = 0; c < output_depth; c += 4) { - const int8* shuffled_input_ptr = shuffled_input_data; - // Accumulation loop. - // Internal accumulation. - // Initialize accumulator with the bias-value. - int32 accum[4][4]; - for (int i = 0; i < 4; i++) { - for (int b = 0; b < 4; b++) { - accum[i][b] = 0; - } - } - for (int d = 0; d < accum_depth; d += 16) { - for (int i = 0; i < 4; i++) { - for (int b = 0; b < 4; b++) { - for (int j = 0; j < 16; j++) { - int8 input_val = shuffled_input_ptr[16 * b + j]; - int8 weights_val = shuffled_weights_ptr[16 * i + j]; - accum[i][b] += weights_val * input_val; - } - } - } - shuffled_input_ptr += 64; - shuffled_weights_ptr += 64; - } - for (int i = 0; i < 4; i++) { - for (int b = 0; b < 4; b++) { - // Add bias value - int acc = accum[i][b] + bias_data[c + i]; - // Down-scale the final int32 accumulator to the scale used by our - // (16-bit, typically 3 integer bits) fixed-point format. The - // quantized multiplier and shift here have been pre-computed offline - // (e.g. by toco). - acc = MultiplyByQuantizedMultiplier(acc, output_multiplier, - -output_shift); - // Saturate, cast to int16, and store to output array. 
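// ---------------------------------------------------------------------------
// Editor's illustrative sketch (not part of the original diff).  It verifies
// the trick the shuffled kernel relies on: XOR-ing a uint8 value with 0x80 and
// reinterpreting it as int8 is exactly a subtraction of the zero point 128, so
// the inner loop can multiply raw int8 values with no per-element offsets.
#include <cassert>
#include <cstdint>

inline void CheckSignBitFlipEqualsMinus128() {
  for (int v = 0; v < 256; ++v) {
    const std::uint8_t flipped = static_cast<std::uint8_t>(v) ^ 0x80;
    // Relies on two's-complement narrowing, as the kernel itself does.
    const int as_int8 = static_cast<std::int8_t>(flipped);
    assert(as_int8 == v - 128);
  }
}
// ---------------------------------------------------------------------------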
- acc = std::max(acc, output_activation_min); - acc = std::min(acc, output_activation_max); - output_ptr[b * output_depth + c + i] = acc; - } - } - } - } else { - TFLITE_DCHECK(false); - return; - } -} - -// legacy, for compatibility with old checked-in code -template <FusedActivationFunctionType Ac> -void FullyConnected(const uint8* input_data, const Dims<4>& input_dims, - int32 input_offset, const uint8* filter_data, - const Dims<4>& filter_dims, int32 filter_offset, - const int32* bias_data, const Dims<4>& bias_dims, - int32 output_offset, int32 output_multiplier, - int output_shift, int32 output_activation_min, - int32 output_activation_max, uint8* output_data, - const Dims<4>& output_dims, - gemmlowp::GemmContext* gemm_context) { - static_assert(Ac == FusedActivationFunctionType::kNone || - Ac == FusedActivationFunctionType::kRelu || - Ac == FusedActivationFunctionType::kRelu6 || - Ac == FusedActivationFunctionType::kRelu1, - ""); - if (Ac == FusedActivationFunctionType::kNone) { - TFLITE_DCHECK_EQ(output_activation_min, 0); - TFLITE_DCHECK_EQ(output_activation_max, 255); - } - FullyConnected(input_data, input_dims, input_offset, filter_data, filter_dims, - filter_offset, bias_data, bias_dims, output_offset, - output_multiplier, output_shift, output_activation_min, - output_activation_max, output_data, output_dims, gemm_context); -} - -template <FusedActivationFunctionType Ac> -void NonGlobalBatchNormalization( - const float* input_data, const Dims<4>& input_dims, const float* mean_data, - const Dims<4>& mean_dims, const float* multiplier_data, - const Dims<4>& multiplier_dims, const float* offset_data, - const Dims<4>& offset_dims, float* output_data, - const Dims<4>& output_dims) { - const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); - const int inner_size = MatchingFlatSizeSkipDim( - input_dims, 3, mean_dims, multiplier_dims, offset_dims, output_dims); - - for (int b = 0; b < batches; ++b) { - for (int i = 0; i < inner_size; ++i) { - output_data[b * inner_size + i] = ActivationFunction<Ac>( - (input_data[b * inner_size + i] - mean_data[i]) * multiplier_data[i] + - offset_data[i]); - } - } -} - -template <FusedActivationFunctionType Ac> -void GlobalBatchNormalization(const float* input_data, - const Dims<4>& input_dims, const float* mean_data, - const Dims<4>& mean_dims, - const float* multiplier_data, - const Dims<4>& multiplier_dims, - const float* offset_data, - const Dims<4>& offset_dims, float* output_data, - const Dims<4>& output_dims) { - const int outer_size = MatchingFlatSizeSkipDim(input_dims, 0, output_dims); - const int depth = - MatchingArraySize(input_dims, 0, mean_dims, 0, multiplier_dims, 0, - offset_dims, 0, output_dims, 0); - - for (int i = 0; i < outer_size; ++i) { - for (int c = 0; c < depth; ++c) { - output_data[depth * i + c] = ActivationFunction<Ac>( - (input_data[depth * i + c] - mean_data[c]) * multiplier_data[c] + - offset_data[c]); - } - } -} - -inline void Relu(const float* input_data, const Dims<4>& input_dims, - float* output_data, const Dims<4>& output_dims) { - const int flat_size = MatchingFlatSize(input_dims, output_dims); +inline void Relu(const RuntimeShape& input_shape, const float* input_data, + const RuntimeShape& output_shape, float* output_data) { + const int flat_size = MatchingFlatSize(input_shape, output_shape); for (int i = 0; i < flat_size; ++i) { const float val = input_data[i]; const float lower = 0; @@ -802,9 +430,10 @@ inline void Relu(const float* input_data, const Dims<4>& input_dims, } } -inline void Relu1(const 
float* input_data, const Dims<4>& input_dims, - float* output_data, const Dims<4>& output_dims) { - const int flat_size = MatchingFlatSize(input_dims, output_dims); +inline void Relu1(const RuntimeShape& input_shape, const float* input_data, + const RuntimeShape& output_shape, float* output_data) { + gemmlowp::ScopedProfilingLabel label("Relu1 (not fused)"); + const int flat_size = MatchingFlatSize(input_shape, output_shape); for (int i = 0; i < flat_size; ++i) { const float val = input_data[i]; const float upper = 1; @@ -814,9 +443,10 @@ inline void Relu1(const float* input_data, const Dims<4>& input_dims, } } -inline void Relu6(const float* input_data, const Dims<4>& input_dims, - float* output_data, const Dims<4>& output_dims) { - const int flat_size = MatchingFlatSize(input_dims, output_dims); +inline void Relu6(const RuntimeShape& input_shape, const float* input_data, + const RuntimeShape& output_shape, float* output_data) { + gemmlowp::ScopedProfilingLabel label("Relu6 (not fused)"); + const int flat_size = MatchingFlatSize(input_shape, output_shape); for (int i = 0; i < flat_size; ++i) { const float val = input_data[i]; const float upper = 6; @@ -826,12 +456,31 @@ inline void Relu6(const float* input_data, const Dims<4>& input_dims, } } -template <FusedActivationFunctionType Ac> -void L2Normalization(const float* input_data, const Dims<4>& input_dims, - float* output_data, const Dims<4>& output_dims) { - static_assert(Ac == FusedActivationFunctionType::kNone, ""); - const int outer_size = MatchingFlatSizeSkipDim(input_dims, 0, output_dims); - const int depth = MatchingArraySize(input_dims, 0, output_dims, 0); +inline void ReluX(const tflite::ActivationParams& params, + const RuntimeShape& input_shape, const uint8* input_data, + const RuntimeShape& output_shape, uint8* output_data) { + gemmlowp::ScopedProfilingLabel label("Quantized ReluX (not fused)"); + const int flat_size = MatchingFlatSize(input_shape, output_shape); + const uint8 max_value = params.quantized_activation_max; + const uint8 min_value = params.quantized_activation_min; + for (int i = 0; i < flat_size; ++i) { + const uint8 val = input_data[i]; + const uint8 clamped = + val > max_value ? max_value : val < min_value ? 
min_value : val; + output_data[i] = clamped; + } +} + +inline void L2Normalization(const tflite::L2NormalizationParams& op_params, + const RuntimeShape& input_shape, + const float* input_data, + const RuntimeShape& output_shape, + float* output_data) { + const int trailing_dim = input_shape.DimensionsCount() - 1; + const int outer_size = + MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape); + const int depth = + MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim); for (int i = 0; i < outer_size; ++i) { float squared_l2_norm = 0; for (int c = 0; c < depth; ++c) { @@ -845,15 +494,17 @@ void L2Normalization(const float* input_data, const Dims<4>& input_dims, } } -inline void GetInvSqrtQuantizedMultiplier(int32 input, int32* output_inv_sqrt, - int* output_shift) { +inline void GetInvSqrtQuantizedMultiplierExp(int32 input, + int32* output_inv_sqrt, + int* output_shift) { *output_shift = 11; while (input >= (1 << 29)) { input /= 4; ++*output_shift; } TFLITE_DCHECK_GT(input, 0); - const unsigned max_left_shift_bits = __builtin_clz(input) - 1; + const unsigned max_left_shift_bits = + CountLeadingZeros(static_cast<uint32>(input)) - 1; const unsigned max_left_shift_bit_pairs = max_left_shift_bits / 2; const unsigned left_shift_bit_pairs = max_left_shift_bit_pairs - 1; *output_shift -= left_shift_bit_pairs; @@ -888,151 +539,146 @@ inline void GetInvSqrtQuantizedMultiplier(int32 input, int32* output_inv_sqrt, *output_inv_sqrt <<= -*output_shift; *output_shift = 0; } + // Convert right shift (right is positive) to left shift. + *output_shift *= kReverseShift; } -inline void L2Normalization(const uint8* input_data, const Dims<4>& input_dims, - int32 input_zero_point, uint8* output_data, - const Dims<4>& output_dims) { - const int depth = MatchingArraySize(input_dims, 0, output_dims, 0); - const int outer_size = MatchingFlatSizeSkipDim(input_dims, 0, output_dims); - TFLITE_DCHECK_EQ(outer_size, 1); - int32 square_l2_norm = 0; - for (int i = 0; i < depth; i++) { - int32 diff = input_data[Offset(input_dims, i, 0, 0, 0)] - input_zero_point; - square_l2_norm += diff * diff; - } - int32 inv_l2norm_multiplier; - int inv_l2norm_shift; - GetInvSqrtQuantizedMultiplier(square_l2_norm, &inv_l2norm_multiplier, - &inv_l2norm_shift); - - for (int i = 0; i < depth; i++) { - int32 diff = input_data[Offset(input_dims, i, 0, 0, 0)] - input_zero_point; - int32 rescaled_diff = MultiplyByQuantizedMultiplierSmallerThanOne( - 128 * diff, inv_l2norm_multiplier, inv_l2norm_shift); - int32 unclamped_output_val = 128 + rescaled_diff; - int32 output_val = std::min(255, std::max(0, unclamped_output_val)); - output_data[Offset(output_dims, i, 0, 0, 0)] = - static_cast<uint8>(output_val); +inline void L2Normalization(const tflite::L2NormalizationParams& op_params, + const RuntimeShape& input_shape, + const uint8* input_data, + const RuntimeShape& output_shape, + uint8* output_data) { + const int trailing_dim = input_shape.DimensionsCount() - 1; + const int depth = + MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim); + const int outer_size = + MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape); + const int32 input_zero_point = op_params.input_zero_point; + for (int i = 0; i < outer_size; ++i) { + int32 square_l2_norm = 0; + for (int c = 0; c < depth; c++) { + int32 diff = input_data[depth * i + c] - input_zero_point; + square_l2_norm += diff * diff; + } + int32 inv_l2norm_multiplier; + int inv_l2norm_shift; + GetInvSqrtQuantizedMultiplierExp(square_l2_norm, 
&inv_l2norm_multiplier, + &inv_l2norm_shift); + + for (int c = 0; c < depth; c++) { + int32 diff = input_data[depth * i + c] - input_zero_point; + int32 rescaled_diff = MultiplyByQuantizedMultiplierSmallerThanOneExp( + 128 * diff, inv_l2norm_multiplier, inv_l2norm_shift); + int32 unclamped_output_val = 128 + rescaled_diff; + int32 output_val = std::min(255, std::max(0, unclamped_output_val)); + output_data[depth * i + c] = static_cast<uint8>(output_val); + } } } -inline void Add(const float* input1_data, const Dims<4>& input1_dims, - const float* input2_data, const Dims<4>& input2_dims, - float output_activation_min, float output_activation_max, - float* output_data, const Dims<4>& output_dims) { - const int flat_size = MatchingFlatSize(input1_dims, input2_dims, output_dims); +template <typename T> +inline void Add(const ArithmeticParams& params, + const RuntimeShape& input1_shape, const T* input1_data, + const RuntimeShape& input2_shape, const T* input2_data, + const RuntimeShape& output_shape, T* output_data) { + const int flat_size = + MatchingFlatSize(input1_shape, input2_shape, output_shape); for (int i = 0; i < flat_size; ++i) { output_data[i] = ActivationFunctionWithMinMax( - input1_data[i] + input2_data[i], output_activation_min, - output_activation_max); + input1_data[i] + input2_data[i], params.quantized_activation_min, + params.quantized_activation_max); } } -// legacy, for compatibility with old checked-in code -template <FusedActivationFunctionType Ac> -void Add(const float* input1_data, const Dims<4>& input1_dims, - const float* input2_data, const Dims<4>& input2_dims, - float* output_data, const Dims<4>& output_dims) { - float output_activation_min, output_activation_max; - GetActivationMinMax(Ac, &output_activation_min, &output_activation_max); - - Add(input1_data, input1_dims, input2_data, input2_dims, output_activation_min, - output_activation_max, output_data, output_dims); -} - -template <FusedActivationFunctionType Ac> -inline void Add(int left_shift, const uint8* input1_data, - const Dims<4>& input1_dims, int32 input1_offset, - int32 input1_multiplier, int input1_shift, - const uint8* input2_data, const Dims<4>& input2_dims, - int32 input2_offset, int32 input2_multiplier, int input2_shift, - int32 output_offset, int32 output_multiplier, int output_shift, - int32 output_activation_min, int32 output_activation_max, - uint8* output_data, const Dims<4>& output_dims) { - static_assert(Ac == FusedActivationFunctionType::kNone || - Ac == FusedActivationFunctionType::kRelu || - Ac == FusedActivationFunctionType::kRelu6 || - Ac == FusedActivationFunctionType::kRelu1, - ""); - TFLITE_DCHECK_LE(output_activation_min, output_activation_max); - if (Ac == FusedActivationFunctionType::kNone) { - TFLITE_DCHECK_EQ(output_activation_min, 0); - TFLITE_DCHECK_EQ(output_activation_max, 255); - } - const int batches = - MatchingArraySize(input1_dims, 3, input2_dims, 3, output_dims, 3); - const int height = - MatchingArraySize(input1_dims, 2, input2_dims, 2, output_dims, 2); - const int width = - MatchingArraySize(input1_dims, 1, input2_dims, 1, output_dims, 1); - const int depth = - MatchingArraySize(input1_dims, 0, input2_dims, 0, output_dims, 0); - for (int b = 0; b < batches; ++b) { - for (int y = 0; y < height; ++y) { - for (int x = 0; x < width; ++x) { - for (int c = 0; c < depth; ++c) { - const int32 input1_val = - input1_offset + input1_data[Offset(input1_dims, c, x, y, b)]; - const int32 input2_val = - input2_offset + input2_data[Offset(input2_dims, c, x, y, b)]; - const int32 
shifted_input1_val = input1_val * (1 << left_shift); - const int32 shifted_input2_val = input2_val * (1 << left_shift); - const int32 scaled_input1_val = - MultiplyByQuantizedMultiplierSmallerThanOne( - shifted_input1_val, input1_multiplier, input1_shift); - const int32 scaled_input2_val = - MultiplyByQuantizedMultiplierSmallerThanOne( - shifted_input2_val, input2_multiplier, input2_shift); - const int32 raw_sum = scaled_input1_val + scaled_input2_val; - const int32 raw_output = - MultiplyByQuantizedMultiplierSmallerThanOne( - raw_sum, output_multiplier, output_shift) + - output_offset; - const int32 clamped_output = - std::min(output_activation_max, - std::max(output_activation_min, raw_output)); - output_data[Offset(output_dims, c, x, y, b)] = - static_cast<uint8>(clamped_output); - } - } - } +inline void Add(const ArithmeticParams& params, + const RuntimeShape& input1_shape, const float* input1_data, + const RuntimeShape& input2_shape, const float* input2_data, + const RuntimeShape& output_shape, float* output_data) { + const int size = MatchingFlatSize(input1_shape, input2_shape, output_shape); + for (int i = 0; i < size; i++) { + auto x = input1_data[i] + input2_data[i]; + output_data[i] = ActivationFunctionWithMinMax( + x, params.float_activation_min, params.float_activation_max); } } -template <FusedActivationFunctionType Ac> -inline void Add(const int16* input1_data, const Dims<4>& input1_dims, - int input1_shift, const int16* input2_data, - const Dims<4>& input2_dims, int input2_shift, - int16 output_activation_min, int16 output_activation_max, - int16* output_data, const Dims<4>& output_dims) { - static_assert(Ac == FusedActivationFunctionType::kNone || - Ac == FusedActivationFunctionType::kRelu || - Ac == FusedActivationFunctionType::kRelu6 || - Ac == FusedActivationFunctionType::kRelu1, - ""); - TFLITE_DCHECK_LE(output_activation_min, output_activation_max); - if (Ac == FusedActivationFunctionType::kNone) { - TFLITE_DCHECK_EQ(output_activation_min, -32768); - TFLITE_DCHECK_EQ(output_activation_max, 32767); +// Element-wise add that can often be used for inner loop of broadcast add as +// well as the non-broadcast add. 
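// ---------------------------------------------------------------------------
// Editor's illustrative sketch (not part of the original diff).  It spells out
// what AddElementwise below computes, using plain floating point: each input
// is dequantized with its own scale and zero point, the real values are added,
// and the sum is requantized to the output scale.  The integer kernel folds
// every "* scale" into a precomputed multiplier/shift pair plus a left_shift
// that gives the accumulator extra headroom.  All names here are hypothetical.
#include <algorithm>
#include <cmath>
#include <cstdint>

inline std::uint8_t ToyQuantizedAddReference(
    std::uint8_t q1, double scale1, std::int32_t zero_point1,
    std::uint8_t q2, double scale2, std::int32_t zero_point2,
    double output_scale, std::int32_t output_zero_point) {
  const double real1 = scale1 * (static_cast<std::int32_t>(q1) - zero_point1);
  const double real2 = scale2 * (static_cast<std::int32_t>(q2) - zero_point2);
  const double real_sum = real1 + real2;
  const std::int32_t q =
      static_cast<std::int32_t>(std::lround(real_sum / output_scale)) +
      output_zero_point;
  return static_cast<std::uint8_t>(std::min(255, std::max(0, q)));
}
// ---------------------------------------------------------------------------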
+inline void AddElementwise(int size, const ArithmeticParams& params, + const uint8* input1_data, const uint8* input2_data, + uint8* output_data) { + TFLITE_DCHECK_GT(params.input1_offset, -256); + TFLITE_DCHECK_GT(params.input2_offset, -256); + TFLITE_DCHECK_LT(params.input1_offset, 256); + TFLITE_DCHECK_LT(params.input2_offset, 256); + + for (int i = 0; i < size; ++i) { + const int32 input1_val = params.input1_offset + input1_data[i]; + const int32 input2_val = params.input2_offset + input2_data[i]; + const int32 shifted_input1_val = input1_val * (1 << params.left_shift); + const int32 shifted_input2_val = input2_val * (1 << params.left_shift); + const int32 scaled_input1_val = + MultiplyByQuantizedMultiplierSmallerThanOneExp( + shifted_input1_val, params.input1_multiplier, params.input1_shift); + const int32 scaled_input2_val = + MultiplyByQuantizedMultiplierSmallerThanOneExp( + shifted_input2_val, params.input2_multiplier, params.input2_shift); + const int32 raw_sum = scaled_input1_val + scaled_input2_val; + const int32 raw_output = + MultiplyByQuantizedMultiplierSmallerThanOneExp( + raw_sum, params.output_multiplier, params.output_shift) + + params.output_offset; + const int32 clamped_output = + std::min(params.quantized_activation_max, + std::max(params.quantized_activation_min, raw_output)); + output_data[i] = static_cast<uint8>(clamped_output); } +} - const int flat_size = MatchingFlatSize(output_dims, input1_dims, input2_dims); +inline void Add(const ArithmeticParams& params, + const RuntimeShape& input1_shape, const uint8* input1_data, + const RuntimeShape& input2_shape, const uint8* input2_data, + const RuntimeShape& output_shape, uint8* output_data) { + TFLITE_DCHECK_LE(params.quantized_activation_min, + params.quantized_activation_max); + const int flat_size = + MatchingFlatSize(input1_shape, input2_shape, output_shape); + + TFLITE_DCHECK_GT(params.input1_offset, -256); + TFLITE_DCHECK_GT(params.input2_offset, -256); + TFLITE_DCHECK_LT(params.input1_offset, 256); + TFLITE_DCHECK_LT(params.input2_offset, 256); + AddElementwise(flat_size, params, input1_data, input2_data, output_data); +} - TFLITE_DCHECK(input1_shift == 0 || input2_shift == 0); - TFLITE_DCHECK_GE(input1_shift, 0); - TFLITE_DCHECK_GE(input2_shift, 0); +inline void Add(const ArithmeticParams& params, + const RuntimeShape& input1_shape, const int16* input1_data, + const RuntimeShape& input2_shape, const int16* input2_data, + const RuntimeShape& output_shape, int16* output_data) { + TFLITE_DCHECK_LE(params.quantized_activation_min, + params.quantized_activation_max); + + const int input1_shift = params.input1_shift; + const int flat_size = + MatchingFlatSize(output_shape, input1_shape, input2_shape); + const int16 output_activation_min = params.quantized_activation_min; + const int16 output_activation_max = params.quantized_activation_max; + + TFLITE_DCHECK(input1_shift == 0 || params.input2_shift == 0); + TFLITE_DCHECK_LE(input1_shift, 0); + TFLITE_DCHECK_LE(params.input2_shift, 0); const int16* not_shift_input = input1_shift == 0 ? input1_data : input2_data; const int16* shift_input = input1_shift == 0 ? input2_data : input1_data; - const int input_shift = input1_shift == 0 ? input2_shift : input1_shift; + const int input_right_shift = + input1_shift == 0 ? -params.input2_shift : -input1_shift; for (int i = 0; i < flat_size; i++) { // F0 uses 0 integer bits, range [-1, 1]. 
using F0 = gemmlowp::FixedPoint<std::int16_t, 0>; F0 input_ready_scaled = F0::FromRaw(not_shift_input[i]); - F0 scaled_input = - F0::FromRaw(gemmlowp::RoundingDivideByPOT(shift_input[i], input_shift)); + F0 scaled_input = F0::FromRaw( + gemmlowp::RoundingDivideByPOT(shift_input[i], input_right_shift)); F0 result = gemmlowp::SaturatingAdd(scaled_input, input_ready_scaled); const int16 raw_output = result.raw(); const int16 clamped_output = std::min( @@ -1045,16 +691,24 @@ inline void Add(const int16* input1_data, const Dims<4>& input1_dims, // dimensionality if the runtime code does a single loop over one dimension // that handles broadcasting as the base case. The code generator would then // generate max(D1, D2) nested for loops. -template <typename T> -void BroadcastAdd(const T* input1_data, const Dims<4>& input1_dims, - const T* input2_data, const Dims<4>& input2_dims, - T output_activation_min, T output_activation_max, - T* output_data, const Dims<4>& output_dims) { - gemmlowp::ScopedProfilingLabel label("BroadcastAdd"); - +// TODO(benoitjacob): BroadcastAdd is intentionally duplicated from +// reference_ops.h. Once an optimized version is implemented and NdArrayDesc<T> +// is no longer referenced in this file, move NdArrayDesc<T> from types.h to +// reference_ops.h. +inline void BroadcastAdd4DSlow(const ArithmeticParams& params, + const RuntimeShape& input1_shape, + const float* input1_data, + const RuntimeShape& input2_shape, + const float* input2_data, + const RuntimeShape& output_shape, + float* output_data) { + gemmlowp::ScopedProfilingLabel label("BroadcastAdd4DSlow/float"); NdArrayDesc<4> desc1; NdArrayDesc<4> desc2; - NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2); + NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, + &desc2); + const RuntimeShape extended_output_shape = + RuntimeShape::ExtendedShape(4, output_shape); // In Tensorflow, the dimensions are canonically named (batch_number, row, // col, channel), with extents (batches, height, width, depth), with the @@ -1067,49 +721,77 @@ void BroadcastAdd(const T* input1_data, const Dims<4>& input1_dims, // We name our variables by their Tensorflow convention, but generate C code // nesting loops such that the innermost loop has the smallest stride for the // best cache behavior. 
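// ---------------------------------------------------------------------------
// Editor's illustrative sketch (not part of the original diff).  The int16 Add
// above works in Q0.15 fixed point: one operand is first scaled down by a
// power of two to bring it onto the other's scale, the two are added with
// saturation (gemmlowp::SaturatingAdd), and the result is clamped to the fused
// activation range.  A stripped-down scalar version of the saturating step,
// with a hypothetical name:
#include <algorithm>
#include <cstdint>

inline std::int16_t ToySaturatingAddQ15(std::int16_t a, std::int16_t b) {
  const std::int32_t sum =
      static_cast<std::int32_t>(a) + static_cast<std::int32_t>(b);
  return static_cast<std::int16_t>(std::min(32767, std::max(-32768, sum)));
}
// ---------------------------------------------------------------------------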
- for (int b = 0; b < ArraySize(output_dims, 3); ++b) { - for (int y = 0; y < ArraySize(output_dims, 2); ++y) { - for (int x = 0; x < ArraySize(output_dims, 1); ++x) { - for (int c = 0; c < ArraySize(output_dims, 0); ++c) { - output_data[Offset(output_dims, c, x, y, b)] = + for (int b = 0; b < extended_output_shape.Dims(0); ++b) { + for (int y = 0; y < extended_output_shape.Dims(1); ++y) { + for (int x = 0; x < extended_output_shape.Dims(2); ++x) { + for (int c = 0; c < extended_output_shape.Dims(3); ++c) { + output_data[Offset(extended_output_shape, b, y, x, c)] = ActivationFunctionWithMinMax( - input1_data[SubscriptToIndex(desc1, c, x, y, b)] + - input2_data[SubscriptToIndex(desc2, c, x, y, b)], - output_activation_min, output_activation_max); + input1_data[SubscriptToIndex(desc1, b, y, x, c)] + + input2_data[SubscriptToIndex(desc2, b, y, x, c)], + params.float_activation_min, params.float_activation_max); } } } } } -// legacy, for compatibility with old checked-in code -template <FusedActivationFunctionType Ac, typename T> -void BroadcastAdd(const T* input1_data, const Dims<4>& input1_dims, - const T* input2_data, const Dims<4>& input2_dims, - T* output_data, const Dims<4>& output_dims) { - T output_activation_min, output_activation_max; - GetActivationMinMax(Ac, &output_activation_min, &output_activation_max); +inline void BroadcastAdd4DSlow(const ArithmeticParams& params, + const RuntimeShape& input1_shape, + const int32* input1_data, + const RuntimeShape& input2_shape, + const int32* input2_data, + const RuntimeShape& output_shape, + int32* output_data) { + gemmlowp::ScopedProfilingLabel label("BroadcastAdd4DSlow/int32"); + NdArrayDesc<4> desc1; + NdArrayDesc<4> desc2; + NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, + &desc2); + const RuntimeShape extended_output_shape = + RuntimeShape::ExtendedShape(4, output_shape); - BroadcastAdd(input1_data, input1_dims, input2_data, input2_dims, - output_activation_min, output_activation_max, output_data, - output_dims); + // In Tensorflow, the dimensions are canonically named (batch_number, row, + // col, channel), with extents (batches, height, width, depth), with the + // trailing dimension changing most rapidly (channels has the smallest stride, + // typically 1 element). + // + // In generated C code, we store arrays with the dimensions reversed. The + // first dimension has smallest stride. + // + // We name our variables by their Tensorflow convention, but generate C code + // nesting loops such that the innermost loop has the smallest stride for the + // best cache behavior. 
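// ---------------------------------------------------------------------------
// Editor's illustrative sketch (not part of the original diff).  The
// NdArrayDesc / SubscriptToIndex machinery used by the BroadcastAdd4DSlow
// variants reduces to one idea: a dimension of extent 1 gets stride 0, so the
// same element is re-read for every output index along that dimension.  A
// hypothetical 4-D version of that indexing:
struct ToyDesc4 {
  int extents[4];
  int strides[4];
};

inline ToyDesc4 ToyBroadcastDesc(const int dims[4]) {
  ToyDesc4 desc;
  int stride = 1;
  for (int i = 3; i >= 0; --i) {  // Trailing dimension is innermost (stride 1).
    desc.extents[i] = dims[i];
    desc.strides[i] = (dims[i] == 1) ? 0 : stride;
    stride *= dims[i];
  }
  return desc;
}

inline int ToySubscriptToIndex(const ToyDesc4& desc, int i0, int i1, int i2,
                               int i3) {
  return i0 * desc.strides[0] + i1 * desc.strides[1] + i2 * desc.strides[2] +
         i3 * desc.strides[3];
}
// ---------------------------------------------------------------------------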
+ for (int b = 0; b < extended_output_shape.Dims(0); ++b) { + for (int y = 0; y < extended_output_shape.Dims(1); ++y) { + for (int x = 0; x < extended_output_shape.Dims(2); ++x) { + for (int c = 0; c < extended_output_shape.Dims(3); ++c) { + output_data[Offset(extended_output_shape, b, y, x, c)] = + ActivationFunctionWithMinMax( + input1_data[SubscriptToIndex(desc1, b, y, x, c)] + + input2_data[SubscriptToIndex(desc2, b, y, x, c)], + params.quantized_activation_min, + params.quantized_activation_max); + } + } + } + } } -inline void BroadcastAdd(int left_shift, const uint8* input1_data, - const Dims<4>& input1_dims, int32 input1_offset, - int32 input1_multiplier, int input1_shift, - const uint8* input2_data, const Dims<4>& input2_dims, - int32 input2_offset, int32 input2_multiplier, - int input2_shift, int32 output_offset, - int32 output_multiplier, int output_shift, - int32 output_activation_min, - int32 output_activation_max, uint8* output_data, - const Dims<4>& output_dims) { - gemmlowp::ScopedProfilingLabel label("BroadcastAdd/8bit"); - +inline void BroadcastAdd4DSlow(const ArithmeticParams& params, + const RuntimeShape& input1_shape, + const uint8* input1_data, + const RuntimeShape& input2_shape, + const uint8* input2_data, + const RuntimeShape& output_shape, + uint8* output_data) { + gemmlowp::ScopedProfilingLabel label("BroadcastAdd4DSlow/uint8"); NdArrayDesc<4> desc1; NdArrayDesc<4> desc2; - NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2); + NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, + &desc2); + const RuntimeShape extended_output_shape = + RuntimeShape::ExtendedShape(4, output_shape); // In Tensorflow, the dimensions are canonically named (batch_number, row, // col, channel), with extents (batches, height, width, depth), with the @@ -1122,31 +804,37 @@ inline void BroadcastAdd(int left_shift, const uint8* input1_data, // We name our variables by their Tensorflow convention, but generate C code // nesting loops such that the innermost loop has the smallest stride for the // best cache behavior. 
- for (int b = 0; b < ArraySize(output_dims, 3); ++b) { - for (int y = 0; y < ArraySize(output_dims, 2); ++y) { - for (int x = 0; x < ArraySize(output_dims, 1); ++x) { - for (int c = 0; c < ArraySize(output_dims, 0); ++c) { + for (int b = 0; b < extended_output_shape.Dims(0); ++b) { + for (int y = 0; y < extended_output_shape.Dims(1); ++y) { + for (int x = 0; x < extended_output_shape.Dims(2); ++x) { + for (int c = 0; c < extended_output_shape.Dims(3); ++c) { const int32 input1_val = - input1_offset + input1_data[SubscriptToIndex(desc1, c, x, y, b)]; + params.input1_offset + + input1_data[SubscriptToIndex(desc1, b, y, x, c)]; const int32 input2_val = - input2_offset + input2_data[SubscriptToIndex(desc2, c, x, y, b)]; - const int32 shifted_input1_val = input1_val * (1 << left_shift); - const int32 shifted_input2_val = input2_val * (1 << left_shift); + params.input2_offset + + input2_data[SubscriptToIndex(desc2, b, y, x, c)]; + const int32 shifted_input1_val = + input1_val * (1 << params.left_shift); + const int32 shifted_input2_val = + input2_val * (1 << params.left_shift); const int32 scaled_input1_val = - MultiplyByQuantizedMultiplierSmallerThanOne( - shifted_input1_val, input1_multiplier, input1_shift); + MultiplyByQuantizedMultiplierSmallerThanOneExp( + shifted_input1_val, params.input1_multiplier, + params.input1_shift); const int32 scaled_input2_val = - MultiplyByQuantizedMultiplierSmallerThanOne( - shifted_input2_val, input2_multiplier, input2_shift); + MultiplyByQuantizedMultiplierSmallerThanOneExp( + shifted_input2_val, params.input2_multiplier, + params.input2_shift); const int32 raw_sum = scaled_input1_val + scaled_input2_val; const int32 raw_output = - MultiplyByQuantizedMultiplierSmallerThanOne( - raw_sum, output_multiplier, output_shift) + - output_offset; + MultiplyByQuantizedMultiplierSmallerThanOneExp( + raw_sum, params.output_multiplier, params.output_shift) + + params.output_offset; const int32 clamped_output = - std::min(output_activation_max, - std::max(output_activation_min, raw_output)); - output_data[Offset(output_dims, c, x, y, b)] = + std::min(params.quantized_activation_max, + std::max(params.quantized_activation_min, raw_output)); + output_data[Offset(extended_output_shape, b, y, x, c)] = static_cast<uint8>(clamped_output); } } @@ -1154,120 +842,73 @@ inline void BroadcastAdd(int left_shift, const uint8* input1_data, } } -inline void BroadcastAddFivefold( - int y0, int y1, int y2, int y3, int y4, int left_shift, - const uint8* input1_data, const Dims<4>& input1_dims, int32 input1_offset, - int32 input1_multiplier, int input1_shift, const uint8* input2_data, - const Dims<4>& input2_dims, int32 input2_offset, int32 input2_multiplier, - int input2_shift, int32 output_offset, int32 output_multiplier, - int output_shift, int32 output_activation_min, int32 output_activation_max, - uint8* output_data, const Dims<4>& output_dims) { - gemmlowp::ScopedProfilingLabel label("BroadcastAddFivefold/8bit"); - - int sb1 = y0; - int sa2 = y0; - int sb2 = y0 * y1; - int sa3 = y0 * y2; - int sa4 = y0 * y2 * y3; - int sb4 = y0 * y1 * y2; - +inline void BroadcastAddFivefold(const ArithmeticParams& unswitched_params, + const RuntimeShape& unswitched_input1_shape, + const uint8* unswitched_input1_data, + const RuntimeShape& unswitched_input2_shape, + const uint8* unswitched_input2_data, + const RuntimeShape& output_shape, + uint8* output_data) { + ArithmeticParams switched_params = unswitched_params; + switched_params.input1_offset = unswitched_params.input2_offset; + 
switched_params.input1_multiplier = unswitched_params.input2_multiplier; + switched_params.input1_shift = unswitched_params.input2_shift; + switched_params.input2_offset = unswitched_params.input1_offset; + switched_params.input2_multiplier = unswitched_params.input1_multiplier; + switched_params.input2_shift = unswitched_params.input1_shift; + + const bool use_unswitched = + unswitched_params.broadcast_category == + tflite::BroadcastableOpCategory::kFirstInputBroadcastsFast; + + const ArithmeticParams& params = + use_unswitched ? unswitched_params : switched_params; + const uint8* input1_data = + use_unswitched ? unswitched_input1_data : unswitched_input2_data; + const uint8* input2_data = + use_unswitched ? unswitched_input2_data : unswitched_input1_data; + + // Fivefold nested loops. The second input resets its position for each + // iteration of the second loop. The first input resets its position at the + // beginning of the fourth loop. The innermost loop is an elementwise add of + // sections of the arrays. uint8* output_data_ptr = output_data; - for (int i4 = 0; i4 < y4; ++i4) { - for (int i3 = 0; i3 < y3; ++i3) { + const uint8* input1_data_ptr = input1_data; + const uint8* input2_data_reset = input2_data; + int y0 = params.broadcast_shape[0]; + int y1 = params.broadcast_shape[1]; + int y2 = params.broadcast_shape[2]; + int y3 = params.broadcast_shape[3]; + int y4 = params.broadcast_shape[4]; + for (int i0 = 0; i0 < y0; ++i0) { + const uint8* input2_data_ptr; + for (int i1 = 0; i1 < y1; ++i1) { + input2_data_ptr = input2_data_reset; for (int i2 = 0; i2 < y2; ++i2) { - for (int i1 = 0; i1 < y1; ++i1) { - for (int i0 = 0; i0 < y0; ++i0) { - const int32 input1_val = - input1_offset + - input1_data[i4 * sa4 + i3 * sa3 + i2 * sa2 + i0]; - const int32 input2_val = - input2_offset + - input2_data[i4 * sb4 + i2 * sb2 + i1 * sb1 + i0]; - const int32 shifted_input1_val = input1_val * (1 << left_shift); - const int32 shifted_input2_val = input2_val * (1 << left_shift); - const int32 scaled_input1_val = - MultiplyByQuantizedMultiplierSmallerThanOne( - shifted_input1_val, input1_multiplier, input1_shift); - const int32 scaled_input2_val = - MultiplyByQuantizedMultiplierSmallerThanOne( - shifted_input2_val, input2_multiplier, input2_shift); - const int32 raw_sum = scaled_input1_val + scaled_input2_val; - const int32 raw_output = - MultiplyByQuantizedMultiplierSmallerThanOne( - raw_sum, output_multiplier, output_shift) + - output_offset; - const int32 clamped_output = - std::min(output_activation_max, - std::max(output_activation_min, raw_output)); - *output_data_ptr = static_cast<uint8>(clamped_output); - ++output_data_ptr; - } + for (int i3 = 0; i3 < y3; ++i3) { + AddElementwise(y4, params, input1_data_ptr, input2_data_ptr, + output_data_ptr); + input2_data_ptr += y4; + output_data_ptr += y4; } + input1_data_ptr += y4; } } + input2_data_reset = input2_data_ptr; } } -template <FusedActivationFunctionType Ac> -inline void BroadcastAdd(int left_shift, const uint8* input1_data, - const Dims<4>& input1_dims, int32 input1_offset, - int32 input1_multiplier, int input1_shift, - const uint8* input2_data, const Dims<4>& input2_dims, - int32 input2_offset, int32 input2_multiplier, - int input2_shift, int32 output_offset, - int32 output_multiplier, int output_shift, - int32 output_activation_min, - int32 output_activation_max, uint8* output_data, - const Dims<4>& output_dims) { - static_assert(Ac == FusedActivationFunctionType::kNone || - Ac == FusedActivationFunctionType::kRelu || - Ac == 
FusedActivationFunctionType::kRelu6 || - Ac == FusedActivationFunctionType::kRelu1, - ""); - TFLITE_DCHECK_LE(output_activation_min, output_activation_max); - if (Ac == FusedActivationFunctionType::kNone) { - TFLITE_DCHECK_EQ(output_activation_min, 0); - TFLITE_DCHECK_EQ(output_activation_max, 255); - } - BroadcastAdd(left_shift, input1_data, input1_dims, input1_offset, - input1_multiplier, input1_shift, input2_data, input2_dims, - input2_offset, input2_multiplier, input2_shift, output_offset, - output_multiplier, output_shift, output_activation_min, - output_activation_max, output_data, output_dims); -} - -template <FusedActivationFunctionType Ac> -inline void BroadcastAddFivefold( - int y0, int y1, int y2, int y3, int y4, int left_shift, - const uint8* input1_data, const Dims<4>& input1_dims, int32 input1_offset, - int32 input1_multiplier, int input1_shift, const uint8* input2_data, - const Dims<4>& input2_dims, int32 input2_offset, int32 input2_multiplier, - int input2_shift, int32 output_offset, int32 output_multiplier, - int output_shift, int32 output_activation_min, int32 output_activation_max, - uint8* output_data, const Dims<4>& output_dims) { - static_assert(Ac == FusedActivationFunctionType::kNone || - Ac == FusedActivationFunctionType::kRelu || - Ac == FusedActivationFunctionType::kRelu6 || - Ac == FusedActivationFunctionType::kRelu1, - ""); - TFLITE_DCHECK_LE(output_activation_min, output_activation_max); - if (Ac == FusedActivationFunctionType::kNone) { - TFLITE_DCHECK_EQ(output_activation_min, 0); - TFLITE_DCHECK_EQ(output_activation_max, 255); - } - BroadcastAddFivefold(y0, y1, y2, y3, y4, left_shift, input1_data, input1_dims, - input1_offset, input1_multiplier, input1_shift, - input2_data, input2_dims, input2_offset, - input2_multiplier, input2_shift, output_offset, - output_multiplier, output_shift, output_activation_min, - output_activation_max, output_data, output_dims); -} - -inline void Mul(const float* input1_data, const Dims<4>& input1_dims, - const float* input2_data, const Dims<4>& input2_dims, - float output_activation_min, float output_activation_max, - float* output_data, const Dims<4>& output_dims) { - const int flat_size = MatchingFlatSize(input1_dims, input2_dims, output_dims); +template <typename T> +inline void Mul(const ArithmeticParams& params, + const RuntimeShape& input1_shape, const T* input1_data, + const RuntimeShape& input2_shape, const T* input2_data, + const RuntimeShape& output_shape, T* output_data) { + T output_activation_min; + T output_activation_max; + GetActivationParams(params, &output_activation_min, &output_activation_max); + + const int flat_size = + MatchingFlatSize(input1_shape, input2_shape, output_shape); for (int i = 0; i < flat_size; ++i) { output_data[i] = ActivationFunctionWithMinMax( input1_data[i] * input2_data[i], output_activation_min, @@ -1275,52 +916,57 @@ inline void Mul(const float* input1_data, const Dims<4>& input1_dims, } } -// legacy, for compatibility with old checked-in code -template <FusedActivationFunctionType Ac> -void Mul(const float* input1_data, const Dims<4>& input1_dims, - const float* input2_data, const Dims<4>& input2_dims, - float* output_data, const Dims<4>& output_dims) { - float output_activation_min, output_activation_max; - GetActivationMinMax(Ac, &output_activation_min, &output_activation_max); - - Mul(input1_data, input1_dims, input2_data, input2_dims, output_activation_min, - output_activation_max, output_data, output_dims); -} - // TODO(jiawen): We can implement BroadcastMul on buffers of 
arbitrary // dimensionality if the runtime code does a single loop over one dimension // that handles broadcasting as the base case. The code generator would then // generate max(D1, D2) nested for loops. +// TODO(benoitjacob): BroadcastMul is intentionally duplicated from +// reference_ops.h. Once an optimized version is implemented and NdArrayDesc<T> +// is no longer referenced in this file, move NdArrayDesc<T> from types.h to +// reference_ops.h. template <typename T> -void BroadcastMul(const T* input1_data, const Dims<4>& input1_dims, - const T* input2_data, const Dims<4>& input2_dims, - T output_activation_min, T output_activation_max, - T* output_data, const Dims<4>& output_dims) { - gemmlowp::ScopedProfilingLabel label("BroadcastMul"); +void BroadcastMul4DSlow(const ArithmeticParams& params, + const RuntimeShape& unextended_input1_shape, + const T* input1_data, + const RuntimeShape& unextended_input2_shape, + const T* input2_data, + const RuntimeShape& unextended_output_shape, + T* output_data) { + gemmlowp::ScopedProfilingLabel label("BroadcastMul4DSlow"); + T output_activation_min; + T output_activation_max; + GetActivationParams(params, &output_activation_min, &output_activation_max); + + TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 4); + TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), 4); + TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4); + const RuntimeShape output_shape = + RuntimeShape::ExtendedShape(4, unextended_output_shape); NdArrayDesc<4> desc1; NdArrayDesc<4> desc2; - NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2); + NdArrayDescsForElementwiseBroadcast(unextended_input1_shape, + unextended_input2_shape, &desc1, &desc2); // In Tensorflow, the dimensions are canonically named (batch_number, row, // col, channel), with extents (batches, height, width, depth), with the - // trailing dimension changing most rapidly (channels has the smallest - // stride, typically 1 element). + // trailing dimension changing most rapidly (channels has the smallest stride, + // typically 1 element). // // In generated C code, we store arrays with the dimensions reversed. The // first dimension has smallest stride. // // We name our variables by their Tensorflow convention, but generate C code - // nesting loops such that the innermost loop has the smallest stride for - // the best cache behavior. - for (int b = 0; b < ArraySize(output_dims, 3); ++b) { - for (int y = 0; y < ArraySize(output_dims, 2); ++y) { - for (int x = 0; x < ArraySize(output_dims, 1); ++x) { - for (int c = 0; c < ArraySize(output_dims, 0); ++c) { - output_data[Offset(output_dims, c, x, y, b)] = + // nesting loops such that the innermost loop has the smallest stride for the + // best cache behavior. 
+ for (int b = 0; b < output_shape.Dims(0); ++b) { + for (int y = 0; y < output_shape.Dims(1); ++y) { + for (int x = 0; x < output_shape.Dims(2); ++x) { + for (int c = 0; c < output_shape.Dims(3); ++c) { + output_data[Offset(output_shape, b, y, x, c)] = ActivationFunctionWithMinMax( - input1_data[SubscriptToIndex(desc1, c, x, y, b)] * - input2_data[SubscriptToIndex(desc2, c, x, y, b)], + input1_data[SubscriptToIndex(desc1, b, y, x, c)] * + input2_data[SubscriptToIndex(desc2, b, y, x, c)], output_activation_min, output_activation_max); } } @@ -1328,59 +974,127 @@ void BroadcastMul(const T* input1_data, const Dims<4>& input1_dims, } } -// legacy, for compatibility with old checked-in code -template <FusedActivationFunctionType Ac, typename T> -void BroadcastMul(const T* input1_data, const Dims<4>& input1_dims, - const T* input2_data, const Dims<4>& input2_dims, - T* output_data, const Dims<4>& output_dims) { - T output_activation_min, output_activation_max; - GetActivationMinMax(Ac, &output_activation_min, &output_activation_max); +// Element-wise mul that can often be used for inner loop of broadcast Mul as +// well as the non-broadcast Mul. +inline void MulElementwise(int size, const ArithmeticParams& params, + const uint8* input1_data, const uint8* input2_data, + uint8* output_data) { + for (int i = 0; i < size; ++i) { + const int32 input1_val = params.input1_offset + input1_data[i]; + const int32 input2_val = params.input2_offset + input2_data[i]; + const int32 unclamped_result = + params.output_offset + + MultiplyByQuantizedMultiplierSmallerThanOneExp(input1_val * input2_val, + params.output_multiplier, + params.output_shift); + const int32 clamped_output = + std::min(params.quantized_activation_max, + std::max(params.quantized_activation_min, unclamped_result)); + output_data[i] = static_cast<uint8>(clamped_output); + } +} + +inline void Mul(const ArithmeticParams& params, + const RuntimeShape& input1_shape, const uint8* input1_data, + const RuntimeShape& input2_shape, const uint8* input2_data, + const RuntimeShape& output_shape, uint8* output_data) { + TFLITE_DCHECK_LE(params.quantized_activation_min, + params.quantized_activation_max); + gemmlowp::ScopedProfilingLabel label("Mul/8bit"); + const int flat_size = + MatchingFlatSize(input1_shape, input2_shape, output_shape); + + MulElementwise(flat_size, params, input1_data, input2_data, output_data); +} - BroadcastMul(input1_data, input1_dims, input2_data, input2_dims, - output_activation_min, output_activation_max, output_data, - output_dims); +inline void BroadcastMulFivefold(const ArithmeticParams& unswitched_params, + const RuntimeShape& unswitched_input1_shape, + const uint8* unswitched_input1_data, + const RuntimeShape& unswitched_input2_shape, + const uint8* unswitched_input2_data, + const RuntimeShape& output_shape, + uint8* output_data) { + ArithmeticParams switched_params = unswitched_params; + switched_params.input1_offset = unswitched_params.input2_offset; + switched_params.input2_offset = unswitched_params.input1_offset; + + const bool use_unswitched = + unswitched_params.broadcast_category == + tflite::BroadcastableOpCategory::kFirstInputBroadcastsFast; + + const ArithmeticParams& params = + use_unswitched ? unswitched_params : switched_params; + const uint8* input1_data = + use_unswitched ? unswitched_input1_data : unswitched_input2_data; + const uint8* input2_data = + use_unswitched ? unswitched_input2_data : unswitched_input1_data; + + // Fivefold nested loops. 
The second input resets its position for each + // iteration of the second loop. The first input resets its position at the + // beginning of the fourth loop. The innermost loop is an elementwise Mul of + // sections of the arrays. + uint8* output_data_ptr = output_data; + const uint8* input1_data_ptr = input1_data; + const uint8* input2_data_reset = input2_data; + int y0 = params.broadcast_shape[0]; + int y1 = params.broadcast_shape[1]; + int y2 = params.broadcast_shape[2]; + int y3 = params.broadcast_shape[3]; + int y4 = params.broadcast_shape[4]; + for (int i0 = 0; i0 < y0; ++i0) { + const uint8* input2_data_ptr; + for (int i1 = 0; i1 < y1; ++i1) { + input2_data_ptr = input2_data_reset; + for (int i2 = 0; i2 < y2; ++i2) { + for (int i3 = 0; i3 < y3; ++i3) { + MulElementwise(y4, params, input1_data_ptr, input2_data_ptr, + output_data_ptr); + input2_data_ptr += y4; + output_data_ptr += y4; + } + input1_data_ptr += y4; + } + } + input2_data_reset = input2_data_ptr; + } } -inline void BroadcastMul(const uint8* input1_data, const Dims<4>& input1_dims, - int32 input1_offset, const uint8* input2_data, - const Dims<4>& input2_dims, int32 input2_offset, - int32 output_offset, int32 output_multiplier, - int output_shift, int32 output_activation_min, - int32 output_activation_max, uint8* output_data, - const Dims<4>& output_dims) { - gemmlowp::ScopedProfilingLabel label("BroadcastMul/8bit"); +inline void BroadcastMul4DSlow(const ArithmeticParams& params, + const RuntimeShape& input1_shape, + const uint8* input1_data, + const RuntimeShape& input2_shape, + const uint8* input2_data, + const RuntimeShape& output_shape, + uint8* output_data) { + gemmlowp::ScopedProfilingLabel label("BroadcastMul4DSlow/8bit"); NdArrayDesc<4> desc1; NdArrayDesc<4> desc2; - NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2); - - // In Tensorflow, the dimensions are canonically named (batch_number, row, - // col, channel), with extents (batches, height, width, depth), with the - // trailing dimension changing most rapidly (channels has the smallest - // stride, typically 1 element). - // - // In generated C code, we store arrays with the dimensions reversed. The - // first dimension has smallest stride. - // - // We name our variables by their Tensorflow convention, but generate C code - // nesting loops such that the innermost loop has the smallest stride for - // the best cache behavior. - for (int b = 0; b < ArraySize(output_dims, 3); ++b) { - for (int y = 0; y < ArraySize(output_dims, 2); ++y) { - for (int x = 0; x < ArraySize(output_dims, 1); ++x) { - for (int c = 0; c < ArraySize(output_dims, 0); ++c) { + // The input shapes are extended as part of NdArrayDesc initialization. 
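// ---------------------------------------------------------------------------
// Editor's illustrative sketch (not part of the original diff).  It mirrors
// the loop structure shared by BroadcastAddFivefold and BroadcastMulFivefold
// above, written for plain floats: y0..y4 are precomputed broadcast extents,
// input2 rewinds to the start of its current block on every iteration of the
// second loop, input1 advances one y4-length run per iteration of the third
// loop, and the innermost work is an ordinary elementwise loop of length y4.
// The function name is hypothetical.
inline void ToyBroadcastFivefoldAdd(const float* input1, const float* input2,
                                    float* output, int y0, int y1, int y2,
                                    int y3, int y4) {
  const float* input1_ptr = input1;
  const float* input2_reset = input2;
  for (int i0 = 0; i0 < y0; ++i0) {
    const float* input2_ptr = input2_reset;
    for (int i1 = 0; i1 < y1; ++i1) {
      input2_ptr = input2_reset;
      for (int i2 = 0; i2 < y2; ++i2) {
        for (int i3 = 0; i3 < y3; ++i3) {
          for (int i = 0; i < y4; ++i) {
            output[i] = input1_ptr[i] + input2_ptr[i];
          }
          input2_ptr += y4;
          output += y4;
        }
        input1_ptr += y4;
      }
    }
    input2_reset = input2_ptr;
  }
}
// ---------------------------------------------------------------------------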
+ NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, + &desc2); + const RuntimeShape extended_output_shape = + RuntimeShape::ExtendedShape(4, output_shape); + + for (int b = 0; b < extended_output_shape.Dims(0); ++b) { + for (int y = 0; y < extended_output_shape.Dims(1); ++y) { + for (int x = 0; x < extended_output_shape.Dims(2); ++x) { + for (int c = 0; c < extended_output_shape.Dims(3); ++c) { const int32 input1_val = - input1_offset + input1_data[SubscriptToIndex(desc1, c, x, y, b)]; + params.input1_offset + + input1_data[SubscriptToIndex(desc1, b, y, x, c)]; const int32 input2_val = - input2_offset + input2_data[SubscriptToIndex(desc2, c, x, y, b)]; + params.input2_offset + + input2_data[SubscriptToIndex(desc2, b, y, x, c)]; const int32 unclamped_result = - output_offset + - MultiplyByQuantizedMultiplierSmallerThanOne( - input1_val * input2_val, output_multiplier, output_shift); - const int32 clamped_output = - std::min(output_activation_max, - std::max(output_activation_min, unclamped_result)); - output_data[Offset(output_dims, c, x, y, b)] = + params.output_offset + + MultiplyByQuantizedMultiplierSmallerThanOneExp( + input1_val * input2_val, params.output_multiplier, + params.output_shift); + const int32 clamped_output = std::min( + params.quantized_activation_max, + std::max(params.quantized_activation_min, unclamped_result)); + output_data[Offset(extended_output_shape, b, y, x, c)] = static_cast<uint8>(clamped_output); } } @@ -1388,12 +1102,14 @@ inline void BroadcastMul(const uint8* input1_data, const Dims<4>& input1_dims, } } -inline void Mul(const int16* input1_data, const Dims<4>& input1_dims, - const int16* input2_data, const Dims<4>& input2_dims, - int16* output_data, const Dims<4>& output_dims) { +inline void Mul(const ArithmeticParams& params, + const RuntimeShape& input1_shape, const int16* input1_data, + const RuntimeShape& input2_shape, const int16* input2_data, + const RuntimeShape& output_shape, int16* output_data) { gemmlowp::ScopedProfilingLabel label("Mul/Int16"); - const int flat_size = MatchingFlatSize(output_dims, input1_dims, input2_dims); + const int flat_size = + MatchingFlatSize(input1_shape, input2_shape, output_shape); for (int i = 0; i < flat_size; i++) { // F0 uses 0 integer bits, range [-1, 1]. @@ -1405,15 +1121,18 @@ inline void Mul(const int16* input1_data, const Dims<4>& input1_dims, } } -inline void Mul(const int16* input1_data, const Dims<4>& input1_dims, - const int16* input2_data, const Dims<4>& input2_dims, - int32 output_offset, int32 output_activation_min, - int32 output_activation_max, uint8* output_data, - const Dims<4>& output_dims) { +inline void Mul(const ArithmeticParams& params, + const RuntimeShape& input1_shape, const int16* input1_data, + const RuntimeShape& input2_shape, const int16* input2_data, + const RuntimeShape& output_shape, uint8* output_data) { gemmlowp::ScopedProfilingLabel label("Mul/Int16Uint8"); + int32 output_offset = params.output_offset; + int32 output_activation_min = params.quantized_activation_min; + int32 output_activation_max = params.quantized_activation_max; TFLITE_DCHECK_LE(output_activation_min, output_activation_max); - const int flat_size = MatchingFlatSize(output_dims, input1_dims, input2_dims); + const int flat_size = + MatchingFlatSize(input1_shape, input2_shape, output_shape); for (int i = 0; i < flat_size; i++) { // F0 uses 0 integer bits, range [-1, 1]. 
@@ -1431,35 +1150,32 @@ inline void Mul(const int16* input1_data, const Dims<4>& input1_dims, } } -// legacy, for compatibility with old checked-in code -template <FusedActivationFunctionType Ac> -inline void BroadcastMul(const uint8* input1_data, const Dims<4>& input1_dims, - int32 input1_offset, const uint8* input2_data, - const Dims<4>& input2_dims, int32 input2_offset, - int32 output_offset, int32 output_multiplier, - int output_shift, int32 output_activation_min, - int32 output_activation_max, uint8* output_data, - const Dims<4>& output_dims) { - BroadcastMul(input1_data, input1_dims, input1_offset, input2_data, - input2_dims, input2_offset, output_offset, output_multiplier, - output_shift, output_activation_min, output_activation_max, - output_data, output_dims); -} - // TODO(jiawen): We can implement BroadcastDiv on buffers of arbitrary // dimensionality if the runtime code does a single loop over one dimension // that handles broadcasting as the base case. The code generator would then // generate max(D1, D2) nested for loops. template <typename T> -void BroadcastDiv(const T* input1_data, const Dims<4>& input1_dims, - const T* input2_data, const Dims<4>& input2_dims, - T output_activation_min, T output_activation_max, - T* output_data, const Dims<4>& output_dims) { - gemmlowp::ScopedProfilingLabel label("BroadcastDiv"); +void BroadcastDiv4DSlow(const ArithmeticParams& params, + const RuntimeShape& unextended_input1_shape, + const T* input1_data, + const RuntimeShape& unextended_input2_shape, + const T* input2_data, + const RuntimeShape& unextended_output_shape, + T* output_data) { + T output_activation_min; + T output_activation_max; + GetActivationParams(params, &output_activation_min, &output_activation_max); + + TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 4); + TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), 4); + TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4); + const RuntimeShape output_shape = + RuntimeShape::ExtendedShape(4, unextended_output_shape); NdArrayDesc<4> desc1; NdArrayDesc<4> desc2; - NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2); + NdArrayDescsForElementwiseBroadcast(unextended_input1_shape, + unextended_input2_shape, &desc1, &desc2); // In Tensorflow, the dimensions are canonically named (batch_number, row, // col, channel), with extents (batches, height, width, depth), with the @@ -1472,14 +1188,14 @@ void BroadcastDiv(const T* input1_data, const Dims<4>& input1_dims, // We name our variables by their Tensorflow convention, but generate C code // nesting loops such that the innermost loop has the smallest stride for // the best cache behavior. 
- for (int b = 0; b < ArraySize(output_dims, 3); ++b) { - for (int y = 0; y < ArraySize(output_dims, 2); ++y) { - for (int x = 0; x < ArraySize(output_dims, 1); ++x) { - for (int c = 0; c < ArraySize(output_dims, 0); ++c) { - output_data[Offset(output_dims, c, x, y, b)] = + for (int b = 0; b < output_shape.Dims(0); ++b) { + for (int y = 0; y < output_shape.Dims(1); ++y) { + for (int x = 0; x < output_shape.Dims(2); ++x) { + for (int c = 0; c < output_shape.Dims(3); ++c) { + output_data[Offset(output_shape, b, y, x, c)] = ActivationFunctionWithMinMax( - input1_data[SubscriptToIndex(desc1, c, x, y, b)] / - input2_data[SubscriptToIndex(desc2, c, x, y, b)], + input1_data[SubscriptToIndex(desc1, b, y, x, c)] / + input2_data[SubscriptToIndex(desc2, b, y, x, c)], output_activation_min, output_activation_max); } } @@ -1487,11 +1203,17 @@ void BroadcastDiv(const T* input1_data, const Dims<4>& input1_dims, } } -inline void Div(const float* input1_data, const Dims<4>& input1_dims, - const float* input2_data, const Dims<4>& input2_dims, - float output_activation_min, float output_activation_max, - float* output_data, const Dims<4>& output_dims) { - const int flat_size = MatchingFlatSize(input1_dims, input2_dims, output_dims); +template <typename T> +inline void Div(const ArithmeticParams& params, + const RuntimeShape& input1_shape, const T* input1_data, + const RuntimeShape& input2_shape, const T* input2_data, + const RuntimeShape& output_shape, T* output_data) { + T output_activation_min; + T output_activation_max; + GetActivationParams(params, &output_activation_min, &output_activation_max); + + const int flat_size = + MatchingFlatSize(input1_shape, input2_shape, output_shape); for (int i = 0; i < flat_size; ++i) { output_data[i] = ActivationFunctionWithMinMax( input1_data[i] / input2_data[i], output_activation_min, @@ -1499,15 +1221,35 @@ inline void Div(const float* input1_data, const Dims<4>& input1_dims, } } -inline void Sub(const float* input1_data, const Dims<4>& input1_dims, - const float* input2_data, const Dims<4>& input2_dims, - float output_activation_min, float output_activation_max, - float* output_data, const Dims<4>& output_dims) { - const int flat_size = MatchingFlatSize(input1_dims, input2_dims, output_dims); +inline void SubNonBroadcast(const ArithmeticParams& params, + const RuntimeShape& input1_shape, + const float* input1_data, + const RuntimeShape& input2_shape, + const float* input2_data, + const RuntimeShape& output_shape, + float* output_data) { + const int flat_size = + MatchingFlatSize(input1_shape, input2_shape, output_shape); for (int i = 0; i < flat_size; ++i) { output_data[i] = ActivationFunctionWithMinMax( - input1_data[i] - input2_data[i], output_activation_min, - output_activation_max); + input1_data[i] - input2_data[i], params.float_activation_min, + params.float_activation_max); + } +} + +inline void SubNonBroadcast(const ArithmeticParams& params, + const RuntimeShape& input1_shape, + const int32* input1_data, + const RuntimeShape& input2_shape, + const int32* input2_data, + const RuntimeShape& output_shape, + int32* output_data) { + const int flat_size = + MatchingFlatSize(input1_shape, input2_shape, output_shape); + for (int i = 0; i < flat_size; ++i) { + output_data[i] = ActivationFunctionWithMinMax( + input1_data[i] - input2_data[i], params.quantized_activation_min, + params.quantized_activation_max); } } @@ -1515,16 +1257,24 @@ inline void Sub(const float* input1_data, const Dims<4>& input1_dims, // dimensionality if the runtime code does a single loop over 
one dimension // that handles broadcasting as the base case. The code generator would then // generate max(D1, D2) nested for loops. -template <typename T> -void BroadcastSub(const T* input1_data, const Dims<4>& input1_dims, - const T* input2_data, const Dims<4>& input2_dims, - T output_activation_min, T output_activation_max, - T* output_data, const Dims<4>& output_dims) { - gemmlowp::ScopedProfilingLabel label("BroadcastSub"); - +// TODO(benoitjacob): BroadcastSub is intentionally duplicated from +// reference_ops.h. Once an optimized version is implemented and NdArrayDesc<T> +// is no longer referenced in this file, move NdArrayDesc<T> from types.h to +// reference_ops.h. +inline void BroadcastSub4DSlow(const ArithmeticParams& params, + const RuntimeShape& input1_shape, + const float* input1_data, + const RuntimeShape& input2_shape, + const float* input2_data, + const RuntimeShape& output_shape, + float* output_data) { + gemmlowp::ScopedProfilingLabel label("BroadcastSub4DSlow/float"); NdArrayDesc<4> desc1; NdArrayDesc<4> desc2; - NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2); + NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, + &desc2); + const RuntimeShape extended_output_shape = + RuntimeShape::ExtendedShape(4, output_shape); // In Tensorflow, the dimensions are canonically named (batch_number, row, // col, channel), with extents (batches, height, width, depth), with the @@ -1537,36 +1287,35 @@ void BroadcastSub(const T* input1_data, const Dims<4>& input1_dims, // We name our variables by their Tensorflow convention, but generate C code // nesting loops such that the innermost loop has the smallest stride for the // best cache behavior. - for (int b = 0; b < ArraySize(output_dims, 3); ++b) { - for (int y = 0; y < ArraySize(output_dims, 2); ++y) { - for (int x = 0; x < ArraySize(output_dims, 1); ++x) { - for (int c = 0; c < ArraySize(output_dims, 0); ++c) { - output_data[Offset(output_dims, c, x, y, b)] = + for (int b = 0; b < extended_output_shape.Dims(0); ++b) { + for (int y = 0; y < extended_output_shape.Dims(1); ++y) { + for (int x = 0; x < extended_output_shape.Dims(2); ++x) { + for (int c = 0; c < extended_output_shape.Dims(3); ++c) { + output_data[Offset(extended_output_shape, b, y, x, c)] = ActivationFunctionWithMinMax( - input1_data[SubscriptToIndex(desc1, c, x, y, b)] - - input2_data[SubscriptToIndex(desc2, c, x, y, b)], - output_activation_min, output_activation_max); + input1_data[SubscriptToIndex(desc1, b, y, x, c)] - + input2_data[SubscriptToIndex(desc2, b, y, x, c)], + params.float_activation_min, params.float_activation_max); } } } } } -inline void BroadcastSub(int left_shift, const uint8* input1_data, - const Dims<4>& input1_dims, int32 input1_offset, - int32 input1_multiplier, int input1_shift, - const uint8* input2_data, const Dims<4>& input2_dims, - int32 input2_offset, int32 input2_multiplier, - int input2_shift, int32 output_offset, - int32 output_multiplier, int output_shift, - int32 output_activation_min, - int32 output_activation_max, uint8* output_data, - const Dims<4>& output_dims) { - gemmlowp::ScopedProfilingLabel label("BroadcastSub/8bit"); - +inline void BroadcastSub4DSlow(const ArithmeticParams& params, + const RuntimeShape& input1_shape, + const uint8* input1_data, + const RuntimeShape& input2_shape, + const uint8* input2_data, + const RuntimeShape& output_shape, + uint8* output_data) { + gemmlowp::ScopedProfilingLabel label("BroadcastSub4DSlow/uint8"); NdArrayDesc<4> desc1; NdArrayDesc<4> desc2; - 
NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2); + NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, + &desc2); + const RuntimeShape extended_output_shape = + RuntimeShape::ExtendedShape(4, output_shape); // In Tensorflow, the dimensions are canonically named (batch_number, row, // col, channel), with extents (batches, height, width, depth), with the @@ -1579,31 +1328,37 @@ inline void BroadcastSub(int left_shift, const uint8* input1_data, // We name our variables by their Tensorflow convention, but generate C code // nesting loops such that the innermost loop has the smallest stride for the // best cache behavior. - for (int b = 0; b < ArraySize(output_dims, 3); ++b) { - for (int y = 0; y < ArraySize(output_dims, 2); ++y) { - for (int x = 0; x < ArraySize(output_dims, 1); ++x) { - for (int c = 0; c < ArraySize(output_dims, 0); ++c) { + for (int b = 0; b < extended_output_shape.Dims(0); ++b) { + for (int y = 0; y < extended_output_shape.Dims(1); ++y) { + for (int x = 0; x < extended_output_shape.Dims(2); ++x) { + for (int c = 0; c < extended_output_shape.Dims(3); ++c) { const int32 input1_val = - input1_offset + input1_data[SubscriptToIndex(desc1, c, x, y, b)]; + params.input1_offset + + input1_data[SubscriptToIndex(desc1, b, y, x, c)]; const int32 input2_val = - input2_offset + input2_data[SubscriptToIndex(desc2, c, x, y, b)]; - const int32 shifted_input1_val = input1_val * (1 << left_shift); - const int32 shifted_input2_val = input2_val * (1 << left_shift); + params.input2_offset + + input2_data[SubscriptToIndex(desc2, b, y, x, c)]; + const int32 shifted_input1_val = + input1_val * (1 << params.left_shift); + const int32 shifted_input2_val = + input2_val * (1 << params.left_shift); const int32 scaled_input1_val = - MultiplyByQuantizedMultiplierSmallerThanOne( - shifted_input1_val, input1_multiplier, input1_shift); + MultiplyByQuantizedMultiplierSmallerThanOneExp( + shifted_input1_val, params.input1_multiplier, + params.input1_shift); const int32 scaled_input2_val = - MultiplyByQuantizedMultiplierSmallerThanOne( - shifted_input2_val, input2_multiplier, input2_shift); + MultiplyByQuantizedMultiplierSmallerThanOneExp( + shifted_input2_val, params.input2_multiplier, + params.input2_shift); const int32 raw_sub = scaled_input1_val - scaled_input2_val; const int32 raw_output = - MultiplyByQuantizedMultiplierSmallerThanOne( - raw_sub, output_multiplier, output_shift) + - output_offset; + MultiplyByQuantizedMultiplierSmallerThanOneExp( + raw_sub, params.output_multiplier, params.output_shift) + + params.output_offset; const int32 clamped_output = - std::min(output_activation_max, - std::max(output_activation_min, raw_output)); - output_data[Offset(output_dims, c, x, y, b)] = + std::min(params.quantized_activation_max, + std::max(params.quantized_activation_min, raw_output)); + output_data[Offset(extended_output_shape, b, y, x, c)] = static_cast<uint8>(clamped_output); } } @@ -1611,31 +1366,193 @@ inline void BroadcastSub(int left_shift, const uint8* input1_data, } } -template <FusedActivationFunctionType Ac, typename Scalar> -void Concatenation(int concat_dim, const Scalar* const* input_data, - const Dims<4>* const* input_dims, int inputs_count, - Scalar* output_data, const Dims<4>& output_dims) { - TFLITE_DCHECK_GT(inputs_count, 1); - int concat_size = 0; +inline void BroadcastSub4DSlow(const ArithmeticParams& params, + const RuntimeShape& input1_shape, + const int32* input1_data, + const RuntimeShape& input2_shape, + const int32* input2_data, 
+ const RuntimeShape& output_shape, + int32* output_data) { + gemmlowp::ScopedProfilingLabel label("BroadcastSub4DSlow/int32"); + NdArrayDesc<4> desc1; + NdArrayDesc<4> desc2; + NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, + &desc2); + const RuntimeShape extended_output_shape = + RuntimeShape::ExtendedShape(4, output_shape); + + // In Tensorflow, the dimensions are canonically named (batch_number, row, + // col, channel), with extents (batches, height, width, depth), with the + // trailing dimension changing most rapidly (channels has the smallest stride, + // typically 1 element). + // + // In generated C code, we store arrays with the dimensions reversed. The + // first dimension has smallest stride. + // + // We name our variables by their Tensorflow convention, but generate C code + // nesting loops such that the innermost loop has the smallest stride for the + // best cache behavior. + for (int b = 0; b < extended_output_shape.Dims(0); ++b) { + for (int y = 0; y < extended_output_shape.Dims(1); ++y) { + for (int x = 0; x < extended_output_shape.Dims(2); ++x) { + for (int c = 0; c < extended_output_shape.Dims(3); ++c) { + output_data[Offset(extended_output_shape, b, y, x, c)] = + ActivationFunctionWithMinMax( + input1_data[SubscriptToIndex(desc1, b, y, x, c)] - + input2_data[SubscriptToIndex(desc2, b, y, x, c)], + params.quantized_activation_min, + params.quantized_activation_max); + } + } + } + } +} + +template <typename T> +void BroadcastSub4DSlow(const ArithmeticParams& params, + const RuntimeShape& input1_shape, const T* input1_data, + const RuntimeShape& input2_shape, const T* input2_data, + const RuntimeShape& output_shape, T* output_data) { + gemmlowp::ScopedProfilingLabel label("BroadcastSub4DSlow/templated"); + NdArrayDesc<4> desc1; + NdArrayDesc<4> desc2; + NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, + &desc2); + const RuntimeShape extended_output_shape = + RuntimeShape::ExtendedShape(4, output_shape); + + // In Tensorflow, the dimensions are canonically named (batch_number, row, + // col, channel), with extents (batches, height, width, depth), with the + // trailing dimension changing most rapidly (channels has the smallest stride, + // typically 1 element). + // + // In generated C code, we store arrays with the dimensions reversed. The + // first dimension has smallest stride. + // + // We name our variables by their Tensorflow convention, but generate C code + // nesting loops such that the innermost loop has the smallest stride for the + // best cache behavior. 
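  // Note that RuntimeShape::ExtendedShape(4, output_shape) pads the leading
  // dimensions with 1s, so e.g. a 2-D output of shape {3, 5} is treated as
  // {1, 1, 3, 5} and the four nested loops below still visit each element
  // exactly once.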
+ for (int b = 0; b < extended_output_shape.Dims(0); ++b) { + for (int y = 0; y < extended_output_shape.Dims(1); ++y) { + for (int x = 0; x < extended_output_shape.Dims(2); ++x) { + for (int c = 0; c < extended_output_shape.Dims(3); ++c) { + output_data[Offset(extended_output_shape, b, y, x, c)] = + ActivationFunctionWithMinMax( + input1_data[SubscriptToIndex(desc1, b, y, x, c)] - + input2_data[SubscriptToIndex(desc2, b, y, x, c)], + params.quantized_activation_min, + params.quantized_activation_max); + } + } + } + } +} + +template <typename T> +void Sub(const ArithmeticParams& params, const RuntimeShape& input1_shape, + const T* input1_data, const RuntimeShape& input2_shape, + const T* input2_data, const RuntimeShape& output_shape, + T* output_data) { + NdArrayDesc<4> desc1; + NdArrayDesc<4> desc2; + NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, + &desc2); + const RuntimeShape extended_output_shape = + RuntimeShape::ExtendedShape(4, output_shape); + + // In Tensorflow, the dimensions are canonically named (batch_number, row, + // col, channel), with extents (batches, height, width, depth), with the + // trailing dimension changing most rapidly (channels has the smallest stride, + // typically 1 element). + // + // In generated C code, we store arrays with the dimensions reversed. The + // first dimension has smallest stride. + // + // We name our variables by their Tensorflow convention, but generate C code + // nesting loops such that the innermost loop has the smallest stride for the + // best cache behavior. + for (int b = 0; b < extended_output_shape.Dims(0); ++b) { + for (int y = 0; y < extended_output_shape.Dims(1); ++y) { + for (int x = 0; x < extended_output_shape.Dims(2); ++x) { + for (int c = 0; c < extended_output_shape.Dims(3); ++c) { + output_data[Offset(extended_output_shape, b, y, x, c)] = + input1_data[SubscriptToIndex(desc1, b, y, x, c)] - + input2_data[SubscriptToIndex(desc2, b, y, x, c)]; + } + } + } + } +} + +inline void SubWithActivation(const ArithmeticParams& params, + const RuntimeShape& input1_shape, + const int32* input1_data, + const RuntimeShape& input2_shape, + const int32* input2_data, + const RuntimeShape& output_shape, + int32* output_data) { + const int flat_size = + MatchingFlatSize(input1_shape, input2_shape, input2_shape); + for (int i = 0; i < flat_size; ++i) { + output_data[i] = ActivationFunctionWithMinMax( + input1_data[i] - input2_data[i], params.quantized_activation_min, + params.quantized_activation_max); + } +} + +inline void SubWithActivation(const ArithmeticParams& params, + const RuntimeShape& input1_shape, + const float* input1_data, + const RuntimeShape& input2_shape, + const float* input2_data, + const RuntimeShape& output_shape, + float* output_data) { + const int flat_size = + MatchingFlatSize(input1_shape, input2_shape, input2_shape); + for (int i = 0; i < flat_size; ++i) { + output_data[i] = ActivationFunctionWithMinMax( + input1_data[i] - input2_data[i], params.float_activation_min, + params.float_activation_max); + } +} + +template <typename Scalar> +inline void Concatenation(const ConcatenationParams& params, + const RuntimeShape* const* input_shapes, + const Scalar* const* input_data, + const RuntimeShape& output_shape, + Scalar* output_data) { + int axis = params.axis; + int inputs_count = params.inputs_count; + const int concat_dimensions = output_shape.DimensionsCount(); + TFLITE_DCHECK_LT(axis, concat_dimensions); + + int64_t concat_size = 0; for (int i = 0; i < inputs_count; i++) { - for (int j = 0; j 
< 4; j++) { - if (j != concat_dim) { - MatchingArraySize(*input_dims[i], j, output_dims, j); + TFLITE_DCHECK_EQ(input_shapes[i]->DimensionsCount(), concat_dimensions); + for (int j = 0; j < concat_dimensions; j++) { + if (j != axis) { + MatchingDim(*input_shapes[i], j, output_shape, j); } } - concat_size += ArraySize(*input_dims[i], concat_dim); + concat_size += input_shapes[i]->Dims(axis); } - TFLITE_DCHECK_EQ(concat_size, ArraySize(output_dims, concat_dim)); - TFLITE_DCHECK(Ac == FusedActivationFunctionType::kNone); - int outer_size = 1; - for (int i = concat_dim + 1; i < 4; i++) { - outer_size *= output_dims.sizes[i]; + TFLITE_DCHECK_EQ(concat_size, output_shape.Dims(axis)); + int64_t outer_size = 1; + for (int i = 0; i < axis; ++i) { + outer_size *= output_shape.Dims(i); } + // For all input arrays, + // FlatSize() = outer_size * Dims(axis) * base_inner_size; + int64_t base_inner_size = 1; + for (int i = axis + 1; i < concat_dimensions; ++i) { + base_inner_size *= output_shape.Dims(i); + } + Scalar* output_ptr = output_data; for (int k = 0; k < outer_size; k++) { for (int i = 0; i < inputs_count; ++i) { - const int copy_size = - input_dims[i]->sizes[concat_dim] * input_dims[i]->strides[concat_dim]; + const int copy_size = input_shapes[i]->Dims(axis) * base_inner_size; memcpy(output_ptr, input_data[i] + k * copy_size, copy_size * sizeof(Scalar)); output_ptr += copy_size; @@ -1647,37 +1564,48 @@ void Concatenation(int concat_dim, const Scalar* const* input_data, // TODO(prabhumk): The quantized implementation of concatenation isn't fully // quantized as it takes scale as a floating point value. This should be fixed // when optimizing this routine further. -inline void Concatenation(int concat_dim, const uint8* const* input_data, - const Dims<4>* const* input_dims, - const int32* input_zeropoint, - const float* input_scale, int inputs_count, - uint8* output_data, const Dims<4>& output_dims, - const int32 output_zeropoint, - const float output_scale) { - // The arguments input_zeropoint and input_scale are expected to be an array - // that have the quantization parameters for all the inputs to the concat - // operator.
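// Worked example of the generic-axis Concatenation above, with hypothetical
// shapes: concatenating inputs of shape {2, 3, 4} and {2, 5, 4} along axis 1
// into a {2, 8, 4} output gives outer_size = 2, base_inner_size = 4, and
// per-input copy_size values of 3 * 4 = 12 and 5 * 4 = 20, so each of the two
// outer iterations appends 12 + 20 contiguous elements to output_ptr.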
- TFLITE_DCHECK_GT(inputs_count, 1); +inline void ConcatenationWithScaling(const ConcatenationParams& params, + const RuntimeShape* const* input_shapes, + const uint8* const* input_data, + const RuntimeShape& output_shape, + uint8* output_data) { + int axis = params.axis; + const int32* input_zeropoint = params.input_zeropoint; + const float* input_scale = params.input_scale; + int inputs_count = params.inputs_count; + const int32 output_zeropoint = params.output_zeropoint; + const float output_scale = params.output_scale; + + const int concat_dimensions = output_shape.DimensionsCount(); + TFLITE_DCHECK_LT(axis, concat_dimensions); + int64_t concat_size = 0; for (int i = 0; i < inputs_count; i++) { - for (int j = 0; j < 4; j++) { - if (j != concat_dim) { - MatchingArraySize(*input_dims[i], j, output_dims, j); + TFLITE_DCHECK_EQ(input_shapes[i]->DimensionsCount(), concat_dimensions); + for (int j = 0; j < concat_dimensions; j++) { + if (j != axis) { + MatchingDim(*input_shapes[i], j, output_shape, j); } } - concat_size += ArraySize(*input_dims[i], concat_dim); + concat_size += input_shapes[i]->Dims(axis); } - TFLITE_DCHECK_EQ(concat_size, ArraySize(output_dims, concat_dim)); + TFLITE_DCHECK_EQ(concat_size, output_shape.Dims(axis)); int64_t outer_size = 1; - for (int i = concat_dim + 1; i < 4; i++) { - outer_size *= output_dims.sizes[i]; + for (int i = 0; i < axis; ++i) { + outer_size *= output_shape.Dims(i); } + // For all input arrays, + // FlatSize() = outer_size * Dims(axis) * base_inner_size; + int64_t base_inner_size = 1; + for (int i = axis + 1; i < concat_dimensions; ++i) { + base_inner_size *= output_shape.Dims(i); + } + const float inverse_output_scale = 1.f / output_scale; uint8* output_ptr = output_data; for (int k = 0; k < outer_size; k++) { for (int i = 0; i < inputs_count; ++i) { - const int copy_size = - input_dims[i]->sizes[concat_dim] * input_dims[i]->strides[concat_dim]; + const int copy_size = input_shapes[i]->Dims(axis) * base_inner_size; const uint8* input_ptr = input_data[i] + k * copy_size; if (input_zeropoint[i] == output_zeropoint && input_scale[i] == output_scale) { @@ -1698,64 +1626,203 @@ inline void Concatenation(int concat_dim, const uint8* const* input_data, } } -template <FusedActivationFunctionType Ac, typename Scalar> -void DepthConcatenation(const Scalar* const* input_data, - const Dims<4>* const* input_dims, int inputs_count, - Scalar* output_data, const Dims<4>& output_dims) { - Concatenation<Ac, Scalar>(0, input_data, input_dims, inputs_count, - output_data, output_dims); +template <typename Scalar> +void Pack(const PackParams& params, const RuntimeShape* const* input_shapes, + const Scalar* const* input_data, const RuntimeShape& output_shape, + Scalar* output_data) { + const int dimensions = output_shape.DimensionsCount(); + int axis = params.axis; + int inputs_count = params.inputs_count; + + int outer_size = 1; + for (int i = 0; i < axis; i++) { + outer_size *= output_shape.Dims(i); + } + int copy_size = 1; + for (int i = params.axis + 1; i < dimensions; i++) { + copy_size *= output_shape.Dims(i); + } + TFLITE_DCHECK_EQ((**input_shapes).FlatSize(), copy_size * outer_size); + + for (int i = 0; i < inputs_count; ++i) { + for (int k = 0; k < outer_size; k++) { + const Scalar* input_ptr = input_data[i] + copy_size * k; + int loc = k * inputs_count * copy_size + i * copy_size; + memcpy(output_data + loc, input_ptr, copy_size * sizeof(Scalar)); + } + } +} + +template <typename Scalar> +void Unpack(const UnpackParams& params, const RuntimeShape& 
input_shape, + const Scalar* input_data, const RuntimeShape& output_shape, + Scalar* const* output_datas) { + const int dimensions = input_shape.DimensionsCount(); + const int outputs_count = params.num_split; + + int outer_size = 1; + for (int i = 0; i < params.axis; i++) { + outer_size *= input_shape.Dims(i); + } + int copy_size = 1; + for (int i = params.axis + 1; i < dimensions; i++) { + copy_size *= input_shape.Dims(i); + } + TFLITE_DCHECK_EQ(output_shape.FlatSize(), copy_size * outer_size); + + for (int i = 0; i < outputs_count; ++i) { + for (int k = 0; k < outer_size; k++) { + Scalar* output_ptr = output_datas[i] + copy_size * k; + int loc = k * outputs_count * copy_size + i * copy_size; + memcpy(output_ptr, input_data + loc, copy_size * sizeof(Scalar)); + } + } +} + +template <typename Scalar> +void PackWithScaling(const PackParams& params, + const RuntimeShape* const* input_shapes, + const uint8* const* input_data, + const RuntimeShape& output_shape, uint8* output_data) { + const int dimensions = output_shape.DimensionsCount(); + int axis = params.axis; + const int32* input_zeropoint = params.input_zeropoint; + const float* input_scale = params.input_scale; + int inputs_count = params.inputs_count; + const int32 output_zeropoint = params.output_zeropoint; + const float output_scale = params.output_scale; + + int outer_size = 1; + for (int i = 0; i < axis; i++) { + outer_size *= output_shape.Dims(i); + } + int copy_size = 1; + for (int i = axis + 1; i < dimensions; i++) { + copy_size *= output_shape.Dims(i); + } + TFLITE_DCHECK_EQ((**input_shapes).FlatSize(), copy_size * outer_size); + + Scalar* output_ptr = output_data; + const float inverse_output_scale = 1.f / output_scale; + for (int k = 0; k < outer_size; k++) { + for (int i = 0; i < inputs_count; ++i) { + if (input_zeropoint[i] == output_zeropoint && + input_scale[i] == output_scale) { + memcpy(output_ptr, input_data[i] + k * copy_size, + copy_size * sizeof(Scalar)); + } else { + assert(false); + const float scale = input_scale[i] * inverse_output_scale; + const float bias = -input_zeropoint[i] * scale; + auto input_ptr = input_data[i]; + for (int j = 0; j < copy_size; ++j) { + const int32_t value = + static_cast<int32_t>(round(input_ptr[j] * scale + bias)) + + output_zeropoint; + output_ptr[j] = + static_cast<uint8_t>(std::max(std::min(255, value), 0)); + } + } + output_ptr += copy_size; + } + } +} + +template <typename Scalar> +void DepthConcatenation(const ConcatenationParams& params, + const RuntimeShape* const* input_shapes, + const Scalar* const* input_data, + const RuntimeShape& output_shape, Scalar* output_data) { + auto params_copy = params; + params_copy.axis = 3; + Concatenation(params_copy, input_shapes, input_data, output_shape, + output_data); } -inline void LstmCell(const float* input_data, const Dims<4>& input_dims, - const float* prev_activ_data, - const Dims<4>& prev_activ_dims, const float* weights_data, - const Dims<4>& weights_dims, const float* bias_data, - const Dims<4>& bias_dims, const float* prev_state_data, - const Dims<4>& prev_state_dims, float* output_state_data, - const Dims<4>& output_state_dims, float* output_activ_data, - const Dims<4>& output_activ_dims, float* concat_temp_data, - const Dims<4>& concat_temp_dims, float* activ_temp_data, - const Dims<4>& activ_temp_dims) { +inline void LstmCell( + const LstmCellParams& params, const RuntimeShape& unextended_input_shape, + const float* input_data, const RuntimeShape& unextended_prev_activ_shape, + const float* prev_activ_data, const 
RuntimeShape& weights_shape, + const float* weights_data, const RuntimeShape& unextended_bias_shape, + const float* bias_data, const RuntimeShape& unextended_prev_state_shape, + const float* prev_state_data, + const RuntimeShape& unextended_output_state_shape, float* output_state_data, + const RuntimeShape& unextended_output_activ_shape, float* output_activ_data, + const RuntimeShape& unextended_concat_temp_shape, float* concat_temp_data, + const RuntimeShape& unextended_activ_temp_shape, float* activ_temp_data) { + TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4); + TFLITE_DCHECK_LE(unextended_prev_activ_shape.DimensionsCount(), 4); + TFLITE_DCHECK_LE(unextended_bias_shape.DimensionsCount(), 4); + TFLITE_DCHECK_LE(unextended_prev_state_shape.DimensionsCount(), 4); + TFLITE_DCHECK_LE(unextended_output_state_shape.DimensionsCount(), 4); + TFLITE_DCHECK_LE(unextended_output_activ_shape.DimensionsCount(), 4); + TFLITE_DCHECK_LE(unextended_concat_temp_shape.DimensionsCount(), 4); + TFLITE_DCHECK_LE(unextended_activ_temp_shape.DimensionsCount(), 4); + const RuntimeShape input_shape = + RuntimeShape::ExtendedShape(4, unextended_input_shape); + const RuntimeShape prev_activ_shape = + RuntimeShape::ExtendedShape(4, unextended_prev_activ_shape); + const RuntimeShape bias_shape = + RuntimeShape::ExtendedShape(4, unextended_bias_shape); + const RuntimeShape prev_state_shape = + RuntimeShape::ExtendedShape(4, unextended_prev_state_shape); + const RuntimeShape output_state_shape = + RuntimeShape::ExtendedShape(4, unextended_output_state_shape); + const RuntimeShape output_activ_shape = + RuntimeShape::ExtendedShape(4, unextended_output_activ_shape); + const RuntimeShape concat_temp_shape = + RuntimeShape::ExtendedShape(4, unextended_concat_temp_shape); + const RuntimeShape activ_temp_shape = + RuntimeShape::ExtendedShape(4, unextended_activ_temp_shape); + TFLITE_DCHECK_GE(weights_shape.DimensionsCount(), 2); + + const int weights_dim_count = weights_shape.DimensionsCount(); const int batches = - MatchingArraySize(input_dims, 3, prev_activ_dims, 3, prev_state_dims, 3, - output_state_dims, 3, output_activ_dims, 3); + MatchingDim(input_shape, 0, prev_activ_shape, 0, prev_state_shape, 0, + output_state_shape, 0, output_activ_shape, 0); const int height = - MatchingArraySize(input_dims, 2, prev_activ_dims, 2, prev_state_dims, 2, - output_state_dims, 2, output_activ_dims, 2); + MatchingDim(input_shape, 1, prev_activ_shape, 1, prev_state_shape, 1, + output_state_shape, 1, output_activ_shape, 1); const int width = - MatchingArraySize(input_dims, 1, prev_activ_dims, 1, prev_state_dims, 1, - output_state_dims, 1, output_activ_dims, 1); - TFLITE_CHECK_EQ(ArraySize(weights_dims, 2), 1); - TFLITE_CHECK_EQ(ArraySize(weights_dims, 3), 1); - const int input_depth = ArraySize(input_dims, 0); - const int prev_activ_depth = ArraySize(prev_activ_dims, 0); + MatchingDim(input_shape, 2, prev_activ_shape, 2, prev_state_shape, 2, + output_state_shape, 2, output_activ_shape, 2); + const int input_depth = input_shape.Dims(3); + const int prev_activ_depth = prev_activ_shape.Dims(3); const int total_input_depth = prev_activ_depth + input_depth; - TFLITE_CHECK_EQ(ArraySize(weights_dims, 0), total_input_depth); - TFLITE_CHECK_EQ(MatchingArraySize(bias_dims, 1, bias_dims, 2, bias_dims, 3), - 1); + TFLITE_DCHECK_EQ(weights_shape.Dims(weights_dim_count - 1), + total_input_depth); + TFLITE_DCHECK_EQ(FlatSizeSkipDim(bias_shape, 3), 1); const int intern_activ_depth = - MatchingArraySize(weights_dims, 1, bias_dims, 0); - 
TFLITE_CHECK_EQ(intern_activ_depth % 4, 0); + MatchingDim(weights_shape, weights_dim_count - 2, bias_shape, 3); + TFLITE_DCHECK_EQ(weights_shape.FlatSize(), + intern_activ_depth * total_input_depth); + TFLITE_DCHECK_EQ(intern_activ_depth % 4, 0); const int output_depth = - MatchingArraySize(prev_state_dims, 0, prev_activ_dims, 0, - output_state_dims, 0, output_activ_dims, 0); - TFLITE_CHECK_EQ(output_depth, intern_activ_depth / 4); + MatchingDim(prev_state_shape, 3, prev_activ_shape, 3, output_state_shape, + 3, output_activ_shape, 3); + TFLITE_DCHECK_EQ(output_depth, intern_activ_depth / 4); // Concatenate prev_activ and input data together std::vector<float const*> concat_input_arrays_data; - std::vector<Dims<4> const*> concat_input_arrays_dims; + std::vector<RuntimeShape const*> concat_input_arrays_shapes; concat_input_arrays_data.push_back(input_data); concat_input_arrays_data.push_back(prev_activ_data); - concat_input_arrays_dims.push_back(&input_dims); - concat_input_arrays_dims.push_back(&prev_activ_dims); - Concatenation<FusedActivationFunctionType::kNone, float>( - 0, &(concat_input_arrays_data[0]), &(concat_input_arrays_dims[0]), - concat_input_arrays_data.size(), concat_temp_data, concat_temp_dims); + concat_input_arrays_shapes.push_back(&input_shape); + concat_input_arrays_shapes.push_back(&prev_activ_shape); + tflite::ConcatenationParams concat_params; + concat_params.axis = 3; + concat_params.inputs_count = concat_input_arrays_data.size(); + Concatenation(concat_params, &(concat_input_arrays_shapes[0]), + &(concat_input_arrays_data[0]), concat_temp_shape, + concat_temp_data); // Fully connected - FullyConnected<FusedActivationFunctionType::kNone>( - concat_temp_data, concat_temp_dims, weights_data, weights_dims, bias_data, - bias_dims, activ_temp_data, activ_temp_dims); + tflite::FullyConnectedParams fc_params; + fc_params.float_activation_min = std::numeric_limits<float>::lowest(); + fc_params.float_activation_max = std::numeric_limits<float>::max(); + FullyConnected(fc_params, concat_temp_shape, concat_temp_data, weights_shape, + weights_data, bias_shape, bias_data, activ_temp_shape, + activ_temp_data); // Memory state update (the LSTM "guts") for (int b = 0; b < batches; ++b) { @@ -1764,24 +1831,24 @@ inline void LstmCell(const float* input_data, const Dims<4>& input_dims, for (int c = 0; c < output_depth; ++c) { const float input_gate = 1.f / - (1.f + std::exp(-activ_temp_data[Offset( - activ_temp_dims, 0 * output_depth + c, w, h, b)])); + (1.f + std::exp(-activ_temp_data[Offset(activ_temp_shape, b, h, w, + 0 * output_depth + c)])); const float new_input = std::tanh(activ_temp_data[Offset( - activ_temp_dims, 1 * output_depth + c, w, h, b)]); + activ_temp_shape, b, h, w, 1 * output_depth + c)]); const float forget_gate = 1.f / - (1.f + std::exp(-activ_temp_data[Offset( - activ_temp_dims, 2 * output_depth + c, w, h, b)])); + (1.f + std::exp(-activ_temp_data[Offset(activ_temp_shape, b, h, w, + 2 * output_depth + c)])); const float output_gate = 1.f / - (1.f + std::exp(-activ_temp_data[Offset( - activ_temp_dims, 3 * output_depth + c, w, h, b)])); + (1.f + std::exp(-activ_temp_data[Offset(activ_temp_shape, b, h, w, + 3 * output_depth + c)])); const float new_state = input_gate * new_input + forget_gate * - prev_state_data[Offset(prev_state_dims, c, w, h, b)]; - output_state_data[Offset(output_state_dims, c, w, h, b)] = new_state; - output_activ_data[Offset(output_activ_dims, c, w, h, b)] = + prev_state_data[Offset(prev_state_shape, b, h, w, c)]; + 
output_state_data[Offset(output_state_shape, b, h, w, c)] = new_state; + output_activ_data[Offset(output_activ_shape, b, h, w, c)] = output_gate * std::tanh(new_state); } } @@ -1874,52 +1941,90 @@ inline void LstmCell(const float* input_data, const Dims<4>& input_dims, // aiming for 16-bit fixed-point quantization of these internal nodes here. // template <int StateIntegerBits> -void LstmCell(const uint8* input_data_uint8, const Dims<4>& input_dims, - const uint8* prev_activ_data_uint8, - const Dims<4>& prev_activ_dims, const uint8* weights_data_uint8, - const Dims<4>& weights_dims, const int32* bias_data_int32, - const Dims<4>& bias_dims, const int16* prev_state_data_int16, - const Dims<4>& prev_state_dims, int16* output_state_data_int16, - const Dims<4>& output_state_dims, uint8* output_activ_data_uint8, - const Dims<4>& output_activ_dims, uint8* concat_temp_data_uint8, - const Dims<4>& concat_temp_dims, int16* activ_temp_data_int16, - const Dims<4>& activ_temp_dims, int32 weights_zero_point, - int32 accum_multiplier, int accum_shift, - gemmlowp::GemmContext* gemm_context) { +inline void LstmCell( + const LstmCellParams& params, const RuntimeShape& unextended_input_shape, + const uint8* input_data_uint8, + const RuntimeShape& unextended_prev_activ_shape, + const uint8* prev_activ_data_uint8, const RuntimeShape& weights_shape, + const uint8* weights_data_uint8, const RuntimeShape& unextended_bias_shape, + const int32* bias_data_int32, + const RuntimeShape& unextended_prev_state_shape, + const int16* prev_state_data_int16, + const RuntimeShape& unextended_output_state_shape, + int16* output_state_data_int16, + const RuntimeShape& unextended_output_activ_shape, + uint8* output_activ_data_uint8, + const RuntimeShape& unextended_concat_temp_shape, + uint8* concat_temp_data_uint8, + const RuntimeShape& unextended_activ_temp_shape, + int16* activ_temp_data_int16, gemmlowp::GemmContext* gemm_context) { (void)gemm_context; // only used in optimized code. 
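  // Reminder on the StateIntegerBits template parameter: the int16 state tensor
  // uses a fixed-point format with StateIntegerBits integer bits and
  // (15 - StateIntegerBits) fractional bits, so a raw value v encodes
  // v * 2^(StateIntegerBits - 15). With StateIntegerBits = 4, for example, the
  // representable range is roughly [-16, 16) at a resolution of 2^-11.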
+ int32 weights_zero_point = params.weights_zero_point; + int32 accum_multiplier = params.accum_multiplier; + int accum_shift = params.accum_shift; + TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4); + TFLITE_DCHECK_LE(unextended_prev_activ_shape.DimensionsCount(), 4); + TFLITE_DCHECK_LE(unextended_bias_shape.DimensionsCount(), 4); + TFLITE_DCHECK_LE(unextended_prev_state_shape.DimensionsCount(), 4); + TFLITE_DCHECK_LE(unextended_output_state_shape.DimensionsCount(), 4); + TFLITE_DCHECK_LE(unextended_output_activ_shape.DimensionsCount(), 4); + TFLITE_DCHECK_LE(unextended_concat_temp_shape.DimensionsCount(), 4); + TFLITE_DCHECK_LE(unextended_activ_temp_shape.DimensionsCount(), 4); + const RuntimeShape input_shape = + RuntimeShape::ExtendedShape(4, unextended_input_shape); + const RuntimeShape prev_activ_shape = + RuntimeShape::ExtendedShape(4, unextended_prev_activ_shape); + const RuntimeShape bias_shape = + RuntimeShape::ExtendedShape(4, unextended_bias_shape); + const RuntimeShape prev_state_shape = + RuntimeShape::ExtendedShape(4, unextended_prev_state_shape); + const RuntimeShape output_state_shape = + RuntimeShape::ExtendedShape(4, unextended_output_state_shape); + const RuntimeShape output_activ_shape = + RuntimeShape::ExtendedShape(4, unextended_output_activ_shape); + const RuntimeShape concat_temp_shape = + RuntimeShape::ExtendedShape(4, unextended_concat_temp_shape); + const RuntimeShape activ_temp_shape = + RuntimeShape::ExtendedShape(4, unextended_activ_temp_shape); + TFLITE_DCHECK_GE(weights_shape.DimensionsCount(), 2); // Gather dimensions information, and perform consistency checks. - const int outer_size = - MatchingFlatSizeSkipDim(input_dims, 0, prev_activ_dims, prev_state_dims, - output_state_dims, output_activ_dims); - TFLITE_CHECK_EQ(ArraySize(weights_dims, 2), 1); - TFLITE_CHECK_EQ(ArraySize(weights_dims, 3), 1); - const int input_depth = ArraySize(input_dims, 0); - const int prev_activ_depth = ArraySize(prev_activ_dims, 0); + const int weights_dim_count = weights_shape.DimensionsCount(); + const int outer_size = MatchingFlatSizeSkipDim( + input_shape, 3, prev_activ_shape, prev_state_shape, output_state_shape, + output_activ_shape); + const int input_depth = input_shape.Dims(3); + const int prev_activ_depth = prev_activ_shape.Dims(3); const int total_input_depth = prev_activ_depth + input_depth; - TFLITE_CHECK_EQ(ArraySize(weights_dims, 0), total_input_depth); - TFLITE_CHECK_EQ(MatchingArraySize(bias_dims, 1, bias_dims, 2, bias_dims, 3), - 1); + TFLITE_DCHECK_EQ(weights_shape.Dims(weights_dim_count - 1), + total_input_depth); const int intern_activ_depth = - MatchingArraySize(weights_dims, 1, bias_dims, 0); - TFLITE_CHECK_EQ(intern_activ_depth % 4, 0); + MatchingDim(weights_shape, weights_dim_count - 2, bias_shape, 3); + TFLITE_DCHECK_EQ(weights_shape.FlatSize(), + intern_activ_depth * total_input_depth); + TFLITE_DCHECK_EQ(FlatSizeSkipDim(bias_shape, 3), 1); + TFLITE_DCHECK_EQ(intern_activ_depth % 4, 0); const int output_depth = - MatchingArraySize(prev_state_dims, 0, prev_activ_dims, 0, - output_state_dims, 0, output_activ_dims, 0); - TFLITE_CHECK_EQ(output_depth, intern_activ_depth / 4); - const int fc_batches = FlatSizeSkipDim(activ_temp_dims, 0); + MatchingDim(prev_state_shape, 3, prev_activ_shape, 3, output_state_shape, + 3, output_activ_shape, 3); + TFLITE_DCHECK_EQ(output_depth, intern_activ_depth / 4); + const int fc_batches = FlatSizeSkipDim(activ_temp_shape, 3); const int fc_output_depth = - MatchingArraySize(weights_dims, 1, activ_temp_dims, 0); 
- const int fc_accum_depth = ArraySize(weights_dims, 0); - TFLITE_CHECK_EQ(fc_output_depth, 4 * output_depth); + MatchingDim(weights_shape, weights_dim_count - 2, activ_temp_shape, 3); + const int fc_accum_depth = total_input_depth; + TFLITE_DCHECK_EQ(fc_output_depth, 4 * output_depth); // Depth-concatenate prev_activ and input data together. uint8 const* concat_input_arrays_data[2] = {input_data_uint8, prev_activ_data_uint8}; - Dims<4> const* concat_input_arrays_dims[2] = {&input_dims, &prev_activ_dims}; - Concatenation<FusedActivationFunctionType::kNone, uint8>( - 0, concat_input_arrays_data, concat_input_arrays_dims, 2, - concat_temp_data_uint8, concat_temp_dims); + const RuntimeShape* concat_input_arrays_shapes[2] = {&input_shape, + &prev_activ_shape}; + tflite::ConcatenationParams concat_params; + concat_params.axis = 3; + concat_params.inputs_count = 2; + Concatenation(concat_params, concat_input_arrays_shapes, + concat_input_arrays_data, concat_temp_shape, + concat_temp_data_uint8); // Implementation of the fully connected node inside the LSTM cell. // The operands are 8-bit integers, the accumulators are internally 32bit @@ -2026,110 +2131,81 @@ void LstmCell(const uint8* input_data_uint8, const Dims<4>& input_dims, } template <typename Scalar> -void TensorFlowSplit(const Scalar* input_data, const Dims<4>& input_dims, - int axis, int outputs_count, Scalar* const* output_data, - const Dims<4>* const* output_dims) { - const int batches = ArraySize(*output_dims[0], 3); - const int height = ArraySize(*output_dims[0], 2); - const int width = ArraySize(*output_dims[0], 1); - const int depth = ArraySize(*output_dims[0], 0); - - const int slice_size = ArraySize(*output_dims[0], axis); +void Split(const SplitParams& params, const RuntimeShape& input_shape, + const Scalar* input_data, const RuntimeShape* const* output_shapes, + Scalar* const* output_data) { + const int concat_dimensions = input_shape.DimensionsCount(); + int axis = params.axis < 0 ? 
params.axis + concat_dimensions : params.axis; + int outputs_count = params.num_split; + TFLITE_DCHECK_LT(axis, concat_dimensions); - for (int i = 0; i < outputs_count; ++i) { - int offset = i * slice_size * input_dims.strides[axis]; - for (int b = 0; b < batches; ++b) { - for (int y = 0; y < height; ++y) { - for (int x = 0; x < width; ++x) { - for (int c = 0; c < depth; ++c) { - auto out = Offset(*output_dims[i], c, x, y, b); - auto in = Offset(input_dims, c, x, y, b); - output_data[i][out] = input_data[offset + in]; - } - } + int64_t concat_size = 0; + for (int i = 0; i < outputs_count; i++) { + TFLITE_DCHECK_EQ(output_shapes[i]->DimensionsCount(), concat_dimensions); + for (int j = 0; j < concat_dimensions; j++) { + if (j != axis) { + MatchingDim(*output_shapes[i], j, input_shape, j); } } + concat_size += output_shapes[i]->Dims(axis); } -} - -template <FusedActivationFunctionType Ac, typename Scalar> -void TensorFlowSplit(const Scalar* input_data, const Dims<4>& input_dims, - int outputs_count, Scalar* const* output_data, - const Dims<4>* const* output_dims) { - TFLITE_DCHECK_GE(outputs_count, 1); - for (int i = 0; i < outputs_count; i++) { - /* batches = */ MatchingArraySize(*output_dims[i], 3, input_dims, 3); - /* height = */ MatchingArraySize(*output_dims[i], 2, input_dims, 2); - /* width = */ MatchingArraySize(*output_dims[i], 1, input_dims, 1); + TFLITE_DCHECK_EQ(concat_size, input_shape.Dims(axis)); + int64_t outer_size = 1; + for (int i = 0; i < axis; ++i) { + outer_size *= input_shape.Dims(i); } - // for now we dont have a model with a TensorFlowSplit - // with fused activation function. - TFLITE_DCHECK(Ac == FusedActivationFunctionType::kNone); - - TensorFlowSplit(input_data, input_dims, /*axis=*/0, outputs_count, - output_data, output_dims); -} - -// TODO(benoitjacob) make this a proper reference impl without Eigen! 
-template <typename Scalar> -using MatrixMap = typename std::conditional< - std::is_const<Scalar>::value, - Eigen::Map<const Eigen::Matrix<typename std::remove_const<Scalar>::type, - Eigen::Dynamic, Eigen::Dynamic>>, - Eigen::Map<Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic>>>::type; - -template <typename Scalar, int N> -MatrixMap<Scalar> MapAsMatrixWithFirstDimAsRows(Scalar* data, - const Dims<N>& dims) { - const int rows = dims.sizes[0]; - int cols = 1; - for (int d = 1; d < N; d++) { - cols *= dims.sizes[d]; + // For all output arrays, + // FlatSize() = outer_size * Dims(axis) * base_inner_size; + int64_t base_inner_size = 1; + for (int i = axis + 1; i < concat_dimensions; ++i) { + base_inner_size *= input_shape.Dims(i); } - return MatrixMap<Scalar>(data, rows, cols); -} -template <typename Scalar, int N> -MatrixMap<Scalar> MapAsMatrixWithLastDimAsCols(Scalar* data, - const Dims<N>& dims) { - const int cols = dims.sizes[N - 1]; - int rows = 1; - for (int d = 0; d < N - 1; d++) { - rows *= dims.sizes[d]; + const Scalar* input_ptr = input_data; + for (int k = 0; k < outer_size; k++) { + for (int i = 0; i < outputs_count; ++i) { + const int copy_size = output_shapes[i]->Dims(axis) * base_inner_size; + memcpy(output_data[i] + k * copy_size, input_ptr, + copy_size * sizeof(Scalar)); + input_ptr += copy_size; + } } - return MatrixMap<Scalar>(data, rows, cols); } inline int NodeOffset(int b, int h, int w, int height, int width) { return (b * height + h) * width + w; } -inline void AveragePool(const float* input_data, const Dims<4>& input_dims, - int stride_width, int stride_height, int pad_width, - int pad_height, int filter_width, int filter_height, - float output_activation_min, - float output_activation_max, float* output_data, - const Dims<4>& output_dims) { - const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); - const int depth = MatchingArraySize(input_dims, 0, output_dims, 0); - const int input_height = ArraySize(input_dims, 2); - const int input_width = ArraySize(input_dims, 1); - const int output_height = ArraySize(output_dims, 2); - const int output_width = ArraySize(output_dims, 1); +inline void AveragePool(const PoolParams& params, + const RuntimeShape& input_shape, + const float* input_data, + const RuntimeShape& output_shape, float* output_data) { + TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4); + TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4); + const int batches = MatchingDim(input_shape, 0, output_shape, 0); + const int depth = MatchingDim(input_shape, 3, output_shape, 3); + const int input_height = input_shape.Dims(1); + const int input_width = input_shape.Dims(2); + const int output_height = output_shape.Dims(1); + const int output_width = output_shape.Dims(2); + const int stride_height = params.stride_height; + const int stride_width = params.stride_width; for (int batch = 0; batch < batches; ++batch) { for (int out_y = 0; out_y < output_height; ++out_y) { for (int out_x = 0; out_x < output_width; ++out_x) { for (int channel = 0; channel < depth; ++channel) { - const int in_x_origin = (out_x * stride_width) - pad_width; - const int in_y_origin = (out_y * stride_height) - pad_height; + const int in_x_origin = + (out_x * stride_width) - params.padding_values.width; + const int in_y_origin = + (out_y * stride_height) - params.padding_values.height; // Compute the boundaries of the filter region clamped so as to // ensure that the filter window fits in the input array. 
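  // For example, with stride 2, padding 1 and a 3-wide filter over a 5-wide
  // input: at out_x = 0, in_x_origin = -1, so filter_x_start = 1 and
  // filter_x_end = min(3, 5 - (-1)) = 3; only the two in-bounds taps are
  // visited, and filter_count below counts just those, so the average is taken
  // over the valid part of the window.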
const int filter_x_start = std::max(0, -in_x_origin); const int filter_x_end = - std::min(filter_width, input_width - in_x_origin); + std::min(params.filter_width, input_width - in_x_origin); const int filter_y_start = std::max(0, -in_y_origin); const int filter_y_end = - std::min(filter_height, input_height - in_y_origin); + std::min(params.filter_height, input_height - in_y_origin); float total = 0.f; float filter_count = 0; for (int filter_y = filter_y_start; filter_y < filter_y_end; @@ -2139,70 +2215,52 @@ inline void AveragePool(const float* input_data, const Dims<4>& input_dims, const int in_x = in_x_origin + filter_x; const int in_y = in_y_origin + filter_y; total += - input_data[Offset(input_dims, channel, in_x, in_y, batch)]; + input_data[Offset(input_shape, batch, in_y, in_x, channel)]; filter_count++; } } const float average = total / filter_count; - output_data[Offset(output_dims, channel, out_x, out_y, batch)] = - ActivationFunctionWithMinMax(average, output_activation_min, - output_activation_max); + output_data[Offset(output_shape, batch, out_y, out_x, channel)] = + ActivationFunctionWithMinMax(average, params.float_activation_min, + params.float_activation_max); } } } } } -// legacy, for compatibility with old checked-in code -template <FusedActivationFunctionType Ac> -void AveragePool(const float* input_data, const Dims<4>& input_dims, - int stride_width, int stride_height, int pad_width, - int pad_height, int filter_width, int filter_height, - float* output_data, const Dims<4>& output_dims) { - float output_activation_min, output_activation_max; - GetActivationMinMax(Ac, &output_activation_min, &output_activation_max); - AveragePool(input_data, input_dims, stride_width, stride_height, pad_width, - pad_height, filter_width, filter_height, output_activation_min, - output_activation_max, output_data, output_dims); -} - -// legacy, for compatibility with old checked-in code -template <FusedActivationFunctionType Ac> -void AveragePool(const float* input_data, const Dims<4>& input_dims, int stride, - int pad_width, int pad_height, int filter_width, - int filter_height, float* output_data, - const Dims<4>& output_dims) { - AveragePool<Ac>(input_data, input_dims, stride, stride, pad_width, pad_height, - filter_width, filter_height, output_data, output_dims); -} - -inline void AveragePool(const uint8* input_data, const Dims<4>& input_dims, - int stride_width, int stride_height, int pad_width, - int pad_height, int filter_width, int filter_height, - int32 output_activation_min, - int32 output_activation_max, uint8* output_data, - const Dims<4>& output_dims) { - TFLITE_DCHECK_LE(output_activation_min, output_activation_max); - const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); - const int depth = MatchingArraySize(input_dims, 0, output_dims, 0); - const int input_height = ArraySize(input_dims, 2); - const int input_width = ArraySize(input_dims, 1); - const int output_height = ArraySize(output_dims, 2); - const int output_width = ArraySize(output_dims, 1); +inline void AveragePool(const PoolParams& params, + const RuntimeShape& input_shape, + const uint8* input_data, + const RuntimeShape& output_shape, uint8* output_data) { + TFLITE_DCHECK_LE(params.quantized_activation_min, + params.quantized_activation_max); + TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4); + TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4); + const int batches = MatchingDim(input_shape, 0, output_shape, 0); + const int depth = MatchingDim(input_shape, 3, output_shape, 3); + const int 
input_height = input_shape.Dims(1); + const int input_width = input_shape.Dims(2); + const int output_height = output_shape.Dims(1); + const int output_width = output_shape.Dims(2); + const int stride_height = params.stride_height; + const int stride_width = params.stride_width; for (int batch = 0; batch < batches; ++batch) { for (int out_y = 0; out_y < output_height; ++out_y) { for (int out_x = 0; out_x < output_width; ++out_x) { for (int channel = 0; channel < depth; ++channel) { - const int in_x_origin = (out_x * stride_width) - pad_width; - const int in_y_origin = (out_y * stride_height) - pad_height; + const int in_x_origin = + (out_x * stride_width) - params.padding_values.width; + const int in_y_origin = + (out_y * stride_height) - params.padding_values.height; // Compute the boundaries of the filter region clamped so as to // ensure that the filter window fits in the input array. const int filter_x_start = std::max(0, -in_x_origin); const int filter_x_end = - std::min(filter_width, input_width - in_x_origin); + std::min(params.filter_width, input_width - in_x_origin); const int filter_y_start = std::max(0, -in_y_origin); const int filter_y_end = - std::min(filter_height, input_height - in_y_origin); + std::min(params.filter_height, input_height - in_y_origin); int32 acc = 0; int filter_count = 0; for (int filter_y = filter_y_start; filter_y < filter_y_end; @@ -2211,14 +2269,15 @@ inline void AveragePool(const uint8* input_data, const Dims<4>& input_dims, ++filter_x) { const int in_x = in_x_origin + filter_x; const int in_y = in_y_origin + filter_y; - acc += input_data[Offset(input_dims, channel, in_x, in_y, batch)]; + acc += + input_data[Offset(input_shape, batch, in_y, in_x, channel)]; filter_count++; } } acc = (acc + filter_count / 2) / filter_count; - acc = std::max(acc, output_activation_min); - acc = std::min(acc, output_activation_max); - output_data[Offset(output_dims, channel, out_x, out_y, batch)] = + acc = std::max(acc, params.quantized_activation_min); + acc = std::min(acc, params.quantized_activation_max); + output_data[Offset(output_shape, batch, out_y, out_x, channel)] = static_cast<uint8>(acc); } } @@ -2226,64 +2285,35 @@ inline void AveragePool(const uint8* input_data, const Dims<4>& input_dims, } } -// legacy, for compatibility with old checked-in code -template <FusedActivationFunctionType Ac> -void AveragePool(const uint8* input_data, const Dims<4>& input_dims, - int stride_width, int stride_height, int pad_width, - int pad_height, int filter_width, int filter_height, - int32 output_activation_min, int32 output_activation_max, - uint8* output_data, const Dims<4>& output_dims) { - static_assert(Ac == FusedActivationFunctionType::kNone || - Ac == FusedActivationFunctionType::kRelu || - Ac == FusedActivationFunctionType::kRelu6 || - Ac == FusedActivationFunctionType::kRelu1, - ""); - if (Ac == FusedActivationFunctionType::kNone) { - TFLITE_DCHECK_EQ(output_activation_min, 0); - TFLITE_DCHECK_EQ(output_activation_max, 255); - } - AveragePool(input_data, input_dims, stride_width, stride_height, pad_width, - pad_height, filter_width, filter_height, output_activation_min, - output_activation_max, output_data, output_dims); -} - -// legacy, for compatibility with old checked-in code -template <FusedActivationFunctionType Ac> -void AveragePool(const uint8* input_data, const Dims<4>& input_dims, int stride, - int pad_width, int pad_height, int filter_width, - int filter_height, int32 output_activation_min, - int32 output_activation_max, uint8* output_data, - const Dims<4>& 
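  // As in AveragePool above, filter_count counts only the in-bounds taps, so
  // for edge pixels the root mean square below is taken over the valid portion
  // of the window; e.g. a 2x2 window covering the values {1, 2, 2, 1} yields
  // sqrt((1 + 4 + 4 + 1) / 4), which is roughly 1.58.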
output_dims) { - AveragePool<Ac>(input_data, input_dims, stride, stride, pad_width, pad_height, - filter_width, filter_height, output_activation_min, - output_activation_max, output_data, output_dims); -} - -inline void L2Pool(const float* input_data, const Dims<4>& input_dims, - int stride_width, int stride_height, int pad_width, - int pad_height, int filter_width, int filter_height, - float output_activation_min, float output_activation_max, - float* output_data, const Dims<4>& output_dims) { - const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); - const int depth = MatchingArraySize(input_dims, 0, output_dims, 0); - const int input_height = ArraySize(input_dims, 2); - const int input_width = ArraySize(input_dims, 1); - const int output_height = ArraySize(output_dims, 2); - const int output_width = ArraySize(output_dims, 1); +inline void L2Pool(const PoolParams& params, const RuntimeShape& input_shape, + const float* input_data, const RuntimeShape& output_shape, + float* output_data) { + TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4); + TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4); + const int batches = MatchingDim(input_shape, 0, output_shape, 0); + const int depth = MatchingDim(input_shape, 3, output_shape, 3); + const int input_height = input_shape.Dims(1); + const int input_width = input_shape.Dims(2); + const int output_height = output_shape.Dims(1); + const int output_width = output_shape.Dims(2); + const int stride_height = params.stride_height; + const int stride_width = params.stride_width; for (int batch = 0; batch < batches; ++batch) { for (int out_y = 0; out_y < output_height; ++out_y) { for (int out_x = 0; out_x < output_width; ++out_x) { for (int channel = 0; channel < depth; ++channel) { - const int in_x_origin = (out_x * stride_width) - pad_width; - const int in_y_origin = (out_y * stride_height) - pad_height; + const int in_x_origin = + (out_x * stride_width) - params.padding_values.width; + const int in_y_origin = + (out_y * stride_height) - params.padding_values.height; // Compute the boundaries of the filter region clamped so as to // ensure that the filter window fits in the input array. 
const int filter_x_start = std::max(0, -in_x_origin); const int filter_x_end = - std::min(filter_width, input_width - in_x_origin); + std::min(params.filter_width, input_width - in_x_origin); const int filter_y_start = std::max(0, -in_y_origin); const int filter_y_end = - std::min(filter_height, input_height - in_y_origin); + std::min(params.filter_height, input_height - in_y_origin); float sum_squares = 0.f; int filter_count = 0; for (int filter_y = filter_y_start; filter_y < filter_y_end; @@ -2293,69 +2323,51 @@ inline void L2Pool(const float* input_data, const Dims<4>& input_dims, const int in_x = in_x_origin + filter_x; const int in_y = in_y_origin + filter_y; const float val = - input_data[Offset(input_dims, channel, in_x, in_y, batch)]; + input_data[Offset(input_shape, batch, in_y, in_x, channel)]; sum_squares += val * val; filter_count++; } } const float l2pool_result = std::sqrt(sum_squares / filter_count); - output_data[Offset(output_dims, channel, out_x, out_y, batch)] = - ActivationFunctionWithMinMax(l2pool_result, output_activation_min, - output_activation_max); + output_data[Offset(output_shape, batch, out_y, out_x, channel)] = + ActivationFunctionWithMinMax(l2pool_result, + params.float_activation_min, + params.float_activation_max); } } } } } -// legacy, for compatibility with old checked-in code -template <FusedActivationFunctionType Ac> -void L2Pool(const float* input_data, const Dims<4>& input_dims, - int stride_width, int stride_height, int pad_width, int pad_height, - int filter_width, int filter_height, float* output_data, - const Dims<4>& output_dims) { - float output_activation_min, output_activation_max; - GetActivationMinMax(Ac, &output_activation_min, &output_activation_max); - - L2Pool(input_data, input_dims, stride_width, stride_height, pad_width, - pad_height, filter_width, filter_height, output_activation_min, - output_activation_max, output_data, output_dims); -} - -// legacy, for compatibility with old checked-in code -template <FusedActivationFunctionType Ac> -void L2Pool(const float* input_data, const Dims<4>& input_dims, int stride, - int pad_width, int pad_height, int filter_width, int filter_height, - float* output_data, const Dims<4>& output_dims) { - L2Pool<Ac>(input_data, input_dims, stride, stride, pad_width, pad_height, - filter_width, filter_height, output_data, output_dims); -} - -inline void MaxPool(const float* input_data, const Dims<4>& input_dims, - int stride_width, int stride_height, int pad_width, - int pad_height, int filter_width, int filter_height, - float output_activation_min, float output_activation_max, - float* output_data, const Dims<4>& output_dims) { - const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); - const int depth = MatchingArraySize(input_dims, 0, output_dims, 0); - const int input_height = ArraySize(input_dims, 2); - const int input_width = ArraySize(input_dims, 1); - const int output_height = ArraySize(output_dims, 2); - const int output_width = ArraySize(output_dims, 1); +inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape, + const float* input_data, const RuntimeShape& output_shape, + float* output_data) { + TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4); + TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4); + const int batches = MatchingDim(input_shape, 0, output_shape, 0); + const int depth = MatchingDim(input_shape, 3, output_shape, 3); + const int input_height = input_shape.Dims(1); + const int input_width = input_shape.Dims(2); + const int output_height = 
output_shape.Dims(1); + const int output_width = output_shape.Dims(2); + const int stride_height = params.stride_height; + const int stride_width = params.stride_width; for (int batch = 0; batch < batches; ++batch) { for (int out_y = 0; out_y < output_height; ++out_y) { for (int out_x = 0; out_x < output_width; ++out_x) { for (int channel = 0; channel < depth; ++channel) { - const int in_x_origin = (out_x * stride_width) - pad_width; - const int in_y_origin = (out_y * stride_height) - pad_height; + const int in_x_origin = + (out_x * stride_width) - params.padding_values.width; + const int in_y_origin = + (out_y * stride_height) - params.padding_values.height; // Compute the boundaries of the filter region clamped so as to // ensure that the filter window fits in the input array. const int filter_x_start = std::max(0, -in_x_origin); const int filter_x_end = - std::min(filter_width, input_width - in_x_origin); + std::min(params.filter_width, input_width - in_x_origin); const int filter_y_start = std::max(0, -in_y_origin); const int filter_y_end = - std::min(filter_height, input_height - in_y_origin); + std::min(params.filter_height, input_height - in_y_origin); float max = std::numeric_limits<float>::lowest(); for (int filter_y = filter_y_start; filter_y < filter_y_end; ++filter_y) { @@ -2365,68 +2377,51 @@ inline void MaxPool(const float* input_data, const Dims<4>& input_dims, const int in_y = in_y_origin + filter_y; max = std::max( max, - input_data[Offset(input_dims, channel, in_x, in_y, batch)]); + input_data[Offset(input_shape, batch, in_y, in_x, channel)]); } } - output_data[Offset(output_dims, channel, out_x, out_y, batch)] = - ActivationFunctionWithMinMax(max, output_activation_min, - output_activation_max); + output_data[Offset(output_shape, batch, out_y, out_x, channel)] = + ActivationFunctionWithMinMax(max, params.float_activation_min, + params.float_activation_max); } } } } } -// legacy, for compatibility with old checked-in code -template <FusedActivationFunctionType Ac> -void MaxPool(const float* input_data, const Dims<4>& input_dims, - int stride_width, int stride_height, int pad_width, int pad_height, - int filter_width, int filter_height, float* output_data, - const Dims<4>& output_dims) { - float output_activation_min, output_activation_max; - GetActivationMinMax(Ac, &output_activation_min, &output_activation_max); - MaxPool(input_data, input_dims, stride_width, stride_height, pad_width, - pad_height, filter_width, filter_height, output_activation_min, - output_activation_max, output_data, output_dims); -} - -// legacy, for compatibility with old checked-in code -template <FusedActivationFunctionType Ac> -void MaxPool(const float* input_data, const Dims<4>& input_dims, int stride, - int pad_width, int pad_height, int filter_width, int filter_height, - float* output_data, const Dims<4>& output_dims) { - MaxPool<Ac>(input_data, input_dims, stride, stride, pad_width, pad_height, - filter_width, filter_height, output_data, output_dims); -} - -inline void MaxPool(const uint8* input_data, const Dims<4>& input_dims, - int stride_width, int stride_height, int pad_width, - int pad_height, int filter_width, int filter_height, - int32 output_activation_min, int32 output_activation_max, - uint8* output_data, const Dims<4>& output_dims) { - TFLITE_DCHECK_LE(output_activation_min, output_activation_max); - TFLITE_DCHECK_GE(output_activation_min, 0); - TFLITE_DCHECK_LE(output_activation_max, 255); - const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); - const int depth = 
MatchingArraySize(input_dims, 0, output_dims, 0); - const int input_height = ArraySize(input_dims, 2); - const int input_width = ArraySize(input_dims, 1); - const int output_height = ArraySize(output_dims, 2); - const int output_width = ArraySize(output_dims, 1); +inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape, + const uint8* input_data, const RuntimeShape& output_shape, + uint8* output_data) { + TFLITE_DCHECK_LE(params.quantized_activation_min, + params.quantized_activation_max); + TFLITE_DCHECK_GE(params.quantized_activation_min, 0); + TFLITE_DCHECK_LE(params.quantized_activation_max, 255); + TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4); + TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4); + const int batches = MatchingDim(input_shape, 0, output_shape, 0); + const int depth = MatchingDim(input_shape, 3, output_shape, 3); + const int input_height = input_shape.Dims(1); + const int input_width = input_shape.Dims(2); + const int output_height = output_shape.Dims(1); + const int output_width = output_shape.Dims(2); + const int stride_height = params.stride_height; + const int stride_width = params.stride_width; for (int batch = 0; batch < batches; ++batch) { for (int out_y = 0; out_y < output_height; ++out_y) { for (int out_x = 0; out_x < output_width; ++out_x) { for (int channel = 0; channel < depth; ++channel) { - const int in_x_origin = (out_x * stride_width) - pad_width; - const int in_y_origin = (out_y * stride_height) - pad_height; + const int in_x_origin = + (out_x * stride_width) - params.padding_values.width; + const int in_y_origin = + (out_y * stride_height) - params.padding_values.height; // Compute the boundaries of the filter region clamped so as to // ensure that the filter window fits in the input array. 
const int filter_x_start = std::max(0, -in_x_origin); const int filter_x_end = - std::min(filter_width, input_width - in_x_origin); + std::min(params.filter_width, input_width - in_x_origin); const int filter_y_start = std::max(0, -in_y_origin); const int filter_y_end = - std::min(filter_height, input_height - in_y_origin); + std::min(params.filter_height, input_height - in_y_origin); uint8 max = 0; for (int filter_y = filter_y_start; filter_y < filter_y_end; ++filter_y) { @@ -2436,12 +2431,12 @@ inline void MaxPool(const uint8* input_data, const Dims<4>& input_dims, const int in_y = in_y_origin + filter_y; max = std::max( max, - input_data[Offset(input_dims, channel, in_x, in_y, batch)]); + input_data[Offset(input_shape, batch, in_y, in_x, channel)]); } } - max = std::max<uint8>(max, output_activation_min); - max = std::min<uint8>(max, output_activation_max); - output_data[Offset(output_dims, channel, out_x, out_y, batch)] = + max = std::max<uint8>(max, params.quantized_activation_min); + max = std::min<uint8>(max, params.quantized_activation_max); + output_data[Offset(output_shape, batch, out_y, out_x, channel)] = static_cast<uint8>(max); } } @@ -2449,71 +2444,45 @@ inline void MaxPool(const uint8* input_data, const Dims<4>& input_dims, } } -// legacy, for compatibility with old checked-in code -template <FusedActivationFunctionType Ac> -void MaxPool(const uint8* input_data, const Dims<4>& input_dims, - int stride_width, int stride_height, int pad_width, int pad_height, - int filter_width, int filter_height, int32 output_activation_min, - int32 output_activation_max, uint8* output_data, - const Dims<4>& output_dims) { - static_assert(Ac == FusedActivationFunctionType::kNone || - Ac == FusedActivationFunctionType::kRelu || - Ac == FusedActivationFunctionType::kRelu6 || - Ac == FusedActivationFunctionType::kRelu1, - ""); - if (Ac == FusedActivationFunctionType::kNone) { - TFLITE_DCHECK_EQ(output_activation_min, 0); - TFLITE_DCHECK_EQ(output_activation_max, 255); - } - MaxPool(input_data, input_dims, stride_width, stride_height, pad_width, - pad_height, filter_width, filter_height, output_activation_min, - output_activation_max, output_data, output_dims); -} - -// legacy, for compatibility with old checked-in code -template <FusedActivationFunctionType Ac> -void MaxPool(const uint8* input_data, const Dims<4>& input_dims, int stride, - int pad_width, int pad_height, int filter_width, int filter_height, - int32 output_activation_min, int32 output_activation_max, - uint8* output_data, const Dims<4>& output_dims) { - MaxPool<Ac>(input_data, input_dims, stride, stride, pad_width, pad_height, - filter_width, filter_height, output_activation_min, - output_activation_max, output_data, output_dims); -} - -inline void LocalResponseNormalization(const float* input_data, - const Dims<4>& input_dims, int range, - float bias, float alpha, float beta, - float* output_data, - const Dims<4>& output_dims) { - const int outer_size = MatchingFlatSizeSkipDim(input_dims, 0, output_dims); - const int depth = MatchingArraySize(input_dims, 0, output_dims, 0); +inline void LocalResponseNormalization( + const tflite::LocalResponseNormalizationParams& op_params, + const RuntimeShape& input_shape, const float* input_data, + const RuntimeShape& output_shape, float* output_data) { + const int trailing_dim = input_shape.DimensionsCount() - 1; + const int outer_size = + MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape); + const int depth = + MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim); 
for (int i = 0; i < outer_size; ++i) { for (int c = 0; c < depth; ++c) { - const int begin_input_c = std::max(0, c - range); - const int end_input_c = std::min(depth, c + range); + const int begin_input_c = std::max(0, c - op_params.range); + const int end_input_c = std::min(depth, c + op_params.range); float accum = 0.f; for (int input_c = begin_input_c; input_c < end_input_c; ++input_c) { const float input_val = input_data[i * depth + input_c]; accum += input_val * input_val; } - const float multiplier = std::pow(bias + alpha * accum, -beta); + const float multiplier = + std::pow(op_params.bias + op_params.alpha * accum, -op_params.beta); output_data[i * depth + c] = input_data[i * depth + c] * multiplier; } } } -inline void Softmax(const float* input_data, const Dims<4>& input_dims, - float beta, float* output_data, - const Dims<4>& output_dims) { - const int outer_size = MatchingFlatSizeSkipDim(input_dims, 0, output_dims); - const int depth = MatchingArraySize(input_dims, 0, output_dims, 0); +inline void LogSoftmax(const SoftmaxParams& params, + const RuntimeShape& input_shape, const float* input_data, + const RuntimeShape& output_shape, float* output_data) { + const int trailing_dim = input_shape.DimensionsCount() - 1; + const int outer_size = + MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape); + const int depth = + MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim); for (int i = 0; i < outer_size; ++i) { // Find max element value which we'll use to ensure numerical stability // taking advantage of the following equality: - // exp(x[i])/sum(exp(x[i])) == exp(x[i]+C)/sum(exp(x[i]+C)) + // log(exp(x[i])/sum(exp(x[i]))) == log(exp(x[i]+C)/sum(exp(x[i]+C))) float max = std::numeric_limits<float>::lowest(); for (int c = 0; c < depth; ++c) { max = std::max(max, input_data[i * depth + c]); @@ -2522,133 +2491,145 @@ inline void Softmax(const float* input_data, const Dims<4>& input_dims, // Compute sum. float sum = 0.f; for (int c = 0; c < depth; ++c) { - sum += std::exp((input_data[i * depth + c] - max) * beta); + sum += std::exp(input_data[i * depth + c] - max); } // Compute result. + const float log_sum = std::log(sum); for (int c = 0; c < depth; ++c) { - output_data[i * depth + c] = - std::exp((input_data[i * depth + c] - max) * beta) / sum; + output_data[i * depth + c] = input_data[i * depth + c] - max - log_sum; } } } -inline void Softmax(const uint8* input_data, const Dims<4>& input_dims, - int32 input_beta_multiplier, int32 input_beta_left_shift, - int diff_min, uint8* output_data, - const Dims<4>& output_dims) { - // The representation chosen for the input to the exp() function is Q5.26. - // We need to leave extra space since values that we skip might be as large as - // -32 before multiplying by input_beta_multiplier, and therefore as large as - // -16 afterwards. Note that exp(-8) is definitely not insignificant to - // accumulation, but exp(-16) definitely is. - static const int kScaledDiffIntegerBits = 5; - static const int kAccumulationIntegerBits = 12; - using FixedPointScaledDiff = - gemmlowp::FixedPoint<int32, kScaledDiffIntegerBits>; - using FixedPointAccum = gemmlowp::FixedPoint<int32, kAccumulationIntegerBits>; +// Although currently the name of this function says that it cannot handle +// values less than 1, in practice it can handle as low as 1/x_max, where +// x_max is the largest representable input. In other words, the output range +// is symmetric. 
+template <int OutputIntegerBits, int InputIntegerBits> +inline gemmlowp::FixedPoint<int32, OutputIntegerBits> +log_x_for_x_greater_than_or_equal_to_1_impl( + gemmlowp::FixedPoint<int32, InputIntegerBits> input_val) { using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>; - - const int outer_size = MatchingFlatSizeSkipDim(input_dims, 0, output_dims); - const int depth = MatchingArraySize(input_dims, 0, output_dims, 0); - - for (int i = 0; i < outer_size; ++i) { - uint8 max_in_row = 0; - for (int c = 0; c < depth; ++c) { - max_in_row = std::max(max_in_row, input_data[i * depth + c]); - } - - FixedPointAccum sum_of_exps = FixedPointAccum::Zero(); - for (int c = 0; c < depth; ++c) { - int32 input_diff = - static_cast<int32>(input_data[i * depth + c]) - max_in_row; - if (input_diff >= diff_min) { - const int32 input_diff_rescaled = - MultiplyByQuantizedMultiplierGreaterThanOne( - input_diff, input_beta_multiplier, input_beta_left_shift); - const FixedPointScaledDiff scaled_diff_f8 = - FixedPointScaledDiff::FromRaw(input_diff_rescaled); - sum_of_exps = sum_of_exps + gemmlowp::Rescale<kAccumulationIntegerBits>( - exp_on_negative_values(scaled_diff_f8)); - } - } - - int32 fixed_sum_of_exps = sum_of_exps.raw(); - int headroom_plus_one = - CountLeadingZeros(static_cast<uint32>(fixed_sum_of_exps)); - // This is the number of bits to the left of the binary point above 1.0. - // Consider fixed_sum_of_exps=1.25. In that case shifted_scale=0.8 and - // no later adjustment will be needed. - int num_bits_over_unit = kAccumulationIntegerBits - headroom_plus_one; - int32 shifted_sum_minus_one = static_cast<int32>( - (static_cast<uint32>(fixed_sum_of_exps) << headroom_plus_one) - - (static_cast<uint32>(1) << 31)); - - FixedPoint0 shifted_scale = gemmlowp::one_over_one_plus_x_for_x_in_0_1( - FixedPoint0::FromRaw(shifted_sum_minus_one)); - - for (int c = 0; c < depth; ++c) { - int32 input_diff = - static_cast<int32>(input_data[i * depth + c]) - max_in_row; - if (input_diff >= diff_min) { - const int32 input_diff_rescaled = - MultiplyByQuantizedMultiplierGreaterThanOne( - input_diff, input_beta_multiplier, input_beta_left_shift); - const FixedPointScaledDiff scaled_diff_f8 = - FixedPointScaledDiff::FromRaw(input_diff_rescaled); - - FixedPoint0 exp_in_0 = exp_on_negative_values(scaled_diff_f8); - int32 unsat_output = gemmlowp::RoundingDivideByPOT( - (shifted_scale * exp_in_0).raw(), num_bits_over_unit + 31 - 8); - - output_data[i * depth + c] = static_cast<uint8>( - std::max(std::min(unsat_output, static_cast<int32>(255)), 0)); - - } else { - output_data[i * depth + c] = 0; - } - } - } + // The reason for accumulating the result with an extra bit of headroom is + // that z_pow_2_adj * log_2 might be saturated, and adding num_scaled * + // recip_denom will otherwise introduce an error. 
+ static constexpr int kAccumIntegerBits = OutputIntegerBits + 1; + using FixedPointAccum = gemmlowp::FixedPoint<int32, kAccumIntegerBits>; + + const FixedPoint0 log_2 = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT( + FixedPoint0, 1488522236, std::log(2.0)); + const FixedPoint0 sqrt_sqrt_half = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT( + FixedPoint0, 1805811301, std::sqrt(std::sqrt(0.5))); + const FixedPoint0 sqrt_half = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT( + FixedPoint0, 1518500250, std::sqrt(0.5)); + const FixedPoint0 one_quarter = + GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(FixedPoint0, 536870912, 1.0 / 4.0); + + const FixedPoint0 alpha_n = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT( + FixedPoint0, 117049297, 11.0 / 240.0 * std::sqrt(std::sqrt(2.0))); + const FixedPoint0 alpha_d = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT( + FixedPoint0, 127690142, 1.0 / 20.0 * std::sqrt(std::sqrt(2.0))); + const FixedPoint0 alpha_i = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT( + FixedPoint0, 1057819769, + 2.0 / std::sqrt(std::sqrt(2.0)) - std::sqrt(std::sqrt(2.0))); + const FixedPoint0 alpha_f = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT( + FixedPoint0, 638450708, 1.0 / 4.0 * std::sqrt(std::sqrt(2.0))); + + const FixedPointAccum shifted_quarter = + gemmlowp::Rescale<kAccumIntegerBits>(one_quarter); + + // Reinterpret the input value as Q0.31, because we will figure out the + // required shift "ourselves" instead of using, say, Rescale. + FixedPoint0 z_a = FixedPoint0::FromRaw(input_val.raw()); + // z_a_pow_2 = input_integer_bits - z_a_headroom; + int z_a_headroom_plus_1 = CountLeadingZeros(static_cast<uint32>(z_a.raw())); + FixedPoint0 r_a_tmp = + SaturatingRoundingMultiplyByPOTParam(z_a, (z_a_headroom_plus_1 - 1)); + const int32 r_a_raw = + SaturatingRoundingMultiplyByPOTParam((r_a_tmp * sqrt_half).raw(), 1); + // z_pow_2_adj = max(z_pow_2_a - 0.75, z_pow_2_b - 0.25); + // z_pow_2_adj = max(InputIntegerBits - z_a_headroom_plus_1 + 0.25, + // InputIntegerBits - z_b_headroom - 0.25); + const FixedPointAccum z_a_pow_2_adj = SaturatingAddNonGemmlowp( + FixedPointAccum::FromRaw(SaturatingRoundingMultiplyByPOTParam( + InputIntegerBits - z_a_headroom_plus_1, 31 - kAccumIntegerBits)), + shifted_quarter); + + // z_b is treated like z_a, but premultiplying by sqrt(0.5). 
+ FixedPoint0 z_b = z_a * sqrt_half; + int z_b_headroom = CountLeadingZeros(static_cast<uint32>(z_b.raw())) - 1; + const int32 r_b_raw = + SaturatingRoundingMultiplyByPOTParam(z_a.raw(), z_b_headroom); + const FixedPointAccum z_b_pow_2_adj = SaturatingSub( + FixedPointAccum::FromRaw(SaturatingRoundingMultiplyByPOTParam( + InputIntegerBits - z_b_headroom, 31 - kAccumIntegerBits)), + shifted_quarter); + + const FixedPoint0 r = FixedPoint0::FromRaw(std::min(r_a_raw, r_b_raw)); + const FixedPointAccum z_pow_2_adj = FixedPointAccum::FromRaw( + std::max(z_a_pow_2_adj.raw(), z_b_pow_2_adj.raw())); + + const FixedPoint0 p = gemmlowp::RoundingHalfSum(r, sqrt_sqrt_half); + FixedPoint0 q = r - sqrt_sqrt_half; + q = q + q; + + const FixedPoint0 common_sq = q * q; + const FixedPoint0 num = q * r + q * common_sq * alpha_n; + const FixedPoint0 denom_minus_one_0 = + p * (alpha_i + q + alpha_d * common_sq) + alpha_f * q; + const FixedPoint0 recip_denom = + one_over_one_plus_x_for_x_in_0_1(denom_minus_one_0); + + const FixedPointAccum num_scaled = gemmlowp::Rescale<kAccumIntegerBits>(num); + return gemmlowp::Rescale<OutputIntegerBits>(z_pow_2_adj * log_2 + + num_scaled * recip_denom); } -inline void LogSoftmax(const float* input_data, const Dims<4>& input_dims, - float* output_data, const Dims<4>& output_dims) { - const int outer_size = MatchingFlatSizeSkipDim(input_dims, 0, output_dims); - const int depth = MatchingArraySize(input_dims, 0, output_dims, 0); - - for (int i = 0; i < outer_size; ++i) { - // Find max element value which we'll use to ensure numerical stability - // taking advantage of the following equality: - // log(exp(x[i])/sum(exp(x[i]))) == log(exp(x[i]+C)/sum(exp(x[i]+C))) - float max = std::numeric_limits<float>::lowest(); - for (int c = 0; c < depth; ++c) { - max = std::max(max, input_data[i * depth + c]); - } - - // Compute sum. - float sum = 0.f; - for (int c = 0; c < depth; ++c) { - sum += std::exp(input_data[i * depth + c] - max); - } +// Minimum output bits to accommodate log of maximum input range. It actually +// does not matter if one considers, say, [-64,64] or [-64,64). +// +// For example, run this through Octave: +// [0:127; ... +// ceil(log(abs( log(2.^(0:127))+1 ))/log(2)); ... +// ceil(log(abs( log(2.^(0:127))+1 ))/log(2))] +constexpr int min_log_x_output_bits(int input_bits) { + return input_bits > 90 + ? 7 + : input_bits > 44 + ? 6 + : input_bits > 21 + ? 5 + : input_bits > 10 + ? 4 + : input_bits > 4 ? 3 : input_bits > 1 ? 2 : 1; +} - // Compute result. 
- const float log_sum = std::log(sum); - for (int c = 0; c < depth; ++c) { - output_data[i * depth + c] = input_data[i * depth + c] - max - log_sum; - } - } +template <int OutputIntegerBits, int InputIntegerBits> +inline gemmlowp::FixedPoint<int32, OutputIntegerBits> +log_x_for_x_greater_than_or_equal_to_1( + gemmlowp::FixedPoint<int32, InputIntegerBits> input_val) { + static_assert( + OutputIntegerBits >= min_log_x_output_bits(InputIntegerBits), + "Output integer bits must be sufficent to accommodate logs of inputs."); + return log_x_for_x_greater_than_or_equal_to_1_impl<OutputIntegerBits, + InputIntegerBits>( + input_val); } -inline void LogSoftmax(const uint8* input_data, const Dims<4>& input_dims, - int32 input_multiplier, int32 input_left_shift, - int32 reverse_scaling_divisor, - int32 reverse_scaling_right_shift, int diff_min, - uint8* output_data, const Dims<4>& output_dims) { +inline void LogSoftmax(const SoftmaxParams& params, + const RuntimeShape& input_shape, const uint8* input_data, + const RuntimeShape& output_shape, uint8* output_data) { + const int32 input_multiplier = params.input_multiplier; + const int32 input_left_shift = params.input_left_shift; + const int32 reverse_scaling_divisor = params.reverse_scaling_divisor; + const int32 reverse_scaling_right_shift = params.reverse_scaling_right_shift; + const int diff_min = params.diff_min; // The representation chosen for the input to the exp() function is Q5.26. - // We need to leave extra space since values that we skip might be as large as - // -32 before multiplying by input_beta_multiplier, and therefore as large as - // -16 afterwards. Note that exp(-8) is definitely not insignificant to - // accumulation, but exp(-16) definitely is. + // We need to leave extra space since values that we skip might be as large + // as -32 before multiplying by input_beta_multiplier, and therefore as + // large as -16 afterwards. Note that exp(-8) is definitely not + // insignificant to accumulation, but exp(-16) definitely is. static constexpr int kScaledDiffIntegerBits = 5; static constexpr int kAccumulationIntegerBits = 12; static constexpr int kOutputIntegerBits = 4; @@ -2657,8 +2638,11 @@ inline void LogSoftmax(const uint8* input_data, const Dims<4>& input_dims, using FixedPointAccum = gemmlowp::FixedPoint<int32, kAccumulationIntegerBits>; using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>; - const int outer_size = MatchingFlatSizeSkipDim(input_dims, 0, output_dims); - const int depth = MatchingArraySize(input_dims, 0, output_dims, 0); + const int trailing_dim = input_shape.DimensionsCount() - 1; + const int outer_size = + MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape); + const int depth = + MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim); for (int i = 0; i < outer_size; ++i) { uint8 max_in_row = 0; @@ -2681,13 +2665,10 @@ inline void LogSoftmax(const uint8* input_data, const Dims<4>& input_dims, } } - // TODO(b/77858996): Implement fixed-point log(). - // Not a fully-quantized implementation: floating-point log(). 
- const float float_log_sum_of_exps = - std::log(static_cast<float>(sum_of_exps.raw()) / - (1 << (31 - kAccumulationIntegerBits))); - const int32 fixed_log_sum_of_exps = static_cast<int32>(TfLiteRound( - float_log_sum_of_exps * (1 << (31 - kScaledDiffIntegerBits)))); + const int32 fixed_log_sum_of_exps = + log_x_for_x_greater_than_or_equal_to_1<kScaledDiffIntegerBits>( + sum_of_exps) + .raw(); // rescaled_diff_min is smallest representable in // Q(kScaledDiffIntegerBits).(31-kScaledDiffIntegerBits) plus the @@ -2698,9 +2679,9 @@ inline void LogSoftmax(const uint8* input_data, const Dims<4>& input_dims, fixed_log_sum_of_exps + std::numeric_limits<int32>::lowest(); const int adjusted_diff_min = std::max(diff_min - 1, // Note use of > below instead of >= above. - MultiplyByQuantizedMultiplierSmallerThanOne( + MultiplyByQuantizedMultiplierSmallerThanOneExp( rescaled_diff_min, reverse_scaling_divisor, - reverse_scaling_right_shift)); + -reverse_scaling_right_shift)); for (int c = 0; c < depth; ++c) { int32 input_diff = @@ -2725,9 +2706,9 @@ inline void LogSoftmax(const uint8* input_data, const Dims<4>& input_dims, } } -inline void Logistic(const float* input_data, const Dims<4>& input_dims, - float* output_data, const Dims<4>& output_dims) { - const int flat_size = MatchingFlatSize(output_dims, input_dims); +inline void Logistic(const RuntimeShape& input_shape, const float* input_data, + const RuntimeShape& output_shape, float* output_data) { + const int flat_size = MatchingFlatSize(input_shape, output_shape); for (int i = 0; i < flat_size; i++) { float val = input_data[i]; @@ -2736,11 +2717,23 @@ inline void Logistic(const float* input_data, const Dims<4>& input_dims, } } -inline void Logistic(const uint8* input_data, const Dims<4>& input_dims, - int32 input_zero_point, int32 input_range_radius, - int32 input_multiplier, int input_left_shift, - uint8* output_data, const Dims<4>& output_dims) { - const int flat_size = MatchingFlatSize(output_dims, input_dims); +// Convenience version that allows, for example, generated-code calls to be +// uniform between data types. +inline void Logistic(const LogisticParams&, const RuntimeShape& input_shape, + const float* input_data, const RuntimeShape& output_shape, + float* output_data) { + // Drop params: not needed. + Logistic(input_shape, input_data, output_shape, output_data); +} + +inline void Logistic(const LogisticParams& params, + const RuntimeShape& input_shape, const uint8* input_data, + const RuntimeShape& output_shape, uint8* output_data) { + const int32 input_zero_point = params.input_zero_point; + const int32 input_range_radius = params.input_range_radius; + const int32 input_multiplier = params.input_multiplier; + const int input_left_shift = params.input_left_shift; + const int flat_size = MatchingFlatSize(input_shape, output_shape); for (int i = 0; i < flat_size; i++) { const uint8 input_val_u8 = input_data[i]; @@ -2774,9 +2767,10 @@ inline void Logistic(const uint8* input_data, const Dims<4>& input_dims, } } -inline void Logistic(const int16* input_data, const Dims<4>& input_dims, - int16* output_data, const Dims<4>& output_dims) { - const int flat_size = MatchingFlatSize(output_dims, input_dims); +inline void Logistic(const LogisticParams& params, + const RuntimeShape& input_shape, const int16* input_data, + const RuntimeShape& output_shape, int16* output_data) { + const int flat_size = MatchingFlatSize(input_shape, output_shape); for (int i = 0; i < flat_size; i++) { // F0 uses 0 integer bits, range [-1, 1]. 
@@ -2792,9 +2786,9 @@ inline void Logistic(const int16* input_data, const Dims<4>& input_dims, } } -inline void Tanh(const float* input_data, const Dims<4>& input_dims, - float* output_data, const Dims<4>& output_dims) { - const int flat_size = MatchingFlatSize(output_dims, input_dims); +inline void Tanh(const RuntimeShape& input_shape, const float* input_data, + const RuntimeShape& output_shape, float* output_data) { + const int flat_size = MatchingFlatSize(input_shape, output_shape); for (int i = 0; i < flat_size; i++) { float val = input_data[i]; @@ -2803,12 +2797,24 @@ inline void Tanh(const float* input_data, const Dims<4>& input_dims, } } -inline void Tanh(const uint8* input_data, const Dims<4>& input_dims, - int32 input_zero_point, int32 input_range_radius, - int32 input_multiplier, int input_left_shift, - uint8* output_data, const Dims<4>& output_dims) { +// Convenience version that allows, for example, generated-code calls to be +// uniform between data types. +inline void Tanh(const TanhParams&, const RuntimeShape& input_shape, + const float* input_data, const RuntimeShape& output_shape, + float* output_data) { + // Drop params: not needed. + Tanh(input_shape, input_data, output_shape, output_data); +} + +inline void Tanh(const TanhParams& params, const RuntimeShape& input_shape, + const uint8* input_data, const RuntimeShape& output_shape, + uint8* output_data) { + const int32 input_zero_point = params.input_zero_point; + const int32 input_range_radius = params.input_range_radius; + const int32 input_multiplier = params.input_multiplier; + const int input_left_shift = params.input_left_shift; const int32 output_zero_point = 128; - const int flat_size = MatchingFlatSize(output_dims, input_dims); + const int flat_size = MatchingFlatSize(input_shape, output_shape); for (int i = 0; i < flat_size; i++) { const uint8 input_val_u8 = input_data[i]; @@ -2843,15 +2849,16 @@ inline void Tanh(const uint8* input_data, const Dims<4>& input_dims, } } -inline void Tanh(const int16* input_data, const Dims<4>& input_dims, - int input_left_shift, int16* output_data, - const Dims<4>& output_dims) { +inline void Tanh(const TanhParams& params, const RuntimeShape& input_shape, + const int16* input_data, const RuntimeShape& output_shape, + int16* output_data) { + const int input_left_shift = params.input_left_shift; // Support for shifts is limited until we have a parameterized version of // SaturatingRoundingMultiplyByPOT(). TFLITE_DCHECK_GE(input_left_shift, 0); TFLITE_DCHECK_LE(input_left_shift, 1); - const int flat_size = MatchingFlatSize(output_dims, input_dims); + const int flat_size = MatchingFlatSize(input_shape, output_shape); // F0 uses 0 integer bits, range [-1, 1]. 
// This is the return type of math functions such as tanh, logistic, @@ -2876,10 +2883,12 @@ inline void Tanh(const int16* input_data, const Dims<4>& input_dims, } } -inline void Dequantize(const uint8* input_data, const Dims<4>& input_dims, - int32 zero_point, double scale, float* output_data, - const Dims<4>& output_dims) { - const int flat_size = MatchingFlatSize(output_dims, input_dims); +inline void Dequantize(const tflite::DequantizationParams& op_params, + const RuntimeShape& input_shape, const uint8* input_data, + const RuntimeShape& output_shape, float* output_data) { + int32 zero_point = op_params.zero_point; + double scale = op_params.scale; + const int flat_size = MatchingFlatSize(input_shape, output_shape); for (int i = 0; i < flat_size; i++) { int32 val = input_data[i]; @@ -2888,9 +2897,12 @@ inline void Dequantize(const uint8* input_data, const Dims<4>& input_dims, } } -inline void FakeQuant(const float* input_data, const Dims<4>& input_dims, - float rmin, float rmax, int num_bits, float* output_data, - const Dims<4>& output_dims) { +inline void FakeQuant(const tflite::FakeQuantParams& op_params, + const RuntimeShape& input_shape, const float* input_data, + const RuntimeShape& output_shape, float* output_data) { + float rmin = op_params.minmax.min; + float rmax = op_params.minmax.max; + int num_bits = op_params.num_bits; // 0 should always be a representable value. Let's assume that the initial // min,max range contains 0. TFLITE_DCHECK_LE(rmin, 0.0f); @@ -2903,24 +2915,15 @@ inline void FakeQuant(const float* input_data, const Dims<4>& input_dims, float nudged_min, nudged_max, nudged_scale; NudgeQuantizationRange(rmin, rmax, quant_min, quant_max, &nudged_min, &nudged_max, &nudged_scale); - const float inv_nudged_scale = 1.0f / nudged_scale; - - const int flat_size = MatchingFlatSize(output_dims, input_dims); - for (int i = 0; i < flat_size; i++) { - const float src_val = input_data[i]; - const float clamped = std::min(nudged_max, std::max(nudged_min, src_val)); - const float clamped_shifted = clamped - nudged_min; - const float dst_val = - TfLiteRound(clamped_shifted * inv_nudged_scale) * nudged_scale + - nudged_min; - output_data[i] = dst_val; - } + const int flat_size = MatchingFlatSize(input_shape, output_shape); + FakeQuantizeArray(nudged_scale, nudged_min, nudged_max, input_data, + output_data, flat_size); } template <typename SrcT, typename DstT> -inline void Cast(const SrcT* input_data, const Dims<4>& input_dims, - DstT* output_data, const Dims<4>& output_dims) { - const int flat_size = MatchingFlatSize(output_dims, input_dims); +inline void Cast(const RuntimeShape& input_shape, const SrcT* input_data, + const RuntimeShape& output_shape, DstT* output_data) { + const int flat_size = MatchingFlatSize(input_shape, output_shape); for (int i = 0; i < flat_size; i++) { int offset = i; @@ -2928,9 +2931,9 @@ inline void Cast(const SrcT* input_data, const Dims<4>& input_dims, } } -inline void Floor(const float* input_data, const Dims<4>& input_dims, - float* output_data, const Dims<4>& output_dims) { - const int flat_size = MatchingFlatSize(output_dims, input_dims); +inline void Floor(const RuntimeShape& input_shape, const float* input_data, + const RuntimeShape& output_shape, float* output_data) { + const int flat_size = MatchingFlatSize(input_shape, output_shape); for (int i = 0; i < flat_size; i++) { int offset = i; @@ -2939,44 +2942,79 @@ inline void Floor(const float* input_data, const Dims<4>& input_dims, } template <typename T> -inline void Gather(const T* input_data, 
const Dims<4>& input_dims, - int input_rank, const int32* coords_data, - const Dims<4>& coords_dims, T* output_data, - const Dims<4>& output_dims) { - TFLITE_DCHECK(coords_dims.sizes[0] == output_dims.sizes[input_rank - 1]); - int stride = input_dims.strides[input_rank - 1]; +inline void Gather(const tflite::GatherParams& op_params, + const RuntimeShape& unextended_input_shape, + const T* input_data, const RuntimeShape& coords_shape, + const int32* coords_data, + const RuntimeShape& unextended_output_shape, + T* output_data) { + TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4); + TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4); + const RuntimeShape input_shape = + RuntimeShape::ExtendedShape(4, unextended_input_shape); + const RuntimeShape output_shape = + RuntimeShape::ExtendedShape(4, unextended_output_shape); + + const int input_rank = op_params.input_rank; + const int gather_dimensions = output_shape.DimensionsCount(); + TFLITE_DCHECK_GE(input_shape.DimensionsCount(), gather_dimensions); + const int axis = gather_dimensions - input_rank; + TFLITE_DCHECK_LT(axis, gather_dimensions); + TFLITE_DCHECK_GE(axis, 0); + const int coords_count = coords_shape.FlatSize(); + TFLITE_DCHECK_EQ(coords_count, output_shape.Dims(axis)); + + int64_t stride = 1; + for (int i = axis + 1; i < gather_dimensions; ++i) { + stride *= input_shape.Dims(i); + } T* out = output_data; - for (int i = 0; i < coords_dims.sizes[0]; i++) { + for (int i = 0; i < coords_count; ++i) { TFLITE_DCHECK_GE(coords_data[i], 0); - TFLITE_DCHECK_LT(coords_data[i], input_dims.sizes[input_rank - 1]); + TFLITE_DCHECK_LT(coords_data[i], input_shape.Dims(axis)); const T* in = input_data + coords_data[i] * stride; memcpy(out, in, sizeof(T) * stride); out += stride; } } -inline void ResizeBilinear(const float* input_data, const Dims<4>& input_dims, +template <typename T> +inline void ResizeBilinear(const tflite::ResizeBilinearParams& op_params, + const RuntimeShape& unextended_input_shape, + const T* input_data, + const RuntimeShape& unextended_output_size_shape, const int32* output_size_data, - const Dims<4>& output_size_dims, float* output_data, - const Dims<4>& output_dims, bool align_corners) { - int32 batches = MatchingArraySize(input_dims, 3, output_dims, 3); - int32 input_height = ArraySize(input_dims, 2); - int32 input_width = ArraySize(input_dims, 1); - int32 depth = MatchingArraySize(input_dims, 0, output_dims, 0); - - TFLITE_DCHECK_EQ(ArraySize(output_size_dims, 3), 1); - TFLITE_DCHECK_EQ(ArraySize(output_size_dims, 2), 1); - TFLITE_DCHECK_EQ(ArraySize(output_size_dims, 1), 1); - TFLITE_DCHECK_EQ(ArraySize(output_size_dims, 0), 2); - int32 output_height = output_size_data[Offset(output_size_dims, 0, 0, 0, 0)]; - int32 output_width = output_size_data[Offset(output_size_dims, 1, 0, 0, 0)]; + const RuntimeShape& unextended_output_shape, + T* output_data) { + TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4); + TFLITE_DCHECK_LE(unextended_output_size_shape.DimensionsCount(), 4); + TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4); + const RuntimeShape input_shape = + RuntimeShape::ExtendedShape(4, unextended_input_shape); + const RuntimeShape output_size_shape = + RuntimeShape::ExtendedShape(4, unextended_output_size_shape); + const RuntimeShape output_shape = + RuntimeShape::ExtendedShape(4, unextended_output_shape); + + int32 batches = MatchingDim(input_shape, 0, output_shape, 0); + int32 input_height = input_shape.Dims(1); + int32 input_width = input_shape.Dims(2); + int32 
depth = MatchingDim(input_shape, 3, output_shape, 3); + + TFLITE_DCHECK_EQ(output_size_shape.Dims(0), 1); + TFLITE_DCHECK_EQ(output_size_shape.Dims(1), 1); + TFLITE_DCHECK_EQ(output_size_shape.Dims(2), 1); + TFLITE_DCHECK_EQ(output_size_shape.Dims(3), 2); + int32 output_height = output_size_data[Offset(output_size_shape, 0, 0, 0, 0)]; + int32 output_width = output_size_data[Offset(output_size_shape, 0, 0, 0, 1)]; + float height_scale = static_cast<float>(input_height) / output_height; float width_scale = static_cast<float>(input_width) / output_width; - if (align_corners && output_height > 1) { + if (op_params.align_corners && output_height > 1) { height_scale = static_cast<float>(input_height - 1) / (output_height - 1); } - if (align_corners && output_width > 1) { + if (op_params.align_corners && output_width > 1) { width_scale = static_cast<float>(input_width - 1) / (output_width - 1); } @@ -2990,70 +3028,73 @@ inline void ResizeBilinear(const float* input_data, const Dims<4>& input_dims, int32 x0 = static_cast<int32>(std::floor(input_x)); int32 x1 = std::min(x0 + 1, input_width - 1); for (int c = 0; c < depth; ++c) { - float interpolation = input_data[Offset(input_dims, c, x0, y0, b)] * - (1 - (input_y - y0)) * - (1 - (input_x - x0)) + - input_data[Offset(input_dims, c, x0, y1, b)] * - (input_y - y0) * (1 - (input_x - x0)) + - input_data[Offset(input_dims, c, x1, y0, b)] * - (1 - (input_y - y0)) * (input_x - x0) + - input_data[Offset(input_dims, c, x1, y1, b)] * - (input_y - y0) * (input_x - x0); - output_data[Offset(output_dims, c, x, y, b)] = interpolation; + T interpolation = + static_cast<T>(input_data[Offset(input_shape, b, y0, x0, c)] * + (1 - (input_y - y0)) * (1 - (input_x - x0)) + + input_data[Offset(input_shape, b, y1, x0, c)] * + (input_y - y0) * (1 - (input_x - x0)) + + input_data[Offset(input_shape, b, y0, x1, c)] * + (1 - (input_y - y0)) * (input_x - x0) + + input_data[Offset(input_shape, b, y1, x1, c)] * + (input_y - y0) * (input_x - x0)); + output_data[Offset(output_shape, b, y, x, c)] = interpolation; } } } } } -// legacy, for compatibility with old checked-in code -inline void ResizeBilinear(const float* input_data, const Dims<4>& input_dims, - const int32* output_size_data, - const Dims<4>& output_size_dims, float* output_data, - const Dims<4>& output_dims) { - ResizeBilinear(input_data, input_dims, output_size_data, output_size_dims, - output_data, output_dims, /*align_corners=*/false); -} - template <typename T> -inline void SpaceToBatchND(const T* input_data, const Dims<4>& input_dims, - const int32* block_shape_data, - const Dims<4>& block_shape_dims, - const int32* paddings_data, - const Dims<4>& paddings_dims, T* output_data, - const Dims<4>& output_dims) { - const int output_batch_size = ArraySize(output_dims, 3); - const int output_height = ArraySize(output_dims, 2); - const int output_width = ArraySize(output_dims, 1); - const int input_batch_size = ArraySize(input_dims, 3); - const int input_height = ArraySize(input_dims, 2); - const int input_width = ArraySize(input_dims, 1); - const int depth = ArraySize(input_dims, 0); +inline void SpaceToBatchND( + const SpaceToBatchParams& params, + const RuntimeShape& unextended_input1_shape, const T* input1_data, + const RuntimeShape& unextended_input2_shape, const int32* block_shape_data, + const RuntimeShape& unextended_input3_shape, const int32* paddings_data, + const RuntimeShape& unextended_output_shape, T* output_data) { + TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 4); + 
TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4); + const RuntimeShape input1_shape = + RuntimeShape::ExtendedShape(4, unextended_input1_shape); + const RuntimeShape output_shape = + RuntimeShape::ExtendedShape(4, unextended_output_shape); + + const int depth = input1_shape.Dims(3); + const int input_width = input1_shape.Dims(2); + const int input_height = input1_shape.Dims(1); + const int input_batch_size = input1_shape.Dims(0); + + const int output_width = output_shape.Dims(2); + const int output_height = output_shape.Dims(1); + const int output_batch_size = output_shape.Dims(0); + const int block_shape_height = block_shape_data[0]; const int block_shape_width = block_shape_data[1]; const int padding_top = paddings_data[0]; const int padding_left = paddings_data[2]; + // For uint8 quantized, the correct padding "zero value" is the output offset. + const int32_t pad_value = params.output_offset; + for (int out_b = 0; out_b < output_batch_size; ++out_b) { int input_batch = out_b % input_batch_size; int shift_w = (out_b / input_batch_size) % block_shape_width; int shift_h = (out_b / input_batch_size) / block_shape_width; for (int out_h = 0; out_h < output_height; ++out_h) { for (int out_w = 0; out_w < output_width; ++out_w) { - T* out = output_data + Offset(output_dims, 0, out_w, out_h, out_b); + T* out = output_data + Offset(output_shape, out_b, out_h, out_w, 0); if (out_h * block_shape_height + shift_h < padding_top || out_h * block_shape_height + shift_h >= padding_top + input_height || out_w * block_shape_width + shift_w < padding_left || out_w * block_shape_width + shift_w >= padding_left + input_width) { - memset(out, 0, depth * sizeof(T)); + // This may not execute correctly when pad_value != 0 and T != uint8. + memset(out, pad_value, depth * sizeof(T)); } else { const T* in = - input_data + - Offset(input_dims, 0, - (out_w * block_shape_width + shift_w) - padding_left, + input1_data + + Offset(input1_shape, input_batch, (out_h * block_shape_height + shift_h) - padding_top, - input_batch); + (out_w * block_shape_width + shift_w) - padding_left, 0); memcpy(out, in, depth * sizeof(T)); } } @@ -3062,18 +3103,27 @@ inline void SpaceToBatchND(const T* input_data, const Dims<4>& input_dims, } template <typename T> -inline void BatchToSpaceND(const T* input_data, const Dims<4>& input_dims, - const int32* block_shape_data, - const Dims<4>& block_shape_dims, - const int32* crops_data, const Dims<4>& crops_dims, - T* output_data, const Dims<4>& output_dims) { - const int output_batch_size = ArraySize(output_dims, 3); - const int output_height = ArraySize(output_dims, 2); - const int output_width = ArraySize(output_dims, 1); - const int input_batch_size = ArraySize(input_dims, 3); - const int input_height = ArraySize(input_dims, 2); - const int input_width = ArraySize(input_dims, 1); - const int depth = ArraySize(input_dims, 0); +inline void BatchToSpaceND( + const RuntimeShape& unextended_input1_shape, const T* input1_data, + const RuntimeShape& unextended_input2_shape, const int32* block_shape_data, + const RuntimeShape& unextended_input3_shape, const int32* crops_data, + const RuntimeShape& unextended_output_shape, T* output_data) { + TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 4); + TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4); + const RuntimeShape input1_shape = + RuntimeShape::ExtendedShape(4, unextended_input1_shape); + const RuntimeShape output_shape = + RuntimeShape::ExtendedShape(4, unextended_output_shape); + + const int output_width = 
output_shape.Dims(2); + const int output_height = output_shape.Dims(1); + const int output_batch_size = output_shape.Dims(0); + + const int depth = input1_shape.Dims(3); + const int input_width = input1_shape.Dims(2); + const int input_height = input1_shape.Dims(1); + const int input_batch_size = input1_shape.Dims(0); + const int block_shape_width = block_shape_data[1]; const int block_shape_height = block_shape_data[0]; const int crops_top = crops_data[0]; @@ -3095,36 +3145,61 @@ inline void BatchToSpaceND(const T* input_data, const Dims<4>& input_dims, if (out_w < 0 || out_w >= output_width) { continue; } - T* out = output_data + Offset(output_dims, 0, out_w, out_h, out_batch); - const T* in = input_data + Offset(input_dims, 0, in_w, in_h, in_batch); + T* out = output_data + Offset(output_shape, out_batch, out_h, out_w, 0); + const T* in = + input1_data + Offset(input1_shape, in_batch, in_h, in_w, 0); memcpy(out, in, depth * sizeof(T)); } } } } -template <typename T> -inline void PadV2(const T* input_data, const Dims<4>& input_dims, - const std::vector<int>& left_paddings, - const std::vector<int>& right_paddings, T* output_data, - const Dims<4>& output_dims, const T pad_value) { - TFLITE_DCHECK_EQ(left_paddings.size(), 4); - TFLITE_DCHECK_EQ(right_paddings.size(), 4); - - const int output_batch = ArraySize(output_dims, 3); - const int output_height = ArraySize(output_dims, 2); - const int output_width = ArraySize(output_dims, 1); - const int output_depth = ArraySize(output_dims, 0); - - const int left_b_padding = left_paddings[3]; - const int left_h_padding = left_paddings[2]; - const int left_w_padding = left_paddings[1]; - const int left_d_padding = left_paddings[0]; - - const int right_b_padding = right_paddings[3]; - const int right_h_padding = right_paddings[2]; - const int right_w_padding = right_paddings[1]; - const int right_d_padding = right_paddings[0]; +// There are two versions of pad: Pad and PadV2. In PadV2 there is a second +// scalar input that provides the padding value. Therefore pad_value_ptr can be +// equivalent to a simple input1_data. For Pad, it should point to a zero +// value. +// +// Note that two typenames are required, so that T=P=int32 is considered a +// specialization distinct from P=int32. +template <typename T, typename P> +inline void PadImpl(const tflite::PadParams& op_params, + const RuntimeShape& input_shape, const T* input_data, + const P* pad_value_ptr, const RuntimeShape& output_shape, + T* output_data) { + const RuntimeShape ext_input_shape = + RuntimeShape::ExtendedShape(4, input_shape); + const RuntimeShape ext_output_shape = + RuntimeShape::ExtendedShape(4, output_shape); + TFLITE_DCHECK_LE(op_params.left_padding_count, 4); + TFLITE_DCHECK_LE(op_params.right_padding_count, 4); + + // Runtime calls are currently fixed at 4 dimensions. Copy inputs so + // we can pad them to 4 dims (yes, we are "padding the padding"). 
+ std::vector<int> left_padding_copy(4, 0); + for (int i = 0; i < op_params.left_padding_count; ++i) { + left_padding_copy[i] = op_params.left_padding[i]; + } + std::vector<int> right_padding_copy(4, 0); + for (int i = 0; i < op_params.right_padding_count; ++i) { + right_padding_copy[i] = op_params.right_padding[i]; + } + + const int output_batch = ext_output_shape.Dims(0); + const int output_height = ext_output_shape.Dims(1); + const int output_width = ext_output_shape.Dims(2); + const int output_depth = ext_output_shape.Dims(3); + + const int left_b_padding = left_padding_copy[0]; + const int left_h_padding = left_padding_copy[1]; + const int left_w_padding = left_padding_copy[2]; + const int left_d_padding = left_padding_copy[3]; + + const int right_b_padding = right_padding_copy[0]; + const int right_h_padding = right_padding_copy[1]; + const int right_w_padding = right_padding_copy[2]; + const int right_d_padding = right_padding_copy[3]; + + const T pad_value = *pad_value_ptr; const T* in_ptr = input_data; T* out_ptr = output_data; @@ -3150,69 +3225,83 @@ inline void PadV2(const T* input_data, const Dims<4>& input_dims, } } -// Legacy Pad() method that casts an int32_t to T before padding. -template <typename T> -inline void Pad(const T* input_data, const Dims<4>& input_dims, - const std::vector<int>& left_paddings, - const std::vector<int>& right_paddings, T* output_data, - const Dims<4>& output_dims, const int32_t pad_value) { - const T converted_pad_value = static_cast<T>(pad_value); - PadV2<T>(input_data, input_dims, left_paddings, right_paddings, output_data, - output_dims, converted_pad_value); +template <typename T, typename P> +inline void Pad(const tflite::PadParams& op_params, + const RuntimeShape& input_shape, const T* input_data, + const P* pad_value_ptr, const RuntimeShape& output_shape, + T* output_data) { + PadImpl(op_params, input_shape, input_data, pad_value_ptr, output_shape, + output_data); } +// The second (pad-value) input can be int32 when, say, the first is uint8. template <typename T> -inline void Pad(const T* input_data, const Dims<4>& input_dims, - const std::vector<int>& left_paddings, - const std::vector<int>& right_paddings, T* output_data, - const Dims<4>& output_dims) { - Pad(input_data, input_dims, left_paddings, right_paddings, output_data, - output_dims, 0); +inline void Pad(const tflite::PadParams& op_params, + const RuntimeShape& input_shape, const T* input_data, + const int32* pad_value_ptr, const RuntimeShape& output_shape, + T* output_data) { + const T converted_pad_value = static_cast<T>(*pad_value_ptr); + PadImpl(op_params, input_shape, input_data, &converted_pad_value, + output_shape, output_data); +} + +// This version avoids conflicting template matching. +template <> +inline void Pad(const tflite::PadParams& op_params, + const RuntimeShape& input_shape, const int32* input_data, + const int32* pad_value_ptr, const RuntimeShape& output_shape, + int32* output_data) { + PadImpl(op_params, input_shape, input_data, pad_value_ptr, output_shape, + output_data); } template <typename T> -inline void StridedSlice(const T* input_data, const Dims<4>& input_dims, - int begin_mask, int end_mask, - const std::vector<int>& start_indices, - const std::vector<int>& stop_indices, - const std::vector<int>& strides, T* output_data, - const Dims<4>& output_dims) { - // Note that the axis orders are reversed for runtime ops, so the indices, - // strides and masks must be as well too. 
- TFLITE_DCHECK_EQ(start_indices.size(), 4); - TFLITE_DCHECK_EQ(stop_indices.size(), 4); - TFLITE_DCHECK_EQ(strides.size(), 4); - const int start_b = strided_slice::StartForAxis(begin_mask, start_indices, - strides, input_dims.sizes, 3); - const int stop_b = strided_slice::StopForAxis(end_mask, stop_indices, strides, - input_dims.sizes, 3); - const int start_h = strided_slice::StartForAxis(begin_mask, start_indices, - strides, input_dims.sizes, 2); - const int stop_h = strided_slice::StopForAxis(end_mask, stop_indices, strides, - input_dims.sizes, 2); - const int start_w = strided_slice::StartForAxis(begin_mask, start_indices, - strides, input_dims.sizes, 1); - const int stop_w = strided_slice::StopForAxis(end_mask, stop_indices, strides, - input_dims.sizes, 1); - const int start_d = strided_slice::StartForAxis(begin_mask, start_indices, - strides, input_dims.sizes, 0); - const int stop_d = strided_slice::StopForAxis(end_mask, stop_indices, strides, - input_dims.sizes, 0); +inline void StridedSlice(const tflite::StridedSliceParams& op_params, + const RuntimeShape& unextended_input_shape, + const T* input_data, + const RuntimeShape& unextended_output_shape, + T* output_data) { + // Note that the output_shape is not used herein. + tflite::StridedSliceParams params_copy = op_params; + + TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4); + TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4); + const RuntimeShape input_shape = + RuntimeShape::ExtendedShape(4, unextended_input_shape); + const RuntimeShape output_shape = + RuntimeShape::ExtendedShape(4, unextended_output_shape); + + // Reverse and pad to 4 dimensions because that is what the runtime code + // requires (ie. all shapes must be 4D and are given backwards). + strided_slice::StridedSlicePadIndices(&params_copy, 4); + + const int start_b = strided_slice::StartForAxis(params_copy, input_shape, 0); + const int stop_b = + strided_slice::StopForAxis(params_copy, input_shape, 0, start_b); + const int start_h = strided_slice::StartForAxis(params_copy, input_shape, 1); + const int stop_h = + strided_slice::StopForAxis(params_copy, input_shape, 1, start_h); + const int start_w = strided_slice::StartForAxis(params_copy, input_shape, 2); + const int stop_w = + strided_slice::StopForAxis(params_copy, input_shape, 2, start_w); + const int start_d = strided_slice::StartForAxis(params_copy, input_shape, 3); + const int stop_d = + strided_slice::StopForAxis(params_copy, input_shape, 3, start_d); T* out_ptr = output_data; for (int in_b = start_b; - !strided_slice::LoopCondition(in_b, stop_b, strides[3]); - in_b += strides[3]) { + !strided_slice::LoopCondition(in_b, stop_b, params_copy.strides[0]); + in_b += params_copy.strides[0]) { for (int in_h = start_h; - !strided_slice::LoopCondition(in_h, stop_h, strides[2]); - in_h += strides[2]) { + !strided_slice::LoopCondition(in_h, stop_h, params_copy.strides[1]); + in_h += params_copy.strides[1]) { for (int in_w = start_w; - !strided_slice::LoopCondition(in_w, stop_w, strides[1]); - in_w += strides[1]) { - for (int in_d = start_d; - !strided_slice::LoopCondition(in_d, stop_d, strides[0]); - in_d += strides[0]) { - *out_ptr++ = input_data[Offset(input_dims, in_d, in_w, in_h, in_b)]; + !strided_slice::LoopCondition(in_w, stop_w, params_copy.strides[2]); + in_w += params_copy.strides[2]) { + for (int in_d = start_d; !strided_slice::LoopCondition( + in_d, stop_d, params_copy.strides[3]); + in_d += params_copy.strides[3]) { + *out_ptr++ = input_data[Offset(input_shape, in_b, in_h, in_w,
in_d)]; } } } @@ -3220,31 +3309,39 @@ inline void StridedSlice(const T* input_data, const Dims<4>& input_dims, } template <typename T> -inline void Slice(const T* input_data, const Dims<4>& input_dims, - const std::vector<int>& begin, const std::vector<int>& size, - T* output_data, const Dims<4>& output_dims) { - // TODO(dkalenichenko): This op only supports 4D tensors. - TFLITE_DCHECK_EQ(begin.size(), 4); - TFLITE_DCHECK_EQ(size.size(), 4); - const int start_b = begin[3]; - const int stop_b = - size[3] == -1 ? input_dims.sizes[3] - start_b : start_b + size[3]; - const int start_h = begin[2]; - const int stop_h = - size[2] == -1 ? input_dims.sizes[2] - start_h : start_h + size[2]; - const int start_w = begin[1]; - const int stop_w = - size[1] == -1 ? input_dims.sizes[1] - start_w : start_w + size[1]; - const int start_d = begin[0]; - const int stop_d = - size[0] == -1 ? input_dims.sizes[0] - start_d : start_d + size[0]; +inline void Slice(const tflite::SliceParams& op_params, + const RuntimeShape& input_shape, const T* input_data, + const RuntimeShape& output_shape, T* output_data) { + const RuntimeShape ext_shape = RuntimeShape::ExtendedShape(4, input_shape); + // TODO(dkalenichenko): This op only supports 4D tensors or smaller. + TFLITE_DCHECK_LE(op_params.begin_count, 4); + TFLITE_DCHECK_LE(op_params.size_count, 4); + const int begin_count = op_params.begin_count; + const int size_count = op_params.size_count; + // We front-pad the begin and size vectors. + const int start_b = 4 - begin_count > 0 ? 0 : op_params.begin[0]; + const int stop_b = (4 - size_count > 0 || op_params.size[0] == -1) + ? ext_shape.Dims(0) - start_b + : start_b + op_params.size[0]; + const int start_h = begin_count < 3 ? 0 : op_params.begin[begin_count - 3]; + const int stop_h = (size_count < 3 || op_params.size[size_count - 3] == -1) + ? ext_shape.Dims(1) - start_h + : start_h + op_params.size[size_count - 3]; + const int start_w = begin_count < 2 ? 0 : op_params.begin[begin_count - 2]; + const int stop_w = (size_count < 2 || op_params.size[size_count - 2] == -1) + ? ext_shape.Dims(2) - start_w + : start_w + op_params.size[size_count - 2]; + const int start_d = begin_count < 1 ? 0 : op_params.begin[begin_count - 1]; + const int stop_d = (size_count < 1 || op_params.size[size_count - 1] == -1) + ? ext_shape.Dims(3) - start_d + : start_d + op_params.size[size_count - 1]; T* out_ptr = output_data; for (int in_b = start_b; in_b < stop_b; ++in_b) { for (int in_h = start_h; in_h < stop_h; ++in_h) { for (int in_w = start_w; in_w < stop_w; ++in_w) { for (int in_d = start_d; in_d < stop_d; ++in_d) { - *out_ptr++ = input_data[Offset(input_dims, in_d, in_w, in_h, in_b)]; + *out_ptr++ = input_data[Offset(ext_shape, in_b, in_h, in_w, in_d)]; } } } @@ -3259,63 +3356,170 @@ inline void Exp(const T* input_data, const size_t num_elements, } } +// A generic reduce method that can be used for reduce_sum, reduce_mean, etc. +// This method iterates through input data and reduce elements along the +// dimensions given in axis. +template <typename In, typename Out> +inline bool Reduce(const In* input_data, const int* input_dims, + const int* output_dims, const int input_num_dims, + const int output_num_dims, const int* axis, + const int num_axis, int* input_iter, + Out reducer(const Out current, const In in), + Out* output_data) { + // Reset input iterator. + for (int idx = 0; idx < input_num_dims; ++idx) { + input_iter[idx] = 0; + } + // Iterate through input_data. 
+ do { + size_t input_offset = + ReducedOutputOffset(input_num_dims, input_dims, input_iter, 0, nullptr); + size_t output_offset = ReducedOutputOffset(input_num_dims, input_dims, + input_iter, num_axis, axis); + output_data[output_offset] = + reducer(output_data[output_offset], input_data[input_offset]); + } while (NextIndex(input_num_dims, input_dims, input_iter)); + return true; +} + +inline bool ResolveAxis(const int num_dims, const int* axis, + const int64_t num_axis, int* out_axis, + int* out_num_axis) { + *out_num_axis = 0; // Just in case. + // Short-circuit axis resolution for scalars; the axis will go unused. + if (num_dims == 0) { + return true; + } + // o(n^2) is fine since out_num_axis should be really small, mostly <= 4 + for (int64_t idx = 0; idx < num_axis; ++idx) { + // Handle negative index. + int current = axis[idx] < 0 ? (axis[idx] + num_dims) : axis[idx]; + TFLITE_DCHECK(current >= 0 && current < num_dims); + bool is_dup = false; + for (int j = 0; j < *out_num_axis; ++j) { + if (out_axis[j] == current) { + is_dup = true; + break; + } + } + if (!is_dup) { + out_axis[*out_num_axis] = current; + *out_num_axis += 1; + } + } + return true; +} + +// This method expects that output_data has been initialized. +template <typename In, typename Out> +inline bool ReduceSumImpl(const In* input_data, const int* input_dims, + const int* output_dims, const int input_num_dims, + const int output_num_dims, const int* axis, + const int num_axis, int* input_iter, + Out* output_data) { + auto reducer = [](const Out current, const In in) -> Out { + const Out actual_in = static_cast<Out>(in); + return current + actual_in; + }; + return Reduce<In, Out>(input_data, input_dims, output_dims, input_num_dims, + output_num_dims, axis, num_axis, input_iter, reducer, + output_data); +} + +template <typename T> +inline bool InitTensorDataForReduce(const int* dims, const int num_dims, + const T init_value, T* data) { + size_t num_elements = 1; + for (int idx = 0; idx < num_dims; ++idx) { + size_t current = static_cast<size_t>(dims[idx]); + // Overflow prevention. + if (num_elements > std::numeric_limits<size_t>::max() / current) { + return false; + } + num_elements *= current; + } + for (size_t idx = 0; idx < num_elements; ++idx) { + data[idx] = init_value; + } + return true; +} + +// Computes the generic value (i.e., sum/max/min/prod) of elements across +// dimensions given in axis. It needs to pass in init_value and reducer. +template <typename T> +inline bool ReduceGeneric(const T* input_data, const int* input_dims, + const int input_num_dims, T* output_data, + const int* output_dims, const int output_num_dims, + const int* axis, const int64_t num_axis_dimensions, + bool keep_dims, int* temp_index, int* resolved_axis, + T init_value, + T reducer(const T current, const T in)) { + // Reset output data. + if (!InitTensorDataForReduce(output_dims, output_num_dims, init_value, + output_data)) { + return false; + } + + // Resolve axis. + int num_resolved_axis = 0; + if (!ResolveAxis(input_num_dims, axis, num_axis_dimensions, resolved_axis, + &num_resolved_axis)) { + return false; + } + + return Reduce<T, T>(input_data, input_dims, output_dims, input_num_dims, + output_num_dims, resolved_axis, num_resolved_axis, + temp_index, reducer, output_data); +} + +// Computes the mean of elements across dimensions given in axis. +// It does so in two stages, first calculates the sum of elements along the axis +// then divides it by the number of element in axis. 
template <typename T, typename U> inline bool Mean(const T* input_data, const int* input_dims, const int input_num_dims, T* output_data, const int* output_dims, const int output_num_dims, const int* axis, const int num_axis_dimensions, bool keep_dims, int* temp_index, int* resolved_axis, U* temp_sum) { - // resets output data. + // Reset output data. size_t num_outputs = 1; for (int idx = 0; idx < output_num_dims; ++idx) { - num_outputs *= static_cast<size_t>(output_dims[idx]); + size_t current = static_cast<size_t>(output_dims[idx]); + // Overflow prevention. + if (num_outputs > std::numeric_limits<size_t>::max() / current) { + return false; + } + num_outputs *= current; } for (size_t idx = 0; idx < num_outputs; ++idx) { output_data[idx] = T(); temp_sum[idx] = U(); } - // resets temp index. - for (int idx = 0; idx < input_num_dims; ++idx) { - temp_index[idx] = 0; - } - // resolves axis. + + // Resolve axis. int num_resolved_axis = 0; - for (int idx = 0; idx < num_axis_dimensions; ++idx) { - int current = axis[idx]; - TFLITE_DCHECK(current < input_num_dims && current + input_num_dims >= 0); - if (current < 0) { - current += input_num_dims; - } - bool is_dup = false; - for (int j = 0; j < num_resolved_axis; ++j) { - if (resolved_axis[j] == current) { - is_dup = true; - break; - } - } - if (!is_dup) { - resolved_axis[num_resolved_axis++] = current; - } + if (!ResolveAxis(input_num_dims, axis, num_axis_dimensions, resolved_axis, + &num_resolved_axis)) { + return false; } - // iterates through input_data. - for (bool has_next = true; has_next; - has_next = NextIndex(input_num_dims, input_dims, temp_index)) { - size_t input_offset = - ReducedOutputOffset(input_num_dims, input_dims, temp_index, 0, nullptr); - size_t output_offset = - ReducedOutputOffset(input_num_dims, input_dims, temp_index, - num_resolved_axis, resolved_axis); - temp_sum[output_offset] += static_cast<U>(input_data[input_offset]); - } - // takes average by num of elements added to get mean. - size_t num_elements_in_axis = 1; + + if (!ReduceSumImpl<T, U>(input_data, input_dims, output_dims, input_num_dims, + output_num_dims, resolved_axis, num_resolved_axis, + temp_index, temp_sum)) { + return false; + } + + // Calculate mean by dividing output_data by num of aggregated element. + U num_elements_in_axis = 1; for (int idx = 0; idx < num_resolved_axis; ++idx) { size_t current = static_cast<size_t>(input_dims[resolved_axis[idx]]); + // Overflow prevention. 
if (current > (std::numeric_limits<U>::max() / num_elements_in_axis)) { return false; } num_elements_in_axis *= current; } + if (num_elements_in_axis > 0) { for (size_t idx = 0; idx < num_outputs; ++idx) { output_data[idx] = @@ -3326,22 +3530,32 @@ inline bool Mean(const T* input_data, const int* input_dims, } template <typename T> -inline void Mean(const T* input_data, const Dims<4>& input_dims, - const std::vector<int>& reduction_indices, T* output_data, - const Dims<4>& output_dims) { - const int output_batch = ArraySize(output_dims, 3); - const int output_height = ArraySize(output_dims, 2); - const int output_width = ArraySize(output_dims, 1); - const int output_depth = ArraySize(output_dims, 0); - - const int input_height = ArraySize(input_dims, 2); - const int input_width = ArraySize(input_dims, 1); +inline void Mean(const tflite::MeanParams& op_params, + const RuntimeShape& unextended_input_shape, + const T* input_data, + const RuntimeShape& unextended_output_shape, T* output_data) { + gemmlowp::ScopedProfilingLabel label("Mean"); + + TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4); + TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4); + const RuntimeShape input_shape = + RuntimeShape::ExtendedShape(4, unextended_input_shape); + const RuntimeShape output_shape = + RuntimeShape::ExtendedShape(4, unextended_output_shape); + + const int output_batch = output_shape.Dims(0); + const int output_height = output_shape.Dims(1); + const int output_width = output_shape.Dims(2); + const int output_depth = output_shape.Dims(3); + + const int input_height = input_shape.Dims(1); + const int input_width = input_shape.Dims(2); // The current implementation only supports simultaneous reduction over // width and height. - TFLITE_DCHECK_EQ(reduction_indices.size(), 2); - TFLITE_DCHECK((reduction_indices[0] == 1 && reduction_indices[1] == 2) || - (reduction_indices[0] == 2 && reduction_indices[1] == 1)); + TFLITE_DCHECK_EQ(op_params.axis_count, 2); + TFLITE_DCHECK((op_params.axis[0] == 1 && op_params.axis[1] == 2) || + (op_params.axis[0] == 2 && op_params.axis[1] == 1)); TFLITE_DCHECK_EQ(output_height, 1); TFLITE_DCHECK_EQ(output_width, 1); @@ -3350,52 +3564,97 @@ inline void Mean(const T* input_data, const Dims<4>& input_dims, float value = 0; for (int in_h = 0; in_h < input_height; ++in_h) { for (int in_w = 0; in_w < input_width; ++in_w) { - value += input_data[Offset(input_dims, out_d, in_w, in_h, out_b)]; + value += input_data[Offset(input_shape, out_b, in_h, in_w, out_d)]; } } - output_data[Offset(output_dims, out_d, 0, 0, out_b)] = + output_data[Offset(output_shape, out_b, 0, 0, out_d)] = value / (input_width * input_height); } } } -template <typename T> -void Sub(const T* input1_data, const Dims<4>& input1_dims, const T* input2_data, - const Dims<4>& input2_dims, T* output_data, - const Dims<4>& output_dims) { - NdArrayDesc<4> desc1; - NdArrayDesc<4> desc2; - NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2); +// Computes the mean of elements across dimensions given in axis. +// It does so in two stages, first calculates the sum of elements along the axis +// then divides it by the number of element in axis for quantized values. 
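// Before the general implementation below, a sketch of the requantization
// arithmetic for a single output element, written out in plain float math.
// The helper name is illustrative; the real code additionally guards against
// overflow and supports the compute_sum variant.
inline unsigned char RequantizedMeanSketch(int sum_of_quantized_inputs,
                                           int num_elements,
                                           int input_zero_point,
                                           float input_scale,
                                           int output_zero_point,
                                           float output_scale) {
  // Real-valued mean of the inputs: input_scale * (q - input_zero_point),
  // averaged over num_elements values.
  const float real_mean =
      input_scale * (static_cast<float>(sum_of_quantized_inputs) /
                         static_cast<float>(num_elements) -
                     static_cast<float>(input_zero_point));
  // Requantize into the output's scale and zero point, then clamp to uint8.
  const float scaled = real_mean / output_scale;
  const int rounded =
      static_cast<int>(scaled >= 0.f ? scaled + 0.5f : scaled - 0.5f);
  int q = rounded + output_zero_point;
  if (q < 0) q = 0;
  if (q > 255) q = 255;
  return static_cast<unsigned char>(q);
}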
+template <typename T, typename U> +inline bool QuantizedMeanOrSum(const T* input_data, int32 input_zero_point, + float input_scale, const int* input_dims, + const int input_num_dims, T* output_data, + int32 output_zero_point, float output_scale, + const int* output_dims, + const int output_num_dims, const int* axis, + const int num_axis_dimensions, bool keep_dims, + int* temp_index, int* resolved_axis, U* temp_sum, + bool compute_sum) { + // Reset output data. + size_t num_outputs = 1; + for (int idx = 0; idx < output_num_dims; ++idx) { + size_t current = static_cast<size_t>(output_dims[idx]); + // Overflow prevention. + if (num_outputs > std::numeric_limits<size_t>::max() / current) { + return false; + } + num_outputs *= current; + } + for (size_t idx = 0; idx < num_outputs; ++idx) { + output_data[idx] = T(); + temp_sum[idx] = U(); + } - // In Tensorflow, the dimensions are canonically named (batch_number, row, - // col, channel), with extents (batches, height, width, depth), with the - // trailing dimension changing most rapidly (channels has the smallest stride, - // typically 1 element). - // - // In generated C code, we store arrays with the dimensions reversed. The - // first dimension has smallest stride. - // - // We name our variables by their Tensorflow convention, but generate C code - // nesting loops such that the innermost loop has the smallest stride for the - // best cache behavior. - for (int b = 0; b < ArraySize(output_dims, 3); ++b) { - for (int y = 0; y < ArraySize(output_dims, 2); ++y) { - for (int x = 0; x < ArraySize(output_dims, 1); ++x) { - for (int c = 0; c < ArraySize(output_dims, 0); ++c) { - output_data[Offset(output_dims, c, x, y, b)] = - input1_data[SubscriptToIndex(desc1, c, x, y, b)] - - input2_data[SubscriptToIndex(desc2, c, x, y, b)]; - } + // Resolve axis. + int num_resolved_axis = 0; + if (!ResolveAxis(input_num_dims, axis, num_axis_dimensions, resolved_axis, + &num_resolved_axis)) { + return false; + } + + if (!ReduceSumImpl<T, U>(input_data, input_dims, output_dims, input_num_dims, + output_num_dims, resolved_axis, num_resolved_axis, + temp_index, temp_sum)) { + return false; + } + + // Calculate mean by dividing output_data by num of aggregated element. + U num_elements_in_axis = 1; + for (int idx = 0; idx < num_resolved_axis; ++idx) { + size_t current = static_cast<size_t>(input_dims[resolved_axis[idx]]); + // Overflow prevention. + if (current > (std::numeric_limits<U>::max() / num_elements_in_axis)) { + return false; + } + num_elements_in_axis *= current; + } + + if (num_elements_in_axis > 0) { + const float scale = input_scale / output_scale; + if (compute_sum) { + // TODO(b/116341117): Eliminate float and do this completely in 8bit. + const float bias = -input_zero_point * scale * num_elements_in_axis + 0.5; + for (size_t idx = 0; idx < num_outputs; ++idx) { + const U value = static_cast<U>(round(temp_sum[idx] * scale + bias)) + + output_zero_point; + output_data[idx] = static_cast<T>(value); + } + } else { + const float bias = -input_zero_point * scale + 0.5; + for (size_t idx = 0; idx < num_outputs; ++idx) { + float float_mean = static_cast<float>(temp_sum[idx]) / + static_cast<float>(num_elements_in_axis); + + // Convert to float value. 
+ output_data[idx] = static_cast<T>(round(float_mean * scale + bias)) + + output_zero_point; } } } + return true; } template <typename T> -void TensorFlowMinimum(const T* input1_data, const Dims<4>& input1_dims, - const T* input2_data, T* output_data, - const Dims<4>& output_dims) { - const int flat_size = MatchingFlatSize(output_dims, input1_dims); +void Minimum(const RuntimeShape& input1_shape, const T* input1_data, + const T* input2_data, const RuntimeShape& output_shape, + T* output_data) { + const int flat_size = MatchingFlatSize(input1_shape, output_shape); auto min_value = input2_data[0]; for (int i = 0; i < flat_size; i++) { @@ -3403,11 +3662,21 @@ void TensorFlowMinimum(const T* input1_data, const Dims<4>& input1_dims, } } +// Convenience version that allows, for example, generated-code calls to be +// the same as other binary ops. +template <typename T> +inline void Minimum(const RuntimeShape& input1_shape, const T* input1_data, + const RuntimeShape&, const T* input2_data, + const RuntimeShape& output_shape, T* output_data) { + // Drop shape of second input: not needed. + Minimum(input1_shape, input1_data, input2_data, output_shape, output_data); +} + template <typename T> -void TensorFlowMaximum(const T* input1_data, const Dims<4>& input1_dims, - const T* input2_data, T* output_data, - const Dims<4>& output_dims) { - const int flat_size = MatchingFlatSize(output_dims, input1_dims); +void Maximum(const RuntimeShape& input1_shape, const T* input1_data, + const T* input2_data, const RuntimeShape& output_shape, + T* output_data) { + const int flat_size = MatchingFlatSize(input1_shape, output_shape); auto max_value = input2_data[0]; for (int i = 0; i < flat_size; i++) { @@ -3415,22 +3684,41 @@ void TensorFlowMaximum(const T* input1_data, const Dims<4>& input1_dims, } } +// Convenience version that allows, for example, generated-code calls to be +// the same as other binary ops. +template <typename T> +inline void Maximum(const RuntimeShape& input1_shape, const T* input1_data, + const RuntimeShape&, const T* input2_data, + const RuntimeShape& output_shape, T* output_data) { + // Drop shape of second input: not needed. 
+ Maximum(input1_shape, input1_data, input2_data, output_shape, output_data); +} + template <typename T, typename Op> -void TensorFlowMaximumMinimum(const T* input1_data, const Dims<4>& input1_dims, - const T* input2_data, const Dims<4>& input2_dims, - T* output_data, const Dims<4>& output_dims, - Op op) { +void MaximumMinimumBroadcast4DSlow(const RuntimeShape& unextended_input1_shape, + const T* input1_data, + const RuntimeShape& unextended_input2_shape, + const T* input2_data, + const RuntimeShape& unextended_output_shape, + T* output_data, Op op) { + TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 4); + TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), 4); + TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4); + const RuntimeShape output_shape = + RuntimeShape::ExtendedShape(4, unextended_output_shape); + NdArrayDesc<4> desc1; NdArrayDesc<4> desc2; - NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2); - - for (int b = 0; b < ArraySize(output_dims, 3); ++b) { - for (int y = 0; y < ArraySize(output_dims, 2); ++y) { - for (int x = 0; x < ArraySize(output_dims, 1); ++x) { - for (int c = 0; c < ArraySize(output_dims, 0); ++c) { - auto out_idx = Offset(output_dims, c, x, y, b); - auto in1_idx = SubscriptToIndex(desc1, c, x, y, b); - auto in2_idx = SubscriptToIndex(desc2, c, x, y, b); + NdArrayDescsForElementwiseBroadcast(unextended_input1_shape, + unextended_input2_shape, &desc1, &desc2); + + for (int b = 0; b < output_shape.Dims(0); ++b) { + for (int y = 0; y < output_shape.Dims(1); ++y) { + for (int x = 0; x < output_shape.Dims(2); ++x) { + for (int c = 0; c < output_shape.Dims(3); ++c) { + auto out_idx = Offset(output_shape, b, y, x, c); + auto in1_idx = SubscriptToIndex(desc1, b, y, x, c); + auto in2_idx = SubscriptToIndex(desc2, b, y, x, c); auto in1_val = input1_data[in1_idx]; auto in2_val = input2_data[in2_idx]; output_data[out_idx] = op(in1_val, in2_val); @@ -3440,9 +3728,10 @@ void TensorFlowMaximumMinimum(const T* input1_data, const Dims<4>& input1_dims, } } -template <typename T1, typename T2, typename T3> -void ArgMax(const T3* axis, const T1* input_data, const Dims<4>& input_dims, - T2* output_data, const Dims<4>& output_dims) { +template <typename T1, typename T2, typename T3, typename Cmp> +void ArgMinMax(const RuntimeShape& input1_shape, const T1* input1_data, + const T3* input2_data, const RuntimeShape& output_shape, + T2* output_data, const Cmp& cmp) { // The current ArgMax implemention can only determine the index of the maximum // value in the last dimension. So the axis argument is ignored. @@ -3450,67 +3739,121 @@ void ArgMax(const T3* axis, const T1* input_data, const Dims<4>& input_dims, // 1). For the sake of simplicity, the output dimensions are equal to the // input dimensions here. We enforce the constraint that the last dimension // must always be 1. 
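  // For example, a [2, 3] float input {{1, 9, 3}, {7, 2, 5}} is treated as
  // outer_size == 2 rows of depth == 3; the output has shape [2, 1] and holds
  // {1, 0}, the position of the winning element in each row.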
- TFLITE_DCHECK_EQ(ArraySize(output_dims, 0), 1); - const int outer_size = MatchingFlatSizeSkipDim(input_dims, 0, output_dims); - const int depth = ArraySize(input_dims, 0); + const int trailing_dim = output_shape.DimensionsCount() - 1; + TFLITE_DCHECK_EQ(input1_shape.DimensionsCount(), + output_shape.DimensionsCount()); + TFLITE_DCHECK_EQ(output_shape.Dims(trailing_dim), 1); + const int outer_size = + MatchingFlatSizeSkipDim(input1_shape, trailing_dim, output_shape); + const int depth = input1_shape.Dims(trailing_dim); for (int i = 0; i < outer_size; ++i) { - auto max_value = input_data[i * depth]; - int max_index = 0; + auto min_max_value = input1_data[i * depth]; + int min_max_index = 0; for (int d = 1; d < depth; ++d) { - const auto& curr_value = input_data[i * depth + d]; - if (curr_value > max_value) { - max_value = curr_value; - max_index = d; + const auto& curr_value = input1_data[i * depth + d]; + if (cmp(curr_value, min_max_value)) { + min_max_value = curr_value; + min_max_index = d; } } - output_data[i] = max_index; + output_data[i] = min_max_index; } } +template <typename T1, typename T2, typename T3> +void ArgMax(const RuntimeShape& input1_shape, const T1* input1_data, + const T3* input2_data, const RuntimeShape& output_shape, + T2* output_data) { + ArgMinMax(input1_shape, input1_data, input2_data, output_shape, output_data, + std::greater<T1>()); +} + +// Convenience version that allows, for example, generated-code calls to be +// the same as other binary ops. +template <typename T1, typename T2, typename T3> +inline void ArgMax(const RuntimeShape& input1_shape, const T1* input1_data, + const RuntimeShape& input2_shape, const T3* input2_data, + const RuntimeShape& output_shape, T2* output_data) { + // Drop shape of second input: not needed. + ArgMax(input1_shape, input1_data, input2_data, output_shape, output_data); +} + template <typename T> -void Transpose(const T* input, const Dims<4>& input_dims, T* output, - const Dims<4>& output_dims, int* permuted_axes) { +void Transpose(const TransposeParams& params, + const RuntimeShape& unextended_input_shape, const T* input_data, + const RuntimeShape& unextended_output_shape, T* output_data) { + const int unextended_output_size = unextended_output_shape.DimensionsCount(); + TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4); + TFLITE_DCHECK_LE(unextended_output_size, 4); + TFLITE_DCHECK_EQ(unextended_output_size, params.perm_count); + const RuntimeShape input_shape = + RuntimeShape::ExtendedShape(4, unextended_input_shape); + const RuntimeShape output_shape = + RuntimeShape::ExtendedShape(4, unextended_output_shape); + const int input_ext_size = 4 - unextended_input_shape.DimensionsCount(); + const int output_ext_size = 4 - unextended_output_size; + + // The perm data is extended to match the output, each index incremented by + // the amount of front padding of the input shape. + int extended_perm[4]; + for (int i = 0; i < output_ext_size; ++i) { + extended_perm[i] = i; + } + for (int i = 0; i < unextended_output_size; ++i) { + extended_perm[i + output_ext_size] = params.perm[i] + input_ext_size; + } + int out_sizes[4]; // Compute the inverse permutation array so we can do an output centered // transpose. Also, check to make sure output_dims is matching input_dims. 
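  // For example, extended_perm == {0, 3, 1, 2} turns an NHWC input into an
  // NCHW output: for every output index (n, c, h, w) the naive transpose loop
  // below reads the value from input index (n, h, w, c), since it sets
  // i[extended_perm[k]] = o[k].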
for (int k = 0; k < 4; k++) { - out_sizes[k] = - MatchingArraySize(input_dims, permuted_axes[k], output_dims, k); + out_sizes[k] = MatchingDim(input_shape, extended_perm[k], output_shape, k); } // Naive transpose loop (iterate on output index and compute input index). int o[4]; // loop index (on output). int i[4]; for (o[3] = 0; o[3] < out_sizes[3]; o[3]++) { - i[permuted_axes[3]] = o[3]; + i[extended_perm[3]] = o[3]; for (o[2] = 0; o[2] < out_sizes[2]; o[2]++) { - i[permuted_axes[2]] = o[2]; + i[extended_perm[2]] = o[2]; for (o[1] = 0; o[1] < out_sizes[1]; o[1]++) { - i[permuted_axes[1]] = o[1]; + i[extended_perm[1]] = o[1]; for (o[0] = 0; o[0] < out_sizes[0]; o[0]++) { - i[permuted_axes[0]] = o[0]; - output[Offset(output_dims, o)] = input[Offset(input_dims, i)]; + i[extended_perm[0]] = o[0]; + output_data[Offset(output_shape, o)] = + input_data[Offset(input_shape, i)]; } } } } } -inline void TransposeConv(const float* input_data, const Dims<4>& input_dims, - const float* filter_data, const Dims<4>& filter_dims, - int stride_width, int stride_height, int pad_width, - int pad_height, float* output_data, - const Dims<4>& output_dims) { - const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); - const int input_depth = MatchingArraySize(input_dims, 0, filter_dims, 3); - const int output_depth = MatchingArraySize(filter_dims, 0, output_dims, 0); - const int input_height = ArraySize(input_dims, 2); - const int input_width = ArraySize(input_dims, 1); - const int filter_height = ArraySize(filter_dims, 2); - const int filter_width = ArraySize(filter_dims, 1); - const int output_height = ArraySize(output_dims, 2); - const int output_width = ArraySize(output_dims, 1); +inline void TransposeConv( + const ConvParams& params, const RuntimeShape& input_shape, + const float* input_data, const RuntimeShape& filter_shape, + const float* filter_data, const RuntimeShape& output_shape, + float* output_data, const RuntimeShape& im2col_shape, float* im2col_data) { + const int stride_width = params.stride_width; + const int stride_height = params.stride_height; + const int pad_width = params.padding_values.width; + const int pad_height = params.padding_values.height; + TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4); + TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4); + TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4); + (void)im2col_data; // only used in optimized code. + (void)im2col_shape; // only used in optimized code. + + const int batches = MatchingDim(input_shape, 0, output_shape, 0); + const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3); + const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3); + const int input_height = input_shape.Dims(1); + const int input_width = input_shape.Dims(2); + const int filter_height = filter_shape.Dims(1); + const int filter_width = filter_shape.Dims(2); + const int output_height = output_shape.Dims(1); + const int output_width = output_shape.Dims(2); // Although transpose convolution simplifies to convolution with transposed // weights for strides of 1, non-unitary striding complicates matters. To @@ -3519,7 +3862,8 @@ inline void TransposeConv(const float* input_data, const Dims<4>& input_dims, // computing their influence on the output, rather than looping through the // output elements in the typical "gather" access pattern of a conv. We // therefore must initialize the output array to zero. 
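  // Roughly, in 1-D with stride 2 and a 3-tap filter, input element x[i]
  // scatters x[i] * w[k] into output position i * 2 - pad + k for k = 0..2;
  // because several (i, k) pairs can land on the same output cell, every cell
  // must start from zero and accumulate.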
- for (int i = 0; i < FlatSize(output_dims); i++) { + const int num_elements = output_shape.FlatSize(); + for (int i = 0; i < num_elements; i++) { output_data[i] = 0.0f; } @@ -3541,13 +3885,14 @@ inline void TransposeConv(const float* input_data, const Dims<4>& input_dims, // We cannot accumulate out of bounds if ((out_x >= 0) && (out_x < output_width) && (out_y >= 0) && (out_y < output_height)) { - float input_value = input_data[Offset(input_dims, in_channel, - in_x, in_y, batch)]; + float input_value = input_data[Offset( + input_shape, batch, in_y, in_x, in_channel)]; float filter_value = - filter_data[Offset(filter_dims, out_channel, filter_x, - filter_y, in_channel)]; - output_data[Offset(output_dims, out_channel, out_x, out_y, - batch)] += input_value * filter_value; + filter_data[Offset(filter_shape, out_channel, filter_y, + filter_x, in_channel)]; + output_data[Offset(output_shape, batch, out_y, out_x, + out_channel)] += + input_value * filter_value; } } } @@ -3559,6 +3904,16 @@ inline void TransposeConv(const float* input_data, const Dims<4>& input_dims, } template <typename T> +inline bool EqualFn(T lhs, T rhs) { + return lhs == rhs; +} + +template <typename T> +inline bool NotEqualFn(T lhs, T rhs) { + return lhs != rhs; +} + +template <typename T> inline bool GreaterFn(T lhs, T rhs) { return lhs > rhs; } @@ -3579,89 +3934,144 @@ template <typename T> using ComparisonFn = bool (*)(T, T); template <typename T, ComparisonFn<T> F> -inline void Comparison(const T* input1_data, const Dims<4>& input1_dims, - const T* input2_data, const Dims<4>& input2_dims, - bool* output_data, const Dims<4>& output_dims) { +inline void ComparisonImpl( + const ComparisonParams& op_params, const RuntimeShape& input1_shape, + const T* input1_data, const RuntimeShape& input2_shape, + const T* input2_data, const RuntimeShape& output_shape, bool* output_data) { const int64_t flatsize = - MatchingFlatSize(input1_dims, input2_dims, output_dims); + MatchingFlatSize(input1_shape, input2_shape, output_shape); for (int64_t i = 0; i < flatsize; ++i) { output_data[i] = F(input1_data[i], input2_data[i]); } } +template <ComparisonFn<float> F> +inline void Comparison(const ComparisonParams& op_params, + const RuntimeShape& input1_shape, + const float* input1_data, + const RuntimeShape& input2_shape, + const float* input2_data, + const RuntimeShape& output_shape, bool* output_data) { + ComparisonImpl<float, F>(op_params, input1_shape, input1_data, input2_shape, + input2_data, output_shape, output_data); +} + template <typename T, ComparisonFn<int32> F> -inline void Comparison(int left_shift, const T* input1_data, - const Dims<4>& input1_dims, int32 input1_offset, - int32 input1_multiplier, int input1_shift, - const T* input2_data, const Dims<4>& input2_dims, - int32 input2_offset, int32 input2_multiplier, - int input2_shift, bool* output_data, - const Dims<4>& output_dims) { +inline void ComparisonWithScaling( + const ComparisonParams& op_params, const RuntimeShape& input1_shape, + const T* input1_data, const RuntimeShape& input2_shape, + const T* input2_data, const RuntimeShape& output_shape, bool* output_data) { + int left_shift = op_params.left_shift; + int32 input1_offset = op_params.input1_offset; + int32 input1_multiplier = op_params.input1_multiplier; + int input1_shift = op_params.input1_shift; + int32 input2_offset = op_params.input2_offset; + int32 input2_multiplier = op_params.input2_multiplier; + int input2_shift = op_params.input2_shift; + const int64_t flatsize = - MatchingFlatSize(input1_dims, 
input2_dims, output_dims); + MatchingFlatSize(input1_shape, input2_shape, output_shape); for (int64_t i = 0; i < flatsize; ++i) { const int32 input1_val = input1_offset + input1_data[i]; const int32 input2_val = input2_offset + input2_data[i]; const int32 shifted_input1_val = input1_val * (1 << left_shift); const int32 shifted_input2_val = input2_val * (1 << left_shift); - const int32 scaled_input1_val = MultiplyByQuantizedMultiplierSmallerThanOne( - shifted_input1_val, input1_multiplier, input1_shift); - const int32 scaled_input2_val = MultiplyByQuantizedMultiplierSmallerThanOne( - shifted_input2_val, input2_multiplier, input2_shift); + const int32 scaled_input1_val = + MultiplyByQuantizedMultiplierSmallerThanOneExp( + shifted_input1_val, input1_multiplier, input1_shift); + const int32 scaled_input2_val = + MultiplyByQuantizedMultiplierSmallerThanOneExp( + shifted_input2_val, input2_multiplier, input2_shift); output_data[i] = F(scaled_input1_val, scaled_input2_val); } } template <typename T, ComparisonFn<T> F> -inline void BroadcastComparison(const T* input1_data, - const Dims<4>& input1_dims, - const T* input2_data, - const Dims<4>& input2_dims, bool* output_data, - const Dims<4>& output_dims) { +inline void BroadcastComparison4DSlowImpl( + const ComparisonParams& op_params, + const RuntimeShape& unextended_input1_shape, const T* input1_data, + const RuntimeShape& unextended_input2_shape, const T* input2_data, + const RuntimeShape& unextended_output_shape, bool* output_data) { + gemmlowp::ScopedProfilingLabel label("BroadcastComparison4DSlow"); + TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 4); + TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), 4); + TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4); + const RuntimeShape output_shape = + RuntimeShape::ExtendedShape(4, unextended_output_shape); + NdArrayDesc<4> desc1; NdArrayDesc<4> desc2; - NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2); - for (int b = 0; b < ArraySize(output_dims, 3); ++b) { - for (int y = 0; y < ArraySize(output_dims, 2); ++y) { - for (int x = 0; x < ArraySize(output_dims, 1); ++x) { - for (int c = 0; c < ArraySize(output_dims, 0); ++c) { - output_data[Offset(output_dims, c, x, y, b)] = - F(input1_data[SubscriptToIndex(desc1, c, x, y, b)], - input2_data[SubscriptToIndex(desc2, c, x, y, b)]); + NdArrayDescsForElementwiseBroadcast(unextended_input1_shape, + unextended_input2_shape, &desc1, &desc2); + + for (int b = 0; b < output_shape.Dims(0); ++b) { + for (int y = 0; y < output_shape.Dims(1); ++y) { + for (int x = 0; x < output_shape.Dims(2); ++x) { + for (int c = 0; c < output_shape.Dims(3); ++c) { + output_data[Offset(output_shape, b, y, x, c)] = + F(input1_data[SubscriptToIndex(desc1, b, y, x, c)], + input2_data[SubscriptToIndex(desc2, b, y, x, c)]); } } } } } +template <ComparisonFn<float> F> +inline void BroadcastComparison4DSlow(const ComparisonParams& op_params, + const RuntimeShape& input1_shape, + const float* input1_data, + const RuntimeShape& input2_shape, + const float* input2_data, + const RuntimeShape& output_shape, + bool* output_data) { + BroadcastComparison4DSlowImpl<float, F>(op_params, input1_shape, input1_data, + input2_shape, input2_data, + output_shape, output_data); +} template <typename T, ComparisonFn<int32> F> -inline void BroadcastComparison(int left_shift, const T* input1_data, - const Dims<4>& input1_dims, int32 input1_offset, - int32 input1_multiplier, int input1_shift, - const T* input2_data, - const Dims<4>& 
input2_dims, int32 input2_offset, - int32 input2_multiplier, int input2_shift, - bool* output_data, const Dims<4>& output_dims) { +inline void BroadcastComparison4DSlowWithScaling( + const ComparisonParams& op_params, + const RuntimeShape& unextended_input1_shape, const T* input1_data, + const RuntimeShape& unextended_input2_shape, const T* input2_data, + const RuntimeShape& unextended_output_shape, bool* output_data) { + gemmlowp::ScopedProfilingLabel label("BroadcastComparison4DSlowWithScaling"); + TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 4); + TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), 4); + TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4); + const RuntimeShape output_shape = + RuntimeShape::ExtendedShape(4, unextended_output_shape); + NdArrayDesc<4> desc1; NdArrayDesc<4> desc2; - NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2); - for (int b = 0; b < ArraySize(output_dims, 3); ++b) { - for (int y = 0; y < ArraySize(output_dims, 2); ++y) { - for (int x = 0; x < ArraySize(output_dims, 1); ++x) { - for (int c = 0; c < ArraySize(output_dims, 0); ++c) { + NdArrayDescsForElementwiseBroadcast(unextended_input1_shape, + unextended_input2_shape, &desc1, &desc2); + + int left_shift = op_params.left_shift; + int32 input1_offset = op_params.input1_offset; + int32 input1_multiplier = op_params.input1_multiplier; + int input1_shift = op_params.input1_shift; + int32 input2_offset = op_params.input2_offset; + int32 input2_multiplier = op_params.input2_multiplier; + int input2_shift = op_params.input2_shift; + + for (int b = 0; b < output_shape.Dims(0); ++b) { + for (int y = 0; y < output_shape.Dims(1); ++y) { + for (int x = 0; x < output_shape.Dims(2); ++x) { + for (int c = 0; c < output_shape.Dims(3); ++c) { const int32 input1_val = - input1_offset + input1_data[SubscriptToIndex(desc1, c, x, y, b)]; + input1_offset + input1_data[SubscriptToIndex(desc1, b, y, x, c)]; const int32 input2_val = - input2_offset + input2_data[SubscriptToIndex(desc2, c, x, y, b)]; + input2_offset + input2_data[SubscriptToIndex(desc2, b, y, x, c)]; const int32 shifted_input1_val = input1_val * (1 << left_shift); const int32 shifted_input2_val = input2_val * (1 << left_shift); const int32 scaled_input1_val = - MultiplyByQuantizedMultiplierSmallerThanOne( + MultiplyByQuantizedMultiplierSmallerThanOneExp( shifted_input1_val, input1_multiplier, input1_shift); const int32 scaled_input2_val = - MultiplyByQuantizedMultiplierSmallerThanOne( + MultiplyByQuantizedMultiplierSmallerThanOneExp( shifted_input2_val, input2_multiplier, input2_shift); - output_data[Offset(output_dims, c, x, y, b)] = + output_data[Offset(output_shape, b, y, x, c)] = F(scaled_input1_val, scaled_input2_val); } } @@ -3669,52 +4079,71 @@ inline void BroadcastComparison(int left_shift, const T* input1_data, } } -#define TFLITE_COMPARISON_OP(name) \ - template <typename T> \ - inline void name(const T* input1_data, const Dims<4>& input1_dims, \ - const T* input2_data, const Dims<4>& input2_dims, \ - bool* output_data, const Dims<4>& output_dims) { \ - gemmlowp::ScopedProfilingLabel label(#name); \ - Comparison<T, name##Fn>(input1_data, input1_dims, input2_data, \ - input2_dims, output_data, output_dims); \ - } \ - template <typename T> \ - inline void name( \ - int left_shift, const T* input1_data, const Dims<4>& input1_dims, \ - int32 input1_offset, int32 input1_multiplier, int input1_shift, \ - const T* input2_data, const Dims<4>& input2_dims, int32 input2_offset, \ - int32 
input2_multiplier, int input2_shift, bool* output_data, \ - const Dims<4>& output_dims) { \ - gemmlowp::ScopedProfilingLabel label(#name "/8bit"); \ - Comparison<T, name##Fn>(left_shift, input1_data, input1_dims, \ - input1_offset, input1_multiplier, input1_shift, \ - input2_data, input2_dims, input2_offset, \ - input2_multiplier, input2_shift, output_data, \ - output_dims); \ - } \ - template <typename T> \ - inline void Broadcast##name( \ - const T* input1_data, const Dims<4>& input1_dims, const T* input2_data, \ - const Dims<4>& input2_dims, bool* output_data, \ - const Dims<4>& output_dims) { \ - gemmlowp::ScopedProfilingLabel label("Broadcast" #name); \ - BroadcastComparison<T, name##Fn>(input1_data, input1_dims, input2_data, \ - input2_dims, output_data, output_dims); \ - } \ - template <typename T> \ - inline void Broadcast##name( \ - int left_shift, const T* input1_data, const Dims<4>& input1_dims, \ - int32 input1_offset, int32 input1_multiplier, int input1_shift, \ - const T* input2_data, const Dims<4>& input2_dims, int32 input2_offset, \ - int32 input2_multiplier, int input2_shift, bool* output_data, \ - const Dims<4>& output_dims) { \ - gemmlowp::ScopedProfilingLabel label("Broadcast" #name "/8bit"); \ - BroadcastComparison<T, name##Fn>(left_shift, input1_data, input1_dims, \ - input1_offset, input1_multiplier, \ - input1_shift, input2_data, input2_dims, \ - input2_offset, input2_multiplier, \ - input2_shift, output_data, output_dims); \ +#define TFLITE_COMPARISON_OP(name) \ + inline void name(const ComparisonParams& op_params, \ + const RuntimeShape& input1_shape, const float* input1_data, \ + const RuntimeShape& input2_shape, const float* input2_data, \ + const RuntimeShape& output_shape, bool* output_data) { \ + gemmlowp::ScopedProfilingLabel label(#name); \ + Comparison<name##Fn>(op_params, input1_shape, input1_data, input2_shape, \ + input2_data, output_shape, output_data); \ + } \ + template <typename T> \ + inline void name##NoScaling( \ + const ComparisonParams& op_params, const RuntimeShape& input1_shape, \ + const T* input1_data, const RuntimeShape& input2_shape, \ + const T* input2_data, const RuntimeShape& output_shape, \ + bool* output_data) { \ + gemmlowp::ScopedProfilingLabel label(#name "NoScaling"); \ + ComparisonImpl<T, name##Fn>(op_params, input1_shape, input1_data, \ + input2_shape, input2_data, output_shape, \ + output_data); \ + } \ + template <typename T> \ + inline void name##WithScaling( \ + const ComparisonParams& op_params, const RuntimeShape& input1_shape, \ + const T* input1_data, const RuntimeShape& input2_shape, \ + const T* input2_data, const RuntimeShape& output_shape, \ + bool* output_data) { \ + gemmlowp::ScopedProfilingLabel label(#name "WithScaling/8bit"); \ + ComparisonWithScaling<T, name##Fn>(op_params, input1_shape, input1_data, \ + input2_shape, input2_data, \ + output_shape, output_data); \ + } \ + template <typename T> \ + inline void Broadcast4DSlow##name##NoScaling( \ + const ComparisonParams& op_params, const RuntimeShape& input1_shape, \ + const T* input1_data, const RuntimeShape& input2_shape, \ + const T* input2_data, const RuntimeShape& output_shape, \ + bool* output_data) { \ + gemmlowp::ScopedProfilingLabel label("Broadcast4DSlow" #name "NoScaling"); \ + BroadcastComparison4DSlowImpl<T, name##Fn>( \ + op_params, input1_shape, input1_data, input2_shape, input2_data, \ + output_shape, output_data); \ + } \ + inline void Broadcast4DSlow##name( \ + const ComparisonParams& op_params, const RuntimeShape& input1_shape, \ + const 
float* input1_data, const RuntimeShape& input2_shape, \ + const float* input2_data, const RuntimeShape& output_shape, \ + bool* output_data) { \ + gemmlowp::ScopedProfilingLabel label("Broadcast4DSlow" #name); \ + BroadcastComparison4DSlow<name##Fn>(op_params, input1_shape, input1_data, \ + input2_shape, input2_data, \ + output_shape, output_data); \ + } \ + template <typename T> \ + inline void Broadcast4DSlow##name##WithScaling( \ + const ComparisonParams& op_params, const RuntimeShape& input1_shape, \ + const T* input1_data, const RuntimeShape& input2_shape, \ + const T* input2_data, const RuntimeShape& output_shape, \ + bool* output_data) { \ + gemmlowp::ScopedProfilingLabel label("Broadcast4DSlow" #name "/8bit"); \ + BroadcastComparison4DSlowWithScaling<T, name##Fn>( \ + op_params, input1_shape, input1_data, input2_shape, input2_data, \ + output_shape, output_data); \ } +TFLITE_COMPARISON_OP(Equal); +TFLITE_COMPARISON_OP(NotEqual); TFLITE_COMPARISON_OP(Greater); TFLITE_COMPARISON_OP(GreaterEqual); TFLITE_COMPARISON_OP(Less); @@ -3722,13 +4151,13 @@ TFLITE_COMPARISON_OP(LessEqual); #undef TFLITE_COMPARISON_OP template <typename D, typename T> -inline void Select(const D* input_condition_data, - const Dims<4>& input_condition_dims, const T* input_x_data, - const Dims<4>& input_x_dims, const T* input_y_data, - const Dims<4>& input_y_dims, T* output_data, - const Dims<4>& output_dims) { - const int64_t flatsize = - MatchingFlatSize(input_x_dims, input_y_dims, output_dims); +void Select(const RuntimeShape& input_condition_shape, + const D* input_condition_data, const RuntimeShape& input_x_shape, + const T* input_x_data, const RuntimeShape& input_y_shape, + const T* input_y_data, const RuntimeShape& output_shape, + T* output_data) { + const int64_t flatsize = MatchingFlatSize( + input_condition_shape, input_x_shape, input_y_shape, output_shape); for (int64_t i = 0; i < flatsize; ++i) { output_data[i] = input_condition_data[i] ? input_x_data[i] : input_y_data[i]; @@ -3736,24 +4165,210 @@ inline void Select(const D* input_condition_data, } template <typename D, typename T> -inline void RankOneSelect(const D* input_condition_data, - const Dims<4>& input_condition_dims, - const T* input_x_data, const Dims<4>& input_x_dims, - const T* input_y_data, const Dims<4>& input_y_dims, - T* output_data, const Dims<4>& output_dims) { - const int64_t rank = MatchingArraySize(input_condition_dims, 0, input_x_dims, - 3, input_y_dims, 3, output_dims, 3); +void RankOneSelect(const RuntimeShape& input_condition_shape, + const D* input_condition_data, + const RuntimeShape& input_x_shape, const T* input_x_data, + const RuntimeShape& input_y_shape, const T* input_y_data, + const RuntimeShape& output_shape, T* output_data) { + const int64_t outer_size = input_condition_shape.FlatSize(); + TFLITE_DCHECK_EQ( + MatchingDim(input_x_shape, 0, input_y_shape, 0, output_shape, 0), + outer_size); const int64_t inner_size = - MatchingFlatSizeSkipDim(input_x_dims, 3, input_y_dims, output_dims); + MatchingFlatSizeSkipDim(input_x_shape, 0, input_y_shape, output_shape); int64_t offset = 0; - for (int64_t i = 0; i < rank; i++) { + for (int64_t i = 0; i < outer_size; i++) { const T* input_data = input_condition_data[i] ? input_x_data : input_y_data; memcpy(output_data + offset, input_data + offset, inner_size * sizeof(T)); offset += inner_size; } } +// For easy implementation, the indices is always a vector of size-4 vectors. 
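// For example, indices == {{0, 0, 0, 1}, {0, 0, 2, 0}} with values == {5, 7},
// default_value == 0 and an output shape of [1, 1, 3, 2] produce the dense
// tensor {{0, 5}, {0, 0}, {7, 0}}.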
+template <typename T, typename TI> +inline void SparseToDense(const std::vector<std::vector<TI>>& indices, + const T* values, T default_value, + bool value_is_scalar, + const RuntimeShape& unextended_output_shape, + T* output_data) { + TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4); + const RuntimeShape output_shape = + RuntimeShape::ExtendedShape(4, unextended_output_shape); + const int value_count = indices.size(); + + // First fill the output_data with default value. + const int num_elements = output_shape.FlatSize(); + for (int i = 0; i < num_elements; ++i) { + output_data[i] = default_value; + } + + // Special handle for value is scalar case to avoid checking the boolean + // condition within the loop every time. + if (value_is_scalar) { + for (int i = 0; i < value_count; ++i) { + const std::vector<TI>& index = indices[i]; + TFLITE_DCHECK_EQ(index.size(), 4); + const T value = *values; // just use the first value. + output_data[Offset(output_shape, index[0], index[1], index[2], + index[3])] = value; + } + return; + } + + // Go through the values and indices to fill the sparse values. + for (int i = 0; i < value_count; ++i) { + const std::vector<TI>& index = indices[i]; + TFLITE_DCHECK_EQ(index.size(), 4); + const T value = values[i]; + output_data[Offset(output_shape, index[0], index[1], index[2], index[3])] = + value; + } +} + +template <typename T> +inline void Pow(const RuntimeShape& input1_shape, const T* input1_data, + const RuntimeShape& input2_shape, const T* input2_data, + const RuntimeShape& output_shape, T* output_data) { + const int flat_size = + MatchingFlatSize(input1_shape, input2_shape, output_shape); + for (int i = 0; i < flat_size; ++i) { + output_data[i] = std::pow(input1_data[i], input2_data[i]); + } +} + +template <typename T> +inline void BroadcastPow4DSlow(const RuntimeShape& unextended_input1_shape, + const T* input1_data, + const RuntimeShape& unextended_input2_shape, + const T* input2_data, + const RuntimeShape& unextended_output_shape, + T* output_data) { + TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 4); + TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), 4); + TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4); + const RuntimeShape output_shape = + RuntimeShape::ExtendedShape(4, unextended_output_shape); + + NdArrayDesc<4> desc1; + NdArrayDesc<4> desc2; + NdArrayDescsForElementwiseBroadcast(unextended_input1_shape, + unextended_input2_shape, &desc1, &desc2); + + for (int b = 0; b < output_shape.Dims(0); ++b) { + for (int y = 0; y < output_shape.Dims(1); ++y) { + for (int x = 0; x < output_shape.Dims(2); ++x) { + for (int c = 0; c < output_shape.Dims(3); ++c) { + auto out_idx = Offset(output_shape, b, y, x, c); + auto in1_idx = SubscriptToIndex(desc1, b, y, x, c); + auto in2_idx = SubscriptToIndex(desc2, b, y, x, c); + auto in1_val = input1_data[in1_idx]; + auto in2_val = input2_data[in2_idx]; + output_data[out_idx] = std::pow(in1_val, in2_val); + } + } + } + } +} + +inline void Logical(const RuntimeShape& input1_shape, const bool* input1_data, + const RuntimeShape& input2_shape, const bool* input2_data, + const RuntimeShape& output_shape, bool* output_data, + const std::function<bool(bool, bool)>& func) { + const int flat_size = + MatchingFlatSize(input1_shape, input2_shape, output_shape); + for (int i = 0; i < flat_size; ++i) { + output_data[i] = func(input1_data[i], input2_data[i]); + } +} + +inline void BroadcastLogical4DSlow( + const RuntimeShape& unextended_input1_shape, const bool* input1_data, + 
const RuntimeShape& unextended_input2_shape, const bool* input2_data, + const RuntimeShape& unextended_output_shape, bool* output_data, + const std::function<bool(bool, bool)>& func) { + TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 4); + TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), 4); + TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4); + const RuntimeShape output_shape = + RuntimeShape::ExtendedShape(4, unextended_output_shape); + + NdArrayDesc<4> desc1; + NdArrayDesc<4> desc2; + NdArrayDescsForElementwiseBroadcast(unextended_input1_shape, + unextended_input2_shape, &desc1, &desc2); + + for (int b = 0; b < output_shape.Dims(0); ++b) { + for (int y = 0; y < output_shape.Dims(1); ++y) { + for (int x = 0; x < output_shape.Dims(2); ++x) { + for (int c = 0; c < output_shape.Dims(3); ++c) { + auto out_idx = Offset(output_shape, b, y, x, c); + auto in1_idx = SubscriptToIndex(desc1, b, y, x, c); + auto in2_idx = SubscriptToIndex(desc2, b, y, x, c); + auto in1_val = input1_data[in1_idx]; + auto in2_val = input2_data[in2_idx]; + output_data[out_idx] = func(in1_val, in2_val); + } + } + } + } +} + +// TODO(ycling): Refactoring. Remove BroadcastLogical and use the more +// generalized and efficient BroadcastBinaryFunction. +// +// Also appears to duplicte MinimumMaximum. +// +// R: Result type. T1: Input 1 type. T2: Input 2 type. +template <typename R, typename T1, typename T2> +inline void BroadcastBinaryFunction4DSlow( + const RuntimeShape& unextended_input1_shape, const T1* input1_data, + const RuntimeShape& unextended_input2_shape, const T2* input2_data, + const RuntimeShape& unextended_output_shape, R* output_data, + R (*func)(T1, T2)) { + TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 4); + TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), 4); + TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4); + const RuntimeShape output_shape = + RuntimeShape::ExtendedShape(4, unextended_output_shape); + + NdArrayDesc<4> desc1; + NdArrayDesc<4> desc2; + NdArrayDescsForElementwiseBroadcast(unextended_input1_shape, + unextended_input2_shape, &desc1, &desc2); + + for (int b = 0; b < output_shape.Dims(0); ++b) { + for (int y = 0; y < output_shape.Dims(1); ++y) { + for (int x = 0; x < output_shape.Dims(2); ++x) { + for (int c = 0; c < output_shape.Dims(3); ++c) { + auto out_idx = Offset(output_shape, b, y, x, c); + auto in1_idx = SubscriptToIndex(desc1, b, y, x, c); + auto in2_idx = SubscriptToIndex(desc2, b, y, x, c); + auto in1_val = input1_data[in1_idx]; + auto in2_val = input2_data[in2_idx]; + output_data[out_idx] = func(in1_val, in2_val); + } + } + } + } +} + +// R: Result type. T1: Input 1 type. T2: Input 2 type. +// TODO(renjieliu): Refactor other binary functions to use this one. 
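// Before the flat helper below, a small usage sketch of this style of
// elementwise helper: the caller hands in a plain function over the element
// types and the helper maps it across the buffers. The functor name here is
// illustrative only (and it assumes y != 0).
inline int FloorDivSketch(int x, int y) {
  // Round the quotient toward negative infinity instead of toward zero.
  int q = x / y;
  if ((x % y != 0) && ((x < 0) != (y < 0))) --q;
  return q;
}
// e.g. BinaryFunction<int, int, int>(shape, a, shape, b, shape, out,
//                                    FloorDivSketch);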
+template <typename R, typename T1, typename T2> +inline void BinaryFunction(const RuntimeShape& input1_shape, + const T1* input1_data, + const RuntimeShape& input2_shape, + const T2* input2_data, + const RuntimeShape& output_shape, R* output_data, + R (*func)(T1, T2)) { + const int flat_size = + MatchingFlatSize(input1_shape, input2_shape, output_shape); + for (int i = 0; i < flat_size; ++i) { + output_data[i] = func(input1_data[i], input2_data[i]); + } +} + } // namespace reference_ops } // namespace tflite diff --git a/tensorflow/contrib/lite/kernels/internal/reference/softmax.h b/tensorflow/contrib/lite/kernels/internal/reference/softmax.h new file mode 100644 index 0000000000..7d44296134 --- /dev/null +++ b/tensorflow/contrib/lite/kernels/internal/reference/softmax.h @@ -0,0 +1,179 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_SOFTMAX_H_ +#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_SOFTMAX_H_ + +#include "fixedpoint/fixedpoint.h" +#include "tensorflow/contrib/lite/kernels/internal/common.h" +#include "tensorflow/contrib/lite/kernels/internal/quantization_util.h" +#include "tensorflow/contrib/lite/kernels/internal/round.h" +#include "tensorflow/contrib/lite/kernels/internal/types.h" +#include "tensorflow/contrib/lite/kernels/op_macros.h" + +namespace tflite { +namespace reference_ops { + +inline void Softmax(const SoftmaxParams& params, + const RuntimeShape& input_shape, const float* input_data, + const RuntimeShape& output_shape, float* output_data) { + const int trailing_dim = input_shape.DimensionsCount() - 1; + const int outer_size = + MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape); + const int depth = + MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim); + + for (int i = 0; i < outer_size; ++i) { + // Find max element value which we'll use to ensure numerical stability + // taking advantage of the following equality: + // exp(x[i])/sum(exp(x[i])) == exp(x[i]+C)/sum(exp(x[i]+C)) + float max = std::numeric_limits<float>::lowest(); + for (int c = 0; c < depth; ++c) { + max = std::max(max, input_data[i * depth + c]); + } + + // Compute sum. + float sum = 0.f; + for (int c = 0; c < depth; ++c) { + sum += std::exp((input_data[i * depth + c] - max) * params.beta); + } + + // Compute result. + for (int c = 0; c < depth; ++c) { + output_data[i * depth + c] = + std::exp((input_data[i * depth + c] - max) * params.beta) / sum; + } + } +} + +inline void Softmax(const SoftmaxParams& params, + const RuntimeShape& input_shape, const uint8* input_data, + const RuntimeShape& output_shape, uint8* output_data) { + const int32 input_beta_multiplier = params.input_multiplier; + const int32 input_beta_left_shift = params.input_left_shift; + const int diff_min = params.diff_min; + // The representation chosen for the input to the exp() function is Q5.26. 
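  // (In Q5.26 an int32 raw value r stands for the real number r / (1 << 26),
  // so -8.0 is stored as -8 * (1 << 26) and the representable range is
  // roughly [-32, 32).)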
+ // We need to leave extra space since values that we skip might be as large as + // -32 before multiplying by input_beta_multiplier, and therefore as large as + // -16 afterwards. Note that exp(-8) is definitely not insignificant to + // accumulation, but exp(-16) definitely is. + static const int kScaledDiffIntegerBits = 5; + static const int kAccumulationIntegerBits = 12; + using FixedPointScaledDiff = + gemmlowp::FixedPoint<int32, kScaledDiffIntegerBits>; + using FixedPointAccum = gemmlowp::FixedPoint<int32, kAccumulationIntegerBits>; + using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>; + + const int trailing_dim = input_shape.DimensionsCount() - 1; + const int outer_size = + MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape); + const int depth = + MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim); + + for (int i = 0; i < outer_size; ++i) { + uint8 max_in_row = 0; + for (int c = 0; c < depth; ++c) { + max_in_row = std::max(max_in_row, input_data[i * depth + c]); + } + + FixedPointAccum sum_of_exps = FixedPointAccum::Zero(); + for (int c = 0; c < depth; ++c) { + int32 input_diff = + static_cast<int32>(input_data[i * depth + c]) - max_in_row; + if (input_diff >= diff_min) { + const int32 input_diff_rescaled = + MultiplyByQuantizedMultiplierGreaterThanOne( + input_diff, input_beta_multiplier, input_beta_left_shift); + const FixedPointScaledDiff scaled_diff_f8 = + FixedPointScaledDiff::FromRaw(input_diff_rescaled); + sum_of_exps = sum_of_exps + gemmlowp::Rescale<kAccumulationIntegerBits>( + exp_on_negative_values(scaled_diff_f8)); + } + } + + int32 fixed_sum_of_exps = sum_of_exps.raw(); + int headroom_plus_one = + CountLeadingZeros(static_cast<uint32>(fixed_sum_of_exps)); + // This is the number of bits to the left of the binary point above 1.0. + // Consider fixed_sum_of_exps=1.25. In that case shifted_scale=0.8 and + // no later adjustment will be needed. + int num_bits_over_unit = kAccumulationIntegerBits - headroom_plus_one; + int32 shifted_sum_minus_one = static_cast<int32>( + (static_cast<uint32>(fixed_sum_of_exps) << headroom_plus_one) - + (static_cast<uint32>(1) << 31)); + + FixedPoint0 shifted_scale = gemmlowp::one_over_one_plus_x_for_x_in_0_1( + FixedPoint0::FromRaw(shifted_sum_minus_one)); + + for (int c = 0; c < depth; ++c) { + int32 input_diff = + static_cast<int32>(input_data[i * depth + c]) - max_in_row; + if (input_diff >= diff_min) { + const int32 input_diff_rescaled = + MultiplyByQuantizedMultiplierGreaterThanOne( + input_diff, input_beta_multiplier, input_beta_left_shift); + const FixedPointScaledDiff scaled_diff_f8 = + FixedPointScaledDiff::FromRaw(input_diff_rescaled); + + FixedPoint0 exp_in_0 = exp_on_negative_values(scaled_diff_f8); + int32 unsat_output = gemmlowp::RoundingDivideByPOT( + (shifted_scale * exp_in_0).raw(), num_bits_over_unit + 31 - 8); + + output_data[i * depth + c] = static_cast<uint8>( + std::max(std::min(unsat_output, static_cast<int32>(255)), + static_cast<int32>(0))); + + } else { + output_data[i * depth + c] = 0; + } + } + } +} + +// Performs softmax along the input of size (input_size * batch_size). +inline void Softmax(const float* in, const int input_size, const int batch_size, + const float beta, float* out) { + // TF_LITE_ASSERT(input_size > 0); + + // For each batch + for (int b = 0; b < batch_size; b++) { + // Find the max coeff. + float max_coeff = in[0]; + for (int i = 1; i < input_size; i++) { + if (in[i] > max_coeff) max_coeff = in[i]; + } + + // Compute the normalized sum of exps. 
+ float exp_sum = 0.0; + for (int i = 0; i < input_size; i++) { + out[i] = std::exp((in[i] - max_coeff) * beta); + exp_sum += out[i]; + } + + // Divide by the sum of exps. + float reciprocal_sum_exp = 1.f / exp_sum; + for (int i = 0; i < input_size; i++) { + out[i] *= reciprocal_sum_exp; + } + + // Advance in and out pointers for the next batch. + in += input_size; + out += input_size; + } +} + +} // namespace reference_ops +} // namespace tflite + +#endif // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_SOFTMAX_H_ diff --git a/tensorflow/contrib/lite/kernels/internal/resize_bilinear_test.cc b/tensorflow/contrib/lite/kernels/internal/resize_bilinear_test.cc new file mode 100644 index 0000000000..15df31f75a --- /dev/null +++ b/tensorflow/contrib/lite/kernels/internal/resize_bilinear_test.cc @@ -0,0 +1,138 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include <algorithm> +#include <cmath> +#include <vector> + +#include <gtest/gtest.h> +#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h" +#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h" +#include "tensorflow/contrib/lite/kernels/internal/test_util.h" +#include "tensorflow/contrib/lite/kernels/internal/types.h" + +namespace tflite { +namespace { +template <typename T> +void TestOneResizeBilinear(int batch, int depth, int input_width, + int input_height, int output_width, + int output_height, float error_threshold) { + RuntimeShape input_dims_inference({batch, input_height, input_width, depth}); + RuntimeShape output_dims_inference( + {batch, output_height, output_width, depth}); + + const int input_buffer_size = input_dims_inference.FlatSize(); + const int output_buffer_size = output_dims_inference.FlatSize(); + + std::vector<T> input_data(input_buffer_size, 0); + std::vector<T> reference_output_data(output_buffer_size, 0); + // Initialize the output data with something other than zero, so we can catch + // issue with kernels failing to initialize the output. 
+ std::vector<T> output_data(output_buffer_size, 3); + + const T min_amplitude = static_cast<T>(0); + const T max_amplitude = static_cast<T>(255); + FillRandom(&input_data, min_amplitude, max_amplitude); + + RuntimeShape output_size_dims({1, 1, 1, 2}); + std::vector<int32> output_size_data = {output_height, output_width}; + + tflite::ResizeBilinearParams op_params; + op_params.align_corners = false; + + reference_ops::ResizeBilinear(op_params, input_dims_inference, + input_data.data(), output_size_dims, + output_size_data.data(), output_dims_inference, + reference_output_data.data()); + optimized_ops::ResizeBilinear( + op_params, input_dims_inference, input_data.data(), output_size_dims, + output_size_data.data(), output_dims_inference, output_data.data()); + + double sum_diff = 0; + float max_abs_val = 0; + for (int i = 0; i < output_buffer_size; i++) { + sum_diff += std::abs(static_cast<float>(output_data[i]) - + static_cast<float>(reference_output_data[i])); + max_abs_val = std::max( + max_abs_val, std::abs(static_cast<float>(reference_output_data[i]))); + } + + if (sum_diff != 0.f) { + const float mean_diff = static_cast<float>(sum_diff / output_buffer_size); + const float relative_error = std::abs(mean_diff) / max_abs_val; + ASSERT_LT(relative_error, error_threshold); + } +} + +TEST(ResizeBilinear, TestResizeBilinear8Bit) { + const int kTestsToRun = 100 * 1000; + for (int i = 0; i < kTestsToRun; i++) { + const int batch = ExponentialRandomPositiveInt(0.9f, 3, 20); + const int depth = ExponentialRandomPositiveInt(0.9f, 6, 50); + const int input_width = ExponentialRandomPositiveInt(0.9f, 20, 200); + const int input_height = ExponentialRandomPositiveInt(0.9f, 20, 200); + const int output_width = ExponentialRandomPositiveInt(0.9f, 20, 200); + const int output_height = ExponentialRandomPositiveInt(0.9f, 20, 200); + + TestOneResizeBilinear<uint8>(batch, depth, input_width, input_height, + output_width, output_height, 0.025); + } +} + +TEST(ResizeBilinear2x2, TestResizeBilinear8Bit) { + const int kTestsToRun = 100 * 1000; + for (int i = 0; i < kTestsToRun; i++) { + const int batch = ExponentialRandomPositiveInt(0.9f, 3, 20); + const int depth = ExponentialRandomPositiveInt(0.9f, 6, 50); + const int input_width = ExponentialRandomPositiveInt(0.9f, 20, 200); + const int input_height = ExponentialRandomPositiveInt(0.9f, 20, 200); + const int output_width = input_width * 2; + const int output_height = input_height * 2; + + TestOneResizeBilinear<uint8>(batch, depth, input_width, input_height, + output_width, output_height, 1e-5); + } +} + +TEST(ResizeBilinear, TestResizeBilinear) { + const int kTestsToRun = 100 * 1000; + for (int i = 0; i < kTestsToRun; i++) { + const int batch = ExponentialRandomPositiveInt(0.9f, 3, 20); + const int depth = ExponentialRandomPositiveInt(0.9f, 6, 50); + const int input_width = ExponentialRandomPositiveInt(0.9f, 20, 200); + const int input_height = ExponentialRandomPositiveInt(0.9f, 20, 200); + const int output_width = ExponentialRandomPositiveInt(0.9f, 20, 200); + const int output_height = ExponentialRandomPositiveInt(0.9f, 20, 200); + + TestOneResizeBilinear<float>(batch, depth, input_width, input_height, + output_width, output_height, 1e-5); + } +} + +TEST(ResizeBilinear2x2, TestResizeBilinear) { + const int kTestsToRun = 100 * 1000; + for (int i = 0; i < kTestsToRun; i++) { + const int batch = ExponentialRandomPositiveInt(0.9f, 3, 20); + const int depth = ExponentialRandomPositiveInt(0.9f, 6, 50); + const int input_width = ExponentialRandomPositiveInt(0.9f, 20, 
200); + const int input_height = ExponentialRandomPositiveInt(0.9f, 20, 200); + const int output_width = input_width * 2; + const int output_height = input_height * 2; + + TestOneResizeBilinear<float>(batch, depth, input_width, input_height, + output_width, output_height, 1e-5); + } +} +} // namespace +} // namespace tflite diff --git a/tensorflow/contrib/lite/kernels/internal/softmax_quantized_test.cc b/tensorflow/contrib/lite/kernels/internal/softmax_quantized_test.cc new file mode 100644 index 0000000000..831fb3c243 --- /dev/null +++ b/tensorflow/contrib/lite/kernels/internal/softmax_quantized_test.cc @@ -0,0 +1,236 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include <algorithm> +#include <cmath> +#include <cstdlib> +#include <functional> +#include <iterator> +#include <limits> +#include <random> +#include <string> +#include <vector> + +#include <gtest/gtest.h> +#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h" +#include "tensorflow/contrib/lite/kernels/internal/quantization_util.h" +#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h" +#include "tensorflow/contrib/lite/kernels/internal/test_util.h" +#include "tensorflow/contrib/lite/string.h" + +namespace tflite { +namespace { + +void RunSoftmaxFloatReference(const uint8* input_data, + const RuntimeShape& shape_common, + int32 input_offset, const double input_scale, + int stride, float beta, + uint8* reference_output_data) { + const int ref_buffer_size = shape_common.FlatSize(); + std::vector<float> reference_dequant_data(ref_buffer_size); + std::vector<float> reference_output_float_data(ref_buffer_size); + + // Reference data generated via Dequant of input into float, and then applying + // float Softmax. + DequantizationParams dq_params; + dq_params.zero_point = input_offset; + dq_params.scale = input_scale; + reference_ops::Dequantize(dq_params, shape_common, input_data, shape_common, + reference_dequant_data.data()); + SoftmaxParams sm_params; + sm_params.beta = beta; + optimized_ops::Softmax(sm_params, shape_common, reference_dequant_data.data(), + shape_common, reference_output_float_data.data()); + // Work with quantized scaling for Softmax, under which 256 represents 1, but + // we limit this to 255. + for (int i = 0; i < ref_buffer_size; i++) { + reference_output_data[i] = std::min( + 255, + static_cast<int>(std::round(256.0f * reference_output_float_data[i]))); + } +} + +void CheckOutputData(const uint8* test_output, const uint8* reference_output, + const RuntimeShape& shape_common, + const string& check_label, bool be_exacting) { + const int buffer_size = shape_common.FlatSize(); + // While calculating some metrics in floating point, we work with quantized + // scaling. 
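+  // In this scale a probability of 1.0 corresponds to 256, clipped to 255 in
+  // uint8 (see RunSoftmaxFloatReference above), so an off-by-one difference in
+  // the outputs amounts to roughly 1/256 of probability mass.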
+ std::vector<int> diff(buffer_size); + int64_t sum_diff = 0; + int64_t sum_abs_diff = 0; + for (int i = 0; i < buffer_size; i++) { + diff[i] = static_cast<int>(test_output[i]) - reference_output[i]; + sum_diff += diff[i]; + sum_abs_diff += std::abs(diff[i]); + } + // These stats help understand test failures. + std::sort(std::begin(diff), std::end(diff)); + const int min_diff = diff.front(); + const int max_diff = diff.back(); + const int median_diff = diff[diff.size() / 2]; + const float mean_diff = static_cast<float>(sum_diff) / buffer_size; + const float mean_abs_diff = static_cast<float>(sum_abs_diff) / buffer_size; + // We either check for bit exactness (against the reference quantized version) + // or for general accuracy, allowing off-by-one (against the float reference). + if (be_exacting) { + ASSERT_TRUE(std::abs(min_diff) == 0 && std::abs(max_diff) == 0); + } else { + // For small numbers of samples, the estimates of the means vary more. + // Rather than widen the tolerances, we skip the smaller tests. + ASSERT_TRUE(((std::abs(mean_diff) < 2e-2f && mean_abs_diff < 3e-2f) || + buffer_size < 10000) && + std::abs(median_diff) == 0 && std::abs(min_diff) <= 1 && + std::abs(max_diff) <= 1); + } +} + +// Runs the Softmax and compares against the float reference implementation and +// the quantized reference implementation. +void RunOneSoftmaxTest(const uint8* input_data, + const RuntimeShape& shape_common, int32 input_offset, + const double input_scale, int stride, float beta) { + const int buffer_size = shape_common.FlatSize(); + std::vector<uint8> optimized_softmax_output(buffer_size); + std::vector<uint8> reference_float_softmax_output(buffer_size); + std::vector<uint8> reference_quant_softmax_output(buffer_size); + + RunSoftmaxFloatReference(input_data, shape_common, input_offset, input_scale, + stride, beta, reference_float_softmax_output.data()); + + int32 input_beta_multiplier; + int input_beta_left_shift; + static const int kScaledDiffIntegerBits = 5; + tflite::PreprocessSoftmaxScaling(beta, input_scale, kScaledDiffIntegerBits, + &input_beta_multiplier, + &input_beta_left_shift); + // diff_min has a negative value, and is used to limit the maximum magnitude + // of the diffs, which are <= 0. + const int diff_min = -tflite::CalculateInputRadius(kScaledDiffIntegerBits, + input_beta_left_shift); + + SoftmaxParams params; + params.input_multiplier = input_beta_multiplier; + params.input_left_shift = input_beta_left_shift; + params.diff_min = diff_min; + optimized_ops::Softmax(params, shape_common, input_data, shape_common, + optimized_softmax_output.data()); + reference_ops::Softmax(params, shape_common, input_data, shape_common, + reference_quant_softmax_output.data()); + + CheckOutputData(optimized_softmax_output.data(), + reference_float_softmax_output.data(), shape_common, + "Optimized vs float reference", false); + CheckOutputData(optimized_softmax_output.data(), + reference_quant_softmax_output.data(), shape_common, + "Optimized vs quant reference", true); + CheckOutputData(reference_quant_softmax_output.data(), + reference_float_softmax_output.data(), shape_common, + "Quant reference vs float reference", false); +} + +// This function picks some random Softmax params, which are checked for +// desirability. If not acceptable, it returns false. If they're OK, +// it runs the Softmax test and returns true. This allows the caller +// to loop until a test has been run. +// +// Currently we do not reject for any reason. 
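+// A driver therefore just retries until a test has actually been run, as the
+// TEST bodies below do:
+//   while (!TryOneUniformSoftmax()) {
+//   }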
+bool TryOneUniformSoftmax() { + // We pick mostly positive values, on the whole emphasizing smaller values and + // therefore faster tests. We test a wider range of depths. In the case of + // Softmax, the width and height really just create test repetitions. + const int batch = ExponentialRandomPositiveInt(0.9f, 3, 20); + const int input_depth = ExponentialRandomPositiveInt(0.75f, 175, 500); + const int input_width = ExponentialRandomPositiveInt(0.8f, 20, 200); + const int input_height = ExponentialRandomPositiveInt(0.8f, 20, 200); + const int stride = ExponentialRandomPositiveInt(0.9f, 3, 8); + const double input_scale = std::pow(10.0, UniformRandomFloat(-2.0, 1.0)); + const int32 input_offset = UniformRandomInt(-256, 0); + const float beta = 1.0f + ExponentialRandomPositiveFloat(0.9f, 2, 10); + + auto shape_common = + RuntimeShape({batch, input_height, input_width, input_depth}); + const int buffer_size = shape_common.FlatSize(); + + std::vector<uint8> input_data(buffer_size); + FillRandom(&input_data); + RunOneSoftmaxTest(input_data.data(), shape_common, input_offset, input_scale, + stride, beta); + return true; +} + +// See TryOneUniformSoftmax() for a general description. +// +// Tests with "skyscraper" input patterns are included for two reasons. (a) +// Bimodal distributions are potentially challenging and perhaps more +// realistic than simple uniform random inputs. (b) Some implementations of +// Softmax may adapt as they traverse the depth, and so we test handling of +// cases where relatively small values are encountered at the beginning and end. +bool TryOneSkyscraperSoftmax(bool small_depth) { + // We pick mostly positive values, on the whole emphasizing smaller values and + // therefore faster tests. We test a wider range of depths. In the case of + // Softmax, the width and height really just create test repetitions. + const int batch = ExponentialRandomPositiveInt(0.9f, 3, 20); + const int input_depth = small_depth + ? ExponentialRandomPositiveInt(0.75f, 40, 500) + : ExponentialRandomPositiveInt(0.75f, 175, 500); + const int input_width = ExponentialRandomPositiveInt(0.7f, 20, 200); + const int input_height = ExponentialRandomPositiveInt(0.7f, 20, 200); + const int stride = ExponentialRandomPositiveInt(0.9f, 3, 8); + const double input_scale = std::pow(10.0, UniformRandomFloat(-2.0, 1.0)); + const int32 input_offset = UniformRandomInt(-256, 0); + const float beta = 1.0f + ExponentialRandomPositiveFloat(0.9f, 2, 10); + // Extra parameters for skyscraper input patterns. 
+ const double middle_proportion = + ExponentialRandomPositiveFloat(0.65f, 0.1, 1.0); + const int middle_min = UniformRandomInt(0, 255); + const int sides_max = UniformRandomInt(0, middle_min); + + auto shape_common = + RuntimeShape({batch, input_height, input_width, input_depth}); + const int buffer_size = shape_common.FlatSize(); + + std::vector<uint8> input_data(buffer_size); + FillRandomSkyscraper(&input_data, input_depth, middle_proportion, middle_min, + sides_max); + RunOneSoftmaxTest(input_data.data(), shape_common, input_offset, input_scale, + stride, beta); + return true; +} + +TEST(TestQuantizedSoftmax, UniformSoftmaxTests) { + const int kTestsToRun = 1000; + for (int i = 0; i < kTestsToRun; i++) { + while (!TryOneUniformSoftmax()) { + } + } +} + +TEST(TestQuantizedSoftmax, SkyscraperSoftmaxTests) { + const int kTestsToRun = 1000; + for (int i = 0; i < kTestsToRun; i++) { + while (!TryOneSkyscraperSoftmax(false)) { + } + } +} + +TEST(TestQuantizedSoftmax, SmallSkyscraperSoftmaxTests) { + const int kTestsToRun = 1000; + for (int i = 0; i < kTestsToRun; i++) { + while (!TryOneSkyscraperSoftmax(true)) { + } + } +} +} // namespace +} // namespace tflite diff --git a/tensorflow/contrib/lite/kernels/internal/spectrogram.cc b/tensorflow/contrib/lite/kernels/internal/spectrogram.cc index 4eddf7bf0a..20abcb7258 100644 --- a/tensorflow/contrib/lite/kernels/internal/spectrogram.cc +++ b/tensorflow/contrib/lite/kernels/internal/spectrogram.cc @@ -43,13 +43,13 @@ bool Spectrogram::Initialize(int window_length, int step_length) { return Initialize(window, step_length); } -inline int Log2Floor(uint n) { +inline int Log2Floor(uint32_t n) { if (n == 0) return -1; int log = 0; - uint value = n; + uint32_t value = n; for (int i = 4; i >= 0; --i) { int shift = (1 << i); - uint x = value >> shift; + uint32_t x = value >> shift; if (x != 0) { value = x; log += shift; @@ -58,7 +58,7 @@ inline int Log2Floor(uint n) { return log; } -inline int Log2Ceiling(uint n) { +inline int Log2Ceiling(uint32_t n) { int floor = Log2Floor(n); if (n == (n & ~(n - 1))) // zero or a power of two return floor; @@ -66,7 +66,7 @@ inline int Log2Ceiling(uint n) { return floor + 1; } -inline uint NextPowerOfTwo(uint value) { +inline uint32_t NextPowerOfTwo(uint32_t value) { int exponent = Log2Ceiling(value); // DCHECK_LT(exponent, std::numeric_limits<uint32>::digits); return 1 << exponent; diff --git a/tensorflow/contrib/lite/kernels/internal/strided_slice_logic.h b/tensorflow/contrib/lite/kernels/internal/strided_slice_logic.h index ef77371bf6..af5db1064c 100644 --- a/tensorflow/contrib/lite/kernels/internal/strided_slice_logic.h +++ b/tensorflow/contrib/lite/kernels/internal/strided_slice_logic.h @@ -19,9 +19,9 @@ limitations under the License. #include <limits> #include <vector> #include "tensorflow/contrib/lite/kernels/internal/compatibility.h" +#include "tensorflow/contrib/lite/kernels/internal/types.h" namespace tflite { - namespace strided_slice { // Use until std::clamp() is available from C++17. 
@@ -32,15 +32,51 @@ inline int Clamp(const int v, const int lo, const int hi) { return v; } +inline void StridedSlicePadIndices(tflite::StridedSliceParams* p, + int dim_count) { + // Add indices and mask bits to fully include extra dimensions + TFLITE_CHECK_LE(dim_count, 4); + TFLITE_CHECK_GE(dim_count, p->start_indices_count); + TFLITE_CHECK_EQ(p->start_indices_count, p->stop_indices_count); + TFLITE_CHECK_EQ(p->stop_indices_count, p->strides_count); + + const int pad_count = dim_count - p->start_indices_count; + + // Pad indices at start, so move arrays by pad_count. + for (int i = p->start_indices_count - 1; i > 0; --i) { + p->strides[i + pad_count] = p->strides[i]; + p->start_indices[i + pad_count] = p->start_indices[i]; + p->stop_indices[i + pad_count] = p->stop_indices[i]; + } + for (int i = 0; i < pad_count; ++i) { + p->start_indices[i] = 0; + p->stop_indices[i] = 0; + p->strides[i] = 1; + } + + // Pad masks with 0s or 1s as required. + p->shrink_axis_mask <<= pad_count; + p->ellipsis_mask <<= pad_count; + p->new_axis_mask <<= pad_count; + p->begin_mask <<= pad_count; + p->end_mask <<= pad_count; + p->begin_mask |= (1 << pad_count) - 1; + p->end_mask |= (1 << pad_count) - 1; + + p->start_indices_count = dim_count; + p->stop_indices_count = dim_count; + p->strides_count = dim_count; +} + // Return the index for the first element along that axis. This index will be a // positive integer between [0, axis_size - 1] that can be used to index // directly into the data. -template <typename IntType> -inline int StartForAxis(int begin_mask, - std::vector<IntType> const& start_indices, - std::vector<IntType> const& strides, - int const* input_shape, int axis) { - // Begin with the specified index +inline int StartForAxis(const tflite::StridedSliceParams& params, + const RuntimeShape& input_shape, int axis) { + const auto begin_mask = params.begin_mask; + const auto* start_indices = params.start_indices; + const auto* strides = params.strides; + // Begin with the specified index. int start = start_indices[axis]; // begin_mask override @@ -57,7 +93,7 @@ inline int StartForAxis(int begin_mask, } // Handle negative indices - int axis_size = input_shape[axis]; + int axis_size = input_shape.Dims(axis); if (start < 0) { start += axis_size; } @@ -73,13 +109,26 @@ inline int StartForAxis(int begin_mask, // element. ie. So if you were iterating through all elements of a 1D array of // size 4, this function would return 4 as the stop, because it is one past the // "real" indices of 0, 1, 2 & 3. -template <typename IntType> -inline int StopForAxis(int end_mask, std::vector<IntType> const& stop_indices, - std::vector<IntType> const& strides, - int const* input_shape, int axis) { +inline int StopForAxis(const tflite::StridedSliceParams& params, + const RuntimeShape& input_shape, int axis, + int start_for_axis) { + const auto end_mask = params.end_mask; + const auto shrink_axis_mask = params.shrink_axis_mask; + const auto* stop_indices = params.stop_indices; + const auto* strides = params.strides; + // Begin with the specified index + const bool shrink_axis = shrink_axis_mask & (1 << axis); int stop = stop_indices[axis]; + // When shrinking an axis, the end position does not matter (and can be + // incorrect when negative indexing is used, see Issue #19260). Always use + // start_for_axis + 1 to generate a length 1 slice, since start_for_axis has + // already been adjusted for negative indices. 
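+  // For example, slicing the last element of an axis of size 4 with
+  // start_indices[axis] = -1 and shrink_axis set: start_for_axis is 3, so the
+  // stop below becomes 3 + 1 = 4, a length-1 slice, regardless of what
+  // stop_indices[axis] holds.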
+ if (shrink_axis) { + stop = start_for_axis + 1; + } + // end_mask override if (end_mask & (1 << axis)) { if (strides[axis] > 0) { @@ -93,7 +142,7 @@ inline int StopForAxis(int end_mask, std::vector<IntType> const& stop_indices, } // Handle negative indices - int axis_size = input_shape[axis]; + const int axis_size = input_shape.Dims(axis); if (stop < 0) { stop += axis_size; } @@ -117,6 +166,31 @@ inline bool LoopCondition(int index, int stop, int stride) { return stride > 0 ? index >= stop : index <= stop; } +inline tflite::StridedSliceParams BuildStridedSliceParams( + int begin_mask, int end_mask, int shrink_axis_mask, + const std::vector<int>& start_indices, const std::vector<int>& stop_indices, + const std::vector<int>& strides) { + tflite::StridedSliceParams op_params; + const int dims_count = start_indices.size(); + + op_params.start_indices_count = dims_count; + op_params.stop_indices_count = dims_count; + op_params.strides_count = dims_count; + for (int i = 0; i < dims_count; ++i) { + op_params.start_indices[i] = start_indices[i]; + op_params.stop_indices[i] = stop_indices[i]; + op_params.strides[i] = strides[i]; + } + + op_params.begin_mask = begin_mask; + op_params.ellipsis_mask = 0; + op_params.end_mask = end_mask; + op_params.new_axis_mask = 0; + op_params.shrink_axis_mask = shrink_axis_mask; + + return op_params; +} + } // namespace strided_slice } // namespace tflite diff --git a/tensorflow/contrib/lite/kernels/internal/tensor.h b/tensorflow/contrib/lite/kernels/internal/tensor.h index ce887cea8b..689cea03e7 100644 --- a/tensorflow/contrib/lite/kernels/internal/tensor.h +++ b/tensorflow/contrib/lite/kernels/internal/tensor.h @@ -15,103 +15,30 @@ limitations under the License. #ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_TENSOR_H_ #define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_TENSOR_H_ +#include <complex> #include <vector> -#include "tensorflow/contrib/lite/context.h" +#include "tensorflow/contrib/lite/c/c_api_internal.h" +#include "tensorflow/contrib/lite/kernels/internal/tensor_ctypes.h" #include "tensorflow/contrib/lite/kernels/internal/types.h" namespace tflite { -template <typename T> -inline T* GetTensorData(TfLiteTensor* tensor); - -template <> -inline float* GetTensorData(TfLiteTensor* tensor) { - return tensor != nullptr ? tensor->data.f : nullptr; -} - -template <> -inline uint8_t* GetTensorData(TfLiteTensor* tensor) { - return tensor != nullptr ? tensor->data.uint8 : nullptr; -} - -template <> -inline int32_t* GetTensorData(TfLiteTensor* tensor) { - return tensor != nullptr ? tensor->data.i32 : nullptr; -} - -template <> -inline int64_t* GetTensorData(TfLiteTensor* tensor) { - return tensor != nullptr ? tensor->data.i64 : nullptr; -} - -template <> -inline bool* GetTensorData(TfLiteTensor* tensor) { - return tensor != nullptr ? tensor->data.b : nullptr; -} - -template <typename T> -inline const T* GetTensorData(const TfLiteTensor* tensor); - template <> -inline const float* GetTensorData(const TfLiteTensor* tensor) { - return tensor != nullptr ? tensor->data.f : nullptr; +inline std::complex<float>* GetTensorData(TfLiteTensor* tensor) { + return tensor != nullptr + ? reinterpret_cast<std::complex<float>*>(tensor->data.c64) + : nullptr; } template <> -inline const uint8_t* GetTensorData(const TfLiteTensor* tensor) { - return tensor != nullptr ? tensor->data.uint8 : nullptr; +inline const std::complex<float>* GetTensorData(const TfLiteTensor* tensor) { + return tensor != nullptr + ? 
reinterpret_cast<const std::complex<float>*>(tensor->data.c64) + : nullptr; } -template <> -inline const int32_t* GetTensorData(const TfLiteTensor* tensor) { - return tensor != nullptr ? tensor->data.i32 : nullptr; -} - -template <> -inline const int64_t* GetTensorData(const TfLiteTensor* tensor) { - return tensor != nullptr ? tensor->data.i64 : nullptr; -} - -template <> -inline const bool* GetTensorData(const TfLiteTensor* tensor) { - return tensor != nullptr ? tensor->data.b : nullptr; -} - -inline int RemapDim(int max_dimensions, int d) { - return max_dimensions - d - 1; -} - -// TODO(ahentz): the implementations in kernels/internal/ take a Dims<4> object -// even if the original tensors were not 4D. We should consider rewriting them -// to take a more generic 'shape' object. -inline Dims<4> GetTensorDims(const int data[], const int size) { - Dims<4> d; - for (int i = 0; i < 4; ++i) { - int src = size - i - 1; - if (src >= 0) { - d.sizes[i] = data[src]; - } else { - d.sizes[i] = 1; - } - } - d.strides[0] = 1; - for (int i = 1; i < 4; i++) { - d.strides[i] = d.strides[i - 1] * d.sizes[i - 1]; - } - return d; -} - -inline Dims<4> GetTensorDims(std::vector<int32_t> data) { - return GetTensorDims(data.data(), data.size()); -} - -inline Dims<4> GetTensorDims(const TfLiteTensor* tensor) { - if (tensor == nullptr) { - return Dims<4>(); - } - - auto* dims = tensor->dims; - return GetTensorDims(dims->data, dims->size); +inline RuntimeShape GetTensorShape(std::vector<int32_t> data) { + return RuntimeShape(data.size(), data.data()); } // A list of tensors in a format that can be used by kernels like split and @@ -125,20 +52,20 @@ class VectorOfTensors { int num_tensors = tensor_list.size; all_data_.reserve(num_tensors); - all_dims_.reserve(num_tensors); - all_dims_ptr_.reserve(num_tensors); + all_shape_.reserve(num_tensors); + all_shape_ptr_.reserve(num_tensors); for (int i = 0; i < num_tensors; ++i) { TfLiteTensor* t = &context.tensors[tensor_list.data[i]]; all_data_.push_back(GetTensorData<T>(t)); - all_dims_.push_back(GetTensorDims(t)); + all_shape_.push_back(GetTensorShape(t)); } // Taking the pointer from inside a std::vector is only OK if the vector is - // never modified, so we populate all_dims in the previous loop and then we + // never modified, so we populate all_shape in the previous loop and then we // are free to grab iterators here. for (int i = 0; i < num_tensors; ++i) { - all_dims_ptr_.push_back(&all_dims_[i]); + all_shape_ptr_.push_back(&all_shape_[i]); } } // Return a pointer to the data pointers of all tensors in the list. For @@ -147,16 +74,16 @@ class VectorOfTensors { // f[0][1] is the second element of the first tensor. T* const* data() const { return all_data_.data(); } - // Return a pointer the dim pointers of all tensors in the list. For + // Return a pointer the shape pointers of all tensors in the list. For // example: - // const Dims<4>* const* d = v.dims(); + // const RuntimeShape* const* d = v.dims(); // dims[1] are the dimensions of the second tensor in the list. 
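+  // e.g. d[1]->Dims(0) would be its first dimension.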
- const Dims<4>* const* dims() const { return all_dims_ptr_.data(); } + const RuntimeShape* const* shapes() const { return all_shape_ptr_.data(); } private: std::vector<T*> all_data_; - std::vector<Dims<4>> all_dims_; - std::vector<Dims<4>*> all_dims_ptr_; + std::vector<RuntimeShape> all_shape_; + std::vector<RuntimeShape*> all_shape_ptr_; }; // A list of quantized tensors in a format that can be used by kernels like diff --git a/tensorflow/contrib/lite/kernels/internal/tensor_ctypes.h b/tensorflow/contrib/lite/kernels/internal/tensor_ctypes.h new file mode 100644 index 0000000000..9f5b33d217 --- /dev/null +++ b/tensorflow/contrib/lite/kernels/internal/tensor_ctypes.h @@ -0,0 +1,102 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_TENSOR_CTYPES_H_ +#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_TENSOR_CTYPES_H_ + +#include "tensorflow/contrib/lite/c/c_api_internal.h" +#include "tensorflow/contrib/lite/kernels/internal/types.h" + +namespace tflite { + +template <typename T> +inline T* GetTensorData(TfLiteTensor* tensor); + +template <> +inline float* GetTensorData(TfLiteTensor* tensor) { + return tensor != nullptr ? tensor->data.f : nullptr; +} + +template <> +inline uint8_t* GetTensorData(TfLiteTensor* tensor) { + return tensor != nullptr ? tensor->data.uint8 : nullptr; +} + +template <> +inline int16_t* GetTensorData(TfLiteTensor* tensor) { + return tensor != nullptr ? tensor->data.i16 : nullptr; +} + +template <> +inline int32_t* GetTensorData(TfLiteTensor* tensor) { + return tensor != nullptr ? tensor->data.i32 : nullptr; +} + +template <> +inline int64_t* GetTensorData(TfLiteTensor* tensor) { + return tensor != nullptr ? tensor->data.i64 : nullptr; +} + +template <> +inline bool* GetTensorData(TfLiteTensor* tensor) { + return tensor != nullptr ? tensor->data.b : nullptr; +} + +template <typename T> +inline const T* GetTensorData(const TfLiteTensor* tensor); + +template <> +inline const float* GetTensorData(const TfLiteTensor* tensor) { + return tensor != nullptr ? tensor->data.f : nullptr; +} + +template <> +inline const uint8_t* GetTensorData(const TfLiteTensor* tensor) { + return tensor != nullptr ? tensor->data.uint8 : nullptr; +} + +template <> +inline const int16_t* GetTensorData(const TfLiteTensor* tensor) { + return tensor != nullptr ? tensor->data.i16 : nullptr; +} + +template <> +inline const int32_t* GetTensorData(const TfLiteTensor* tensor) { + return tensor != nullptr ? tensor->data.i32 : nullptr; +} + +template <> +inline const int64_t* GetTensorData(const TfLiteTensor* tensor) { + return tensor != nullptr ? tensor->data.i64 : nullptr; +} + +template <> +inline const bool* GetTensorData(const TfLiteTensor* tensor) { + return tensor != nullptr ? 
tensor->data.b : nullptr; +} + +inline RuntimeShape GetTensorShape(const TfLiteTensor* tensor) { + if (tensor == nullptr) { + return RuntimeShape(); + } + + TfLiteIntArray* dims = tensor->dims; + const int dims_size = dims->size; + const int32_t* dims_data = dims->data; + return RuntimeShape(dims_size, dims_data); +} + +} // namespace tflite + +#endif // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_TENSOR_CTYPES_H_ diff --git a/tensorflow/contrib/lite/kernels/internal/tensor_test.cc b/tensorflow/contrib/lite/kernels/internal/tensor_test.cc index bf2068d320..2ed73ba82d 100644 --- a/tensorflow/contrib/lite/kernels/internal/tensor_test.cc +++ b/tensorflow/contrib/lite/kernels/internal/tensor_test.cc @@ -21,28 +21,32 @@ namespace { using ::testing::ElementsAre; -TEST(TensorTest, GetTensorDims4D) { - Dims<4> d = GetTensorDims({2, 3, 4, 5}); - EXPECT_THAT(d.sizes, ElementsAre(5, 4, 3, 2)); - EXPECT_THAT(d.strides, ElementsAre(1, 5, 20, 60)); +TEST(TensorTest, GetTensorShape4D) { + RuntimeShape d = GetTensorShape({2, 3, 4, 5}); + EXPECT_THAT( + std::vector<int32>(d.DimsData(), d.DimsData() + d.DimensionsCount()), + ElementsAre(2, 3, 4, 5)); } -TEST(TensorTest, GetTensorDims3D) { - Dims<4> d = GetTensorDims({3, 4, 5}); - EXPECT_THAT(d.sizes, ElementsAre(5, 4, 3, 1)); - EXPECT_THAT(d.strides, ElementsAre(1, 5, 20, 60)); +TEST(TensorTest, GetTensorShape3D) { + RuntimeShape d = GetTensorShape({3, 4, 5}); + EXPECT_THAT( + std::vector<int32>(d.DimsData(), d.DimsData() + d.DimensionsCount()), + ElementsAre(3, 4, 5)); } -TEST(TensorTest, GetTensorDims2D) { - Dims<4> d = GetTensorDims({4, 5}); - EXPECT_THAT(d.sizes, ElementsAre(5, 4, 1, 1)); - EXPECT_THAT(d.strides, ElementsAre(1, 5, 20, 20)); +TEST(TensorTest, GetTensorShape2D) { + RuntimeShape d = GetTensorShape({4, 5}); + EXPECT_THAT( + std::vector<int32>(d.DimsData(), d.DimsData() + d.DimensionsCount()), + ElementsAre(4, 5)); } -TEST(TensorTest, GetTensorDims1D) { - Dims<4> d = GetTensorDims({5}); - EXPECT_THAT(d.sizes, ElementsAre(5, 1, 1, 1)); - EXPECT_THAT(d.strides, ElementsAre(1, 5, 5, 5)); +TEST(TensorTest, GetTensorShape1D) { + RuntimeShape d = GetTensorShape({5}); + EXPECT_THAT( + std::vector<int32>(d.DimsData(), d.DimsData() + d.DimensionsCount()), + ElementsAre(5)); } } // namespace diff --git a/tensorflow/contrib/lite/kernels/internal/tensor_utils.h b/tensorflow/contrib/lite/kernels/internal/tensor_utils.h index e1c9ccd84b..b0fe5adf65 100644 --- a/tensorflow/contrib/lite/kernels/internal/tensor_utils.h +++ b/tensorflow/contrib/lite/kernels/internal/tensor_utils.h @@ -15,7 +15,11 @@ limitations under the License. #ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_TENSOR_UTILS_H_ #define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_TENSOR_UTILS_H_ -#include "tensorflow/contrib/lite/builtin_op_data.h" +#include "tensorflow/contrib/lite/c/builtin_op_data.h" + +#if defined(_MSC_VER) +#define __restrict__ __restrict +#endif namespace tflite { namespace tensor_utils { @@ -23,13 +27,16 @@ namespace tensor_utils { // Limit a float input f between +abs_limit and -abs_limit. float Clip(float f, float abs_limit); +// Checks if all entries of vector are zero. +bool IsZeroVector(const float* vector, int v_size); + // Quantizes a buffer of floating point values using a symmetric quantization // (i.e. linear quantization without an offset) to 8-bit signed integers. // It also outputs the range (min, max) of the floating point buffer, and the // scaling factor used to quantize the values. 
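// For example, for the values {-640, ..., 1000} the scaling factor works out
// to max(|min|, |max|) / 127 = 1000 / 127, and -640 quantizes to
// round(-640 * 127 / 1000) = -81 (see SymmetricQuantizeFloatsTest).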
void SymmetricQuantizeFloats(const float* values, const int size, - int8_t* quantized_values, float* min, float* max, - float* scaling_factor); + int8_t* quantized_values, float* min_value, + float* max_value, float* scaling_factor); // Multiplies a matrix by a "batched" vector (i.e. a matrix with a batch // dimension composed by input vectors independent from each other). The result @@ -94,6 +101,11 @@ void BatchVectorBatchVectorDotProduct(const float* vector1, int n_batch, float* result, int result_stride); +// Cwise product of a vector and a batch-vector. +void VectorBatchVectorCwiseProduct(const float* vector, int v_size, + const float* batch_vector, int n_batch, + float* result); + // Cwise product and accumulate of a vector and a batch-vector. Since it's a MAC // operation, the assumption here is that result array is initialized to valid // values. @@ -101,6 +113,10 @@ void VectorBatchVectorCwiseProductAccumulate(const float* vector, int v_size, const float* batch_vector, int n_batch, float* result); +// Add another vector for each batch in the batch vector. +void VectorBatchVectorAdd(const float* vector, int v_size, int n_batch, + float* batch_vector); + // Batch vector initialization with another vector. void VectorBatchVectorAssign(const float* vector, int v_size, int n_batch, float* batch_vector); @@ -121,6 +137,10 @@ void Sub1Vector(const float* vector, int v_size, float* result); // Fill vector with 0.f. void ZeroVector(float* vector, int v_size); +// Multiply all elements of vector with a scalar. +void VectorScalarMultiply(const int8_t* vector, int v_size, float scale, + float* result); + // Clip elements of a vector using a abs_limit value. void ClipVector(const float* vector, int v_size, float abs_limit, float* result); @@ -136,6 +156,12 @@ void VectorShiftLeft(float* vector, int v_size, float shift_value); // added to get one element of output. void ReductionSumVector(const float* input_vector, float* output_vector, int output_size, int reduction_size); + +// Layer norm for each batch. +// normalization_epsilon is added to avoid divergence. +void MeanStddevNormalization(const float* input_vector, float* output_vector, + int v_size, int n_batch, + float normalization_epsilon); } // namespace tensor_utils } // namespace tflite diff --git a/tensorflow/contrib/lite/kernels/internal/tensor_utils_test.cc b/tensorflow/contrib/lite/kernels/internal/tensor_utils_test.cc index 3d8a2eada0..6458af714b 100644 --- a/tensorflow/contrib/lite/kernels/internal/tensor_utils_test.cc +++ b/tensorflow/contrib/lite/kernels/internal/tensor_utils_test.cc @@ -14,7 +14,7 @@ limitations under the License. 
==============================================================================*/ #include "tensorflow/contrib/lite/kernels/internal/tensor_utils.h" #include <gmock/gmock.h> -#include "tensorflow/contrib/lite/builtin_op_data.h" +#include "tensorflow/contrib/lite/c/builtin_op_data.h" #include "tensorflow/contrib/lite/kernels/test_util.h" namespace tflite { @@ -32,19 +32,55 @@ TEST(uKernels, ClipTest) { {0.0, -0.5, 1.0, -1.5, 2.0, -2.0, 2.0, -2.0, 2.0, -2.0}))); } +TEST(uKernels, VectorScalarMultiply) { + constexpr int kVectorSize = 29; + static int8_t input[kVectorSize]; + for (int i = 0; i < 29; ++i) { + input[i] = static_cast<int8_t>(i - 14); + } + const float scale = 0.1f; + std::vector<float> output(kVectorSize, 0.0f); + VectorScalarMultiply(input, kVectorSize, scale, output.data()); + EXPECT_THAT(output, + ElementsAreArray(ArrayFloatNear( + {-1.4, -1.3, -1.2, -1.1, -1.0, -0.9, -0.8, -0.7, -0.6, -0.5, + -0.4, -0.3, -0.2, -0.1, 0, 0.1, 0.2, 0.3, 0.4, 0.5, + 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4}))); +} + +TEST(uKernels, IsZeroTest) { + constexpr int kVectorSize = 21; + static float zeros[kVectorSize] = {0.0}; + EXPECT_TRUE(IsZeroVector(zeros, kVectorSize)); + + static float nonzeros[kVectorSize] = { + 1e-6, 1e-7, 1e-8, 1e-9, 1e-10, 1e-11, 1e-12, + 1e-13, 1e-14, 1e-15, 1e-16, 1e-17, 1e-18, 1e-19, + 1e-20, 1e-21, 1e-22, 1e-23, 1e-24, 1e-25, 1e-26}; + EXPECT_FALSE(IsZeroVector(nonzeros, kVectorSize)); +} + +TEST(uKernels, GeneratedIsZeroTest) { + constexpr int kVectorSize = 39; + std::vector<float> input(kVectorSize); + ZeroVector(input.data(), kVectorSize); + EXPECT_TRUE(IsZeroVector(input.data(), kVectorSize)); +} + TEST(uKernels, SymmetricQuantizeFloatsTest) { constexpr int kVectorSize = 9; static float input[kVectorSize] = {-640, -635.0, -630, 10.0, 2.0, -5.0, -10.0, 0.0, 1000.0}; - int8 output[kVectorSize]; + int8_t output[kVectorSize]; float min, max, scaling_factor; SymmetricQuantizeFloats(input, kVectorSize, output, &min, &max, &scaling_factor); EXPECT_EQ(min, -640); EXPECT_EQ(max, 1000); - EXPECT_NEAR(scaling_factor, 0.127, 1e-6); // EQ won't work due to fpoint. + // EQ won't work due to fpoint. + EXPECT_NEAR(scaling_factor, 1000 / 127.0, 1e-6); EXPECT_THAT(output, testing::ElementsAreArray({-81, -81, -80, 1, 0, -1, -1, 0, 127})); } @@ -53,7 +89,7 @@ TEST(uKernels, SymmetricQuantizeFloatsAllZerosTest) { constexpr int kVectorSize = 9; static float input[kVectorSize] = {0, 0, 0, 0, 0, 0, 0, 0, 0}; - int8 output[kVectorSize]; + int8_t output[kVectorSize]; float min, max, scaling_factor; SymmetricQuantizeFloats(input, kVectorSize, output, &min, &max, &scaling_factor); @@ -69,14 +105,14 @@ TEST(uKernels, SymmetricQuantizeFloatsAllAlmostZeroTest) { static float input[kVectorSize] = {-1e-5, 3e-5, -7e-6, -9e-5, 1e-6, 4e-5, 9e-6, 2e-4, 0}; - int8 output[kVectorSize]; + int8_t output[kVectorSize]; float min, max, scaling_factor; SymmetricQuantizeFloats(input, kVectorSize, output, &min, &max, &scaling_factor); EXPECT_NEAR(min, -9e-05, 1e-6); EXPECT_NEAR(max, 0.0002, 1e-6); - EXPECT_EQ(scaling_factor, 635000); + EXPECT_NEAR(scaling_factor, 1.57e-6, 1e-6); EXPECT_THAT(output, testing::ElementsAreArray({-6, 19, -4, -57, 1, 25, 6, 127, 0})); } @@ -107,6 +143,7 @@ TEST(uKernels, MatrixBatchVectorMultiplyAccumulateTest) { -1., 3., 7., 3., 23., 3.}))); } +#ifdef __ANDROID__ TEST(uKernels, MatrixBatchVectorMultiplyAccumulateSymmetricQuantizedTest) { // Note we use 29 columns as this exercises all the neon kernel: the // 16-block SIMD code, the 8-block postamble, and the leftover postamble. 
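  // (29 = 16 + 8 + 5: one full 16-wide block, one 8-wide postamble block, and
  // a 5-wide leftover.)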
@@ -130,13 +167,13 @@ TEST(uKernels, MatrixBatchVectorMultiplyAccumulateSymmetricQuantizedTest) { -13.13, 14.14, -15.15, 16.16, -17.17, 18.18, -19.19, 20.2, -21.21, 22.22, -23.23, 24.24, -25.25, 26.26, -27.27, 28.28, 0}; - int8* a_int8_data = reinterpret_cast<int8*>( + int8_t* a_int8_data = reinterpret_cast<int8_t*>( aligned_malloc(a_rows * a_cols, kWeightsPerUint32)); float a_min, a_max; float scaling_factor_a; SymmetricQuantizeFloats(a_float_data, a_rows * a_cols, a_int8_data, &a_min, &a_max, &scaling_factor_a); - const int8 expected_a_int8_data[] = { + const int8_t expected_a_int8_data[] = { /* 1st row */ 5, 10, @@ -327,7 +364,7 @@ TEST(uKernels, MatrixBatchVectorMultiplyAccumulateSymmetricQuantizedTest) { }; // Quantized values of B: - int8 b_int8_data[b_rows * b_cols * batches]; + int8_t b_int8_data[b_rows * b_cols * batches]; float b_min, b_max; float scaling_factor_b[batches]; SymmetricQuantizeFloats(b_float_data, b_rows * b_cols, b_int8_data, &b_min, @@ -336,7 +373,7 @@ TEST(uKernels, MatrixBatchVectorMultiplyAccumulateSymmetricQuantizedTest) { &b_int8_data[b_rows * b_cols], &b_min, &b_max, &scaling_factor_b[1]); - const int8 expected_b_int8_data[] = { + const int8_t expected_b_int8_data[] = { /* batch 1 */ 127, -127, @@ -429,6 +466,7 @@ TEST(uKernels, MatrixBatchVectorMultiplyAccumulateSymmetricQuantizedTest) { aligned_free(a_int8_data); } +#endif // __ANDROID__ TEST(uKernels, VectorVectorCwiseProductTest) { constexpr int kVectorSize = 10; @@ -458,6 +496,16 @@ TEST(uKernels, VectorVectorCwiseProductAccumulateTest) { {1.0, 1.05, 1.1, 1.15, 1.2, 1.25, 1.3, 1.35, 1.4, 1.45}))); } +TEST(uKernels, VectorBatchVectorAddTest) { + constexpr int kVectorSize = 3; + constexpr int kBatchSize = 2; + static float input[kVectorSize] = {0.0, -0.5, 1.0}; + std::vector<float> output = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; + VectorBatchVectorAdd(input, kVectorSize, kBatchSize, output.data()); + EXPECT_THAT(output, + testing::ElementsAreArray({1.0, 1.5, 4.0, 4.0, 4.5, 7.0})); +} + TEST(uKernels, VectorBatchVectorAssignTest) { constexpr int kVectorSize = 5; constexpr int kBatchSize = 3; @@ -517,6 +565,120 @@ TEST(uKernels, ZeroVectorTest) { ElementsAreArray(ArrayFloatNear({0.0, 0.0, 0.0, 0.0, 0.0}))); } +TEST(uKernels, VectorBatchVectorCwiseProductAccumulate) { + constexpr int kVectorSize = 29; + constexpr int kBatchSize = 4; + static float input[kVectorSize] = { + 1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8, 9.9, 10.1, + 11.11, 12.12, 13.13, 14.14, 15.15, 16.16, 17.17, 18.18, 19.19, 20.2, + 21.21, 22.22, 23.23, 24.24, 25.25, 26.26, 27.27, 28.28, 0}; + std::vector<float> output = { + /* batch 0 */ + 1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8, 9.9, 10.1, 11.11, 12.12, 13.13, + 14.14, 15.15, 16.16, 17.17, 18.18, 19.19, 20.2, 21.21, 22.22, 23.23, + 24.24, 25.25, 26.26, 27.27, 28.28, 0, + /* batch 1 */ + -1.1, -2.2, -3.3, -4.4, -5.5, -6.6, -7.7, -8.8, -9.9, -10.1, -11.11, + -12.12, -13.13, -14.14, -15.15, -16.16, -17.17, -18.18, -19.19, -20.2, + -21.21, -22.22, -23.23, -24.24, -25.25, -26.26, -27.27, -28.28, 0, + /* batch 2 */ + 1.1, -2.2, 3.3, -4.4, 5.5, -6.6, 7.7, -8.8, 9.9, -10.1, 11.11, -12.12, + 13.13, -14.14, 15.15, -16.16, 17.17, -18.18, 19.19, -20.2, 21.21, -22.22, + 23.23, -24.24, 25.25, -26.26, 27.27, -28.28, 0, + /* batch 3 */ + -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, -7.7, 8.8, -9.9, 10.1, -11.11, 12.12, + -13.13, 14.14, -15.15, 16.16, -17.17, 18.18, -19.19, 20.2, -21.21, 22.22, + -23.23, 24.24, -25.25, 26.26, -27.27, 28.28, 0}; + VectorBatchVectorCwiseProductAccumulate(input, kVectorSize, output.data(), + 
kBatchSize, output.data()); + + // Expect output = input * output + output. + const std::vector<float> expected_output = { + /* batch 0 */ + 2.310000, 7.040000, 14.190000, 23.760000, 35.750000, 50.159996, 66.989998, + 86.240005, 107.909996, 112.110008, 134.542084, 159.014389, 185.526901, + 214.079605, 244.672485, 277.305603, 311.978912, 348.692413, 387.446136, + 428.240051, 471.074066, 515.948364, 562.862854, 611.817566, 662.812500, + 715.847595, 770.922974, 828.038452, 0.000000, + /* batch 1 */ + -2.310000, -7.040000, -14.190000, -23.760000, -35.750000, -50.159996, + -66.989998, -86.240005, -107.909996, -112.110008, -134.542084, + -159.014389, -185.526901, -214.079605, -244.672485, -277.305603, + -311.978912, -348.692413, -387.446136, -428.240051, -471.074066, + -515.948364, -562.862854, -611.817566, -662.812500, -715.847595, + -770.922974, -828.038452, 0.000000, + /* batch 2 */ + 2.310000, -7.040000, 14.190000, -23.760000, 35.750000, -50.159996, + 66.989998, -86.240005, 107.909996, -112.110008, 134.542084, -159.014389, + 185.526901, -214.079605, 244.672485, -277.305603, 311.978912, -348.692413, + 387.446136, -428.240051, 471.074066, -515.948364, 562.862854, -611.817566, + 662.812500, -715.847595, 770.922974, -828.038452, 0.000000, + /* batch 3 */ + -2.310000, 7.040000, -14.190000, 23.760000, -35.750000, 50.159996, + -66.989998, 86.240005, -107.909996, 112.110008, -134.542084, 159.014389, + -185.526901, 214.079605, -244.672485, 277.305603, -311.978912, 348.692413, + -387.446136, 428.240051, -471.074066, 515.948364, -562.862854, 611.817566, + -662.812500, 715.847595, -770.922974, 828.038452, 0.000000}; + EXPECT_THAT(output, testing::ElementsAreArray(expected_output)); +} + +TEST(uKernels, VectorBatchVectorCwiseProductNoAccumulate) { + constexpr int kVectorSize = 29; + constexpr int kBatchSize = 4; + static float input[kVectorSize] = { + 1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8, 9.9, 10.1, + 11.11, 12.12, 13.13, 14.14, 15.15, 16.16, 17.17, 18.18, 19.19, 20.2, + 21.21, 22.22, 23.23, 24.24, 25.25, 26.26, 27.27, 28.28, 0}; + std::vector<float> output = { + /* batch 0 */ + 1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8, 9.9, 10.1, 11.11, 12.12, 13.13, + 14.14, 15.15, 16.16, 17.17, 18.18, 19.19, 20.2, 21.21, 22.22, 23.23, + 24.24, 25.25, 26.26, 27.27, 28.28, 0, + /* batch 1 */ + -1.1, -2.2, -3.3, -4.4, -5.5, -6.6, -7.7, -8.8, -9.9, -10.1, -11.11, + -12.12, -13.13, -14.14, -15.15, -16.16, -17.17, -18.18, -19.19, -20.2, + -21.21, -22.22, -23.23, -24.24, -25.25, -26.26, -27.27, -28.28, 0, + /* batch 2 */ + 1.1, -2.2, 3.3, -4.4, 5.5, -6.6, 7.7, -8.8, 9.9, -10.1, 11.11, -12.12, + 13.13, -14.14, 15.15, -16.16, 17.17, -18.18, 19.19, -20.2, 21.21, -22.22, + 23.23, -24.24, 25.25, -26.26, 27.27, -28.28, 0, + /* batch 3 */ + -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, -7.7, 8.8, -9.9, 10.1, -11.11, 12.12, + -13.13, 14.14, -15.15, 16.16, -17.17, 18.18, -19.19, 20.2, -21.21, 22.22, + -23.23, 24.24, -25.25, 26.26, -27.27, 28.28, 0}; + VectorBatchVectorCwiseProduct(input, kVectorSize, output.data(), kBatchSize, + output.data()); + + // Expect output = input * output + output. 
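+  // (Note: without accumulation each element is simply input * output, e.g.
+  // the first element of batch 0 below is 1.1 * 1.1 = 1.21.)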
+ const std::vector<float> expected_output = { + /* batch 0 */ + 1.210000, 4.840000, 10.889999, 19.360001, 30.250000, 43.559998, 59.289997, + 77.440002, 98.009995, 102.010010, 123.432091, 146.894394, 172.396896, + 199.939606, 229.522491, 261.145599, 294.808899, 330.512421, 368.256134, + 408.040039, 449.864075, 493.728363, 539.632874, 587.577576, 637.562500, + 689.587585, 743.652954, 799.758423, 0.000000, + /* batch 1 */ + -1.210000, -4.840000, -10.889999, -19.360001, -30.250000, -43.559998, + -59.289997, -77.440002, -98.009995, -102.010010, -123.432091, -146.894394, + -172.396896, -199.939606, -229.522491, -261.145599, -294.808899, + -330.512421, -368.256134, -408.040039, -449.864075, -493.728363, + -539.632874, -587.577576, -637.562500, -689.587585, -743.652954, + -799.758423, 0.000000, + /* batch 2 */ + 1.210000, -4.840000, 10.889999, -19.360001, 30.250000, -43.559998, + 59.289997, -77.440002, 98.009995, -102.010010, 123.432091, -146.894394, + 172.396896, -199.939606, 229.522491, -261.145599, 294.808899, -330.512421, + 368.256134, -408.040039, 449.864075, -493.728363, 539.632874, -587.577576, + 637.562500, -689.587585, 743.652954, -799.758423, 0.000000, + /* batch 3 */ + -1.210000, 4.840000, -10.889999, 19.360001, -30.250000, 43.559998, + -59.289997, 77.440002, -98.009995, 102.010010, -123.432091, 146.894394, + -172.396896, 199.939606, -229.522491, 261.145599, -294.808899, 330.512421, + -368.256134, 408.040039, -449.864075, 493.728363, -539.632874, 587.577576, + -637.562500, 689.587585, -743.652954, 799.758423, 0.000000}; + EXPECT_THAT(output, testing::ElementsAreArray(expected_output)); +} + TEST(uKernels, BatchVectorBatchVectorDotProductTest) { constexpr int kVectorSize = 5; constexpr int kBatch = 2; @@ -560,5 +722,85 @@ TEST(uKernels, ReductionSumVectorTest) { EXPECT_THAT(result2, ElementsAreArray(ArrayFloatNear({1.0, 3.5}))); } +TEST(uKernels, MeanStddevNormalizationNoneZeroInput) { + constexpr int kVectorSize = 4; + constexpr int kBatchSize = 2; + constexpr float kNormalizationEpsilon = 1e-8; + + // None-zero input. + static float input[kVectorSize * kBatchSize] = { + 0.1, 0.2, 0.3, 0.4, // batch 0 + 0.9, 1.0, 1.1, 1.2, // batch 1 + }; + std::vector<float> output(kVectorSize * kBatchSize); + MeanStddevNormalization(input, output.data(), kVectorSize, kBatchSize, + kNormalizationEpsilon); + const std::vector<float> expected_output = { + -1.34164071, -0.447213531, 0.44721365, 1.34164071, // batch 0 + -1.34163153, -0.447210163, 0.447211236, 1.3416326, // batch 1 + }; + EXPECT_THAT(output, testing::ElementsAreArray(expected_output)); +} + +TEST(uKernels, MeanStddevNormalizationAllZeroInput) { + constexpr int kVectorSize = 4; + constexpr int kBatchSize = 2; + constexpr float kNormalizationEpsilon = 1e-8; + + // Zero input. + static float input[kVectorSize * kBatchSize] = { + 0.0, 0.0, 0.0, 0.0, // batch 0 + 0.0, 0.0, 0.0, 0.0, // batch 1 + }; + std::vector<float> output(kVectorSize * kBatchSize); + MeanStddevNormalization(input, output.data(), kVectorSize, kBatchSize, + kNormalizationEpsilon); + const std::vector<float> expected_output = { + 0.0, 0.0, 0.0, 0.0, // batch 0 + 0.0, 0.0, 0.0, 0.0, // batch 1 + }; + EXPECT_THAT(output, testing::ElementsAreArray(expected_output)); +} + +TEST(uKernels, MeanStddevNormalizationMixed) { + constexpr int kVectorSize = 4; + constexpr int kBatchSize = 2; + constexpr float kNormalizationEpsilon = 1e-8; + + // Mix of zero and non-zero input. 
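+  // For the non-zero batch {0.1, 0.2, 0.3, 0.4}: mean = 0.25, stddev =
+  // sqrt(0.0125) ~= 0.1118, so the first element normalizes to
+  // (0.1 - 0.25) / 0.1118 ~= -1.3416, matching the expectation below.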
+ static float input[kVectorSize * kBatchSize] = { + 0.0, 0.0, 0.0, 0.0, // batch 0 + 0.1, 0.2, 0.3, 0.4, // batch 1 + }; + std::vector<float> output(kVectorSize * kBatchSize); + MeanStddevNormalization(input, output.data(), kVectorSize, kBatchSize, + kNormalizationEpsilon); + const std::vector<float> expected_output = { + 0.0, 0.0, 0.0, 0.0, // batch 0 + -1.34164071, -0.447213531, 0.44721365, 1.34164071, // batch 1 + }; + EXPECT_THAT(output, testing::ElementsAreArray(expected_output)); +} + +TEST(uKernels, MeanStddevNormalizationSmallValue) { + constexpr int kVectorSize = 4; + constexpr int kBatchSize = 2; + constexpr float kNormalizationEpsilon = 1e-8; + + // Mix of zero and non-zero input. + static float input[kVectorSize * kBatchSize] = { + 3e-5, -7e-6, -9e-5, 1e-6, // batch 0 + 4e-5, 9e-6, 2e-4, 0.0, // batch 1 + }; + std::vector<float> output(kVectorSize * kBatchSize); + MeanStddevNormalization(input, output.data(), kVectorSize, kBatchSize, + kNormalizationEpsilon); + const std::vector<float> expected_output = { + 1.04231524, 0.212946132, -1.64753067, 0.392269224, // batch 0 + -0.275023013, -0.658201098, 1.70267045, -0.769446373, // batch 1 + }; + EXPECT_THAT(output, testing::ElementsAreArray(expected_output)); +} + } // namespace tensor_utils } // namespace tflite diff --git a/tensorflow/contrib/lite/kernels/internal/test_util.cc b/tensorflow/contrib/lite/kernels/internal/test_util.cc new file mode 100644 index 0000000000..75d568ae3a --- /dev/null +++ b/tensorflow/contrib/lite/kernels/internal/test_util.cc @@ -0,0 +1,107 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include "tensorflow/contrib/lite/kernels/internal/test_util.h" + +#include <cmath> +#include <iterator> + +namespace tflite { + +// this is a copied from an internal function in propagate_fixed_sizes.cc +bool ComputeConvSizes(const RuntimeShape& input_shape, int output_depth, + int filter_width, int filter_height, int stride, + int dilation_width_factor, int dilation_height_factor, + PaddingType padding_type, RuntimeShape* output_shape, + int* pad_width, int* pad_height) { + const int input_width = input_shape.Dims(2); + const int input_height = input_shape.Dims(1); + const int batch = input_shape.Dims(0); + + int dilated_filter_width = dilation_width_factor * (filter_width - 1) + 1; + int dilated_filter_height = dilation_height_factor * (filter_height - 1) + 1; + + int output_height = 0; + int output_width = 0; + if (padding_type == PaddingType::kValid) { + output_height = (input_height + stride - dilated_filter_height) / stride; + output_width = (input_width + stride - dilated_filter_width) / stride; + } else if (padding_type == PaddingType::kSame) { + output_height = (input_height + stride - 1) / stride; + output_width = (input_width + stride - 1) / stride; + } else { + return false; + } + + if (output_width <= 0 || output_height <= 0) { + return false; + } + + *pad_height = std::max( + 0, ((output_height - 1) * stride + dilated_filter_height - input_height) / + 2); + *pad_width = std::max( + 0, + ((output_width - 1) * stride + dilated_filter_width - input_width) / 2); + + output_shape->BuildFrom({batch, output_height, output_width, output_depth}); + return true; +} + +std::mt19937& RandomEngine() { + static std::mt19937 engine; + return engine; +} + +int UniformRandomInt(int min, int max) { + std::uniform_int_distribution<int> dist(min, max); + return dist(RandomEngine()); +} + +float UniformRandomFloat(float min, float max) { + std::uniform_real_distribution<float> dist(min, max); + return dist(RandomEngine()); +} + +int ExponentialRandomPositiveInt(float percentile, int percentile_val, + int max_val) { + const float lambda = + -std::log(1.f - percentile) / static_cast<float>(percentile_val); + std::exponential_distribution<float> dist(lambda); + float val; + do { + val = dist(RandomEngine()); + } while (!val || !std::isfinite(val) || val > max_val); + return static_cast<int>(std::ceil(val)); +} + +float ExponentialRandomPositiveFloat(float percentile, float percentile_val, + float max_val) { + const float lambda = + -std::log(1.f - percentile) / static_cast<float>(percentile_val); + std::exponential_distribution<float> dist(lambda); + float val; + do { + val = dist(RandomEngine()); + } while (!std::isfinite(val) || val > max_val); + return val; +} + +void FillRandom(std::vector<float>* vec, float min, float max) { + std::uniform_real_distribution<float> dist(min, max); + auto gen = std::bind(dist, RandomEngine()); + std::generate(std::begin(*vec), std::end(*vec), gen); +} + +} // namespace tflite diff --git a/tensorflow/contrib/lite/kernels/internal/test_util.h b/tensorflow/contrib/lite/kernels/internal/test_util.h new file mode 100644 index 0000000000..e4a383bedf --- /dev/null +++ b/tensorflow/contrib/lite/kernels/internal/test_util.h @@ -0,0 +1,103 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_TEST_UTIL_H_ +#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_TEST_UTIL_H_ + +#include <algorithm> +#include <functional> +#include <iterator> +#include <limits> +#include <random> +#include <vector> + +#include "tensorflow/contrib/lite/kernels/internal/types.h" + +namespace tflite { + +// Computes output and padding dimensions. +bool ComputeConvSizes(const RuntimeShape& input_shape, int output_depth, + int filter_width, int filter_height, int stride, + int dilation_width_factor, int dilation_height_factor, + PaddingType padding_type, RuntimeShape* output_shape, + int* pad_width, int* pad_height); + +// Returns a mt19937 random engine. +std::mt19937& RandomEngine(); + +// Returns a random integer uniformly distributed between |min| and |max|. +int UniformRandomInt(int min, int max); + +// Returns a random float uniformly distributed between |min| and |max|. +float UniformRandomFloat(float min, float max); + +// Returns a random element in |v|. +template <typename T> +const T& RandomElement(const std::vector<T>& v) { + return v[UniformRandomInt(0, v.size() - 1)]; +} + +// Returns a random exponentially distributed integer. +int ExponentialRandomPositiveInt(float percentile, int percentile_val, + int max_val); + +// Returns a random exponentially distributed float. +float ExponentialRandomPositiveFloat(float percentile, float percentile_val, + float max_val); + +// Fills a vector with random floats between |min| and |max|. +void FillRandom(std::vector<float>* vec, float min, float max); + +// Fills a vector with random numbers between |min| and |max|. +template <typename T> +void FillRandom(std::vector<T>* vec, T min, T max) { + std::uniform_int_distribution<T> dist(min, max); + auto gen = std::bind(dist, RandomEngine()); + std::generate(std::begin(*vec), std::end(*vec), gen); +} + +// Fills a vector with random numbers. +template <typename T> +void FillRandom(std::vector<T>* vec) { + FillRandom(vec, std::numeric_limits<T>::min(), std::numeric_limits<T>::max()); +} + +template <typename T> +void FillRandom(typename std::vector<T>::iterator begin_it, + typename std::vector<T>::iterator end_it, T min, T max) { + std::uniform_int_distribution<T> dist(min, max); + auto gen = std::bind(dist, RandomEngine()); + std::generate(begin_it, end_it, gen); +} + +// Fill with a "skyscraper" pattern, in which there is a central section (across +// the depth) with higher values than the surround. 
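+// For example, with depth 8 and middle_proportion 0.5, depth indices 2..5 are
+// filled with values >= middle_min, while indices 0..1 and 6..7 are filled
+// with values <= sides_max.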
+template <typename T> +void FillRandomSkyscraper(std::vector<T>* vec, int depth, + double middle_proportion, uint8 middle_min, + uint8 sides_max) { + for (auto base_it = std::begin(*vec); base_it != std::end(*vec); + base_it += depth) { + auto left_it = base_it + std::ceil(0.5 * depth * (1.0 - middle_proportion)); + auto right_it = + base_it + std::ceil(0.5 * depth * (1.0 + middle_proportion)); + FillRandom(base_it, left_it, std::numeric_limits<T>::min(), sides_max); + FillRandom(left_it, right_it, middle_min, std::numeric_limits<T>::max()); + FillRandom(right_it, base_it + depth, std::numeric_limits<T>::min(), + sides_max); + } +} + +} // namespace tflite +#endif // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_TEST_UTIL_H_ diff --git a/tensorflow/contrib/lite/kernels/internal/types.h b/tensorflow/contrib/lite/kernels/internal/types.h index 43c6883278..c6bc6074d4 100644 --- a/tensorflow/contrib/lite/kernels/internal/types.h +++ b/tensorflow/contrib/lite/kernels/internal/types.h @@ -1,4 +1,4 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -15,11 +15,81 @@ limitations under the License. #ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_TYPES_H_ #define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_TYPES_H_ +#include <algorithm> +#include <cstring> + #include "tensorflow/contrib/lite/kernels/internal/compatibility.h" namespace tflite { enum class FusedActivationFunctionType : uint8 { kNone, kRelu6, kRelu1, kRelu }; +enum class PaddingType : uint8 { kNone, kSame, kValid }; + +struct PaddingValues { + int16 width; + int16 height; +}; + +// This enumeration allows for non-default formats for the weights array +// of a fully-connected operator, allowing the use of special optimized +// runtime paths. +enum class FullyConnectedWeightsFormat : uint8 { + // Default format (flat 2D layout, the inner contiguous dimension + // is input_depth, the outer non-contiguous dimension is output_depth) + kDefault, + // Summary: optimized layout for fast CPU runtime implementation, + // aimed specifically at ARM CPUs at the moment, and specialized for + // 8-bit quantized layers. + // + // The use case we're concerned with here is: 8-bit quantization, + // large weights matrix that doesn't fit in cache (e.g. 4096x2048 in + // a key application that drove this), very small batch size (e.g. 1 -- 4). + // + // Even with 8-bit quantization of weights, the performance of memory + // accesses to the weights can become the dominant issue when + // the batch size is small, so each weight value is used in only a few + // arithmetic ops, i.e. the fully-connected node has a low arithmetic + // intensity. The specific issues that arise are of three kinds: + // (1) One may, ideally, max out DRAM bandwidth, i.e. be truly memory + // bound. That's the "good" issue to run into. + // (2) One may run into sub-optimal pre-fetching: the data hasn't been + // prefetched into the cache by the time we need it. + // (3) One may run into cache aliasing: multiple values that are + // pre-fetched, alias each other in the L1 cache (which typically + // has only 4-way set associativity in ARM CPUs) and thus evict + // each other before we get to using them. + // + // The point of this shuffling is to avoid issues (2) and (3) so that + // we get as fast as possible given only the hard constraint (1). 
+ // This is achieved by turning the difficulty into a solution: the + // difficulty, that each value loaded from memory is used only in + // one kernel iteration, making this operation memory-intensive, hints at + // the solution, of shuffling the weights so that they are stored in the + // exact order as the kernel needs to load them, so that the memory + // accesses made by the kernel are trivial. This solves (2) because the + // trivial memory access pattern allows the CPU's automatic prefetching + // to perform very well (no need even for preload instructions), and this + // solves (3) because the values being loaded concurrently are now + // contiguous in the address space, thus don't alias each other in the cache. + // + // On ARM, we typically want our kernel to process a 4x16 block of weights + // at a time, because: + // - 16 is the number of bytes in a NEON register. + // - 4 is how many rows we need to handle concurrently in the kernel in + // order to have sufficient mutual independence of instructions to + // maximize arithmetic throughput. + // + // Finally, the 'Int8' part in the name refers to the fact that this + // weights format has each weights value encoded as a signed int8 value, + // even if the data type of the weights buffer is uint8. This is intended + // to save runtime kernels the effort to have to XOR the top bit of these + // bytes before using them in signed arithmetic, see this file for more + // explanations on the 'signed int8 trick' in matrix multiplication kernels: + // + // tensorflow/contrib/lite/toco/graph_transformations/ensure_uint8_weights_safe_for_fast_int8_kernels.cc + // + kShuffled4x16Int8, +}; // Quantization parameters, determining the mapping of quantized values // to real values (i.e. determining how quantized values are mathematically @@ -43,9 +113,207 @@ struct Dims { int strides[N]; }; +class RuntimeShape { + public: + // Shapes with dimensions up to 4 are stored directly in the structure, while + // larger shapes are separately allocated. + static constexpr int kMaxSmallSize = 4; + + RuntimeShape& operator=(RuntimeShape const&) = delete; + + RuntimeShape() : size_(0) {} + + explicit RuntimeShape(int dimensions_count) : size_(dimensions_count) { + if (dimensions_count > kMaxSmallSize) { +#ifdef TF_LITE_STATIC_MEMORY + TFLITE_CHECK(false && "No shape resizing supported on this platform"); +#else // TF_LITE_STATIC_MEMORY + dims_pointer_ = new int32[dimensions_count]; +#endif // TF_LITE_STATIC_MEMORY + } + } + + RuntimeShape(int shape_size, int32 value) : size_(0) { + Resize(shape_size); + for (int i = 0; i < shape_size; ++i) { + SetDim(i, value); + } + } + + RuntimeShape(int dimensions_count, const int32* dims_data) : size_(0) { + ReplaceWith(dimensions_count, dims_data); + } + + RuntimeShape(const std::initializer_list<int> init_list) : size_(0) { + BuildFrom(init_list); + } + + // Avoid using this constructor. We should be able to delete it when C++17 + // rolls out. 
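(The "avoid using" note above refers to the copy constructor that follows.) For orientation, a quick sketch of the construction forms RuntimeShape offers, with all dimension values invented; shapes of rank 4 or less live in the inline dims_ array, larger ranks are heap-allocated (or rejected under TF_LITE_STATIC_MEMORY):

tflite::RuntimeShape s0;                    // rank 0
tflite::RuntimeShape s1(4);                 // rank 4, dims left unset
tflite::RuntimeShape s2(4, 1);              // rank 4, every dim set to 1
const int32_t raw[] = {1, 224, 224, 3};
tflite::RuntimeShape s3(4, raw);            // copied from a raw dims array
tflite::RuntimeShape s4({1, 224, 224, 3});  // from an initializer list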
+ RuntimeShape(RuntimeShape const& other) : size_(other.DimensionsCount()) { + if (size_ > kMaxSmallSize) { + dims_pointer_ = new int32[size_]; + } + std::memcpy(DimsData(), other.DimsData(), sizeof(int32) * size_); + } + + bool operator==(const RuntimeShape& comp) const { + return this->size_ == comp.size_ && + std::memcmp(DimsData(), comp.DimsData(), size_ * sizeof(int32)) == 0; + } + + ~RuntimeShape() { + if (size_ > kMaxSmallSize) { +#ifdef TF_LITE_STATIC_MEMORY + TFLITE_CHECK(false && "No shape resizing supported on this platform"); +#else // TF_LITE_STATIC_MEMORY + delete[] dims_pointer_; +#endif // TF_LITE_STATIC_MEMORY + } + } + + inline int32 DimensionsCount() const { return size_; } + inline int32 Dims(int i) const { + TFLITE_DCHECK_GE(i, 0); + TFLITE_DCHECK_LT(i, size_); + return size_ > kMaxSmallSize ? dims_pointer_[i] : dims_[i]; + } + inline void SetDim(int i, int32 val) { + TFLITE_DCHECK_GE(i, 0); + TFLITE_DCHECK_LT(i, size_); + if (size_ > kMaxSmallSize) { + dims_pointer_[i] = val; + } else { + dims_[i] = val; + } + } + + inline int32* DimsData() { + return size_ > kMaxSmallSize ? dims_pointer_ : dims_; + } + inline const int32* DimsData() const { + return size_ > kMaxSmallSize ? dims_pointer_ : dims_; + } + // The caller must ensure that the shape is no bigger than 4-D. + inline const int32* DimsDataUpTo4D() const { return dims_; } + + inline void Resize(int dimensions_count) { + if (size_ > kMaxSmallSize) { +#ifdef TF_LITE_STATIC_MEMORY + TFLITE_CHECK(false && "No shape resizing supported on this platform"); +#else // TF_LITE_STATIC_MEMORY + delete[] dims_pointer_; +#endif // TF_LITE_STATIC_MEMORY + } + size_ = dimensions_count; + if (dimensions_count > kMaxSmallSize) { +#ifdef TF_LITE_STATIC_MEMORY + TFLITE_CHECK(false && "No shape resizing supported on this platform"); +#else // TF_LITE_STATIC_MEMORY + dims_pointer_ = new int32[dimensions_count]; +#endif // TF_LITE_STATIC_MEMORY + } + } + + inline void ReplaceWith(int dimensions_count, const int32* dims_data) { + Resize(dimensions_count); + int32* dst_dims = DimsData(); + std::memcpy(dst_dims, dims_data, dimensions_count * sizeof(int32)); + } + + template <typename T> + inline void BuildFrom(const T& src_iterable) { + const int dimensions_count = + std::distance(src_iterable.begin(), src_iterable.end()); + Resize(dimensions_count); + int32* data = DimsData(); + for (auto it : src_iterable) { + *data = it; + ++data; + } + } + + // This will probably be factored out. Old code made substantial use of 4-D + // shapes, and so this function is used to extend smaller shapes. Note that + // (a) as Dims<4>-dependent code is eliminated, the reliance on this should be + // reduced, and (b) some kernels are stricly 4-D, but then the shapes of their + // inputs should already be 4-D, so this function should not be needed. + inline static RuntimeShape ExtendedShape(int new_shape_size, + const RuntimeShape& shape) { + return RuntimeShape(new_shape_size, shape, 1); + } + + inline void BuildFrom(const std::initializer_list<int> init_list) { + BuildFrom<const std::initializer_list<int>>(init_list); + } + + // Returns the total count of elements, that is the size when flattened into a + // vector. 
+ inline int FlatSize() const { + int buffer_size = 1; + const int* dims_data = DimsData(); + for (int i = 0; i < size_; i++) { + const int dim = dims_data[i]; + TFLITE_DCHECK_GE(dim, 1); + buffer_size *= dim; + } + return buffer_size; + } + + bool operator!=(const RuntimeShape& comp) const { return !((*this) == comp); } + + private: + // For use only by ExtendedShape(), written to guarantee (return-value) copy + // elision in C++17. + // This creates a shape padded to the desired size with the specified value. + RuntimeShape(int new_shape_size, const RuntimeShape& shape, int pad_value) + : size_(0) { + // If the following check fails, it is likely because a 4D-only kernel is + // being used with an array of larger dimension count. + TFLITE_CHECK_GE(new_shape_size, shape.DimensionsCount()); + Resize(new_shape_size); + const int size_increase = new_shape_size - shape.DimensionsCount(); + for (int i = 0; i < size_increase; ++i) { + SetDim(i, pad_value); + } + std::memcpy(DimsData() + size_increase, shape.DimsData(), + sizeof(int32) * shape.DimensionsCount()); + } + + int32 size_; + union { + int32 dims_[kMaxSmallSize]; + int32* dims_pointer_; + }; +}; + +// Converts inference-style shape to legacy tflite::Dims<4>. +inline tflite::Dims<4> ToRuntimeDims(const tflite::RuntimeShape& array_shape) { + tflite::Dims<4> result; + const int dimensions_count = array_shape.DimensionsCount(); + TFLITE_CHECK_LE(dimensions_count, 4); + int cum_prod = 1; + for (int i = 0; i < 4; i++) { + const int new_dim = + (i < dimensions_count) ? array_shape.Dims(dimensions_count - 1 - i) : 1; + result.sizes[i] = new_dim; + result.strides[i] = cum_prod; + cum_prod *= new_dim; + } + return result; +} + +// TODO(b/80418076): Move to legacy ops file, update invocations. +inline RuntimeShape DimsToShape(const tflite::Dims<4>& dims) { + return RuntimeShape( + {dims.sizes[3], dims.sizes[2], dims.sizes[1], dims.sizes[0]}); +} + // Gets next index to iterate through a multidimensional array. 
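RuntimeShape keeps up to four dimensions inline and only heap-allocates beyond that; the two helpers above exist so code can hop between it and the legacy Dims<4> representation during the migration (note that ToRuntimeDims, despite its name, produces the legacy Dims<4>). A short sketch with invented values; NextIndex(), which follows, then iterates such shapes index by index:

tflite::RuntimeShape shape({1, 8, 8, 3});               // NHWC, stored inline
const int flat = shape.FlatSize();                      // 1 * 8 * 8 * 3 = 192
// Pad a rank-1 bias shape to rank 4 for kernels that still assume 4-D.
tflite::RuntimeShape bias4d =
    tflite::RuntimeShape::ExtendedShape(4, tflite::RuntimeShape({3}));  // {1, 1, 1, 3}
// Round-trip through the legacy representation.
tflite::Dims<4> legacy = tflite::ToRuntimeDims(shape);    // sizes = {3, 8, 8, 1}
tflite::RuntimeShape back = tflite::DimsToShape(legacy);  // {1, 8, 8, 3} again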
inline bool NextIndex(const int num_dims, const int* dims, int* current) { - TFLITE_DCHECK_GT(num_dims, 0); + if (num_dims == 0) { + return false; + } TFLITE_DCHECK(dims != nullptr); TFLITE_DCHECK(current != nullptr); int carry = 1; @@ -72,7 +340,9 @@ inline bool NextIndex(const int num_dims, const int* dims, int* current) { inline size_t ReducedOutputOffset(const int num_dims, const int* dims, const int* index, const int num_axis, const int* axis) { - TFLITE_DCHECK_GT(num_dims, 0); + if (num_dims == 0) { + return 0; + } TFLITE_DCHECK(dims != nullptr); TFLITE_DCHECK(index != nullptr); size_t offset = 0; @@ -95,6 +365,16 @@ inline size_t ReducedOutputOffset(const int num_dims, const int* dims, return offset; } +inline int Offset(const RuntimeShape& shape, int i0, int i1, int i2, int i3) { + TFLITE_DCHECK_EQ(shape.DimensionsCount(), 4); + const int* dims_data = shape.DimsDataUpTo4D(); + TFLITE_DCHECK(i0 >= 0 && i0 < dims_data[0]); + TFLITE_DCHECK(i1 >= 0 && i1 < dims_data[1]); + TFLITE_DCHECK(i2 >= 0 && i2 < dims_data[2]); + TFLITE_DCHECK(i3 >= 0 && i3 < dims_data[3]); + return ((i0 * dims_data[1] + i1) * dims_data[2] + i2) * dims_data[3] + i3; +} + inline int Offset(const Dims<4>& dims, int i0, int i1, int i2, int i3) { TFLITE_DCHECK(i0 >= 0 && i0 < dims.sizes[0]); TFLITE_DCHECK(i1 >= 0 && i1 < dims.sizes[1]); @@ -108,7 +388,14 @@ inline int Offset(const Dims<4>& dims, int* index) { return Offset(dims, index[0], index[1], index[2], index[3]); } +inline int Offset(const RuntimeShape& shape, int* index) { + return Offset(shape, index[0], index[1], index[2], index[3]); +} + // Get array size, DCHECKing that the dim index is in range. +// +// Note that this will be phased out with Dims<4>, since RuntimeShape::Dims() +// already performs this check. template <int N> int ArraySize(const Dims<N>& array, int index) { TFLITE_DCHECK(index >= 0 && index < N); @@ -130,6 +417,21 @@ int MatchingArraySize(const ArrayType1& array1, int index1, return MatchingArraySize(array1, index1, args...); } +// Get common shape dim, DCHECKing that they all agree. +inline int MatchingDim(const RuntimeShape& shape1, int index1, + const RuntimeShape& shape2, int index2) { + TFLITE_DCHECK_EQ(shape1.Dims(index1), shape2.Dims(index2)); + return shape1.Dims(index1); +} + +template <typename... Args> +int MatchingDim(const RuntimeShape& shape1, int index1, + const RuntimeShape& shape2, int index2, Args... args) { + TFLITE_DCHECK_EQ(shape1.Dims(index1), shape2.Dims(index2)); + return MatchingDim(shape1, index1, args...); +} + +// Will be phased out with Dims<4>, replaced by RuntimeShape::FlatSize(). template <int N> inline int FlatSize(const Dims<N>& dims) { int flat_size = 1; @@ -139,13 +441,61 @@ inline int FlatSize(const Dims<N>& dims) { return flat_size; } -// Deprecated. Prefer FlatSize. +TFLITE_DEPRECATED("Prefer FlatSize.") inline int RequiredBufferSizeForDims(const Dims<4>& dims) { return FlatSize(dims); } // Flat size calculation, checking that dimensions match with one or more other // arrays. 
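Offset() maps a 4-D index to its flat NHWC offset, and NextIndex() above advances a multi-dimensional counter until it wraps; together they give the canonical element-visiting loop sketched below (function name and usage invented for illustration). The MatchingFlatSize() overloads that follow add dimension checks on top of FlatSize().

void VisitAllElements(const tflite::RuntimeShape& shape, const float* data) {
  TFLITE_DCHECK_EQ(shape.DimensionsCount(), 4);  // Offset() below assumes 4-D.
  const int dims[4] = {shape.Dims(0), shape.Dims(1), shape.Dims(2),
                       shape.Dims(3)};
  int index[4] = {0, 0, 0, 0};
  do {
    const float value = data[tflite::Offset(shape, index)];
    (void)value;  // ... per-element work would go here ...
  } while (tflite::NextIndex(4, dims, index));
}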
+inline int MatchingFlatSize(const RuntimeShape& shape, + const RuntimeShape& check_shape_0) { + TFLITE_DCHECK_EQ(shape.DimensionsCount(), check_shape_0.DimensionsCount()); + const int dims_count = shape.DimensionsCount(); + for (int i = 0; i < dims_count; ++i) { + TFLITE_DCHECK_EQ(shape.Dims(i), check_shape_0.Dims(i)); + } + return shape.FlatSize(); +} + +inline int MatchingFlatSize(const RuntimeShape& shape, + const RuntimeShape& check_shape_0, + const RuntimeShape& check_shape_1) { + TFLITE_DCHECK_EQ(shape.DimensionsCount(), check_shape_0.DimensionsCount()); + const int dims_count = shape.DimensionsCount(); + for (int i = 0; i < dims_count; ++i) { + TFLITE_DCHECK_EQ(shape.Dims(i), check_shape_0.Dims(i)); + } + return MatchingFlatSize(shape, check_shape_1); +} + +inline int MatchingFlatSize(const RuntimeShape& shape, + const RuntimeShape& check_shape_0, + const RuntimeShape& check_shape_1, + const RuntimeShape& check_shape_2) { + TFLITE_DCHECK_EQ(shape.DimensionsCount(), check_shape_0.DimensionsCount()); + const int dims_count = shape.DimensionsCount(); + for (int i = 0; i < dims_count; ++i) { + TFLITE_DCHECK_EQ(shape.Dims(i), check_shape_0.Dims(i)); + } + return MatchingFlatSize(shape, check_shape_1, check_shape_2); +} + +inline int MatchingFlatSize(const RuntimeShape& shape, + const RuntimeShape& check_shape_0, + const RuntimeShape& check_shape_1, + const RuntimeShape& check_shape_2, + const RuntimeShape& check_shape_3) { + TFLITE_DCHECK_EQ(shape.DimensionsCount(), check_shape_0.DimensionsCount()); + const int dims_count = shape.DimensionsCount(); + for (int i = 0; i < dims_count; ++i) { + TFLITE_DCHECK_EQ(shape.Dims(i), check_shape_0.Dims(i)); + } + return MatchingFlatSize(shape, check_shape_1, check_shape_2, check_shape_3); +} + +// Flat size calculation, checking that dimensions match with one or more other +// arrays. template <int N> inline int MatchingFlatSize(const Dims<N>& dims, const Dims<N>& check_dims_0) { for (int i = 0; i < N; ++i) { @@ -170,7 +520,7 @@ inline int MatchingFlatSize(const Dims<N>& dims, const Dims<N>& check_dims_0, for (int i = 0; i < N; ++i) { TFLITE_DCHECK_EQ(ArraySize(dims, i), ArraySize(check_dims_0, i)); } - return FlatSize(dims, check_dims_1, check_dims_2); + return MatchingFlatSize(dims, check_dims_1, check_dims_2); } template <int N> @@ -181,7 +531,7 @@ inline int MatchingFlatSize(const Dims<N>& dims, const Dims<N>& check_dims_0, for (int i = 0; i < N; ++i) { TFLITE_DCHECK_EQ(ArraySize(dims, i), ArraySize(check_dims_0, i)); } - return FlatSize(dims, check_dims_1, check_dims_2, check_dims_3); + return MatchingFlatSize(dims, check_dims_1, check_dims_2, check_dims_3); } // Data is required to be contiguous, and so many operators can use either the @@ -249,6 +599,72 @@ inline int MatchingFlatSizeSkipDim(const Dims<N>& dims, int skip_dim, check_dims_3); } +// Data is required to be contiguous, and so many operators can use either the +// full array flat size or the flat size with one dimension skipped (commonly +// the depth). +inline int FlatSizeSkipDim(const RuntimeShape& shape, int skip_dim) { + const int dims_count = shape.DimensionsCount(); + TFLITE_DCHECK(skip_dim >= 0 && skip_dim < dims_count); + const auto* dims_data = shape.DimsData(); + int flat_size = 1; + for (int i = 0; i < dims_count; ++i) { + flat_size *= (i == skip_dim) ? 1 : dims_data[i]; + } + return flat_size; +} + +// A combination of MatchingFlatSize() and FlatSizeSkipDim(). 
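FlatSizeSkipDim() is what lets row-wise kernels (softmax, L2 normalization, fully connected) collapse everything except one dimension into a single outer loop; the MatchingFlatSizeSkipDim() overloads that follow do the same while also checking the other shapes. A rough usage sketch, with the per-element work left as a placeholder:

void RowWiseOpSketch(const tflite::RuntimeShape& shape, const float* in,
                     float* out) {
  const int trailing_dim = shape.DimensionsCount() - 1;
  const int depth = shape.Dims(trailing_dim);
  const int outer_size = tflite::FlatSizeSkipDim(shape, trailing_dim);
  for (int i = 0; i < outer_size; ++i) {
    for (int c = 0; c < depth; ++c) {
      out[i * depth + c] = in[i * depth + c];  // placeholder per-element work
    }
  }
}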
+inline int MatchingFlatSizeSkipDim(const RuntimeShape& shape, int skip_dim, + const RuntimeShape& check_shape_0) { + const int dims_count = shape.DimensionsCount(); + for (int i = 0; i < dims_count; ++i) { + if (i != skip_dim) { + TFLITE_DCHECK_EQ(shape.Dims(i), check_shape_0.Dims(i)); + } + } + return FlatSizeSkipDim(shape, skip_dim); +} + +inline int MatchingFlatSizeSkipDim(const RuntimeShape& shape, int skip_dim, + const RuntimeShape& check_shape_0, + const RuntimeShape& check_shape_1) { + const int dims_count = shape.DimensionsCount(); + for (int i = 0; i < dims_count; ++i) { + if (i != skip_dim) { + TFLITE_DCHECK_EQ(shape.Dims(i), check_shape_0.Dims(i)); + } + } + return MatchingFlatSizeSkipDim(shape, skip_dim, check_shape_1); +} + +inline int MatchingFlatSizeSkipDim(const RuntimeShape& shape, int skip_dim, + const RuntimeShape& check_shape_0, + const RuntimeShape& check_shape_1, + const RuntimeShape& check_shape_2) { + const int dims_count = shape.DimensionsCount(); + for (int i = 0; i < dims_count; ++i) { + if (i != skip_dim) { + TFLITE_DCHECK_EQ(shape.Dims(i), check_shape_0.Dims(i)); + } + } + return MatchingFlatSizeSkipDim(shape, skip_dim, check_shape_1, check_shape_2); +} + +inline int MatchingFlatSizeSkipDim(const RuntimeShape& shape, int skip_dim, + const RuntimeShape& check_shape_0, + const RuntimeShape& check_shape_1, + const RuntimeShape& check_shape_2, + const RuntimeShape& check_shape_3) { + const int dims_count = shape.DimensionsCount(); + for (int i = 0; i < dims_count; ++i) { + if (i != skip_dim) { + TFLITE_DCHECK_EQ(shape.Dims(i), check_shape_0.Dims(i)); + } + } + return MatchingFlatSizeSkipDim(shape, skip_dim, check_shape_1, check_shape_2, + check_shape_3); +} + template <int N> bool IsPackedWithoutStrides(const Dims<N>& dims) { int expected_stride = 1; @@ -259,6 +675,344 @@ bool IsPackedWithoutStrides(const Dims<N>& dims) { return true; } +template <int N> +void ComputeStrides(Dims<N>* dims) { + dims->strides[0] = 1; + for (int d = 1; d < N; d++) { + dims->strides[d] = dims->strides[d - 1] * dims->sizes[d - 1]; + } +} + +enum class BroadcastableOpCategory : uint8 { + kNone, + kNonBroadcast, // Matching input shapes. + kFirstInputBroadcastsFast, // Fivefold nested loops. + kSecondInputBroadcastsFast, // Fivefold nested loops. + kGenericBroadcast, // Fall-back. +}; + +struct MinMax { + float min; + float max; +}; +static_assert(sizeof(MinMax) == 8, ""); + +struct ActivationParams { + FusedActivationFunctionType activation_type; + // uint8, etc, activation params. + int32 quantized_activation_min; + int32 quantized_activation_max; +}; + +// For Add, Sub, Mul ops. +struct ArithmeticParams { + // Shape dependent / common to data / op types. + BroadcastableOpCategory broadcast_category; + // uint8 inference params. + int32 input1_offset; + int32 input2_offset; + int32 output_offset; + int32 output_multiplier; + int output_shift; + // Add / Sub, not Mul, uint8 inference params. + int left_shift; + int32 input1_multiplier; + int input1_shift; + int32 input2_multiplier; + int input2_shift; + // uint8, etc, activation params. + int32 quantized_activation_min; + int32 quantized_activation_max; + // float activation params. + float float_activation_min; + float float_activation_max; + + // Processed output dimensions. + // Let input "a" be the one that broadcasts in the faster-changing dimension. + // Then, after coalescing, for shapes {a0, a1, a2, a3, a4} and + // {b0, b1, b2, b3, b4}, + // broadcast_shape[4] = b0 = a0. + // broadcast_shape[3] = b1; a1 = 1. 
+ // broadcast_shape[2] = b2 = a2. + // broadcast_shape[1] = a3; b3 = 1. + // broadcast_shape[0] = b4 = a4. + int broadcast_shape[5]; +}; + +struct ConcatenationParams { + int8 axis; + const int32* input_zeropoint; + const float* input_scale; + uint16 inputs_count; + int32 output_zeropoint; + float output_scale; +}; + +struct ComparisonParams { + // uint8 inference params. + int left_shift; + int32 input1_offset; + int32 input1_multiplier; + int input1_shift; + int32 input2_offset; + int32 input2_multiplier; + int input2_shift; + // Shape dependent / common to inference types. + bool is_broadcast; +}; + +struct ConvParams { + PaddingType padding_type; + PaddingValues padding_values; + // TODO(starka): This was just "stride", so check that width+height is OK. + int16 stride_width; + int16 stride_height; + int16 dilation_width_factor; + int16 dilation_height_factor; + // uint8 inference params. + // TODO(b/65838351): Use smaller types if appropriate. + int32 input_offset; + int32 weights_offset; + int32 output_offset; + int32 output_multiplier; + int output_shift; + // uint8, etc, activation params. + int32 quantized_activation_min; + int32 quantized_activation_max; + // float activation params. + float float_activation_min; + float float_activation_max; +}; + +struct DepthToSpaceParams { + int32 block_size; +}; + +struct DepthwiseParams { + PaddingType padding_type; + PaddingValues padding_values; + int16 stride_width; + int16 stride_height; + int16 dilation_width_factor; + int16 dilation_height_factor; + int16 depth_multiplier; + // uint8 inference params. + // TODO(b/65838351): Use smaller types if appropriate. + int32 input_offset; + int32 weights_offset; + int32 output_offset; + int32 output_multiplier; + int output_shift; + // uint8, etc, activation params. + int32 quantized_activation_min; + int32 quantized_activation_max; + // float activation params. + float float_activation_min; + float float_activation_max; +}; + +struct DequantizationParams { + double scale; + int32 zero_point; +}; + +struct FakeQuantParams { + MinMax minmax; + int32 num_bits; +}; + +struct FullyConnectedParams { + // uint8 inference params. + // TODO(b/65838351): Use smaller types if appropriate. + int32 input_offset; + int32 weights_offset; + int32 output_offset; + int32 output_multiplier; + int output_shift; + // uint8, etc, activation params. + int32 quantized_activation_min; + int32 quantized_activation_max; + // float activation params. + float float_activation_min; + float float_activation_max; + FullyConnectedWeightsFormat weights_format; +}; + +struct GatherParams { + int16 input_rank; + int16 axis; +}; + +struct L2NormalizationParams { + // uint8 inference params. + int32 input_zero_point; +}; + +struct LocalResponseNormalizationParams { + int32 range; + double bias; + double alpha; + double beta; +}; + +struct LogisticParams { + // uint8 inference params. 
+ int32 input_zero_point; + int32 input_range_radius; + int32 input_multiplier; + int input_left_shift; +}; + +struct LstmCellParams { + int32 weights_zero_point; + int32 accum_multiplier; + int accum_shift; + int state_integer_bits; +}; + +struct MeanParams { + int8 axis_count; + int16 axis[4]; +}; + +struct PackParams { + int8 axis; + const int32* input_zeropoint; + const float* input_scale; + uint16 inputs_count; + int32 output_zeropoint; + float output_scale; +}; + +struct PadParams { + int8 left_padding_count; + int32 left_padding[4]; + int8 right_padding_count; + int32 right_padding[4]; +}; + +struct PoolParams { + FusedActivationFunctionType activation; + PaddingType padding_type; + PaddingValues padding_values; + int stride_height; + int stride_width; + int filter_height; + int filter_width; + // uint8, etc, activation params. + int32 quantized_activation_min; + int32 quantized_activation_max; + // float activation params. + float float_activation_min; + float float_activation_max; +}; + +struct ReshapeParams { + int8 shape_count; + int32 shape[4]; +}; + +struct ResizeBilinearParams { + bool align_corners; +}; + +struct SliceParams { + int8 begin_count; + int32 begin[4]; + int8 size_count; + int32 size[4]; +}; + +struct SoftmaxParams { + // beta is not really used (not a Tensorflow parameter) and not implemented + // for LogSoftmax. + double beta; + // uint8 inference params. Used even when beta defaults to 1.0. + int32 input_multiplier; + int32 input_left_shift; + // Reverse scaling is only used by LogSoftmax. + int32 reverse_scaling_divisor; + int32 reverse_scaling_right_shift; + int diff_min; +}; + +struct SpaceToBatchParams { + // "Zero" padding for uint8 means padding with the output offset. + int32 output_offset; +}; + +struct SpaceToDepthParams { + int32 block_size; +}; + +struct SplitParams { + // Graphs that split into, say, 2000 nodes are encountered. The indices in + // OperatorEdges are of type uint16. 
+ uint16 num_split; + int16 axis; +}; + +struct SqueezeParams { + int8 squeeze_dims_count; + int32 squeeze_dims[4]; +}; + +struct StridedSliceParams { + int8 start_indices_count; + int16 start_indices[4]; + int8 stop_indices_count; + int16 stop_indices[4]; + int8 strides_count; + int16 strides[4]; + + int16 begin_mask; + int16 ellipsis_mask; + int16 end_mask; + int16 new_axis_mask; + int16 shrink_axis_mask; +}; + +struct TanhParams { + int32 input_zero_point; + int32 input_range_radius; + int32 input_multiplier; + int input_left_shift; +}; + +struct TransposeParams { + int8 perm_count; + int32 perm[4]; +}; + +struct UnpackParams { + uint16 num_split; + int16 axis; +}; + +template <typename P> +inline void SetActivationParams(float min, float max, P* params) { + params->float_activation_min = min; + params->float_activation_max = max; +} + +template <typename P> +inline void SetActivationParams(int32 min, int32 max, P* params) { + params->quantized_activation_min = min; + params->quantized_activation_max = max; +} + +template <typename P> +inline void GetActivationParams(const P& params, int32* min, int32* max) { + *min = params.quantized_activation_min; + *max = params.quantized_activation_max; +} + +template <typename P> +inline void GetActivationParams(const P& params, float* min, float* max) { + *min = params.float_activation_min; + *max = params.float_activation_max; +} + } // namespace tflite #endif // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_TYPES_H_ diff --git a/tensorflow/contrib/lite/kernels/kernel_util.cc b/tensorflow/contrib/lite/kernels/kernel_util.cc index 239b533a17..503ef28459 100644 --- a/tensorflow/contrib/lite/kernels/kernel_util.cc +++ b/tensorflow/contrib/lite/kernels/kernel_util.cc @@ -37,19 +37,17 @@ TfLiteStatus GetQuantizedConvolutionMultipler(TfLiteContext* context, TF_LITE_ENSURE(context, std::abs(input_product_scale - bias_scale) <= 1e-6 * std::min(input_product_scale, bias_scale)); TF_LITE_ENSURE(context, input_product_scale >= 0); - TF_LITE_ENSURE(context, input_product_scale < output_scale); *multiplier = input_product_scale / output_scale; return kTfLiteOk; } -void CalculateActivationRangeUint8(TfLiteFusedActivation activation, - TfLiteTensor* output, int32_t* act_min, - int32_t* act_max) { - const int32_t qmin = std::numeric_limits<uint8_t>::min(); - const int32_t qmax = std::numeric_limits<uint8_t>::max(); - +namespace { +void CalculateActivationRangeQuantizedImpl(TfLiteFusedActivation activation, + int32_t qmin, int32_t qmax, + TfLiteTensor* output, + int32_t* act_min, int32_t* act_max) { const auto scale = output->params.scale; const auto zero_point = output->params.zero_point; @@ -71,29 +69,47 @@ void CalculateActivationRangeUint8(TfLiteFusedActivation activation, *act_max = qmax; } } - -void CalculateActivationRangeFloat(TfLiteFusedActivation activation, - float* activation_min, - float* activation_max) { - if (activation == kTfLiteActRelu) { - *activation_min = 0.f; - *activation_max = std::numeric_limits<float>::max(); - } else if (activation == kTfLiteActRelu6) { - *activation_min = 0.f; - *activation_max = 6.f; - } else if (activation == kTfLiteActRelu1) { - *activation_min = -1.f; - *activation_max = 1.f; +} // namespace + +TfLiteStatus CalculateActivationRangeQuantized(TfLiteContext* context, + TfLiteFusedActivation activation, + TfLiteTensor* output, + int32_t* act_min, + int32_t* act_max) { + int32_t qmin = 0; + int32_t qmax = 0; + if (output->type == kTfLiteUInt8) { + qmin = std::numeric_limits<uint8_t>::min(); + qmax = 
std::numeric_limits<uint8_t>::max(); + } else if (output->type == kTfLiteInt16) { + qmin = std::numeric_limits<int16_t>::min(); + qmax = std::numeric_limits<int16_t>::max(); } else { - *activation_min = std::numeric_limits<float>::lowest(); - *activation_max = std::numeric_limits<float>::max(); + TF_LITE_ENSURE(context, false); } + + CalculateActivationRangeQuantizedImpl(activation, qmin, qmax, output, act_min, + act_max); + return kTfLiteOk; +} + +void CalculateActivationRangeUint8(TfLiteFusedActivation activation, + TfLiteTensor* output, int32_t* act_min, + int32_t* act_max) { + const int32_t qmin = std::numeric_limits<uint8_t>::min(); + const int32_t qmax = std::numeric_limits<uint8_t>::max(); + + CalculateActivationRangeQuantizedImpl(activation, qmin, qmax, output, act_min, + act_max); } bool HaveSameShapes(const TfLiteTensor* input1, const TfLiteTensor* input2) { return TfLiteIntArrayEqual(input1->dims, input2->dims); } +// TODO(petewarden): Having macros around this is ugly, look at other strategies +// before replicating this approach elsewhere. +#ifndef TF_LITE_STATIC_MEMORY TfLiteStatus CalculateShapeForBroadcast(TfLiteContext* context, const TfLiteTensor* input1, const TfLiteTensor* input2, @@ -112,5 +128,6 @@ TfLiteStatus CalculateShapeForBroadcast(TfLiteContext* context, *output_shape = shape.release(); return kTfLiteOk; } +#endif // TF_LITE_STATIC_MEMORY } // namespace tflite diff --git a/tensorflow/contrib/lite/kernels/kernel_util.h b/tensorflow/contrib/lite/kernels/kernel_util.h index 82cded36f2..e9a5fd7a40 100644 --- a/tensorflow/contrib/lite/kernels/kernel_util.h +++ b/tensorflow/contrib/lite/kernels/kernel_util.h @@ -15,8 +15,11 @@ limitations under the License. #ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_KERNEL_UTIL_H_ #define TENSORFLOW_CONTRIB_LITE_KERNELS_KERNEL_UTIL_H_ -#include "tensorflow/contrib/lite/builtin_op_data.h" -#include "tensorflow/contrib/lite/context.h" +#include <algorithm> +#include <limits> + +#include "tensorflow/contrib/lite/c/builtin_op_data.h" +#include "tensorflow/contrib/lite/c/c_api_internal.h" namespace tflite { @@ -28,6 +31,11 @@ inline const TfLiteTensor* GetInput(TfLiteContext* context, TfLiteNode* node, int index) { return &context->tensors[node->inputs->data[index]]; } +inline TfLiteTensor* GetVariableInput(TfLiteContext* context, TfLiteNode* node, + int index) { + TfLiteTensor* tensor = &context->tensors[node->inputs->data[index]]; + return (tensor->is_variable) ? tensor : nullptr; +} inline TfLiteTensor* GetOutput(TfLiteContext* context, TfLiteNode* node, int index) { return &context->tensors[node->outputs->data[index]]; @@ -86,14 +94,35 @@ TfLiteStatus GetQuantizedConvolutionMultipler(TfLiteContext* context, TfLiteTensor* output, double* multiplier); -// Calculates the useful range of an activation layer given its activation -// tensor. +// Calculates the useful quantized range of an activation layer given its +// activation tensor. 
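CalculateActivationRangeQuantized, declared just below, replaces the hand-rolled uint8 range computation in kernels and also covers int16 outputs. A hedged sketch of how a kernel's Prepare might call it (the wrapper function itself is hypothetical):

#include "tensorflow/contrib/lite/kernels/kernel_util.h"

TfLiteStatus ComputeQuantizedRange(TfLiteContext* context,
                                   TfLiteFusedActivation activation,
                                   TfLiteTensor* output) {
  int32_t act_min = 0;
  int32_t act_max = 0;
  // Returns an error for output types other than uint8/int16.
  TF_LITE_ENSURE_OK(context, tflite::CalculateActivationRangeQuantized(
                                 context, activation, output, &act_min,
                                 &act_max));
  // act_min/act_max now bound the fused-activation output range; quantized
  // kernels clamp their accumulated results to this interval.
  return kTfLiteOk;
}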
+TfLiteStatus CalculateActivationRangeQuantized(TfLiteContext* context, + TfLiteFusedActivation activation, + TfLiteTensor* output, + int32_t* act_min, + int32_t* act_max); void CalculateActivationRangeUint8(TfLiteFusedActivation activation, TfLiteTensor* output, int32_t* act_min, int32_t* act_max); -void CalculateActivationRangeFloat(TfLiteFusedActivation activation, - float* activation_min, - float* activation_max); +// Calculates the useful range of an activation layer given its activation +// tensor.a +template <typename T> +void CalculateActivationRange(TfLiteFusedActivation activation, + T* activation_min, T* activation_max) { + if (activation == kTfLiteActRelu) { + *activation_min = 0; + *activation_max = std::numeric_limits<T>::max(); + } else if (activation == kTfLiteActRelu6) { + *activation_min = 0; + *activation_max = 6; + } else if (activation == kTfLiteActRelu1) { + *activation_min = -1; + *activation_max = 1; + } else { + *activation_min = std::numeric_limits<T>::lowest(); + *activation_max = std::numeric_limits<T>::max(); + } +} // Return true if the given tensors have the same shape. bool HaveSameShapes(const TfLiteTensor* input1, const TfLiteTensor* input2); diff --git a/tensorflow/contrib/lite/kernels/l2norm.cc b/tensorflow/contrib/lite/kernels/l2norm.cc index 7cea63da87..e02d7df9ef 100644 --- a/tensorflow/contrib/lite/kernels/l2norm.cc +++ b/tensorflow/contrib/lite/kernels/l2norm.cc @@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/contrib/lite/builtin_op_data.h" -#include "tensorflow/contrib/lite/context.h" +#include "tensorflow/contrib/lite/c/builtin_op_data.h" +#include "tensorflow/contrib/lite/c/c_api_internal.h" #include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h" #include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h" #include "tensorflow/contrib/lite/kernels/internal/tensor.h" @@ -68,10 +68,12 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { TfLiteTensor* output = GetOutput(context, node, kOutputTensor); if (output->type == kTfLiteFloat32) { -#define TF_LITE_L2NORM(type) \ - type::L2Normalization<FusedActivationFunctionType::kNone>( \ - GetTensorData<float>(input), GetTensorDims(input), \ - GetTensorData<float>(output), GetTensorDims(output)) +#define TF_LITE_L2NORM(type) \ + tflite::L2NormalizationParams op_params; \ + op_params.input_zero_point = 0; \ + type::L2Normalization(op_params, GetTensorShape(input), \ + GetTensorData<float>(input), GetTensorShape(output), \ + GetTensorData<float>(output)) if (kernel_type == kReference) { TF_LITE_L2NORM(reference_ops); @@ -81,10 +83,12 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } #undef TF_LITE_L2NORM } else if (output->type == kTfLiteUInt8) { -#define TF_LITE_L2NORM(type) \ - type::L2Normalization(GetTensorData<uint8>(input), GetTensorDims(input), \ - input->params.zero_point, \ - GetTensorData<uint8>(output), GetTensorDims(output)) +#define TF_LITE_L2NORM(type) \ + tflite::L2NormalizationParams op_params; \ + op_params.input_zero_point = input->params.zero_point; \ + type::L2Normalization(op_params, GetTensorShape(input), \ + GetTensorData<uint8>(input), GetTensorShape(output), \ + GetTensorData<uint8>(output)) if (kernel_type == kReference) { TF_LITE_L2NORM(reference_ops); @@ -94,7 +98,8 @@ 
TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } #undef TF_LITE_L2NORM } else { - context->ReportError(context, "Inputs and outputs not all float types."); + context->ReportError(context, "Output type is %d, requires float.", + output->type); return kTfLiteError; } diff --git a/tensorflow/contrib/lite/kernels/l2norm_test.cc b/tensorflow/contrib/lite/kernels/l2norm_test.cc index 11cc666bad..070ed60040 100644 --- a/tensorflow/contrib/lite/kernels/l2norm_test.cc +++ b/tensorflow/contrib/lite/kernels/l2norm_test.cc @@ -67,7 +67,7 @@ class L2NormOpModel : public SingleOpModel { int output_; }; -TEST(L2NormOpTest, SimpleTest) { +TEST(L2NormOpTest, SimpleFloatTest) { L2NormOpModel m({1, 1, 1, 6}, TensorType_FLOAT32, ActivationFunctionType_NONE); m.SetInput({-1.1, 0.6, 0.7, 1.2, -0.7, 0.1}); @@ -76,7 +76,7 @@ TEST(L2NormOpTest, SimpleTest) { ElementsAreArray({-0.55, 0.3, 0.35, 0.6, -0.35, 0.05})); } -TEST(L2NormOpTest, MultipleBatchesTest) { +TEST(L2NormOpTest, MultipleBatchFloatTest) { L2NormOpModel m({3, 1, 1, 6}, TensorType_FLOAT32, ActivationFunctionType_NONE); m.SetInput({ @@ -105,6 +105,32 @@ TEST(L2NormOpTest, SimpleUint8Test) { ArrayFloatNear({-0.55, 0.3, 0.35, 0.6, -0.35, 0.05}, 0.1))); } +TEST(L2NormOpTest, MultipleBatchUint8Test) { + L2NormOpModel m({3, 1, 1, 6}, TensorType_UINT8, ActivationFunctionType_NONE); + + m.QuantizeAndPopulate<uint8_t>(m.input(), + { + -1.1, 0.6, 0.7, 1.2, -0.7, 0.1, // batch 1 + -1.1, 0.6, 0.7, 1.2, -0.7, 0.1, // batch 2 + -1.1, 0.6, 0.7, 1.2, -0.7, 0.1, // batch 3 + }); + m.Invoke(); + EXPECT_THAT(m.GetOutput<uint8_t>(), + ElementsAreArray({ + 58, 166, 173, 205, 83, 134, // batch 1 + 58, 166, 173, 205, 83, 134, // batch 2 + 58, 166, 173, 205, 83, 134, // batch 3 + })); + EXPECT_THAT(m.GetDequantizedOutput(), + ElementsAreArray(ArrayFloatNear( + { + -0.55, 0.3, 0.35, 0.6, -0.35, 0.05, // batch 1 + -0.55, 0.3, 0.35, 0.6, -0.35, 0.05, // batch 2 + -0.55, 0.3, 0.35, 0.6, -0.35, 0.05, // batch 3 + }, + 0.1))); +} + } // namespace } // namespace tflite diff --git a/tensorflow/contrib/lite/kernels/layer_norm_lstm.cc b/tensorflow/contrib/lite/kernels/layer_norm_lstm.cc new file mode 100644 index 0000000000..9739fd4514 --- /dev/null +++ b/tensorflow/contrib/lite/kernels/layer_norm_lstm.cc @@ -0,0 +1,1316 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Layer Normalization LSTM op that applies normalization by mean and standard +// deviation to the activation of the LSTM layers. Please see +// https://arxiv.org/abs/1607.06450 for details. +#include "flatbuffers/flexbuffers.h" // TF:flatbuffers +#include "tensorflow/contrib/lite/context.h" +#include "tensorflow/contrib/lite/kernels/internal/tensor_utils.h" +#include "tensorflow/contrib/lite/kernels/kernel_util.h" + +namespace tflite { +namespace ops { +namespace custom { +namespace layer_norm_lstm { + +// Struct to hold Layer Norm LSTM option data. 
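layer_norm_lstm is registered as a custom op, so its options arrive in Init() (further below) as a FlexBuffer blob rather than as builtin op data; the OpData struct that follows holds the parsed values. A sketch of how a model builder might serialize matching options, assuming the flexbuffers::Builder API and using placeholder clip values:

#include "flatbuffers/flexbuffers.h"  // TF:flatbuffers

std::vector<uint8_t> BuildLayerNormLstmOptions() {
  flexbuffers::Builder fbb;
  fbb.Map([&]() {
    // Init() below asserts that the activation is "TANH".
    fbb.String("fused_activation_function", "TANH");
    fbb.Float("cell_clip", 0.0f);  // 0 disables cell clipping
    fbb.Float("proj_clip", 0.0f);  // 0 disables projection clipping
  });
  fbb.Finish();
  // This buffer is what Init() receives as (buffer, length).
  return fbb.GetBuffer();
}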
+struct OpData { + TfLiteFusedActivation activation; + float cell_clip; + float proj_clip; + int scratch_tensor_index; +}; + +// Input Tensors of size {n_batch, n_input} +constexpr int kInputTensor = 0; + +// Input weight tensors of size: {n_cell, n_input} +constexpr int kInputToInputWeightsTensor = 1; // Optional +constexpr int kInputToForgetWeightsTensor = 2; +constexpr int kInputToCellWeightsTensor = 3; +constexpr int kInputToOutputWeightsTensor = 4; + +// Recurrent weight tensors of size {n_cell, n_output} +constexpr int kRecurrentToInputWeightsTensor = 5; // Optional +constexpr int kRecurrentToForgetWeightsTensor = 6; +constexpr int kRecurrentToCellWeightsTensor = 7; +constexpr int kRecurrentToOutputWeightsTensor = 8; + +// Peephole weights tensors of size {n_cell}, representing a diagonal matrix. +constexpr int kCellToInputWeightsTensor = 9; // Optional +constexpr int kCellToForgetWeightsTensor = 10; // Optional +constexpr int kCellToOutputWeightsTensor = 11; // Optional + +// Layer norm weights tensors of size {n_cell}, representing a diagonal matrix. +constexpr int kInputLayerNormWeightsTensor = 12; +constexpr int kForgetLayerNormWeightsTensor = 13; +constexpr int kCellLayerNormWeightsTensor = 14; +constexpr int kOutputLayerNormWeightsTensor = 15; + +// Gates bias tensors of size {n_cell} +constexpr int kInputGateBiasTensor = 16; // Optional +constexpr int kForgetGateBiasTensor = 17; +constexpr int kCellGateBiasTensor = 18; +constexpr int kOutputGateBiasTensor = 19; + +// Projection weight tensor of size {n_output, n_cell} +constexpr int kProjectionWeightsTensor = 20; // Optional +// Projection bias tensor of size {n_output} +constexpr int kProjectionBiasTensor = 21; // Optional + +// State tensors. +constexpr int kInputActivationStateTensor = 22; +constexpr int kInputCellStateTensor = 23; + +// Output tensor. +constexpr int kOutputTensor = 0; + +// Total number of scratch tensors for hybrid Op. +constexpr int kTensorsToAdd = 7; + +// Small float to avoid divergence during calculation of deviation. +const float kLayerNormEpsilon = 1e-8; + +void* Init(TfLiteContext* context, const char* buffer, size_t length) { + auto* data = new OpData; + + // Turn custom option data into flexbuffer map format. + const uint8_t* buffer_t = reinterpret_cast<const uint8_t*>(buffer); + const flexbuffers::Map& m = flexbuffers::GetRoot(buffer_t, length).AsMap(); + + // Get activation function, cell_clip and proj_clip from the flexbuffer. + // TODO(b/113824099): make activation more generic. + assert(m["fused_activation_function"].ToString() == "TANH"); + data->activation = kTfLiteActTanh; + data->cell_clip = m["cell_clip"].AsFloat(); + data->proj_clip = m["proj_clip"].AsFloat(); + + // Populate scratch_tensor_index. + context->AddTensors(context, /*tensors_to_add=*/kTensorsToAdd, + &data->scratch_tensor_index); + return data; +} + +// Check that input tensor dimensions matches with each other. +TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context, + TfLiteNode* node, int n_input, + int n_output, int n_cell) { + const OpData* op_data = reinterpret_cast<OpData*>(node->user_data); + + // Making sure clipping parameters have valid values. 
+ // == 0 means no clipping + // > 0 means clipping + TF_LITE_ENSURE(context, op_data->cell_clip >= 0); + TF_LITE_ENSURE(context, op_data->proj_clip >= 0); + + const TfLiteTensor* input_to_input_weights = + GetOptionalInputTensor(context, node, kInputToInputWeightsTensor); + if (input_to_input_weights != nullptr) { + TF_LITE_ENSURE_EQ(context, input_to_input_weights->dims->size, 2); + TF_LITE_ENSURE_EQ(context, input_to_input_weights->dims->data[0], n_cell); + TF_LITE_ENSURE_EQ(context, input_to_input_weights->dims->data[1], n_input); + } + + const TfLiteTensor* input_to_forget_weights = + GetInput(context, node, kInputToForgetWeightsTensor); + TF_LITE_ENSURE_EQ(context, input_to_forget_weights->dims->size, 2); + TF_LITE_ENSURE_EQ(context, input_to_forget_weights->dims->data[0], n_cell); + TF_LITE_ENSURE_EQ(context, input_to_forget_weights->dims->data[1], n_input); + + const TfLiteTensor* input_to_cell_weights = + GetInput(context, node, kInputToCellWeightsTensor); + TF_LITE_ENSURE_EQ(context, input_to_cell_weights->dims->size, 2); + TF_LITE_ENSURE_EQ(context, input_to_cell_weights->dims->data[0], n_cell); + TF_LITE_ENSURE_EQ(context, input_to_cell_weights->dims->data[1], n_input); + + const TfLiteTensor* recurrent_to_input_weights = + GetOptionalInputTensor(context, node, kRecurrentToInputWeightsTensor); + if (recurrent_to_input_weights != nullptr) { + TF_LITE_ENSURE_EQ(context, recurrent_to_input_weights->dims->size, 2); + TF_LITE_ENSURE_EQ(context, recurrent_to_input_weights->dims->data[0], + n_cell); + TF_LITE_ENSURE_EQ(context, recurrent_to_input_weights->dims->data[1], + n_output); + } + + const TfLiteTensor* recurrent_to_forget_weights = + GetInput(context, node, kRecurrentToForgetWeightsTensor); + TF_LITE_ENSURE_EQ(context, recurrent_to_forget_weights->dims->size, 2); + TF_LITE_ENSURE_EQ(context, recurrent_to_forget_weights->dims->data[0], + n_cell); + TF_LITE_ENSURE_EQ(context, recurrent_to_forget_weights->dims->data[1], + n_output); + + const TfLiteTensor* recurrent_to_cell_weights = + GetInput(context, node, kRecurrentToCellWeightsTensor); + TF_LITE_ENSURE_EQ(context, recurrent_to_cell_weights->dims->size, 2); + TF_LITE_ENSURE_EQ(context, recurrent_to_cell_weights->dims->data[0], n_cell); + TF_LITE_ENSURE_EQ(context, recurrent_to_cell_weights->dims->data[1], + n_output); + + // We make sure the input-gate's parameters are either both present (regular + // LSTM) or not at all (CIFG-LSTM). 
+ const bool cifg_weights_all_or_none = + ((input_to_input_weights != nullptr) && + (recurrent_to_input_weights != nullptr)) || + ((input_to_input_weights == nullptr) && + (recurrent_to_input_weights == nullptr)); + TF_LITE_ENSURE(context, cifg_weights_all_or_none == true); + + const TfLiteTensor* cell_to_input_weights = + GetOptionalInputTensor(context, node, kCellToInputWeightsTensor); + if (cell_to_input_weights) { + TF_LITE_ENSURE_EQ(context, cell_to_input_weights->dims->size, 1); + TF_LITE_ENSURE_EQ(context, cell_to_input_weights->dims->data[0], n_cell); + } + + const TfLiteTensor* cell_to_forget_weights = + GetOptionalInputTensor(context, node, kCellToForgetWeightsTensor); + if (cell_to_forget_weights) { + TF_LITE_ENSURE_EQ(context, cell_to_forget_weights->dims->size, 1); + TF_LITE_ENSURE_EQ(context, cell_to_forget_weights->dims->data[0], n_cell); + } + + const TfLiteTensor* cell_to_output_weights = + GetOptionalInputTensor(context, node, kCellToOutputWeightsTensor); + if (cell_to_output_weights) { + TF_LITE_ENSURE_EQ(context, cell_to_output_weights->dims->size, 1); + TF_LITE_ENSURE_EQ(context, cell_to_output_weights->dims->data[0], n_cell); + } + + // Making sure the peephole weights are there all or none. + const bool use_cifg = (input_to_input_weights == nullptr); + const bool peephole_weights_all_or_none = + ((cell_to_input_weights != nullptr || use_cifg) && + (cell_to_forget_weights != nullptr) && + (cell_to_output_weights != nullptr)) || + ((cell_to_input_weights == nullptr) && + (cell_to_forget_weights == nullptr) && + (cell_to_output_weights == nullptr)); + TF_LITE_ENSURE(context, peephole_weights_all_or_none == true); + + // Making sure layer norm weights are not null and have the right dimension. + const TfLiteTensor* input_layer_norm_weights = + GetInput(context, node, kInputLayerNormWeightsTensor); + TF_LITE_ENSURE(context, input_layer_norm_weights != nullptr); + TF_LITE_ENSURE_EQ(context, input_layer_norm_weights->dims->size, 1); + TF_LITE_ENSURE_EQ(context, input_layer_norm_weights->dims->data[0], n_cell); + + const TfLiteTensor* forget_layer_norm_weights = + GetInput(context, node, kForgetLayerNormWeightsTensor); + TF_LITE_ENSURE(context, forget_layer_norm_weights != nullptr); + TF_LITE_ENSURE_EQ(context, forget_layer_norm_weights->dims->size, 1); + TF_LITE_ENSURE_EQ(context, forget_layer_norm_weights->dims->data[0], n_cell); + + const TfLiteTensor* cell_layer_norm_weights = + GetInput(context, node, kCellLayerNormWeightsTensor); + TF_LITE_ENSURE(context, cell_layer_norm_weights != nullptr); + TF_LITE_ENSURE_EQ(context, cell_layer_norm_weights->dims->size, 1); + TF_LITE_ENSURE_EQ(context, cell_layer_norm_weights->dims->data[0], n_cell); + + const TfLiteTensor* output_layer_norm_weights = + GetInput(context, node, kOutputLayerNormWeightsTensor); + TF_LITE_ENSURE(context, output_layer_norm_weights != nullptr); + TF_LITE_ENSURE_EQ(context, output_layer_norm_weights->dims->size, 1); + TF_LITE_ENSURE_EQ(context, output_layer_norm_weights->dims->data[0], n_cell); + + // Make sure the input gate bias is present only when not a CIFG-LSTM. 
+ const TfLiteTensor* input_gate_bias = + GetOptionalInputTensor(context, node, kInputGateBiasTensor); + if (use_cifg) { + TF_LITE_ENSURE_EQ(context, input_gate_bias, nullptr); + } else { + TF_LITE_ENSURE_EQ(context, input_gate_bias->dims->size, 1); + TF_LITE_ENSURE_EQ(context, input_gate_bias->dims->data[0], n_cell); + } + + const TfLiteTensor* forget_gate_bias = + GetInput(context, node, kForgetGateBiasTensor); + TF_LITE_ENSURE_EQ(context, forget_gate_bias->dims->size, 1); + TF_LITE_ENSURE_EQ(context, forget_gate_bias->dims->data[0], n_cell); + + const TfLiteTensor* cell_bias = GetInput(context, node, kCellGateBiasTensor); + TF_LITE_ENSURE_EQ(context, cell_bias->dims->size, 1); + TF_LITE_ENSURE_EQ(context, cell_bias->dims->data[0], n_cell); + + const TfLiteTensor* output_gate_bias = + GetInput(context, node, kOutputGateBiasTensor); + TF_LITE_ENSURE_EQ(context, output_gate_bias->dims->size, 1); + TF_LITE_ENSURE_EQ(context, output_gate_bias->dims->data[0], n_cell); + + const TfLiteTensor* projection_weights = + GetOptionalInputTensor(context, node, kProjectionWeightsTensor); + if (projection_weights != nullptr) { + TF_LITE_ENSURE_EQ(context, projection_weights->dims->size, 2); + TF_LITE_ENSURE_EQ(context, projection_weights->dims->data[0], n_output); + TF_LITE_ENSURE_EQ(context, projection_weights->dims->data[1], n_cell); + } + + const TfLiteTensor* projection_bias = + GetOptionalInputTensor(context, node, kProjectionBiasTensor); + if (projection_bias != nullptr) { + TF_LITE_ENSURE_EQ(context, projection_bias->dims->size, 1); + TF_LITE_ENSURE_EQ(context, projection_bias->dims->data[0], n_output); + } + + // Making sure the projection tensors are consistent: + // 1) If projection weight is not present, then projection bias should not be + // present. + // 2) If projection weight is present, then projection bias is optional. + const bool projection_tensors_consistent = + ((projection_weights != nullptr) || (projection_bias == nullptr)); + TF_LITE_ENSURE(context, projection_tensors_consistent == true); + + return kTfLiteOk; +} + +// Resize the output, state tensors based on the sizes of the input tensors. +// Allocate a temporary scratch tensor. Also check that the sizes of the input +// tensors match each other. +TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { + OpData* op_data = reinterpret_cast<OpData*>(node->user_data); + TF_LITE_ENSURE_EQ(context, node->inputs->size, 24); + TF_LITE_ENSURE_EQ(context, node->outputs->size, 1); + + // Inferring batch size, number of outputs and number of cells from the + // input tensors. 
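Prepare, which begins above, first derives every size from the tensors themselves. A worked example with invented dimensions may help:

// Suppose input has dims {2, 5}, input_to_output_weights has dims {4, 5} and
// recurrent_to_output_weights has dims {4, 3}. Then the code below infers
//   n_batch  = 2   (input dim 0)
//   n_input  = 5   (input dim 1)
//   n_cell   = 4   (rows of the input-to-output weights)
//   n_output = 3   (columns of the recurrent-to-output weights)
// and resizes the output tensor to {n_batch, n_output} = {2, 3}.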
+ const TfLiteTensor* input = GetInput(context, node, kInputTensor); + TF_LITE_ENSURE_EQ(context, input->type, kTfLiteFloat32); + TF_LITE_ENSURE(context, input->dims->size > 1); + const int n_batch = input->dims->data[0]; + const int n_input = input->dims->data[1]; + + const TfLiteTensor* input_to_output_weights = + GetInput(context, node, kInputToOutputWeightsTensor); + const int n_cell = input_to_output_weights->dims->data[0]; + TF_LITE_ENSURE_EQ(context, input_to_output_weights->dims->size, 2); + TF_LITE_ENSURE_EQ(context, input_to_output_weights->dims->data[1], n_input); + + const TfLiteTensor* recurrent_to_output_weights = + GetInput(context, node, kRecurrentToOutputWeightsTensor); + TF_LITE_ENSURE_EQ(context, recurrent_to_output_weights->dims->size, 2); + TF_LITE_ENSURE_EQ(context, recurrent_to_output_weights->dims->data[0], + n_cell); + const int n_output = recurrent_to_output_weights->dims->data[1]; + + // Check that input tensor dimensions matches with each other. + TF_LITE_ENSURE_OK(context, CheckInputTensorDimensions(context, node, n_input, + n_output, n_cell)); + + // Get the pointer to output, activation_state and cell_state tensors. + TfLiteTensor* output = GetOutput(context, node, kOutputTensor); + + const TfLiteTensor* activation_state = + GetInput(context, node, kInputActivationStateTensor); + const TfLiteTensor* cell_state = + GetInput(context, node, kInputCellStateTensor); + + // Check the shape of input state tensors. + // These tensor may be 1D or 2D. It's fine as long as the total size is + // correct. + TF_LITE_ENSURE_EQ(context, NumElements(activation_state), n_batch * n_output); + TF_LITE_ENSURE_EQ(context, NumElements(cell_state), n_batch * n_cell); + // Resize the output tensors. + TfLiteIntArray* output_size = TfLiteIntArrayCreate(2); + output_size->data[0] = n_batch; + output_size->data[1] = n_output; + TF_LITE_ENSURE_OK(context, + context->ResizeTensor(context, output, output_size)); + + // The weights are of consistent type, so it suffices to check one. + const bool is_hybrid_op = (input_to_output_weights->type == kTfLiteUInt8 && + input->type == kTfLiteFloat32); + + TfLiteIntArrayFree(node->temporaries); + if (is_hybrid_op) { + node->temporaries = TfLiteIntArrayCreate(7); + } else { + node->temporaries = TfLiteIntArrayCreate(1); + } + node->temporaries->data[0] = op_data->scratch_tensor_index; + + // Create a scratch buffer tensor. + TfLiteTensor* scratch_buffer = GetTemporary(context, node, /*index=*/0); + scratch_buffer->type = input->type; + scratch_buffer->allocation_type = kTfLiteArenaRw; + + const TfLiteTensor* input_to_input_weights = + GetOptionalInputTensor(context, node, kInputToInputWeightsTensor); + const bool use_cifg = (input_to_input_weights == nullptr); + TfLiteIntArray* scratch_buffer_size = TfLiteIntArrayCreate(2); + scratch_buffer_size->data[0] = n_batch; + if (use_cifg) { + // Reserving space for Cell, Forget, Output gates + scratch_buffer_size->data[1] = n_cell * 3; + } else { + // Reserving space for Input, Cell, Forget, Output gates + scratch_buffer_size->data[1] = n_cell * 4; + } + TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, scratch_buffer, + scratch_buffer_size)); + + if (is_hybrid_op) { + // Allocate temporary tensors to store quantized values of input, + // activation_state and cell_state tensors. 
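The hybrid path requests kTensorsToAdd (7) temporaries in total. As a reading aid, here is a hypothetical enumeration (not part of this patch) of how the temporary indices handed out above and below line up:

enum HybridLstmTemporary {
  kScratchBuffer = 0,            // gate pre-activations, n_batch x (3 or 4) * n_cell
  kInputQuantized = 1,           // uint8 copy of the float input
  kActivationStateQuantized = 2,
  kCellStateQuantized = 3,
  kScalingFactors = 4,           // one quantization scale per batch row
  kProdScalingFactors = 5,       // input scale x weight scale, per batch row
  kRecoveredWeights = 6,         // n_cell floats for dequantized diagonal weights
};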
+ node->temporaries->data[1] = op_data->scratch_tensor_index + 1; + TfLiteTensor* input_quantized = GetTemporary(context, node, /*index=*/1); + input_quantized->type = kTfLiteUInt8; + input_quantized->allocation_type = kTfLiteArenaRw; + if (!TfLiteIntArrayEqual(input_quantized->dims, input->dims)) { + TfLiteIntArray* input_quantized_size = TfLiteIntArrayCopy(input->dims); + TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, input_quantized, + input_quantized_size)); + } + node->temporaries->data[2] = op_data->scratch_tensor_index + 2; + TfLiteTensor* activation_state_quantized = + GetTemporary(context, node, /*index=*/2); + activation_state_quantized->type = kTfLiteUInt8; + activation_state_quantized->allocation_type = kTfLiteArenaRw; + if (!TfLiteIntArrayEqual(activation_state_quantized->dims, + activation_state->dims)) { + TfLiteIntArray* activation_state_quantized_size = + TfLiteIntArrayCopy(activation_state->dims); + TF_LITE_ENSURE_OK( + context, context->ResizeTensor(context, activation_state_quantized, + activation_state_quantized_size)); + } + node->temporaries->data[3] = op_data->scratch_tensor_index + 3; + TfLiteTensor* cell_state_quantized = + GetTemporary(context, node, /*index=*/3); + cell_state_quantized->type = kTfLiteUInt8; + cell_state_quantized->allocation_type = kTfLiteArenaRw; + if (!TfLiteIntArrayEqual(cell_state_quantized->dims, cell_state->dims)) { + TfLiteIntArray* cell_state_quantized_size = + TfLiteIntArrayCopy(cell_state->dims); + TF_LITE_ENSURE_OK(context, + context->ResizeTensor(context, cell_state_quantized, + cell_state_quantized_size)); + } + + // Allocate temporary tensors to store scaling factors and product scaling + // factors. The latter is a convenience storage which allows to quantize + // a vector once (which produces the scaling factors) and multiply it with + // different matrices (which requires multiplying the scaling factors with + // the scaling factor of the matrix). + node->temporaries->data[4] = op_data->scratch_tensor_index + 4; + TfLiteTensor* scaling_factors = GetTemporary(context, node, /*index=*/4); + scaling_factors->type = kTfLiteFloat32; + scaling_factors->allocation_type = kTfLiteArenaRw; + TfLiteIntArray* scaling_factors_size = TfLiteIntArrayCreate(1); + scaling_factors_size->data[0] = n_batch; + if (!TfLiteIntArrayEqual(scaling_factors->dims, scaling_factors_size)) { + TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, scaling_factors, + scaling_factors_size)); + } + node->temporaries->data[5] = op_data->scratch_tensor_index + 5; + TfLiteTensor* prod_scaling_factors = + GetTemporary(context, node, /*index=*/5); + prod_scaling_factors->type = kTfLiteFloat32; + prod_scaling_factors->allocation_type = kTfLiteArenaRw; + TfLiteIntArray* prod_scaling_factors_size = TfLiteIntArrayCreate(1); + prod_scaling_factors_size->data[0] = n_batch; + if (!TfLiteIntArrayEqual(prod_scaling_factors->dims, + prod_scaling_factors_size)) { + TF_LITE_ENSURE_OK(context, + context->ResizeTensor(context, prod_scaling_factors, + prod_scaling_factors_size)); + } + + // Allocate a temporary tensor to store the recovered weights. Since + // this is used for diagonal matrices, only need to store n_cell values. 
+ node->temporaries->data[6] = op_data->scratch_tensor_index + 6; + TfLiteTensor* recovered_weights = GetTemporary(context, node, /*index=*/6); + recovered_weights->type = kTfLiteFloat32; + recovered_weights->allocation_type = kTfLiteArenaRw; + TfLiteIntArray* recovered_weights_size = TfLiteIntArrayCreate(1); + recovered_weights_size->data[0] = n_cell; + if (!TfLiteIntArrayEqual(recovered_weights->dims, recovered_weights_size)) { + TF_LITE_ENSURE_OK(context, + context->ResizeTensor(context, recovered_weights, + recovered_weights_size)); + } + } + return kTfLiteOk; +} + +void LayerNormLstmStep( + const float* input_ptr_batch, const float* input_to_input_weights_ptr, + const float* input_to_forget_weights_ptr, + const float* input_to_cell_weights_ptr, + const float* input_to_output_weights_ptr, + const float* recurrent_to_input_weights_ptr, + const float* recurrent_to_forget_weights_ptr, + const float* recurrent_to_cell_weights_ptr, + const float* recurrent_to_output_weights_ptr, + const float* cell_to_input_weights_ptr, + const float* cell_to_forget_weights_ptr, + const float* cell_to_output_weights_ptr, + const float* input_layer_norm_weight_ptr, + const float* forget_layer_norm_weight_ptr, + const float* cell_layer_norm_weight_ptr, + const float* output_layer_norm_weight_ptr, const float* input_gate_bias_ptr, + const float* forget_gate_bias_ptr, const float* cell_bias_ptr, + const float* output_gate_bias_ptr, const float* projection_weights_ptr, + const float* projection_bias_ptr, float cell_clip, float proj_clip, + const TfLiteFusedActivation& activation, int n_batch, int n_cell, + int n_input, int n_output, float* output_state_ptr, float* cell_state_ptr, + float* input_gate_scratch, float* forget_gate_scratch, float* cell_scratch, + float* output_gate_scratch, float* output_ptr_batch) { + // Since we have already checked that weights are all there or none, we can + // check the existense of only one to the get the condition. + const bool use_cifg = (input_to_input_weights_ptr == nullptr); + const bool use_peephole = (cell_to_output_weights_ptr != nullptr); + + // Initialize scratch buffers with 0. + if (!use_cifg) { + tensor_utils::ZeroVector(input_gate_scratch, n_cell * n_batch); + } + tensor_utils::ZeroVector(forget_gate_scratch, n_cell * n_batch); + tensor_utils::ZeroVector(cell_scratch, n_cell * n_batch); + tensor_utils::ZeroVector(output_gate_scratch, n_cell * n_batch); + + // For each batch and cell: compute input_weight * input. + if (!use_cifg) { + tensor_utils::MatrixBatchVectorMultiplyAccumulate( + input_to_input_weights_ptr, n_cell, n_input, input_ptr_batch, n_batch, + input_gate_scratch, /*result_stride=*/1); + } + + tensor_utils::MatrixBatchVectorMultiplyAccumulate( + input_to_forget_weights_ptr, n_cell, n_input, input_ptr_batch, n_batch, + forget_gate_scratch, /*result_stride=*/1); + tensor_utils::MatrixBatchVectorMultiplyAccumulate( + input_to_cell_weights_ptr, n_cell, n_input, input_ptr_batch, n_batch, + cell_scratch, /*result_stride=*/1); + tensor_utils::MatrixBatchVectorMultiplyAccumulate( + input_to_output_weights_ptr, n_cell, n_input, input_ptr_batch, n_batch, + output_gate_scratch, /*result_stride=*/1); + + // For each batch and cell: compute recurrent_weight * output_state. 
+ if (!use_cifg) { + tensor_utils::MatrixBatchVectorMultiplyAccumulate( + recurrent_to_input_weights_ptr, n_cell, n_output, output_state_ptr, + n_batch, input_gate_scratch, /*result_stride=*/1); + } + tensor_utils::MatrixBatchVectorMultiplyAccumulate( + recurrent_to_forget_weights_ptr, n_cell, n_output, output_state_ptr, + n_batch, forget_gate_scratch, + /*result_stride=*/1); + tensor_utils::MatrixBatchVectorMultiplyAccumulate( + recurrent_to_cell_weights_ptr, n_cell, n_output, output_state_ptr, + n_batch, cell_scratch, /*result_stride=*/1); + tensor_utils::MatrixBatchVectorMultiplyAccumulate( + recurrent_to_output_weights_ptr, n_cell, n_output, output_state_ptr, + n_batch, output_gate_scratch, + /*result_stride=*/1); + + // For each batch and cell: update input gate. + if (!use_cifg) { + if (use_peephole) { + tensor_utils::VectorBatchVectorCwiseProductAccumulate( + cell_to_input_weights_ptr, n_cell, cell_state_ptr, n_batch, + input_gate_scratch); + } + tensor_utils::MeanStddevNormalization(input_gate_scratch, + input_gate_scratch, n_cell, n_batch, + kLayerNormEpsilon); + tensor_utils::VectorBatchVectorCwiseProduct(input_layer_norm_weight_ptr, + n_cell, input_gate_scratch, + n_batch, input_gate_scratch); + tensor_utils::VectorBatchVectorAdd(input_gate_bias_ptr, n_cell, n_batch, + input_gate_scratch); + tensor_utils::ApplySigmoidToVector(input_gate_scratch, n_cell * n_batch, + input_gate_scratch); + } + + // For each batch and cell: update forget gate. + if (use_peephole) { + tensor_utils::VectorBatchVectorCwiseProductAccumulate( + cell_to_forget_weights_ptr, n_cell, cell_state_ptr, n_batch, + forget_gate_scratch); + } + tensor_utils::MeanStddevNormalization(forget_gate_scratch, + forget_gate_scratch, n_cell, n_batch, + kLayerNormEpsilon); + tensor_utils::VectorBatchVectorCwiseProduct(forget_layer_norm_weight_ptr, + n_cell, forget_gate_scratch, + n_batch, forget_gate_scratch); + tensor_utils::VectorBatchVectorAdd(forget_gate_bias_ptr, n_cell, n_batch, + forget_gate_scratch); + tensor_utils::ApplySigmoidToVector(forget_gate_scratch, n_cell * n_batch, + forget_gate_scratch); + + // For each batch and cell: update the cell. + tensor_utils::MeanStddevNormalization(cell_scratch, cell_scratch, n_cell, + n_batch, kLayerNormEpsilon); + tensor_utils::VectorBatchVectorCwiseProduct( + cell_layer_norm_weight_ptr, n_cell, cell_scratch, n_batch, cell_scratch); + tensor_utils::VectorBatchVectorAdd(cell_bias_ptr, n_cell, n_batch, + cell_scratch); + tensor_utils::VectorVectorCwiseProduct(forget_gate_scratch, cell_state_ptr, + n_batch * n_cell, cell_state_ptr); + tensor_utils::ApplyActivationToVector(cell_scratch, n_batch * n_cell, + activation, cell_scratch); + if (use_cifg) { + tensor_utils::Sub1Vector(forget_gate_scratch, n_batch * n_cell, + forget_gate_scratch); + tensor_utils::VectorVectorCwiseProductAccumulate( + cell_scratch, forget_gate_scratch, n_batch * n_cell, cell_state_ptr); + } else { + tensor_utils::VectorVectorCwiseProductAccumulate( + cell_scratch, input_gate_scratch, n_batch * n_cell, cell_state_ptr); + } + if (cell_clip > 0.0) { + tensor_utils::ClipVector(cell_state_ptr, n_batch * n_cell, cell_clip, + cell_state_ptr); + } + + // For each batch and cell: update the output gate. 
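Each gate in the float path above follows the same shape: accumulate the input, recurrent, and optional peephole contributions, normalize across the cell dimension, scale by the per-gate layer-norm weights, add the gate bias, and squash. Only the nonlinearity differs: the input, forget, and output gates use a sigmoid, while the cell candidate goes through the configured activation (tanh in the tests). The sketch below restates the recipe for a single batch row; it is illustrative only, and the exact epsilon handling inside tensor_utils::MeanStddevNormalization may differ in detail.

#include <cmath>

// One batch row of n_cell gate pre-activations, updated in place:
// normalize -> scale by gamma (layer-norm weights) -> add beta (bias) -> sigmoid.
void LayerNormGateRow(float* gate, const float* gamma, const float* beta,
                      int n_cell, float epsilon) {
  float mean = 0.0f;
  for (int i = 0; i < n_cell; ++i) mean += gate[i];
  mean /= n_cell;
  float variance = 0.0f;
  for (int i = 0; i < n_cell; ++i) {
    variance += (gate[i] - mean) * (gate[i] - mean);
  }
  variance /= n_cell;
  const float inv_stddev = 1.0f / std::sqrt(variance + epsilon);
  for (int i = 0; i < n_cell; ++i) {
    const float normalized = (gate[i] - mean) * inv_stddev;
    gate[i] = 1.0f / (1.0f + std::exp(-(normalized * gamma[i] + beta[i])));
  }
}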
+ if (use_peephole) { + tensor_utils::VectorBatchVectorCwiseProductAccumulate( + cell_to_output_weights_ptr, n_cell, cell_state_ptr, n_batch, + output_gate_scratch); + } + tensor_utils::MeanStddevNormalization(output_gate_scratch, + output_gate_scratch, n_cell, n_batch, + kLayerNormEpsilon); + tensor_utils::VectorBatchVectorCwiseProduct(output_layer_norm_weight_ptr, + n_cell, output_gate_scratch, + n_batch, output_gate_scratch); + tensor_utils::VectorBatchVectorAdd(output_gate_bias_ptr, n_cell, n_batch, + output_gate_scratch); + tensor_utils::ApplySigmoidToVector(output_gate_scratch, n_batch * n_cell, + output_gate_scratch); + tensor_utils::ApplyActivationToVector(cell_state_ptr, n_batch * n_cell, + activation, cell_scratch); + tensor_utils::VectorVectorCwiseProduct(output_gate_scratch, cell_scratch, + n_batch * n_cell, output_gate_scratch); + + // For each batch: update the projection and output_state. + const bool use_projection_weight = (projection_weights_ptr != nullptr); + const bool use_projection_bias = (projection_bias_ptr != nullptr); + if (use_projection_weight) { + if (use_projection_bias) { + tensor_utils::VectorBatchVectorAssign(projection_bias_ptr, n_output, + n_batch, output_ptr_batch); + } else { + tensor_utils::ZeroVector(output_ptr_batch, n_batch * n_output); + } + tensor_utils::MatrixBatchVectorMultiplyAccumulate( + projection_weights_ptr, n_output, n_cell, output_gate_scratch, n_batch, + output_ptr_batch, /*result_stride=*/1); + if (proj_clip > 0.0) { + tensor_utils::ClipVector(output_ptr_batch, n_batch * n_output, proj_clip, + output_ptr_batch); + } + } else { + tensor_utils::CopyVector(output_gate_scratch, n_batch * n_output, + output_ptr_batch); + } + tensor_utils::CopyVector(output_ptr_batch, n_batch * n_output, + output_state_ptr); +} + +void LayerNormLstmStep( + const float* input_ptr_batch, const int8_t* input_to_input_weights_ptr, + float input_to_input_weights_scale, + const int8_t* input_to_forget_weights_ptr, + float input_to_forget_weights_scale, + const int8_t* input_to_cell_weights_ptr, float input_to_cell_weights_scale, + const int8_t* input_to_output_weights_ptr, + float input_to_output_weights_scale, + const int8_t* recurrent_to_input_weights_ptr, + float recurrent_to_input_weights_scale, + const int8_t* recurrent_to_forget_weights_ptr, + float recurrent_to_forget_weights_scale, + const int8_t* recurrent_to_cell_weights_ptr, + float recurrent_to_cell_weights_scale, + const int8_t* recurrent_to_output_weights_ptr, + float recurrent_to_output_weights_scale, + const int8_t* cell_to_input_weights_ptr, float cell_to_input_weights_scale, + const int8_t* cell_to_forget_weights_ptr, + float cell_to_forget_weights_scale, + const int8_t* cell_to_output_weights_ptr, + float cell_to_output_weights_scale, + const float* input_layer_norm_weight_ptr, + const float* forget_layer_norm_weight_ptr, + const float* cell_layer_norm_weight_ptr, + const float* output_layer_norm_weight_ptr, const float* input_gate_bias_ptr, + const float* forget_gate_bias_ptr, const float* cell_bias_ptr, + const float* output_gate_bias_ptr, const int8_t* projection_weights_ptr, + float projection_weights_scale, const float* projection_bias_ptr, + float cell_clip, float proj_clip, const TfLiteFusedActivation& activation, + int n_batch, int n_cell, int n_input, int n_output, + float* input_gate_scratch, float* forget_gate_scratch, float* cell_scratch, + float* output_gate_scratch, float* scaling_factors, + float* product_scaling_factors, float* recovered_weights, + int8_t* 
quantized_input_ptr_batch, int8_t* quantized_output_state_ptr,
+ int8_t* quantized_cell_state_ptr, float* output_state_ptr,
+ float* cell_state_ptr, float* output_ptr_batch) {
+ // Since we have already checked that weights are all there or none, we can
+ // check the existence of only one to get the condition.
+ const bool use_cifg = (input_to_input_weights_ptr == nullptr);
+ const bool use_peephole = (cell_to_output_weights_ptr != nullptr);
+
+ // Initialize scratch buffers with 0.
+ if (!use_cifg) {
+ tensor_utils::ZeroVector(input_gate_scratch, n_cell * n_batch);
+ }
+ tensor_utils::ZeroVector(forget_gate_scratch, n_cell * n_batch);
+ tensor_utils::ZeroVector(cell_scratch, n_cell * n_batch);
+ tensor_utils::ZeroVector(output_gate_scratch, n_cell * n_batch);
+
+ if (!tensor_utils::IsZeroVector(input_ptr_batch, n_batch * n_input)) {
+ // Save quantization and matmul computation for all zero input.
+ float unused_min, unused_max;
+ for (int b = 0; b < n_batch; ++b) {
+ const int offset = b * n_input;
+ tensor_utils::SymmetricQuantizeFloats(
+ input_ptr_batch + offset, n_input, quantized_input_ptr_batch + offset,
+ &unused_min, &unused_max, &scaling_factors[b]);
+ }
+ // For each batch and cell: compute input_weight * input.
+ if (!use_cifg) {
+ for (int b = 0; b < n_batch; ++b) {
+ product_scaling_factors[b] =
+ scaling_factors[b] * input_to_input_weights_scale;
+ }
+ tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+ input_to_input_weights_ptr, n_cell, n_input,
+ quantized_input_ptr_batch, product_scaling_factors, n_batch,
+ input_gate_scratch, /*result_stride=*/1);
+ }
+
+ for (int b = 0; b < n_batch; ++b) {
+ product_scaling_factors[b] =
+ scaling_factors[b] * input_to_forget_weights_scale;
+ }
+ tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+ input_to_forget_weights_ptr, n_cell, n_input, quantized_input_ptr_batch,
+ product_scaling_factors, n_batch, forget_gate_scratch,
+ /*result_stride=*/1);
+
+ for (int b = 0; b < n_batch; ++b) {
+ product_scaling_factors[b] =
+ scaling_factors[b] * input_to_cell_weights_scale;
+ }
+ tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+ input_to_cell_weights_ptr, n_cell, n_input, quantized_input_ptr_batch,
+ product_scaling_factors, n_batch, cell_scratch, /*result_stride=*/1);
+
+ for (int b = 0; b < n_batch; ++b) {
+ product_scaling_factors[b] =
+ scaling_factors[b] * input_to_output_weights_scale;
+ }
+ tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+ input_to_output_weights_ptr, n_cell, n_input, quantized_input_ptr_batch,
+ product_scaling_factors, n_batch, output_gate_scratch,
+ /*result_stride=*/1);
+ }
+
+ if (!tensor_utils::IsZeroVector(output_state_ptr, n_batch * n_output)) {
+ // Save quantization and matmul computation for all zero input.
+ float unused_min, unused_max;
+ for (int b = 0; b < n_batch; ++b) {
+ const int offset = b * n_output;
+ tensor_utils::SymmetricQuantizeFloats(output_state_ptr + offset, n_output,
+ quantized_output_state_ptr + offset,
+ &unused_min, &unused_max,
+ &scaling_factors[b]);
+ }
+ // For each batch and cell: compute recurrent_weight * output_state.
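The IsZeroVector guards above (and again for the recurrent path and, later, the cell state) exist because the gate scratch buffers were just zeroed: a matrix product against an all-zero vector only accumulates zeros, so both the symmetric quantization and the multiply-accumulate can be skipped without changing the result. A minimal illustration of that reasoning, not the kernel's own helper:

// Illustrative only: skipping quantize + multiply-accumulate is safe when the
// vector is exactly zero, because the accumulation target already holds zeros
// and W * 0 contributes nothing.
bool CanSkipMatmul(const float* v, int n) {
  for (int i = 0; i < n; ++i) {
    if (v[i] != 0.0f) return false;
  }
  return true;
}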
+ if (!use_cifg) { + for (int b = 0; b < n_batch; ++b) { + product_scaling_factors[b] = + scaling_factors[b] * recurrent_to_input_weights_scale; + } + tensor_utils::MatrixBatchVectorMultiplyAccumulate( + recurrent_to_input_weights_ptr, n_cell, n_output, + quantized_output_state_ptr, product_scaling_factors, n_batch, + input_gate_scratch, /*result_stride=*/1); + } + + for (int b = 0; b < n_batch; ++b) { + product_scaling_factors[b] = + scaling_factors[b] * recurrent_to_forget_weights_scale; + } + tensor_utils::MatrixBatchVectorMultiplyAccumulate( + recurrent_to_forget_weights_ptr, n_cell, n_output, + quantized_output_state_ptr, product_scaling_factors, n_batch, + forget_gate_scratch, /*result_stride=*/1); + + for (int b = 0; b < n_batch; ++b) { + product_scaling_factors[b] = + scaling_factors[b] * recurrent_to_cell_weights_scale; + } + tensor_utils::MatrixBatchVectorMultiplyAccumulate( + recurrent_to_cell_weights_ptr, n_cell, n_output, + quantized_output_state_ptr, product_scaling_factors, n_batch, + cell_scratch, /*result_stride=*/1); + + for (int b = 0; b < n_batch; ++b) { + product_scaling_factors[b] = + scaling_factors[b] * recurrent_to_output_weights_scale; + } + tensor_utils::MatrixBatchVectorMultiplyAccumulate( + recurrent_to_output_weights_ptr, n_cell, n_output, + quantized_output_state_ptr, product_scaling_factors, n_batch, + output_gate_scratch, /*result_stride=*/1); + } + + // Save quantization and matmul computation for all zero input. + bool is_cell_state_all_zeros = + tensor_utils::IsZeroVector(cell_state_ptr, n_batch * n_cell); + + // For each batch and cell: update input gate. + if (!use_cifg) { + if (use_peephole && !is_cell_state_all_zeros) { + tensor_utils::VectorScalarMultiply(cell_to_input_weights_ptr, n_cell, + cell_to_input_weights_scale, + recovered_weights); + tensor_utils::VectorBatchVectorCwiseProductAccumulate( + recovered_weights, n_cell, cell_state_ptr, n_batch, + input_gate_scratch); + } + tensor_utils::MeanStddevNormalization(input_gate_scratch, + input_gate_scratch, n_cell, n_batch, + kLayerNormEpsilon); + tensor_utils::VectorBatchVectorCwiseProduct(input_layer_norm_weight_ptr, + n_cell, input_gate_scratch, + n_batch, input_gate_scratch); + tensor_utils::VectorBatchVectorAdd(input_gate_bias_ptr, n_cell, n_batch, + input_gate_scratch); + tensor_utils::ApplySigmoidToVector(input_gate_scratch, n_cell * n_batch, + input_gate_scratch); + } + + // For each batch and cell: update forget gate. + if (use_peephole && !is_cell_state_all_zeros) { + tensor_utils::VectorScalarMultiply(cell_to_forget_weights_ptr, n_cell, + cell_to_forget_weights_scale, + recovered_weights); + tensor_utils::VectorBatchVectorCwiseProductAccumulate( + recovered_weights, n_cell, cell_state_ptr, n_batch, + forget_gate_scratch); + } + tensor_utils::MeanStddevNormalization(forget_gate_scratch, + forget_gate_scratch, n_cell, n_batch, + kLayerNormEpsilon); + tensor_utils::VectorBatchVectorCwiseProduct(forget_layer_norm_weight_ptr, + n_cell, forget_gate_scratch, + n_batch, forget_gate_scratch); + tensor_utils::VectorBatchVectorAdd(forget_gate_bias_ptr, n_cell, n_batch, + forget_gate_scratch); + tensor_utils::ApplySigmoidToVector(forget_gate_scratch, n_cell * n_batch, + forget_gate_scratch); + + // For each batch and cell: update the cell. 
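In the hybrid path the peephole weights are stored as int8 with a per-tensor scale, but the peephole term is an element-wise (diagonal) product rather than a matmul, so the kernel first dequantizes the n_cell weight values into recovered_weights and then accumulates the product with the cell state. The sketch below shows that step for one batch row; the names are illustrative, and it mirrors what the VectorScalarMultiply plus VectorBatchVectorCwiseProductAccumulate calls above achieve.

#include <cstdint>

// Dequantize a per-cell peephole weight vector, then accumulate its
// element-wise product with the cell state into the gate pre-activation.
void PeepholeAccumulate(const int8_t* weights_q, float weight_scale,
                        const float* cell_state, float* gate, int n_cell) {
  for (int i = 0; i < n_cell; ++i) {
    const float w = weight_scale * static_cast<float>(weights_q[i]);
    gate[i] += w * cell_state[i];
  }
}

This is also why Prepare only sizes recovered_weights to n_cell values: the peephole "matrix" is diagonal, so a single vector of dequantized weights is enough.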
+ tensor_utils::MeanStddevNormalization(cell_scratch, cell_scratch, n_cell, + n_batch, kLayerNormEpsilon); + tensor_utils::VectorBatchVectorCwiseProduct( + cell_layer_norm_weight_ptr, n_cell, cell_scratch, n_batch, cell_scratch); + tensor_utils::VectorBatchVectorAdd(cell_bias_ptr, n_cell, n_batch, + cell_scratch); + tensor_utils::VectorVectorCwiseProduct(forget_gate_scratch, cell_state_ptr, + n_batch * n_cell, cell_state_ptr); + tensor_utils::ApplyActivationToVector(cell_scratch, n_batch * n_cell, + activation, cell_scratch); + if (use_cifg) { + tensor_utils::Sub1Vector(forget_gate_scratch, n_batch * n_cell, + forget_gate_scratch); + tensor_utils::VectorVectorCwiseProductAccumulate( + cell_scratch, forget_gate_scratch, n_batch * n_cell, cell_state_ptr); + } else { + tensor_utils::VectorVectorCwiseProductAccumulate( + cell_scratch, input_gate_scratch, n_batch * n_cell, cell_state_ptr); + } + if (cell_clip > 0.0) { + tensor_utils::ClipVector(cell_state_ptr, n_batch * n_cell, cell_clip, + cell_state_ptr); + } + + is_cell_state_all_zeros = + tensor_utils::IsZeroVector(cell_state_ptr, n_batch * n_cell); + // For each batch and cell: update the output gate. + if (use_peephole && !is_cell_state_all_zeros) { + tensor_utils::VectorScalarMultiply(cell_to_output_weights_ptr, n_cell, + cell_to_output_weights_scale, + recovered_weights); + tensor_utils::VectorBatchVectorCwiseProductAccumulate( + recovered_weights, n_cell, cell_state_ptr, n_batch, + output_gate_scratch); + } + tensor_utils::MeanStddevNormalization(output_gate_scratch, + output_gate_scratch, n_cell, n_batch, + kLayerNormEpsilon); + tensor_utils::VectorBatchVectorCwiseProduct(output_layer_norm_weight_ptr, + n_cell, output_gate_scratch, + n_batch, output_gate_scratch); + tensor_utils::VectorBatchVectorAdd(output_gate_bias_ptr, n_cell, n_batch, + output_gate_scratch); + tensor_utils::ApplySigmoidToVector(output_gate_scratch, n_batch * n_cell, + output_gate_scratch); + tensor_utils::ApplyActivationToVector(cell_state_ptr, n_batch * n_cell, + activation, cell_scratch); + tensor_utils::VectorVectorCwiseProduct(output_gate_scratch, cell_scratch, + n_batch * n_cell, output_gate_scratch); + + // For each batch: update the projection and output_state. + const bool use_projection_weight = (projection_weights_ptr != nullptr); + const bool use_projection_bias = (projection_bias_ptr != nullptr); + if (use_projection_weight) { + if (use_projection_bias) { + tensor_utils::VectorBatchVectorAssign(projection_bias_ptr, n_output, + n_batch, output_ptr_batch); + } else { + tensor_utils::ZeroVector(output_ptr_batch, n_batch * n_output); + } + if (!tensor_utils::IsZeroVector(output_gate_scratch, n_batch * n_cell)) { + // Save quantization and matmul computation for all zero input. 
+ float unused_min, unused_max; + for (int b = 0; b < n_batch; ++b) { + const int offset = b * n_cell; + tensor_utils::SymmetricQuantizeFloats( + output_gate_scratch + offset, n_cell, + quantized_cell_state_ptr + offset, &unused_min, &unused_max, + &scaling_factors[b]); + } + for (int b = 0; b < n_batch; ++b) { + product_scaling_factors[b] = + scaling_factors[b] * projection_weights_scale; + } + tensor_utils::MatrixBatchVectorMultiplyAccumulate( + projection_weights_ptr, n_output, n_cell, quantized_cell_state_ptr, + product_scaling_factors, n_batch, output_ptr_batch, + /*result_stride=*/1); + } + if (proj_clip > 0.0) { + tensor_utils::ClipVector(output_ptr_batch, n_batch * n_output, proj_clip, + output_ptr_batch); + } + } else { + tensor_utils::CopyVector(output_gate_scratch, n_batch * n_output, + output_ptr_batch); + } + tensor_utils::CopyVector(output_ptr_batch, n_batch * n_output, + output_state_ptr); +} + +// The LayerNormLSTM Op engine. +TfLiteStatus EvalFloat( + const TfLiteTensor* input, const TfLiteTensor* input_to_input_weights, + const TfLiteTensor* input_to_forget_weights, + const TfLiteTensor* input_to_cell_weights, + const TfLiteTensor* input_to_output_weights, + const TfLiteTensor* recurrent_to_input_weights, + const TfLiteTensor* recurrent_to_forget_weights, + const TfLiteTensor* recurrent_to_cell_weights, + const TfLiteTensor* recurrent_to_output_weights, + const TfLiteTensor* cell_to_input_weights, + const TfLiteTensor* cell_to_forget_weights, + const TfLiteTensor* cell_to_output_weights, + const TfLiteTensor* input_layer_norm_weights, + const TfLiteTensor* forget_layer_norm_weights, + const TfLiteTensor* cell_layer_norm_weights, + const TfLiteTensor* output_layer_norm_weights, + const TfLiteTensor* input_gate_bias, const TfLiteTensor* forget_gate_bias, + const TfLiteTensor* cell_bias, const TfLiteTensor* output_gate_bias, + const TfLiteTensor* projection_weights, const TfLiteTensor* projection_bias, + float cell_clip, float proj_clip, const TfLiteFusedActivation& activation, + TfLiteTensor* scratch_buffer, TfLiteTensor* activation_state, + TfLiteTensor* cell_state, TfLiteTensor* output) { + const int n_batch = input->dims->data[0]; + const int n_input = input->dims->data[1]; + // n_cell and n_output will be the same size when there is no projection. + const int n_cell = input_to_output_weights->dims->data[0]; + const int n_output = recurrent_to_output_weights->dims->data[1]; + + // Since we have already checked that weights are all there or none, we can + // check the existence of only one to get the condition. + const bool use_cifg = (input_to_input_weights == nullptr); + const bool use_peephole = (cell_to_output_weights != nullptr); + + float* input_gate_scratch = nullptr; + float* cell_scratch = nullptr; + float* forget_gate_scratch = nullptr; + float* output_gate_scratch = nullptr; + if (use_cifg) { + cell_scratch = scratch_buffer->data.f; + forget_gate_scratch = scratch_buffer->data.f + n_cell * n_batch; + output_gate_scratch = scratch_buffer->data.f + 2 * n_cell * n_batch; + } else { + input_gate_scratch = scratch_buffer->data.f; + cell_scratch = scratch_buffer->data.f + n_cell * n_batch; + forget_gate_scratch = scratch_buffer->data.f + 2 * n_cell * n_batch; + output_gate_scratch = scratch_buffer->data.f + 3 * n_cell * n_batch; + } + + // Check optional tensors, the respective pointers can be null. + const float* input_to_input_weights_ptr = + (use_cifg) ? nullptr : input_to_input_weights->data.f; + const float* recurrent_to_input_weights_ptr = + (use_cifg) ? 
nullptr : recurrent_to_input_weights->data.f; + const float* input_gate_bias_ptr = + (use_cifg) ? nullptr : input_gate_bias->data.f; + const float* cell_to_input_weights_ptr = + (use_peephole && !use_cifg) ? cell_to_input_weights->data.f : nullptr; + const float* cell_to_forget_weights_ptr = + (use_peephole) ? cell_to_forget_weights->data.f : nullptr; + const float* cell_to_output_weights_ptr = + (use_peephole) ? cell_to_output_weights->data.f : nullptr; + const float* projection_weights_ptr = + (projection_weights == nullptr) ? nullptr : projection_weights->data.f; + const float* projection_bias_ptr = + (projection_bias == nullptr) ? nullptr : projection_bias->data.f; + + // Required tensors, pointers are non-null. + const float* input_ptr_batch = input->data.f; + const float* input_to_forget_weights_ptr = input_to_forget_weights->data.f; + const float* input_to_cell_weights_ptr = input_to_cell_weights->data.f; + const float* input_to_output_weights_ptr = input_to_output_weights->data.f; + const float* recurrent_to_forget_weights_ptr = + recurrent_to_forget_weights->data.f; + const float* recurrent_to_cell_weights_ptr = + recurrent_to_cell_weights->data.f; + const float* recurrent_to_output_weights_ptr = + recurrent_to_output_weights->data.f; + const float* input_layer_norm_weight_ptr = input_layer_norm_weights->data.f; + const float* forget_layer_norm_weight_ptr = forget_layer_norm_weights->data.f; + const float* cell_layer_norm_weight_ptr = cell_layer_norm_weights->data.f; + const float* output_layer_norm_weight_ptr = output_layer_norm_weights->data.f; + const float* forget_gate_bias_ptr = forget_gate_bias->data.f; + const float* cell_bias_ptr = cell_bias->data.f; + const float* output_gate_bias_ptr = output_gate_bias->data.f; + + float* activation_state_ptr = activation_state->data.f; + float* cell_state_ptr = cell_state->data.f; + float* output_ptr_batch = output->data.f; + + LayerNormLstmStep( + input_ptr_batch, input_to_input_weights_ptr, input_to_forget_weights_ptr, + input_to_cell_weights_ptr, input_to_output_weights_ptr, + recurrent_to_input_weights_ptr, recurrent_to_forget_weights_ptr, + recurrent_to_cell_weights_ptr, recurrent_to_output_weights_ptr, + cell_to_input_weights_ptr, cell_to_forget_weights_ptr, + cell_to_output_weights_ptr, input_layer_norm_weight_ptr, + forget_layer_norm_weight_ptr, cell_layer_norm_weight_ptr, + output_layer_norm_weight_ptr, input_gate_bias_ptr, forget_gate_bias_ptr, + cell_bias_ptr, output_gate_bias_ptr, projection_weights_ptr, + projection_bias_ptr, cell_clip, proj_clip, activation, n_batch, n_cell, + n_input, n_output, activation_state_ptr, cell_state_ptr, + input_gate_scratch, forget_gate_scratch, cell_scratch, + output_gate_scratch, output_ptr_batch); + + return kTfLiteOk; +} + +TfLiteStatus EvalHybrid( + const TfLiteTensor* input, const TfLiteTensor* input_to_input_weights, + const TfLiteTensor* input_to_forget_weights, + const TfLiteTensor* input_to_cell_weights, + const TfLiteTensor* input_to_output_weights, + const TfLiteTensor* recurrent_to_input_weights, + const TfLiteTensor* recurrent_to_forget_weights, + const TfLiteTensor* recurrent_to_cell_weights, + const TfLiteTensor* recurrent_to_output_weights, + const TfLiteTensor* cell_to_input_weights, + const TfLiteTensor* cell_to_forget_weights, + const TfLiteTensor* cell_to_output_weights, + const TfLiteTensor* input_layer_norm_weights, + const TfLiteTensor* forget_layer_norm_weights, + const TfLiteTensor* cell_layer_norm_weights, + const TfLiteTensor* output_layer_norm_weights, + const 
TfLiteTensor* input_gate_bias, const TfLiteTensor* forget_gate_bias, + const TfLiteTensor* cell_bias, const TfLiteTensor* output_gate_bias, + const TfLiteTensor* projection_weights, const TfLiteTensor* projection_bias, + float cell_clip, float proj_clip, const TfLiteFusedActivation& activation, + TfLiteTensor* scratch_buffer, TfLiteTensor* scaling_factors, + TfLiteTensor* prod_scaling_factors, TfLiteTensor* recovered_weights, + TfLiteTensor* input_quantized, TfLiteTensor* activation_state_quantized, + TfLiteTensor* cell_state_quantized, TfLiteTensor* activation_state, + TfLiteTensor* cell_state, TfLiteTensor* output) { + const int n_batch = input->dims->data[0]; + const int n_input = input->dims->data[1]; + // n_cell and n_output will be the same size when there is no projection. + const int n_cell = input_to_output_weights->dims->data[0]; + const int n_output = recurrent_to_output_weights->dims->data[1]; + + // Since we have already checked that weights are all there or none, we can + // check the existence of only one to get the condition. + const bool use_cifg = (input_to_input_weights == nullptr); + const bool use_peephole = (cell_to_output_weights != nullptr); + + float* input_gate_scratch = nullptr; + float* cell_scratch = nullptr; + float* forget_gate_scratch = nullptr; + float* output_gate_scratch = nullptr; + if (use_cifg) { + cell_scratch = scratch_buffer->data.f; + forget_gate_scratch = scratch_buffer->data.f + n_cell * n_batch; + output_gate_scratch = scratch_buffer->data.f + 2 * n_cell * n_batch; + } else { + input_gate_scratch = scratch_buffer->data.f; + cell_scratch = scratch_buffer->data.f + n_cell * n_batch; + forget_gate_scratch = scratch_buffer->data.f + 2 * n_cell * n_batch; + output_gate_scratch = scratch_buffer->data.f + 3 * n_cell * n_batch; + } + + // Check optional tensors, the respective pointers can be null. + int8_t* input_to_input_weights_ptr = nullptr; + float input_to_input_weights_scale = 1.0f; + int8_t* recurrent_to_input_weights_ptr = nullptr; + float recurrent_to_input_weights_scale = 1.0f; + float* input_gate_bias_ptr = nullptr; + if (!use_cifg) { + input_to_input_weights_ptr = + reinterpret_cast<int8_t*>(input_to_input_weights->data.uint8); + recurrent_to_input_weights_ptr = + reinterpret_cast<int8_t*>(recurrent_to_input_weights->data.uint8); + input_gate_bias_ptr = input_gate_bias->data.f; + input_to_input_weights_scale = input_to_input_weights->params.scale; + recurrent_to_input_weights_scale = recurrent_to_input_weights->params.scale; + } + + int8_t* cell_to_input_weights_ptr = nullptr; + int8_t* cell_to_forget_weights_ptr = nullptr; + int8_t* cell_to_output_weights_ptr = nullptr; + float cell_to_input_weights_scale = 1.0f; + float cell_to_forget_weights_scale = 1.0f; + float cell_to_output_weights_scale = 1.0f; + if (use_peephole) { + if (!use_cifg) { + cell_to_input_weights_ptr = + reinterpret_cast<int8_t*>(cell_to_input_weights->data.uint8); + cell_to_input_weights_scale = cell_to_input_weights->params.scale; + } + cell_to_forget_weights_ptr = + reinterpret_cast<int8_t*>(cell_to_forget_weights->data.uint8); + cell_to_output_weights_ptr = + reinterpret_cast<int8_t*>(cell_to_output_weights->data.uint8); + cell_to_forget_weights_scale = cell_to_forget_weights->params.scale; + cell_to_output_weights_scale = cell_to_output_weights->params.scale; + } + + const int8_t* projection_weights_ptr = + (projection_weights == nullptr) + ? 
nullptr + : reinterpret_cast<int8_t*>(projection_weights->data.uint8); + const float projection_weights_scale = + (projection_weights == nullptr) ? 1.0f : projection_weights->params.scale; + const float* projection_bias_ptr = + (projection_bias == nullptr) ? nullptr : projection_bias->data.f; + + // Required tensors, pointers are non-null. + const float* input_ptr_batch = input->data.f; + const int8_t* input_to_forget_weights_ptr = + reinterpret_cast<int8_t*>(input_to_forget_weights->data.uint8); + const float input_to_forget_weights_scale = + input_to_forget_weights->params.scale; + const int8_t* input_to_cell_weights_ptr = + reinterpret_cast<int8_t*>(input_to_cell_weights->data.uint8); + const float input_to_cell_weights_scale = input_to_cell_weights->params.scale; + const int8_t* input_to_output_weights_ptr = + reinterpret_cast<int8_t*>(input_to_output_weights->data.uint8); + const float input_to_output_weights_scale = + input_to_output_weights->params.scale; + const int8_t* recurrent_to_forget_weights_ptr = + reinterpret_cast<int8_t*>(recurrent_to_forget_weights->data.uint8); + const float recurrent_to_forget_weights_scale = + recurrent_to_forget_weights->params.scale; + const int8_t* recurrent_to_cell_weights_ptr = + reinterpret_cast<int8_t*>(recurrent_to_cell_weights->data.uint8); + const float recurrent_to_cell_weights_scale = + recurrent_to_cell_weights->params.scale; + const int8_t* recurrent_to_output_weights_ptr = + reinterpret_cast<int8_t*>(recurrent_to_output_weights->data.uint8); + const float recurrent_to_output_weights_scale = + recurrent_to_output_weights->params.scale; + const float* input_layer_norm_weight_ptr = input_layer_norm_weights->data.f; + const float* forget_layer_norm_weight_ptr = forget_layer_norm_weights->data.f; + const float* cell_layer_norm_weight_ptr = cell_layer_norm_weights->data.f; + const float* output_layer_norm_weight_ptr = output_layer_norm_weights->data.f; + const float* forget_gate_bias_ptr = forget_gate_bias->data.f; + const float* cell_bias_ptr = cell_bias->data.f; + const float* output_gate_bias_ptr = output_gate_bias->data.f; + + float* activation_state_ptr = activation_state->data.f; + float* cell_state_ptr = cell_state->data.f; + float* output_ptr_batch = output->data.f; + + // Temporary storage for quantized values and scaling factors. 
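For reference, both EvalFloat and EvalHybrid carve the single scratch_buffer temporary into per-gate segments of n_cell * n_batch floats; with CIFG the input gate is later derived as 1 - forget gate, so its segment is simply not materialized. The helper below is an illustrative restatement of that partition, not code from the kernel.

// Scratch buffer layout used by EvalFloat/EvalHybrid (illustrative).
// Without CIFG: [input | cell | forget | output], each n_cell * n_batch floats.
// With CIFG only [cell | forget | output] are materialized.
struct GateScratch {
  float* input_gate;  // nullptr when CIFG is used
  float* cell;
  float* forget_gate;
  float* output_gate;
};

GateScratch PartitionScratch(float* base, int n_cell, int n_batch,
                             bool use_cifg) {
  const int seg = n_cell * n_batch;
  if (use_cifg) {
    return {nullptr, base, base + seg, base + 2 * seg};
  }
  return {base, base + seg, base + 2 * seg, base + 3 * seg};
}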
+ int8_t* quantized_input_ptr = + reinterpret_cast<int8_t*>(input_quantized->data.uint8); + int8_t* quantized_activation_state_ptr = + reinterpret_cast<int8_t*>(activation_state_quantized->data.uint8); + int8_t* quantized_cell_state_ptr = + reinterpret_cast<int8_t*>(cell_state_quantized->data.uint8); + float* scaling_factors_ptr = scaling_factors->data.f; + float* prod_scaling_factors_ptr = prod_scaling_factors->data.f; + float* recovered_weights_ptr = recovered_weights->data.f; + + LayerNormLstmStep( + input_ptr_batch, input_to_input_weights_ptr, input_to_input_weights_scale, + input_to_forget_weights_ptr, input_to_forget_weights_scale, + input_to_cell_weights_ptr, input_to_cell_weights_scale, + input_to_output_weights_ptr, input_to_output_weights_scale, + recurrent_to_input_weights_ptr, recurrent_to_input_weights_scale, + recurrent_to_forget_weights_ptr, recurrent_to_forget_weights_scale, + recurrent_to_cell_weights_ptr, recurrent_to_cell_weights_scale, + recurrent_to_output_weights_ptr, recurrent_to_output_weights_scale, + cell_to_input_weights_ptr, cell_to_input_weights_scale, + cell_to_forget_weights_ptr, cell_to_forget_weights_scale, + cell_to_output_weights_ptr, cell_to_output_weights_scale, + input_layer_norm_weight_ptr, forget_layer_norm_weight_ptr, + cell_layer_norm_weight_ptr, output_layer_norm_weight_ptr, + input_gate_bias_ptr, forget_gate_bias_ptr, cell_bias_ptr, + output_gate_bias_ptr, projection_weights_ptr, projection_weights_scale, + projection_bias_ptr, cell_clip, proj_clip, activation, n_batch, n_cell, + n_input, n_output, input_gate_scratch, forget_gate_scratch, cell_scratch, + output_gate_scratch, scaling_factors_ptr, prod_scaling_factors_ptr, + recovered_weights_ptr, quantized_input_ptr, + quantized_activation_state_ptr, quantized_cell_state_ptr, + activation_state_ptr, cell_state_ptr, output_ptr_batch); + + return kTfLiteOk; +} + +TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { + const OpData* op_data = reinterpret_cast<OpData*>(node->user_data); + + const TfLiteTensor* input = GetInput(context, node, kInputTensor); + + const TfLiteTensor* input_to_input_weights = + GetOptionalInputTensor(context, node, kInputToInputWeightsTensor); + const TfLiteTensor* input_to_forget_weights = + GetInput(context, node, kInputToForgetWeightsTensor); + const TfLiteTensor* input_to_cell_weights = + GetInput(context, node, kInputToCellWeightsTensor); + const TfLiteTensor* input_to_output_weights = + GetInput(context, node, kInputToOutputWeightsTensor); + + const TfLiteTensor* recurrent_to_input_weights = + GetOptionalInputTensor(context, node, kRecurrentToInputWeightsTensor); + const TfLiteTensor* recurrent_to_forget_weights = + GetInput(context, node, kRecurrentToForgetWeightsTensor); + const TfLiteTensor* recurrent_to_cell_weights = + GetInput(context, node, kRecurrentToCellWeightsTensor); + const TfLiteTensor* recurrent_to_output_weights = + GetInput(context, node, kRecurrentToOutputWeightsTensor); + + const TfLiteTensor* cell_to_input_weights = + GetOptionalInputTensor(context, node, kCellToInputWeightsTensor); + const TfLiteTensor* cell_to_forget_weights = + GetOptionalInputTensor(context, node, kCellToForgetWeightsTensor); + const TfLiteTensor* cell_to_output_weights = + GetOptionalInputTensor(context, node, kCellToOutputWeightsTensor); + + const TfLiteTensor* input_layer_norm_weights = + GetInput(context, node, kInputLayerNormWeightsTensor); + const TfLiteTensor* forget_layer_norm_weights = + GetInput(context, node, kForgetLayerNormWeightsTensor); + const 
TfLiteTensor* cell_layer_norm_weights = + GetInput(context, node, kCellLayerNormWeightsTensor); + const TfLiteTensor* output_layer_norm_weights = + GetInput(context, node, kOutputLayerNormWeightsTensor); + + const TfLiteTensor* input_gate_bias = + GetOptionalInputTensor(context, node, kInputGateBiasTensor); + const TfLiteTensor* forget_gate_bias = + GetInput(context, node, kForgetGateBiasTensor); + const TfLiteTensor* cell_bias = GetInput(context, node, kCellGateBiasTensor); + const TfLiteTensor* output_gate_bias = + GetInput(context, node, kOutputGateBiasTensor); + + const TfLiteTensor* projection_weights = + GetOptionalInputTensor(context, node, kProjectionWeightsTensor); + const TfLiteTensor* projection_bias = + GetOptionalInputTensor(context, node, kProjectionBiasTensor); + + // Index the scratch buffers pointers to the global scratch buffer. + TfLiteTensor* scratch_buffer = GetTemporary(context, node, /*index=*/0); + + TfLiteTensor* activation_state = + &context->tensors[node->inputs->data[kInputActivationStateTensor]]; + TfLiteTensor* cell_state = + &context->tensors[node->inputs->data[kInputCellStateTensor]]; + + TfLiteTensor* output = GetOutput(context, node, kOutputTensor); + + switch (input_to_output_weights->type) { + case kTfLiteFloat32: { + return EvalFloat(input, input_to_input_weights, input_to_forget_weights, + input_to_cell_weights, input_to_output_weights, + recurrent_to_input_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights, + cell_to_input_weights, cell_to_forget_weights, + cell_to_output_weights, input_layer_norm_weights, + forget_layer_norm_weights, cell_layer_norm_weights, + output_layer_norm_weights, input_gate_bias, + forget_gate_bias, cell_bias, output_gate_bias, + projection_weights, projection_bias, op_data->cell_clip, + op_data->proj_clip, op_data->activation, scratch_buffer, + activation_state, cell_state, output); + } + case kTfLiteUInt8: { + TfLiteTensor* input_quantized = GetTemporary(context, node, /*index=*/1); + TfLiteTensor* activation_state_quantized = + GetTemporary(context, node, /*index=*/2); + TfLiteTensor* cell_state_quantized = + GetTemporary(context, node, /*index=*/3); + TfLiteTensor* scaling_factors = GetTemporary(context, node, /*index=*/4); + TfLiteTensor* prod_scaling_factors = + GetTemporary(context, node, /*index=*/5); + TfLiteTensor* recovered_weights = + GetTemporary(context, node, /*index=*/6); + return EvalHybrid( + input, input_to_input_weights, input_to_forget_weights, + input_to_cell_weights, input_to_output_weights, + recurrent_to_input_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights, + cell_to_input_weights, cell_to_forget_weights, cell_to_output_weights, + input_layer_norm_weights, forget_layer_norm_weights, + cell_layer_norm_weights, output_layer_norm_weights, input_gate_bias, + forget_gate_bias, cell_bias, output_gate_bias, projection_weights, + projection_bias, op_data->cell_clip, op_data->proj_clip, + op_data->activation, scratch_buffer, scaling_factors, + prod_scaling_factors, recovered_weights, input_quantized, + activation_state_quantized, cell_state_quantized, activation_state, + cell_state, output); + } + default: + context->ReportError(context, "Type %d is not currently supported.", + input_to_output_weights->type); + return kTfLiteError; + } + return kTfLiteOk; +} + +void Free(TfLiteContext* context, void* buffer) { + delete reinterpret_cast<OpData*>(buffer); +} + +} // namespace layer_norm_lstm + +TfLiteRegistration* 
Register_LAYER_NORM_LSTM() { + static TfLiteRegistration r = {layer_norm_lstm::Init, layer_norm_lstm::Free, + layer_norm_lstm::Prepare, + layer_norm_lstm::Eval}; + return &r; +} + +} // namespace custom +} // namespace ops +} // namespace tflite diff --git a/tensorflow/contrib/lite/kernels/layer_norm_lstm_test.cc b/tensorflow/contrib/lite/kernels/layer_norm_lstm_test.cc new file mode 100644 index 0000000000..479f6a7d3c --- /dev/null +++ b/tensorflow/contrib/lite/kernels/layer_norm_lstm_test.cc @@ -0,0 +1,664 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +// Unit test for TFLite Layer Norm LSTM op. + +#include <memory> +#include <vector> + +#include <gmock/gmock.h> +#include <gtest/gtest.h> +#include "flatbuffers/flexbuffers.h" // TF:flatbuffers +#include "tensorflow/contrib/lite/interpreter.h" +#include "tensorflow/contrib/lite/kernels/register.h" +#include "tensorflow/contrib/lite/kernels/test_util.h" +#include "tensorflow/contrib/lite/model.h" + +namespace tflite { +namespace ops { +namespace custom { + +TfLiteRegistration* Register_LAYER_NORM_LSTM(); + +namespace { + +using ::testing::ElementsAreArray; + +class LayerNormLSTMOpModel : public SingleOpModel { + public: + LayerNormLSTMOpModel(int n_batch, int n_input, int n_cell, int n_output, + bool use_cifg, bool use_peephole, + bool use_projection_weights, bool use_projection_bias, + float cell_clip, float proj_clip, + const std::vector<std::vector<int>>& input_shapes, + const TensorType& weight_type = TensorType_FLOAT32) + : n_batch_(n_batch), + n_input_(n_input), + n_cell_(n_cell), + n_output_(n_output) { + input_ = AddInput(TensorType_FLOAT32); + + if (use_cifg) { + input_to_input_weights_ = AddNullInput(); + } else { + input_to_input_weights_ = AddInput(weight_type); + } + + input_to_forget_weights_ = AddInput(weight_type); + input_to_cell_weights_ = AddInput(weight_type); + input_to_output_weights_ = AddInput(weight_type); + + if (use_cifg) { + recurrent_to_input_weights_ = AddNullInput(); + } else { + recurrent_to_input_weights_ = AddInput(weight_type); + } + + recurrent_to_forget_weights_ = AddInput(weight_type); + recurrent_to_cell_weights_ = AddInput(weight_type); + recurrent_to_output_weights_ = AddInput(weight_type); + + if (use_peephole) { + if (use_cifg) { + cell_to_input_weights_ = AddNullInput(); + } else { + cell_to_input_weights_ = AddInput(weight_type); + } + cell_to_forget_weights_ = AddInput(weight_type); + cell_to_output_weights_ = AddInput(weight_type); + } else { + cell_to_input_weights_ = AddNullInput(); + cell_to_forget_weights_ = AddNullInput(); + cell_to_output_weights_ = AddNullInput(); + } + + input_layer_norm_weights_ = AddInput(TensorType_FLOAT32); + forget_layer_norm_weights_ = AddInput(TensorType_FLOAT32); + cell_layer_norm_weights_ = AddInput(TensorType_FLOAT32); + output_layer_norm_weights_ = AddInput(TensorType_FLOAT32); + + if (use_cifg) { + input_gate_bias_ = AddNullInput(); 
+ } else { + input_gate_bias_ = AddInput(TensorType_FLOAT32); + } + forget_gate_bias_ = AddInput(TensorType_FLOAT32); + cell_bias_ = AddInput(TensorType_FLOAT32); + output_gate_bias_ = AddInput(TensorType_FLOAT32); + + if (use_projection_weights) { + projection_weights_ = AddInput(weight_type); + if (use_projection_bias) { + projection_bias_ = AddInput(TensorType_FLOAT32); + } else { + projection_bias_ = AddNullInput(); + } + } else { + projection_weights_ = AddNullInput(); + projection_bias_ = AddNullInput(); + } + + // Adding the 2 state tensors. + output_state_ = + AddInput(TensorData{TensorType_FLOAT32, {n_output_ * n_batch_}}, true); + cell_state_ = + AddInput(TensorData{TensorType_FLOAT32, {n_cell_ * n_batch_}}, true); + + output_ = AddOutput(TensorType_FLOAT32); + + // Set up and pass in custom options using flexbuffer. + flexbuffers::Builder fbb; + fbb.Map([&]() { + fbb.Int("cell_clip", cell_clip); + fbb.Int("proj_clip", proj_clip); + fbb.String("fused_activation_function", "TANH"); + }); + fbb.Finish(); + SetCustomOp("LAYER_NORM_LSTM", fbb.GetBuffer(), Register_LAYER_NORM_LSTM); + BuildInterpreter(input_shapes); + } + + void SetInputToInputWeights(std::initializer_list<float> f) { + PopulateTensor(input_to_input_weights_, f); + } + + void SetInputToForgetWeights(std::initializer_list<float> f) { + PopulateTensor(input_to_forget_weights_, f); + } + + void SetInputToCellWeights(std::initializer_list<float> f) { + PopulateTensor(input_to_cell_weights_, f); + } + + void SetInputToOutputWeights(std::initializer_list<float> f) { + PopulateTensor(input_to_output_weights_, f); + } + + void SetRecurrentToInputWeights(std::initializer_list<float> f) { + PopulateTensor(recurrent_to_input_weights_, f); + } + + void SetRecurrentToForgetWeights(std::initializer_list<float> f) { + PopulateTensor(recurrent_to_forget_weights_, f); + } + + void SetRecurrentToCellWeights(std::initializer_list<float> f) { + PopulateTensor(recurrent_to_cell_weights_, f); + } + + void SetRecurrentToOutputWeights(std::initializer_list<float> f) { + PopulateTensor(recurrent_to_output_weights_, f); + } + + void SetCellToInputWeights(std::initializer_list<float> f) { + PopulateTensor(cell_to_input_weights_, f); + } + + void SetCellToForgetWeights(std::initializer_list<float> f) { + PopulateTensor(cell_to_forget_weights_, f); + } + + void SetCellToOutputWeights(std::initializer_list<float> f) { + PopulateTensor(cell_to_output_weights_, f); + } + + void SetInputLayerNormWeights(std::initializer_list<float> f) { + PopulateTensor(input_layer_norm_weights_, f); + } + + void SetForgetLayerNormWeights(std::initializer_list<float> f) { + PopulateTensor(forget_layer_norm_weights_, f); + } + + void SetCellLayerNormWeights(std::initializer_list<float> f) { + PopulateTensor(cell_layer_norm_weights_, f); + } + + void SetOutputLayerNormWeights(std::initializer_list<float> f) { + PopulateTensor(output_layer_norm_weights_, f); + } + + void SetInputGateBias(std::initializer_list<float> f) { + PopulateTensor(input_gate_bias_, f); + } + + void SetForgetGateBias(std::initializer_list<float> f) { + PopulateTensor(forget_gate_bias_, f); + } + + void SetCellBias(std::initializer_list<float> f) { + PopulateTensor(cell_bias_, f); + } + + void SetOutputGateBias(std::initializer_list<float> f) { + PopulateTensor(output_gate_bias_, f); + } + + void SetProjectionWeights(std::initializer_list<float> f) { + PopulateTensor(projection_weights_, f); + } + + void SetProjectionBias(std::initializer_list<float> f) { + PopulateTensor(projection_bias_, f); + } 
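The test passes cell_clip, proj_clip, and the fused activation to the custom op through a flexbuffer map. On the op side these options would typically be read back in the Init callback roughly as sketched below; this is an illustrative reading with hypothetical names, not the actual Init implemented elsewhere in this change.

#include <cstddef>
#include "flatbuffers/flexbuffers.h"  // TF:flatbuffers

struct ParsedLayerNormLstmOptions {
  float cell_clip;
  float proj_clip;
};

// buffer/length are the raw custom-options bytes handed to a custom op's Init.
ParsedLayerNormLstmOptions ParseOptions(const char* buffer, size_t length) {
  const uint8_t* data = reinterpret_cast<const uint8_t*>(buffer);
  const flexbuffers::Map m = flexbuffers::GetRoot(data, length).AsMap();
  ParsedLayerNormLstmOptions opts;
  opts.cell_clip = m["cell_clip"].AsFloat();
  opts.proj_clip = m["proj_clip"].AsFloat();
  // "fused_activation_function" arrives as a string (e.g. "TANH") and would be
  // mapped to a TfLiteFusedActivation enum by the real op.
  return opts;
}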
+ + void SetInput(int offset, const float* begin, const float* end) { + PopulateTensor(input_, offset, const_cast<float*>(begin), + const_cast<float*>(end)); + } + + std::vector<float> GetOutput() { return ExtractVector<float>(output_); } + + int num_inputs() { return n_input_; } + int num_outputs() { return n_output_; } + int num_cells() { return n_cell_; } + int num_batches() { return n_batch_; } + + protected: + int input_; + int input_to_input_weights_; + int input_to_forget_weights_; + int input_to_cell_weights_; + int input_to_output_weights_; + + int recurrent_to_input_weights_; + int recurrent_to_forget_weights_; + int recurrent_to_cell_weights_; + int recurrent_to_output_weights_; + + int cell_to_input_weights_; + int cell_to_forget_weights_; + int cell_to_output_weights_; + + int input_layer_norm_weights_; + int forget_layer_norm_weights_; + int cell_layer_norm_weights_; + int output_layer_norm_weights_; + + int input_gate_bias_; + int forget_gate_bias_; + int cell_bias_; + int output_gate_bias_; + + int projection_weights_; + int projection_bias_; + + int output_state_; + int cell_state_; + + int output_; + + int n_batch_; + int n_input_; + int n_cell_; + int n_output_; +}; + +class HybridLayerNormLSTMOpModel : public LayerNormLSTMOpModel { + public: + HybridLayerNormLSTMOpModel(int n_batch, int n_input, int n_cell, int n_output, + bool use_cifg, bool use_peephole, + bool use_projection_weights, + bool use_projection_bias, float cell_clip, + float proj_clip, + const std::vector<std::vector<int>>& input_shapes) + : LayerNormLSTMOpModel(n_batch, n_input, n_cell, n_output, use_cifg, + use_peephole, use_projection_weights, + use_projection_bias, cell_clip, proj_clip, + input_shapes, TensorType_UINT8) {} + + void SetInputToInputWeights(std::initializer_list<float> f) { + SymmetricQuantizeAndPopulate(input_to_input_weights_, f); + } + + void SetInputToForgetWeights(std::initializer_list<float> f) { + SymmetricQuantizeAndPopulate(input_to_forget_weights_, f); + } + + void SetInputToCellWeights(std::initializer_list<float> f) { + SymmetricQuantizeAndPopulate(input_to_cell_weights_, f); + } + + void SetInputToOutputWeights(std::initializer_list<float> f) { + SymmetricQuantizeAndPopulate(input_to_output_weights_, f); + } + + void SetRecurrentToInputWeights(std::initializer_list<float> f) { + SymmetricQuantizeAndPopulate(recurrent_to_input_weights_, f); + } + + void SetRecurrentToForgetWeights(std::initializer_list<float> f) { + SymmetricQuantizeAndPopulate(recurrent_to_forget_weights_, f); + } + + void SetRecurrentToCellWeights(std::initializer_list<float> f) { + SymmetricQuantizeAndPopulate(recurrent_to_cell_weights_, f); + } + + void SetRecurrentToOutputWeights(std::initializer_list<float> f) { + SymmetricQuantizeAndPopulate(recurrent_to_output_weights_, f); + } + + void SetCellToInputWeights(std::initializer_list<float> f) { + SymmetricQuantizeAndPopulate(cell_to_input_weights_, f); + } + + void SetCellToForgetWeights(std::initializer_list<float> f) { + SymmetricQuantizeAndPopulate(cell_to_forget_weights_, f); + } + + void SetCellToOutputWeights(std::initializer_list<float> f) { + SymmetricQuantizeAndPopulate(cell_to_output_weights_, f); + } + + void SetInputLayerNormWeights(std::initializer_list<float> f) { + PopulateTensor(input_layer_norm_weights_, f); + } + + void SetForgetLayerNormWeights(std::initializer_list<float> f) { + PopulateTensor(forget_layer_norm_weights_, f); + } + + void SetCellLayerNormWeights(std::initializer_list<float> f) { + 
PopulateTensor(cell_layer_norm_weights_, f); + } + + void SetOutputLayerNormWeights(std::initializer_list<float> f) { + PopulateTensor(output_layer_norm_weights_, f); + } + + void SetProjectionWeights(std::initializer_list<float> f) { + SymmetricQuantizeAndPopulate(projection_weights_, f); + } +}; + +class BaseLayerNormLstmTest : public ::testing::Test { + protected: + // Weights of the Layer Norm LSTM model. Some are optional. + std::initializer_list<float> input_to_input_weights_; + std::initializer_list<float> input_to_cell_weights_; + std::initializer_list<float> input_to_forget_weights_; + std::initializer_list<float> input_to_output_weights_; + std::initializer_list<float> input_gate_bias_; + std::initializer_list<float> cell_gate_bias_; + std::initializer_list<float> forget_gate_bias_; + std::initializer_list<float> output_gate_bias_; + std::initializer_list<float> recurrent_to_input_weights_; + std::initializer_list<float> recurrent_to_cell_weights_; + std::initializer_list<float> recurrent_to_forget_weights_; + std::initializer_list<float> recurrent_to_output_weights_; + std::initializer_list<float> cell_to_input_weights_; + std::initializer_list<float> cell_to_forget_weights_; + std::initializer_list<float> cell_to_output_weights_; + std::initializer_list<float> input_layer_norm_weights_; + std::initializer_list<float> forget_layer_norm_weights_; + std::initializer_list<float> cell_layer_norm_weights_; + std::initializer_list<float> output_layer_norm_weights_; + std::initializer_list<float> projection_weights_; + + // Layer Norm LSTM input is stored as num_batch x num_inputs vector. + std::vector<std::vector<float>> layer_norm_lstm_input_; + + // Compares output up to tolerance to the result of the layer_norm_lstm given + // the input. + void VerifyGoldens(const std::vector<std::vector<float>>& input, + const std::vector<std::vector<float>>& output, + LayerNormLSTMOpModel* layer_norm_lstm, + float tolerance = 1e-5) { + const int num_batches = input.size(); + EXPECT_GT(num_batches, 0); + const int num_inputs = layer_norm_lstm->num_inputs(); + EXPECT_GT(num_inputs, 0); + const int input_sequence_size = input[0].size() / num_inputs; + EXPECT_GT(input_sequence_size, 0); + for (int i = 0; i < input_sequence_size; ++i) { + for (int b = 0; b < num_batches; ++b) { + const float* batch_start = input[b].data() + i * num_inputs; + const float* batch_end = batch_start + num_inputs; + + layer_norm_lstm->SetInput(b * layer_norm_lstm->num_inputs(), + batch_start, batch_end); + } + + layer_norm_lstm->Invoke(); + + const int num_outputs = layer_norm_lstm->num_outputs(); + std::vector<float> expected; + for (int b = 0; b < num_batches; ++b) { + const float* golden_start_batch = output[b].data() + i * num_outputs; + const float* golden_end_batch = golden_start_batch + num_outputs; + expected.insert(expected.end(), golden_start_batch, golden_end_batch); + } + EXPECT_THAT(layer_norm_lstm->GetOutput(), + ElementsAreArray(ArrayFloatNear(expected, tolerance))); + } + } +}; + +class NoCifgPeepholeProjectionNoClippingLayerNormLstmTest + : public BaseLayerNormLstmTest { + void SetUp() override { + input_to_input_weights_ = {0.5, 0.6, 0.7, -0.8, -0.9, 0.1, 0.2, + 0.3, -0.4, 0.5, -0.8, 0.7, -0.6, 0.5, + -0.4, -0.5, -0.4, -0.3, -0.2, -0.1}; + + input_to_forget_weights_ = {-0.6, -0.1, 0.3, 0.2, 0.9, -0.5, -0.2, + -0.4, 0.3, -0.8, -0.4, 0.3, -0.5, -0.4, + -0.6, 0.3, -0.4, -0.6, -0.5, -0.5}; + + input_to_cell_weights_ = {-0.4, -0.3, -0.2, -0.1, -0.5, 0.5, -0.2, + -0.3, -0.2, -0.6, 0.6, -0.1, -0.4, -0.3, + -0.7, 
0.7, -0.9, -0.5, 0.8, 0.6}; + + input_to_output_weights_ = {-0.8, -0.4, -0.2, -0.9, -0.1, -0.7, 0.3, + -0.3, -0.8, -0.2, 0.6, -0.2, 0.4, -0.7, + -0.3, -0.5, 0.1, 0.5, -0.6, -0.4}; + + input_gate_bias_ = {0.03, 0.15, 0.22, 0.38}; + + forget_gate_bias_ = {0.1, -0.3, -0.2, 0.1}; + + cell_gate_bias_ = {-0.05, 0.72, 0.25, 0.08}; + + output_gate_bias_ = {0.05, -0.01, 0.2, 0.1}; + + recurrent_to_input_weights_ = {-0.2, -0.3, 0.4, 0.1, -0.5, 0.9, + -0.2, -0.3, -0.7, 0.05, -0.2, -0.6}; + + recurrent_to_cell_weights_ = {-0.3, 0.2, 0.1, -0.3, 0.8, -0.08, + -0.2, 0.3, 0.8, -0.6, -0.1, 0.2}; + + recurrent_to_forget_weights_ = {-0.5, -0.3, -0.5, -0.2, 0.6, 0.4, + 0.9, 0.3, -0.1, 0.2, 0.5, 0.2}; + + recurrent_to_output_weights_ = {0.3, -0.1, 0.1, -0.2, -0.5, -0.7, + -0.2, -0.6, -0.1, -0.4, -0.7, -0.2}; + + cell_to_input_weights_ = {0.05, 0.1, 0.25, 0.15}; + + cell_to_forget_weights_ = {-0.02, -0.15, -0.25, -0.03}; + + cell_to_output_weights_ = {0.1, -0.1, -0.5, 0.05}; + + input_layer_norm_weights_ = {0.1, 0.2, 0.3, 0.5}; + forget_layer_norm_weights_ = {0.2, 0.2, 0.4, 0.3}; + cell_layer_norm_weights_ = {0.7, 0.2, 0.3, 0.8}; + output_layer_norm_weights_ = {0.6, 0.2, 0.2, 0.5}; + + projection_weights_ = {-0.1, 0.2, 0.01, -0.2, 0.1, 0.5, + 0.3, 0.08, 0.07, 0.2, -0.4, 0.2}; + + layer_norm_lstm_input_ = { + {// Batch0: 3 (input_sequence_size) * 5 (n_input) + 0.7, 0.8, 0.1, 0.2, 0.3, // seq 0 + 0.8, 0.1, 0.2, 0.4, 0.5, // seq 1 + 0.2, 0.7, 0.7, 0.1, 0.7}, // seq 2 + + {// Batch1: 3 (input_sequence_size) * 5 (n_input) + 0.3, 0.2, 0.9, 0.8, 0.1, // seq 0 + 0.1, 0.5, 0.2, 0.4, 0.2, // seq 1 + 0.6, 0.9, 0.2, 0.5, 0.7}, // seq 2 + }; + } +}; + +TEST_F(NoCifgPeepholeProjectionNoClippingLayerNormLstmTest, + LayerNormLstmBlackBoxTest) { + const int n_batch = 2; + const int n_input = 5; + const int n_cell = 4; + const int n_output = 3; + const float ceil_clip = 0.0; + const float proj_clip = 0.0; + + LayerNormLSTMOpModel layer_norm_lstm( + n_batch, n_input, n_cell, n_output, + /*use_cifg=*/false, /*use_peephole=*/true, + /*use_projection_weights=*/true, + /*use_projection_bias=*/false, ceil_clip, proj_clip, + { + {n_batch, n_input}, // input tensor + + {n_cell, n_input}, // input_to_input_weight tensor + {n_cell, n_input}, // input_to_forget_weight tensor + {n_cell, n_input}, // input_to_cell_weight tensor + {n_cell, n_input}, // input_to_output_weight tensor + + {n_cell, n_output}, // recurrent_to_input_weight tensor + {n_cell, n_output}, // recurrent_to_forget_weight tensor + {n_cell, n_output}, // recurrent_to_cell_weight tensor + {n_cell, n_output}, // recurrent_to_output_weight tensor + + {n_cell}, // cell_to_input_weight tensor + {n_cell}, // cell_to_forget_weight tensor + {n_cell}, // cell_to_output_weight tensor + + {n_cell}, // input_layer_norm_weight tensor + {n_cell}, // forget_layer_norm_weight tensor + {n_cell}, // cell_layer_norm_weight tensor + {n_cell}, // output_layer_norm_weight tensor + + {n_cell}, // input_gate_bias tensor + {n_cell}, // forget_gate_bias tensor + {n_cell}, // cell_bias tensor + {n_cell}, // output_gate_bias tensor + + {n_output, n_cell}, // projection_weight tensor + {0}, // projection_bias tensor + }); + + layer_norm_lstm.SetInputToInputWeights(input_to_input_weights_); + layer_norm_lstm.SetInputToCellWeights(input_to_cell_weights_); + layer_norm_lstm.SetInputToForgetWeights(input_to_forget_weights_); + layer_norm_lstm.SetInputToOutputWeights(input_to_output_weights_); + + layer_norm_lstm.SetInputGateBias(input_gate_bias_); + layer_norm_lstm.SetCellBias(cell_gate_bias_); + 
layer_norm_lstm.SetForgetGateBias(forget_gate_bias_); + layer_norm_lstm.SetOutputGateBias(output_gate_bias_); + + layer_norm_lstm.SetRecurrentToInputWeights(recurrent_to_input_weights_); + layer_norm_lstm.SetRecurrentToCellWeights(recurrent_to_cell_weights_); + layer_norm_lstm.SetRecurrentToForgetWeights(recurrent_to_forget_weights_); + layer_norm_lstm.SetRecurrentToOutputWeights(recurrent_to_output_weights_); + + layer_norm_lstm.SetCellToInputWeights(cell_to_input_weights_); + layer_norm_lstm.SetCellToForgetWeights(cell_to_forget_weights_); + layer_norm_lstm.SetCellToOutputWeights(cell_to_output_weights_); + + layer_norm_lstm.SetInputLayerNormWeights(input_layer_norm_weights_); + layer_norm_lstm.SetForgetLayerNormWeights(forget_layer_norm_weights_); + layer_norm_lstm.SetCellLayerNormWeights(cell_layer_norm_weights_); + layer_norm_lstm.SetOutputLayerNormWeights(output_layer_norm_weights_); + + layer_norm_lstm.SetProjectionWeights(projection_weights_); + + // Verify the final output. + const std::vector<std::vector<float>> layer_norm_lstm_golden_output = { + { + // Batch0: 3 (input_sequence_size) * 3 (n_output) + 0.0244077, 0.128027, -0.00170918, // seq 0 + 0.0137642, 0.140751, 0.0395835, // seq 1 + -0.00459231, 0.155278, 0.0837377, // seq 2 + }, + { + // Batch1: 3 (input_sequence_size) * 3 (n_output) + -0.00692428, 0.0848741, 0.063445, // seq 0 + -0.00403912, 0.139963, 0.072681, // seq 1 + 0.00752706, 0.161903, 0.0561371, // seq 2 + }}; + + VerifyGoldens(layer_norm_lstm_input_, layer_norm_lstm_golden_output, + &layer_norm_lstm); +} + +TEST_F(NoCifgPeepholeProjectionNoClippingLayerNormLstmTest, + HybridLayerNormLstmBlackBoxTest) { + const int n_batch = 2; + const int n_input = 5; + const int n_cell = 4; + const int n_output = 3; + const float ceil_clip = 0.0; + const float proj_clip = 0.0; + + HybridLayerNormLSTMOpModel layer_norm_lstm( + n_batch, n_input, n_cell, n_output, + /*use_cifg=*/false, /*use_peephole=*/true, + /*use_projection_weights=*/true, + /*use_projection_bias=*/false, ceil_clip, proj_clip, + { + {n_batch, n_input}, // input tensor + + {n_cell, n_input}, // input_to_input_weight tensor + {n_cell, n_input}, // input_to_forget_weight tensor + {n_cell, n_input}, // input_to_cell_weight tensor + {n_cell, n_input}, // input_to_output_weight tensor + + {n_cell, n_output}, // recurrent_to_input_weight tensor + {n_cell, n_output}, // recurrent_to_forget_weight tensor + {n_cell, n_output}, // recurrent_to_cell_weight tensor + {n_cell, n_output}, // recurrent_to_output_weight tensor + + {n_cell}, // cell_to_input_weight tensor + {n_cell}, // cell_to_forget_weight tensor + {n_cell}, // cell_to_output_weight tensor + + {n_cell}, // input_layer_norm_weight tensor + {n_cell}, // forget_layer_norm_weight tensor + {n_cell}, // cell_layer_norm_weight tensor + {n_cell}, // output_layer_norm_weight tensor + + {n_cell}, // input_gate_bias tensor + {n_cell}, // forget_gate_bias tensor + {n_cell}, // cell_bias tensor + {n_cell}, // output_gate_bias tensor + + {n_output, n_cell}, // projection_weight tensor + {0}, // projection_bias tensor + }); + + layer_norm_lstm.SetInputToInputWeights(input_to_input_weights_); + layer_norm_lstm.SetInputToCellWeights(input_to_cell_weights_); + layer_norm_lstm.SetInputToForgetWeights(input_to_forget_weights_); + layer_norm_lstm.SetInputToOutputWeights(input_to_output_weights_); + + layer_norm_lstm.SetInputGateBias(input_gate_bias_); + layer_norm_lstm.SetCellBias(cell_gate_bias_); + layer_norm_lstm.SetForgetGateBias(forget_gate_bias_); + 
layer_norm_lstm.SetOutputGateBias(output_gate_bias_); + + layer_norm_lstm.SetRecurrentToInputWeights(recurrent_to_input_weights_); + layer_norm_lstm.SetRecurrentToCellWeights(recurrent_to_cell_weights_); + layer_norm_lstm.SetRecurrentToForgetWeights(recurrent_to_forget_weights_); + layer_norm_lstm.SetRecurrentToOutputWeights(recurrent_to_output_weights_); + + layer_norm_lstm.SetCellToInputWeights(cell_to_input_weights_); + layer_norm_lstm.SetCellToForgetWeights(cell_to_forget_weights_); + layer_norm_lstm.SetCellToOutputWeights(cell_to_output_weights_); + + layer_norm_lstm.SetInputLayerNormWeights(input_layer_norm_weights_); + layer_norm_lstm.SetForgetLayerNormWeights(forget_layer_norm_weights_); + layer_norm_lstm.SetCellLayerNormWeights(cell_layer_norm_weights_); + layer_norm_lstm.SetOutputLayerNormWeights(output_layer_norm_weights_); + + layer_norm_lstm.SetProjectionWeights(projection_weights_); + + const std::vector<std::vector<float>> layer_norm_lstm_golden_output = { + { + // Batch0: 3 (input_sequence_size) * 3 (n_output) + 0.0244576, 0.127847, -0.00181765, // seq 0 + 0.0137518, 0.140892, 0.0402234, // seq 1 + -0.0048839, 0.155096, 0.0840309, // seq 2 + }, + { + // Batch1: 3 (input_sequence_size) * 3 (n_output) + -0.00728636, 0.0843957, 0.0634786, // seq 0 + -0.00448382, 0.139278, 0.0737372, // seq 1 + 0.00734616, 0.161793, 0.0560238, // seq 2 + }}; + + VerifyGoldens(layer_norm_lstm_input_, layer_norm_lstm_golden_output, + &layer_norm_lstm); +} + +} // namespace +} // namespace custom +} // namespace ops +} // namespace tflite + +int main(int argc, char** argv) { + ::tflite::LogToStderr(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/tensorflow/contrib/lite/kernels/local_response_norm.cc b/tensorflow/contrib/lite/kernels/local_response_norm.cc index c15a5170b8..334d2a2788 100644 --- a/tensorflow/contrib/lite/kernels/local_response_norm.cc +++ b/tensorflow/contrib/lite/kernels/local_response_norm.cc @@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#include "tensorflow/contrib/lite/builtin_op_data.h" -#include "tensorflow/contrib/lite/context.h" +#include "tensorflow/contrib/lite/c/builtin_op_data.h" +#include "tensorflow/contrib/lite/c/c_api_internal.h" #include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h" #include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h" #include "tensorflow/contrib/lite/kernels/internal/tensor.h" @@ -64,11 +64,15 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { TfLiteTensor* output = GetOutput(context, node, kOutputTensor); if (output->type == kTfLiteFloat32) { -#define TF_LITE_LOCAL_RESPONSE_NORM(type) \ - type::LocalResponseNormalization( \ - GetTensorData<float>(input), GetTensorDims(input), params->radius, \ - params->bias, params->alpha, params->beta, GetTensorData<float>(output), \ - GetTensorDims(output)) +#define TF_LITE_LOCAL_RESPONSE_NORM(type) \ + tflite::LocalResponseNormalizationParams op_params; \ + op_params.range = params->radius; \ + op_params.bias = params->bias; \ + op_params.alpha = params->alpha; \ + op_params.beta = params->beta; \ + type::LocalResponseNormalization( \ + op_params, GetTensorShape(input), GetTensorData<float>(input), \ + GetTensorShape(output), GetTensorData<float>(output)) if (kernel_type == kReference) { TF_LITE_LOCAL_RESPONSE_NORM(reference_ops); } @@ -77,7 +81,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } #undef TF_LITE_LOCAL_RESPONSE_NORM } else { - context->ReportError(context, "Inputs and outputs not all float types."); + context->ReportError(context, "Output type is %d, requires float.", + output->type); return kTfLiteError; } diff --git a/tensorflow/contrib/lite/kernels/log_softmax_test.cc b/tensorflow/contrib/lite/kernels/log_softmax_test.cc index 62820a2f51..1acc966cdc 100644 --- a/tensorflow/contrib/lite/kernels/log_softmax_test.cc +++ b/tensorflow/contrib/lite/kernels/log_softmax_test.cc @@ -90,10 +90,10 @@ TEST(LogSoftmaxOpTest, CompareWithTFmini) { m.Invoke(); std::unique_ptr<float[]> output_buffer(new float[input_size * batch_size]); - static tflite::Dims<4> input_dims = {{input_size, 1, 1, batch_size}, - {1, 0, 0, input_size}}; - tflite::reference_ops::LogSoftmax(input_buffer, input_dims, - output_buffer.get(), input_dims); + auto input_shape = RuntimeShape({batch_size, 1, 1, input_size}); + SoftmaxParams params; + tflite::reference_ops::LogSoftmax(params, input_shape, input_buffer, + input_shape, output_buffer.get()); std::vector<float> expected; expected.insert(expected.end(), output_buffer.get(), diff --git a/tensorflow/contrib/lite/kernels/logical.cc b/tensorflow/contrib/lite/kernels/logical.cc new file mode 100644 index 0000000000..f770cb35d1 --- /dev/null +++ b/tensorflow/contrib/lite/kernels/logical.cc @@ -0,0 +1,134 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include "tensorflow/contrib/lite/c/c_api_internal.h" +#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h" +#include "tensorflow/contrib/lite/kernels/internal/tensor.h" +#include "tensorflow/contrib/lite/kernels/kernel_util.h" +#include "tensorflow/contrib/lite/kernels/op_macros.h" + +namespace tflite { +namespace ops { +namespace builtin { +namespace logical { +namespace { + +// Input/output tensor index. +constexpr int kInputTensor1 = 0; +constexpr int kInputTensor2 = 1; +constexpr int kOutputTensor = 0; + +// Op data for logical op. +struct OpData { + bool requires_broadcast; +}; + +void* Init(TfLiteContext* context, const char* buffer, size_t length) { + auto* data = new OpData; + data->requires_broadcast = false; + return data; +} + +void Free(TfLiteContext* context, void* buffer) { + delete reinterpret_cast<OpData*>(buffer); +} + +TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { + TF_LITE_ENSURE_EQ(context, NumInputs(node), 2); + TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); + + // Reinterprete the opaque data provided by user. + OpData* data = reinterpret_cast<OpData*>(node->user_data); + + const TfLiteTensor* input1 = GetInput(context, node, kInputTensor1); + const TfLiteTensor* input2 = GetInput(context, node, kInputTensor2); + TfLiteTensor* output = GetOutput(context, node, kOutputTensor); + + TF_LITE_ENSURE_EQ(context, input1->type, input2->type); + + const TfLiteType type = input1->type; + if (type != kTfLiteBool) { + context->ReportError(context, "Logical ops only support bool type."); + return kTfLiteError; + } + output->type = type; + + data->requires_broadcast = !HaveSameShapes(input1, input2); + + TfLiteIntArray* output_size = nullptr; + if (data->requires_broadcast) { + TF_LITE_ENSURE_OK(context, CalculateShapeForBroadcast( + context, input1, input2, &output_size)); + } else { + output_size = TfLiteIntArrayCopy(input1->dims); + } + + return context->ResizeTensor(context, output, output_size); +} + +TfLiteStatus LogicalImpl(TfLiteContext* context, TfLiteNode* node, + const std::function<bool(bool, bool)>& func) { + OpData* data = reinterpret_cast<OpData*>(node->user_data); + + const TfLiteTensor* input1 = GetInput(context, node, kInputTensor1); + const TfLiteTensor* input2 = GetInput(context, node, kInputTensor2); + TfLiteTensor* output = GetOutput(context, node, kOutputTensor); + + if (data->requires_broadcast) { + reference_ops::BroadcastLogical4DSlow( + GetTensorShape(input1), GetTensorData<bool>(input1), + GetTensorShape(input2), GetTensorData<bool>(input2), + GetTensorShape(output), GetTensorData<bool>(output), func); + } else { + reference_ops::Logical(GetTensorShape(input1), GetTensorData<bool>(input1), + GetTensorShape(input2), GetTensorData<bool>(input2), + GetTensorShape(output), GetTensorData<bool>(output), + func); + } + + return kTfLiteOk; +} + +TfLiteStatus LogicalOrEval(TfLiteContext* context, TfLiteNode* node) { + const auto logical_or_func = std::logical_or<bool>(); + return LogicalImpl(context, node, logical_or_func); +} + +TfLiteStatus LogicalAndEval(TfLiteContext* context, TfLiteNode* node) { + const auto logical_and_func = std::logical_and<bool>(); + return LogicalImpl(context, node, logical_and_func); +} + +} // namespace +} // namespace logical + +TfLiteRegistration* Register_LOGICAL_OR() { + // Init, Free, Prepare, Eval are satisfying the Interface required by + // TfLiteRegistration. 
+ static TfLiteRegistration r = {logical::Init, logical::Free, logical::Prepare, + logical::LogicalOrEval}; + return &r; +} + +TfLiteRegistration* Register_LOGICAL_AND() { + // Init, Free, Prepare, Eval are satisfying the Interface required by + // TfLiteRegistration. + static TfLiteRegistration r = {logical::Init, logical::Free, logical::Prepare, + logical::LogicalAndEval}; + return &r; +} + +} // namespace builtin +} // namespace ops +} // namespace tflite diff --git a/tensorflow/contrib/lite/kernels/logical_test.cc b/tensorflow/contrib/lite/kernels/logical_test.cc new file mode 100644 index 0000000000..206cbde98f --- /dev/null +++ b/tensorflow/contrib/lite/kernels/logical_test.cc @@ -0,0 +1,112 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include <gtest/gtest.h> +#include "tensorflow/contrib/lite/interpreter.h" +#include "tensorflow/contrib/lite/kernels/register.h" +#include "tensorflow/contrib/lite/kernels/test_util.h" +#include "tensorflow/contrib/lite/model.h" + +namespace tflite { +namespace { + +using ::testing::ElementsAre; + +class LogicalOpModel : public SingleOpModel { + public: + LogicalOpModel(std::initializer_list<int> input1_shape, + std::initializer_list<int> input2_shape, BuiltinOperator op) { + input1_ = AddInput(TensorType_BOOL); + input2_ = AddInput(TensorType_BOOL); + output_ = AddOutput(TensorType_BOOL); + ConfigureBuiltinOp(op); + BuildInterpreter({input1_shape, input2_shape}); + } + + int input1() { return input1_; } + int input2() { return input2_; } + + std::vector<bool> GetOutput() { return ExtractVector<bool>(output_); } + std::vector<int> GetOutputShape() { return GetTensorShape(output_); } + + private: + int input1_; + int input2_; + int output_; + + void ConfigureBuiltinOp(BuiltinOperator op) { + switch (op) { + case BuiltinOperator_LOGICAL_OR: { + SetBuiltinOp(op, BuiltinOptions_LogicalOrOptions, + CreateLogicalOrOptions(builder_).Union()); + break; + } + case BuiltinOperator_LOGICAL_AND: { + SetBuiltinOp(op, BuiltinOptions_LogicalAndOptions, + CreateLogicalAndOptions(builder_).Union()); + break; + } + default: { FAIL() << "We shouldn't get here."; } + } + } +}; + +TEST(LogicalTest, LogicalOr) { + LogicalOpModel model({1, 1, 1, 4}, {1, 1, 1, 4}, BuiltinOperator_LOGICAL_OR); + model.PopulateTensor<bool>(model.input1(), {true, false, false, true}); + model.PopulateTensor<bool>(model.input2(), {true, false, true, false}); + model.Invoke(); + + EXPECT_THAT(model.GetOutput(), ElementsAre(true, false, true, true)); + EXPECT_THAT(model.GetOutputShape(), ElementsAre(1, 1, 1, 4)); +} + +TEST(LogicalTest, BroadcastLogicalOr) { + LogicalOpModel model({1, 1, 1, 4}, {1, 1, 1, 1}, BuiltinOperator_LOGICAL_OR); + model.PopulateTensor<bool>(model.input1(), {true, false, false, true}); + model.PopulateTensor<bool>(model.input2(), {false}); + model.Invoke(); + + EXPECT_THAT(model.GetOutput(), ElementsAre(true, false, false, true)); + 
EXPECT_THAT(model.GetOutputShape(), ElementsAre(1, 1, 1, 4)); +} + +TEST(LogicalTest, LogicalAnd) { + LogicalOpModel model({1, 1, 1, 4}, {1, 1, 1, 4}, BuiltinOperator_LOGICAL_AND); + model.PopulateTensor<bool>(model.input1(), {true, false, false, true}); + model.PopulateTensor<bool>(model.input2(), {true, false, true, false}); + model.Invoke(); + + EXPECT_THAT(model.GetOutput(), ElementsAre(true, false, false, false)); + EXPECT_THAT(model.GetOutputShape(), ElementsAre(1, 1, 1, 4)); +} + +TEST(LogicalTest, BroadcastLogicalAnd) { + LogicalOpModel model({1, 1, 1, 4}, {1, 1, 1, 1}, BuiltinOperator_LOGICAL_AND); + model.PopulateTensor<bool>(model.input1(), {true, false, false, true}); + model.PopulateTensor<bool>(model.input2(), {true}); + model.Invoke(); + + EXPECT_THAT(model.GetOutput(), ElementsAre(true, false, false, true)); + EXPECT_THAT(model.GetOutputShape(), ElementsAre(1, 1, 1, 4)); +} + +} // namespace +} // namespace tflite + +int main(int argc, char** argv) { + ::tflite::LogToStderr(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/tensorflow/contrib/lite/kernels/lsh_projection.cc b/tensorflow/contrib/lite/kernels/lsh_projection.cc index 25d2dc2cdd..9fa1c5f100 100644 --- a/tensorflow/contrib/lite/kernels/lsh_projection.cc +++ b/tensorflow/contrib/lite/kernels/lsh_projection.cc @@ -50,7 +50,6 @@ limitations under the License. // Output.Dim == { Tensor[0].Dim[0] * Tensor[0].Dim[1] } // A flattened tensor represents projected bit vectors. -#include <unistd.h> #include <cassert> #include <cmath> #include <cstdio> @@ -60,8 +59,8 @@ limitations under the License. #include <limits> #include <memory> -#include "tensorflow/contrib/lite/builtin_op_data.h" -#include "tensorflow/contrib/lite/context.h" +#include "tensorflow/contrib/lite/c/builtin_op_data.h" +#include "tensorflow/contrib/lite/c/c_api_internal.h" #include "tensorflow/contrib/lite/kernels/kernel_util.h" #include "tensorflow/contrib/lite/kernels/op_macros.h" #include <farmhash.h> diff --git a/tensorflow/contrib/lite/kernels/lstm.cc b/tensorflow/contrib/lite/kernels/lstm.cc index 990b3da055..16d67a1a93 100644 --- a/tensorflow/contrib/lite/kernels/lstm.cc +++ b/tensorflow/contrib/lite/kernels/lstm.cc @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include <unistd.h> #include <cassert> #include <cmath> #include <cstdio> @@ -21,12 +20,16 @@ limitations under the License. #include <iostream> #include <limits> -#include "tensorflow/contrib/lite/builtin_op_data.h" -#include "tensorflow/contrib/lite/context.h" +#include "tensorflow/contrib/lite/c/builtin_op_data.h" +#include "tensorflow/contrib/lite/c/c_api_internal.h" #include "tensorflow/contrib/lite/kernels/activation_functor.h" +#include "tensorflow/contrib/lite/kernels/gemm_support.h" #include "tensorflow/contrib/lite/kernels/internal/kernel_utils.h" +#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h" +#include "tensorflow/contrib/lite/kernels/internal/tensor.h" #include "tensorflow/contrib/lite/kernels/internal/tensor_utils.h" #include "tensorflow/contrib/lite/kernels/kernel_util.h" +#include "tensorflow/contrib/lite/kernels/lstm_eval.h" #include "tensorflow/contrib/lite/kernels/op_macros.h" namespace tflite { @@ -34,6 +37,20 @@ namespace ops { namespace builtin { namespace lstm { +struct OpData { + // Which kernel type to use. 
Full kernel (20 inputs) or basic kernel + // (5 inputs). + TfLiteLSTMKernelType kernel_type; + + // These fields are only used by full kernel. + int activation_state_tensor_index; + int cell_state_tensor_index; + int scratch_tensor_index; +}; + +// For full inputs kernel (20-inputs). +namespace full { + // Input Tensors of size {n_batch, n_input} constexpr int kInputTensor = 0; @@ -65,26 +82,27 @@ constexpr int kProjectionWeightsTensor = 16; // Optional // Projection bias tensor of size {n_output} constexpr int kProjectionBiasTensor = 17; // Optional +// These state tensors are defined as variable tensors, and will be modified by +// this op. +constexpr int kInputActivationStateTensor = 18; +constexpr int kInputCellStateTensor = 19; + // Output tensors. -constexpr int kOutputStateTensor = 0; -constexpr int kCellStateTensor = 1; -constexpr int kOutputTensor = 2; +constexpr int kOutputTensor = 0; void* Init(TfLiteContext* context, const char* buffer, size_t length) { - auto* scratch_tensor_index = new int; - context->AddTensors(context, 1, scratch_tensor_index); - return scratch_tensor_index; -} - -void Free(TfLiteContext* context, void* buffer) { - delete reinterpret_cast<int*>(buffer); + auto* op_data = new OpData(); + op_data->kernel_type = kTfLiteLSTMFullKernel; + context->AddTensors(context, /*tensors_to_add=*/7, + &op_data->scratch_tensor_index); + return op_data; } // Check that input tensor dimensions matches with each other. TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context, TfLiteNode* node, int n_input, int n_output, int n_cell) { - auto* params = reinterpret_cast<TfLiteLSTMParams*>(node->builtin_data); + const auto* params = reinterpret_cast<TfLiteLSTMParams*>(node->builtin_data); // Making sure clipping parameters have valid values. 
// == 0 means no clipping @@ -94,7 +112,7 @@ TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context, const TfLiteTensor* input_to_input_weights = GetOptionalInputTensor(context, node, kInputToInputWeightsTensor); - if (input_to_input_weights) { + if (input_to_input_weights != nullptr) { TF_LITE_ENSURE_EQ(context, input_to_input_weights->dims->size, 2); TF_LITE_ENSURE_EQ(context, input_to_input_weights->dims->data[0], n_cell); TF_LITE_ENSURE_EQ(context, input_to_input_weights->dims->data[1], n_input); @@ -114,7 +132,7 @@ TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context, const TfLiteTensor* recurrent_to_input_weights = GetOptionalInputTensor(context, node, kRecurrentToInputWeightsTensor); - if (recurrent_to_input_weights) { + if (recurrent_to_input_weights != nullptr) { TF_LITE_ENSURE_EQ(context, recurrent_to_input_weights->dims->size, 2); TF_LITE_ENSURE_EQ(context, recurrent_to_input_weights->dims->data[0], n_cell); @@ -204,7 +222,7 @@ TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context, const TfLiteTensor* projection_weights = GetOptionalInputTensor(context, node, kProjectionWeightsTensor); - if (projection_weights) { + if (projection_weights != nullptr) { TF_LITE_ENSURE_EQ(context, projection_weights->dims->size, 2); TF_LITE_ENSURE_EQ(context, projection_weights->dims->data[0], n_output); TF_LITE_ENSURE_EQ(context, projection_weights->dims->data[1], n_cell); @@ -212,7 +230,7 @@ TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context, const TfLiteTensor* projection_bias = GetOptionalInputTensor(context, node, kProjectionBiasTensor); - if (projection_bias) { + if (projection_bias != nullptr) { TF_LITE_ENSURE_EQ(context, projection_bias->dims->size, 1); TF_LITE_ENSURE_EQ(context, projection_bias->dims->data[0], n_output); } @@ -233,15 +251,19 @@ TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context, // Allocate a temporary scratch tensor. Also check that the sizes of the input // tensors match each other. TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { - int* scratch_tensor_index = reinterpret_cast<int*>(node->user_data); + OpData* op_data = reinterpret_cast<OpData*>(node->user_data); + + TF_LITE_ENSURE_EQ(context, node->outputs->size, 1); + TF_LITE_ENSURE_EQ(context, node->inputs->size, 20); - // Check we have all the inputs and outputs we need. - TF_LITE_ENSURE_EQ(context, node->inputs->size, 18); - TF_LITE_ENSURE_EQ(context, node->outputs->size, 3); + op_data->activation_state_tensor_index = + node->inputs->data[kInputActivationStateTensor]; + op_data->cell_state_tensor_index = node->inputs->data[kInputCellStateTensor]; // Inferring batch size, number of outputs and number of cells from the // input tensors. const TfLiteTensor* input = GetInput(context, node, kInputTensor); + TF_LITE_ENSURE_EQ(context, input->type, kTfLiteFloat32); TF_LITE_ENSURE(context, input->dims->size > 1); const int n_batch = input->dims->data[0]; const int n_input = input->dims->data[1]; @@ -260,68 +282,153 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { const int n_output = recurrent_to_output_weights->dims->data[1]; // Check that input tensor dimensions matches with each other. - CheckInputTensorDimensions(context, node, n_input, n_output, n_cell); + TF_LITE_ENSURE_OK(context, CheckInputTensorDimensions(context, node, n_input, + n_output, n_cell)); - // Get the pointer to output, output_state and cell_state tensors. + // Get the pointer to output, activation_state and cell_state tensors. 
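+  // (The activation and cell state are accessed through context->tensors[...]
+  // because Eval updates them in place, so their contents persist across
+  // invocations.)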
TfLiteTensor* output = GetOutput(context, node, kOutputTensor); - TfLiteTensor* output_state = GetOutput(context, node, kOutputStateTensor); - TfLiteTensor* cell_state = GetOutput(context, node, kCellStateTensor); - // Resize the output, output_state and cell_state tensors. + TfLiteTensor* activation_state = + &context->tensors[op_data->activation_state_tensor_index]; + TfLiteTensor* cell_state = + &context->tensors[op_data->cell_state_tensor_index]; + + // Check the shape of input state tensors. + // These tensor may be 1D or 2D. It's fine as long as the total size is + // correct. + TF_LITE_ENSURE_EQ(context, NumElements(activation_state), n_batch * n_output); + TF_LITE_ENSURE_EQ(context, NumElements(cell_state), n_batch * n_cell); + + // Resize the output tensors. TfLiteIntArray* output_size = TfLiteIntArrayCreate(2); output_size->data[0] = n_batch; output_size->data[1] = n_output; TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, output, output_size)); - TfLiteIntArray* output_state_size = TfLiteIntArrayCreate(2); - output_state_size->data[0] = n_batch; - output_state_size->data[1] = n_output; - TF_LITE_ENSURE_OK( - context, context->ResizeTensor(context, output_state, output_state_size)); + // The weights are of consistent type, so it suffices to check one. + // TODO(mirkov): create a utility/macro for this check, so all Ops can use it. + const bool is_hybrid_op = (input_to_output_weights->type == kTfLiteUInt8 && + input->type == kTfLiteFloat32); - TfLiteIntArray* cell_size = TfLiteIntArrayCreate(2); - cell_size->data[0] = n_batch; - cell_size->data[1] = n_cell; - TF_LITE_ENSURE_OK(context, - context->ResizeTensor(context, cell_state, cell_size)); + TfLiteIntArrayFree(node->temporaries); + if (is_hybrid_op) { + node->temporaries = TfLiteIntArrayCreate(7); + } else { + node->temporaries = TfLiteIntArrayCreate(1); + } + node->temporaries->data[0] = op_data->scratch_tensor_index; // Create a scratch buffer tensor. - TfLiteIntArrayFree(node->temporaries); - node->temporaries = TfLiteIntArrayCreate(1); - node->temporaries->data[0] = *scratch_tensor_index; TfLiteTensor* scratch_buffer = GetTemporary(context, node, /*index=*/0); scratch_buffer->type = input->type; scratch_buffer->allocation_type = kTfLiteArenaRw; - // Mark state tensors as persistent tensors. 
- output_state->allocation_type = kTfLiteArenaRwPersistent; - cell_state->allocation_type = kTfLiteArenaRwPersistent; - const TfLiteTensor* input_to_input_weights = GetOptionalInputTensor(context, node, kInputToInputWeightsTensor); const bool use_cifg = (input_to_input_weights == nullptr); + TfLiteIntArray* scratch_buffer_size = TfLiteIntArrayCreate(2); + scratch_buffer_size->data[0] = n_batch; if (use_cifg) { - TfLiteIntArray* scratch_buffer_size = TfLiteIntArrayCreate(2); - scratch_buffer_size->data[0] = n_batch; // Reserving space for Cell, Forget, Output gates scratch_buffer_size->data[1] = n_cell * 3; - TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, scratch_buffer, - scratch_buffer_size)); } else { - TfLiteIntArray* scratch_buffer_size = TfLiteIntArrayCreate(2); - scratch_buffer_size->data[0] = n_batch; // Reserving space for Input, Cell, Forget, Output gates scratch_buffer_size->data[1] = n_cell * 4; - TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, scratch_buffer, - scratch_buffer_size)); + } + TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, scratch_buffer, + scratch_buffer_size)); + + if (is_hybrid_op) { + // Allocate temporary tensors to store quantized values of input, + // activation_state and cell_state tensors. + node->temporaries->data[1] = op_data->scratch_tensor_index + 1; + TfLiteTensor* input_quantized = GetTemporary(context, node, /*index=*/1); + input_quantized->type = kTfLiteUInt8; + input_quantized->allocation_type = kTfLiteArenaRw; + if (!TfLiteIntArrayEqual(input_quantized->dims, input->dims)) { + TfLiteIntArray* input_quantized_size = TfLiteIntArrayCopy(input->dims); + TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, input_quantized, + input_quantized_size)); + } + node->temporaries->data[2] = op_data->scratch_tensor_index + 2; + TfLiteTensor* activation_state_quantized = + GetTemporary(context, node, /*index=*/2); + activation_state_quantized->type = kTfLiteUInt8; + activation_state_quantized->allocation_type = kTfLiteArenaRw; + if (!TfLiteIntArrayEqual(activation_state_quantized->dims, + activation_state->dims)) { + TfLiteIntArray* activation_state_quantized_size = + TfLiteIntArrayCopy(activation_state->dims); + TF_LITE_ENSURE_OK( + context, context->ResizeTensor(context, activation_state_quantized, + activation_state_quantized_size)); + } + node->temporaries->data[3] = op_data->scratch_tensor_index + 3; + TfLiteTensor* cell_state_quantized = + GetTemporary(context, node, /*index=*/3); + cell_state_quantized->type = kTfLiteUInt8; + cell_state_quantized->allocation_type = kTfLiteArenaRw; + if (!TfLiteIntArrayEqual(cell_state_quantized->dims, cell_state->dims)) { + TfLiteIntArray* cell_state_quantized_size = + TfLiteIntArrayCopy(cell_state->dims); + TF_LITE_ENSURE_OK(context, + context->ResizeTensor(context, cell_state_quantized, + cell_state_quantized_size)); + } + + // Allocate temporary tensors to store scaling factors and product scaling + // factors. The latter is a convenience storage which allows to quantize + // a vector once (which produces the scaling factors) and multiply it with + // different matrices (which requires multiplying the scaling factors with + // the scaling factor of the matrix). 
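+    // E.g. if one batch row quantizes with scale 0.02 and a weight matrix has
+    // scale 0.5, the int32 accumulation for that row is rescaled by
+    // 0.02 * 0.5 = 0.01 to recover float values (illustrative numbers only).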
+ node->temporaries->data[4] = op_data->scratch_tensor_index + 4; + TfLiteTensor* scaling_factors = GetTemporary(context, node, /*index=*/4); + scaling_factors->type = kTfLiteFloat32; + scaling_factors->allocation_type = kTfLiteArenaRw; + TfLiteIntArray* scaling_factors_size = TfLiteIntArrayCreate(1); + scaling_factors_size->data[0] = n_batch; + if (!TfLiteIntArrayEqual(scaling_factors->dims, scaling_factors_size)) { + TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, scaling_factors, + scaling_factors_size)); + } + node->temporaries->data[5] = op_data->scratch_tensor_index + 5; + TfLiteTensor* prod_scaling_factors = + GetTemporary(context, node, /*index=*/5); + prod_scaling_factors->type = kTfLiteFloat32; + prod_scaling_factors->allocation_type = kTfLiteArenaRw; + TfLiteIntArray* prod_scaling_factors_size = TfLiteIntArrayCreate(1); + prod_scaling_factors_size->data[0] = n_batch; + if (!TfLiteIntArrayEqual(prod_scaling_factors->dims, + prod_scaling_factors_size)) { + TF_LITE_ENSURE_OK(context, + context->ResizeTensor(context, prod_scaling_factors, + prod_scaling_factors_size)); + } + + // Allocate a temporary tensor to store the recovered cell weights. Since + // this is used for diagonal matrices, only need to store n_cell values. + node->temporaries->data[6] = op_data->scratch_tensor_index + 6; + TfLiteTensor* recovered_cell_weights = + GetTemporary(context, node, /*index=*/6); + recovered_cell_weights->type = kTfLiteFloat32; + recovered_cell_weights->allocation_type = kTfLiteArenaRw; + TfLiteIntArray* recovered_cell_weights_size = TfLiteIntArrayCreate(1); + recovered_cell_weights_size->data[0] = n_cell; + if (!TfLiteIntArrayEqual(recovered_cell_weights->dims, + recovered_cell_weights_size)) { + TF_LITE_ENSURE_OK(context, + context->ResizeTensor(context, recovered_cell_weights, + recovered_cell_weights_size)); + } } return kTfLiteOk; } -// The LSTM Op engine. TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { - auto* params = reinterpret_cast<TfLiteLSTMParams*>(node->builtin_data); + const auto* params = reinterpret_cast<TfLiteLSTMParams*>(node->builtin_data); + OpData* op_data = reinterpret_cast<OpData*>(node->user_data); + const TfLiteTensor* input = GetInput(context, node, kInputTensor); const TfLiteTensor* input_to_input_weights = @@ -362,91 +469,303 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { const TfLiteTensor* projection_bias = GetOptionalInputTensor(context, node, kProjectionBiasTensor); - TfLiteTensor* output_state = GetOutput(context, node, kOutputStateTensor); - TfLiteTensor* cell_state = GetOutput(context, node, kCellStateTensor); + // Index the scratch buffers pointers to the global scratch buffer. + TfLiteTensor* scratch_buffer = GetTemporary(context, node, /*index=*/0); + + TfLiteTensor* activation_state = + &context->tensors[op_data->activation_state_tensor_index]; + TfLiteTensor* cell_state = + &context->tensors[op_data->cell_state_tensor_index]; + TfLiteTensor* output = GetOutput(context, node, kOutputTensor); - const int n_batch = input->dims->data[0]; - const int n_input = input->dims->data[1]; - // n_cell and n_output will be the same size when there is no projection. - const int n_cell = input_to_output_weights->dims->data[0]; - const int n_output = recurrent_to_output_weights->dims->data[1]; + // TODO(mirkov): add a check that weights are all uint8s or all floats. 
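+  // The weight type selects the kernel below: float weights take the float
+  // path, uint8 weights take the hybrid path (quantized weights, float
+  // activations).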
+ switch (input_to_output_weights->type) { + case kTfLiteFloat32: { + return lstm_eval::EvalFloat( + input, input_to_input_weights, input_to_forget_weights, + input_to_cell_weights, input_to_output_weights, + recurrent_to_input_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights, + cell_to_input_weights, cell_to_forget_weights, cell_to_output_weights, + /*aux_input=*/nullptr, + /*aux_input_to_input_weights=*/nullptr, + /*aux_input_to_forget_weights=*/nullptr, + /*aux_input_to_cell_weights=*/nullptr, + /*aux_input_to_output_weights=*/nullptr, input_gate_bias, + forget_gate_bias, cell_bias, output_gate_bias, projection_weights, + projection_bias, params, /*forward_sequence=*/true, + /*output_offset=*/0, scratch_buffer, activation_state, cell_state, + output); + } + case kTfLiteUInt8: { + TfLiteTensor* input_quantized = GetTemporary(context, node, /*index=*/1); + TfLiteTensor* activation_state_quantized = + GetTemporary(context, node, /*index=*/2); + TfLiteTensor* cell_state_quantized = + GetTemporary(context, node, /*index=*/3); + TfLiteTensor* scaling_factors = GetTemporary(context, node, /*index=*/4); + TfLiteTensor* prod_scaling_factors = + GetTemporary(context, node, /*index=*/5); + TfLiteTensor* recovered_cell_weights = + GetTemporary(context, node, /*index=*/6); + return lstm_eval::EvalHybrid( + input, input_to_input_weights, input_to_forget_weights, + input_to_cell_weights, input_to_output_weights, + recurrent_to_input_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights, + cell_to_input_weights, cell_to_forget_weights, cell_to_output_weights, + /*aux_input=*/nullptr, + /*aux_input_to_input_weights=*/nullptr, + /*aux_input_to_forget_weights=*/nullptr, + /*aux_input_to_cell_weights=*/nullptr, + /*aux_input_to_output_weights=*/nullptr, input_gate_bias, + forget_gate_bias, cell_bias, output_gate_bias, projection_weights, + projection_bias, params, /*forward_sequence=*/true, + /*output_offset=*/0, scratch_buffer, scaling_factors, + prod_scaling_factors, recovered_cell_weights, input_quantized, + /*aux_input_quantized=*/nullptr, activation_state_quantized, + cell_state_quantized, activation_state, cell_state, output); + } + default: + context->ReportError(context, "Type %d is not currently supported.", + input_to_output_weights->type); + return kTfLiteError; + } + return kTfLiteOk; +} - // Since we have already checked that weights are all there or none, we can - // check the existence of only one to get the condition. - const bool use_cifg = (input_to_input_weights == nullptr); - const bool use_peephole = (cell_to_output_weights != nullptr); +} // namespace full - // Index the scratch buffers pointers to the global scratch buffer. - TfLiteTensor* scratch_buffer = GetTemporary(context, node, /*index=*/0); +// For basic kernel (5-inputs). 
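+// Its inputs are the input data, the previous activation, one combined
+// weight matrix, one combined bias vector, and the previous cell state.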
+namespace basic { - float* input_gate_scratch = nullptr; - float* cell_scratch = nullptr; - float* forget_gate_scratch = nullptr; - float* output_gate_scratch = nullptr; - if (use_cifg) { - cell_scratch = scratch_buffer->data.f; - forget_gate_scratch = scratch_buffer->data.f + n_cell * n_batch; - output_gate_scratch = scratch_buffer->data.f + 2 * n_cell * n_batch; +enum InputTensor { + kInputData = 0, + kInputPrevActivation = 1, + kInputWeights = 2, + kInputBiases = 3, + kInputPrevState = 4, + kInputNum = 5, +}; + +enum OutputTensor { + kOutputActivation = 0, + kOutputState = 1, + kOutputConcatTemp = 2, + kOutputActivationTemp = 3, + kOutputNum = 4, +}; + +void* Init(TfLiteContext* context, const char* buffer, size_t length) { + auto* op_data = new OpData(); + op_data->kernel_type = kTfLiteLSTMBasicKernel; + // `scratch_tensor_index` is unused in this kernel. + op_data->scratch_tensor_index = -1; + return op_data; +} + +TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { + TF_LITE_ENSURE(context, node->inputs->size == kInputNum); + TF_LITE_ENSURE(context, node->outputs->size == kOutputNum); + + const TfLiteTensor* input = GetInput(context, node, kInputData); + const TfLiteTensor* prev_activation = + GetInput(context, node, kInputPrevActivation); + const TfLiteTensor* weights = GetInput(context, node, kInputWeights); + const TfLiteTensor* bias = GetInput(context, node, kInputBiases); + const TfLiteTensor* prev_state = GetInput(context, node, kInputPrevState); + + TF_LITE_ENSURE_EQ(context, input->dims->size, 2); + const int num_batches = input->dims->data[0]; + const int input_depth = input->dims->data[1]; + + TF_LITE_ENSURE_EQ(context, prev_activation->dims->size, 2); + TF_LITE_ENSURE_EQ(context, prev_activation->dims->data[0], num_batches); + const int activation_depth = prev_activation->dims->data[1]; + const int total_depth = input_depth + activation_depth; + + TF_LITE_ENSURE_EQ(context, weights->dims->size, 2); + TF_LITE_ENSURE_EQ(context, weights->dims->data[0], 4 * activation_depth); + TF_LITE_ENSURE_EQ(context, weights->dims->data[1], total_depth); + + TF_LITE_ENSURE_EQ(context, bias->dims->size, 1); + TF_LITE_ENSURE_EQ(context, bias->dims->data[0], 4 * activation_depth); + + TF_LITE_ENSURE_EQ(context, prev_state->dims->size, 2); + TF_LITE_ENSURE_EQ(context, prev_state->dims->data[0], num_batches); + TF_LITE_ENSURE_EQ(context, prev_state->dims->data[1], activation_depth); + + TfLiteTensor* activation_out = GetOutput(context, node, kOutputActivation); + TfLiteTensor* state_out = GetOutput(context, node, kOutputState); + TfLiteTensor* concat_temp = GetOutput(context, node, kOutputConcatTemp); + TfLiteTensor* activation_temp = + GetOutput(context, node, kOutputActivationTemp); + + TF_LITE_ENSURE_OK(context, context->ResizeTensor( + context, activation_out, + TfLiteIntArrayCopy(prev_activation->dims))); + TF_LITE_ENSURE_OK( + context, context->ResizeTensor(context, state_out, + TfLiteIntArrayCopy(prev_state->dims))); + + TfLiteIntArray* concat_temp_size = TfLiteIntArrayCreate(2); + concat_temp_size->data[0] = num_batches; + concat_temp_size->data[1] = total_depth; + TF_LITE_ENSURE_OK( + context, context->ResizeTensor(context, concat_temp, concat_temp_size)); + TfLiteIntArray* activation_temp_size = TfLiteIntArrayCreate(2); + activation_temp_size->data[0] = num_batches; + activation_temp_size->data[1] = 4 * activation_depth; + TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, activation_temp, + activation_temp_size)); + + // Set the state tensors as persistent. 
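+  // (kTfLiteArenaRwPersistent keeps these buffers allocated between
+  // invocations, so the previous activation and state carry over.)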
+ for (auto index : {kInputPrevActivation, kInputPrevState}) { + TfLiteTensor* tensor = &context->tensors[node->inputs->data[index]]; + tensor->allocation_type = kTfLiteArenaRwPersistent; + } + return kTfLiteOk; +} + +TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { + const TfLiteTensor* input = GetInput(context, node, kInputData); + const TfLiteTensor* prev_activation = + GetInput(context, node, kInputPrevActivation); + const TfLiteTensor* weights = GetInput(context, node, kInputWeights); + const TfLiteTensor* bias = GetInput(context, node, kInputBiases); + const TfLiteTensor* prev_state = GetInput(context, node, kInputPrevState); + + TfLiteTensor* activation_out = GetOutput(context, node, kOutputActivation); + TfLiteTensor* state_out = GetOutput(context, node, kOutputState); + TfLiteTensor* concat_temp = GetOutput(context, node, kOutputConcatTemp); + TfLiteTensor* activation_temp = + GetOutput(context, node, kOutputActivationTemp); + + if (input->type == kTfLiteFloat32 && + prev_activation->type == kTfLiteFloat32 && + weights->type == kTfLiteFloat32 && bias->type == kTfLiteFloat32 && + prev_state->type == kTfLiteFloat32 && state_out->type == kTfLiteFloat32 && + activation_out->type == kTfLiteFloat32 && + concat_temp->type == kTfLiteFloat32 && + activation_temp->type == kTfLiteFloat32) { + tflite::LstmCellParams op_params; + // Float LSTM cell does not need parameters to be set: leave untouched. + optimized_ops::LstmCell( + op_params, + // Inputs. + GetTensorShape(input), GetTensorData<float>(input), + GetTensorShape(prev_activation), GetTensorData<float>(prev_activation), + GetTensorShape(weights), GetTensorData<float>(weights), + GetTensorShape(bias), GetTensorData<float>(bias), + GetTensorShape(prev_state), GetTensorData<float>(prev_state), + // Outputs. + GetTensorShape(state_out), GetTensorData<float>(state_out), + GetTensorShape(activation_out), GetTensorData<float>(activation_out), + GetTensorShape(concat_temp), GetTensorData<float>(concat_temp), + GetTensorShape(activation_temp), GetTensorData<float>(activation_temp)); + } else if (input->type == kTfLiteUInt8 && + prev_activation->type == kTfLiteUInt8 && + weights->type == kTfLiteUInt8 && bias->type == kTfLiteInt32 && + prev_state->type == kTfLiteInt16 && + state_out->type == kTfLiteInt16 && + activation_out->type == kTfLiteUInt8 && + concat_temp->type == kTfLiteUInt8 && + activation_temp->type == kTfLiteInt16) { + gemmlowp::GemmContext* gemm_context = gemm_support::GetFromContext(context); + int state_scale_log2_rounded; + if (!CheckedLog2(state_out->params.scale, &state_scale_log2_rounded)) { + context->ReportError( + context, + "The internal state of a LSTM cell must have a power-of-two scale."); + return kTfLiteError; + } + const int state_integer_bits = 15 + state_scale_log2_rounded; + if (state_integer_bits != 4) { + context->ReportError(context, + "The only case of quantized LstmCell currently " + "supported is with StateIntegerBits==4"); + return kTfLiteError; + } + + double real_accum_multiplier = 4096 * bias->params.scale; + int32 accum_multiplier; + int accum_shift; + tflite::QuantizeMultiplier(real_accum_multiplier, &accum_multiplier, + &accum_shift); + tflite::LstmCellParams op_params; + op_params.weights_zero_point = weights->params.zero_point; + op_params.accum_multiplier = accum_multiplier; + op_params.accum_shift = accum_shift; + optimized_ops::LstmCell<4>( + op_params, + // Inputs. 
+ GetTensorShape(input), GetTensorData<uint8_t>(input), + GetTensorShape(prev_activation), + GetTensorData<uint8_t>(prev_activation), GetTensorShape(weights), + GetTensorData<uint8_t>(weights), GetTensorShape(bias), + GetTensorData<int32_t>(bias), GetTensorShape(prev_state), + GetTensorData<int16_t>(prev_state), + // Outputs. + GetTensorShape(state_out), GetTensorData<int16_t>(state_out), + GetTensorShape(activation_out), GetTensorData<uint8_t>(activation_out), + GetTensorShape(concat_temp), GetTensorData<uint8_t>(concat_temp), + GetTensorShape(activation_temp), + GetTensorData<int16_t>(activation_temp), gemm_context); } else { - input_gate_scratch = scratch_buffer->data.f; - cell_scratch = scratch_buffer->data.f + n_cell * n_batch; - forget_gate_scratch = scratch_buffer->data.f + 2 * n_cell * n_batch; - output_gate_scratch = scratch_buffer->data.f + 3 * n_cell * n_batch; + context->ReportError(context, + "Unsupported combination of data types for LstmCell"); + return kTfLiteError; } - // Check optional tensors, the respective pointers can be null. - const float* input_to_input_weights_ptr = - (use_cifg) ? nullptr : input_to_input_weights->data.f; - const float* recurrent_to_input_weights_ptr = - (use_cifg) ? nullptr : recurrent_to_input_weights->data.f; - const float* input_gate_bias_ptr = - (use_cifg) ? nullptr : input_gate_bias->data.f; - const float* cell_to_input_weights_ptr = - (use_peephole && !use_cifg) ? cell_to_input_weights->data.f : nullptr; - const float* cell_to_forget_weights_ptr = - (use_peephole) ? cell_to_forget_weights->data.f : nullptr; - const float* cell_to_output_weights_ptr = - (use_peephole) ? cell_to_output_weights->data.f : nullptr; - const float* projection_weights_ptr = - (projection_weights == nullptr) ? nullptr : projection_weights->data.f; - const float* projection_bias_ptr = - (projection_bias == nullptr) ? nullptr : projection_bias->data.f; - - // Required tensors, pointers are non-null. 
- const float* input_ptr_batch = input->data.f; - const float* input_to_forget_weights_ptr = input_to_forget_weights->data.f; - const float* input_to_cell_weights_ptr = input_to_cell_weights->data.f; - const float* input_to_output_weights_ptr = input_to_output_weights->data.f; - const float* recurrent_to_forget_weights_ptr = - recurrent_to_forget_weights->data.f; - const float* recurrent_to_cell_weights_ptr = - recurrent_to_cell_weights->data.f; - const float* recurrent_to_output_weights_ptr = - recurrent_to_output_weights->data.f; - const float* forget_gate_bias_ptr = forget_gate_bias->data.f; - const float* cell_bias_ptr = cell_bias->data.f; - const float* output_gate_bias_ptr = output_gate_bias->data.f; - - float* output_state_ptr = output_state->data.f; - float* cell_state_ptr = cell_state->data.f; - float* output_ptr_batch = output->data.f; - - kernel_utils::LstmStep( - input_ptr_batch, input_to_input_weights_ptr, input_to_forget_weights_ptr, - input_to_cell_weights_ptr, input_to_output_weights_ptr, - recurrent_to_input_weights_ptr, recurrent_to_forget_weights_ptr, - recurrent_to_cell_weights_ptr, recurrent_to_output_weights_ptr, - cell_to_input_weights_ptr, cell_to_forget_weights_ptr, - cell_to_output_weights_ptr, input_gate_bias_ptr, forget_gate_bias_ptr, - cell_bias_ptr, output_gate_bias_ptr, projection_weights_ptr, - projection_bias_ptr, params, n_batch, n_cell, n_input, n_output, - output_state_ptr, cell_state_ptr, input_gate_scratch, forget_gate_scratch, - cell_scratch, output_gate_scratch, output_ptr_batch); + // TODO(ycling): Investigate if this copy can be avoided with the 5-inputs + // LSTM kernel. + memcpy(prev_activation->data.raw, activation_out->data.raw, + activation_out->bytes); + memcpy(prev_state->data.raw, state_out->data.raw, state_out->bytes); return kTfLiteOk; } +} // namespace basic + +void* Init(TfLiteContext* context, const char* buffer, size_t length) { + gemm_support::IncrementUsageCounter(context); + + const auto* params = reinterpret_cast<const TfLiteLSTMParams*>(buffer); + switch (params->kernel_type) { + case kTfLiteLSTMFullKernel: + return full::Init(context, buffer, length); + case kTfLiteLSTMBasicKernel: + return basic::Init(context, buffer, length); + } +} +void Free(TfLiteContext* context, void* buffer) { + gemm_support::DecrementUsageCounter(context); + + delete reinterpret_cast<OpData*>(buffer); +} + +TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { + const auto* op_data = reinterpret_cast<const OpData*>(node->user_data); + switch (op_data->kernel_type) { + case kTfLiteLSTMFullKernel: + return full::Prepare(context, node); + case kTfLiteLSTMBasicKernel: + return basic::Prepare(context, node); + } +} + +TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { + const auto* op_data = reinterpret_cast<const OpData*>(node->user_data); + switch (op_data->kernel_type) { + case kTfLiteLSTMFullKernel: + return full::Eval(context, node); + case kTfLiteLSTMBasicKernel: + return basic::Eval(context, node); + } +} + } // namespace lstm TfLiteRegistration* Register_LSTM() { diff --git a/tensorflow/contrib/lite/kernels/lstm_eval.cc b/tensorflow/contrib/lite/kernels/lstm_eval.cc new file mode 100644 index 0000000000..20a4e30009 --- /dev/null +++ b/tensorflow/contrib/lite/kernels/lstm_eval.cc @@ -0,0 +1,912 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/contrib/lite/kernels/lstm_eval.h" + +#include <stdint.h> + +#include "tensorflow/contrib/lite/kernels/internal/kernel_utils.h" +#include "tensorflow/contrib/lite/kernels/internal/tensor_utils.h" +#include "tensorflow/contrib/lite/kernels/op_macros.h" + +namespace tflite { +namespace ops { +namespace builtin { +namespace lstm_eval { + +namespace { + +// Performs an LSTM batch inference step for input specified by input_ptr_batch. +// The LSTM cell is specified by the pointers to its weights (*_weights_ptr) and +// biases (*_bias_ptr), and buffers (*_scratch), along with additional +// parameters: +// - params: various LSTM params including activation, clipping, etc., +// - n_batch: size of batch, +// - n_cell: number of cells (or units), +// - n_input: the input size, +// - n_output: the output size. +// +// The pointers to the cell and output state and the output are updated. +// +// The pointers with the suffix "_batch" point to data aligned in batch_major +// order, and each step processes batch_size many inputs from input_ptr_batch, +// and updates batch_size many cell and output states. +inline void LstmStepWithAuxInput( + const float* input_ptr_batch, const float* input_to_input_weights_ptr, + const float* input_to_forget_weights_ptr, + const float* input_to_cell_weights_ptr, + const float* input_to_output_weights_ptr, const float* aux_input_ptr_batch, + const float* aux_input_to_input_weights_ptr, + const float* aux_input_to_forget_weights_ptr, + const float* aux_input_to_cell_weights_ptr, + const float* aux_input_to_output_weights_ptr, + const float* recurrent_to_input_weights_ptr, + const float* recurrent_to_forget_weights_ptr, + const float* recurrent_to_cell_weights_ptr, + const float* recurrent_to_output_weights_ptr, + const float* cell_to_input_weights_ptr, + const float* cell_to_forget_weights_ptr, + const float* cell_to_output_weights_ptr, const float* input_gate_bias_ptr, + const float* forget_gate_bias_ptr, const float* cell_bias_ptr, + const float* output_gate_bias_ptr, const float* projection_weights_ptr, + const float* projection_bias_ptr, const TfLiteLSTMParams* params, + int n_batch, int n_cell, int n_input, int n_aux_input, int n_output, + float* output_state_ptr, float* cell_state_ptr, float* input_gate_scratch, + float* forget_gate_scratch, float* cell_scratch, float* output_gate_scratch, + float* output_ptr_batch) { + // Since we have already checked that weights are all there or none, we can + // check the existense of only one to the get the condition. + const bool use_cifg = (input_to_input_weights_ptr == nullptr); + const bool use_peephole = (cell_to_output_weights_ptr != nullptr); + // Initialize scratch buffers with bias. 
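+  // (Each gate scratch starts out as its bias broadcast over the batch; the
+  // matrix multiplications below accumulate onto it.)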
+ if (!use_cifg) { + tensor_utils::VectorBatchVectorAssign(input_gate_bias_ptr, n_cell, n_batch, + input_gate_scratch); + } + tensor_utils::VectorBatchVectorAssign(forget_gate_bias_ptr, n_cell, n_batch, + forget_gate_scratch); + tensor_utils::VectorBatchVectorAssign(cell_bias_ptr, n_cell, n_batch, + cell_scratch); + tensor_utils::VectorBatchVectorAssign(output_gate_bias_ptr, n_cell, n_batch, + output_gate_scratch); + + // For each batch and cell: compute input_weight * input. + if (!use_cifg) { + tensor_utils::MatrixBatchVectorMultiplyAccumulate( + input_to_input_weights_ptr, n_cell, n_input, input_ptr_batch, n_batch, + input_gate_scratch, /*result_stride=*/1); + } + + tensor_utils::MatrixBatchVectorMultiplyAccumulate( + input_to_forget_weights_ptr, n_cell, n_input, input_ptr_batch, n_batch, + forget_gate_scratch, /*result_stride=*/1); + tensor_utils::MatrixBatchVectorMultiplyAccumulate( + input_to_cell_weights_ptr, n_cell, n_input, input_ptr_batch, n_batch, + cell_scratch, /*result_stride=*/1); + tensor_utils::MatrixBatchVectorMultiplyAccumulate( + input_to_output_weights_ptr, n_cell, n_input, input_ptr_batch, n_batch, + output_gate_scratch, /*result_stride=*/1); + + // If auxiliary input is available then compute aux_input_weight * aux_input + if (aux_input_ptr_batch != nullptr) { + if (!use_cifg) { + tensor_utils::MatrixBatchVectorMultiplyAccumulate( + aux_input_to_input_weights_ptr, n_cell, n_aux_input, + aux_input_ptr_batch, n_batch, input_gate_scratch, + /*result_stride=*/1); + } + + tensor_utils::MatrixBatchVectorMultiplyAccumulate( + aux_input_to_forget_weights_ptr, n_cell, n_aux_input, + aux_input_ptr_batch, n_batch, forget_gate_scratch, /*result_stride=*/1); + tensor_utils::MatrixBatchVectorMultiplyAccumulate( + aux_input_to_cell_weights_ptr, n_cell, n_aux_input, aux_input_ptr_batch, + n_batch, cell_scratch, /*result_stride=*/1); + tensor_utils::MatrixBatchVectorMultiplyAccumulate( + aux_input_to_output_weights_ptr, n_cell, n_aux_input, + aux_input_ptr_batch, n_batch, output_gate_scratch, /*result_stride=*/1); + } + + // For each batch and cell: compute recurrent_weight * output_state. + if (!use_cifg) { + tensor_utils::MatrixBatchVectorMultiplyAccumulate( + recurrent_to_input_weights_ptr, n_cell, n_output, output_state_ptr, + n_batch, input_gate_scratch, /*result_stride=*/1); + } + tensor_utils::MatrixBatchVectorMultiplyAccumulate( + recurrent_to_forget_weights_ptr, n_cell, n_output, output_state_ptr, + n_batch, forget_gate_scratch, + /*result_stride=*/1); + tensor_utils::MatrixBatchVectorMultiplyAccumulate( + recurrent_to_cell_weights_ptr, n_cell, n_output, output_state_ptr, + n_batch, cell_scratch, /*result_stride=*/1); + tensor_utils::MatrixBatchVectorMultiplyAccumulate( + recurrent_to_output_weights_ptr, n_cell, n_output, output_state_ptr, + n_batch, output_gate_scratch, + /*result_stride=*/1); + + // For each batch and cell: update input gate. + if (!use_cifg) { + if (use_peephole) { + tensor_utils::VectorBatchVectorCwiseProductAccumulate( + cell_to_input_weights_ptr, n_cell, cell_state_ptr, n_batch, + input_gate_scratch); + } + tensor_utils::ApplySigmoidToVector(input_gate_scratch, n_cell * n_batch, + input_gate_scratch); + } + + // For each batch and cell: update forget gate. 
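+  // Forget gate: f = sigmoid(W_f*x + R_f*h_prev + w_cf.*c_prev + b_f); the
+  // peephole term is added only when cell-to-forget weights are provided.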
+ if (use_peephole) { + tensor_utils::VectorBatchVectorCwiseProductAccumulate( + cell_to_forget_weights_ptr, n_cell, cell_state_ptr, n_batch, + forget_gate_scratch); + } + tensor_utils::ApplySigmoidToVector(forget_gate_scratch, n_cell * n_batch, + forget_gate_scratch); + + // For each batch and cell: update the cell. + tensor_utils::VectorVectorCwiseProduct(forget_gate_scratch, cell_state_ptr, + n_batch * n_cell, cell_state_ptr); + tensor_utils::ApplyActivationToVector(cell_scratch, n_batch * n_cell, + params->activation, cell_scratch); + if (use_cifg) { + tensor_utils::Sub1Vector(forget_gate_scratch, n_batch * n_cell, + forget_gate_scratch); + tensor_utils::VectorVectorCwiseProductAccumulate( + cell_scratch, forget_gate_scratch, n_batch * n_cell, cell_state_ptr); + } else { + tensor_utils::VectorVectorCwiseProductAccumulate( + cell_scratch, input_gate_scratch, n_batch * n_cell, cell_state_ptr); + } + if (params->cell_clip > 0.0) { + tensor_utils::ClipVector(cell_state_ptr, n_batch * n_cell, + params->cell_clip, cell_state_ptr); + } + + // For each batch and cell: update the output gate. + if (use_peephole) { + tensor_utils::VectorBatchVectorCwiseProductAccumulate( + cell_to_output_weights_ptr, n_cell, cell_state_ptr, n_batch, + output_gate_scratch); + } + tensor_utils::ApplySigmoidToVector(output_gate_scratch, n_batch * n_cell, + output_gate_scratch); + tensor_utils::ApplyActivationToVector(cell_state_ptr, n_batch * n_cell, + params->activation, cell_scratch); + tensor_utils::VectorVectorCwiseProduct(output_gate_scratch, cell_scratch, + n_batch * n_cell, output_gate_scratch); + + // For each batch: update the projection and output_state. + const bool use_projection_weight = (projection_weights_ptr != nullptr); + const bool use_projection_bias = (projection_bias_ptr != nullptr); + if (use_projection_weight) { + if (use_projection_bias) { + tensor_utils::VectorBatchVectorAssign(projection_bias_ptr, n_output, + n_batch, output_ptr_batch); + } else { + tensor_utils::ZeroVector(output_ptr_batch, n_batch * n_output); + } + tensor_utils::MatrixBatchVectorMultiplyAccumulate( + projection_weights_ptr, n_output, n_cell, output_gate_scratch, n_batch, + output_ptr_batch, /*result_stride=*/1); + if (params->proj_clip > 0.0) { + tensor_utils::ClipVector(output_ptr_batch, n_batch * n_output, + params->proj_clip, output_ptr_batch); + } + } else { + tensor_utils::CopyVector(output_gate_scratch, n_batch * n_output, + output_ptr_batch); + } + tensor_utils::CopyVector(output_ptr_batch, n_batch * n_output, + output_state_ptr); +} + +// Same as above but with quantized weight matrices. In detail: +// Input of size 'n_batch * n_input': +// input_ptr_batch +// +// LSTM weights: +// Quantized input weights of size 'n_cell * n_input': +// input_to_input_weights - optional (can be nullptr) +// input_to_forget_weights +// input_to_cell_weights +// input_to_input_weights +// Quantized recurrent weights of size 'n_cell * n_output': +// recurrent_to_input_weights - optional +// recurrent_to_forget_weights +// recurrent_to_cell_weights +// recurrent_to_input_weights +// Quantized peephole weights of size 'n_cell', representing diagonal matrices. +// cell_to_input_weights - optional +// cell_to_cell_weights - optional +// cell_to_output_weights - optional +// Quantized projection weights of size 'n_output * n_cell' +// projection_weights_ptr - optional +// Weight scales (scalars) for each of the weights above. 
+// input_to_input_weights_scale - optional +// input_to_forget_weights_scale +// input_to_cell_weights_scale +// input_to_output_weights_scale +// recurrent_to_input_weights_scale - optional +// recurrent_to_forget_weights_scale +// recurrent_to_cell_weights_scale +// recurrent_to_output_weights_scale +// cell_to_input_weights_scale, +// cell_to_forget_weights_scale, +// cell_to_output_weights_scale, +// projection_weights_scale - optional +// Gate biases of size 'n_cell': +// input_gate_bias_ptr - optional +// forget_gate_bias_ptr +// cell_gate_bias_ptr +// output_gate_bias_ptr +// +// Temporary pre-allocated storage for quantized values: +// quantized_input_ptr_batch (same size as input_ptr_batch) +// quantized_output_state_ptr (same size as output_state_ptr) +// quantized_cell_state_ptr (same size as cell_state_ptr) +// Temporary pre-allocated storage for recovered values: +// recovered_cell_weights (same size as cell_to_*_weights) +// +// Outputs: +// output_state_ptr - size 'n_batch * n_output' +// cell_state_ptr - size 'n_batch * n_cell' +// output_ptr_batch - size 'n_batch * n_output' +inline void LstmStepWithAuxInput( + const float* input_ptr_batch, const int8_t* input_to_input_weights_ptr, + float input_to_input_weights_scale, + const int8_t* input_to_forget_weights_ptr, + float input_to_forget_weights_scale, + const int8_t* input_to_cell_weights_ptr, float input_to_cell_weights_scale, + const int8_t* input_to_output_weights_ptr, + float input_to_output_weights_scale, const float* aux_input_ptr_batch, + const int8_t* aux_input_to_input_weights_ptr, + float aux_input_to_input_weights_scale, + const int8_t* aux_input_to_forget_weights_ptr, + float aux_input_to_forget_weights_scale, + const int8_t* aux_input_to_cell_weights_ptr, + float aux_input_to_cell_weights_scale, + const int8_t* aux_input_to_output_weights_ptr, + float aux_input_to_output_weights_scale, + const int8_t* recurrent_to_input_weights_ptr, + float recurrent_to_input_weights_scale, + const int8_t* recurrent_to_forget_weights_ptr, + float recurrent_to_forget_weights_scale, + const int8_t* recurrent_to_cell_weights_ptr, + float recurrent_to_cell_weights_scale, + const int8_t* recurrent_to_output_weights_ptr, + float recurrent_to_output_weights_scale, + const int8_t* cell_to_input_weights_ptr, float cell_to_input_weights_scale, + const int8_t* cell_to_forget_weights_ptr, + float cell_to_forget_weights_scale, + const int8_t* cell_to_output_weights_ptr, + float cell_to_output_weights_scale, const float* input_gate_bias_ptr, + const float* forget_gate_bias_ptr, const float* cell_bias_ptr, + const float* output_gate_bias_ptr, const int8_t* projection_weights_ptr, + float projection_weights_scale, const float* projection_bias_ptr, + const TfLiteLSTMParams* params, int n_batch, int n_cell, int n_input, + int n_aux_input, int n_output, float* input_gate_scratch, + float* forget_gate_scratch, float* cell_scratch, float* output_gate_scratch, + float* scaling_factors, float* product_scaling_factors, + float* recovered_cell_weights, int8_t* quantized_input_ptr_batch, + int8_t* quantized_aux_input_ptr_batch, int8_t* quantized_output_state_ptr, + int8_t* quantized_cell_state_ptr, float* output_state_ptr, + float* cell_state_ptr, float* output_ptr_batch) { + // Since we have already checked that weights are all there or none, we + // can check the existense of only one to the get the condition. 
+ const bool use_cifg = (input_to_input_weights_ptr == nullptr); + const bool use_peephole = (cell_to_output_weights_ptr != nullptr); + // Initialize scratch buffers with bias. + if (!use_cifg) { + tensor_utils::VectorBatchVectorAssign(input_gate_bias_ptr, n_cell, n_batch, + input_gate_scratch); + } + tensor_utils::VectorBatchVectorAssign(forget_gate_bias_ptr, n_cell, n_batch, + forget_gate_scratch); + tensor_utils::VectorBatchVectorAssign(cell_bias_ptr, n_cell, n_batch, + cell_scratch); + tensor_utils::VectorBatchVectorAssign(output_gate_bias_ptr, n_cell, n_batch, + output_gate_scratch); + + if (!tensor_utils::IsZeroVector(input_ptr_batch, n_batch * n_input)) { + // Save quantization and matmul computation for all zero input. + float unused_min, unused_max; + for (int b = 0; b < n_batch; ++b) { + const int offset = b * n_input; + tensor_utils::SymmetricQuantizeFloats( + input_ptr_batch + offset, n_input, quantized_input_ptr_batch + offset, + &unused_min, &unused_max, &scaling_factors[b]); + } + // For each batch and cell: compute input_weight * input. + if (!use_cifg) { + for (int b = 0; b < n_batch; ++b) { + product_scaling_factors[b] = + scaling_factors[b] * input_to_input_weights_scale; + } + tensor_utils::MatrixBatchVectorMultiplyAccumulate( + input_to_input_weights_ptr, n_cell, n_input, + quantized_input_ptr_batch, product_scaling_factors, n_batch, + input_gate_scratch, /*result_stride=*/1); + } + + for (int b = 0; b < n_batch; ++b) { + product_scaling_factors[b] = + scaling_factors[b] * input_to_forget_weights_scale; + } + tensor_utils::MatrixBatchVectorMultiplyAccumulate( + input_to_forget_weights_ptr, n_cell, n_input, quantized_input_ptr_batch, + product_scaling_factors, n_batch, forget_gate_scratch, + /*result_stride=*/1); + + for (int b = 0; b < n_batch; ++b) { + product_scaling_factors[b] = + scaling_factors[b] * input_to_cell_weights_scale; + } + tensor_utils::MatrixBatchVectorMultiplyAccumulate( + input_to_cell_weights_ptr, n_cell, n_input, quantized_input_ptr_batch, + product_scaling_factors, n_batch, cell_scratch, /*result_stride=*/1); + + for (int b = 0; b < n_batch; ++b) { + product_scaling_factors[b] = + scaling_factors[b] * input_to_output_weights_scale; + } + tensor_utils::MatrixBatchVectorMultiplyAccumulate( + input_to_output_weights_ptr, n_cell, n_input, quantized_input_ptr_batch, + product_scaling_factors, n_batch, output_gate_scratch, + /*result_stride=*/1); + } + + if (aux_input_ptr_batch != nullptr && + !tensor_utils::IsZeroVector(aux_input_ptr_batch, n_batch * n_input)) { + // Save quantization and matmul computation for all zero input. + float unused_min, unused_max; + for (int b = 0; b < n_batch; ++b) { + const int offset = b * n_input; + tensor_utils::SymmetricQuantizeFloats( + aux_input_ptr_batch + offset, n_input, + quantized_aux_input_ptr_batch + offset, &unused_min, &unused_max, + &scaling_factors[b]); + } + // For each batch and cell: compute input_weight * input. 
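Each of these quantize-and-multiply blocks is guarded by an IsZeroVector check: when the input (or, below, the auxiliary input and the recurrent state) is entirely zero, its weight product contributes nothing, so both the per-row quantization and the matmuls are skipped and the gate scratches simply keep the bias values they were just initialized with. A trivial stand-in for that guard (illustrative only, not the tensor_utils implementation):

// Returns true when every element is exactly zero, in which case the
// corresponding weight * vector term can be skipped entirely.
bool AllZero(const float* v, int n) {
  for (int i = 0; i < n; ++i) {
    if (v[i] != 0.0f) return false;
  }
  return true;
}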
+ if (!use_cifg) { + for (int b = 0; b < n_batch; ++b) { + product_scaling_factors[b] = + scaling_factors[b] * aux_input_to_input_weights_scale; + } + tensor_utils::MatrixBatchVectorMultiplyAccumulate( + aux_input_to_input_weights_ptr, n_cell, n_input, + quantized_aux_input_ptr_batch, product_scaling_factors, n_batch, + input_gate_scratch, /*result_stride=*/1); + } + + for (int b = 0; b < n_batch; ++b) { + product_scaling_factors[b] = + scaling_factors[b] * aux_input_to_forget_weights_scale; + } + tensor_utils::MatrixBatchVectorMultiplyAccumulate( + aux_input_to_forget_weights_ptr, n_cell, n_input, + quantized_aux_input_ptr_batch, product_scaling_factors, n_batch, + forget_gate_scratch, /*result_stride=*/1); + + for (int b = 0; b < n_batch; ++b) { + product_scaling_factors[b] = + scaling_factors[b] * aux_input_to_cell_weights_scale; + } + tensor_utils::MatrixBatchVectorMultiplyAccumulate( + aux_input_to_cell_weights_ptr, n_cell, n_input, + quantized_aux_input_ptr_batch, product_scaling_factors, n_batch, + cell_scratch, /*result_stride=*/1); + + for (int b = 0; b < n_batch; ++b) { + product_scaling_factors[b] = + scaling_factors[b] * aux_input_to_output_weights_scale; + } + tensor_utils::MatrixBatchVectorMultiplyAccumulate( + aux_input_to_output_weights_ptr, n_cell, n_input, + quantized_aux_input_ptr_batch, product_scaling_factors, n_batch, + output_gate_scratch, /*result_stride=*/1); + } + + if (!tensor_utils::IsZeroVector(output_state_ptr, n_batch * n_output)) { + // Save quantization and matmul computation for all zero input. + float unused_min, unused_max; + for (int b = 0; b < n_batch; ++b) { + const int offset = b * n_output; + tensor_utils::SymmetricQuantizeFloats(output_state_ptr + offset, n_output, + quantized_output_state_ptr + offset, + &unused_min, &unused_max, + &scaling_factors[b]); + } + // For each batch and cell: compute recurrent_weight * output_state. + if (!use_cifg) { + for (int b = 0; b < n_batch; ++b) { + product_scaling_factors[b] = + scaling_factors[b] * recurrent_to_input_weights_scale; + } + tensor_utils::MatrixBatchVectorMultiplyAccumulate( + recurrent_to_input_weights_ptr, n_cell, n_output, + quantized_output_state_ptr, product_scaling_factors, n_batch, + input_gate_scratch, /*result_stride=*/1); + } + + for (int b = 0; b < n_batch; ++b) { + product_scaling_factors[b] = + scaling_factors[b] * recurrent_to_forget_weights_scale; + } + tensor_utils::MatrixBatchVectorMultiplyAccumulate( + recurrent_to_forget_weights_ptr, n_cell, n_output, + quantized_output_state_ptr, product_scaling_factors, n_batch, + forget_gate_scratch, /*result_stride=*/1); + + for (int b = 0; b < n_batch; ++b) { + product_scaling_factors[b] = + scaling_factors[b] * recurrent_to_cell_weights_scale; + } + tensor_utils::MatrixBatchVectorMultiplyAccumulate( + recurrent_to_cell_weights_ptr, n_cell, n_output, + quantized_output_state_ptr, product_scaling_factors, n_batch, + cell_scratch, /*result_stride=*/1); + + for (int b = 0; b < n_batch; ++b) { + product_scaling_factors[b] = + scaling_factors[b] * recurrent_to_output_weights_scale; + } + tensor_utils::MatrixBatchVectorMultiplyAccumulate( + recurrent_to_output_weights_ptr, n_cell, n_output, + quantized_output_state_ptr, product_scaling_factors, n_batch, + output_gate_scratch, /*result_stride=*/1); + } + + // Save quantization and matmul computation for all zero input. + bool is_cell_state_all_zeros = + tensor_utils::IsZeroVector(cell_state_ptr, n_batch * n_cell); + + // For each batch and cell: update input gate. 
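From this point the hybrid path mirrors the float math. The int8 peephole weights are first "recovered" to float by multiplying them with their scale, because they enter an elementwise product with the float cell state and there is no integer matmul to fold the scale into; the usual gate sigmoids and cell update then follow. A scalar sketch of the per-element update for one batch row, illustrative only, assuming tanh as the cell activation and omitting the input-gate peephole term for brevity:

#include <algorithm>
#include <cmath>
#include <cstdint>

inline float Sigmoid(float x) { return 1.0f / (1.0f + std::exp(-x)); }

void CellUpdateSketch(const float* input_gate_pre, const float* forget_gate_pre,
                      const float* cell_candidate_pre,
                      const int8_t* cell_to_forget, float cell_to_forget_scale,
                      bool use_cifg, float cell_clip, int n_cell,
                      float* cell_state) {
  for (int i = 0; i < n_cell; ++i) {
    // Peephole: dequantized ("recovered") diagonal weight times the old cell.
    const float peephole =
        cell_to_forget
            ? cell_to_forget[i] * cell_to_forget_scale * cell_state[i]
            : 0.0f;
    const float f = Sigmoid(forget_gate_pre[i] + peephole);
    // Under CIFG the input gate is coupled to the forget gate: i = 1 - f.
    const float in = use_cifg ? (1.0f - f) : Sigmoid(input_gate_pre[i]);
    float c = f * cell_state[i] + in * std::tanh(cell_candidate_pre[i]);
    if (cell_clip > 0.0f) c = std::min(std::max(c, -cell_clip), cell_clip);
    cell_state[i] = c;
  }
}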
+ if (!use_cifg) { + if (use_peephole && !is_cell_state_all_zeros) { + tensor_utils::VectorScalarMultiply(cell_to_input_weights_ptr, n_cell, + cell_to_input_weights_scale, + recovered_cell_weights); + tensor_utils::VectorBatchVectorCwiseProductAccumulate( + recovered_cell_weights, n_cell, cell_state_ptr, n_batch, + input_gate_scratch); + } + tensor_utils::ApplySigmoidToVector(input_gate_scratch, n_cell * n_batch, + input_gate_scratch); + } + + // For each batch and cell: update forget gate. + if (use_peephole && !is_cell_state_all_zeros) { + tensor_utils::VectorScalarMultiply(cell_to_forget_weights_ptr, n_cell, + cell_to_forget_weights_scale, + recovered_cell_weights); + tensor_utils::VectorBatchVectorCwiseProductAccumulate( + recovered_cell_weights, n_cell, cell_state_ptr, n_batch, + forget_gate_scratch); + } + tensor_utils::ApplySigmoidToVector(forget_gate_scratch, n_cell * n_batch, + forget_gate_scratch); + + // For each batch and cell: update the cell. + tensor_utils::VectorVectorCwiseProduct(forget_gate_scratch, cell_state_ptr, + n_batch * n_cell, cell_state_ptr); + tensor_utils::ApplyActivationToVector(cell_scratch, n_batch * n_cell, + params->activation, cell_scratch); + if (use_cifg) { + tensor_utils::Sub1Vector(forget_gate_scratch, n_batch * n_cell, + forget_gate_scratch); + tensor_utils::VectorVectorCwiseProductAccumulate( + cell_scratch, forget_gate_scratch, n_batch * n_cell, cell_state_ptr); + } else { + tensor_utils::VectorVectorCwiseProductAccumulate( + cell_scratch, input_gate_scratch, n_batch * n_cell, cell_state_ptr); + } + if (params->cell_clip > 0.0) { + tensor_utils::ClipVector(cell_state_ptr, n_batch * n_cell, + params->cell_clip, cell_state_ptr); + } + + is_cell_state_all_zeros = + tensor_utils::IsZeroVector(cell_state_ptr, n_batch * n_cell); + // For each batch and cell: update the output gate. + if (use_peephole && !is_cell_state_all_zeros) { + tensor_utils::VectorScalarMultiply(cell_to_output_weights_ptr, n_cell, + cell_to_output_weights_scale, + recovered_cell_weights); + tensor_utils::VectorBatchVectorCwiseProductAccumulate( + recovered_cell_weights, n_cell, cell_state_ptr, n_batch, + output_gate_scratch); + } + tensor_utils::ApplySigmoidToVector(output_gate_scratch, n_batch * n_cell, + output_gate_scratch); + tensor_utils::ApplyActivationToVector(cell_state_ptr, n_batch * n_cell, + params->activation, cell_scratch); + tensor_utils::VectorVectorCwiseProduct(output_gate_scratch, cell_scratch, + n_batch * n_cell, output_gate_scratch); + + // For each batch: update the projection and output_state. + const bool use_projection_weight = (projection_weights_ptr != nullptr); + const bool use_projection_bias = (projection_bias_ptr != nullptr); + if (use_projection_weight) { + if (use_projection_bias) { + tensor_utils::VectorBatchVectorAssign(projection_bias_ptr, n_output, + n_batch, output_ptr_batch); + } else { + tensor_utils::ZeroVector(output_ptr_batch, n_batch * n_output); + } + if (!tensor_utils::IsZeroVector(output_gate_scratch, n_batch * n_cell)) { + // Save quantization and matmul computation for all zero input. 
+ float unused_min, unused_max; + for (int b = 0; b < n_batch; ++b) { + const int offset = b * n_cell; + tensor_utils::SymmetricQuantizeFloats( + output_gate_scratch + offset, n_cell, + quantized_cell_state_ptr + offset, &unused_min, &unused_max, + &scaling_factors[b]); + } + for (int b = 0; b < n_batch; ++b) { + product_scaling_factors[b] = + scaling_factors[b] * projection_weights_scale; + } + tensor_utils::MatrixBatchVectorMultiplyAccumulate( + projection_weights_ptr, n_output, n_cell, quantized_cell_state_ptr, + product_scaling_factors, n_batch, output_ptr_batch, + /*result_stride=*/1); + } + if (params->proj_clip > 0.0) { + tensor_utils::ClipVector(output_ptr_batch, n_batch * n_output, + params->proj_clip, output_ptr_batch); + } + } else { + tensor_utils::CopyVector(output_gate_scratch, n_batch * n_output, + output_ptr_batch); + } + tensor_utils::CopyVector(output_ptr_batch, n_batch * n_output, + output_state_ptr); +} +} // namespace + +TfLiteStatus EvalFloat( + const TfLiteTensor* input, const TfLiteTensor* input_to_input_weights, + const TfLiteTensor* input_to_forget_weights, + const TfLiteTensor* input_to_cell_weights, + const TfLiteTensor* input_to_output_weights, + const TfLiteTensor* recurrent_to_input_weights, + const TfLiteTensor* recurrent_to_forget_weights, + const TfLiteTensor* recurrent_to_cell_weights, + const TfLiteTensor* recurrent_to_output_weights, + const TfLiteTensor* cell_to_input_weights, + const TfLiteTensor* cell_to_forget_weights, + const TfLiteTensor* cell_to_output_weights, const TfLiteTensor* aux_input, + const TfLiteTensor* aux_input_to_input_weights, + const TfLiteTensor* aux_input_to_forget_weights, + const TfLiteTensor* aux_input_to_cell_weights, + const TfLiteTensor* aux_input_to_output_weights, + const TfLiteTensor* input_gate_bias, const TfLiteTensor* forget_gate_bias, + const TfLiteTensor* cell_bias, const TfLiteTensor* output_gate_bias, + const TfLiteTensor* projection_weights, const TfLiteTensor* projection_bias, + const TfLiteLSTMParams* params, bool forward_sequence, int output_offset, + TfLiteTensor* scratch_buffer, TfLiteTensor* activation_state, + TfLiteTensor* cell_state, TfLiteTensor* output) { + TF_LITE_ASSERT(input->dims->size >= 2 && input->dims->size <= 3); + const int max_time = (input->dims->size == 2) ? 1 : input->dims->data[0]; + const int n_batch = input->dims->data[input->dims->size - 2]; + const int n_input = input->dims->data[input->dims->size - 1]; + const int aux_input_size = + (aux_input) ? aux_input->dims->data[aux_input->dims->size - 1] : 0; + + // n_cell and n_output will be the same size when there is no projection. + const int n_cell = input_to_output_weights->dims->data[0]; + const int n_output = recurrent_to_output_weights->dims->data[1]; + + // Since we have already checked that weights are all there or none, we can + // check the existence of only one to get the condition. + const bool use_cifg = (input_to_input_weights == nullptr); + const bool use_peephole = (cell_to_output_weights != nullptr); + + // Index the scratch buffer pointers to the global scratch buffer.
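The scratch_buffer passed in here is one flat float tensor that gets carved into per-gate segments of n_cell * n_batch entries: three segments under CIFG (no separate input gate) and four otherwise, which is exactly what the indexing just below does. A sketch of that partitioning, illustrative only and not part of the patch:

// Illustrative partitioning of the flat LSTM scratch tensor.
struct LstmScratch {
  float* input_gate;  // nullptr under CIFG
  float* cell;
  float* forget_gate;
  float* output_gate;
};

LstmScratch PartitionScratch(float* base, int n_cell, int n_batch,
                             bool use_cifg) {
  const int seg = n_cell * n_batch;
  if (use_cifg) {
    return {nullptr, base, base + seg, base + 2 * seg};
  }
  return {base, base + seg, base + 2 * seg, base + 3 * seg};
}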
+ float* input_gate_scratch = nullptr; + float* cell_scratch = nullptr; + float* forget_gate_scratch = nullptr; + float* output_gate_scratch = nullptr; + if (use_cifg) { + cell_scratch = scratch_buffer->data.f; + forget_gate_scratch = scratch_buffer->data.f + n_cell * n_batch; + output_gate_scratch = scratch_buffer->data.f + 2 * n_cell * n_batch; + } else { + input_gate_scratch = scratch_buffer->data.f; + cell_scratch = scratch_buffer->data.f + n_cell * n_batch; + forget_gate_scratch = scratch_buffer->data.f + 2 * n_cell * n_batch; + output_gate_scratch = scratch_buffer->data.f + 3 * n_cell * n_batch; + } + + // Check optional tensors, the respective pointers can be null. + const float* input_to_input_weights_ptr = + (use_cifg) ? nullptr : input_to_input_weights->data.f; + const float* recurrent_to_input_weights_ptr = + (use_cifg) ? nullptr : recurrent_to_input_weights->data.f; + const float* input_gate_bias_ptr = + (use_cifg) ? nullptr : input_gate_bias->data.f; + const float* cell_to_input_weights_ptr = + (use_peephole && !use_cifg) ? cell_to_input_weights->data.f : nullptr; + const float* cell_to_forget_weights_ptr = + (use_peephole) ? cell_to_forget_weights->data.f : nullptr; + const float* cell_to_output_weights_ptr = + (use_peephole) ? cell_to_output_weights->data.f : nullptr; + const float* projection_weights_ptr = + (projection_weights == nullptr) ? nullptr : projection_weights->data.f; + const float* projection_bias_ptr = + (projection_bias == nullptr) ? nullptr : projection_bias->data.f; + + float* aux_input_ptr = nullptr; + float* aux_input_to_input_weights_ptr = nullptr; + float* aux_input_to_forget_weights_ptr = nullptr; + float* aux_input_to_cell_weights_ptr = nullptr; + float* aux_input_to_output_weights_ptr = nullptr; + if (aux_input_size > 0) { + aux_input_ptr = aux_input->data.f; + aux_input_to_input_weights_ptr = aux_input_to_input_weights->data.f; + aux_input_to_forget_weights_ptr = aux_input_to_forget_weights->data.f; + aux_input_to_cell_weights_ptr = aux_input_to_cell_weights->data.f; + aux_input_to_output_weights_ptr = aux_input_to_output_weights->data.f; + } + + // Loop through the sequence. + const int input_step = n_batch * n_input; + const int output_step = n_batch * output->dims->data[output->dims->size - 1]; + for (int t = 0; t < max_time; t++) { + // If this is the forward_sequence, step forward, otherwise step backwards. + const int t_rel = forward_sequence ? 
t : max_time - t - 1; + const float* input_ptr = input->data.f + t_rel * input_step; + float* output_ptr_time = + output->data.f + t_rel * output_step + output_offset; + + LstmStepWithAuxInput( + input_ptr, input_to_input_weights_ptr, input_to_forget_weights->data.f, + input_to_cell_weights->data.f, input_to_output_weights->data.f, + aux_input_ptr, aux_input_to_input_weights_ptr, + aux_input_to_forget_weights_ptr, aux_input_to_cell_weights_ptr, + aux_input_to_output_weights_ptr, recurrent_to_input_weights_ptr, + recurrent_to_forget_weights->data.f, recurrent_to_cell_weights->data.f, + recurrent_to_output_weights->data.f, cell_to_input_weights_ptr, + cell_to_forget_weights_ptr, cell_to_output_weights_ptr, + input_gate_bias_ptr, forget_gate_bias->data.f, cell_bias->data.f, + output_gate_bias->data.f, projection_weights_ptr, projection_bias_ptr, + params, n_batch, n_cell, n_input, aux_input_size, n_output, + activation_state->data.f, cell_state->data.f, input_gate_scratch, + forget_gate_scratch, cell_scratch, output_gate_scratch, + output_ptr_time); + } + return kTfLiteOk; +} + +TfLiteStatus EvalHybrid( + const TfLiteTensor* input, const TfLiteTensor* input_to_input_weights, + const TfLiteTensor* input_to_forget_weights, + const TfLiteTensor* input_to_cell_weights, + const TfLiteTensor* input_to_output_weights, + const TfLiteTensor* recurrent_to_input_weights, + const TfLiteTensor* recurrent_to_forget_weights, + const TfLiteTensor* recurrent_to_cell_weights, + const TfLiteTensor* recurrent_to_output_weights, + const TfLiteTensor* cell_to_input_weights, + const TfLiteTensor* cell_to_forget_weights, + const TfLiteTensor* cell_to_output_weights, const TfLiteTensor* aux_input, + const TfLiteTensor* aux_input_to_input_weights, + const TfLiteTensor* aux_input_to_forget_weights, + const TfLiteTensor* aux_input_to_cell_weights, + const TfLiteTensor* aux_input_to_output_weights, + const TfLiteTensor* input_gate_bias, const TfLiteTensor* forget_gate_bias, + const TfLiteTensor* cell_bias, const TfLiteTensor* output_gate_bias, + const TfLiteTensor* projection_weights, const TfLiteTensor* projection_bias, + const TfLiteLSTMParams* params, bool forward_sequence, int output_offset, + TfLiteTensor* scratch_buffer, TfLiteTensor* scaling_factors, + TfLiteTensor* prod_scaling_factors, TfLiteTensor* recovered_cell_weights, + TfLiteTensor* input_quantized, TfLiteTensor* aux_input_quantized, + TfLiteTensor* output_state_quantized, TfLiteTensor* cell_state_quantized, + TfLiteTensor* output_state, TfLiteTensor* cell_state, + TfLiteTensor* output) { + TF_LITE_ASSERT(input->dims->size >= 2 && input->dims->size <= 3); + const int max_time = (input->dims->size == 2) ? 1 : input->dims->data[0]; + const int n_batch = input->dims->data[input->dims->size - 2]; + const int n_input = input->dims->data[input->dims->size - 1]; + const int aux_input_size = + (aux_input) ? aux_input->dims->data[aux_input->dims->size - 1] : 0; + // n_cell and n_output will be the same size when there is no projection. + const int n_cell = input_to_output_weights->dims->data[0]; + const int n_output = recurrent_to_output_weights->dims->data[1]; + + // Since we have already checked that weights are all there or none, we can + // check the existence of only one to get the condition. 
+ const bool use_cifg = (input_to_input_weights == nullptr); + const bool use_peephole = (cell_to_output_weights != nullptr); + + float* input_gate_scratch = nullptr; + float* cell_scratch = nullptr; + float* forget_gate_scratch = nullptr; + float* output_gate_scratch = nullptr; + if (use_cifg) { + cell_scratch = scratch_buffer->data.f; + forget_gate_scratch = scratch_buffer->data.f + n_cell * n_batch; + output_gate_scratch = scratch_buffer->data.f + 2 * n_cell * n_batch; + } else { + input_gate_scratch = scratch_buffer->data.f; + cell_scratch = scratch_buffer->data.f + n_cell * n_batch; + forget_gate_scratch = scratch_buffer->data.f + 2 * n_cell * n_batch; + output_gate_scratch = scratch_buffer->data.f + 3 * n_cell * n_batch; + } + + // Check optional tensors, the respective pointers can be null. + int8_t* input_to_input_weights_ptr = nullptr; + float input_to_input_weights_scale = 1.0f; + int8_t* recurrent_to_input_weights_ptr = nullptr; + float recurrent_to_input_weights_scale = 1.0f; + float* input_gate_bias_ptr = nullptr; + if (!use_cifg) { + input_to_input_weights_ptr = + reinterpret_cast<int8_t*>(input_to_input_weights->data.uint8); + recurrent_to_input_weights_ptr = + reinterpret_cast<int8_t*>(recurrent_to_input_weights->data.uint8); + input_gate_bias_ptr = input_gate_bias->data.f; + input_to_input_weights_scale = input_to_input_weights->params.scale; + recurrent_to_input_weights_scale = recurrent_to_input_weights->params.scale; + } + + int8_t* cell_to_input_weights_ptr = nullptr; + int8_t* cell_to_forget_weights_ptr = nullptr; + int8_t* cell_to_output_weights_ptr = nullptr; + float cell_to_input_weights_scale = 1.0f; + float cell_to_forget_weights_scale = 1.0f; + float cell_to_output_weights_scale = 1.0f; + if (use_peephole) { + if (!use_cifg) { + cell_to_input_weights_ptr = + reinterpret_cast<int8_t*>(cell_to_input_weights->data.uint8); + cell_to_input_weights_scale = cell_to_input_weights->params.scale; + } + cell_to_forget_weights_ptr = + reinterpret_cast<int8_t*>(cell_to_forget_weights->data.uint8); + cell_to_output_weights_ptr = + reinterpret_cast<int8_t*>(cell_to_output_weights->data.uint8); + cell_to_forget_weights_scale = cell_to_forget_weights->params.scale; + cell_to_output_weights_scale = cell_to_output_weights->params.scale; + } + + const int8_t* projection_weights_ptr = + (projection_weights == nullptr) + ? nullptr + : reinterpret_cast<int8_t*>(projection_weights->data.uint8); + const float projection_weights_scale = + (projection_weights == nullptr) ? 1.0f : projection_weights->params.scale; + const float* projection_bias_ptr = + (projection_bias == nullptr) ? nullptr : projection_bias->data.f; + + // Required tensors, pointers are non-null. 
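Throughout this function the 8-bit weights live in each tensor's uint8 buffer but are reinterpreted as int8, with the per-tensor quantization scale read from params.scale; optional tensors that are absent are represented by a null pointer and a neutral scale of 1.0f so the step function can treat every weight uniformly. A sketch of that extraction pattern with a hypothetical helper (not the TfLite API):

#include <cstdint>

struct Int8Weights {
  const int8_t* data;  // nullptr when the optional tensor is absent
  float scale;         // neutral 1.0f when absent
};

// 'raw' is the tensor's uint8 storage, 'scale' its per-tensor scale.
Int8Weights AsInt8Weights(const uint8_t* raw, float scale, bool present) {
  if (!present) return {nullptr, 1.0f};
  return {reinterpret_cast<const int8_t*>(raw), scale};
}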
+ const int8_t* input_to_forget_weights_ptr = + reinterpret_cast<int8_t*>(input_to_forget_weights->data.uint8); + const float input_to_forget_weights_scale = + input_to_forget_weights->params.scale; + const int8_t* input_to_cell_weights_ptr = + reinterpret_cast<int8_t*>(input_to_cell_weights->data.uint8); + const float input_to_cell_weights_scale = input_to_cell_weights->params.scale; + const int8_t* input_to_output_weights_ptr = + reinterpret_cast<int8_t*>(input_to_output_weights->data.uint8); + const float input_to_output_weights_scale = + input_to_output_weights->params.scale; + const int8_t* recurrent_to_forget_weights_ptr = + reinterpret_cast<int8_t*>(recurrent_to_forget_weights->data.uint8); + const float recurrent_to_forget_weights_scale = + recurrent_to_forget_weights->params.scale; + const int8_t* recurrent_to_cell_weights_ptr = + reinterpret_cast<int8_t*>(recurrent_to_cell_weights->data.uint8); + const float recurrent_to_cell_weights_scale = + recurrent_to_cell_weights->params.scale; + const int8_t* recurrent_to_output_weights_ptr = + reinterpret_cast<int8_t*>(recurrent_to_output_weights->data.uint8); + const float recurrent_to_output_weights_scale = + recurrent_to_output_weights->params.scale; + const float* forget_gate_bias_ptr = forget_gate_bias->data.f; + const float* cell_bias_ptr = cell_bias->data.f; + const float* output_gate_bias_ptr = output_gate_bias->data.f; + + float* output_state_ptr = output_state->data.f; + float* cell_state_ptr = cell_state->data.f; + + // Temporary storage for quantized values and scaling factors. + int8_t* quantized_input_ptr = + reinterpret_cast<int8_t*>(input_quantized->data.uint8); + int8_t* quantized_aux_input_ptr = + (aux_input_quantized == nullptr) + ? nullptr + : reinterpret_cast<int8_t*>(aux_input_quantized->data.uint8); + int8_t* quantized_output_state_ptr = + reinterpret_cast<int8_t*>(output_state_quantized->data.uint8); + int8_t* quantized_cell_state_ptr = + reinterpret_cast<int8_t*>(cell_state_quantized->data.uint8); + float* scaling_factors_ptr = scaling_factors->data.f; + float* prod_scaling_factors_ptr = prod_scaling_factors->data.f; + float* recovered_cell_weights_ptr = recovered_cell_weights->data.f; + + // Auxiliary input and weights. + float* aux_input_ptr = nullptr; + int8_t* aux_input_to_input_weights_ptr = nullptr; + int8_t* aux_input_to_forget_weights_ptr = nullptr; + int8_t* aux_input_to_cell_weights_ptr = nullptr; + int8_t* aux_input_to_output_weights_ptr = nullptr; + float aux_input_to_input_weights_scale = 0.0f; + float aux_input_to_forget_weights_scale = 0.0f; + float aux_input_to_cell_weights_scale = 0.0f; + float aux_input_to_output_weights_scale = 0.0f; + if (aux_input_size > 0) { + aux_input_ptr = aux_input->data.f; + aux_input_to_input_weights_ptr = + reinterpret_cast<int8_t*>(aux_input_to_input_weights->data.uint8); + aux_input_to_forget_weights_ptr = + reinterpret_cast<int8_t*>(aux_input_to_forget_weights->data.uint8); + aux_input_to_cell_weights_ptr = + reinterpret_cast<int8_t*>(aux_input_to_cell_weights->data.uint8); + aux_input_to_output_weights_ptr = + reinterpret_cast<int8_t*>(aux_input_to_output_weights->data.uint8); + aux_input_to_input_weights_scale = aux_input_to_input_weights->params.scale; + aux_input_to_forget_weights_scale = + aux_input_to_forget_weights->params.scale; + aux_input_to_cell_weights_scale = aux_input_to_cell_weights->params.scale; + aux_input_to_output_weights_scale = + aux_input_to_output_weights->params.scale; + } + + // Feed the sequence into the LSTM step-by-step. 
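Both Eval functions walk the sequence in time-major order: the input is laid out as {max_time, n_batch, n_input} (or a single step when the tensor is rank 2), t_rel runs forward or backward depending on forward_sequence, and output_offset shifts the write position within each output row, which lets a caller pack several results (for example forward and backward passes) into one output tensor. A sketch of the indexing, illustrative only:

// Time-major per-step pointer arithmetic mirroring the Eval* loops.
// output_row_size is the last dimension of the output tensor; it may exceed
// n_output, in which case output_offset selects the slice to write.
void ForEachStep(int max_time, int n_batch, int n_input, int output_row_size,
                 int output_offset, bool forward_sequence, const float* input,
                 float* output) {
  const int input_step = n_batch * n_input;
  const int output_step = n_batch * output_row_size;
  for (int t = 0; t < max_time; ++t) {
    const int t_rel = forward_sequence ? t : max_time - t - 1;
    const float* input_t = input + t_rel * input_step;
    float* output_t = output + t_rel * output_step + output_offset;
    // ... run one LSTM step reading input_t and writing output_t ...
    (void)input_t;
    (void)output_t;
  }
}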
+ const int input_step = n_batch * n_input; + const int output_step = n_batch * output->dims->data[output->dims->size - 1]; + for (int t = 0; t < max_time; t++) { + // If this is the forward_sequence, step forward, otherwise step backwards. + const int t_rel = forward_sequence ? t : max_time - t - 1; + const float* input_ptr = input->data.f + t_rel * input_step; + float* output_ptr = output->data.f + t_rel * output_step + output_offset; + + LstmStepWithAuxInput( + input_ptr, input_to_input_weights_ptr, input_to_input_weights_scale, + input_to_forget_weights_ptr, input_to_forget_weights_scale, + input_to_cell_weights_ptr, input_to_cell_weights_scale, + input_to_output_weights_ptr, input_to_output_weights_scale, + aux_input_ptr, aux_input_to_input_weights_ptr, + aux_input_to_input_weights_scale, aux_input_to_forget_weights_ptr, + aux_input_to_forget_weights_scale, aux_input_to_cell_weights_ptr, + aux_input_to_cell_weights_scale, aux_input_to_output_weights_ptr, + aux_input_to_output_weights_scale, recurrent_to_input_weights_ptr, + recurrent_to_input_weights_scale, recurrent_to_forget_weights_ptr, + recurrent_to_forget_weights_scale, recurrent_to_cell_weights_ptr, + recurrent_to_cell_weights_scale, recurrent_to_output_weights_ptr, + recurrent_to_output_weights_scale, cell_to_input_weights_ptr, + cell_to_input_weights_scale, cell_to_forget_weights_ptr, + cell_to_forget_weights_scale, cell_to_output_weights_ptr, + cell_to_output_weights_scale, input_gate_bias_ptr, forget_gate_bias_ptr, + cell_bias_ptr, output_gate_bias_ptr, projection_weights_ptr, + projection_weights_scale, projection_bias_ptr, params, n_batch, n_cell, + n_input, aux_input_size, n_output, input_gate_scratch, + forget_gate_scratch, cell_scratch, output_gate_scratch, + scaling_factors_ptr, prod_scaling_factors_ptr, + recovered_cell_weights_ptr, quantized_input_ptr, + quantized_aux_input_ptr, quantized_output_state_ptr, + quantized_cell_state_ptr, output_state_ptr, cell_state_ptr, output_ptr); + } + + return kTfLiteOk; +} + +} // namespace lstm_eval +} // namespace builtin +} // namespace ops +} // namespace tflite diff --git a/tensorflow/contrib/lite/kernels/lstm_eval.h b/tensorflow/contrib/lite/kernels/lstm_eval.h new file mode 100644 index 0000000000..adf8cf0f64 --- /dev/null +++ b/tensorflow/contrib/lite/kernels/lstm_eval.h @@ -0,0 +1,79 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_LSTM_EVAL_H_ +#define TENSORFLOW_CONTRIB_LITE_KERNELS_LSTM_EVAL_H_ + +#include "tensorflow/contrib/lite/c/builtin_op_data.h" +#include "tensorflow/contrib/lite/c/c_api_internal.h" + +namespace tflite { +namespace ops { +namespace builtin { +namespace lstm_eval { + +TfLiteStatus EvalFloat( + const TfLiteTensor* input, const TfLiteTensor* input_to_input_weights, + const TfLiteTensor* input_to_forget_weights, + const TfLiteTensor* input_to_cell_weights, + const TfLiteTensor* input_to_output_weights, + const TfLiteTensor* recurrent_to_input_weights, + const TfLiteTensor* recurrent_to_forget_weights, + const TfLiteTensor* recurrent_to_cell_weights, + const TfLiteTensor* recurrent_to_output_weights, + const TfLiteTensor* cell_to_input_weights, + const TfLiteTensor* cell_to_forget_weights, + const TfLiteTensor* cell_to_output_weights, const TfLiteTensor* aux_input, + const TfLiteTensor* aux_input_to_input_weights, + const TfLiteTensor* aux_input_to_forget_weights, + const TfLiteTensor* aux_input_to_cell_weights, + const TfLiteTensor* aux_input_to_output_weights, + const TfLiteTensor* input_gate_bias, const TfLiteTensor* forget_gate_bias, + const TfLiteTensor* cell_bias, const TfLiteTensor* output_gate_bias, + const TfLiteTensor* projection_weights, const TfLiteTensor* projection_bias, + const TfLiteLSTMParams* params, bool forward_sequence, int output_offset, + TfLiteTensor* scratch_buffer, TfLiteTensor* activation_state, + TfLiteTensor* cell_state, TfLiteTensor* output); + +TfLiteStatus EvalHybrid( + const TfLiteTensor* input, const TfLiteTensor* input_to_input_weights, + const TfLiteTensor* input_to_forget_weights, + const TfLiteTensor* input_to_cell_weights, + const TfLiteTensor* input_to_output_weights, + const TfLiteTensor* recurrent_to_input_weights, + const TfLiteTensor* recurrent_to_forget_weights, + const TfLiteTensor* recurrent_to_cell_weights, + const TfLiteTensor* recurrent_to_output_weights, + const TfLiteTensor* cell_to_input_weights, + const TfLiteTensor* cell_to_forget_weights, + const TfLiteTensor* cell_to_output_weights, const TfLiteTensor* aux_input, + const TfLiteTensor* aux_input_to_input_weights, + const TfLiteTensor* aux_input_to_forget_weights, + const TfLiteTensor* aux_input_to_cell_weights, + const TfLiteTensor* aux_input_to_output_weights, + const TfLiteTensor* input_gate_bias, const TfLiteTensor* forget_gate_bias, + const TfLiteTensor* cell_bias, const TfLiteTensor* output_gate_bias, + const TfLiteTensor* projection_weights, const TfLiteTensor* projection_bias, + const TfLiteLSTMParams* params, bool forward_sequence, int output_offset, + TfLiteTensor* scratch_buffer, TfLiteTensor* scaling_factors, + TfLiteTensor* prod_scaling_factors, TfLiteTensor* recovered_cell_weights, + TfLiteTensor* input_quantized, TfLiteTensor* aux_input_quantized, + TfLiteTensor* output_state_quantized, TfLiteTensor* cell_state_quantized, + TfLiteTensor* output_state, TfLiteTensor* cell_state, TfLiteTensor* output); + +} // namespace lstm_eval +} // namespace builtin +} // namespace ops +} // namespace tflite +#endif // TENSORFLOW_CONTRIB_LITE_KERNELS_LSTM_EVAL_H_ diff --git a/tensorflow/contrib/lite/kernels/lstm_test.cc b/tensorflow/contrib/lite/kernels/lstm_test.cc index d81220d8d3..e7ddfceb45 100644 --- a/tensorflow/contrib/lite/kernels/lstm_test.cc +++ b/tensorflow/contrib/lite/kernels/lstm_test.cc @@ -13,8 +13,10 @@ See the License for the specific language governing 
permissions and limitations under the License. ==============================================================================*/ // Unit test for TFLite LSTM op. +// +// TODO(alanchiao): add unit test with invalid input dimensions for this and its +// variants. -#include <iomanip> #include <memory> #include <vector> @@ -35,7 +37,8 @@ class LSTMOpModel : public SingleOpModel { LSTMOpModel(int n_batch, int n_input, int n_cell, int n_output, bool use_cifg, bool use_peephole, bool use_projection_weights, bool use_projection_bias, float cell_clip, float proj_clip, - const std::vector<std::vector<int>>& input_shapes) + const std::vector<std::vector<int>>& input_shapes, + const TensorType& weight_type = TensorType_FLOAT32) : n_batch_(n_batch), n_input_(n_input), n_cell_(n_cell), @@ -45,31 +48,31 @@ class LSTMOpModel : public SingleOpModel { if (use_cifg) { input_to_input_weights_ = AddNullInput(); } else { - input_to_input_weights_ = AddInput(TensorType_FLOAT32); + input_to_input_weights_ = AddInput(weight_type); } - input_to_forget_weights_ = AddInput(TensorType_FLOAT32); - input_to_cell_weights_ = AddInput(TensorType_FLOAT32); - input_to_output_weights_ = AddInput(TensorType_FLOAT32); + input_to_forget_weights_ = AddInput(weight_type); + input_to_cell_weights_ = AddInput(weight_type); + input_to_output_weights_ = AddInput(weight_type); if (use_cifg) { recurrent_to_input_weights_ = AddNullInput(); } else { - recurrent_to_input_weights_ = AddInput(TensorType_FLOAT32); + recurrent_to_input_weights_ = AddInput(weight_type); } - recurrent_to_forget_weights_ = AddInput(TensorType_FLOAT32); - recurrent_to_cell_weights_ = AddInput(TensorType_FLOAT32); - recurrent_to_output_weights_ = AddInput(TensorType_FLOAT32); + recurrent_to_forget_weights_ = AddInput(weight_type); + recurrent_to_cell_weights_ = AddInput(weight_type); + recurrent_to_output_weights_ = AddInput(weight_type); if (use_peephole) { if (use_cifg) { cell_to_input_weights_ = AddNullInput(); } else { - cell_to_input_weights_ = AddInput(TensorType_FLOAT32); + cell_to_input_weights_ = AddInput(weight_type); } - cell_to_forget_weights_ = AddInput(TensorType_FLOAT32); - cell_to_output_weights_ = AddInput(TensorType_FLOAT32); + cell_to_forget_weights_ = AddInput(weight_type); + cell_to_output_weights_ = AddInput(weight_type); } else { cell_to_input_weights_ = AddNullInput(); cell_to_forget_weights_ = AddNullInput(); @@ -86,7 +89,7 @@ class LSTMOpModel : public SingleOpModel { output_gate_bias_ = AddInput(TensorType_FLOAT32); if (use_projection_weights) { - projection_weights_ = AddInput(TensorType_FLOAT32); + projection_weights_ = AddInput(weight_type); if (use_projection_bias) { projection_bias_ = AddInput(TensorType_FLOAT32); } else { @@ -97,14 +100,19 @@ class LSTMOpModel : public SingleOpModel { projection_bias_ = AddNullInput(); } - output_state_ = AddOutput(TensorType_FLOAT32); - cell_state_ = AddOutput(TensorType_FLOAT32); + // Adding the 2 input state tensors. 
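The two state tensors are now ordinary inputs added with the extra boolean flag, which, as far as this diff shows, marks them as persistent, zero-initialized variable tensors, so their contents survive across Invoke() calls and the tests no longer reset them by hand; that is also why the ResetOutputState/ResetCellState helpers are deleted further down. A toy illustration of that persistence semantics, not the SingleOpModel API:

#include <vector>

// Stand-in for a variable state tensor: starts zeroed and keeps whatever the
// op wrote into it across invocations.
struct FakeVariableState {
  std::vector<float> data;
  explicit FakeVariableState(int size) : data(size, 0.0f) {}
};

void InvokeOnce(FakeVariableState* state) {
  // Stand-in for one LSTM invocation: reads the old state, writes a new one.
  for (float& v : state->data) v = 0.5f * v + 1.0f;
}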
+ input_activation_state_ = + AddInput(TensorData{TensorType_FLOAT32, {n_output_ * n_batch_}}, true); + input_cell_state_ = + AddInput(TensorData{TensorType_FLOAT32, {n_cell_ * n_batch_}}, true); + output_ = AddOutput(TensorType_FLOAT32); SetBuiltinOp(BuiltinOperator_LSTM, BuiltinOptions_LSTMOptions, CreateLSTMOptions(builder_, ActivationFunctionType_TANH, cell_clip, proj_clip) .Union()); + BuildInterpreter(input_shapes); } @@ -176,24 +184,9 @@ class LSTMOpModel : public SingleOpModel { PopulateTensor(projection_bias_, f); } - void ResetOutputState() { - const int zero_buffer_size = n_cell_ * n_batch_; - std::unique_ptr<float[]> zero_buffer(new float[zero_buffer_size]); - memset(zero_buffer.get(), 0, zero_buffer_size * sizeof(float)); - PopulateTensor(output_state_, 0, zero_buffer.get(), - zero_buffer.get() + zero_buffer_size); - } - - void ResetCellState() { - const int zero_buffer_size = n_cell_ * n_batch_; - std::unique_ptr<float[]> zero_buffer(new float[zero_buffer_size]); - memset(zero_buffer.get(), 0, zero_buffer_size * sizeof(float)); - PopulateTensor(cell_state_, 0, zero_buffer.get(), - zero_buffer.get() + zero_buffer_size); - } - - void SetInput(int offset, float* begin, float* end) { - PopulateTensor(input_, offset, begin, end); + void SetInput(int offset, const float* begin, const float* end) { + PopulateTensor(input_, offset, const_cast<float*>(begin), + const_cast<float*>(end)); } std::vector<float> GetOutput() { return ExtractVector<float>(output_); } @@ -203,7 +196,7 @@ class LSTMOpModel : public SingleOpModel { int num_cells() { return n_cell_; } int num_batches() { return n_batch_; } - private: + protected: int input_; int input_to_input_weights_; int input_to_forget_weights_; @@ -226,6 +219,8 @@ class LSTMOpModel : public SingleOpModel { int projection_weights_; int projection_bias_; + int input_activation_state_; + int input_cell_state_; int output_; int output_state_; @@ -237,7 +232,174 @@ class LSTMOpModel : public SingleOpModel { int n_output_; }; -TEST(LSTMOpTest, BlackBoxTestNoCifgNoPeepholeNoProjectionNoClipping) { +class HybridLSTMOpModel : public LSTMOpModel { + public: + HybridLSTMOpModel(int n_batch, int n_input, int n_cell, int n_output, + bool use_cifg, bool use_peephole, + bool use_projection_weights, bool use_projection_bias, + float cell_clip, float proj_clip, + const std::vector<std::vector<int>>& input_shapes) + : LSTMOpModel(n_batch, n_input, n_cell, n_output, use_cifg, use_peephole, + use_projection_weights, use_projection_bias, cell_clip, + proj_clip, input_shapes, TensorType_UINT8) {} + + void SetInputToInputWeights(std::initializer_list<float> f) { + SymmetricQuantizeAndPopulate(input_to_input_weights_, f); + } + + void SetInputToForgetWeights(std::initializer_list<float> f) { + SymmetricQuantizeAndPopulate(input_to_forget_weights_, f); + } + + void SetInputToCellWeights(std::initializer_list<float> f) { + SymmetricQuantizeAndPopulate(input_to_cell_weights_, f); + } + + void SetInputToOutputWeights(std::initializer_list<float> f) { + SymmetricQuantizeAndPopulate(input_to_output_weights_, f); + } + + void SetRecurrentToInputWeights(std::initializer_list<float> f) { + SymmetricQuantizeAndPopulate(recurrent_to_input_weights_, f); + } + + void SetRecurrentToForgetWeights(std::initializer_list<float> f) { + SymmetricQuantizeAndPopulate(recurrent_to_forget_weights_, f); + } + + void SetRecurrentToCellWeights(std::initializer_list<float> f) { + SymmetricQuantizeAndPopulate(recurrent_to_cell_weights_, f); + } + + void 
SetRecurrentToOutputWeights(std::initializer_list<float> f) { + SymmetricQuantizeAndPopulate(recurrent_to_output_weights_, f); + } + + void SetCellToInputWeights(std::initializer_list<float> f) { + SymmetricQuantizeAndPopulate(cell_to_input_weights_, f); + } + + void SetCellToForgetWeights(std::initializer_list<float> f) { + SymmetricQuantizeAndPopulate(cell_to_forget_weights_, f); + } + + void SetCellToOutputWeights(std::initializer_list<float> f) { + SymmetricQuantizeAndPopulate(cell_to_output_weights_, f); + } + + void SetProjectionWeights(std::initializer_list<float> f) { + SymmetricQuantizeAndPopulate(projection_weights_, f); + } +}; + +class BaseLstmTest : public ::testing::Test { + protected: + // Weights of the LSTM model. Some are optional. + std::initializer_list<float> input_to_input_weights_; + std::initializer_list<float> input_to_cell_weights_; + std::initializer_list<float> input_to_forget_weights_; + std::initializer_list<float> input_to_output_weights_; + std::initializer_list<float> input_gate_bias_; + std::initializer_list<float> cell_gate_bias_; + std::initializer_list<float> forget_gate_bias_; + std::initializer_list<float> output_gate_bias_; + std::initializer_list<float> recurrent_to_input_weights_; + std::initializer_list<float> recurrent_to_cell_weights_; + std::initializer_list<float> recurrent_to_forget_weights_; + std::initializer_list<float> recurrent_to_output_weights_; + std::initializer_list<float> cell_to_input_weights_; + std::initializer_list<float> cell_to_forget_weights_; + std::initializer_list<float> cell_to_output_weights_; + std::initializer_list<float> projection_weights_; + + // LSTM input is stored as num_batch x num_inputs vector. + std::vector<std::vector<float>> lstm_input_; + // LSTM output is stored as num_batch x num_outputs vector. + std::vector<std::vector<float>> lstm_golden_output_; + + // Compares output up to tolerance to the result of the lstm given the input. 
+ void VerifyGoldens(const std::vector<std::vector<float>>& input, + const std::vector<std::vector<float>>& output, + LSTMOpModel* lstm, float tolerance = 1e-5) { + const int num_batches = input.size(); + EXPECT_GT(num_batches, 0); + const int num_inputs = lstm->num_inputs(); + EXPECT_GT(num_inputs, 0); + const int input_sequence_size = input[0].size() / num_inputs; + EXPECT_GT(input_sequence_size, 0); + for (int i = 0; i < input_sequence_size; ++i) { + for (int b = 0; b < num_batches; ++b) { + const float* batch_start = input[b].data() + i * num_inputs; + const float* batch_end = batch_start + num_inputs; + + lstm->SetInput(b * lstm->num_inputs(), batch_start, batch_end); + } + + lstm->Invoke(); + + const int num_outputs = lstm->num_outputs(); + std::vector<float> expected; + for (int b = 0; b < num_batches; ++b) { + const float* golden_start_batch = output[b].data() + i * num_outputs; + const float* golden_end_batch = golden_start_batch + num_outputs; + expected.insert(expected.end(), golden_start_batch, golden_end_batch); + } + EXPECT_THAT(lstm->GetOutput(), + ElementsAreArray(ArrayFloatNear(expected, tolerance))); + } + } +}; + +class NoCifgNoPeepholeNoProjectionNoClippingLstmTest : public BaseLstmTest { + void SetUp() override { + input_to_input_weights_ = {-0.45018822, -0.02338299, -0.0870589, + -0.34550029, 0.04266912, -0.15680569, + -0.34856534, 0.43890524}; + input_to_cell_weights_ = {-0.50013041, 0.1370284, 0.11810488, 0.2013163, + -0.20583314, 0.44344562, 0.22077113, -0.29909778}; + input_to_forget_weights_ = {0.09701663, 0.20334584, -0.50592935, + -0.31343272, -0.40032279, 0.44781327, + 0.01387155, -0.35593212}; + input_to_output_weights_ = {-0.25065863, -0.28290087, 0.04613829, + 0.40525138, 0.44272184, 0.03897077, + -0.1556896, 0.19487578}; + input_gate_bias_ = {0., 0., 0., 0.}; + cell_gate_bias_ = {0., 0., 0., 0.}; + forget_gate_bias_ = {1., 1., 1., 1.}; + output_gate_bias_ = {0., 0., 0., 0.}; + + recurrent_to_input_weights_ = { + -0.0063535, -0.2042388, 0.31454784, -0.35746509, + 0.28902304, 0.08183324, -0.16555229, 0.02286911, + -0.13566875, 0.03034258, 0.48091322, -0.12528998, + 0.24077177, -0.51332325, -0.33502164, 0.10629296}; + + recurrent_to_cell_weights_ = { + -0.3407414, 0.24443203, -0.2078532, 0.26320225, + 0.05695659, -0.00123841, -0.4744786, -0.35869038, + -0.06418842, -0.13502428, -0.501764, 0.22830659, + -0.46367589, 0.26016325, -0.03894562, -0.16368064}; + + recurrent_to_forget_weights_ = { + -0.48684245, -0.06655136, 0.42224967, 0.2112639, + 0.27654213, 0.20864892, -0.07646349, 0.45877004, + 0.00141793, -0.14609534, 0.36447752, 0.09196436, + 0.28053468, 0.01560611, -0.20127171, -0.01140004}; + + recurrent_to_output_weights_ = { + 0.43385774, -0.17194885, 0.2718237, 0.09215671, + 0.24107647, -0.39835793, 0.18212086, 0.01301402, + 0.48572797, -0.50656658, 0.20047462, -0.20607421, + -0.51818722, -0.15390486, 0.0468148, 0.39922136}; + + lstm_input_ = {{2., 3., 3., 4., 1., 1.}}; + lstm_golden_output_ = {{-0.02973187, 0.1229473, 0.20885126, -0.15358765, + -0.03716109, 0.12507336, 0.41193449, -0.20860538, + -0.15053082, 0.09120187, 0.24278517, -0.12222792}}; + } +}; + +TEST_F(NoCifgNoPeepholeNoProjectionNoClippingLstmTest, LstmBlackBoxTest) { const int n_batch = 1; const int n_input = 2; // n_cell and n_output have the same size when there is no projection. 
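VerifyGoldens above feeds the input one time step at a time with the batches interleaved, invokes the model, and compares the whole output vector against the golden values within a tolerance: the float tests rely on the default 1e-5, while the hybrid tests later in this file pass a looser bound to absorb the error introduced by 8-bit symmetric weight quantization. A stand-in for the comparison it delegates to ElementsAreArray(ArrayFloatNear(...)), illustrative only:

#include <cmath>
#include <cstddef>
#include <vector>

// Every element of 'actual' must match its golden value within 'tolerance'.
bool AllNear(const std::vector<float>& actual,
             const std::vector<float>& expected, float tolerance) {
  if (actual.size() != expected.size()) return false;
  for (size_t i = 0; i < actual.size(); ++i) {
    if (std::fabs(actual[i] - expected[i]) > tolerance) return false;
  }
  return true;
}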
@@ -257,10 +419,10 @@ TEST(LSTMOpTest, BlackBoxTestNoCifgNoPeepholeNoProjectionNoClipping) { {n_cell, n_input}, // input_to_cell_weight tensor {n_cell, n_input}, // input_to_output_weight tensor - {n_cell, n_output}, // recurrent_to_input_weight tensor - {n_cell, n_output}, // recurrent_to_forget_weight tensor - {n_cell, n_output}, // recurrent_to_cell_weight tensor - {n_cell, n_output}, // recurrent_to_output_weight tensor + {n_cell, n_output}, // recurrent_to_input_weight_tensor + {n_cell, n_output}, // recurrent_to_forget_weight_tensor + {n_cell, n_output}, // recurrent_to_cell_weight_tensor + {n_cell, n_output}, // recurrent_to_output_weight_tensor {0}, // cell_to_input_weight tensor {0}, // cell_to_forget_weight tensor @@ -275,79 +437,129 @@ TEST(LSTMOpTest, BlackBoxTestNoCifgNoPeepholeNoProjectionNoClipping) { {0}, // projection_bias tensor }); - lstm.SetInputToInputWeights({-0.45018822, -0.02338299, -0.0870589, - -0.34550029, 0.04266912, -0.15680569, - -0.34856534, 0.43890524}); - - lstm.SetInputToCellWeights({-0.50013041, 0.1370284, 0.11810488, 0.2013163, - -0.20583314, 0.44344562, 0.22077113, - -0.29909778}); - - lstm.SetInputToForgetWeights({0.09701663, 0.20334584, -0.50592935, - -0.31343272, -0.40032279, 0.44781327, - 0.01387155, -0.35593212}); - - lstm.SetInputToOutputWeights({-0.25065863, -0.28290087, 0.04613829, - 0.40525138, 0.44272184, 0.03897077, -0.1556896, - 0.19487578}); - - lstm.SetInputGateBias({0., 0., 0., 0.}); - - lstm.SetCellBias({0., 0., 0., 0.}); - - lstm.SetForgetGateBias({1., 1., 1., 1.}); - - lstm.SetOutputGateBias({0., 0., 0., 0.}); - - lstm.SetRecurrentToInputWeights( - {-0.0063535, -0.2042388, 0.31454784, -0.35746509, 0.28902304, 0.08183324, - -0.16555229, 0.02286911, -0.13566875, 0.03034258, 0.48091322, - -0.12528998, 0.24077177, -0.51332325, -0.33502164, 0.10629296}); + lstm.SetInputToInputWeights(input_to_input_weights_); + lstm.SetInputToCellWeights(input_to_cell_weights_); + lstm.SetInputToForgetWeights(input_to_forget_weights_); + lstm.SetInputToOutputWeights(input_to_output_weights_); - lstm.SetRecurrentToCellWeights( - {-0.3407414, 0.24443203, -0.2078532, 0.26320225, 0.05695659, -0.00123841, - -0.4744786, -0.35869038, -0.06418842, -0.13502428, -0.501764, 0.22830659, - -0.46367589, 0.26016325, -0.03894562, -0.16368064}); + lstm.SetInputGateBias(input_gate_bias_); + lstm.SetCellBias(cell_gate_bias_); + lstm.SetForgetGateBias(forget_gate_bias_); + lstm.SetOutputGateBias(output_gate_bias_); - lstm.SetRecurrentToForgetWeights( - {-0.48684245, -0.06655136, 0.42224967, 0.2112639, 0.27654213, 0.20864892, - -0.07646349, 0.45877004, 0.00141793, -0.14609534, 0.36447752, 0.09196436, - 0.28053468, 0.01560611, -0.20127171, -0.01140004}); + lstm.SetRecurrentToInputWeights(recurrent_to_input_weights_); + lstm.SetRecurrentToCellWeights(recurrent_to_cell_weights_); + lstm.SetRecurrentToForgetWeights(recurrent_to_forget_weights_); + lstm.SetRecurrentToOutputWeights(recurrent_to_output_weights_); - lstm.SetRecurrentToOutputWeights( - {0.43385774, -0.17194885, 0.2718237, 0.09215671, 0.24107647, -0.39835793, - 0.18212086, 0.01301402, 0.48572797, -0.50656658, 0.20047462, -0.20607421, - -0.51818722, -0.15390486, 0.0468148, 0.39922136}); - - static float lstm_input[] = {2., 3., 3., 4., 1., 1.}; - static float lstm_golden_output[] = {-0.02973187, 0.1229473, 0.20885126, - -0.15358765, -0.03716109, 0.12507336, - 0.41193449, -0.20860538, -0.15053082, - 0.09120187, 0.24278517, -0.12222792}; + VerifyGoldens(lstm_input_, lstm_golden_output_, &lstm); +} - // Resetting cell_state 
and output_state - lstm.ResetCellState(); - lstm.ResetOutputState(); +TEST_F(NoCifgNoPeepholeNoProjectionNoClippingLstmTest, HybridLstmBlackBoxTest) { + const int n_batch = 1; + const int n_input = 2; + // n_cell and n_output have the same size when there is no projection. + const int n_cell = 4; + const int n_output = 4; - const int input_sequence_size = - sizeof(lstm_input) / sizeof(float) / (lstm.num_inputs()); - for (int i = 0; i < input_sequence_size; i++) { - float* batch0_start = lstm_input + i * lstm.num_inputs(); - float* batch0_end = batch0_start + lstm.num_inputs(); + HybridLSTMOpModel lstm( + n_batch, n_input, n_cell, n_output, + /*use_cifg=*/false, /*use_peephole=*/false, + /*use_projection_weights=*/false, + /*use_projection_bias=*/false, /*cell_clip=*/0.0, /*proj_clip=*/0.0, + { + {n_batch, n_input}, // input tensor + + {n_cell, n_input}, // input_to_input_weight tensor + {n_cell, n_input}, // input_to_forget_weight tensor + {n_cell, n_input}, // input_to_cell_weight tensor + {n_cell, n_input}, // input_to_output_weight tensor + + {n_cell, n_output}, // recurrent_to_input_weight tensor + {n_cell, n_output}, // recurrent_to_forget_weight tensor + {n_cell, n_output}, // recurrent_to_cell_weight tensor + {n_cell, n_output}, // recurrent_to_output_weight tensor + + {0}, // cell_to_input_weight tensor + {0}, // cell_to_forget_weight tensor + {0}, // cell_to_output_weight tensor + + {n_cell}, // input_gate_bias tensor + {n_cell}, // forget_gate_bias tensor + {n_cell}, // cell_bias tensor + {n_cell}, // output_gate_bias tensor + + {0, 0}, // projection_weight tensor + {0}, // projection_bias tensor + }); + + lstm.SetInputToInputWeights(input_to_input_weights_); + lstm.SetInputToCellWeights(input_to_cell_weights_); + lstm.SetInputToForgetWeights(input_to_forget_weights_); + lstm.SetInputToOutputWeights(input_to_output_weights_); + + lstm.SetInputGateBias(input_gate_bias_); + lstm.SetCellBias(cell_gate_bias_); + lstm.SetForgetGateBias(forget_gate_bias_); + lstm.SetOutputGateBias(output_gate_bias_); + + lstm.SetRecurrentToInputWeights(recurrent_to_input_weights_); + lstm.SetRecurrentToCellWeights(recurrent_to_cell_weights_); + lstm.SetRecurrentToForgetWeights(recurrent_to_forget_weights_); + lstm.SetRecurrentToOutputWeights(recurrent_to_output_weights_); + + VerifyGoldens(lstm_input_, lstm_golden_output_, &lstm, + /*tolerance=*/0.0157651); +} - lstm.SetInput(0, batch0_start, batch0_end); +class CifgNoPeepholeNoProjectionNoClippingLstmTest : public BaseLstmTest { + void SetUp() override { + input_to_cell_weights_ = {-0.49770179, -0.27711356, -0.09624726, + 0.05100781, 0.04717243, 0.48944736, + -0.38535351, -0.17212132}; - lstm.Invoke(); + input_to_forget_weights_ = {-0.55291498, -0.42866567, 0.13056988, + -0.3633365, -0.22755712, 0.28253698, + 0.24407166, 0.33826375}; - float* golden_start = lstm_golden_output + i * lstm.num_outputs(); - float* golden_end = golden_start + lstm.num_outputs(); - std::vector<float> expected; - expected.insert(expected.end(), golden_start, golden_end); - EXPECT_THAT(lstm.GetOutput(), ElementsAreArray(ArrayFloatNear(expected))); + input_to_output_weights_ = {0.10725588, -0.02335852, -0.55932593, + -0.09426838, -0.44257352, 0.54939759, + 0.01533556, 0.42751634}; + cell_gate_bias_ = {0., 0., 0., 0.}; + forget_gate_bias_ = {1., 1., 1., 1.}; + output_gate_bias_ = {0., 0., 0., 0.}; + + recurrent_to_cell_weights_ = { + 0.54066205, -0.32668582, -0.43562764, -0.56094903, + 0.42957711, 0.01841056, -0.32764608, -0.33027974, + -0.10826075, 0.20675004, 0.19069612, 
-0.03026325, + -0.54532051, 0.33003211, 0.44901288, 0.21193194}; + + recurrent_to_forget_weights_ = { + -0.13832897, -0.0515101, -0.2359007, -0.16661474, + -0.14340827, 0.36986142, 0.23414481, 0.55899, + 0.10798943, -0.41174671, 0.17751795, -0.34484994, + -0.35874045, -0.11352962, 0.27268326, 0.54058349}; + + recurrent_to_output_weights_ = { + 0.41613156, 0.42610586, -0.16495961, -0.5663873, + 0.30579174, -0.05115908, -0.33941799, 0.23364776, + 0.11178309, 0.09481031, -0.26424935, 0.46261835, + 0.50248802, 0.26114327, -0.43736315, 0.33149987}; + + cell_to_forget_weights_ = {0.47485286, -0.51955009, -0.24458408, + 0.31544167}; + cell_to_output_weights_ = {-0.17135078, 0.82760304, 0.85573703, + -0.77109635}; + + lstm_input_ = {{2., 3., 3., 4., 1., 1.}}; + lstm_golden_output_ = {{-0.36444446, -0.00352185, 0.12886585, -0.05163646, + -0.42312205, -0.01218222, 0.24201041, -0.08124574, + -0.358325, -0.04621704, 0.21641694, -0.06471302}}; } -} +}; -TEST(LSTMOpTest, BlackBoxTestWithCifgWithPeepholeNoProjectionNoClipping) { +TEST_F(CifgNoPeepholeNoProjectionNoClippingLstmTest, LstmBlackBoxTest) { const int n_batch = 1; const int n_input = 2; // n_cell and n_output have the same size when there is no projection. @@ -385,74 +597,681 @@ TEST(LSTMOpTest, BlackBoxTestWithCifgWithPeepholeNoProjectionNoClipping) { {0}, // projection_bias tensor }); - lstm.SetInputToCellWeights({-0.49770179, -0.27711356, -0.09624726, 0.05100781, - 0.04717243, 0.48944736, -0.38535351, - -0.17212132}); - - lstm.SetInputToForgetWeights({-0.55291498, -0.42866567, 0.13056988, - -0.3633365, -0.22755712, 0.28253698, 0.24407166, - 0.33826375}); - - lstm.SetInputToOutputWeights({0.10725588, -0.02335852, -0.55932593, - -0.09426838, -0.44257352, 0.54939759, - 0.01533556, 0.42751634}); - - lstm.SetCellBias({0., 0., 0., 0.}); - - lstm.SetForgetGateBias({1., 1., 1., 1.}); - - lstm.SetOutputGateBias({0., 0., 0., 0.}); - - lstm.SetRecurrentToCellWeights( - {0.54066205, -0.32668582, -0.43562764, -0.56094903, 0.42957711, - 0.01841056, -0.32764608, -0.33027974, -0.10826075, 0.20675004, - 0.19069612, -0.03026325, -0.54532051, 0.33003211, 0.44901288, - 0.21193194}); + lstm.SetInputToCellWeights(input_to_cell_weights_); + lstm.SetInputToForgetWeights(input_to_forget_weights_); + lstm.SetInputToOutputWeights(input_to_output_weights_); - lstm.SetRecurrentToForgetWeights( - {-0.13832897, -0.0515101, -0.2359007, -0.16661474, -0.14340827, - 0.36986142, 0.23414481, 0.55899, 0.10798943, -0.41174671, 0.17751795, - -0.34484994, -0.35874045, -0.11352962, 0.27268326, 0.54058349}); + lstm.SetCellBias(cell_gate_bias_); + lstm.SetForgetGateBias(forget_gate_bias_); + lstm.SetOutputGateBias(output_gate_bias_); - lstm.SetRecurrentToOutputWeights( - {0.41613156, 0.42610586, -0.16495961, -0.5663873, 0.30579174, -0.05115908, - -0.33941799, 0.23364776, 0.11178309, 0.09481031, -0.26424935, 0.46261835, - 0.50248802, 0.26114327, -0.43736315, 0.33149987}); + lstm.SetRecurrentToCellWeights(recurrent_to_cell_weights_); + lstm.SetRecurrentToForgetWeights(recurrent_to_forget_weights_); + lstm.SetRecurrentToOutputWeights(recurrent_to_output_weights_); - lstm.SetCellToForgetWeights( - {0.47485286, -0.51955009, -0.24458408, 0.31544167}); - lstm.SetCellToOutputWeights( - {-0.17135078, 0.82760304, 0.85573703, -0.77109635}); + lstm.SetCellToForgetWeights(cell_to_forget_weights_); + lstm.SetCellToOutputWeights(cell_to_output_weights_); - static float lstm_input[] = {2., 3., 3., 4., 1., 1.}; - static float lstm_golden_output[] = {-0.36444446, -0.00352185, 0.12886585, - -0.05163646, 
-0.42312205, -0.01218222, - 0.24201041, -0.08124574, -0.358325, - -0.04621704, 0.21641694, -0.06471302}; - - // Resetting cell_state and output_state - lstm.ResetCellState(); - lstm.ResetOutputState(); - - const int input_sequence_size = - sizeof(lstm_input) / sizeof(float) / (lstm.num_inputs()); - for (int i = 0; i < input_sequence_size; i++) { - float* batch0_start = lstm_input + i * lstm.num_inputs(); - float* batch0_end = batch0_start + lstm.num_inputs(); + VerifyGoldens(lstm_input_, lstm_golden_output_, &lstm); +} - lstm.SetInput(0, batch0_start, batch0_end); +TEST_F(CifgNoPeepholeNoProjectionNoClippingLstmTest, HybridLstmBlackBoxTest) { + const int n_batch = 1; + const int n_input = 2; + // n_cell and n_output have the same size when there is no projection. + const int n_cell = 4; + const int n_output = 4; - lstm.Invoke(); + HybridLSTMOpModel lstm( + n_batch, n_input, n_cell, n_output, + /*use_cifg=*/true, /*use_peephole=*/true, + /*use_projection_weights=*/false, + /*use_projection_bias=*/false, + /*cell_clip=*/0.0, /*proj_clip=*/0.0, + { + {n_batch, n_input}, // input tensor + + {0, 0}, // input_to_input_weight tensor + {n_cell, n_input}, // input_to_forget_weight tensor + {n_cell, n_input}, // input_to_cell_weight tensor + {n_cell, n_input}, // input_to_output_weight tensor + + {0, 0}, // recurrent_to_input_weight tensor + {n_cell, n_output}, // recurrent_to_forget_weight tensor + {n_cell, n_output}, // recurrent_to_cell_weight tensor + {n_cell, n_output}, // recurrent_to_output_weight tensor + + {0}, // cell_to_input_weight tensor + {n_cell}, // cell_to_forget_weight tensor + {n_cell}, // cell_to_output_weight tensor + + {0}, // input_gate_bias tensor + {n_cell}, // forget_gate_bias tensor + {n_cell}, // cell_bias tensor + {n_cell}, // output_gate_bias tensor + + {0, 0}, // projection_weight tensor + {0}, // projection_bias tensor + }); + + lstm.SetInputToCellWeights(input_to_cell_weights_); + lstm.SetInputToForgetWeights(input_to_forget_weights_); + lstm.SetInputToOutputWeights(input_to_output_weights_); + + lstm.SetCellBias(cell_gate_bias_); + lstm.SetForgetGateBias(forget_gate_bias_); + lstm.SetOutputGateBias(output_gate_bias_); + + lstm.SetRecurrentToCellWeights(recurrent_to_cell_weights_); + lstm.SetRecurrentToForgetWeights(recurrent_to_forget_weights_); + lstm.SetRecurrentToOutputWeights(recurrent_to_output_weights_); + + lstm.SetCellToForgetWeights(cell_to_forget_weights_); + lstm.SetCellToOutputWeights(cell_to_output_weights_); + + VerifyGoldens(lstm_input_, lstm_golden_output_, &lstm, /*tolerance=*/0.03573); +} - float* golden_start = lstm_golden_output + i * lstm.num_outputs(); - float* golden_end = golden_start + lstm.num_outputs(); - std::vector<float> expected; - expected.insert(expected.end(), golden_start, golden_end); - EXPECT_THAT(lstm.GetOutput(), ElementsAreArray(ArrayFloatNear(expected))); +class NoCifgPeepholeProjectionNoClippingLstmTest : public BaseLstmTest { + void SetUp() override { + input_to_input_weights_ = { + 0.021393683, 0.06124551, 0.046905167, -0.014657677, -0.03149463, + 0.09171803, 0.14647801, 0.10797193, -0.0057968358, 0.0019193048, + -0.2726754, 0.10154029, -0.018539885, 0.080349885, -0.10262385, + -0.022599787, -0.09121155, -0.008675967, -0.045206103, -0.0821282, + -0.008045952, 0.015478081, 0.055217247, 0.038719587, 0.044153627, + -0.06453243, 0.05031825, -0.046935108, -0.008164439, 0.014574226, + -0.1671009, -0.15519552, -0.16819797, -0.13971269, -0.11953059, + 0.25005487, -0.22790983, 0.009855087, -0.028140958, -0.11200698, + 0.11295408, 
-0.0035217577, 0.054485075, 0.05184695, 0.064711206, + 0.10989193, 0.11674786, 0.03490607, 0.07727357, 0.11390585, + -0.1863375, -0.1034451, -0.13945189, -0.049401227, -0.18767063, + 0.042483903, 0.14233552, 0.13832581, 0.18350165, 0.14545603, + -0.028545704, 0.024939531, 0.050929718, 0.0076203286, -0.0029723682, + -0.042484224, -0.11827596, -0.09171104, -0.10808628, -0.16327988, + -0.2273378, -0.0993647, -0.017155107, 0.0023917493, 0.049272764, + 0.0038534778, 0.054764505, 0.089753784, 0.06947234, 0.08014476, + -0.04544234, -0.0497073, -0.07135631, -0.048929106, -0.004042012, + -0.009284026, 0.018042054, 0.0036860977, -0.07427302, -0.11434604, + -0.018995456, 0.031487543, 0.012834908, 0.019977754, 0.044256654, + -0.39292613, -0.18519334, -0.11651281, -0.06809892, 0.011373677}; + + input_to_forget_weights_ = { + -0.0018401089, -0.004852237, 0.03698424, 0.014181704, + 0.028273236, -0.016726194, -0.05249759, -0.10204261, + 0.00861066, -0.040979505, -0.009899187, 0.01923892, + -0.028177269, -0.08535103, -0.14585495, 0.10662567, + -0.01909731, -0.017883534, -0.0047269356, -0.045103323, + 0.0030784295, 0.076784775, 0.07463696, 0.094531395, + 0.0814421, -0.12257899, -0.033945758, -0.031303465, + 0.045630626, 0.06843887, -0.13492945, -0.012480007, + -0.0811829, -0.07224499, -0.09628791, 0.045100946, + 0.0012300825, 0.013964662, 0.099372394, 0.02543059, + 0.06958324, 0.034257296, 0.0482646, 0.06267997, + 0.052625068, 0.12784666, 0.07077897, 0.025725935, + 0.04165009, 0.07241905, 0.018668644, -0.037377294, + -0.06277783, -0.08833636, -0.040120605, -0.011405586, + -0.007808335, -0.010301386, -0.005102167, 0.027717464, + 0.05483423, 0.11449111, 0.11289652, 0.10939839, + 0.13396506, -0.08402166, -0.01901462, -0.044678304, + -0.07720565, 0.014350063, -0.11757958, -0.0652038, + -0.08185733, -0.076754324, -0.092614375, 0.10405491, + 0.052960336, 0.035755895, 0.035839386, -0.012540553, + 0.036881298, 0.02913376, 0.03420159, 0.05448447, + -0.054523353, 0.02582715, 0.02327355, -0.011857179, + -0.0011980024, -0.034641717, -0.026125094, -0.17582615, + -0.15923657, -0.27486774, -0.0006143371, 0.0001771948, + -8.470171e-05, 0.02651807, 0.045790765, 0.06956496}; + + input_to_cell_weights_ = { + -0.04580283, -0.09549462, -0.032418985, -0.06454633, + -0.043528453, 0.043018587, -0.049152344, -0.12418144, + -0.078985475, -0.07596889, 0.019484362, -0.11434962, + -0.0074034138, -0.06314844, -0.092981495, 0.0062155537, + -0.025034338, -0.0028890965, 0.048929527, 0.06235075, + 0.10665918, -0.032036792, -0.08505916, -0.10843358, + -0.13002433, -0.036816437, -0.02130134, -0.016518239, + 0.0047691227, -0.0025825808, 0.066017866, 0.029991534, + -0.10652836, -0.1037554, -0.13056071, -0.03266643, + -0.033702414, -0.006473424, -0.04611692, 0.014419339, + -0.025174323, 0.0396852, 0.081777506, 0.06157468, + 0.10210095, -0.009658194, 0.046511717, 0.03603906, + 0.0069369148, 0.015960095, -0.06507666, 0.09551598, + 0.053568836, 0.06408714, 0.12835667, -0.008714329, + -0.20211966, -0.12093674, 0.029450472, 0.2849013, + -0.029227901, 0.1164364, -0.08560263, 0.09941786, + -0.036999565, -0.028842626, -0.0033637602, -0.017012902, + -0.09720865, -0.11193351, -0.029155117, -0.017936034, + -0.009768936, -0.04223324, -0.036159635, 0.06505112, + -0.021742892, -0.023377212, -0.07221364, -0.06430552, + 0.05453865, 0.091149814, 0.06387331, 0.007518393, + 0.055960953, 0.069779344, 0.046411168, 0.10509911, + 0.07463894, 0.0075130584, 0.012850982, 0.04555431, + 0.056955688, 0.06555285, 0.050801456, -0.009862683, + 0.00826772, -0.026555609, 
-0.0073611983, -0.0014897042}; + + input_to_output_weights_ = { + -0.0998932, -0.07201956, -0.052803773, -0.15629593, -0.15001918, + -0.07650751, 0.02359855, -0.075155355, -0.08037709, -0.15093534, + 0.029517552, -0.04751393, 0.010350531, -0.02664851, -0.016839722, + -0.023121163, 0.0077019283, 0.012851257, -0.05040649, -0.0129761, + -0.021737747, -0.038305793, -0.06870586, -0.01481247, -0.001285394, + 0.10124236, 0.083122835, 0.053313006, -0.062235646, -0.075637154, + -0.027833903, 0.029774971, 0.1130802, 0.09218906, 0.09506135, + -0.086665764, -0.037162706, -0.038880914, -0.035832845, -0.014481564, + -0.09825003, -0.12048569, -0.097665586, -0.05287633, -0.0964047, + -0.11366429, 0.035777505, 0.13568819, 0.052451383, 0.050649304, + 0.05798951, -0.021852335, -0.099848844, 0.014740475, -0.078897946, + 0.04974699, 0.014160473, 0.06973932, 0.04964942, 0.033364646, + 0.08190124, 0.025535367, 0.050893165, 0.048514254, 0.06945813, + -0.078907564, -0.06707616, -0.11844508, -0.09986688, -0.07509403, + 0.06263226, 0.14925587, 0.20188436, 0.12098451, 0.14639415, + 0.0015017595, -0.014267382, -0.03417257, 0.012711468, 0.0028300495, + -0.024758482, -0.05098548, -0.0821182, 0.014225672, 0.021544158, + 0.08949725, 0.07505268, -0.0020780868, 0.04908258, 0.06476295, + -0.022907063, 0.027562456, 0.040185735, 0.019567577, -0.015598739, + -0.049097303, -0.017121866, -0.083368234, -0.02332002, -0.0840956}; + + input_gate_bias_ = {0.02234832, 0.14757581, 0.18176508, 0.10380666, + 0.053110216, -0.06928846, -0.13942584, -0.11816189, + 0.19483899, 0.03652339, -0.10250295, 0.036714908, + -0.18426876, 0.036065217, 0.21810818, 0.02383196, + -0.043370757, 0.08690144, -0.04444982, 0.00030581196}; + + forget_gate_bias_ = {0.035185695, -0.042891346, -0.03032477, 0.23027696, + 0.11098921, 0.15378423, 0.09263801, 0.09790885, + 0.09508917, 0.061199076, 0.07665568, -0.015443159, + -0.03499149, 0.046190713, 0.08895977, 0.10899629, + 0.40694186, 0.06030037, 0.012413437, -0.06108739}; + + cell_gate_bias_ = {-0.024379363, 0.0055531194, 0.23377132, 0.033463873, + -0.1483596, -0.10639995, -0.091433935, 0.058573797, + -0.06809782, -0.07889636, -0.043246906, -0.09829136, + -0.4279842, 0.034901652, 0.18797937, 0.0075234566, + 0.016178843, 0.1749513, 0.13975595, 0.92058027}; + + output_gate_bias_ = {0.046159424, -0.0012809046, 0.03563469, 0.12648113, + 0.027195795, 0.35373217, -0.018957434, 0.008907322, + -0.0762701, 0.12018895, 0.04216877, 0.0022856654, + 0.040952638, 0.3147856, 0.08225149, -0.057416286, + -0.14995944, -0.008040261, 0.13208859, 0.029760877}; + + recurrent_to_input_weights_ = { + -0.001374326, -0.078856036, 0.10672688, 0.029162422, + -0.11585556, 0.02557986, -0.13446963, -0.035785314, + -0.01244275, 0.025961924, -0.02337298, -0.044228926, + -0.055839065, -0.046598054, -0.010546039, -0.06900766, + 0.027239809, 0.022582639, -0.013296484, -0.05459212, + 0.08981, -0.045407712, 0.08682226, -0.06867011, + -0.14390695, -0.02916037, 0.000996957, 0.091420636, + 0.14283475, -0.07390571, -0.06402044, 0.062524505, + -0.093129106, 0.04860203, -0.08364217, -0.08119002, + 0.009352075, 0.22920375, 0.0016303885, 0.11583097, + -0.13732095, 0.012405723, -0.07551853, 0.06343048, + 0.12162708, -0.031923793, -0.014335606, 0.01790974, + -0.10650317, -0.0724401, 0.08554849, -0.05727212, + 0.06556731, -0.042729504, -0.043227166, 0.011683251, + -0.013082158, -0.029302018, -0.010899579, -0.062036745, + -0.022509435, -0.00964907, -0.01567329, 0.04260106, + -0.07787477, -0.11576462, 0.017356863, 0.048673786, + -0.017577527, -0.05527947, 
-0.082487635, -0.040137455, + -0.10820036, -0.04666372, 0.022746278, -0.07851417, + 0.01068115, 0.032956902, 0.022433773, 0.0026891115, + 0.08944216, -0.0685835, 0.010513544, 0.07228705, + 0.02032331, -0.059686817, -0.0005566496, -0.086984694, + 0.040414046, -0.1380399, 0.094208956, -0.05722982, + 0.012092817, -0.04989123, -0.086576, -0.003399834, + -0.04696032, -0.045747425, 0.10091314, 0.048676282, + -0.029037097, 0.031399418, -0.0040285117, 0.047237843, + 0.09504992, 0.041799378, -0.049185462, -0.031518843, + -0.10516937, 0.026374253, 0.10058866, -0.0033195973, + -0.041975245, 0.0073591834, 0.0033782164, -0.004325073, + -0.10167381, 0.042500053, -0.01447153, 0.06464186, + -0.017142897, 0.03312627, 0.009205989, 0.024138335, + -0.011337001, 0.035530265, -0.010912711, 0.0706555, + -0.005894094, 0.051841937, -0.1401738, -0.02351249, + 0.0365468, 0.07590991, 0.08838724, 0.021681072, + -0.10086113, 0.019608743, -0.06195883, 0.077335775, + 0.023646897, -0.095322326, 0.02233014, 0.09756986, + -0.048691444, -0.009579111, 0.07595467, 0.11480546, + -0.09801813, 0.019894179, 0.08502348, 0.004032281, + 0.037211012, 0.068537936, -0.048005626, -0.091520436, + -0.028379958, -0.01556313, 0.06554592, -0.045599163, + -0.01672207, -0.020169014, -0.011877351, -0.20212261, + 0.010889619, 0.0047078193, 0.038385306, 0.08540671, + -0.017140968, -0.0035865551, 0.016678626, 0.005633034, + 0.015963363, 0.00871737, 0.060130805, 0.028611384, + 0.10109069, -0.015060172, -0.07894427, 0.06401885, + 0.011584063, -0.024466386, 0.0047652307, -0.09041358, + 0.030737216, -0.0046374933, 0.14215417, -0.11823516, + 0.019899689, 0.006106124, -0.027092824, 0.0786356, + 0.05052217, -0.058925, -0.011402121, -0.024987547, + -0.0013661642, -0.06832946, -0.015667673, -0.1083353, + -0.00096863037, -0.06988685, -0.053350925, -0.027275559, + -0.033664223, -0.07978348, -0.025200296, -0.017207067, + -0.058403496, -0.055697463, 0.005798788, 0.12965427, + -0.062582195, 0.0013350133, -0.10482091, 0.0379771, + 0.072521195, -0.0029455067, -0.13797039, -0.03628521, + 0.013806405, -0.017858358, -0.01008298, -0.07700066, + -0.017081132, 0.019358726, 0.0027079724, 0.004635139, + 0.062634714, -0.02338735, -0.039547626, -0.02050681, + 0.03385117, -0.083611414, 0.002862572, -0.09421313, + 0.058618143, -0.08598433, 0.00972939, 0.023867095, + -0.053934585, -0.023203006, 0.07452513, -0.048767887, + -0.07314807, -0.056307215, -0.10433547, -0.06440842, + 0.04328182, 0.04389765, -0.020006588, -0.09076438, + -0.11652589, -0.021705797, 0.03345259, -0.010329105, + -0.025767034, 0.013057034, -0.07316461, -0.10145612, + 0.06358255, 0.18531723, 0.07759293, 0.12006465, + 0.1305557, 0.058638252, -0.03393652, 0.09622831, + -0.16253184, -2.4580743e-06, 0.079869635, -0.070196845, + -0.005644518, 0.06857898, -0.12598175, -0.035084512, + 0.03156317, -0.12794146, -0.031963028, 0.04692781, + 0.030070418, 0.0071660685, -0.095516115, -0.004643372, + 0.040170413, -0.062104587, -0.0037324072, 0.0554317, + 0.08184801, -0.019164372, 0.06791302, 0.034257166, + -0.10307039, 0.021943003, 0.046745934, 0.0790918, + -0.0265588, -0.007824208, 0.042546265, -0.00977924, + -0.0002440307, -0.017384544, -0.017990116, 0.12252321, + -0.014512694, -0.08251313, 0.08861942, 0.13589665, + 0.026351685, 0.012641483, 0.07466548, 0.044301085, + -0.045414884, -0.051112458, 0.03444247, -0.08502782, + -0.04106223, -0.028126027, 0.028473156, 0.10467447}; + + recurrent_to_cell_weights_ = { + -0.037322544, 0.018592842, 0.0056175636, -0.06253426, + 0.055647098, -0.05713207, -0.05626563, 0.005559383, + 
0.03375411, -0.025757805, -0.088049285, 0.06017052, + -0.06570978, 0.007384076, 0.035123326, -0.07920549, + 0.053676967, 0.044480428, -0.07663568, 0.0071805613, + 0.08089997, 0.05143358, 0.038261272, 0.03339287, + -0.027673481, 0.044746667, 0.028349208, 0.020090483, + -0.019443132, -0.030755889, -0.0040000007, 0.04465846, + -0.021585021, 0.0031670958, 0.0053199246, -0.056117613, + -0.10893326, 0.076739706, -0.08509834, -0.027997585, + 0.037871376, 0.01449768, -0.09002357, -0.06111149, + -0.046195522, 0.0422062, -0.005683705, -0.1253618, + -0.012925729, -0.04890792, 0.06985068, 0.037654128, + 0.03398274, -0.004781977, 0.007032333, -0.031787455, + 0.010868644, -0.031489216, 0.09525667, 0.013939797, + 0.0058680447, 0.0167067, 0.02668468, -0.04797466, + -0.048885044, -0.12722108, 0.035304096, 0.06554885, + 0.00972396, -0.039238118, -0.05159735, -0.11329045, + 0.1613692, -0.03750952, 0.06529313, -0.071974665, + -0.11769596, 0.015524369, -0.0013754242, -0.12446318, + 0.02786344, -0.014179351, 0.005264273, 0.14376344, + 0.015983658, 0.03406988, -0.06939408, 0.040699873, + 0.02111075, 0.09669095, 0.041345075, -0.08316494, + -0.07684199, -0.045768797, 0.032298047, -0.041805092, + 0.0119405, 0.0061010392, 0.12652606, 0.0064572375, + -0.024950314, 0.11574242, 0.04508852, -0.04335324, + 0.06760663, -0.027437469, 0.07216407, 0.06977076, + -0.05438599, 0.034033038, -0.028602652, 0.05346137, + 0.043184172, -0.037189785, 0.10420091, 0.00882477, + -0.054019816, -0.074273005, -0.030617684, -0.0028467078, + 0.024302477, -0.0038869337, 0.005332455, 0.0013399826, + 0.04361412, -0.007001822, 0.09631092, -0.06702025, + -0.042049985, -0.035070654, -0.04103342, -0.10273396, + 0.0544271, 0.037184782, -0.13150354, -0.0058036847, + -0.008264958, 0.042035464, 0.05891794, 0.029673764, + 0.0063542654, 0.044788733, 0.054816857, 0.062257513, + -0.00093483756, 0.048938446, -0.004952862, -0.007730018, + -0.04043371, -0.017094059, 0.07229206, -0.023670016, + -0.052195564, -0.025616996, -0.01520939, 0.045104615, + -0.007376126, 0.003533447, 0.006570588, 0.056037236, + 0.12436656, 0.051817212, 0.028532185, -0.08686856, + 0.11868599, 0.07663395, -0.07323171, 0.03463402, + -0.050708205, -0.04458982, -0.11590894, 0.021273347, + 0.1251325, -0.15313013, -0.12224372, 0.17228661, + 0.023029093, 0.086124025, 0.006445803, -0.03496501, + 0.028332196, 0.04449512, -0.042436164, -0.026587414, + -0.006041347, -0.09292539, -0.05678812, 0.03897832, + 0.09465633, 0.008115513, -0.02171956, 0.08304309, + 0.071401566, 0.019622514, 0.032163795, -0.004167056, + 0.02295182, 0.030739572, 0.056506045, 0.004612461, + 0.06524936, 0.059999723, 0.046395954, -0.0045512207, + -0.1335546, -0.030136576, 0.11584653, -0.014678886, + 0.0020118146, -0.09688814, -0.0790206, 0.039770417, + -0.0329582, 0.07922767, 0.029322514, 0.026405897, + 0.04207835, -0.07073373, 0.063781224, 0.0859677, + -0.10925287, -0.07011058, 0.048005477, 0.03438226, + -0.09606514, -0.006669445, -0.043381985, 0.04240257, + -0.06955775, -0.06769346, 0.043903265, -0.026784198, + -0.017840602, 0.024307009, -0.040079936, -0.019946516, + 0.045318738, -0.12233574, 0.026170589, 0.0074471775, + 0.15978073, 0.10185836, 0.10298046, -0.015476589, + -0.039390966, -0.072174534, 0.0739445, -0.1211869, + -0.0347889, -0.07943156, 0.014809798, -0.12412325, + -0.0030663363, 0.039695457, 0.0647603, -0.08291318, + -0.018529687, -0.004423833, 0.0037507233, 0.084633216, + -0.01514876, -0.056505352, -0.012800942, -0.06994386, + 0.012962922, -0.031234352, 0.07029052, 0.016418684, + 0.03618972, 0.055686004, 
-0.08663945, -0.017404709, + -0.054761406, 0.029065743, 0.052404847, 0.020238016, + 0.0048197987, -0.0214882, 0.07078733, 0.013016777, + 0.06262858, 0.009184685, 0.020785125, -0.043904778, + -0.0270329, -0.03299152, -0.060088247, -0.015162964, + -0.001828936, 0.12642565, -0.056757294, 0.013586685, + 0.09232601, -0.035886683, 0.06000002, 0.05229691, + -0.052580316, -0.082029596, -0.010794592, 0.012947712, + -0.036429964, -0.085508935, -0.13127148, -0.017744139, + 0.031502828, 0.036232427, -0.031581745, 0.023051167, + -0.05325106, -0.03421577, 0.028793324, -0.034633752, + -0.009881397, -0.043551125, -0.018609839, 0.0019097115, + -0.008799762, 0.056595087, 0.0022273948, 0.055752404}; + + recurrent_to_forget_weights_ = { + -0.057784554, -0.026057621, -0.068447545, -0.022581743, + 0.14811787, 0.10826372, 0.09471067, 0.03987225, + -0.0039523416, 0.00030638507, 0.053185795, 0.10572994, + 0.08414449, -0.022036452, -0.00066928595, -0.09203576, + 0.032950465, -0.10985798, -0.023809856, 0.0021431844, + -0.02196096, -0.00326074, 0.00058621005, -0.074678116, + -0.06193199, 0.055729095, 0.03736828, 0.020123724, + 0.061878487, -0.04729229, 0.034919553, -0.07585433, + -0.04421272, -0.044019096, 0.085488975, 0.04058006, + -0.06890133, -0.030951202, -0.024628663, -0.07672815, + 0.034293607, 0.08556707, -0.05293577, -0.033561368, + -0.04899627, 0.0241671, 0.015736353, -0.095442444, + -0.029564252, 0.016493602, -0.035026584, 0.022337519, + -0.026871363, 0.004780428, 0.0077918363, -0.03601621, + 0.016435321, -0.03263031, -0.09543275, -0.047392778, + 0.013454138, 0.028934088, 0.01685226, -0.086110644, + -0.046250615, -0.01847454, 0.047608484, 0.07339695, + 0.034546845, -0.04881143, 0.009128804, -0.08802852, + 0.03761666, 0.008096139, -0.014454086, 0.014361001, + -0.023502491, -0.0011840804, -0.07607001, 0.001856849, + -0.06509276, -0.006021153, -0.08570962, -0.1451793, + 0.060212336, 0.055259194, 0.06974018, 0.049454916, + -0.027794661, -0.08077226, -0.016179763, 0.1169753, + 0.17213494, -0.0056326236, -0.053934924, -0.0124349, + -0.11520337, 0.05409887, 0.088759385, 0.0019655675, + 0.0042065294, 0.03881498, 0.019844765, 0.041858196, + -0.05695512, 0.047233116, 0.038937137, -0.06542224, + 0.014429736, -0.09719407, 0.13908425, -0.05379757, + 0.012321099, 0.082840554, -0.029899208, 0.044217527, + 0.059855383, 0.07711018, -0.045319796, 0.0948846, + -0.011724666, -0.0033288454, -0.033542685, -0.04764985, + -0.13873616, 0.040668588, 0.034832682, -0.015319203, + -0.018715994, 0.046002675, 0.0599172, -0.043107376, + 0.0294216, -0.002314414, -0.022424703, 0.0030315618, + 0.0014641669, 0.0029166266, -0.11878115, 0.013738511, + 0.12375372, -0.0006038222, 0.029104086, 0.087442465, + 0.052958444, 0.07558703, 0.04817258, 0.044462286, + -0.015213451, -0.08783778, -0.0561384, -0.003008196, + 0.047060397, -0.002058388, 0.03429439, -0.018839769, + 0.024734668, 0.024614193, -0.042046934, 0.09597743, + -0.0043254104, 0.04320769, 0.0064070094, -0.0019131786, + -0.02558259, -0.022822596, -0.023273505, -0.02464396, + -0.10991725, -0.006240552, 0.0074488563, 0.024044557, + 0.04383914, -0.046476185, 0.028658995, 0.060410924, + 0.050786525, 0.009452605, -0.0073054377, -0.024810238, + 0.0052906186, 0.0066939713, -0.0020913032, 0.014515517, + 0.015898481, 0.021362653, -0.030262267, 0.016587038, + -0.011442813, 0.041154444, -0.007631438, -0.03423484, + -0.010977775, 0.036152758, 0.0066366293, 0.11915515, + 0.02318443, -0.041350313, 0.021485701, -0.10906167, + -0.028218046, -0.00954771, 0.020531068, -0.11995105, + -0.03672871, 0.024019798, 
0.014255957, -0.05221243, + -0.00661567, -0.04630967, 0.033188973, 0.10107534, + -0.014027541, 0.030796422, -0.10270911, -0.035999842, + 0.15443139, 0.07684145, 0.036571592, -0.035900835, + -0.0034699554, 0.06209149, 0.015920248, -0.031122351, + -0.03858649, 0.01849943, 0.13872518, 0.01503974, + 0.069941424, -0.06948533, -0.0088794185, 0.061282158, + -0.047401894, 0.03100163, -0.041533746, -0.10430945, + 0.044574402, -0.01425562, -0.024290353, 0.034563623, + 0.05866852, 0.023947537, -0.09445152, 0.035450947, + 0.02247216, -0.0042998926, 0.061146557, -0.10250651, + 0.020881841, -0.06747029, 0.10062043, -0.0023941975, + 0.03532124, -0.016341697, 0.09685456, -0.016764693, + 0.051808182, 0.05875331, -0.04536488, 0.001626336, + -0.028892258, -0.01048663, -0.009793449, -0.017093895, + 0.010987891, 0.02357273, -0.00010856845, 0.0099760275, + -0.001845119, -0.03551521, 0.0018358806, 0.05763657, + -0.01769146, 0.040995963, 0.02235177, -0.060430344, + 0.11475477, -0.023854522, 0.10071741, 0.0686208, + -0.014250481, 0.034261297, 0.047418304, 0.08562733, + -0.030519066, 0.0060542435, 0.014653856, -0.038836084, + 0.04096551, 0.032249358, -0.08355519, -0.026823482, + 0.056386515, -0.010401743, -0.028396193, 0.08507674, + 0.014410365, 0.020995233, 0.17040324, 0.11511526, + 0.02459721, 0.0066619175, 0.025853224, -0.023133837, + -0.081302024, 0.017264642, -0.009585969, 0.09491168, + -0.051313367, 0.054532815, -0.014298593, 0.10657464, + 0.007076659, 0.10964551, 0.0409152, 0.008275321, + -0.07283536, 0.07937492, 0.04192024, -0.1075027}; + + recurrent_to_output_weights_ = { + 0.025825322, -0.05813119, 0.09495884, -0.045984812, + -0.01255415, -0.0026479573, -0.08196161, -0.054914974, + -0.0046604523, -0.029587349, -0.044576716, -0.07480124, + -0.082868785, 0.023254942, 0.027502948, -0.0039728214, + -0.08683098, -0.08116779, -0.014675607, -0.037924774, + -0.023314456, -0.007401714, -0.09255757, 0.029460307, + -0.08829125, -0.005139627, -0.08989442, -0.0555066, + 0.13596267, -0.025062224, -0.048351806, -0.03850004, + 0.07266485, -0.022414139, 0.05940088, 0.075114764, + 0.09597592, -0.010211725, -0.0049794707, -0.011523867, + -0.025980417, 0.072999895, 0.11091378, -0.081685916, + 0.014416728, 0.043229222, 0.034178585, -0.07530371, + 0.035837382, -0.085607, -0.007721233, -0.03287832, + -0.043848954, -0.06404588, -0.06632928, -0.073643476, + 0.008214239, -0.045984086, 0.039764922, 0.03474462, + 0.060612556, -0.080590084, 0.049127717, 0.04151091, + -0.030063879, 0.008801774, -0.023021035, -0.019558564, + 0.05158114, -0.010947698, -0.011825728, 0.0075720972, + 0.0699727, -0.0039981045, 0.069350146, 0.08799282, + 0.016156472, 0.035502106, 0.11695009, 0.006217345, + 0.13392477, -0.037875112, 0.025745004, 0.08940699, + -0.00924166, 0.0046702605, -0.036598757, -0.08811812, + 0.10522024, -0.032441203, 0.008176899, -0.04454919, + 0.07058152, 0.0067963637, 0.039206743, 0.03259838, + 0.03725492, -0.09515802, 0.013326398, -0.052055415, + -0.025676316, 0.03198509, -0.015951829, -0.058556724, + 0.036879618, 0.043357447, 0.028362012, -0.05908629, + 0.0059240665, -0.04995891, -0.019187413, 0.0276265, + -0.01628143, 0.0025863599, 0.08800015, 0.035250366, + -0.022165963, -0.07328642, -0.009415526, -0.07455109, + 0.11690406, 0.0363299, 0.07411125, 0.042103454, + -0.009660886, 0.019076364, 0.018299393, -0.046004917, + 0.08891175, 0.0431396, -0.026327137, -0.051502608, + 0.08979574, -0.051670972, 0.04940282, -0.07491107, + -0.021240504, 0.022596184, -0.034280192, 0.060163025, + -0.058211457, -0.051837247, -0.01349775, -0.04639988, + 
-0.035936575, -0.011681591, 0.064818054, 0.0073146066, + -0.021745546, -0.043124277, -0.06471268, -0.07053354, + -0.029321948, -0.05330136, 0.016933719, -0.053782392, + 0.13747959, -0.1361751, -0.11569455, 0.0033329215, + 0.05693899, -0.053219706, 0.063698, 0.07977434, + -0.07924483, 0.06936997, 0.0034815092, -0.007305279, + -0.037325785, -0.07251102, -0.033633437, -0.08677009, + 0.091591336, -0.14165086, 0.021752775, 0.019683983, + 0.0011612234, -0.058154266, 0.049996935, 0.0288841, + -0.0024567875, -0.14345716, 0.010955264, -0.10234828, + 0.1183656, -0.0010731248, -0.023590032, -0.072285876, + -0.0724771, -0.026382286, -0.0014920527, 0.042667855, + 0.0018776858, 0.02986552, 0.009814309, 0.0733756, + 0.12289186, 0.018043943, -0.0458958, 0.049412545, + 0.033632483, 0.05495232, 0.036686596, -0.013781798, + -0.010036754, 0.02576849, -0.08307328, 0.010112348, + 0.042521734, -0.05869831, -0.071689695, 0.03876447, + -0.13275425, -0.0352966, -0.023077697, 0.10285965, + 0.084736146, 0.15568255, -0.00040734606, 0.027835453, + -0.10292561, -0.032401145, 0.10053256, -0.026142767, + -0.08271222, -0.0030240538, -0.016368777, 0.1070414, + 0.042672627, 0.013456989, -0.0437609, -0.022309763, + 0.11576483, 0.04108048, 0.061026827, -0.0190714, + -0.0869359, 0.037901703, 0.0610107, 0.07202949, + 0.01675338, 0.086139716, -0.08795751, -0.014898893, + -0.023771819, -0.01965048, 0.007955471, -0.043740474, + 0.03346837, -0.10549954, 0.090567775, 0.042013682, + -0.03176985, 0.12569028, -0.02421228, -0.029526481, + 0.023851605, 0.031539805, 0.05292009, -0.02344001, + -0.07811758, -0.08834428, 0.10094801, 0.16594367, + -0.06861939, -0.021256343, -0.041093912, -0.06669611, + 0.035498552, 0.021757556, -0.09302526, -0.015403468, + -0.06614931, -0.051798206, -0.013874718, 0.03630673, + 0.010412845, -0.08077351, 0.046185967, 0.0035662893, + 0.03541868, -0.094149634, -0.034814864, 0.003128424, + -0.020674974, -0.03944324, -0.008110165, -0.11113267, + 0.08484226, 0.043586485, 0.040582247, 0.0968012, + -0.065249965, -0.028036479, 0.0050708856, 0.0017462453, + 0.0326779, 0.041296225, 0.09164146, -0.047743853, + -0.015952192, -0.034451712, 0.084197424, -0.05347844, + -0.11768019, 0.085926116, -0.08251791, -0.045081906, + 0.0948852, 0.068401024, 0.024856757, 0.06978981, + -0.057309967, -0.012775832, -0.0032452994, 0.01977615, + -0.041040014, -0.024264973, 0.063464895, 0.05431621, + }; + + cell_to_input_weights_ = { + 0.040369894, 0.030746894, 0.24704495, 0.018586371, -0.037586458, + -0.15312155, -0.11812848, -0.11465643, 0.20259799, 0.11418174, + -0.10116027, -0.011334949, 0.12411352, -0.076769054, -0.052169047, + 0.21198851, -0.38871562, -0.09061183, -0.09683246, -0.21929175}; + + cell_to_forget_weights_ = { + -0.01998659, -0.15568835, -0.24248174, -0.012770197, 0.041331276, + -0.072311886, -0.052123554, -0.0066330447, -0.043891653, 0.036225766, + -0.047248036, 0.021479502, 0.033189066, 0.11952997, -0.020432774, + 0.64658105, -0.06650122, -0.03467612, 0.095340036, 0.23647355}; + + cell_to_output_weights_ = { + 0.08286371, -0.08261836, -0.51210177, 0.002913762, 0.17764764, + -0.5495371, -0.08460716, -0.24552552, 0.030037103, 0.04123544, + -0.11940523, 0.007358328, 0.1890978, 0.4833202, -0.34441817, + 0.36312827, -0.26375428, 0.1457655, -0.19724406, 0.15548733}; + + projection_weights_ = { + -0.009802181, 0.09401916, 0.0717386, -0.13895074, + 0.09641832, 0.060420845, 0.08539281, 0.054285463, + 0.061395317, 0.034448683, -0.042991187, 0.019801661, + -0.16840284, -0.015726732, -0.23041931, -0.024478018, + -0.10959692, 
-0.013875541, 0.18600968, -0.061274476, + 0.0138165, -0.08160894, -0.07661644, 0.032372914, + 0.16169067, 0.22465782, -0.03993472, -0.004017731, + 0.08633481, -0.28869787, 0.08682067, 0.17240396, + 0.014975425, 0.056431185, 0.031037588, 0.16702051, + 0.0077946745, 0.15140012, 0.29405436, 0.120285, + -0.188994, -0.027265169, 0.043389652, -0.022061434, + 0.014777949, -0.20203483, 0.094781205, 0.19100232, + 0.13987629, -0.036132768, -0.06426278, -0.05108664, + 0.13221376, 0.009441198, -0.16715929, 0.15859416, + -0.040437475, 0.050779544, -0.022187516, 0.012166504, + 0.027685808, -0.07675938, -0.0055694645, -0.09444123, + 0.0046453946, 0.050794356, 0.10770313, -0.20790008, + -0.07149004, -0.11425117, 0.008225835, -0.035802525, + 0.14374903, 0.15262283, 0.048710253, 0.1847461, + -0.007487823, 0.11000021, -0.09542012, 0.22619456, + -0.029149994, 0.08527916, 0.009043713, 0.0042746216, + 0.016261552, 0.022461696, 0.12689082, -0.043589946, + -0.12035478, -0.08361797, -0.050666027, -0.1248618, + -0.1275799, -0.071875185, 0.07377272, 0.09944291, + -0.18897448, -0.1593054, -0.06526116, -0.040107165, + -0.004618631, -0.067624845, -0.007576253, 0.10727444, + 0.041546922, -0.20424393, 0.06907816, 0.050412357, + 0.00724631, 0.039827548, 0.12449835, 0.10747581, + 0.13708383, 0.09134148, -0.12617786, -0.06428341, + 0.09956831, 0.1208086, -0.14676677, -0.0727722, + 0.1126304, 0.010139365, 0.015571211, -0.038128063, + 0.022913318, -0.042050496, 0.16842307, -0.060597885, + 0.10531834, -0.06411776, -0.07451711, -0.03410368, + -0.13393489, 0.06534304, 0.003620307, 0.04490757, + 0.05970546, 0.05197996, 0.02839995, 0.10434969, + -0.013699693, -0.028353551, -0.07260381, 0.047201227, + -0.024575593, -0.036445823, 0.07155557, 0.009672501, + -0.02328883, 0.009533515, -0.03606021, -0.07421458, + -0.028082801, -0.2678904, -0.13221288, 0.18419984, + -0.13012612, -0.014588381, -0.035059117, -0.04824723, + 0.07830115, -0.056184657, 0.03277091, 0.025466874, + 0.14494097, -0.12522776, -0.098633975, -0.10766018, + -0.08317623, 0.08594209, 0.07749552, 0.039474737, + 0.1776665, -0.07409566, -0.0477268, 0.29323658, + 0.10801441, 0.1154011, 0.013952499, 0.10739139, + 0.10708251, -0.051456142, 0.0074137426, -0.10430189, + 0.10034707, 0.045594677, 0.0635285, -0.0715442, + -0.089667566, -0.10811871, 0.00026344223, 0.08298446, + -0.009525053, 0.006585689, -0.24567553, -0.09450807, + 0.09648481, 0.026996298, -0.06419476, -0.04752702, + -0.11063944, -0.23441927, -0.17608605, -0.052156363, + 0.067035615, 0.19271925, -0.0032889997, -0.043264326, + 0.09663576, -0.057112187, -0.10100678, 0.0628376, + 0.04447668, 0.017961001, -0.10094388, -0.10190601, + 0.18335468, 0.10494553, -0.052095775, -0.0026118709, + 0.10539724, -0.04383912, -0.042349473, 0.08438151, + -0.1947263, 0.02251204, 0.11216432, -0.10307853, + 0.17351969, -0.039091777, 0.08066188, -0.00561982, + 0.12633002, 0.11335965, -0.0088127935, -0.019777594, + 0.06864014, -0.059751723, 0.016233567, -0.06894641, + -0.28651384, -0.004228674, 0.019708522, -0.16305895, + -0.07468996, -0.0855457, 0.099339016, -0.07580735, + -0.13775392, 0.08434318, 0.08330512, -0.12131499, + 0.031935584, 0.09180414, -0.08876437, -0.08049874, + 0.008753825, 0.03498998, 0.030215185, 0.03907079, + 0.089751154, 0.029194152, -0.03337423, -0.019092513, + 0.04331237, 0.04299654, -0.036394123, -0.12915532, + 0.09793732, 0.07512415, -0.11319543, -0.032502122, + 0.15661901, 0.07671967, -0.005491124, -0.19379048, + -0.218606, 0.21448623, 0.017840758, 0.1416943, + -0.07051762, 0.19488361, 0.02664691, -0.18104725, + 
-0.09334311, 0.15026465, -0.15493552, -0.057762887, + -0.11604192, -0.262013, -0.01391798, 0.012185008, + 0.11156489, -0.07483202, 0.06693364, -0.26151478, + 0.046425626, 0.036540434, -0.16435726, 0.17338543, + -0.21401681, -0.11385144, -0.08283257, -0.069031075, + 0.030635102, 0.010969227, 0.11109743, 0.010919218, + 0.027526086, 0.13519906, 0.01891392, -0.046839405, + -0.040167913, 0.017953383, -0.09700955, 0.0061885654, + -0.07000971, 0.026893595, -0.038844477, 0.14543656}; + + lstm_input_ = { + {// Batch0: 4 (input_sequence_size) * 5 (n_input) + 0.787926, 0.151646, 0.071352, 0.118426, 0.458058, // step 0 + 0.596268, 0.998386, 0.568695, 0.864524, 0.571277, // step 1 + 0.073204, 0.296072, 0.743333, 0.069199, 0.045348, // step 2 + 0.867394, 0.291279, 0.013714, 0.482521, 0.626339}, // step 3 + + {// Batch1: 4 (input_sequence_size) * 5 (n_input) + 0.295743, 0.544053, 0.690064, 0.858138, 0.497181, // step 0 + 0.642421, 0.524260, 0.134799, 0.003639, 0.162482, // step 1 + 0.640394, 0.930399, 0.050782, 0.432485, 0.988078, // step 2 + 0.082922, 0.563329, 0.865614, 0.333232, 0.259916} // step 3 + }; + + lstm_golden_output_ = { + {// Batch0: 4 (input_sequence_size) * 16 (n_output) + -0.00396806, 0.029352, -0.00279226, 0.0159977, -0.00835576, + -0.0211779, 0.0283512, -0.0114597, 0.00907307, -0.0244004, + -0.0152191, -0.0259063, 0.00914318, 0.00415118, 0.017147, + 0.0134203, -0.0166936, 0.0381209, 0.000889694, 0.0143363, + -0.0328911, -0.0234288, 0.0333051, -0.012229, 0.0110322, + -0.0457725, -0.000832209, -0.0202817, 0.0327257, 0.0121308, + 0.0155969, 0.0312091, -0.0213783, 0.0350169, 0.000324794, + 0.0276012, -0.0263374, -0.0371449, 0.0446149, -0.0205474, + 0.0103729, -0.0576349, -0.0150052, -0.0292043, 0.0376827, + 0.0136115, 0.0243435, 0.0354492, -0.0189322, 0.0464512, + -0.00251373, 0.0225745, -0.0308346, -0.0317124, 0.0460407, + -0.0189395, 0.0149363, -0.0530162, -0.0150767, -0.0340193, + 0.0286833, 0.00824207, 0.0264887, 0.0305169}, + {// Batch1: 4 (input_sequence_size) * 16 (n_output) + -0.013869, 0.0287268, -0.00334693, 0.00733398, -0.0287926, + -0.0186926, 0.0193662, -0.0115437, 0.00422612, -0.0345232, + 0.00223253, -0.00957321, 0.0210624, 0.013331, 0.0150954, + 0.02168, -0.0141913, 0.0322082, 0.00227024, 0.0260507, + -0.0188721, -0.0296489, 0.0399134, -0.0160509, 0.0116039, + -0.0447318, -0.0150515, -0.0277406, 0.0316596, 0.0118233, + 0.0214762, 0.0293641, -0.0204549, 0.0450315, -0.00117378, + 0.0167673, -0.0375007, -0.0238314, 0.038784, -0.0174034, + 0.0131743, -0.0506589, -0.0048447, -0.0240239, 0.0325789, + 0.00790065, 0.0220157, 0.0333314, -0.0264787, 0.0387855, + -0.000764675, 0.0217599, -0.037537, -0.0335206, 0.0431679, + -0.0211424, 0.010203, -0.062785, -0.00832363, -0.025181, + 0.0412031, 0.0118723, 0.0239643, 0.0394009}}; } -} +}; -TEST(LSTMOpTest, BlackBoxTestWithPeepholeWithProjectionNoClipping) { +TEST_F(NoCifgPeepholeProjectionNoClippingLstmTest, LstmBlackBoxTest) { const int n_batch = 2; const int n_input = 5; const int n_cell = 20; @@ -489,588 +1308,90 @@ TEST(LSTMOpTest, BlackBoxTestWithPeepholeWithProjectionNoClipping) { {0}, // projection_bias tensor }); - lstm.SetInputToInputWeights( - {0.021393683, 0.06124551, 0.046905167, -0.014657677, -0.03149463, - 0.09171803, 0.14647801, 0.10797193, -0.0057968358, 0.0019193048, - -0.2726754, 0.10154029, -0.018539885, 0.080349885, -0.10262385, - -0.022599787, -0.09121155, -0.008675967, -0.045206103, -0.0821282, - -0.008045952, 0.015478081, 0.055217247, 0.038719587, 0.044153627, - -0.06453243, 0.05031825, -0.046935108, -0.008164439, 
0.014574226, - -0.1671009, -0.15519552, -0.16819797, -0.13971269, -0.11953059, - 0.25005487, -0.22790983, 0.009855087, -0.028140958, -0.11200698, - 0.11295408, -0.0035217577, 0.054485075, 0.05184695, 0.064711206, - 0.10989193, 0.11674786, 0.03490607, 0.07727357, 0.11390585, - -0.1863375, -0.1034451, -0.13945189, -0.049401227, -0.18767063, - 0.042483903, 0.14233552, 0.13832581, 0.18350165, 0.14545603, - -0.028545704, 0.024939531, 0.050929718, 0.0076203286, -0.0029723682, - -0.042484224, -0.11827596, -0.09171104, -0.10808628, -0.16327988, - -0.2273378, -0.0993647, -0.017155107, 0.0023917493, 0.049272764, - 0.0038534778, 0.054764505, 0.089753784, 0.06947234, 0.08014476, - -0.04544234, -0.0497073, -0.07135631, -0.048929106, -0.004042012, - -0.009284026, 0.018042054, 0.0036860977, -0.07427302, -0.11434604, - -0.018995456, 0.031487543, 0.012834908, 0.019977754, 0.044256654, - -0.39292613, -0.18519334, -0.11651281, -0.06809892, 0.011373677}); - - lstm.SetInputToForgetWeights( - {-0.0018401089, -0.004852237, 0.03698424, 0.014181704, 0.028273236, - -0.016726194, -0.05249759, -0.10204261, 0.00861066, -0.040979505, - -0.009899187, 0.01923892, -0.028177269, -0.08535103, -0.14585495, - 0.10662567, -0.01909731, -0.017883534, -0.0047269356, -0.045103323, - 0.0030784295, 0.076784775, 0.07463696, 0.094531395, 0.0814421, - -0.12257899, -0.033945758, -0.031303465, 0.045630626, 0.06843887, - -0.13492945, -0.012480007, -0.0811829, -0.07224499, -0.09628791, - 0.045100946, 0.0012300825, 0.013964662, 0.099372394, 0.02543059, - 0.06958324, 0.034257296, 0.0482646, 0.06267997, 0.052625068, - 0.12784666, 0.07077897, 0.025725935, 0.04165009, 0.07241905, - 0.018668644, -0.037377294, -0.06277783, -0.08833636, -0.040120605, - -0.011405586, -0.007808335, -0.010301386, -0.005102167, 0.027717464, - 0.05483423, 0.11449111, 0.11289652, 0.10939839, 0.13396506, - -0.08402166, -0.01901462, -0.044678304, -0.07720565, 0.014350063, - -0.11757958, -0.0652038, -0.08185733, -0.076754324, -0.092614375, - 0.10405491, 0.052960336, 0.035755895, 0.035839386, -0.012540553, - 0.036881298, 0.02913376, 0.03420159, 0.05448447, -0.054523353, - 0.02582715, 0.02327355, -0.011857179, -0.0011980024, -0.034641717, - -0.026125094, -0.17582615, -0.15923657, -0.27486774, -0.0006143371, - 0.0001771948, -8.470171e-05, 0.02651807, 0.045790765, 0.06956496}); - - lstm.SetInputToCellWeights( - {-0.04580283, -0.09549462, -0.032418985, -0.06454633, - -0.043528453, 0.043018587, -0.049152344, -0.12418144, - -0.078985475, -0.07596889, 0.019484362, -0.11434962, - -0.0074034138, -0.06314844, -0.092981495, 0.0062155537, - -0.025034338, -0.0028890965, 0.048929527, 0.06235075, - 0.10665918, -0.032036792, -0.08505916, -0.10843358, - -0.13002433, -0.036816437, -0.02130134, -0.016518239, - 0.0047691227, -0.0025825808, 0.066017866, 0.029991534, - -0.10652836, -0.1037554, -0.13056071, -0.03266643, - -0.033702414, -0.006473424, -0.04611692, 0.014419339, - -0.025174323, 0.0396852, 0.081777506, 0.06157468, - 0.10210095, -0.009658194, 0.046511717, 0.03603906, - 0.0069369148, 0.015960095, -0.06507666, 0.09551598, - 0.053568836, 0.06408714, 0.12835667, -0.008714329, - -0.20211966, -0.12093674, 0.029450472, 0.2849013, - -0.029227901, 0.1164364, -0.08560263, 0.09941786, - -0.036999565, -0.028842626, -0.0033637602, -0.017012902, - -0.09720865, -0.11193351, -0.029155117, -0.017936034, - -0.009768936, -0.04223324, -0.036159635, 0.06505112, - -0.021742892, -0.023377212, -0.07221364, -0.06430552, - 0.05453865, 0.091149814, 0.06387331, 0.007518393, - 0.055960953, 0.069779344, 
0.046411168, 0.10509911, - 0.07463894, 0.0075130584, 0.012850982, 0.04555431, - 0.056955688, 0.06555285, 0.050801456, -0.009862683, - 0.00826772, -0.026555609, -0.0073611983, -0.0014897042}); - - lstm.SetInputToOutputWeights( - {-0.0998932, -0.07201956, -0.052803773, -0.15629593, -0.15001918, - -0.07650751, 0.02359855, -0.075155355, -0.08037709, -0.15093534, - 0.029517552, -0.04751393, 0.010350531, -0.02664851, -0.016839722, - -0.023121163, 0.0077019283, 0.012851257, -0.05040649, -0.0129761, - -0.021737747, -0.038305793, -0.06870586, -0.01481247, -0.001285394, - 0.10124236, 0.083122835, 0.053313006, -0.062235646, -0.075637154, - -0.027833903, 0.029774971, 0.1130802, 0.09218906, 0.09506135, - -0.086665764, -0.037162706, -0.038880914, -0.035832845, -0.014481564, - -0.09825003, -0.12048569, -0.097665586, -0.05287633, -0.0964047, - -0.11366429, 0.035777505, 0.13568819, 0.052451383, 0.050649304, - 0.05798951, -0.021852335, -0.099848844, 0.014740475, -0.078897946, - 0.04974699, 0.014160473, 0.06973932, 0.04964942, 0.033364646, - 0.08190124, 0.025535367, 0.050893165, 0.048514254, 0.06945813, - -0.078907564, -0.06707616, -0.11844508, -0.09986688, -0.07509403, - 0.06263226, 0.14925587, 0.20188436, 0.12098451, 0.14639415, - 0.0015017595, -0.014267382, -0.03417257, 0.012711468, 0.0028300495, - -0.024758482, -0.05098548, -0.0821182, 0.014225672, 0.021544158, - 0.08949725, 0.07505268, -0.0020780868, 0.04908258, 0.06476295, - -0.022907063, 0.027562456, 0.040185735, 0.019567577, -0.015598739, - -0.049097303, -0.017121866, -0.083368234, -0.02332002, -0.0840956}); - - lstm.SetInputGateBias( - {0.02234832, 0.14757581, 0.18176508, 0.10380666, 0.053110216, - -0.06928846, -0.13942584, -0.11816189, 0.19483899, 0.03652339, - -0.10250295, 0.036714908, -0.18426876, 0.036065217, 0.21810818, - 0.02383196, -0.043370757, 0.08690144, -0.04444982, 0.00030581196}); - - lstm.SetForgetGateBias({0.035185695, -0.042891346, -0.03032477, 0.23027696, - 0.11098921, 0.15378423, 0.09263801, 0.09790885, - 0.09508917, 0.061199076, 0.07665568, -0.015443159, - -0.03499149, 0.046190713, 0.08895977, 0.10899629, - 0.40694186, 0.06030037, 0.012413437, -0.06108739}); - - lstm.SetCellBias({-0.024379363, 0.0055531194, 0.23377132, 0.033463873, - -0.1483596, -0.10639995, -0.091433935, 0.058573797, - -0.06809782, -0.07889636, -0.043246906, -0.09829136, - -0.4279842, 0.034901652, 0.18797937, 0.0075234566, - 0.016178843, 0.1749513, 0.13975595, 0.92058027}); - - lstm.SetOutputGateBias( - {0.046159424, -0.0012809046, 0.03563469, 0.12648113, 0.027195795, - 0.35373217, -0.018957434, 0.008907322, -0.0762701, 0.12018895, - 0.04216877, 0.0022856654, 0.040952638, 0.3147856, 0.08225149, - -0.057416286, -0.14995944, -0.008040261, 0.13208859, 0.029760877}); - - lstm.SetRecurrentToInputWeights( - {-0.001374326, -0.078856036, 0.10672688, 0.029162422, - -0.11585556, 0.02557986, -0.13446963, -0.035785314, - -0.01244275, 0.025961924, -0.02337298, -0.044228926, - -0.055839065, -0.046598054, -0.010546039, -0.06900766, - 0.027239809, 0.022582639, -0.013296484, -0.05459212, - 0.08981, -0.045407712, 0.08682226, -0.06867011, - -0.14390695, -0.02916037, 0.000996957, 0.091420636, - 0.14283475, -0.07390571, -0.06402044, 0.062524505, - -0.093129106, 0.04860203, -0.08364217, -0.08119002, - 0.009352075, 0.22920375, 0.0016303885, 0.11583097, - -0.13732095, 0.012405723, -0.07551853, 0.06343048, - 0.12162708, -0.031923793, -0.014335606, 0.01790974, - -0.10650317, -0.0724401, 0.08554849, -0.05727212, - 0.06556731, -0.042729504, -0.043227166, 0.011683251, - -0.013082158, 
-0.029302018, -0.010899579, -0.062036745, - -0.022509435, -0.00964907, -0.01567329, 0.04260106, - -0.07787477, -0.11576462, 0.017356863, 0.048673786, - -0.017577527, -0.05527947, -0.082487635, -0.040137455, - -0.10820036, -0.04666372, 0.022746278, -0.07851417, - 0.01068115, 0.032956902, 0.022433773, 0.0026891115, - 0.08944216, -0.0685835, 0.010513544, 0.07228705, - 0.02032331, -0.059686817, -0.0005566496, -0.086984694, - 0.040414046, -0.1380399, 0.094208956, -0.05722982, - 0.012092817, -0.04989123, -0.086576, -0.003399834, - -0.04696032, -0.045747425, 0.10091314, 0.048676282, - -0.029037097, 0.031399418, -0.0040285117, 0.047237843, - 0.09504992, 0.041799378, -0.049185462, -0.031518843, - -0.10516937, 0.026374253, 0.10058866, -0.0033195973, - -0.041975245, 0.0073591834, 0.0033782164, -0.004325073, - -0.10167381, 0.042500053, -0.01447153, 0.06464186, - -0.017142897, 0.03312627, 0.009205989, 0.024138335, - -0.011337001, 0.035530265, -0.010912711, 0.0706555, - -0.005894094, 0.051841937, -0.1401738, -0.02351249, - 0.0365468, 0.07590991, 0.08838724, 0.021681072, - -0.10086113, 0.019608743, -0.06195883, 0.077335775, - 0.023646897, -0.095322326, 0.02233014, 0.09756986, - -0.048691444, -0.009579111, 0.07595467, 0.11480546, - -0.09801813, 0.019894179, 0.08502348, 0.004032281, - 0.037211012, 0.068537936, -0.048005626, -0.091520436, - -0.028379958, -0.01556313, 0.06554592, -0.045599163, - -0.01672207, -0.020169014, -0.011877351, -0.20212261, - 0.010889619, 0.0047078193, 0.038385306, 0.08540671, - -0.017140968, -0.0035865551, 0.016678626, 0.005633034, - 0.015963363, 0.00871737, 0.060130805, 0.028611384, - 0.10109069, -0.015060172, -0.07894427, 0.06401885, - 0.011584063, -0.024466386, 0.0047652307, -0.09041358, - 0.030737216, -0.0046374933, 0.14215417, -0.11823516, - 0.019899689, 0.006106124, -0.027092824, 0.0786356, - 0.05052217, -0.058925, -0.011402121, -0.024987547, - -0.0013661642, -0.06832946, -0.015667673, -0.1083353, - -0.00096863037, -0.06988685, -0.053350925, -0.027275559, - -0.033664223, -0.07978348, -0.025200296, -0.017207067, - -0.058403496, -0.055697463, 0.005798788, 0.12965427, - -0.062582195, 0.0013350133, -0.10482091, 0.0379771, - 0.072521195, -0.0029455067, -0.13797039, -0.03628521, - 0.013806405, -0.017858358, -0.01008298, -0.07700066, - -0.017081132, 0.019358726, 0.0027079724, 0.004635139, - 0.062634714, -0.02338735, -0.039547626, -0.02050681, - 0.03385117, -0.083611414, 0.002862572, -0.09421313, - 0.058618143, -0.08598433, 0.00972939, 0.023867095, - -0.053934585, -0.023203006, 0.07452513, -0.048767887, - -0.07314807, -0.056307215, -0.10433547, -0.06440842, - 0.04328182, 0.04389765, -0.020006588, -0.09076438, - -0.11652589, -0.021705797, 0.03345259, -0.010329105, - -0.025767034, 0.013057034, -0.07316461, -0.10145612, - 0.06358255, 0.18531723, 0.07759293, 0.12006465, - 0.1305557, 0.058638252, -0.03393652, 0.09622831, - -0.16253184, -2.4580743e-06, 0.079869635, -0.070196845, - -0.005644518, 0.06857898, -0.12598175, -0.035084512, - 0.03156317, -0.12794146, -0.031963028, 0.04692781, - 0.030070418, 0.0071660685, -0.095516115, -0.004643372, - 0.040170413, -0.062104587, -0.0037324072, 0.0554317, - 0.08184801, -0.019164372, 0.06791302, 0.034257166, - -0.10307039, 0.021943003, 0.046745934, 0.0790918, - -0.0265588, -0.007824208, 0.042546265, -0.00977924, - -0.0002440307, -0.017384544, -0.017990116, 0.12252321, - -0.014512694, -0.08251313, 0.08861942, 0.13589665, - 0.026351685, 0.012641483, 0.07466548, 0.044301085, - -0.045414884, -0.051112458, 0.03444247, -0.08502782, - -0.04106223, 
-0.028126027, 0.028473156, 0.10467447}); - - lstm.SetRecurrentToForgetWeights( - {-0.057784554, -0.026057621, -0.068447545, -0.022581743, - 0.14811787, 0.10826372, 0.09471067, 0.03987225, - -0.0039523416, 0.00030638507, 0.053185795, 0.10572994, - 0.08414449, -0.022036452, -0.00066928595, -0.09203576, - 0.032950465, -0.10985798, -0.023809856, 0.0021431844, - -0.02196096, -0.00326074, 0.00058621005, -0.074678116, - -0.06193199, 0.055729095, 0.03736828, 0.020123724, - 0.061878487, -0.04729229, 0.034919553, -0.07585433, - -0.04421272, -0.044019096, 0.085488975, 0.04058006, - -0.06890133, -0.030951202, -0.024628663, -0.07672815, - 0.034293607, 0.08556707, -0.05293577, -0.033561368, - -0.04899627, 0.0241671, 0.015736353, -0.095442444, - -0.029564252, 0.016493602, -0.035026584, 0.022337519, - -0.026871363, 0.004780428, 0.0077918363, -0.03601621, - 0.016435321, -0.03263031, -0.09543275, -0.047392778, - 0.013454138, 0.028934088, 0.01685226, -0.086110644, - -0.046250615, -0.01847454, 0.047608484, 0.07339695, - 0.034546845, -0.04881143, 0.009128804, -0.08802852, - 0.03761666, 0.008096139, -0.014454086, 0.014361001, - -0.023502491, -0.0011840804, -0.07607001, 0.001856849, - -0.06509276, -0.006021153, -0.08570962, -0.1451793, - 0.060212336, 0.055259194, 0.06974018, 0.049454916, - -0.027794661, -0.08077226, -0.016179763, 0.1169753, - 0.17213494, -0.0056326236, -0.053934924, -0.0124349, - -0.11520337, 0.05409887, 0.088759385, 0.0019655675, - 0.0042065294, 0.03881498, 0.019844765, 0.041858196, - -0.05695512, 0.047233116, 0.038937137, -0.06542224, - 0.014429736, -0.09719407, 0.13908425, -0.05379757, - 0.012321099, 0.082840554, -0.029899208, 0.044217527, - 0.059855383, 0.07711018, -0.045319796, 0.0948846, - -0.011724666, -0.0033288454, -0.033542685, -0.04764985, - -0.13873616, 0.040668588, 0.034832682, -0.015319203, - -0.018715994, 0.046002675, 0.0599172, -0.043107376, - 0.0294216, -0.002314414, -0.022424703, 0.0030315618, - 0.0014641669, 0.0029166266, -0.11878115, 0.013738511, - 0.12375372, -0.0006038222, 0.029104086, 0.087442465, - 0.052958444, 0.07558703, 0.04817258, 0.044462286, - -0.015213451, -0.08783778, -0.0561384, -0.003008196, - 0.047060397, -0.002058388, 0.03429439, -0.018839769, - 0.024734668, 0.024614193, -0.042046934, 0.09597743, - -0.0043254104, 0.04320769, 0.0064070094, -0.0019131786, - -0.02558259, -0.022822596, -0.023273505, -0.02464396, - -0.10991725, -0.006240552, 0.0074488563, 0.024044557, - 0.04383914, -0.046476185, 0.028658995, 0.060410924, - 0.050786525, 0.009452605, -0.0073054377, -0.024810238, - 0.0052906186, 0.0066939713, -0.0020913032, 0.014515517, - 0.015898481, 0.021362653, -0.030262267, 0.016587038, - -0.011442813, 0.041154444, -0.007631438, -0.03423484, - -0.010977775, 0.036152758, 0.0066366293, 0.11915515, - 0.02318443, -0.041350313, 0.021485701, -0.10906167, - -0.028218046, -0.00954771, 0.020531068, -0.11995105, - -0.03672871, 0.024019798, 0.014255957, -0.05221243, - -0.00661567, -0.04630967, 0.033188973, 0.10107534, - -0.014027541, 0.030796422, -0.10270911, -0.035999842, - 0.15443139, 0.07684145, 0.036571592, -0.035900835, - -0.0034699554, 0.06209149, 0.015920248, -0.031122351, - -0.03858649, 0.01849943, 0.13872518, 0.01503974, - 0.069941424, -0.06948533, -0.0088794185, 0.061282158, - -0.047401894, 0.03100163, -0.041533746, -0.10430945, - 0.044574402, -0.01425562, -0.024290353, 0.034563623, - 0.05866852, 0.023947537, -0.09445152, 0.035450947, - 0.02247216, -0.0042998926, 0.061146557, -0.10250651, - 0.020881841, -0.06747029, 0.10062043, -0.0023941975, - 0.03532124, 
-0.016341697, 0.09685456, -0.016764693, - 0.051808182, 0.05875331, -0.04536488, 0.001626336, - -0.028892258, -0.01048663, -0.009793449, -0.017093895, - 0.010987891, 0.02357273, -0.00010856845, 0.0099760275, - -0.001845119, -0.03551521, 0.0018358806, 0.05763657, - -0.01769146, 0.040995963, 0.02235177, -0.060430344, - 0.11475477, -0.023854522, 0.10071741, 0.0686208, - -0.014250481, 0.034261297, 0.047418304, 0.08562733, - -0.030519066, 0.0060542435, 0.014653856, -0.038836084, - 0.04096551, 0.032249358, -0.08355519, -0.026823482, - 0.056386515, -0.010401743, -0.028396193, 0.08507674, - 0.014410365, 0.020995233, 0.17040324, 0.11511526, - 0.02459721, 0.0066619175, 0.025853224, -0.023133837, - -0.081302024, 0.017264642, -0.009585969, 0.09491168, - -0.051313367, 0.054532815, -0.014298593, 0.10657464, - 0.007076659, 0.10964551, 0.0409152, 0.008275321, - -0.07283536, 0.07937492, 0.04192024, -0.1075027}); - - lstm.SetRecurrentToCellWeights( - {-0.037322544, 0.018592842, 0.0056175636, -0.06253426, - 0.055647098, -0.05713207, -0.05626563, 0.005559383, - 0.03375411, -0.025757805, -0.088049285, 0.06017052, - -0.06570978, 0.007384076, 0.035123326, -0.07920549, - 0.053676967, 0.044480428, -0.07663568, 0.0071805613, - 0.08089997, 0.05143358, 0.038261272, 0.03339287, - -0.027673481, 0.044746667, 0.028349208, 0.020090483, - -0.019443132, -0.030755889, -0.0040000007, 0.04465846, - -0.021585021, 0.0031670958, 0.0053199246, -0.056117613, - -0.10893326, 0.076739706, -0.08509834, -0.027997585, - 0.037871376, 0.01449768, -0.09002357, -0.06111149, - -0.046195522, 0.0422062, -0.005683705, -0.1253618, - -0.012925729, -0.04890792, 0.06985068, 0.037654128, - 0.03398274, -0.004781977, 0.007032333, -0.031787455, - 0.010868644, -0.031489216, 0.09525667, 0.013939797, - 0.0058680447, 0.0167067, 0.02668468, -0.04797466, - -0.048885044, -0.12722108, 0.035304096, 0.06554885, - 0.00972396, -0.039238118, -0.05159735, -0.11329045, - 0.1613692, -0.03750952, 0.06529313, -0.071974665, - -0.11769596, 0.015524369, -0.0013754242, -0.12446318, - 0.02786344, -0.014179351, 0.005264273, 0.14376344, - 0.015983658, 0.03406988, -0.06939408, 0.040699873, - 0.02111075, 0.09669095, 0.041345075, -0.08316494, - -0.07684199, -0.045768797, 0.032298047, -0.041805092, - 0.0119405, 0.0061010392, 0.12652606, 0.0064572375, - -0.024950314, 0.11574242, 0.04508852, -0.04335324, - 0.06760663, -0.027437469, 0.07216407, 0.06977076, - -0.05438599, 0.034033038, -0.028602652, 0.05346137, - 0.043184172, -0.037189785, 0.10420091, 0.00882477, - -0.054019816, -0.074273005, -0.030617684, -0.0028467078, - 0.024302477, -0.0038869337, 0.005332455, 0.0013399826, - 0.04361412, -0.007001822, 0.09631092, -0.06702025, - -0.042049985, -0.035070654, -0.04103342, -0.10273396, - 0.0544271, 0.037184782, -0.13150354, -0.0058036847, - -0.008264958, 0.042035464, 0.05891794, 0.029673764, - 0.0063542654, 0.044788733, 0.054816857, 0.062257513, - -0.00093483756, 0.048938446, -0.004952862, -0.007730018, - -0.04043371, -0.017094059, 0.07229206, -0.023670016, - -0.052195564, -0.025616996, -0.01520939, 0.045104615, - -0.007376126, 0.003533447, 0.006570588, 0.056037236, - 0.12436656, 0.051817212, 0.028532185, -0.08686856, - 0.11868599, 0.07663395, -0.07323171, 0.03463402, - -0.050708205, -0.04458982, -0.11590894, 0.021273347, - 0.1251325, -0.15313013, -0.12224372, 0.17228661, - 0.023029093, 0.086124025, 0.006445803, -0.03496501, - 0.028332196, 0.04449512, -0.042436164, -0.026587414, - -0.006041347, -0.09292539, -0.05678812, 0.03897832, - 0.09465633, 0.008115513, -0.02171956, 0.08304309, - 
0.071401566, 0.019622514, 0.032163795, -0.004167056, - 0.02295182, 0.030739572, 0.056506045, 0.004612461, - 0.06524936, 0.059999723, 0.046395954, -0.0045512207, - -0.1335546, -0.030136576, 0.11584653, -0.014678886, - 0.0020118146, -0.09688814, -0.0790206, 0.039770417, - -0.0329582, 0.07922767, 0.029322514, 0.026405897, - 0.04207835, -0.07073373, 0.063781224, 0.0859677, - -0.10925287, -0.07011058, 0.048005477, 0.03438226, - -0.09606514, -0.006669445, -0.043381985, 0.04240257, - -0.06955775, -0.06769346, 0.043903265, -0.026784198, - -0.017840602, 0.024307009, -0.040079936, -0.019946516, - 0.045318738, -0.12233574, 0.026170589, 0.0074471775, - 0.15978073, 0.10185836, 0.10298046, -0.015476589, - -0.039390966, -0.072174534, 0.0739445, -0.1211869, - -0.0347889, -0.07943156, 0.014809798, -0.12412325, - -0.0030663363, 0.039695457, 0.0647603, -0.08291318, - -0.018529687, -0.004423833, 0.0037507233, 0.084633216, - -0.01514876, -0.056505352, -0.012800942, -0.06994386, - 0.012962922, -0.031234352, 0.07029052, 0.016418684, - 0.03618972, 0.055686004, -0.08663945, -0.017404709, - -0.054761406, 0.029065743, 0.052404847, 0.020238016, - 0.0048197987, -0.0214882, 0.07078733, 0.013016777, - 0.06262858, 0.009184685, 0.020785125, -0.043904778, - -0.0270329, -0.03299152, -0.060088247, -0.015162964, - -0.001828936, 0.12642565, -0.056757294, 0.013586685, - 0.09232601, -0.035886683, 0.06000002, 0.05229691, - -0.052580316, -0.082029596, -0.010794592, 0.012947712, - -0.036429964, -0.085508935, -0.13127148, -0.017744139, - 0.031502828, 0.036232427, -0.031581745, 0.023051167, - -0.05325106, -0.03421577, 0.028793324, -0.034633752, - -0.009881397, -0.043551125, -0.018609839, 0.0019097115, - -0.008799762, 0.056595087, 0.0022273948, 0.055752404}); - - lstm.SetRecurrentToOutputWeights({ - 0.025825322, -0.05813119, 0.09495884, -0.045984812, -0.01255415, - -0.0026479573, -0.08196161, -0.054914974, -0.0046604523, -0.029587349, - -0.044576716, -0.07480124, -0.082868785, 0.023254942, 0.027502948, - -0.0039728214, -0.08683098, -0.08116779, -0.014675607, -0.037924774, - -0.023314456, -0.007401714, -0.09255757, 0.029460307, -0.08829125, - -0.005139627, -0.08989442, -0.0555066, 0.13596267, -0.025062224, - -0.048351806, -0.03850004, 0.07266485, -0.022414139, 0.05940088, - 0.075114764, 0.09597592, -0.010211725, -0.0049794707, -0.011523867, - -0.025980417, 0.072999895, 0.11091378, -0.081685916, 0.014416728, - 0.043229222, 0.034178585, -0.07530371, 0.035837382, -0.085607, - -0.007721233, -0.03287832, -0.043848954, -0.06404588, -0.06632928, - -0.073643476, 0.008214239, -0.045984086, 0.039764922, 0.03474462, - 0.060612556, -0.080590084, 0.049127717, 0.04151091, -0.030063879, - 0.008801774, -0.023021035, -0.019558564, 0.05158114, -0.010947698, - -0.011825728, 0.0075720972, 0.0699727, -0.0039981045, 0.069350146, - 0.08799282, 0.016156472, 0.035502106, 0.11695009, 0.006217345, - 0.13392477, -0.037875112, 0.025745004, 0.08940699, -0.00924166, - 0.0046702605, -0.036598757, -0.08811812, 0.10522024, -0.032441203, - 0.008176899, -0.04454919, 0.07058152, 0.0067963637, 0.039206743, - 0.03259838, 0.03725492, -0.09515802, 0.013326398, -0.052055415, - -0.025676316, 0.03198509, -0.015951829, -0.058556724, 0.036879618, - 0.043357447, 0.028362012, -0.05908629, 0.0059240665, -0.04995891, - -0.019187413, 0.0276265, -0.01628143, 0.0025863599, 0.08800015, - 0.035250366, -0.022165963, -0.07328642, -0.009415526, -0.07455109, - 0.11690406, 0.0363299, 0.07411125, 0.042103454, -0.009660886, - 0.019076364, 0.018299393, -0.046004917, 0.08891175, 0.0431396, - 
-0.026327137, -0.051502608, 0.08979574, -0.051670972, 0.04940282, - -0.07491107, -0.021240504, 0.022596184, -0.034280192, 0.060163025, - -0.058211457, -0.051837247, -0.01349775, -0.04639988, -0.035936575, - -0.011681591, 0.064818054, 0.0073146066, -0.021745546, -0.043124277, - -0.06471268, -0.07053354, -0.029321948, -0.05330136, 0.016933719, - -0.053782392, 0.13747959, -0.1361751, -0.11569455, 0.0033329215, - 0.05693899, -0.053219706, 0.063698, 0.07977434, -0.07924483, - 0.06936997, 0.0034815092, -0.007305279, -0.037325785, -0.07251102, - -0.033633437, -0.08677009, 0.091591336, -0.14165086, 0.021752775, - 0.019683983, 0.0011612234, -0.058154266, 0.049996935, 0.0288841, - -0.0024567875, -0.14345716, 0.010955264, -0.10234828, 0.1183656, - -0.0010731248, -0.023590032, -0.072285876, -0.0724771, -0.026382286, - -0.0014920527, 0.042667855, 0.0018776858, 0.02986552, 0.009814309, - 0.0733756, 0.12289186, 0.018043943, -0.0458958, 0.049412545, - 0.033632483, 0.05495232, 0.036686596, -0.013781798, -0.010036754, - 0.02576849, -0.08307328, 0.010112348, 0.042521734, -0.05869831, - -0.071689695, 0.03876447, -0.13275425, -0.0352966, -0.023077697, - 0.10285965, 0.084736146, 0.15568255, -0.00040734606, 0.027835453, - -0.10292561, -0.032401145, 0.10053256, -0.026142767, -0.08271222, - -0.0030240538, -0.016368777, 0.1070414, 0.042672627, 0.013456989, - -0.0437609, -0.022309763, 0.11576483, 0.04108048, 0.061026827, - -0.0190714, -0.0869359, 0.037901703, 0.0610107, 0.07202949, - 0.01675338, 0.086139716, -0.08795751, -0.014898893, -0.023771819, - -0.01965048, 0.007955471, -0.043740474, 0.03346837, -0.10549954, - 0.090567775, 0.042013682, -0.03176985, 0.12569028, -0.02421228, - -0.029526481, 0.023851605, 0.031539805, 0.05292009, -0.02344001, - -0.07811758, -0.08834428, 0.10094801, 0.16594367, -0.06861939, - -0.021256343, -0.041093912, -0.06669611, 0.035498552, 0.021757556, - -0.09302526, -0.015403468, -0.06614931, -0.051798206, -0.013874718, - 0.03630673, 0.010412845, -0.08077351, 0.046185967, 0.0035662893, - 0.03541868, -0.094149634, -0.034814864, 0.003128424, -0.020674974, - -0.03944324, -0.008110165, -0.11113267, 0.08484226, 0.043586485, - 0.040582247, 0.0968012, -0.065249965, -0.028036479, 0.0050708856, - 0.0017462453, 0.0326779, 0.041296225, 0.09164146, -0.047743853, - -0.015952192, -0.034451712, 0.084197424, -0.05347844, -0.11768019, - 0.085926116, -0.08251791, -0.045081906, 0.0948852, 0.068401024, - 0.024856757, 0.06978981, -0.057309967, -0.012775832, -0.0032452994, - 0.01977615, -0.041040014, -0.024264973, 0.063464895, 0.05431621, - }); - - lstm.SetCellToInputWeights( - {0.040369894, 0.030746894, 0.24704495, 0.018586371, -0.037586458, - -0.15312155, -0.11812848, -0.11465643, 0.20259799, 0.11418174, - -0.10116027, -0.011334949, 0.12411352, -0.076769054, -0.052169047, - 0.21198851, -0.38871562, -0.09061183, -0.09683246, -0.21929175}); - - lstm.SetCellToForgetWeights( - {-0.01998659, -0.15568835, -0.24248174, -0.012770197, 0.041331276, - -0.072311886, -0.052123554, -0.0066330447, -0.043891653, 0.036225766, - -0.047248036, 0.021479502, 0.033189066, 0.11952997, -0.020432774, - 0.64658105, -0.06650122, -0.03467612, 0.095340036, 0.23647355}); - - lstm.SetCellToOutputWeights( - {0.08286371, -0.08261836, -0.51210177, 0.002913762, 0.17764764, - -0.5495371, -0.08460716, -0.24552552, 0.030037103, 0.04123544, - -0.11940523, 0.007358328, 0.1890978, 0.4833202, -0.34441817, - 0.36312827, -0.26375428, 0.1457655, -0.19724406, 0.15548733}); - - lstm.SetProjectionWeights( - {-0.009802181, 0.09401916, 0.0717386, -0.13895074, 
0.09641832, - 0.060420845, 0.08539281, 0.054285463, 0.061395317, 0.034448683, - -0.042991187, 0.019801661, -0.16840284, -0.015726732, -0.23041931, - -0.024478018, -0.10959692, -0.013875541, 0.18600968, -0.061274476, - 0.0138165, -0.08160894, -0.07661644, 0.032372914, 0.16169067, - 0.22465782, -0.03993472, -0.004017731, 0.08633481, -0.28869787, - 0.08682067, 0.17240396, 0.014975425, 0.056431185, 0.031037588, - 0.16702051, 0.0077946745, 0.15140012, 0.29405436, 0.120285, - -0.188994, -0.027265169, 0.043389652, -0.022061434, 0.014777949, - -0.20203483, 0.094781205, 0.19100232, 0.13987629, -0.036132768, - -0.06426278, -0.05108664, 0.13221376, 0.009441198, -0.16715929, - 0.15859416, -0.040437475, 0.050779544, -0.022187516, 0.012166504, - 0.027685808, -0.07675938, -0.0055694645, -0.09444123, 0.0046453946, - 0.050794356, 0.10770313, -0.20790008, -0.07149004, -0.11425117, - 0.008225835, -0.035802525, 0.14374903, 0.15262283, 0.048710253, - 0.1847461, -0.007487823, 0.11000021, -0.09542012, 0.22619456, - -0.029149994, 0.08527916, 0.009043713, 0.0042746216, 0.016261552, - 0.022461696, 0.12689082, -0.043589946, -0.12035478, -0.08361797, - -0.050666027, -0.1248618, -0.1275799, -0.071875185, 0.07377272, - 0.09944291, -0.18897448, -0.1593054, -0.06526116, -0.040107165, - -0.004618631, -0.067624845, -0.007576253, 0.10727444, 0.041546922, - -0.20424393, 0.06907816, 0.050412357, 0.00724631, 0.039827548, - 0.12449835, 0.10747581, 0.13708383, 0.09134148, -0.12617786, - -0.06428341, 0.09956831, 0.1208086, -0.14676677, -0.0727722, - 0.1126304, 0.010139365, 0.015571211, -0.038128063, 0.022913318, - -0.042050496, 0.16842307, -0.060597885, 0.10531834, -0.06411776, - -0.07451711, -0.03410368, -0.13393489, 0.06534304, 0.003620307, - 0.04490757, 0.05970546, 0.05197996, 0.02839995, 0.10434969, - -0.013699693, -0.028353551, -0.07260381, 0.047201227, -0.024575593, - -0.036445823, 0.07155557, 0.009672501, -0.02328883, 0.009533515, - -0.03606021, -0.07421458, -0.028082801, -0.2678904, -0.13221288, - 0.18419984, -0.13012612, -0.014588381, -0.035059117, -0.04824723, - 0.07830115, -0.056184657, 0.03277091, 0.025466874, 0.14494097, - -0.12522776, -0.098633975, -0.10766018, -0.08317623, 0.08594209, - 0.07749552, 0.039474737, 0.1776665, -0.07409566, -0.0477268, - 0.29323658, 0.10801441, 0.1154011, 0.013952499, 0.10739139, - 0.10708251, -0.051456142, 0.0074137426, -0.10430189, 0.10034707, - 0.045594677, 0.0635285, -0.0715442, -0.089667566, -0.10811871, - 0.00026344223, 0.08298446, -0.009525053, 0.006585689, -0.24567553, - -0.09450807, 0.09648481, 0.026996298, -0.06419476, -0.04752702, - -0.11063944, -0.23441927, -0.17608605, -0.052156363, 0.067035615, - 0.19271925, -0.0032889997, -0.043264326, 0.09663576, -0.057112187, - -0.10100678, 0.0628376, 0.04447668, 0.017961001, -0.10094388, - -0.10190601, 0.18335468, 0.10494553, -0.052095775, -0.0026118709, - 0.10539724, -0.04383912, -0.042349473, 0.08438151, -0.1947263, - 0.02251204, 0.11216432, -0.10307853, 0.17351969, -0.039091777, - 0.08066188, -0.00561982, 0.12633002, 0.11335965, -0.0088127935, - -0.019777594, 0.06864014, -0.059751723, 0.016233567, -0.06894641, - -0.28651384, -0.004228674, 0.019708522, -0.16305895, -0.07468996, - -0.0855457, 0.099339016, -0.07580735, -0.13775392, 0.08434318, - 0.08330512, -0.12131499, 0.031935584, 0.09180414, -0.08876437, - -0.08049874, 0.008753825, 0.03498998, 0.030215185, 0.03907079, - 0.089751154, 0.029194152, -0.03337423, -0.019092513, 0.04331237, - 0.04299654, -0.036394123, -0.12915532, 0.09793732, 0.07512415, - -0.11319543, -0.032502122, 
0.15661901, 0.07671967, -0.005491124, - -0.19379048, -0.218606, 0.21448623, 0.017840758, 0.1416943, - -0.07051762, 0.19488361, 0.02664691, -0.18104725, -0.09334311, - 0.15026465, -0.15493552, -0.057762887, -0.11604192, -0.262013, - -0.01391798, 0.012185008, 0.11156489, -0.07483202, 0.06693364, - -0.26151478, 0.046425626, 0.036540434, -0.16435726, 0.17338543, - -0.21401681, -0.11385144, -0.08283257, -0.069031075, 0.030635102, - 0.010969227, 0.11109743, 0.010919218, 0.027526086, 0.13519906, - 0.01891392, -0.046839405, -0.040167913, 0.017953383, -0.09700955, - 0.0061885654, -0.07000971, 0.026893595, -0.038844477, 0.14543656}); - - static float lstm_input[][20] = { - {// Batch0: 4 (input_sequence_size) * 5 (n_input) - 0.787926, 0.151646, 0.071352, 0.118426, 0.458058, 0.596268, 0.998386, - 0.568695, 0.864524, 0.571277, 0.073204, 0.296072, 0.743333, 0.069199, - 0.045348, 0.867394, 0.291279, 0.013714, 0.482521, 0.626339}, - - {// Batch1: 4 (input_sequence_size) * 5 (n_input) - 0.295743, 0.544053, 0.690064, 0.858138, 0.497181, 0.642421, 0.524260, - 0.134799, 0.003639, 0.162482, 0.640394, 0.930399, 0.050782, 0.432485, - 0.988078, 0.082922, 0.563329, 0.865614, 0.333232, 0.259916}}; - - static float lstm_golden_output[][64] = { - {// Batch0: 4 (input_sequence_size) * 16 (n_output) - -0.00396806, 0.029352, -0.00279226, 0.0159977, -0.00835576, - -0.0211779, 0.0283512, -0.0114597, 0.00907307, -0.0244004, - -0.0152191, -0.0259063, 0.00914318, 0.00415118, 0.017147, - 0.0134203, -0.0166936, 0.0381209, 0.000889694, 0.0143363, - -0.0328911, -0.0234288, 0.0333051, -0.012229, 0.0110322, - -0.0457725, -0.000832209, -0.0202817, 0.0327257, 0.0121308, - 0.0155969, 0.0312091, -0.0213783, 0.0350169, 0.000324794, - 0.0276012, -0.0263374, -0.0371449, 0.0446149, -0.0205474, - 0.0103729, -0.0576349, -0.0150052, -0.0292043, 0.0376827, - 0.0136115, 0.0243435, 0.0354492, -0.0189322, 0.0464512, - -0.00251373, 0.0225745, -0.0308346, -0.0317124, 0.0460407, - -0.0189395, 0.0149363, -0.0530162, -0.0150767, -0.0340193, - 0.0286833, 0.00824207, 0.0264887, 0.0305169}, - {// Batch1: 4 (input_sequence_size) * 16 (n_output) - -0.013869, 0.0287268, -0.00334693, 0.00733398, -0.0287926, - -0.0186926, 0.0193662, -0.0115437, 0.00422612, -0.0345232, - 0.00223253, -0.00957321, 0.0210624, 0.013331, 0.0150954, - 0.02168, -0.0141913, 0.0322082, 0.00227024, 0.0260507, - -0.0188721, -0.0296489, 0.0399134, -0.0160509, 0.0116039, - -0.0447318, -0.0150515, -0.0277406, 0.0316596, 0.0118233, - 0.0214762, 0.0293641, -0.0204549, 0.0450315, -0.00117378, - 0.0167673, -0.0375007, -0.0238314, 0.038784, -0.0174034, - 0.0131743, -0.0506589, -0.0048447, -0.0240239, 0.0325789, - 0.00790065, 0.0220157, 0.0333314, -0.0264787, 0.0387855, - -0.000764675, 0.0217599, -0.037537, -0.0335206, 0.0431679, - -0.0211424, 0.010203, -0.062785, -0.00832363, -0.025181, - 0.0412031, 0.0118723, 0.0239643, 0.0394009}}; - - // Resetting cell_state and output_state - lstm.ResetCellState(); - lstm.ResetOutputState(); - - const int input_sequence_size = - sizeof(lstm_input[0]) / sizeof(float) / (lstm.num_inputs()); - for (int i = 0; i < input_sequence_size; i++) { - float* batch0_start = lstm_input[0] + i * lstm.num_inputs(); - float* batch0_end = batch0_start + lstm.num_inputs(); - - lstm.SetInput(0, batch0_start, batch0_end); - - float* batch1_start = lstm_input[1] + i * lstm.num_inputs(); - float* batch1_end = batch1_start + lstm.num_inputs(); - lstm.SetInput(lstm.num_inputs(), batch1_start, batch1_end); - - lstm.Invoke(); - - float* golden_start_batch0 = lstm_golden_output[0] + i 
* lstm.num_outputs(); - float* golden_end_batch0 = golden_start_batch0 + lstm.num_outputs(); - float* golden_start_batch1 = lstm_golden_output[1] + i * lstm.num_outputs(); - float* golden_end_batch1 = golden_start_batch1 + lstm.num_outputs(); - std::vector<float> expected; - expected.insert(expected.end(), golden_start_batch0, golden_end_batch0); - expected.insert(expected.end(), golden_start_batch1, golden_end_batch1); - EXPECT_THAT(lstm.GetOutput(), ElementsAreArray(ArrayFloatNear(expected))); - } + lstm.SetInputToInputWeights(input_to_input_weights_); + lstm.SetInputToCellWeights(input_to_cell_weights_); + lstm.SetInputToForgetWeights(input_to_forget_weights_); + lstm.SetInputToOutputWeights(input_to_output_weights_); + + lstm.SetInputGateBias(input_gate_bias_); + lstm.SetCellBias(cell_gate_bias_); + lstm.SetForgetGateBias(forget_gate_bias_); + lstm.SetOutputGateBias(output_gate_bias_); + + lstm.SetRecurrentToInputWeights(recurrent_to_input_weights_); + lstm.SetRecurrentToCellWeights(recurrent_to_cell_weights_); + lstm.SetRecurrentToForgetWeights(recurrent_to_forget_weights_); + lstm.SetRecurrentToOutputWeights(recurrent_to_output_weights_); + + lstm.SetCellToInputWeights(cell_to_input_weights_); + lstm.SetCellToForgetWeights(cell_to_forget_weights_); + lstm.SetCellToOutputWeights(cell_to_output_weights_); + + lstm.SetProjectionWeights(projection_weights_); + + VerifyGoldens(lstm_input_, lstm_golden_output_, &lstm); +} + +TEST_F(NoCifgPeepholeProjectionNoClippingLstmTest, HybridLstmBlackBoxTest) { + const int n_batch = 2; + const int n_input = 5; + const int n_cell = 20; + const int n_output = 16; + + HybridLSTMOpModel lstm( + n_batch, n_input, n_cell, n_output, + /*use_cifg=*/false, /*use_peephole=*/true, + /*use_projection_weights=*/true, + /*use_projection_bias=*/false, + /*cell_clip=*/0.0, /*proj_clip=*/0.0, + { + {n_batch, n_input}, // input tensor + + {n_cell, n_input}, // input_to_input_weight tensor + {n_cell, n_input}, // input_to_forget_weight tensor + {n_cell, n_input}, // input_to_cell_weight tensor + {n_cell, n_input}, // input_to_output_weight tensor + + {n_cell, n_output}, // recurrent_to_input_weight tensor + {n_cell, n_output}, // recurrent_to_forget_weight tensor + {n_cell, n_output}, // recurrent_to_cell_weight tensor + {n_cell, n_output}, // recurrent_to_output_weight tensor + + {n_cell}, // cell_to_input_weight tensor + {n_cell}, // cell_to_forget_weight tensor + {n_cell}, // cell_to_output_weight tensor + + {n_cell}, // input_gate_bias tensor + {n_cell}, // forget_gate_bias tensor + {n_cell}, // cell_bias tensor + {n_cell}, // output_gate_bias tensor + + {n_output, n_cell}, // projection_weight tensor + {0}, // projection_bias tensor + }); + + lstm.SetInputToInputWeights(input_to_input_weights_); + lstm.SetInputToCellWeights(input_to_cell_weights_); + lstm.SetInputToForgetWeights(input_to_forget_weights_); + lstm.SetInputToOutputWeights(input_to_output_weights_); + + lstm.SetInputGateBias(input_gate_bias_); + lstm.SetCellBias(cell_gate_bias_); + lstm.SetForgetGateBias(forget_gate_bias_); + lstm.SetOutputGateBias(output_gate_bias_); + + lstm.SetRecurrentToInputWeights(recurrent_to_input_weights_); + lstm.SetRecurrentToCellWeights(recurrent_to_cell_weights_); + lstm.SetRecurrentToForgetWeights(recurrent_to_forget_weights_); + lstm.SetRecurrentToOutputWeights(recurrent_to_output_weights_); + + lstm.SetCellToInputWeights(cell_to_input_weights_); + lstm.SetCellToForgetWeights(cell_to_forget_weights_); + lstm.SetCellToOutputWeights(cell_to_output_weights_); + + 
lstm.SetProjectionWeights(projection_weights_); + + VerifyGoldens(lstm_input_, lstm_golden_output_, &lstm, /*tolerance=*/0.00467); } } // namespace diff --git a/tensorflow/contrib/lite/kernels/maximum_minimum.cc b/tensorflow/contrib/lite/kernels/maximum_minimum.cc index 8d676218bd..7cb01465ee 100644 --- a/tensorflow/contrib/lite/kernels/maximum_minimum.cc +++ b/tensorflow/contrib/lite/kernels/maximum_minimum.cc @@ -14,8 +14,8 @@ limitations under the License. ==============================================================================*/ #include <string.h> #include <vector> -#include "tensorflow/contrib/lite/builtin_op_data.h" -#include "tensorflow/contrib/lite/context.h" +#include "tensorflow/contrib/lite/c/builtin_op_data.h" +#include "tensorflow/contrib/lite/c/c_api_internal.h" #include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h" #include "tensorflow/contrib/lite/kernels/internal/tensor.h" #include "tensorflow/contrib/lite/kernels/kernel_util.h" @@ -86,13 +86,14 @@ struct MinimumOp { template <typename data_type, typename op_type> void TFLiteOperation(TfLiteContext* context, TfLiteNode* node, const OpContext& op_context) { - reference_ops::TensorFlowMaximumMinimum<data_type>( + reference_ops::MaximumMinimumBroadcast4DSlow( + GetTensorShape(op_context.input1), GetTensorData<data_type>(op_context.input1), - GetTensorDims(op_context.input1), + GetTensorShape(op_context.input2), GetTensorData<data_type>(op_context.input2), - GetTensorDims(op_context.input2), + GetTensorShape(op_context.output), GetTensorData<data_type>(op_context.output), - GetTensorDims(op_context.output), op_type::template op<data_type>); + op_type::template op<data_type>); } template <KernelType kernel_type, typename OpType> diff --git a/tensorflow/contrib/lite/kernels/maximum_minimum_test.cc b/tensorflow/contrib/lite/kernels/maximum_minimum_test.cc index 0752aa1804..fd4d5367c5 100644 --- a/tensorflow/contrib/lite/kernels/maximum_minimum_test.cc +++ b/tensorflow/contrib/lite/kernels/maximum_minimum_test.cc @@ -126,10 +126,10 @@ TEST(MaximumOpTest, FloatWithBroadcastTest) { TEST(MaximumOpTest, Int32WithBroadcastTest) { std::initializer_list<int32_t> data1 = {1, 0, -1, -2, 3, 11}; std::initializer_list<int32_t> data2 = {2}; - TestModel<int32>(BuiltinOperator_MAXIMUM, {TensorType_INT32, {3, 1, 2}}, + TestModel<int32_t>(BuiltinOperator_MAXIMUM, {TensorType_INT32, {3, 1, 2}}, {TensorType_INT32, {1}}, {TensorType_INT32, {3, 1, 2}}, data1, data2, {2, 2, 2, 2, 3, 11}); - TestModel<int32>(BuiltinOperator_MINIMUM, {TensorType_INT32, {3, 1, 2}}, + TestModel<int32_t>(BuiltinOperator_MINIMUM, {TensorType_INT32, {3, 1, 2}}, {TensorType_INT32, {1}}, {TensorType_INT32, {3, 1, 2}}, data1, data2, {1, 0, -1, -2, 2, 2}); } diff --git a/tensorflow/contrib/lite/kernels/mean.cc b/tensorflow/contrib/lite/kernels/mean.cc deleted file mode 100644 index 03e5db24de..0000000000 --- a/tensorflow/contrib/lite/kernels/mean.cc +++ /dev/null @@ -1,271 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -#include <string.h> -#include <vector> -#include "tensorflow/contrib/lite/builtin_op_data.h" -#include "tensorflow/contrib/lite/context.h" -#include "tensorflow/contrib/lite/kernels/internal/quantization_util.h" -#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h" -#include "tensorflow/contrib/lite/kernels/internal/tensor.h" -#include "tensorflow/contrib/lite/kernels/kernel_util.h" -#include "tensorflow/contrib/lite/kernels/op_macros.h" - -namespace tflite { -namespace ops { -namespace builtin { -namespace mean { - -// This file has reference implementation of Mean. -enum KernelType { - kReference, -}; - -struct MeanContext { - MeanContext(TfLiteContext* context, TfLiteNode* node) { - params = reinterpret_cast<TfLiteMeanParams*>(node->builtin_data); - input = GetInput(context, node, 0); - axis = GetInput(context, node, 1); - output = GetOutput(context, node, 0); - } - TfLiteMeanParams* params; - const TfLiteTensor* input; - const TfLiteTensor* axis; - TfLiteTensor* output; -}; - -void* Init(TfLiteContext* context, const char* buffer, size_t length) { - // Creates two temp tensors to store index and axis for internal - // implementation only. - auto* scratch_tensor_index = new int; - context->AddTensors(context, 3, scratch_tensor_index); - return scratch_tensor_index; -} - -void Free(TfLiteContext* context, void* buffer) { - delete reinterpret_cast<int*>(buffer); -} - -// Resizes the temp tensor that stores resolved axis. -TfLiteStatus ResizeTempAxis(TfLiteContext* context, MeanContext* op_context, - TfLiteTensor* resolved_axis) { - TfLiteIntArray* axis_size = TfLiteIntArrayCreate(1); - axis_size->data[0] = static_cast<int>(NumElements(op_context->axis)); - return context->ResizeTensor(context, resolved_axis, axis_size); -} - -// Resizes the temp tensor that stores temp sum of reduced elements. -TfLiteStatus ResizeTempSum(TfLiteContext* context, MeanContext* op_context, - TfLiteTensor* temp_sum) { - TfLiteIntArray* size = TfLiteIntArrayCreate(1); - size->data[0] = static_cast<int>(NumElements(op_context->output)); - return context->ResizeTensor(context, temp_sum, size); -} - -// Resizes output array based on the input size and resolved axis. -TfLiteStatus ResizeOutputTensor(TfLiteContext* context, - MeanContext* op_context) { - size_t num_axis = NumElements(op_context->axis); - const TfLiteIntArray* input_dims = op_context->input->dims; - int input_num_dims = NumDimensions(op_context->input); - const int* axis = GetTensorData<int>(op_context->axis); - if (op_context->params->keep_dims) { - TfLiteIntArray* output_dims = TfLiteIntArrayCreate(input_num_dims); - for (int idx = 0; idx < input_num_dims; ++idx) { - bool is_axis = false; - for (int axis_idx = 0; axis_idx < num_axis; ++axis_idx) { - if (axis[axis_idx] == idx || axis[axis_idx] + input_num_dims == idx) { - is_axis = true; - break; - } - } - if (is_axis) { - output_dims->data[idx] = 1; - } else { - output_dims->data[idx] = input_dims->data[idx]; - } - } - return context->ResizeTensor(context, op_context->output, output_dims); - } else { - // Calculates size of reducing axis. 
- int num_reduce_axis = num_axis; - for (int i = 0; i < num_axis; ++i) { - int current = axis[i]; - if (current < 0) { - current += input_num_dims; - } - TF_LITE_ENSURE(context, current >= 0 && current < input_num_dims); - for (int j = 0; j < i; ++j) { - int previous = axis[j]; - if (previous < 0) { - previous += input_num_dims; - } - if (current == previous) { - --num_reduce_axis; - break; - } - } - } - // Determines output dimensions. - TfLiteIntArray* output_dims = - TfLiteIntArrayCreate(input_num_dims - num_reduce_axis); - int num_skip_axis = 0; - for (int idx = 0; idx < input_num_dims; ++idx) { - bool is_axis = false; - for (int axis_idx = 0; axis_idx < num_axis; ++axis_idx) { - if (axis[axis_idx] == idx || axis[axis_idx] + input_num_dims == idx) { - ++num_skip_axis; - is_axis = true; - break; - } - } - if (!is_axis) { - output_dims->data[idx - num_skip_axis] = input_dims->data[idx]; - } - } - return context->ResizeTensor(context, op_context->output, output_dims); - } -} - -// Initializes temp tensors to store index and resolved axis. -TfLiteStatus InitializeTemporaries(TfLiteContext* context, TfLiteNode* node, - MeanContext* op_context) { - // Creates a temp index to iterate through input data. - int* scratch_tensor_index = reinterpret_cast<int*>(node->user_data); - TfLiteIntArrayFree(node->temporaries); - node->temporaries = TfLiteIntArrayCreate(3); - node->temporaries->data[0] = *scratch_tensor_index; - TfLiteTensor* scratch_tensor = GetTemporary(context, node, /*index=*/0); - scratch_tensor->type = kTfLiteInt32; - scratch_tensor->allocation_type = kTfLiteArenaRw; - TfLiteIntArray* index_size = TfLiteIntArrayCreate(1); - index_size->data[0] = NumDimensions(op_context->input); - TF_LITE_ENSURE_OK(context, - context->ResizeTensor(context, scratch_tensor, index_size)); - - // Creates a temp tensor to store resolved axis given input data. - node->temporaries->data[1] = *scratch_tensor_index + 1; - TfLiteTensor* resolved_axis = GetTemporary(context, node, /*index=*/1); - resolved_axis->type = kTfLiteInt32; - // Creates a temp tensor to store temp sums when calculating mean. - node->temporaries->data[2] = *scratch_tensor_index + 2; - TfLiteTensor* temp_sum = GetTemporary(context, node, /*index=*/2); - switch (op_context->input->type) { - case kTfLiteFloat32: - temp_sum->type = kTfLiteFloat32; - break; - case kTfLiteInt32: - temp_sum->type = kTfLiteInt64; - break; - case kTfLiteInt64: - temp_sum->type = kTfLiteInt64; - break; - case kTfLiteUInt8: - temp_sum->type = kTfLiteInt32; - break; - default: - return kTfLiteError; - } - return kTfLiteOk; -} - -TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { - TF_LITE_ENSURE_EQ(context, NumInputs(node), 2); - TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); - - MeanContext op_context(context, node); - TF_LITE_ENSURE_OK(context, InitializeTemporaries(context, node, &op_context)); - - TfLiteTensor* resolved_axis = GetTemporary(context, node, /*index=*/1); - TfLiteTensor* temp_sum = GetTemporary(context, node, /*index=*/2); - // Leaves work to Eval if axis is not constant; else resizes output. 
- if (!IsConstantTensor(op_context.axis)) { - SetTensorToDynamic(op_context.output); - SetTensorToDynamic(resolved_axis); - SetTensorToDynamic(temp_sum); - return kTfLiteOk; - } - resolved_axis->allocation_type = kTfLiteArenaRw; - TF_LITE_ENSURE_OK(context, - ResizeTempAxis(context, &op_context, resolved_axis)); - TF_LITE_ENSURE_OK(context, ResizeOutputTensor(context, &op_context)); - temp_sum->allocation_type = kTfLiteArenaRw; - return ResizeTempSum(context, &op_context, temp_sum); -} - -template <KernelType kernel_type> -TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { - MeanContext op_context(context, node); - int num_axis = static_cast<int>(NumElements(op_context.axis)); - TfLiteTensor* temp_index = GetTemporary(context, node, /*index=*/0); - TfLiteTensor* resolved_axis = GetTemporary(context, node, /*index=*/1); - TfLiteTensor* temp_sum = GetTemporary(context, node, /*index=*/2); - // Resize the output tensor if the output tensor is dynamic. - if (IsDynamicTensor(op_context.output)) { - TF_LITE_ENSURE_OK(context, - ResizeTempAxis(context, &op_context, resolved_axis)); - TF_LITE_ENSURE_OK(context, ResizeOutputTensor(context, &op_context)); - TF_LITE_ENSURE_OK(context, ResizeTempSum(context, &op_context, temp_sum)); - } - -#define TF_LITE_MEAN(kernel_type, data_type, temp_data_type) \ - kernel_type::Mean<>( \ - GetTensorData<data_type>(op_context.input), \ - op_context.input->dims->data, op_context.input->dims->size, \ - GetTensorData<data_type>(op_context.output), \ - op_context.output->dims->data, op_context.output->dims->size, \ - GetTensorData<int>(op_context.axis), num_axis, \ - op_context.params->keep_dims, GetTensorData<int>(temp_index), \ - GetTensorData<int>(resolved_axis), \ - GetTensorData<temp_data_type>(temp_sum)) - - if (kernel_type == kReference) { - switch (op_context.input->type) { - case kTfLiteFloat32: - TF_LITE_ENSURE(context, TF_LITE_MEAN(reference_ops, float, float)); - break; - case kTfLiteInt32: - TF_LITE_ENSURE(context, TF_LITE_MEAN(reference_ops, int, int64_t)); - break; - case kTfLiteInt64: - TF_LITE_ENSURE(context, TF_LITE_MEAN(reference_ops, int64_t, int64_t)); - break; - case kTfLiteUInt8: - TF_LITE_ENSURE_EQ(context, op_context.input->params.scale, - op_context.output->params.scale); - TF_LITE_ENSURE_EQ(context, op_context.input->params.zero_point, - op_context.output->params.zero_point); - TF_LITE_ENSURE(context, TF_LITE_MEAN(reference_ops, uint8_t, int)); - break; - default: - return kTfLiteError; - } - } -#undef TF_LITE_MEAN - return kTfLiteOk; -} -} // namespace mean - -TfLiteRegistration* Register_MEAN_REF() { - static TfLiteRegistration r = {mean::Init, mean::Free, mean::Prepare, - mean::Eval<mean::kReference>}; - return &r; -} - -// TODO(kanlig): add optimized implementation of Mean. -TfLiteRegistration* Register_MEAN() { return Register_MEAN_REF(); } - -} // namespace builtin -} // namespace ops -} // namespace tflite diff --git a/tensorflow/contrib/lite/kernels/mean_test.cc b/tensorflow/contrib/lite/kernels/mean_test.cc deleted file mode 100644 index 79c9957f76..0000000000 --- a/tensorflow/contrib/lite/kernels/mean_test.cc +++ /dev/null @@ -1,219 +0,0 @@ -/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -#include <gtest/gtest.h> -#include "tensorflow/contrib/lite/interpreter.h" -#include "tensorflow/contrib/lite/kernels/register.h" -#include "tensorflow/contrib/lite/kernels/test_util.h" -#include "tensorflow/contrib/lite/model.h" - -namespace tflite { -namespace { - -using ::testing::ElementsAreArray; - -class BaseMeanOpModel : public SingleOpModel { - public: - void SetAxis(std::initializer_list<int> data) { PopulateTensor(axis_, data); } - - template <class T> - void SetInput(std::initializer_list<T> data) { - PopulateTensor(input_, data); - } - - template <class T> - std::vector<T> GetOutput() { - return ExtractVector<T>(output_); - } - - std::vector<float> GetDequantizedOutput() { - return Dequantize<uint8_t>(ExtractVector<uint8_t>(output_), - GetScale(output_), GetZeroPoint(output_)); - } - - std::vector<int> GetOutputShape() { return GetTensorShape(output_); } - - int Input() { return input_; } - - protected: - int input_; - int axis_; - int output_; -}; - -// Model for the tests case where axis is a const tensor. -class MeanOpConstModel : public BaseMeanOpModel { - public: - MeanOpConstModel(const TensorData& input, const TensorData& output, - std::initializer_list<int> axis_shape, - std::initializer_list<int> axis, bool keep_dims) { - input_ = AddInput(input); - axis_ = AddConstInput(TensorType_INT32, axis, axis_shape); - output_ = AddOutput(output); - SetBuiltinOp(BuiltinOperator_MEAN, BuiltinOptions_MeanOptions, - CreateMeanOptions(builder_, keep_dims).Union()); - BuildInterpreter({GetShape(input_)}); - } -}; - -// Model for the tests case where axis is a dynamic tensor. 
-class MeanOpDynamicModel : public BaseMeanOpModel { - public: - MeanOpDynamicModel(const TensorData& input, const TensorData& output, - const TensorData& axis, bool keep_dims) { - input_ = AddInput(input); - axis_ = AddInput(axis); - output_ = AddOutput(output); - SetBuiltinOp(BuiltinOperator_MEAN, BuiltinOptions_MeanOptions, - CreateMeanOptions(builder_, keep_dims).Union()); - BuildInterpreter({GetShape(input_)}); - } -}; - -TEST(ConstFloatMeanOpTest, NotKeepDims) { - std::initializer_list<float> data = { - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, - 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0}; - MeanOpConstModel m({TensorType_FLOAT32, {4, 3, 2}}, {TensorType_FLOAT32, {2}}, - {4}, {1, 0, -3, -3}, false); - m.SetInput(data); - m.Invoke(); - EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2})); - EXPECT_THAT(m.GetOutput<float>(), ElementsAreArray(ArrayFloatNear({12, 13}))); -} - -TEST(ConstFloatMeanOpTest, KeepDims) { - std::initializer_list<float> data = { - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, - 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0}; - MeanOpConstModel m({TensorType_FLOAT32, {4, 3, 2}}, {TensorType_FLOAT32, {3}}, - {2}, {0, 2}, true); - m.SetInput(data); - m.Invoke(); - EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 3, 1})); - EXPECT_THAT(m.GetOutput<float>(), - ElementsAreArray(ArrayFloatNear({10.5, 12.5, 14.5}))); -} - -TEST(DynamicFloatMeanOpTest, NotKeepDims) { - std::initializer_list<float> data = { - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, - 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0}; - MeanOpDynamicModel m({TensorType_FLOAT32, {4, 3, 2}}, - {TensorType_FLOAT32, {2}}, {TensorType_INT32, {4}}, - false); - std::initializer_list<int> axis = {1, 0, -3, -3}; - m.SetAxis(axis); - m.SetInput(data); - m.Invoke(); - EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2})); - EXPECT_THAT(m.GetOutput<float>(), ElementsAreArray(ArrayFloatNear({12, 13}))); -} - -TEST(DynamicFloatMeanOpTest, KeepDims) { - std::initializer_list<float> data = { - 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, - 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0}; - MeanOpDynamicModel m({TensorType_FLOAT32, {4, 3, 2}}, - {TensorType_FLOAT32, {3}}, {TensorType_INT32, {2}}, - true); - std::initializer_list<int> axis = {0, 2}; - m.SetAxis(axis); - m.SetInput(data); - m.Invoke(); - EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 3, 1})); - EXPECT_THAT(m.GetOutput<float>(), - ElementsAreArray(ArrayFloatNear({10.5, 12.5, 14.5}))); -} - -TEST(DynamicFloatMeanOpTest, Scale) { - std::initializer_list<float> data = {9.527}; - MeanOpDynamicModel m({TensorType_FLOAT32, {1}}, {TensorType_FLOAT32, {1}}, - {TensorType_INT32, {1}}, true); - std::initializer_list<int> axis = {0}; - m.SetAxis(axis); - m.SetInput(data); - m.Invoke(); - EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1})); - EXPECT_THAT(m.GetOutput<float>(), ElementsAreArray(ArrayFloatNear({9.527}))); -} - -// for quantized Add, the error shouldn't exceed step -float GetTolerance(int min, int max) { return (max - min) / 255.0; } - -TEST(ConstUint8MeanOpTest, NotKeepDims) { - float kQuantizedTolerance = GetTolerance(-1.0, 1.0); - std::initializer_list<float> data = {0.4, 0.2, 0.3, 0.4, 0.5, 0.6}; - MeanOpConstModel m({TensorType_UINT8, {1, 3, 2}, -1.0, 1.0}, - {TensorType_UINT8, {2}, -1.0, 1.0}, {1}, {1}, false); - m.QuantizeAndPopulate<uint8_t>(m.Input(), data); 
- m.Invoke(); - EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2})); - EXPECT_THAT(m.GetDequantizedOutput(), ElementsAreArray(ArrayFloatNear( - {0.4, 0.4}, kQuantizedTolerance))); -} - -TEST(ConstUint8MeanOpTest, KeepDims) { - float kQuantizedTolerance = GetTolerance(-1.0, 1.0); - std::initializer_list<float> data = {0.4, 0.2, 0.3, 0.4, 0.5, 0.6}; - MeanOpConstModel m({TensorType_UINT8, {3, 2}, -1.0, 1.0}, - {TensorType_UINT8, {3}, -1.0, 1.0}, {1}, {1}, true); - m.QuantizeAndPopulate<uint8_t>(m.Input(), data); - m.Invoke(); - EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3, 1})); - EXPECT_THAT( - m.GetDequantizedOutput(), - ElementsAreArray(ArrayFloatNear({0.3, 0.35, 0.55}, kQuantizedTolerance))); -} - -TEST(DynamicUint8MeanOpTest, NotKeepDims) { - float kQuantizedTolerance = GetTolerance(-5.0, 2.0); - std::initializer_list<float> data = {1.3, -4.8, -3.6, 0.24}; - MeanOpDynamicModel m({TensorType_UINT8, {2, 2}, -5.0, 2.0}, - {TensorType_UINT8, {2}, -5.0, 2.0}, - {TensorType_INT32, {1}}, false); - std::initializer_list<int> axis = {1}; - m.SetAxis(axis); - m.QuantizeAndPopulate<uint8_t>(m.Input(), data); - m.Invoke(); - EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2})); - EXPECT_THAT( - m.GetDequantizedOutput(), - ElementsAreArray(ArrayFloatNear({-1.75, -1.68}, kQuantizedTolerance))); -} - -TEST(DynamicUint8MeanOpTest, KeepDims) { - float kQuantizedTolerance = GetTolerance(-10.0, 12.0); - std::initializer_list<float> data = {11.14, -0.14, 7.423, 0.879}; - MeanOpDynamicModel m({TensorType_UINT8, {2, 2}, -10.0, 12.0}, - {TensorType_UINT8, {2}, -10.0, 12.0}, - {TensorType_INT32, {1}}, true); - std::initializer_list<int> axis = {0}; - m.SetAxis(axis); - m.QuantizeAndPopulate<uint8_t>(m.Input(), data); - m.Invoke(); - EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2})); - EXPECT_THAT( - m.GetDequantizedOutput(), - ElementsAreArray(ArrayFloatNear({9.2815, 0.3695}, kQuantizedTolerance))); -} - -} // namespace -} // namespace tflite - -int main(int argc, char** argv) { - ::tflite::LogToStderr(); - ::testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} diff --git a/tensorflow/contrib/lite/kernels/mfcc.cc b/tensorflow/contrib/lite/kernels/mfcc.cc index 3f5bc4d68a..5153ce5634 100644 --- a/tensorflow/contrib/lite/kernels/mfcc.cc +++ b/tensorflow/contrib/lite/kernels/mfcc.cc @@ -13,9 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ #include "tensorflow/contrib/lite/kernels/internal/mfcc.h" -#include "flatbuffers/flexbuffers.h" -#include "tensorflow/contrib/lite/builtin_op_data.h" -#include "tensorflow/contrib/lite/context.h" +#include "flatbuffers/flexbuffers.h" // TF:flatbuffers +#include "tensorflow/contrib/lite/c/builtin_op_data.h" +#include "tensorflow/contrib/lite/c/c_api_internal.h" #include "tensorflow/contrib/lite/kernels/internal/mfcc_dct.h" #include "tensorflow/contrib/lite/kernels/internal/mfcc_mel_filterbank.h" #include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h" diff --git a/tensorflow/contrib/lite/kernels/mfcc_test.cc b/tensorflow/contrib/lite/kernels/mfcc_test.cc index 0291ca8c1c..fe69223222 100644 --- a/tensorflow/contrib/lite/kernels/mfcc_test.cc +++ b/tensorflow/contrib/lite/kernels/mfcc_test.cc @@ -18,7 +18,7 @@ limitations under the License. 
#include <vector> #include <gtest/gtest.h> -#include "flatbuffers/flexbuffers.h" +#include "flatbuffers/flexbuffers.h" // TF:flatbuffers #include "tensorflow/contrib/lite/interpreter.h" #include "tensorflow/contrib/lite/kernels/register.h" #include "tensorflow/contrib/lite/kernels/test_util.h" diff --git a/tensorflow/contrib/lite/kernels/mul.cc b/tensorflow/contrib/lite/kernels/mul.cc index 6c4c3a1edc..e0aac8a842 100644 --- a/tensorflow/contrib/lite/kernels/mul.cc +++ b/tensorflow/contrib/lite/kernels/mul.cc @@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/contrib/lite/builtin_op_data.h" -#include "tensorflow/contrib/lite/context.h" +#include "tensorflow/contrib/lite/c/builtin_op_data.h" +#include "tensorflow/contrib/lite/c/c_api_internal.h" #include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h" #include "tensorflow/contrib/lite/kernels/internal/quantization_util.h" #include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h" @@ -39,6 +39,14 @@ constexpr int kOutputTensor = 0; struct OpData { bool requires_broadcast; + + // Parameters used in the quantized paths where the output is 8bit + int32 output_activation_min; + int32 output_activation_max; + + // Parameters used in all quantized paths + int32_t output_multiplier; + int output_shift; }; void* Init(TfLiteContext* context, const char* buffer, size_t length) { @@ -52,6 +60,7 @@ void Free(TfLiteContext* context, void* buffer) { } TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { + auto* params = reinterpret_cast<TfLiteMulParams*>(node->builtin_data); OpData* data = reinterpret_cast<OpData*>(node->user_data); TF_LITE_ENSURE_EQ(context, NumInputs(node), 2); @@ -62,7 +71,6 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TfLiteTensor* output = GetOutput(context, node, kOutputTensor); TF_LITE_ENSURE_EQ(context, input1->type, input2->type); - output->type = input2->type; data->requires_broadcast = !HaveSameShapes(input1, input2); @@ -74,74 +82,136 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { output_size = TfLiteIntArrayCopy(input1->dims); } + if (output->type == kTfLiteUInt8) { + CalculateActivationRangeUint8(params->activation, output, + &data->output_activation_min, + &data->output_activation_max); + } + + if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt16) { + double real_multiplier = + input1->params.scale * input2->params.scale / output->params.scale; + QuantizeMultiplierSmallerThanOneExp( + real_multiplier, &data->output_multiplier, &data->output_shift); + } + return context->ResizeTensor(context, output, output_size); } template <KernelType kernel_type> -void EvalFloat(TfLiteContext* context, TfLiteNode* node, - TfLiteMulParams* params, const OpData* data, - const TfLiteTensor* input1, const TfLiteTensor* input2, - TfLiteTensor* output) { - float output_activation_min, output_activation_max; - CalculateActivationRangeFloat(params->activation, &output_activation_min, - &output_activation_max); -#define TF_LITE_MUL(type, opname) \ - type::opname(GetTensorData<float>(input1), GetTensorDims(input1), \ - GetTensorData<float>(input2), GetTensorDims(input2), \ - output_activation_min, output_activation_max, \ - GetTensorData<float>(output), GetTensorDims(output)) - if (kernel_type == 
kReference) { - if (data->requires_broadcast) { - TF_LITE_MUL(reference_ops, BroadcastMul); +void EvalMul(TfLiteContext* context, TfLiteNode* node, TfLiteMulParams* params, + const OpData* data, const TfLiteTensor* input1, + const TfLiteTensor* input2, TfLiteTensor* output) { +#define TF_LITE_MUL(type, opname, data_type) \ + data_type output_activation_min, output_activation_max; \ + CalculateActivationRange(params->activation, &output_activation_min, \ + &output_activation_max); \ + tflite::ArithmeticParams op_params; \ + SetActivationParams(output_activation_min, output_activation_max, \ + &op_params); \ + type::opname(op_params, GetTensorShape(input1), \ + GetTensorData<data_type>(input1), GetTensorShape(input2), \ + GetTensorData<data_type>(input2), GetTensorShape(output), \ + GetTensorData<data_type>(output)) + + if (output->type == kTfLiteInt32) { + if (kernel_type == kReference) { + if (data->requires_broadcast) { + TF_LITE_MUL(reference_ops, BroadcastMul4DSlow, int32_t); + } else { + TF_LITE_MUL(reference_ops, Mul, int32_t); + } } else { - TF_LITE_MUL(reference_ops, Mul); + if (data->requires_broadcast) { + TF_LITE_MUL(optimized_ops, BroadcastMul4DSlow, int32_t); + } else { + TF_LITE_MUL(optimized_ops, Mul, int32_t); + } } - } else { - if (data->requires_broadcast) { - TF_LITE_MUL(optimized_ops, BroadcastMul); + } else if (output->type == kTfLiteFloat32) { + if (kernel_type == kReference) { + if (data->requires_broadcast) { + TF_LITE_MUL(reference_ops, BroadcastMul4DSlow, float); + } else { + TF_LITE_MUL(reference_ops, Mul, float); + } } else { - TF_LITE_MUL(optimized_ops, Mul); + if (data->requires_broadcast) { + TF_LITE_MUL(optimized_ops, BroadcastMul4DSlow, float); + } else { + TF_LITE_MUL(optimized_ops, Mul, float); + } } } #undef TF_LITE_MUL } template <KernelType kernel_type> -void EvalQuantized(TfLiteContext* context, TfLiteNode* node, - TfLiteMulParams* params, const OpData* data, - const TfLiteTensor* input1, const TfLiteTensor* input2, - TfLiteTensor* output) { - auto input1_offset = -input1->params.zero_point; - auto input2_offset = -input2->params.zero_point; - auto output_offset = output->params.zero_point; - - int32_t output_multiplier; - int output_shift; - - double real_multiplier = - input1->params.scale * input2->params.scale / output->params.scale; - QuantizeMultiplierSmallerThanOne(real_multiplier, &output_multiplier, - &output_shift); - - int32 output_activation_min, output_activation_max; - CalculateActivationRangeUint8(params->activation, output, - &output_activation_min, &output_activation_max); - +TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node, + TfLiteMulParams* params, const OpData* data, + const TfLiteTensor* input1, + const TfLiteTensor* input2, TfLiteTensor* output) { + if (input1->type == kTfLiteUInt8 && input2->type == kTfLiteUInt8 && + output->type == kTfLiteUInt8) { #define TF_LITE_MUL(type, opname) \ - type::opname(GetTensorData<uint8_t>(input1), GetTensorDims(input1), \ - input1_offset, GetTensorData<uint8_t>(input2), \ - GetTensorDims(input2), input2_offset, output_offset, \ - output_multiplier, output_shift, output_activation_min, \ - output_activation_max, GetTensorData<uint8_t>(output), \ - GetTensorDims(output)); - // The quantized version of Mul doesn't support activations, so we - // always use BroadcastMul. 
- if (kernel_type == kReference) { - TF_LITE_MUL(reference_ops, BroadcastMul); + tflite::ArithmeticParams op_params; \ + SetActivationParams(data->output_activation_min, \ + data->output_activation_max, &op_params); \ + op_params.input1_offset = -input1->params.zero_point; \ + op_params.input2_offset = -input2->params.zero_point; \ + op_params.output_offset = output->params.zero_point; \ + op_params.output_multiplier = data->output_multiplier; \ + op_params.output_shift = data->output_shift; \ + type::opname(op_params, GetTensorShape(input1), \ + GetTensorData<uint8_t>(input1), GetTensorShape(input2), \ + GetTensorData<uint8_t>(input2), GetTensorShape(output), \ + GetTensorData<uint8_t>(output)) + + // The quantized version of Mul doesn't support activations, so we + // always use BroadcastMul. + if (kernel_type == kReference) { + TF_LITE_MUL(reference_ops, BroadcastMul4DSlow); + } else { + TF_LITE_MUL(optimized_ops, BroadcastMul4DSlow); + } +#undef TF_LITE_MUL + } else if (input1->type == kTfLiteInt16 && input2->type == kTfLiteInt16 && + output->type == kTfLiteInt16) { +#define TF_LITE_MUL(type, opname) \ + tflite::ArithmeticParams op_params; \ + type::opname(op_params, GetTensorShape(input1), \ + GetTensorData<int16_t>(input1), GetTensorShape(input2), \ + GetTensorData<int16_t>(input2), GetTensorShape(output), \ + GetTensorData<int16_t>(output)) + if (kernel_type == kReference) { + TF_LITE_MUL(reference_ops, Mul); + } else { + TF_LITE_MUL(optimized_ops, Mul); + } +#undef TF_LITE_MUL + } else if (input1->type == kTfLiteInt16 && input2->type == kTfLiteInt16 && + output->type == kTfLiteUInt8) { +#define TF_LITE_MUL(type, opname) \ + tflite::ArithmeticParams op_params; \ + SetActivationParams(data->output_activation_min, \ + data->output_activation_max, &op_params); \ + op_params.output_offset = output->params.zero_point; \ + type::opname(op_params, GetTensorShape(input1), \ + GetTensorData<int16_t>(input1), GetTensorShape(input2), \ + GetTensorData<int16_t>(input2), GetTensorShape(output), \ + GetTensorData<uint8_t>(output)) + if (kernel_type == kReference) { + TF_LITE_MUL(reference_ops, Mul); + } else { + TF_LITE_MUL(optimized_ops, Mul); + } +#undef TF_LITE_MUL } else { - TF_LITE_MUL(optimized_ops, BroadcastMul); + context->ReportError( + context, "Unsupported combination of input and output types in Mul."); + return kTfLiteError; } -#undef TF_LITE_MUL + return kTfLiteOk; } template <KernelType kernel_type> @@ -153,14 +223,17 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { const TfLiteTensor* input2 = GetInput(context, node, kInputTensor2); TfLiteTensor* output = GetOutput(context, node, kOutputTensor); - if (output->type == kTfLiteFloat32) { - EvalFloat<kernel_type>(context, node, params, data, input1, input2, output); - } else if (output->type == kTfLiteUInt8) { - EvalQuantized<kernel_type>(context, node, params, data, input1, input2, - output); + if (output->type == kTfLiteFloat32 || output->type == kTfLiteInt32) { + EvalMul<kernel_type>(context, node, params, data, input1, input2, output); + } else if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt16) { + TF_LITE_ENSURE_OK( + context, EvalQuantized<kernel_type>(context, node, params, data, input1, + input2, output)); } else { context->ReportError(context, - "Mul only supports FLOAT32 and quantized UINT8 now."); + "Mul only supports FLOAT32, INT32 and quantized UINT8 " + "and INT16 now, got %d.", + output->type); return kTfLiteError; } diff --git a/tensorflow/contrib/lite/kernels/mul_test.cc 
b/tensorflow/contrib/lite/kernels/mul_test.cc index f1a30f8263..2807550a6b 100644 --- a/tensorflow/contrib/lite/kernels/mul_test.cc +++ b/tensorflow/contrib/lite/kernels/mul_test.cc @@ -52,12 +52,22 @@ class FloatMulOpModel : public BaseMulOpModel { std::vector<float> GetOutput() { return ExtractVector<float>(output_); } }; +class IntegerMulOpModel : public BaseMulOpModel { + public: + using BaseMulOpModel::BaseMulOpModel; + + std::vector<int32_t> GetOutput() { return ExtractVector<int32_t>(output_); } +}; + // For quantized Mul, the error shouldn't exceed (2*step + step^2). // The param min=-1.0 & max=1.0 is used in the following tests. // The tolerance value is ~0.0157. const float kQuantizedStep = 2.0 / 255.0; const float kQuantizedTolerance = 2.0 * kQuantizedStep + kQuantizedStep * kQuantizedStep; +const float kQuantizedStepInt16 = 2.0 / 32767.0; +const float kQuantizedToleranceInt16 = + 2.0 * kQuantizedStepInt16 + kQuantizedStepInt16 * kQuantizedStepInt16; class QuantizedMulOpModel : public BaseMulOpModel { public: @@ -67,6 +77,11 @@ class QuantizedMulOpModel : public BaseMulOpModel { return Dequantize<uint8_t>(ExtractVector<uint8_t>(output_), GetScale(output_), GetZeroPoint(output_)); } + + std::vector<float> GetDequantizedOutputInt16() { + return Dequantize<int16_t>(ExtractVector<int16_t>(output_), + GetScale(output_), GetZeroPoint(output_)); + } }; TEST(FloatMulOpTest, NoActivation) { @@ -125,6 +140,57 @@ TEST(FloatMulOpTest, WithBroadcast) { } } +TEST(IntegerMulOpTest, NoActivation) { + IntegerMulOpModel m({TensorType_INT32, {1, 2, 2, 1}}, + {TensorType_INT32, {1, 2, 2, 1}}, {TensorType_INT32, {}}, + ActivationFunctionType_NONE); + m.PopulateTensor<int32_t>(m.input1(), {-20, 2, 7, 8}); + m.PopulateTensor<int32_t>(m.input2(), {1, 2, 3, 5}); + m.Invoke(); + EXPECT_THAT(m.GetOutput(), ElementsAreArray({-20, 4, 21, 40})); +} + +TEST(IntegerMulOpTest, ActivationRELU_N1_TO_1) { + IntegerMulOpModel m({TensorType_INT32, {1, 2, 2, 1}}, + {TensorType_INT32, {1, 2, 2, 1}}, {TensorType_INT32, {}}, + ActivationFunctionType_RELU_N1_TO_1); + m.PopulateTensor<int32_t>(m.input1(), {-20, 2, 7, 8}); + m.PopulateTensor<int32_t>(m.input2(), {1, 2, 3, 5}); + m.Invoke(); + EXPECT_THAT(m.GetOutput(), ElementsAreArray({-1, 1, 1, 1})); +} + +TEST(IntegerMulOpTest, VariousInputShapes) { + std::vector<std::initializer_list<int>> test_shapes = { + {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}}; + for (int i = 0; i < test_shapes.size(); ++i) { + IntegerMulOpModel m({TensorType_INT32, test_shapes[i]}, + {TensorType_INT32, test_shapes[i]}, + {TensorType_INT32, {}}, ActivationFunctionType_NONE); + m.PopulateTensor<int32_t>(m.input1(), {-20, 2, 7, 8, 11, 20}); + m.PopulateTensor<int32_t>(m.input2(), {1, 2, 3, 5, 11, 1}); + m.Invoke(); + EXPECT_THAT(m.GetOutput(), ElementsAreArray({-20, 4, 21, 40, 121, 20})) + << "With shape number " << i; + } +} + +TEST(IntegerMulOpTest, WithBroadcast) { + std::vector<std::initializer_list<int>> test_shapes = { + {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}}; + for (int i = 0; i < test_shapes.size(); ++i) { + IntegerMulOpModel m({TensorType_INT32, test_shapes[i]}, + {TensorType_INT32, {}}, // always a scalar + {TensorType_INT32, {}}, ActivationFunctionType_NONE); + m.PopulateTensor<int32_t>(m.input1(), {-20, 2, 7, 8, 11, 20}); + m.PopulateTensor<int32_t>(m.input2(), {1}); + m.Invoke(); + EXPECT_THAT(m.GetOutput(), + ElementsAreArray(ArrayFloatNear({-20, 2, 7, 8, 11, 20}))) + << "With shape number " << i; + } +} + TEST(QuantizedMulOpTest, NoActivation) { QuantizedMulOpModel m({TensorType_UINT8, {1, 
2, 2, 1}, -1.0, 1.0}, {TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0}, @@ -138,6 +204,38 @@ TEST(QuantizedMulOpTest, NoActivation) { kQuantizedTolerance))); } +TEST(QuantizedMulOpTest, NoActivationInt16) { + const float kMin = -1.f; + const float kMax = 32767.f / 32768.f; + QuantizedMulOpModel m({TensorType_INT16, {1, 2, 2, 1}, kMin, kMax}, + {TensorType_INT16, {1, 2, 2, 1}, kMin, kMax}, + {TensorType_INT16, {}, kMin, kMax}, + ActivationFunctionType_NONE); + m.QuantizeAndPopulate<int16_t>(m.input1(), {-0.8, 0.2, 0.9, 0.7}); + m.QuantizeAndPopulate<int16_t>(m.input2(), {0.6, 0.4, 0.9, 0.8}); + m.Invoke(); + EXPECT_THAT(m.GetDequantizedOutputInt16(), + ElementsAreArray(ArrayFloatNear({-0.48, 0.08, 0.81, 0.56}, + kQuantizedToleranceInt16))); +} + +TEST(QuantizedMulOpTest, NoActivationInt16WithUint8Output) { + const float kMinInt16 = -1.f; + const float kMaxInt16 = 32767.f / 32768.f; + const float kMinUint8 = -1.f; + const float kMaxUint8 = 127.f / 128.f; + QuantizedMulOpModel m({TensorType_INT16, {1, 2, 2, 1}, kMinInt16, kMaxInt16}, + {TensorType_INT16, {1, 2, 2, 1}, kMinInt16, kMaxInt16}, + {TensorType_UINT8, {}, kMinUint8, kMaxUint8}, + ActivationFunctionType_NONE); + m.QuantizeAndPopulate<int16_t>(m.input1(), {-0.8, 0.2, 0.9, 0.7}); + m.QuantizeAndPopulate<int16_t>(m.input2(), {0.6, 0.4, 0.9, 0.8}); + m.Invoke(); + EXPECT_THAT(m.GetDequantizedOutput(), + ElementsAreArray(ArrayFloatNear({-0.48, 0.08, 0.81, 0.56}, + kQuantizedTolerance))); +} + // for quantized Mul, the error shouldn't exceed 2*step float GetTolerance(int min, int max) { float kQuantizedStep = (max - min) / 255.0; diff --git a/tensorflow/contrib/lite/kernels/neg.cc b/tensorflow/contrib/lite/kernels/neg.cc index b8b53f3402..0ddd0644f5 100644 --- a/tensorflow/contrib/lite/kernels/neg.cc +++ b/tensorflow/contrib/lite/kernels/neg.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/contrib/lite/context.h" +#include "tensorflow/contrib/lite/c/c_api_internal.h" #include "tensorflow/contrib/lite/kernels/kernel_util.h" namespace tflite { @@ -59,7 +59,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { break; default: context->ReportError( - context, "Neg only currently supports int64, int32, and float32.", + context, + "Neg only currently supports int64, int32, and float32, got %d.", input->type); return kTfLiteError; } diff --git a/tensorflow/contrib/lite/kernels/neg_test.cc b/tensorflow/contrib/lite/kernels/neg_test.cc index 3c95ac8cc2..3d3594c60b 100644 --- a/tensorflow/contrib/lite/kernels/neg_test.cc +++ b/tensorflow/contrib/lite/kernels/neg_test.cc @@ -58,9 +58,9 @@ TEST(NegOpModel, NegFloat) { TEST(NegOpModel, NegInt32) { NegOpModel m({TensorType_INT32, {2, 3}}, {TensorType_INT32, {2, 3}}); - m.SetInput<int32>({-2, -1, 0, 1, 2, 3}); + m.SetInput<int32_t>({-2, -1, 0, 1, 2, 3}); m.Invoke(); - EXPECT_THAT(m.GetOutput<int32>(), ElementsAreArray({2, 1, 0, -1, -2, -3})); + EXPECT_THAT(m.GetOutput<int32_t>(), ElementsAreArray({2, 1, 0, -1, -2, -3})); } TEST(NegOpModel, NegInt64) { diff --git a/tensorflow/contrib/lite/kernels/one_hot.cc b/tensorflow/contrib/lite/kernels/one_hot.cc new file mode 100644 index 0000000000..910aed6f14 --- /dev/null +++ b/tensorflow/contrib/lite/kernels/one_hot.cc @@ -0,0 +1,199 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/contrib/lite/c/builtin_op_data.h" +#include "tensorflow/contrib/lite/c/c_api_internal.h" +#include "tensorflow/contrib/lite/kernels/internal/tensor.h" +#include "tensorflow/contrib/lite/kernels/kernel_util.h" +#include "tensorflow/contrib/lite/kernels/op_macros.h" + +namespace tflite { +namespace ops { +namespace builtin { +namespace one_hot { + +constexpr int kIndicesTensor = 0; +constexpr int kDepthTensor = 1; +constexpr int kOnValueTensor = 2; +constexpr int kOffValueTensor = 3; +constexpr int kOutputTensor = 0; + +// Convenience utility for destructuring a node into the appropriate tensors and +// data for the op. Note that this destructuring is quite cheap, so we can avoid +// allocating op-specific, persistent data on the heap. +struct OneHotContext { + OneHotContext(TfLiteContext* context, TfLiteNode* node) { + indices = GetInput(context, node, kIndicesTensor); + depth = GetInput(context, node, kDepthTensor); + on_value = GetInput(context, node, kOnValueTensor); + off_value = GetInput(context, node, kOffValueTensor); + output = GetOutput(context, node, kOutputTensor); + + const auto* params = + reinterpret_cast<TfLiteOneHotParams*>(node->builtin_data); + const int indices_dims = indices->dims->size; + axis = (params->axis == -1) ? indices_dims : params->axis; + output_dims = indices_dims + 1; + dtype = on_value->type; + } + + const TfLiteTensor* indices; + const TfLiteTensor* depth; + const TfLiteTensor* on_value; + const TfLiteTensor* off_value; + TfLiteTensor* output; + int axis; + int output_dims; + TfLiteType dtype; +}; + +template <typename T, typename TI> +void OneHotComputeImpl(const OneHotContext& op_context) { + // prefix_dim_size == # of elements before the axis + // depth == # of elements per axis + // suffix_dim_size == # of elements after the axis + int prefix_dim_size = 1; + for (int i = 0; i < op_context.axis; ++i) { + prefix_dim_size *= op_context.indices->dims->data[i]; + } + const int suffix_dim_size = NumElements(op_context.indices) / prefix_dim_size; + const int depth = *op_context.depth->data.i32; + + const T on_value = *GetTensorData<T>(op_context.on_value); + const T off_value = *GetTensorData<T>(op_context.off_value); + + // View the indices as a matrix of size: + // prefix_dim_size x suffix_dim_size + // View the output as a matrix of size: + // prefix_dim_size x depth x suffix_dim_size + // Then the output is: + // output(i, j, k) == (indices(i, k) == j) ? on : off + T* output = GetTensorData<T>(op_context.output); + const TI* indices = GetTensorData<TI>(op_context.indices); + for (int i = 0; i < prefix_dim_size; ++i) { + for (int j = 0; j < depth; ++j) { + for (int k = 0; k < suffix_dim_size; ++k, ++output) { + *output = static_cast<int>(indices[i * suffix_dim_size + k]) == j + ? 
on_value + : off_value; + } + } + } +} + +template <typename T> +void OneHotCompute(const OneHotContext& op_context) { + if (op_context.indices->type == kTfLiteInt64) { + OneHotComputeImpl<T, int64_t>(op_context); + } else { + OneHotComputeImpl<T, int>(op_context); + } +} + +TfLiteStatus ResizeOutputTensor(TfLiteContext* context, + const OneHotContext& op_context) { + TF_LITE_ENSURE(context, *op_context.depth->data.i32 >= 0); + TfLiteIntArray* output_size = TfLiteIntArrayCreate(op_context.output_dims); + for (int i = 0; i < op_context.output_dims; ++i) { + if (i < op_context.axis) { + output_size->data[i] = op_context.indices->dims->data[i]; + } else if (i == op_context.axis) { + output_size->data[i] = *op_context.depth->data.i32; + } else { + output_size->data[i] = op_context.indices->dims->data[i - 1]; + } + } + return context->ResizeTensor(context, op_context.output, output_size); +} + +TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { + TF_LITE_ENSURE_EQ(context, NumInputs(node), 4); + TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); + + OneHotContext op_context{context, node}; + switch (op_context.dtype) { + // TODO(b/111744875): Support uint8 and quantization. + case kTfLiteFloat32: + case kTfLiteInt16: + case kTfLiteInt32: + case kTfLiteInt64: + case kTfLiteBool: + op_context.output->type = op_context.dtype; + break; + default: + context->ReportError(context, "Unknown output data type: %d", + op_context.dtype); + return kTfLiteError; + } + + TF_LITE_ENSURE(context, op_context.indices->type == kTfLiteInt32 || + op_context.indices->type == kTfLiteInt64); + TF_LITE_ENSURE(context, op_context.axis >= 0 && + op_context.axis < op_context.output_dims); + TF_LITE_ENSURE_EQ(context, NumElements(op_context.depth), 1); + TF_LITE_ENSURE_EQ(context, NumElements(op_context.on_value), 1); + TF_LITE_ENSURE_EQ(context, NumElements(op_context.off_value), 1); + TF_LITE_ENSURE_EQ(context, op_context.on_value->type, op_context.dtype); + TF_LITE_ENSURE_EQ(context, op_context.off_value->type, op_context.dtype); + + if (!IsConstantTensor(op_context.depth)) { + SetTensorToDynamic(op_context.output); + return kTfLiteOk; + } + + return ResizeOutputTensor(context, op_context); +} + +TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { + OneHotContext op_context{context, node}; + + if (IsDynamicTensor(op_context.output)) { + ResizeOutputTensor(context, op_context); + } + + switch (op_context.output->type) { + case kTfLiteFloat32: + OneHotCompute<float>(op_context); + break; + case kTfLiteInt32: + OneHotCompute<int>(op_context); + break; + case kTfLiteInt64: + OneHotCompute<int64_t>(op_context); + break; + case kTfLiteBool: + OneHotCompute<bool>(op_context); + break; + default: + return kTfLiteError; + } + + return kTfLiteOk; +} + +} // namespace one_hot + +TfLiteRegistration* Register_ONE_HOT() { + static TfLiteRegistration r = { + nullptr, + nullptr, + one_hot::Prepare, + one_hot::Eval, + }; + return &r; +} + +} // namespace builtin +} // namespace ops +} // namespace tflite diff --git a/tensorflow/contrib/lite/kernels/one_hot_test.cc b/tensorflow/contrib/lite/kernels/one_hot_test.cc new file mode 100644 index 0000000000..6b604ec7a7 --- /dev/null +++ b/tensorflow/contrib/lite/kernels/one_hot_test.cc @@ -0,0 +1,182 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include <initializer_list> + +#include <gtest/gtest.h> +#include "tensorflow/contrib/lite/interpreter.h" +#include "tensorflow/contrib/lite/kernels/register.h" +#include "tensorflow/contrib/lite/kernels/test_util.h" +#include "tensorflow/contrib/lite/model.h" + +namespace tflite { +namespace { + +using ::testing::ElementsAreArray; + +template <typename T> +class OneHotOpModel : public SingleOpModel { + public: + OneHotOpModel(std::initializer_list<int> input_shape, int depth_value, + TensorType dtype, int axis = -1, T on_value = 1, + T off_value = 0, TensorType indices_type = TensorType_INT32) { + indices_ = AddInput(indices_type); + int depth = AddInput(TensorType_INT32); + int on = AddInput(dtype); + int off = AddInput(dtype); + output_ = AddOutput(dtype); + SetBuiltinOp(BuiltinOperator_ONE_HOT, BuiltinOptions_OneHotOptions, + CreateOneHotOptions(builder_, axis).Union()); + BuildInterpreter({input_shape}); + + PopulateTensor<int>(depth, {depth_value}); + PopulateTensor<T>(on, {on_value}); + PopulateTensor<T>(off, {off_value}); + } + + template <typename TI> + void SetIndices(std::initializer_list<TI> data) { + PopulateTensor<TI>(indices_, data); + } + + TfLiteStatus InvokeWithResult() { return interpreter_->Invoke(); } + + int32_t GetOutputSize() { return GetTensorSize(output_); } + std::vector<T> GetOutput() { return ExtractVector<T>(output_); } + std::vector<int> GetOutputShape() { return GetTensorShape(output_); } + + private: + int indices_; + int output_; +}; + +TEST(OneHotOpTest, BasicFloat) { + const int depth = 3; + OneHotOpModel<float> model({3}, depth, TensorType_FLOAT32); + model.SetIndices({0, 1, 2}); + model.Invoke(); + + EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({3, 3})); + EXPECT_THAT(model.GetOutput(), + ElementsAreArray({1.f, 0.f, 0.f, 0.f, 1.f, 0.f, 0.f, 0.f, 1.f})); +} + +TEST(OneHotOpTest, BasicInt) { + const int depth = 3; + OneHotOpModel<int> model({3}, depth, TensorType_INT32); + model.SetIndices({0, 1, 2}); + model.Invoke(); + + EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({3, 3})); + EXPECT_THAT(model.GetOutput(), ElementsAreArray({1, 0, 0, 0, 1, 0, 0, 0, 1})); +} + +TEST(OneHotOpTest, BasicBool) { + const int depth = 3; + OneHotOpModel<bool> model({3}, depth, TensorType_BOOL); + model.SetIndices({0, 1, 2}); + model.Invoke(); + + EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({3, 3})); + EXPECT_THAT(model.GetOutput(), + ElementsAreArray({true, false, false, false, true, false, false, + false, true})); +} + +TEST(OneHotOpTest, SmallDepth) { + const int depth = 1; + OneHotOpModel<int> model({3}, depth, TensorType_INT32); + model.SetIndices({0, 1, 2}); + model.Invoke(); + + EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({3, 1})); + EXPECT_THAT(model.GetOutput(), ElementsAreArray({1, 0, 0})); +} + +TEST(OneHotOpTest, BigDepth) { + const int depth = 4; + OneHotOpModel<int> model({2}, depth, TensorType_INT32); + model.SetIndices({0, 1}); + model.Invoke(); + + EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({2, 4})); + EXPECT_THAT(model.GetOutput(), 
ElementsAreArray({1, 0, 0, 0, 0, 1, 0, 0})); +} + +TEST(OneHotOpTest, OnOffValues) { + const int depth = 3; + const int axis = -1; + const int on = 5; + const int off = 0; + OneHotOpModel<int> model({4}, depth, TensorType_INT32, axis, on, off); + model.SetIndices({0, 2, -1, 1}); + model.Invoke(); + + EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({4, 3})); + EXPECT_THAT(model.GetOutput(), + ElementsAreArray({5, 0, 0, 0, 0, 5, 0, 0, 0, 0, 5, 0})); +} + +TEST(OneHotOpTest, ZeroAxis) { + const int depth = 3; + const int axis = 0; + const int on = 5; + const int off = 0; + OneHotOpModel<int> model({4}, depth, TensorType_INT32, axis, on, off); + model.SetIndices({0, 2, -1, 1}); + model.Invoke(); + + EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({3, 4})); + EXPECT_THAT(model.GetOutput(), + ElementsAreArray({5, 0, 0, 0, 0, 0, 0, 5, 0, 5, 0, 0})); +} + +TEST(OneHotOpTest, MultiDimensionalIndices) { + const int depth = 3; + const int axis = -1; + const float on = 2; + const float off = 0; + OneHotOpModel<float> model({2, 2}, depth, TensorType_FLOAT32, axis, on, off); + model.SetIndices({0, 2, 1, -1}); + model.Invoke(); + + EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({2, 2, 3})); + EXPECT_THAT(model.GetOutput(), + ElementsAreArray({2, 0, 0, 0, 0, 2, 0, 2, 0, 0, 0, 0})); +} + +TEST(OneHotOpTest, Int64Indices) { + const int depth = 3; + const int axis = -1; + const int on = 1; + const int off = 0; + OneHotOpModel<int> model({3}, depth, TensorType_INT32, axis, on, off, + TensorType_INT64); + std::initializer_list<int64_t> indices = {0, 1, 2}; + model.SetIndices(indices); + model.Invoke(); + + EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({3, 3})); + EXPECT_THAT(model.GetOutput(), ElementsAreArray({1, 0, 0, 0, 1, 0, 0, 0, 1})); +} + +} // namespace +} // namespace tflite + +int main(int argc, char** argv) { + ::tflite::LogToStderr(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/tensorflow/contrib/lite/kernels/op_macros.h b/tensorflow/contrib/lite/kernels/op_macros.h index 7568eaa88e..11e814daee 100644 --- a/tensorflow/contrib/lite/kernels/op_macros.h +++ b/tensorflow/contrib/lite/kernels/op_macros.h @@ -12,23 +12,61 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_OP_UTIL_H_ -#define TENSORFLOW_CONTRIB_LITE_KERNELS_OP_UTIL_H_ +#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_OP_MACROS_H_ +#define TENSORFLOW_CONTRIB_LITE_KERNELS_OP_MACROS_H_ +// If we're on a platform without standard IO functions, fall back to a +// non-portable function. 
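// A rough summary of the macros defined below: both configurations funnel
// failures through DEBUG_LOG. With TF_LITE_MCU_DEBUG_LOG defined, messages go
// through DebugLog() and TF_LITE_FATAL then parks the core in InfiniteLoop(),
// since a bare-metal target has no exit() to fall back to; on a hosted build
// DEBUG_LOG is an fprintf(stderr, ...) and TF_LITE_FATAL ends in abort(). An
// illustrative use (where `node` stands for any TfLiteNode*):
//   TF_LITE_ASSERT_EQ(NumInputs(node), 2);  // on mismatch: logs, then aborts or halts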
+#ifdef TF_LITE_MCU_DEBUG_LOG + +// This header is pulled in from the support library at +// https://github.com/google/stm32_bare_lib +#include <debug_log.h> + +#define DEBUG_LOG(x) \ + do { \ + DebugLog(x); \ + } while (0) + +inline void InfiniteLoop() { + DEBUG_LOG("HALTED\n"); + while (1) { + } +} +#define TFLITE_ASSERT_FALSE InfiniteLoop(); +#define TFLITE_ABORT InfiniteLoop(); + +#else // TF_LITE_MCU_DEBUG_LOG + +#include <cassert> #include <cstdio> +#include <cstdlib> -#define TF_LITE_FATAL(msg) \ - do { \ - fprintf(stderr, "%s\n", (msg)); \ - exit(1); \ +#define DEBUG_LOG(x) \ + do { \ + fprintf(stderr, "%s", (x)); \ } while (0) + +#define TFLITE_ASSERT_FALSE assert(false) +#define TFLITE_ABORT abort() + +#endif // TF_LITE_MCU_DEBUG_LOG + +#define TF_LITE_FATAL(msg) \ + do { \ + DEBUG_LOG(msg); \ + DEBUG_LOG("\nFATAL\n"); \ + TFLITE_ABORT; \ + } while (0) + #define TF_LITE_ASSERT(x) \ do { \ if (!(x)) TF_LITE_FATAL(#x); \ } while (0) + #define TF_LITE_ASSERT_EQ(x, y) \ do { \ if ((x) != (y)) TF_LITE_FATAL(#x " didn't equal " #y); \ } while (0) -#endif // TENSORFLOW_CONTRIB_LITE_KERNELS_OP_UTIL_H_ +#endif // TENSORFLOW_CONTRIB_LITE_KERNELS_OP_MACROS_H_ diff --git a/tensorflow/contrib/lite/kernels/optional_tensor_test.cc b/tensorflow/contrib/lite/kernels/optional_tensor_test.cc index bcad58406a..90a915bb02 100644 --- a/tensorflow/contrib/lite/kernels/optional_tensor_test.cc +++ b/tensorflow/contrib/lite/kernels/optional_tensor_test.cc @@ -95,8 +95,12 @@ class LSTMOpModel : public SingleOpModel { projection_bias_ = AddNullInput(); } - output_state_ = AddOutput(TensorType_FLOAT32); - cell_state_ = AddOutput(TensorType_FLOAT32); + // Adding the 2 input state tensors. + input_activation_state_ = + AddInput(TensorData{TensorType_FLOAT32, {n_output_ * n_batch_}}, true); + input_cell_state_ = + AddInput(TensorData{TensorType_FLOAT32, {n_cell_ * n_batch_}}, true); + output_ = AddOutput(TensorType_FLOAT32); SetBuiltinOp(BuiltinOperator_LSTM, BuiltinOptions_LSTMOptions, @@ -174,22 +178,6 @@ class LSTMOpModel : public SingleOpModel { PopulateTensor(projection_bias_, f); } - void ResetOutputState() { - const int zero_buffer_size = n_cell_ * n_batch_; - std::unique_ptr<float[]> zero_buffer(new float[zero_buffer_size]); - memset(zero_buffer.get(), 0, zero_buffer_size * sizeof(float)); - PopulateTensor(output_state_, 0, zero_buffer.get(), - zero_buffer.get() + zero_buffer_size); - } - - void ResetCellState() { - const int zero_buffer_size = n_cell_ * n_batch_; - std::unique_ptr<float[]> zero_buffer(new float[zero_buffer_size]); - memset(zero_buffer.get(), 0, zero_buffer_size * sizeof(float)); - PopulateTensor(cell_state_, 0, zero_buffer.get(), - zero_buffer.get() + zero_buffer_size); - } - void SetInput(int offset, float* begin, float* end) { PopulateTensor(input_, offset, begin, end); } @@ -228,10 +216,10 @@ class LSTMOpModel : public SingleOpModel { int projection_weights_; int projection_bias_; + int input_activation_state_; + int input_cell_state_; int output_; - int output_state_; - int cell_state_; int n_batch_; int n_input_; @@ -316,10 +304,6 @@ TEST(LSTMOpTest, BlackBoxTestWithCifgWithPeepholeNoProjectionNoClipping) { lstm.SetCellToOutputWeights( {-0.17135078, 0.82760304, 0.85573703, -0.77109635}); - // Resetting cell_state and output_state - lstm.ResetCellState(); - lstm.ResetOutputState(); - // Verify the model by unpacking it. 
lstm.Verify(); } diff --git a/tensorflow/contrib/lite/kernels/pack.cc b/tensorflow/contrib/lite/kernels/pack.cc new file mode 100644 index 0000000000..c368582ef7 --- /dev/null +++ b/tensorflow/contrib/lite/kernels/pack.cc @@ -0,0 +1,135 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/contrib/lite/c/builtin_op_data.h" +#include "tensorflow/contrib/lite/c/c_api_internal.h" +#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h" +#include "tensorflow/contrib/lite/kernels/internal/tensor.h" +#include "tensorflow/contrib/lite/kernels/kernel_util.h" + +namespace tflite { +namespace ops { +namespace builtin { +namespace pack { +namespace { + +constexpr int kOutputTensor = 0; + +TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { + const TfLitePackParams* data = + reinterpret_cast<TfLitePackParams*>(node->builtin_data); + + TF_LITE_ENSURE_EQ(context, NumInputs(node), data->values_count); + TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); + + const TfLiteTensor* input0 = GetInput(context, node, 0); + TF_LITE_ENSURE(context, NumDimensions(input0) < 4); + TF_LITE_ENSURE(context, NumDimensions(input0) >= data->axis); + // TODO(renjieliu): Support negative axis. + TF_LITE_ENSURE(context, data->axis >= 0); + if (input0->type != kTfLiteInt32 && input0->type != kTfLiteFloat32 && + input0->type != kTfLiteUInt8 && input0->type != kTfLiteInt16) { + context->ReportError(context, + "Currently pack only supports " + "float32/uint8/int16/int32."); + return kTfLiteError; + } + // Make sure all inputs have the same shape and type. + for (int i = 1; i < data->values_count; ++i) { + const TfLiteTensor* input = GetInput(context, node, i); + TF_LITE_ENSURE(context, HaveSameShapes(input0, input)); + TF_LITE_ENSURE_EQ(context, input0->type, input->type); + } + + // Resize output. rank R will become rank R + 1 + const int dimension_size = NumDimensions(input0) + 1; + const TfLiteIntArray* input_shape = input0->dims; + TfLiteIntArray* output_shape = TfLiteIntArrayCreate(dimension_size); + int i = 0; + for (int index = 0; index < dimension_size; ++index) { + if (index == data->axis) { + output_shape->data[index] = data->values_count; + } else { + output_shape->data[index] = input_shape->data[i++]; + } + } + + TfLiteTensor* output = GetOutput(context, node, kOutputTensor); + TF_LITE_ENSURE_EQ(context, output->type, input0->type); + + // Guarantee input/output quantization params match as we do not support + // packing quantized tensors. 
+ for (int i = 0; i < data->values_count; i++) { + const TfLiteTensor* input = GetInput(context, node, i); + TF_LITE_ENSURE_EQ(context, input->params.zero_point, + output->params.zero_point); + TF_LITE_ENSURE_EQ(context, input->params.scale, output->params.scale); + } + + return context->ResizeTensor(context, output, output_shape); +} + +template <typename T> +void PackImpl(TfLiteContext* context, TfLiteNode* node, TfLiteTensor* output, + int values_count, int axis) { + VectorOfTensors<T> all_inputs(*context, *node->inputs); + tflite::PackParams op_params; + op_params.axis = axis; + op_params.inputs_count = values_count; + + reference_ops::Pack<T>(op_params, all_inputs.shapes(), all_inputs.data(), + GetTensorShape(output), GetTensorData<T>(output)); +} + +TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { + const TfLitePackParams* data = + reinterpret_cast<TfLitePackParams*>(node->builtin_data); + + TfLiteTensor* output = GetOutput(context, node, kOutputTensor); + switch (output->type) { + case kTfLiteFloat32: { + PackImpl<float>(context, node, output, data->values_count, data->axis); + break; + } + case kTfLiteUInt8: { + PackImpl<uint8_t>(context, node, output, data->values_count, data->axis); + break; + } + case kTfLiteInt32: { + PackImpl<int32_t>(context, node, output, data->values_count, data->axis); + break; + } + default: { + context->ReportError(context, + "Currently pack only supports " + "float32/uint8/int32."); + return kTfLiteError; + } + } + + return kTfLiteOk; +} + +} // namespace +} // namespace pack + +TfLiteRegistration* Register_PACK() { + static TfLiteRegistration r = {nullptr, nullptr, pack::Prepare, pack::Eval}; + return &r; +} + +} // namespace builtin +} // namespace ops +} // namespace tflite diff --git a/tensorflow/contrib/lite/kernels/pack_test.cc b/tensorflow/contrib/lite/kernels/pack_test.cc new file mode 100644 index 0000000000..c70dbd2764 --- /dev/null +++ b/tensorflow/contrib/lite/kernels/pack_test.cc @@ -0,0 +1,154 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include <gtest/gtest.h> +#include "tensorflow/contrib/lite/interpreter.h" +#include "tensorflow/contrib/lite/kernels/register.h" +#include "tensorflow/contrib/lite/kernels/test_util.h" +#include "tensorflow/contrib/lite/model.h" + +namespace tflite { +namespace { + +using ::testing::ElementsAre; +using ::testing::ElementsAreArray; + +template <typename T> +class PackOpModel : public SingleOpModel { + public: + PackOpModel(const TensorData& input_template, int axis, int values_count) { + std::vector<std::vector<int>> all_input_shapes; + for (int i = 0; i < values_count; ++i) { + all_input_shapes.push_back(input_template.shape); + AddInput(input_template); + } + output_ = AddOutput({input_template.type, /*shape=*/{}, input_template.min, + input_template.max}); + SetBuiltinOp(BuiltinOperator_PACK, BuiltinOptions_PackOptions, + CreatePackOptions(builder_, values_count, axis).Union()); + BuildInterpreter(all_input_shapes); + } + + void SetInput(int index, std::initializer_list<T> data) { + PopulateTensor(index, data); + } + + std::vector<T> GetOutput() { return ExtractVector<T>(output_); } + std::vector<int> GetOutputShape() { return GetTensorShape(output_); } + + private: + int output_; +}; + +// float32 tests. +TEST(PackOpTest, FloatThreeInputs) { + PackOpModel<float> model({TensorType_FLOAT32, {2}}, 0, 3); + model.SetInput(0, {1, 4}); + model.SetInput(1, {2, 5}); + model.SetInput(2, {3, 6}); + model.Invoke(); + EXPECT_THAT(model.GetOutputShape(), ElementsAre(3, 2)); + EXPECT_THAT(model.GetOutput(), ElementsAreArray({1, 4, 2, 5, 3, 6})); +} + +TEST(PackOpTest, FloatThreeInputsDifferentAxis) { + PackOpModel<float> model({TensorType_FLOAT32, {2}}, 1, 3); + model.SetInput(0, {1, 4}); + model.SetInput(1, {2, 5}); + model.SetInput(2, {3, 6}); + model.Invoke(); + EXPECT_THAT(model.GetOutputShape(), ElementsAre(2, 3)); + EXPECT_THAT(model.GetOutput(), ElementsAreArray({1, 2, 3, 4, 5, 6})); +} + +TEST(PackOpTest, FloatMultilDimensions) { + PackOpModel<float> model({TensorType_FLOAT32, {2, 3}}, 1, 2); + model.SetInput(0, {1, 2, 3, 4, 5, 6}); + model.SetInput(1, {7, 8, 9, 10, 11, 12}); + model.Invoke(); + EXPECT_THAT(model.GetOutputShape(), ElementsAre(2, 2, 3)); + EXPECT_THAT(model.GetOutput(), + ElementsAreArray({1, 2, 3, 7, 8, 9, 4, 5, 6, 10, 11, 12})); +} + +// int32 tests. 
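// As the float cases above show, packing three {2}-shaped inputs along axis 0
// stacks them into shape {3, 2} ({1, 4, 2, 5, 3, 6}), while axis 1 interleaves
// them into shape {2, 3} ({1, 2, 3, 4, 5, 6}); the int32 and uint8 variants
// below exercise exactly the same layouts.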
+TEST(PackOpTest, Int32ThreeInputs) { + PackOpModel<int32_t> model({TensorType_INT32, {2}}, 0, 3); + model.SetInput(0, {1, 4}); + model.SetInput(1, {2, 5}); + model.SetInput(2, {3, 6}); + model.Invoke(); + EXPECT_THAT(model.GetOutputShape(), ElementsAre(3, 2)); + EXPECT_THAT(model.GetOutput(), ElementsAreArray({1, 4, 2, 5, 3, 6})); +} + +TEST(PackOpTest, Int32ThreeInputsDifferentAxis) { + PackOpModel<int32_t> model({TensorType_INT32, {2}}, 1, 3); + model.SetInput(0, {1, 4}); + model.SetInput(1, {2, 5}); + model.SetInput(2, {3, 6}); + model.Invoke(); + EXPECT_THAT(model.GetOutputShape(), ElementsAre(2, 3)); + EXPECT_THAT(model.GetOutput(), ElementsAreArray({1, 2, 3, 4, 5, 6})); +} + +TEST(PackOpTest, Int32MultilDimensions) { + PackOpModel<int32_t> model({TensorType_INT32, {2, 3}}, 1, 2); + model.SetInput(0, {1, 2, 3, 4, 5, 6}); + model.SetInput(1, {7, 8, 9, 10, 11, 12}); + model.Invoke(); + EXPECT_THAT(model.GetOutputShape(), ElementsAre(2, 2, 3)); + EXPECT_THAT(model.GetOutput(), + ElementsAreArray({1, 2, 3, 7, 8, 9, 4, 5, 6, 10, 11, 12})); +} + +// uint8 +TEST(PackOpTest, Uint8ThreeInputs) { + PackOpModel<uint8_t> model({TensorType_UINT8, {2}}, 0, 3); + model.SetInput(0, {1, 4}); + model.SetInput(1, {2, 5}); + model.SetInput(2, {3, 6}); + model.Invoke(); + EXPECT_THAT(model.GetOutputShape(), ElementsAre(3, 2)); + EXPECT_THAT(model.GetOutput(), ElementsAreArray({1, 4, 2, 5, 3, 6})); +} + +TEST(PackOpTest, Uint8ThreeInputsDifferentAxis) { + PackOpModel<uint8_t> model({TensorType_UINT8, {2}}, 1, 3); + model.SetInput(0, {1, 4}); + model.SetInput(1, {2, 5}); + model.SetInput(2, {3, 6}); + model.Invoke(); + EXPECT_THAT(model.GetOutputShape(), ElementsAre(2, 3)); + EXPECT_THAT(model.GetOutput(), ElementsAreArray({1, 2, 3, 4, 5, 6})); +} + +TEST(PackOpTest, Uint8MultilDimensions) { + PackOpModel<uint8_t> model({TensorType_UINT8, {2, 3}}, 1, 2); + model.SetInput(0, {1, 2, 3, 4, 5, 6}); + model.SetInput(1, {7, 8, 9, 10, 11, 12}); + model.Invoke(); + EXPECT_THAT(model.GetOutputShape(), ElementsAre(2, 2, 3)); + EXPECT_THAT(model.GetOutput(), + ElementsAreArray({1, 2, 3, 7, 8, 9, 4, 5, 6, 10, 11, 12})); +} + +} // namespace +} // namespace tflite + +int main(int argc, char** argv) { + ::tflite::LogToStderr(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/tensorflow/contrib/lite/kernels/pad.cc b/tensorflow/contrib/lite/kernels/pad.cc index ecac2dd5e3..0d939405f6 100644 --- a/tensorflow/contrib/lite/kernels/pad.cc +++ b/tensorflow/contrib/lite/kernels/pad.cc @@ -14,8 +14,8 @@ limitations under the License. ==============================================================================*/ #include <string.h> #include <vector> -#include "tensorflow/contrib/lite/builtin_op_data.h" -#include "tensorflow/contrib/lite/context.h" +#include "tensorflow/contrib/lite/c/builtin_op_data.h" +#include "tensorflow/contrib/lite/c/c_api_internal.h" #include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h" #include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h" #include "tensorflow/contrib/lite/kernels/internal/tensor.h" @@ -92,8 +92,8 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { op_context.constant_values->type); } - // TODO(nupurgarg): Our current implementations rely on the inputs being 4D. - TF_LITE_ENSURE_EQ(context, op_context.dims, 4); + // TODO(nupurgarg): Current implementations rely on the inputs being <= 4D. + TF_LITE_ENSURE(context, op_context.dims <= 4); // Exit early if paddings is a non-const tensor. 
Set output tensor to // dynamic so output size can be determined in Eval. @@ -128,18 +128,28 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { // TODO(nupurgarg): Change kernel implementation to use padding arrays in // forward order (depth, width, height, batch). // Build paddings in order of int[] = {batch, height, width, depth} to match - // kernel implementation of Pad in referenced_ops.h and optimized_ops.h. + // kernel implementation of Pad in reference_ops.h and optimized_ops.h. for (int idx = op_context.dims - 1; idx >= 0; --idx) { before_padding.push_back(paddings_data[idx * 2]); after_padding.push_back(paddings_data[idx * 2 + 1]); } -#define TF_LITE_PAD(type, scalar, pad_value) \ - type::PadV2(GetTensorData<scalar>(op_context.input), \ - GetTensorDims(op_context.input), before_padding, after_padding, \ - GetTensorData<scalar>(op_context.output), \ - GetTensorDims(op_context.output), pad_value) - +#define TF_LITE_PAD(type, scalar, pad_value) \ + TF_LITE_ENSURE(context, before_padding.size() <= 4); \ + TF_LITE_ENSURE(context, after_padding.size() <= 4); \ + tflite::PadParams op_params; \ + op_params.left_padding_count = before_padding.size(); \ + op_params.right_padding_count = after_padding.size(); \ + for (int i = 0; i < op_context.dims; ++i) { \ + op_params.left_padding[i] = before_padding[op_context.dims - 1 - i]; \ + op_params.right_padding[i] = after_padding[op_context.dims - 1 - i]; \ + } \ + const scalar pad_value_copy = pad_value; \ + \ + type::Pad(op_params, GetTensorShape(op_context.input), \ + GetTensorData<scalar>(op_context.input), &pad_value_copy, \ + GetTensorShape(op_context.output), \ + GetTensorData<scalar>(op_context.output)) switch (op_context.input->type) { case kTfLiteFloat32: { float pad_value = op_context.constant_values == nullptr @@ -199,7 +209,9 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } } break; default: - context->ReportError(context, "Type is currently not supported by Pad."); + context->ReportError(context, + "Type %d is currently not supported by Pad.", + op_context.input->type); return kTfLiteError; } #undef TF_LITE_PAD diff --git a/tensorflow/contrib/lite/kernels/pad_test.cc b/tensorflow/contrib/lite/kernels/pad_test.cc index f8b9064fbb..f663899713 100644 --- a/tensorflow/contrib/lite/kernels/pad_test.cc +++ b/tensorflow/contrib/lite/kernels/pad_test.cc @@ -193,7 +193,7 @@ TEST(PadOpTest, TooManyDimensions) { PadOpConstModel({TensorType_FLOAT32, {1, 2, 3, 4, 5, 6, 7, 8, 9}}, {9, 2}, {1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9}, {TensorType_FLOAT32}), - "dims != 4"); + "dims <= 4"); } TEST(PadOpTest, UnequalDimensions) { @@ -221,6 +221,15 @@ TEST(PadOpTest, SimpleConstTest) { EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 4, 1})); } +TEST(PadOpTest, SimpleConst1DTest) { + PadOpConstModel m({TensorType_FLOAT32, {2}}, {1, 2}, {1, 2}, + {TensorType_FLOAT32}); + m.SetInput({2, 3}); + m.Invoke(); + EXPECT_THAT(m.GetOutput(), ElementsAreArray({0, 2, 3, 0, 0})); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({5})); +} + TEST(PadOpTest, SimpleDynamicTest) { PadOpDynamicModel m({TensorType_FLOAT32, {1, 2, 2, 1}}, {4, 2}, {TensorType_FLOAT32}); @@ -334,7 +343,7 @@ TEST(PadV2OpTest, TooManyDimensions) { {TensorType_FLOAT32, {1, 2, 3, 4, 5, 6, 7, 8, 9}}, {9, 2}, {1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9}, 0.0, {TensorType_FLOAT32}), - "dims != 4"); + "dims <= 4"); } TEST(PadV2OpTest, UnequalDimensions) { diff --git a/tensorflow/contrib/lite/kernels/padding.h 
b/tensorflow/contrib/lite/kernels/padding.h index 3cb55f19a9..42b6b45d3b 100644 --- a/tensorflow/contrib/lite/kernels/padding.h +++ b/tensorflow/contrib/lite/kernels/padding.h @@ -15,7 +15,7 @@ limitations under the License. #ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_PADDING_H_ #define TENSORFLOW_CONTRIB_LITE_KERNELS_PADDING_H_ -#include "tensorflow/contrib/lite/builtin_op_data.h" +#include "tensorflow/contrib/lite/c/builtin_op_data.h" namespace tflite { diff --git a/tensorflow/contrib/lite/kernels/pooling.cc b/tensorflow/contrib/lite/kernels/pooling.cc index 645d9f4008..6451142391 100644 --- a/tensorflow/contrib/lite/kernels/pooling.cc +++ b/tensorflow/contrib/lite/kernels/pooling.cc @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include <unistd.h> #include <cassert> #include <cmath> #include <cstdio> @@ -20,8 +19,8 @@ limitations under the License. #include <iostream> #include <limits> -#include "tensorflow/contrib/lite/builtin_op_data.h" -#include "tensorflow/contrib/lite/context.h" +#include "tensorflow/contrib/lite/c/builtin_op_data.h" +#include "tensorflow/contrib/lite/c/c_api_internal.h" #include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h" #include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h" #include "tensorflow/contrib/lite/kernels/internal/tensor.h" @@ -80,24 +79,24 @@ TfLiteStatus GenericPrepare(TfLiteContext* context, TfLiteNode* node) { // Matching GetWindowedOutputSize in TensorFlow. auto padding = params->padding; - auto computeOutSize = [padding](int imageSize, int filterSize, - int stride) -> int { + auto compute_out_size = [padding](int image_size, int filter_size, + int stride) -> int { return padding == kTfLitePaddingSame - ? (imageSize + stride - 1) / stride + ? (image_size + stride - 1) / stride : padding == kTfLitePaddingValid - ? (imageSize - filterSize + stride) / stride + ? 
(image_size - filter_size + stride) / stride : 0; }; - int outWidth = - computeOutSize(width, params->filter_width, params->stride_width); - int outHeight = - computeOutSize(height, params->filter_height, params->stride_height); + int out_width = + compute_out_size(width, params->filter_width, params->stride_width); + int out_height = + compute_out_size(height, params->filter_height, params->stride_height); data->padding.height = ComputePadding(params->stride_height, 1, height, - params->filter_height, outHeight); + params->filter_height, out_height); data->padding.width = ComputePadding(params->stride_width, 1, width, - params->filter_width, outWidth); + params->filter_width, out_width); if (input->type == kTfLiteUInt8) { if (pool_type == kAverage || pool_type == kMax) { @@ -111,12 +110,12 @@ TfLiteStatus GenericPrepare(TfLiteContext* context, TfLiteNode* node) { } } - TfLiteIntArray* outputSize = TfLiteIntArrayCreate(4); - outputSize->data[0] = batches; - outputSize->data[1] = outHeight; - outputSize->data[2] = outWidth; - outputSize->data[3] = channels_out; - return context->ResizeTensor(context, output, outputSize); + TfLiteIntArray* output_size = TfLiteIntArrayCreate(4); + output_size->data[0] = batches; + output_size->data[1] = out_height; + output_size->data[2] = out_width; + output_size->data[3] = channels_out; + return context->ResizeTensor(context, output, output_size); } template <KernelType kernel_type> @@ -124,14 +123,21 @@ void AverageEvalFloat(TfLiteContext* context, TfLiteNode* node, TfLitePoolParams* params, OpData* data, const TfLiteTensor* input, TfLiteTensor* output) { float activation_min, activation_max; - CalculateActivationRangeFloat(params->activation, &activation_min, - &activation_max); -#define TF_LITE_AVERAGE_POOL(type) \ - type::AveragePool( \ - GetTensorData<float>(input), GetTensorDims(input), params->stride_width, \ - params->stride_height, data->padding.width, data->padding.height, \ - params->filter_width, params->filter_height, activation_min, \ - activation_max, GetTensorData<float>(output), GetTensorDims(output)) + CalculateActivationRange(params->activation, &activation_min, + &activation_max); +#define TF_LITE_AVERAGE_POOL(type) \ + tflite::PoolParams op_params; \ + op_params.stride_height = params->stride_height; \ + op_params.stride_width = params->stride_width; \ + op_params.filter_height = params->filter_height; \ + op_params.filter_width = params->filter_width; \ + op_params.padding_values.height = data->padding.height; \ + op_params.padding_values.width = data->padding.width; \ + op_params.float_activation_min = activation_min; \ + op_params.float_activation_max = activation_max; \ + type::AveragePool(op_params, GetTensorShape(input), \ + GetTensorData<float>(input), GetTensorShape(output), \ + GetTensorData<float>(output)) if (kernel_type == kReference) { TF_LITE_AVERAGE_POOL(reference_ops); } else { @@ -148,13 +154,19 @@ void AverageEvalQuantized(TfLiteContext* context, TfLiteNode* node, int32_t activation_max; CalculateActivationRangeUint8(params->activation, output, &activation_min, &activation_max); -#define TF_LITE_AVERAGE_POOL(type) \ - type::AveragePool(GetTensorData<uint8_t>(input), GetTensorDims(input), \ - params->stride_width, params->stride_height, \ - data->padding.width, data->padding.height, \ - params->filter_width, params->filter_height, \ - activation_min, activation_max, \ - GetTensorData<uint8_t>(output), GetTensorDims(output)) +#define TF_LITE_AVERAGE_POOL(type) \ + tflite::PoolParams op_params; \ + op_params.stride_height 
= params->stride_height; \ + op_params.stride_width = params->stride_width; \ + op_params.filter_height = params->filter_height; \ + op_params.filter_width = params->filter_width; \ + op_params.padding_values.height = data->padding.height; \ + op_params.padding_values.width = data->padding.width; \ + op_params.quantized_activation_min = activation_min; \ + op_params.quantized_activation_max = activation_max; \ + type::AveragePool(op_params, GetTensorShape(input), \ + GetTensorData<uint8_t>(input), GetTensorShape(output), \ + GetTensorData<uint8_t>(output)) if (kernel_type == kReference) { TF_LITE_AVERAGE_POOL(reference_ops); } else { @@ -168,14 +180,20 @@ void MaxEvalFloat(TfLiteContext* context, TfLiteNode* node, TfLitePoolParams* params, OpData* data, const TfLiteTensor* input, TfLiteTensor* output) { float activation_min, activation_max; - CalculateActivationRangeFloat(params->activation, &activation_min, - &activation_max); + CalculateActivationRange(params->activation, &activation_min, + &activation_max); #define TF_LITE_MAX_POOL(type) \ - type::MaxPool( \ - GetTensorData<float>(input), GetTensorDims(input), params->stride_width, \ - params->stride_height, data->padding.width, data->padding.height, \ - params->filter_width, params->filter_height, activation_min, \ - activation_max, GetTensorData<float>(output), GetTensorDims(output)) + tflite::PoolParams op_params; \ + op_params.stride_height = params->stride_height; \ + op_params.stride_width = params->stride_width; \ + op_params.filter_height = params->filter_height; \ + op_params.filter_width = params->filter_width; \ + op_params.padding_values.height = data->padding.height; \ + op_params.padding_values.width = data->padding.width; \ + op_params.float_activation_min = activation_min; \ + op_params.float_activation_max = activation_max; \ + type::MaxPool(op_params, GetTensorShape(input), GetTensorData<float>(input), \ + GetTensorShape(output), GetTensorData<float>(output)) if (kernel_type == kReference) { TF_LITE_MAX_POOL(reference_ops); } else { @@ -192,13 +210,19 @@ void MaxEvalQuantized(TfLiteContext* context, TfLiteNode* node, int32_t activation_max; CalculateActivationRangeUint8(params->activation, output, &activation_min, &activation_max); -#define TF_LITE_MAX_POOL(type) \ - type::MaxPool(GetTensorData<uint8_t>(input), GetTensorDims(input), \ - params->stride_width, params->stride_height, \ - data->padding.width, data->padding.height, \ - params->filter_width, params->filter_height, activation_min, \ - activation_max, GetTensorData<uint8_t>(output), \ - GetTensorDims(output)) +#define TF_LITE_MAX_POOL(type) \ + tflite::PoolParams op_params; \ + op_params.stride_height = params->stride_height; \ + op_params.stride_width = params->stride_width; \ + op_params.filter_height = params->filter_height; \ + op_params.filter_width = params->filter_width; \ + op_params.padding_values.height = data->padding.height; \ + op_params.padding_values.width = data->padding.width; \ + op_params.quantized_activation_min = activation_min; \ + op_params.quantized_activation_max = activation_max; \ + type::MaxPool(op_params, GetTensorShape(input), \ + GetTensorData<uint8_t>(input), GetTensorShape(output), \ + GetTensorData<uint8_t>(output)) if (kernel_type == kReference) { TF_LITE_MAX_POOL(reference_ops); } else { @@ -212,14 +236,20 @@ void L2EvalFloat(TfLiteContext* context, TfLiteNode* node, TfLitePoolParams* params, OpData* data, const TfLiteTensor* input, TfLiteTensor* output) { float activation_min, activation_max; - 
CalculateActivationRangeFloat(params->activation, &activation_min, - &activation_max); -#define TF_LITE_L2_POOL(type) \ - type::L2Pool( \ - GetTensorData<float>(input), GetTensorDims(input), params->stride_width, \ - params->stride_height, data->padding.width, data->padding.height, \ - params->filter_width, params->filter_height, activation_min, \ - activation_max, GetTensorData<float>(output), GetTensorDims(output)) + CalculateActivationRange(params->activation, &activation_min, + &activation_max); +#define TF_LITE_L2_POOL(type) \ + tflite::PoolParams op_params; \ + op_params.stride_height = params->stride_height; \ + op_params.stride_width = params->stride_width; \ + op_params.filter_height = params->filter_height; \ + op_params.filter_width = params->filter_width; \ + op_params.padding_values.height = data->padding.height; \ + op_params.padding_values.width = data->padding.width; \ + op_params.float_activation_min = activation_min; \ + op_params.float_activation_max = activation_max; \ + type::L2Pool(op_params, GetTensorShape(input), GetTensorData<float>(input), \ + GetTensorShape(output), GetTensorData<float>(output)) if (kernel_type == kReference) { TF_LITE_L2_POOL(reference_ops); } else { @@ -246,7 +276,8 @@ TfLiteStatus AverageEval(TfLiteContext* context, TfLiteNode* node) { output); break; default: - context->ReportError(context, "Type not currently supported."); + context->ReportError(context, "Type %d not currently supported.", + input->type); return kTfLiteError; } return kTfLiteOk; @@ -267,7 +298,8 @@ TfLiteStatus MaxEval(TfLiteContext* context, TfLiteNode* node) { MaxEvalQuantized<kernel_type>(context, node, params, data, input, output); break; default: - context->ReportError(context, "Type not currently supported."); + context->ReportError(context, "Type %d not currently supported.", + input->type); return kTfLiteError; } return kTfLiteOk; @@ -288,7 +320,8 @@ TfLiteStatus L2Eval(TfLiteContext* context, TfLiteNode* node) { // We don't have a quantized implementation, so just fall through to the // 'default' case. default: - context->ReportError(context, "Type not currently supported."); + context->ReportError(context, "Type %d not currently supported.", + input->type); return kTfLiteError; } return kTfLiteOk; diff --git a/tensorflow/contrib/lite/kernels/pow.cc b/tensorflow/contrib/lite/kernels/pow.cc new file mode 100644 index 0000000000..1e96cc80b1 --- /dev/null +++ b/tensorflow/contrib/lite/kernels/pow.cc @@ -0,0 +1,143 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include "tensorflow/contrib/lite/c/c_api_internal.h" +#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h" +#include "tensorflow/contrib/lite/kernels/internal/tensor.h" +#include "tensorflow/contrib/lite/kernels/kernel_util.h" +#include "tensorflow/contrib/lite/kernels/op_macros.h" + +namespace tflite { +namespace ops { +namespace builtin { +namespace pow { +namespace { + +// Input/output tensor index. +constexpr int kInputTensor1 = 0; +constexpr int kInputTensor2 = 1; +constexpr int kOutputTensor = 0; + +// Op data for pow op. +struct OpData { + bool requires_broadcast; +}; + +void* Init(TfLiteContext* context, const char* buffer, size_t length) { + auto* data = new OpData; + data->requires_broadcast = false; + return data; +} + +void Free(TfLiteContext* context, void* buffer) { + delete reinterpret_cast<OpData*>(buffer); +} + +TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { + TF_LITE_ENSURE_EQ(context, NumInputs(node), 2); + TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); + + OpData* data = reinterpret_cast<OpData*>(node->user_data); + + const TfLiteTensor* input1 = GetInput(context, node, kInputTensor1); + const TfLiteTensor* input2 = GetInput(context, node, kInputTensor2); + TfLiteTensor* output = GetOutput(context, node, kOutputTensor); + + TF_LITE_ENSURE_EQ(context, input1->type, input2->type); + + const TfLiteType type = input1->type; + if (type != kTfLiteInt32 && type != kTfLiteFloat32) { + context->ReportError(context, "Unsupported data type %d.", type); + return kTfLiteError; + } + output->type = type; + + data->requires_broadcast = !HaveSameShapes(input1, input2); + + TfLiteIntArray* output_size = nullptr; + if (data->requires_broadcast) { + TF_LITE_ENSURE_OK(context, CalculateShapeForBroadcast( + context, input1, input2, &output_size)); + } else { + output_size = TfLiteIntArrayCopy(input1->dims); + } + + return context->ResizeTensor(context, output, output_size); +} + +template <typename T> +void PowImpl(const TfLiteTensor* input1, const TfLiteTensor* input2, + TfLiteTensor* output, bool requires_broadcast) { + if (requires_broadcast) { + reference_ops::BroadcastPow4DSlow( + GetTensorShape(input1), GetTensorData<T>(input1), + GetTensorShape(input2), GetTensorData<T>(input2), + GetTensorShape(output), GetTensorData<T>(output)); + } else { + reference_ops::Pow(GetTensorShape(input1), GetTensorData<T>(input1), + GetTensorShape(input2), GetTensorData<T>(input2), + GetTensorShape(output), GetTensorData<T>(output)); + } +} + +TfLiteStatus CheckValue(TfLiteContext* context, const TfLiteTensor* input) { + const int64_t num_elements = NumElements(input); + const int32_t* data = GetTensorData<int32_t>(input); + for (int i = 0; i < num_elements; ++i) { + if (data[i] < 0) { + context->ReportError(context, + "POW does not support negative value for int32."); + return kTfLiteError; + } + } + return kTfLiteOk; +} + +TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { + OpData* data = reinterpret_cast<OpData*>(node->user_data); + + const TfLiteTensor* input1 = GetInput(context, node, kInputTensor1); + const TfLiteTensor* input2 = GetInput(context, node, kInputTensor2); + TfLiteTensor* output = GetOutput(context, node, kOutputTensor); + + switch (output->type) { + case kTfLiteInt32: { + // TensorFlow does not support negative for int32. 
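// (A negative exponent would in general yield a fractional result that an
// int32 output cannot represent, so CheckValue, defined above, scans input2
// and fails with an error before PowImpl runs.)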
+ TF_LITE_ENSURE_OK(context, CheckValue(context, input2)); + PowImpl<int32_t>(input1, input2, output, data->requires_broadcast); + break; + } + case kTfLiteFloat32: { + PowImpl<float>(input1, input2, output, data->requires_broadcast); + break; + } + default: { + context->ReportError(context, "Unsupported data type: %d", output->type); + return kTfLiteError; + } + } + return kTfLiteOk; +} + +} // namespace +} // namespace pow + +TfLiteRegistration* Register_POW() { + static TfLiteRegistration r = {pow::Init, pow::Free, pow::Prepare, pow::Eval}; + return &r; +} + +} // namespace builtin +} // namespace ops +} // namespace tflite diff --git a/tensorflow/contrib/lite/kernels/pow_test.cc b/tensorflow/contrib/lite/kernels/pow_test.cc new file mode 100644 index 0000000000..74b3aef5bd --- /dev/null +++ b/tensorflow/contrib/lite/kernels/pow_test.cc @@ -0,0 +1,117 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include <gtest/gtest.h> +#include "tensorflow/contrib/lite/interpreter.h" +#include "tensorflow/contrib/lite/kernels/register.h" +#include "tensorflow/contrib/lite/kernels/test_util.h" +#include "tensorflow/contrib/lite/model.h" + +namespace tflite { +namespace { + +using ::testing::ElementsAre; +using ::testing::ElementsAreArray; + +template <typename T> +class PowOpModel : public SingleOpModel { + public: + PowOpModel(const TensorData& input1, const TensorData& input2, + const TensorData& output) { + input1_ = AddInput(input1); + input2_ = AddInput(input2); + output_ = AddOutput(output); + SetBuiltinOp(BuiltinOperator_POW, BuiltinOptions_PowOptions, + CreatePowOptions(builder_).Union()); + BuildInterpreter({GetShape(input1_), GetShape(input2_)}); + } + + int input1() { return input1_; } + int input2() { return input2_; } + + std::vector<T> GetOutput() { return ExtractVector<T>(output_); } + std::vector<int> GetOutputShape() { return GetTensorShape(output_); } + + private: + int input1_; + int input2_; + int output_; +}; + +TEST(PowOpModel, Simple) { + PowOpModel<int32_t> model({TensorType_INT32, {1, 2, 2, 1}}, + {TensorType_INT32, {1, 2, 2, 1}}, + {TensorType_INT32, {}}); + model.PopulateTensor<int32_t>(model.input1(), {12, 2, 7, 8}); + model.PopulateTensor<int32_t>(model.input2(), {1, 2, 3, 1}); + model.Invoke(); + EXPECT_THAT(model.GetOutputShape(), ElementsAre(1, 2, 2, 1)); + EXPECT_THAT(model.GetOutput(), ElementsAre(12, 4, 343, 8)); +} + +TEST(PowOpModel, NegativeAndZeroValue) { + PowOpModel<int32_t> model({TensorType_INT32, {1, 2, 2, 1}}, + {TensorType_INT32, {1, 2, 2, 1}}, + {TensorType_INT32, {}}); + model.PopulateTensor<int32_t>(model.input1(), {0, 2, -7, 8}); + model.PopulateTensor<int32_t>(model.input2(), {1, 2, 3, 0}); + model.Invoke(); + EXPECT_THAT(model.GetOutputShape(), ElementsAre(1, 2, 2, 1)); + EXPECT_THAT(model.GetOutput(), ElementsAre(0, 4, -343, 1)); +} + +TEST(PowOpModel, Float) { + PowOpModel<float> model({TensorType_FLOAT32, {1, 2, 2, 1}}, + 
{TensorType_FLOAT32, {1, 2, 2, 1}}, + {TensorType_FLOAT32, {}}); + model.PopulateTensor<float>(model.input1(), {0.3, 0.4, 0.7, 5.8}); + model.PopulateTensor<float>(model.input2(), {0.5, 2.7, 3.1, 3.2}); + model.Invoke(); + EXPECT_THAT(model.GetOutputShape(), ElementsAre(1, 2, 2, 1)); + EXPECT_THAT(model.GetOutput(), + ElementsAreArray(ArrayFloatNear( + {0.5477226, 0.08424846, 0.33098164, 277.313}, 1e-3))); +} + +TEST(PowOpModel, NegativeFloatTest) { + PowOpModel<float> model({TensorType_FLOAT32, {1, 2, 2, 1}}, + {TensorType_FLOAT32, {1, 2, 2, 1}}, + {TensorType_FLOAT32, {}}); + model.PopulateTensor<float>(model.input1(), {0.3, 0.4, 0.7, 5.8}); + model.PopulateTensor<float>(model.input2(), {0.5, -2.7, 3.1, -3.2}); + model.Invoke(); + EXPECT_THAT(model.GetOutputShape(), ElementsAre(1, 2, 2, 1)); + EXPECT_THAT(model.GetOutput(), + ElementsAreArray(ArrayFloatNear( + {0.5477226, 11.869653, 0.33098164, 0.003606}, 1e-3))); +} + +TEST(PowOpModel, BroadcastTest) { + PowOpModel<int32_t> model({TensorType_INT32, {1, 2, 2, 1}}, + {TensorType_INT32, {1}}, {TensorType_INT32, {}}); + model.PopulateTensor<int32_t>(model.input1(), {12, 2, 7, 8}); + model.PopulateTensor<int32_t>(model.input2(), {4}); + model.Invoke(); + EXPECT_THAT(model.GetOutputShape(), ElementsAre(1, 2, 2, 1)); + EXPECT_THAT(model.GetOutput(), ElementsAre(20736, 16, 2401, 4096)); +} + +} // namespace +} // namespace tflite + +int main(int argc, char** argv) { + ::tflite::LogToStderr(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/tensorflow/contrib/lite/kernels/reduce.cc b/tensorflow/contrib/lite/kernels/reduce.cc new file mode 100644 index 0000000000..4732a37a65 --- /dev/null +++ b/tensorflow/contrib/lite/kernels/reduce.cc @@ -0,0 +1,513 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include <string.h> +#include <limits> +#include <vector> +#include "tensorflow/contrib/lite/c/builtin_op_data.h" +#include "tensorflow/contrib/lite/c/c_api_internal.h" +#include "tensorflow/contrib/lite/kernels/internal/quantization_util.h" +#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h" +#include "tensorflow/contrib/lite/kernels/internal/tensor.h" +#include "tensorflow/contrib/lite/kernels/kernel_util.h" +#include "tensorflow/contrib/lite/kernels/op_macros.h" + +namespace tflite { +namespace ops { +namespace builtin { +namespace reduce { + +// This file has reference implementation of reduce_* operators. 
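// Rough structure of this file: Init() reserves three scratch tensors via
// context->AddTensors (a dim-index buffer, the resolved axis, and a temp sum),
// the Prepare* functions resize them and the output whenever the axis tensor
// is constant, and the Eval* functions dispatch on input type and ReduceType,
// deferring the resizing to evaluation time when the axis is dynamic.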
+enum KernelType { + kReference, +}; + +struct OpContext { + OpContext(TfLiteContext* context, TfLiteNode* node) { + params = reinterpret_cast<TfLiteReducerParams*>(node->builtin_data); + input = GetInput(context, node, 0); + axis = GetInput(context, node, 1); + output = GetOutput(context, node, 0); + } + TfLiteReducerParams* params; + const TfLiteTensor* input; + const TfLiteTensor* axis; + TfLiteTensor* output; +}; + +void* Init(TfLiteContext* context, const char* buffer, size_t length) { + // Creates two temp tensors to store index and axis for internal + // implementation only. + auto* scratch_tensor_index = new int; + context->AddTensors(context, 3, scratch_tensor_index); + return scratch_tensor_index; +} + +void Free(TfLiteContext* context, void* buffer) { + delete reinterpret_cast<int*>(buffer); +} + +// Resizes the temp tensor that stores resolved axis. +TfLiteStatus ResizeTempAxis(TfLiteContext* context, OpContext* op_context, + TfLiteTensor* resolved_axis) { + TfLiteIntArray* axis_size = TfLiteIntArrayCreate(1); + axis_size->data[0] = static_cast<int>(NumElements(op_context->axis)); + return context->ResizeTensor(context, resolved_axis, axis_size); +} + +// Resizes the temp tensor that stores temp sum of reduced elements. +TfLiteStatus ResizeTempSum(TfLiteContext* context, OpContext* op_context, + TfLiteTensor* temp_sum) { + TfLiteIntArray* size = TfLiteIntArrayCreate(1); + size->data[0] = static_cast<int>(NumElements(op_context->output)); + return context->ResizeTensor(context, temp_sum, size); +} + +// Resizes output array based on the input size and resolved axis. +TfLiteStatus ResizeOutputTensor(TfLiteContext* context, OpContext* op_context) { + size_t num_axis = NumElements(op_context->axis); + const TfLiteIntArray* input_dims = op_context->input->dims; + int input_num_dims = NumDimensions(op_context->input); + if (input_num_dims == 0) { + return context->ResizeTensor(context, op_context->output, + TfLiteIntArrayCreate(0)); + } + const int* axis = GetTensorData<int>(op_context->axis); + if (op_context->params->keep_dims) { + TfLiteIntArray* output_dims = TfLiteIntArrayCreate(input_num_dims); + for (int idx = 0; idx < input_num_dims; ++idx) { + bool is_axis = false; + for (int axis_idx = 0; axis_idx < num_axis; ++axis_idx) { + if (axis[axis_idx] == idx || axis[axis_idx] + input_num_dims == idx) { + is_axis = true; + break; + } + } + if (is_axis) { + output_dims->data[idx] = 1; + } else { + output_dims->data[idx] = input_dims->data[idx]; + } + } + return context->ResizeTensor(context, op_context->output, output_dims); + } else { + // Calculates size of reducing axis. + int num_reduce_axis = num_axis; + for (int i = 0; i < num_axis; ++i) { + int current = axis[i]; + if (current < 0) { + current += input_num_dims; + } + TF_LITE_ENSURE(context, current >= 0 && current < input_num_dims); + for (int j = 0; j < i; ++j) { + int previous = axis[j]; + if (previous < 0) { + previous += input_num_dims; + } + if (current == previous) { + --num_reduce_axis; + break; + } + } + } + // Determines output dimensions. 
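// For example (keep_dims == false): a {2, 3, 4} input reduced over axis {1}
// keeps dims 0 and 2 and becomes {2, 4}; duplicate or negative axis entries
// have already been folded into num_reduce_axis above.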
+ TfLiteIntArray* output_dims = + TfLiteIntArrayCreate(input_num_dims - num_reduce_axis); + int num_skip_axis = 0; + for (int idx = 0; idx < input_num_dims; ++idx) { + bool is_axis = false; + for (int axis_idx = 0; axis_idx < num_axis; ++axis_idx) { + if (axis[axis_idx] == idx || axis[axis_idx] + input_num_dims == idx) { + ++num_skip_axis; + is_axis = true; + break; + } + } + if (!is_axis) { + output_dims->data[idx - num_skip_axis] = input_dims->data[idx]; + } + } + return context->ResizeTensor(context, op_context->output, output_dims); + } +} + +// Initializes temp tensors to store index and resolved axis. +TfLiteStatus InitializeTemporaries(TfLiteContext* context, TfLiteNode* node, + OpContext* op_context) { + // Creates a temp index to iterate through input data. + int* scratch_tensor_index = reinterpret_cast<int*>(node->user_data); + TfLiteIntArrayFree(node->temporaries); + node->temporaries = TfLiteIntArrayCreate(3); + node->temporaries->data[0] = *scratch_tensor_index; + TfLiteTensor* scratch_tensor = GetTemporary(context, node, /*index=*/0); + scratch_tensor->type = kTfLiteInt32; + scratch_tensor->allocation_type = kTfLiteArenaRw; + TfLiteIntArray* index_size = TfLiteIntArrayCreate(1); + index_size->data[0] = NumDimensions(op_context->input); + TF_LITE_ENSURE_OK(context, + context->ResizeTensor(context, scratch_tensor, index_size)); + + // Creates a temp tensor to store resolved axis given input data. + node->temporaries->data[1] = *scratch_tensor_index + 1; + TfLiteTensor* resolved_axis = GetTemporary(context, node, /*index=*/1); + resolved_axis->type = kTfLiteInt32; + // Creates a temp tensor to store temp sums when calculating mean. + node->temporaries->data[2] = *scratch_tensor_index + 2; + TfLiteTensor* temp_sum = GetTemporary(context, node, /*index=*/2); + switch (op_context->input->type) { + case kTfLiteFloat32: + temp_sum->type = kTfLiteFloat32; + break; + case kTfLiteInt32: + temp_sum->type = kTfLiteInt64; + break; + case kTfLiteInt64: + temp_sum->type = kTfLiteInt64; + break; + case kTfLiteUInt8: + temp_sum->type = kTfLiteInt32; + break; + case kTfLiteBool: + temp_sum->type = kTfLiteBool; + break; + default: + return kTfLiteError; + } + return kTfLiteOk; +} + +TfLiteStatus PrepareSimple(TfLiteContext* context, TfLiteNode* node) { + TF_LITE_ENSURE_EQ(context, NumInputs(node), 2); + TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); + + OpContext op_context(context, node); + TF_LITE_ENSURE_OK(context, InitializeTemporaries(context, node, &op_context)); + + TfLiteTensor* resolved_axis = GetTemporary(context, node, /*index=*/1); + // Leaves work to Eval if axis is not constant; else resizes output. + if (!IsConstantTensor(op_context.axis)) { + SetTensorToDynamic(op_context.output); + SetTensorToDynamic(resolved_axis); + return kTfLiteOk; + } + resolved_axis->allocation_type = kTfLiteArenaRw; + TF_LITE_ENSURE_OK(context, + ResizeTempAxis(context, &op_context, resolved_axis)); + TF_LITE_ENSURE_OK(context, ResizeOutputTensor(context, &op_context)); + return kTfLiteOk; +} + +TfLiteStatus PrepareAny(TfLiteContext* context, TfLiteNode* node) { + TF_LITE_ENSURE_EQ(context, NumInputs(node), 2); + const TfLiteTensor* input = GetInput(context, node, 0); + TF_LITE_ENSURE_EQ(context, input->type, kTfLiteBool); + return PrepareSimple(context, node); +} + +TfLiteStatus PrepareMeanOrSum(TfLiteContext* context, TfLiteNode* node) { + TF_LITE_ENSURE_OK(context, PrepareSimple(context, node)); + + // reduce_mean requires a buffer to store intermediate sum result. 
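// The buffer's element type follows InitializeTemporaries above: float stays
// float, int32/int64 accumulate into int64, and uint8 accumulates into int32
// so the sum can later be rescaled to the output's quantization parameters.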
+ OpContext op_context(context, node); + TfLiteTensor* temp_sum = GetTemporary(context, node, /*index=*/2); + if (!IsConstantTensor(op_context.axis)) { + SetTensorToDynamic(temp_sum); + return kTfLiteOk; + } + temp_sum->allocation_type = kTfLiteArenaRw; + return ResizeTempSum(context, &op_context, temp_sum); +} + +template <KernelType kernel_type> +TfLiteStatus EvalMean(TfLiteContext* context, TfLiteNode* node) { + OpContext op_context(context, node); + int num_axis = static_cast<int>(NumElements(op_context.axis)); + TfLiteTensor* temp_index = GetTemporary(context, node, /*index=*/0); + TfLiteTensor* resolved_axis = GetTemporary(context, node, /*index=*/1); + TfLiteTensor* temp_sum = GetTemporary(context, node, /*index=*/2); + // Resize the output tensor if the output tensor is dynamic. + if (IsDynamicTensor(op_context.output)) { + TF_LITE_ENSURE_OK(context, + ResizeTempAxis(context, &op_context, resolved_axis)); + TF_LITE_ENSURE_OK(context, ResizeOutputTensor(context, &op_context)); + TF_LITE_ENSURE_OK(context, ResizeTempSum(context, &op_context, temp_sum)); + } + +#define TF_LITE_MEAN(kernel_type, data_type, temp_data_type) \ + kernel_type::Mean<>( \ + GetTensorData<data_type>(op_context.input), \ + op_context.input->dims->data, op_context.input->dims->size, \ + GetTensorData<data_type>(op_context.output), \ + op_context.output->dims->data, op_context.output->dims->size, \ + GetTensorData<int>(op_context.axis), num_axis, \ + op_context.params->keep_dims, GetTensorData<int>(temp_index), \ + GetTensorData<int>(resolved_axis), \ + GetTensorData<temp_data_type>(temp_sum)) + + if (kernel_type == kReference) { + switch (op_context.input->type) { + case kTfLiteFloat32: + TF_LITE_ENSURE(context, TF_LITE_MEAN(reference_ops, float, float)); + break; + case kTfLiteInt32: + TF_LITE_ENSURE(context, TF_LITE_MEAN(reference_ops, int, int64_t)); + break; + case kTfLiteInt64: + TF_LITE_ENSURE(context, TF_LITE_MEAN(reference_ops, int64_t, int64_t)); + break; + case kTfLiteUInt8: + if (op_context.input->params.zero_point == + op_context.output->params.zero_point && + op_context.input->params.scale == op_context.output->params.scale) { + TF_LITE_ENSURE(context, TF_LITE_MEAN(reference_ops, uint8_t, int)); + } else { + TF_LITE_ENSURE( + context, + reference_ops::QuantizedMeanOrSum<>( + GetTensorData<uint8_t>(op_context.input), + op_context.input->params.zero_point, + op_context.input->params.scale, op_context.input->dims->data, + op_context.input->dims->size, + GetTensorData<uint8_t>(op_context.output), + op_context.output->params.zero_point, + op_context.output->params.scale, + op_context.output->dims->data, op_context.output->dims->size, + GetTensorData<int>(op_context.axis), num_axis, + op_context.params->keep_dims, GetTensorData<int>(temp_index), + GetTensorData<int>(resolved_axis), + GetTensorData<int>(temp_sum), /*compute_sum=*/false)); + } + break; + default: + return kTfLiteError; + } + } +#undef TF_LITE_MEAN + return kTfLiteOk; +} + +// The underlying logic for Reduce Sum/Prod/Max/Min/Any +template <typename T> +TfLiteStatus EvalLogic(TfLiteContext* context, TfLiteNode* node, + OpContext* op_context, T init_value, + T reducer(const T current, const T in)) { + int64_t num_axis = NumElements(op_context->axis); + TfLiteTensor* temp_index = GetTemporary(context, node, /*index=*/0); + TfLiteTensor* resolved_axis = GetTemporary(context, node, /*index=*/1); + // Resize the output tensor if the output tensor is dynamic. 
+ if (IsDynamicTensor(op_context->output)) { + TF_LITE_ENSURE_OK(context, + ResizeTempAxis(context, op_context, resolved_axis)); + TF_LITE_ENSURE_OK(context, ResizeOutputTensor(context, op_context)); + } + if (op_context->input->type == kTfLiteUInt8) { + TF_LITE_ENSURE_EQ(context, op_context->input->params.scale, + op_context->output->params.scale); + TF_LITE_ENSURE_EQ(context, op_context->input->params.zero_point, + op_context->output->params.zero_point); + } + TF_LITE_ENSURE( + context, + reference_ops::ReduceGeneric<T>( + GetTensorData<T>(op_context->input), op_context->input->dims->data, + op_context->input->dims->size, GetTensorData<T>(op_context->output), + op_context->output->dims->data, op_context->output->dims->size, + GetTensorData<int>(op_context->axis), num_axis, + op_context->params->keep_dims, GetTensorData<int>(temp_index), + GetTensorData<int>(resolved_axis), init_value, reducer)); + return kTfLiteOk; +} + +enum ReduceType { + kSum, + kProd, + kMax, + kMin, + kAny, +}; + +// Eval for determined input type and reduce type. +template <typename T> +TfLiteStatus EvalType(TfLiteContext* context, TfLiteNode* node, + OpContext* op_context, ReduceType reduce_type) { + switch (reduce_type) { + case kSum: + return EvalLogic<T>( + context, node, op_context, static_cast<T>(0), + [](const T current, const T in) -> T { return in + current; }); + break; + case kProd: + return EvalLogic<T>( + context, node, op_context, static_cast<T>(1), + [](const T current, const T in) -> T { return in * current; }); + break; + case kMax: + return EvalLogic<T>(context, node, op_context, + std::numeric_limits<T>::lowest(), + [](const T current, const T in) -> T { + return (in > current) ? in : current; + }); + break; + case kMin: + return EvalLogic<T>(context, node, op_context, + std::numeric_limits<T>::max(), + [](const T current, const T in) -> T { + return (in < current) ? in : current; + }); + break; + default: + return kTfLiteError; + } +} + +// Template specialization for bool type +template <> +TfLiteStatus EvalType<bool>(TfLiteContext* context, TfLiteNode* node, + OpContext* op_context, ReduceType reduce_type) { + switch (reduce_type) { + case kAny: + return EvalLogic<bool>(context, node, op_context, false, + [](const bool current, const bool in) -> bool { + return in || current; + }); + break; + default: + return kTfLiteError; + } +} + +// The entry point that handles input types and then calls template functions to +// handle ReduceType. 
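// For instance, EvalGeneric<kReference, kSum> on a float tensor routes through
// EvalType<float>, which calls EvalLogic<float> with init_value 0 and the
// lambda `in + current` as the reducer; kAny is only valid for bool inputs via
// the EvalType<bool> specialization above.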
+template <KernelType kernel_type, ReduceType reduce_type> +TfLiteStatus EvalGeneric(TfLiteContext* context, TfLiteNode* node) { + if (kernel_type != kReference) { + return kTfLiteOk; + } + OpContext op_context(context, node); + switch (op_context.input->type) { + case kTfLiteFloat32: + return EvalType<float>(context, node, &op_context, reduce_type); + break; + case kTfLiteInt32: + return EvalType<int>(context, node, &op_context, reduce_type); + break; + case kTfLiteInt64: + return EvalType<int64_t>(context, node, &op_context, reduce_type); + break; + case kTfLiteUInt8: + return EvalType<uint8_t>(context, node, &op_context, reduce_type); + break; + case kTfLiteBool: + return EvalType<bool>(context, node, &op_context, reduce_type); + break; + default: + return kTfLiteError; + } +} + +TfLiteStatus EvalSum(TfLiteContext* context, TfLiteNode* node) { + OpContext op_context(context, node); + const auto& input = op_context.input; + const auto& output = op_context.output; + if (input->type != kTfLiteUInt8 || + (input->params.scale == output->params.scale && + input->params.zero_point == output->params.zero_point)) { + return EvalGeneric<kReference, kSum>(context, node); + } else { + // Rescaling 8bit reduce sum. + int num_axis = static_cast<int>(NumElements(op_context.axis)); + TfLiteTensor* temp_index = GetTemporary(context, node, /*index=*/0); + TfLiteTensor* resolved_axis = GetTemporary(context, node, /*index=*/1); + TfLiteTensor* temp_sum = GetTemporary(context, node, /*index=*/2); + // Resize the output tensor if the output tensor is dynamic. + if (IsDynamicTensor(op_context.output)) { + TF_LITE_ENSURE_OK(context, + ResizeTempAxis(context, &op_context, resolved_axis)); + TF_LITE_ENSURE_OK(context, ResizeOutputTensor(context, &op_context)); + TF_LITE_ENSURE_OK(context, ResizeTempSum(context, &op_context, temp_sum)); + } + + TF_LITE_ENSURE( + context, + reference_ops::QuantizedMeanOrSum<>( + GetTensorData<uint8_t>(op_context.input), + op_context.input->params.zero_point, op_context.input->params.scale, + op_context.input->dims->data, op_context.input->dims->size, + GetTensorData<uint8_t>(op_context.output), + op_context.output->params.zero_point, + op_context.output->params.scale, op_context.output->dims->data, + op_context.output->dims->size, GetTensorData<int>(op_context.axis), + num_axis, op_context.params->keep_dims, + GetTensorData<int>(temp_index), GetTensorData<int>(resolved_axis), + GetTensorData<int32>(temp_sum), /*compute_sum=*/true)); + } + + return kTfLiteOk; +} +} // namespace reduce + +TfLiteRegistration* Register_MEAN_REF() { + static TfLiteRegistration r = {reduce::Init, reduce::Free, + reduce::PrepareMeanOrSum, + reduce::EvalMean<reduce::kReference>}; + return &r; +} + +TfLiteRegistration* Register_SUM_REF() { + static TfLiteRegistration r = {reduce::Init, reduce::Free, + reduce::PrepareMeanOrSum, reduce::EvalSum}; + return &r; +} + +TfLiteRegistration* Register_REDUCE_PROD_REF() { + static TfLiteRegistration r = { + reduce::Init, reduce::Free, reduce::PrepareSimple, + reduce::EvalGeneric<reduce::kReference, reduce::kProd>}; + return &r; +} + +TfLiteRegistration* Register_REDUCE_MAX_REF() { + static TfLiteRegistration r = { + reduce::Init, reduce::Free, reduce::PrepareSimple, + reduce::EvalGeneric<reduce::kReference, reduce::kMax>}; + return &r; +} + +TfLiteRegistration* Register_REDUCE_MIN_REF() { + static TfLiteRegistration r = { + reduce::Init, reduce::Free, reduce::PrepareSimple, + reduce::EvalGeneric<reduce::kReference, reduce::kMin>}; + return &r; +} + 
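// A minimal consumption sketch (assuming register.cc wires these entries up to
// the corresponding BuiltinOperator codes, as it does for the other builtin
// ops): the interpreter resolves the registration by builtin code, runs
// registration->prepare when tensors are (re)allocated, and calls
// registration->invoke on each Interpreter::Invoke().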
+TfLiteRegistration* Register_REDUCE_ANY_REF() { + static TfLiteRegistration r = { + reduce::Init, reduce::Free, reduce::PrepareAny, + reduce::EvalGeneric<reduce::kReference, reduce::kAny>}; + return &r; +} + +// TODO(kanlig): add optimized implementation of Mean. +TfLiteRegistration* Register_MEAN() { return Register_MEAN_REF(); } +TfLiteRegistration* Register_SUM() { return Register_SUM_REF(); } +TfLiteRegistration* Register_REDUCE_PROD() { + return Register_REDUCE_PROD_REF(); +} +TfLiteRegistration* Register_REDUCE_MAX() { return Register_REDUCE_MAX_REF(); } +TfLiteRegistration* Register_REDUCE_MIN() { return Register_REDUCE_MIN_REF(); } +TfLiteRegistration* Register_REDUCE_ANY() { return Register_REDUCE_ANY_REF(); } + +} // namespace builtin +} // namespace ops +} // namespace tflite diff --git a/tensorflow/contrib/lite/kernels/reduce_test.cc b/tensorflow/contrib/lite/kernels/reduce_test.cc new file mode 100644 index 0000000000..fb2ec58ab2 --- /dev/null +++ b/tensorflow/contrib/lite/kernels/reduce_test.cc @@ -0,0 +1,975 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include <gtest/gtest.h> +#include "tensorflow/contrib/lite/interpreter.h" +#include "tensorflow/contrib/lite/kernels/register.h" +#include "tensorflow/contrib/lite/kernels/test_util.h" +#include "tensorflow/contrib/lite/model.h" + +namespace tflite { +namespace { + +using ::testing::ElementsAreArray; +using ::testing::IsEmpty; + +class BaseOpModel : public SingleOpModel { + public: + void SetAxis(const std::vector<int>& data) { PopulateTensor(axis_, data); } + + template <class T> + void SetInput(std::vector<T> data) { + PopulateTensor(input_, data); + } + + template <class T> + std::vector<T> GetOutput() { + return ExtractVector<T>(output_); + } + + std::vector<float> GetDequantizedOutput() { + return Dequantize<uint8_t>(ExtractVector<uint8_t>(output_), + GetScale(output_), GetZeroPoint(output_)); + } + + std::vector<int> GetOutputShape() { return GetTensorShape(output_); } + + int Input() { return input_; } + + protected: + int input_; + int axis_; + int output_; +}; + +// Model for the tests case where axis is a const tensor. +class MeanOpConstModel : public BaseOpModel { + public: + MeanOpConstModel(const TensorData& input, const TensorData& output, + std::initializer_list<int> axis_shape, + std::initializer_list<int> axis, bool keep_dims) { + input_ = AddInput(input); + axis_ = AddConstInput(TensorType_INT32, axis, axis_shape); + output_ = AddOutput(output); + SetBuiltinOp(BuiltinOperator_MEAN, BuiltinOptions_ReducerOptions, + CreateReducerOptions(builder_, keep_dims).Union()); + BuildInterpreter({GetShape(input_)}); + } +}; + +// Model for the tests case where axis is a dynamic tensor. 
+class MeanOpDynamicModel : public BaseOpModel { + public: + MeanOpDynamicModel(const TensorData& input, const TensorData& output, + const TensorData& axis, bool keep_dims) { + input_ = AddInput(input); + axis_ = AddInput(axis); + output_ = AddOutput(output); + SetBuiltinOp(BuiltinOperator_MEAN, BuiltinOptions_ReducerOptions, + CreateReducerOptions(builder_, keep_dims).Union()); + BuildInterpreter({GetShape(input_)}); + } +}; + +// Model for the tests case where axis is a const tensor. +class SumOpConstModel : public BaseOpModel { + public: + SumOpConstModel(const TensorData& input, const TensorData& output, + std::initializer_list<int> axis_shape, + std::initializer_list<int> axis, bool keep_dims) { + input_ = AddInput(input); + axis_ = AddConstInput(TensorType_INT32, axis, axis_shape); + output_ = AddOutput(output); + SetBuiltinOp(BuiltinOperator_SUM, BuiltinOptions_ReducerOptions, + CreateReducerOptions(builder_, keep_dims).Union()); + BuildInterpreter({GetShape(input_)}); + } +}; + +// Model for the tests case where axis is a dynamic tensor. +class SumOpDynamicModel : public BaseOpModel { + public: + SumOpDynamicModel(const TensorData& input, const TensorData& output, + const TensorData& axis, bool keep_dims) { + input_ = AddInput(input); + axis_ = AddInput(axis); + output_ = AddOutput(output); + SetBuiltinOp(BuiltinOperator_SUM, BuiltinOptions_ReducerOptions, + CreateReducerOptions(builder_, keep_dims).Union()); + BuildInterpreter({GetShape(input_)}); + } +}; + +// Model for the tests case where axis is a const tensor. +class ProdOpConstModel : public BaseOpModel { + public: + ProdOpConstModel(const TensorData& input, const TensorData& output, + std::initializer_list<int> axis_shape, + std::initializer_list<int> axis, bool keep_dims) { + input_ = AddInput(input); + axis_ = AddConstInput(TensorType_INT32, axis, axis_shape); + output_ = AddOutput(output); + SetBuiltinOp(BuiltinOperator_REDUCE_PROD, BuiltinOptions_ReducerOptions, + CreateReducerOptions(builder_, keep_dims).Union()); + BuildInterpreter({GetShape(input_)}); + } +}; + +// Model for the tests case where axis is a dynamic tensor. +class ProdOpDynamicModel : public BaseOpModel { + public: + ProdOpDynamicModel(const TensorData& input, const TensorData& output, + const TensorData& axis, bool keep_dims) { + input_ = AddInput(input); + axis_ = AddInput(axis); + output_ = AddOutput(output); + SetBuiltinOp(BuiltinOperator_REDUCE_PROD, BuiltinOptions_ReducerOptions, + CreateReducerOptions(builder_, keep_dims).Union()); + BuildInterpreter({GetShape(input_)}); + } +}; + +// Model for the tests case where axis is a const tensor. +class MaxOpConstModel : public BaseOpModel { + public: + MaxOpConstModel(const TensorData& input, const TensorData& output, + std::initializer_list<int> axis_shape, + std::initializer_list<int> axis, bool keep_dims) { + input_ = AddInput(input); + axis_ = AddConstInput(TensorType_INT32, axis, axis_shape); + output_ = AddOutput(output); + SetBuiltinOp(BuiltinOperator_REDUCE_MAX, BuiltinOptions_ReducerOptions, + CreateReducerOptions(builder_, keep_dims).Union()); + BuildInterpreter({GetShape(input_)}); + } +}; + +// Model for the tests case where axis is a dynamic tensor. 
+class MaxOpDynamicModel : public BaseOpModel { + public: + MaxOpDynamicModel(const TensorData& input, const TensorData& output, + const TensorData& axis, bool keep_dims) { + input_ = AddInput(input); + axis_ = AddInput(axis); + output_ = AddOutput(output); + SetBuiltinOp(BuiltinOperator_REDUCE_MAX, BuiltinOptions_ReducerOptions, + CreateReducerOptions(builder_, keep_dims).Union()); + BuildInterpreter({GetShape(input_)}); + } +}; + +// Model for the tests case where axis is a const tensor. +class MinOpConstModel : public BaseOpModel { + public: + MinOpConstModel(const TensorData& input, const TensorData& output, + std::initializer_list<int> axis_shape, + std::initializer_list<int> axis, bool keep_dims) { + input_ = AddInput(input); + axis_ = AddConstInput(TensorType_INT32, axis, axis_shape); + output_ = AddOutput(output); + SetBuiltinOp(BuiltinOperator_REDUCE_MIN, BuiltinOptions_ReducerOptions, + CreateReducerOptions(builder_, keep_dims).Union()); + BuildInterpreter({GetShape(input_)}); + } +}; + +// Model for the tests case where axis is a dynamic tensor. +class MinOpDynamicModel : public BaseOpModel { + public: + MinOpDynamicModel(const TensorData& input, const TensorData& output, + const TensorData& axis, bool keep_dims) { + input_ = AddInput(input); + axis_ = AddInput(axis); + output_ = AddOutput(output); + SetBuiltinOp(BuiltinOperator_REDUCE_MIN, BuiltinOptions_ReducerOptions, + CreateReducerOptions(builder_, keep_dims).Union()); + BuildInterpreter({GetShape(input_)}); + } +}; + +// Model for the tests case where axis is a const tensor. +class AnyOpConstModel : public BaseOpModel { + public: + AnyOpConstModel(const TensorData& input, const TensorData& output, + std::initializer_list<int> axis_shape, + std::initializer_list<int> axis, bool keep_dims) { + input_ = AddInput(input); + axis_ = AddConstInput(TensorType_INT32, axis, axis_shape); + output_ = AddOutput(output); + SetBuiltinOp(BuiltinOperator_REDUCE_ANY, BuiltinOptions_ReducerOptions, + CreateReducerOptions(builder_, keep_dims).Union()); + BuildInterpreter({GetShape(input_)}); + } +}; + +// Model for the tests case where axis is a dynamic tensor. 
+class AnyOpDynamicModel : public BaseOpModel { + public: + AnyOpDynamicModel(const TensorData& input, const TensorData& output, + const TensorData& axis, bool keep_dims) { + input_ = AddInput(input); + axis_ = AddInput(axis); + output_ = AddOutput(output); + SetBuiltinOp(BuiltinOperator_REDUCE_ANY, BuiltinOptions_ReducerOptions, + CreateReducerOptions(builder_, keep_dims).Union()); + BuildInterpreter({GetShape(input_)}); + } +}; + +// for quantized Add, the error shouldn't exceed step +float GetTolerance(int min, int max) { return (max - min) / 255.0; } + +// Tests for reduce_mean +TEST(ConstFloatMeanOpTest, NotKeepDims) { + std::vector<float> data = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, + 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0}; + MeanOpConstModel m({TensorType_FLOAT32, {4, 3, 2}}, {TensorType_FLOAT32, {2}}, + {4}, {1, 0, -3, -3}, false); + m.SetInput(data); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2})); + EXPECT_THAT(m.GetOutput<float>(), ElementsAreArray(ArrayFloatNear({12, 13}))); +} + +TEST(ConstFloatMeanOpTest, KeepDims) { + std::vector<float> data = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, + 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0}; + MeanOpConstModel m({TensorType_FLOAT32, {4, 3, 2}}, {TensorType_FLOAT32, {3}}, + {2}, {0, 2}, true); + m.SetInput(data); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 3, 1})); + EXPECT_THAT(m.GetOutput<float>(), + ElementsAreArray(ArrayFloatNear({10.5, 12.5, 14.5}))); +} + +TEST(ConstFloatMeanOpTest, Scalar) { + std::vector<float> data = {3.27}; + MeanOpConstModel m({TensorType_FLOAT32, {}}, {TensorType_FLOAT32, {}}, {}, + {0}, true); + m.SetInput(data); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), IsEmpty()); + EXPECT_THAT(m.GetOutput<float>(), ElementsAreArray(ArrayFloatNear({3.27}))); +} + +TEST(DynamicFloatMeanOpTest, NotKeepDims) { + std::vector<float> data = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, + 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0}; + MeanOpDynamicModel m({TensorType_FLOAT32, {4, 3, 2}}, + {TensorType_FLOAT32, {2}}, {TensorType_INT32, {4}}, + false); + std::vector<int> axis = {1, 0, -3, -3}; + m.SetAxis(axis); + m.SetInput(data); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2})); + EXPECT_THAT(m.GetOutput<float>(), ElementsAreArray(ArrayFloatNear({12, 13}))); +} + +TEST(DynamicFloatMeanOpTest, KeepDims) { + std::vector<float> data = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, + 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0}; + MeanOpDynamicModel m({TensorType_FLOAT32, {4, 3, 2}}, + {TensorType_FLOAT32, {3}}, {TensorType_INT32, {2}}, + true); + std::vector<int> axis = {0, 2}; + m.SetAxis(axis); + m.SetInput(data); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 3, 1})); + EXPECT_THAT(m.GetOutput<float>(), + ElementsAreArray(ArrayFloatNear({10.5, 12.5, 14.5}))); +} + +TEST(DynamicFloatMeanOpTest, Scale) { + std::vector<float> data = {9.527}; + MeanOpDynamicModel m({TensorType_FLOAT32, {1}}, {TensorType_FLOAT32, {1}}, + {TensorType_INT32, {1}}, true); + std::vector<int> axis = {0}; + m.SetAxis(axis); + m.SetInput(data); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1})); + EXPECT_THAT(m.GetOutput<float>(), ElementsAreArray(ArrayFloatNear({9.527}))); +} + + +TEST(ConstUint8MeanOpTest, NotKeepDims) { + float 
kQuantizedTolerance = GetTolerance(-1.0, 1.0); + std::vector<float> data = {0.4, 0.2, 0.3, 0.4, 0.5, 0.6}; + MeanOpConstModel m({TensorType_UINT8, {1, 3, 2}, -1.0, 1.0}, + {TensorType_UINT8, {2}, -1.0, 1.0}, {1}, {1}, false); + m.QuantizeAndPopulate<uint8_t>(m.Input(), data); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2})); + EXPECT_THAT(m.GetDequantizedOutput(), ElementsAreArray(ArrayFloatNear( + {0.4, 0.4}, kQuantizedTolerance))); +} + +TEST(ConstUint8MeanOpTest, KeepDims) { + float kQuantizedTolerance = GetTolerance(-1.0, 1.0); + std::vector<float> data = {0.4, 0.2, 0.3, 0.4, 0.5, 0.6}; + MeanOpConstModel m({TensorType_UINT8, {3, 2}, -1.0, 1.0}, + {TensorType_UINT8, {3}, -1.0, 1.0}, {1}, {1}, true); + m.QuantizeAndPopulate<uint8_t>(m.Input(), data); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3, 1})); + EXPECT_THAT( + m.GetDequantizedOutput(), + ElementsAreArray(ArrayFloatNear({0.3, 0.35, 0.55}, kQuantizedTolerance))); +} + +TEST(DynamicUint8MeanOpTest, NotKeepDims) { + float kQuantizedTolerance = GetTolerance(-5.0, 2.0); + std::vector<float> data = {1.3, -4.8, -3.6, 0.24}; + MeanOpDynamicModel m({TensorType_UINT8, {2, 2}, -5.0, 2.0}, + {TensorType_UINT8, {2}, -5.0, 2.0}, + {TensorType_INT32, {1}}, false); + std::vector<int> axis = {1}; + m.SetAxis(axis); + m.QuantizeAndPopulate<uint8_t>(m.Input(), data); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2})); + EXPECT_THAT( + m.GetDequantizedOutput(), + ElementsAreArray(ArrayFloatNear({-1.75, -1.68}, kQuantizedTolerance))); +} + +TEST(DynamicUint8MeanOpTest, KeepDims) { + float kQuantizedTolerance = GetTolerance(-10.0, 12.0); + std::vector<float> data = {11.14, -0.14, 7.423, 0.879}; + MeanOpDynamicModel m({TensorType_UINT8, {2, 2}, -10.0, 12.0}, + {TensorType_UINT8, {2}, -10.0, 12.0}, + {TensorType_INT32, {1}}, true); + std::vector<int> axis = {0}; + m.SetAxis(axis); + m.QuantizeAndPopulate<uint8_t>(m.Input(), data); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2})); + EXPECT_THAT( + m.GetDequantizedOutput(), + ElementsAreArray(ArrayFloatNear({9.2815, 0.3695}, kQuantizedTolerance))); +} + +TEST(DynamicUint8MeanOpTest, QuantizedScalar) { + float kQuantizedTolerance = GetTolerance(-10.0, 12.0); + std::vector<float> data = {0.643}; + MeanOpDynamicModel m({TensorType_UINT8, {}, 0.0, 1.0}, + {TensorType_UINT8, {}, -10.0, 12.0}, + {TensorType_INT32, {1}}, true); + std::vector<int> axis = {0}; + m.QuantizeAndPopulate<uint8_t>(m.Input(), data); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), IsEmpty()); + EXPECT_THAT(m.GetDequantizedOutput(), + ElementsAreArray(ArrayFloatNear({0.643}, kQuantizedTolerance))); +} + +TEST(ConstUint8MeanOpTest, QuantizedKeepDims) { + float kQuantizedTolerance = GetTolerance(-5.0, 5.0); + std::vector<float> data = {0.4, 0.2, 0.3, 0.4, 0.5, 0.6}; + MeanOpConstModel m({TensorType_UINT8, {3, 2}, 0.0, 1.0}, + {TensorType_UINT8, {3}, -5.0, 5.0}, {1}, {1}, true); + m.QuantizeAndPopulate<uint8_t>(m.Input(), data); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3, 1})); + EXPECT_THAT( + m.GetDequantizedOutput(), + ElementsAreArray(ArrayFloatNear({0.3, 0.35, 0.55}, kQuantizedTolerance))); +} + +// Tests for reduce_sum + +TEST(ConstFloatSumOpTest, NotKeepDims) { + std::vector<float> data = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, + 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0}; + SumOpConstModel m({TensorType_FLOAT32, {4, 3, 2}}, {TensorType_FLOAT32, {2}}, + {4}, {1, 0, 
-3, -3}, false); + m.SetInput(data); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2})); + EXPECT_THAT(m.GetOutput<float>(), + ElementsAreArray(ArrayFloatNear({144, 156}))); +} + +TEST(ConstFloatSumOpTest, KeepDims) { + std::vector<float> data = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, + 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0}; + SumOpConstModel m({TensorType_FLOAT32, {4, 3, 2}}, {TensorType_FLOAT32, {3}}, + {2}, {0, 2}, true); + m.SetInput(data); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 3, 1})); + EXPECT_THAT(m.GetOutput<float>(), + ElementsAreArray(ArrayFloatNear({84, 100, 116}))); +} + +TEST(DynamicFloatSumOpTest, NotKeepDims) { + std::vector<float> data = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, + 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0}; + SumOpDynamicModel m({TensorType_FLOAT32, {4, 3, 2}}, + {TensorType_FLOAT32, {2}}, {TensorType_INT32, {4}}, + false); + std::vector<int> axis = {1, 0, -3, -3}; + m.SetAxis(axis); + m.SetInput(data); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2})); + EXPECT_THAT(m.GetOutput<float>(), + ElementsAreArray(ArrayFloatNear({144, 156}))); +} + +TEST(ConstFloatSumOpTest, Scalar) { + std::vector<float> data = {17.}; + SumOpConstModel m({TensorType_FLOAT32, {}}, {TensorType_FLOAT32, {}}, {}, {0}, + false); + m.SetInput(data); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), IsEmpty()); + EXPECT_THAT(m.GetOutput<float>(), ElementsAreArray(ArrayFloatNear({17.}))); +} + +TEST(DynamicFloatSumOpTest, KeepDims) { + std::vector<float> data = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, + 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0}; + SumOpDynamicModel m({TensorType_FLOAT32, {4, 3, 2}}, + {TensorType_FLOAT32, {3}}, {TensorType_INT32, {2}}, true); + std::vector<int> axis = {0, 2}; + m.SetAxis(axis); + m.SetInput(data); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 3, 1})); + EXPECT_THAT(m.GetOutput<float>(), + ElementsAreArray(ArrayFloatNear({84, 100, 116}))); +} + +TEST(DynamicFloatSumOpTest, Scale) { + std::vector<float> data = {9.527}; + SumOpDynamicModel m({TensorType_FLOAT32, {1}}, {TensorType_FLOAT32, {1}}, + {TensorType_INT32, {1}}, true); + std::vector<int> axis = {0}; + m.SetAxis(axis); + m.SetInput(data); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1})); + EXPECT_THAT(m.GetOutput<float>(), ElementsAreArray(ArrayFloatNear({9.527}))); +} + +TEST(ConstUint8SumOpTest, NotKeepDims) { + float kQuantizedTolerance = GetTolerance(-1.0, 1.0); + std::vector<float> data = {0.4, 0.2, 0.3, 0.4, 0.5, 0.6}; + SumOpConstModel m({TensorType_UINT8, {1, 3, 2}, -1.0, 1.0}, + {TensorType_UINT8, {2}, -1.0, 1.0}, {1}, {1}, false); + m.QuantizeAndPopulate<uint8_t>(m.Input(), data); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2})); + EXPECT_THAT(m.GetDequantizedOutput(), + ElementsAreArray( + ArrayFloatNear({-0.823529, -0.815686}, kQuantizedTolerance))); +} + +TEST(ConstUint8SumOpTest, NotKeepDimsRescaling) { + float kQuantizedTolerance = GetTolerance(0.0, 2.0); + std::vector<float> data = {0.4, 0.2, 0.3, 0.4, 0.5, 0.6}; + SumOpConstModel m({TensorType_UINT8, {1, 3, 2}, 0.0, 1.0}, + {TensorType_UINT8, {2}, 0.0, 2.0}, {1}, {1}, false); + m.QuantizeAndPopulate<uint8_t>(m.Input(), data); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2})); + 
EXPECT_THAT(m.GetDequantizedOutput(), ElementsAreArray(ArrayFloatNear( + {1.2, 1.2}, kQuantizedTolerance))); +} + +TEST(ConstUint8SumOpTest, KeepDims) { + float kQuantizedTolerance = GetTolerance(-1.0, 1.0); + std::vector<float> data = {0.4, 0.2, 0.3, 0.4, 0.5, 0.6}; + SumOpConstModel m({TensorType_UINT8, {3, 2}, -1.0, 1.0}, + {TensorType_UINT8, {3}, -1.0, 1.0}, {1}, {1}, true); + m.QuantizeAndPopulate<uint8_t>(m.Input(), data); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3, 1})); + EXPECT_THAT(m.GetDequantizedOutput(), + ElementsAreArray(ArrayFloatNear({-0.407843, -0.313726, 0.0941177}, + kQuantizedTolerance))); +} + +TEST(DynamicUint8SumOpTest, NotKeepDims) { + float kQuantizedTolerance = GetTolerance(-5.0, 2.0); + std::vector<float> data = {1.3, -4.8, -3.6, 0.24}; + SumOpDynamicModel m({TensorType_UINT8, {2, 2}, -5.0, 2.0}, + {TensorType_UINT8, {2}, -5.0, 2.0}, + {TensorType_INT32, {1}}, false); + std::vector<int> axis = {1}; + m.SetAxis(axis); + m.QuantizeAndPopulate<uint8_t>(m.Input(), data); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2})); + EXPECT_THAT(m.GetDequantizedOutput(), + ElementsAreArray( + ArrayFloatNear({1.48235, 1.64706}, kQuantizedTolerance))); +} + +TEST(DynamicUint8SumOpTest, KeepDims) { + float kQuantizedTolerance = GetTolerance(-10.0, 12.0); + std::vector<float> data = {11.14, -0.14, 7.423, 0.879}; + SumOpDynamicModel m({TensorType_UINT8, {2, 2}, -10.0, 12.0}, + {TensorType_UINT8, {2}, -10.0, 12.0}, + {TensorType_INT32, {1}}, true); + std::vector<int> axis = {0}; + m.SetAxis(axis); + m.QuantizeAndPopulate<uint8_t>(m.Input(), data); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2})); + EXPECT_THAT( + m.GetDequantizedOutput(), + ElementsAreArray(ArrayFloatNear({6.47059, 10.698}, kQuantizedTolerance))); +} + +// Tests for reduce_prod + +TEST(ConstFloatProdOpTest, NotKeepDims) { + std::vector<float> data = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, + 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0}; + ProdOpConstModel m({TensorType_FLOAT32, {4, 3, 2}}, {TensorType_FLOAT32, {2}}, + {4}, {1, 0, -3, -3}, false); + m.SetInput(data); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2})); + EXPECT_THAT( + m.GetOutput<float>(), + ElementsAreArray(ArrayFloatNear({3.162341376e+11, 1.9619905536e+12}))); +} + +TEST(ConstFloatProdOpTest, KeepDims) { + std::vector<float> data = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, + 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0}; + ProdOpConstModel m({TensorType_FLOAT32, {4, 3, 2}}, {TensorType_FLOAT32, {3}}, + {2}, {0, 2}, true); + m.SetInput(data); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 3, 1})); + EXPECT_THAT(m.GetOutput<float>(), + ElementsAreArray( + ArrayFloatNear({7.74592e+06, 1.197504e+08, 6.6889152e+08}))); +} + +TEST(DynamicFloatProdOpTest, NotKeepDims) { + std::vector<float> data = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, + 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0}; + ProdOpDynamicModel m({TensorType_FLOAT32, {4, 3, 2}}, + {TensorType_FLOAT32, {2}}, {TensorType_INT32, {4}}, + false); + std::vector<int> axis = {1, 0, -3, -3}; + m.SetAxis(axis); + m.SetInput(data); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2})); + EXPECT_THAT( + m.GetOutput<float>(), + ElementsAreArray(ArrayFloatNear({3.16234143225e+11, 1.9619905536e+12}))); +} + 
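The large Prod expectations above are easy to sanity-check by hand: the {4, 3, 2} input holds the values 1..24, and reducing over every axis except the last leaves one product over the odd entries and one over the even entries. A throwaway check, not part of the test harness and with purely illustrative names:

#include <cstdio>

int main() {
  double odd_product = 1.0, even_product = 1.0;
  for (int v = 1; v <= 23; v += 2) odd_product *= v;   // elements with last index 0
  for (int v = 2; v <= 24; v += 2) even_product *= v;  // elements with last index 1
  // Prints roughly 3.16234143225e+11 and 1.9619905536e+12, matching the
  // ArrayFloatNear expectations in the Prod tests.
  std::printf("%g %g\n", odd_product, even_product);
  return 0;
}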
+TEST(DynamicFloatProdOpTest, KeepDims) { + std::vector<float> data = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, + 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0}; + ProdOpDynamicModel m({TensorType_FLOAT32, {4, 3, 2}}, + {TensorType_FLOAT32, {3}}, {TensorType_INT32, {2}}, + true); + std::vector<int> axis = {0, 2}; + m.SetAxis(axis); + m.SetInput(data); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 3, 1})); + EXPECT_THAT(m.GetOutput<float>(), + ElementsAreArray( + ArrayFloatNear({7.74592e+06, 1.197504e+08, 6.6889152e+08}))); +} + +TEST(DynamicFloatProdOpTest, Scale) { + std::vector<float> data = {9.527}; + ProdOpDynamicModel m({TensorType_FLOAT32, {1}}, {TensorType_FLOAT32, {1}}, + {TensorType_INT32, {1}}, true); + std::vector<int> axis = {0}; + m.SetAxis(axis); + m.SetInput(data); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1})); + EXPECT_THAT(m.GetOutput<float>(), ElementsAreArray(ArrayFloatNear({9.527}))); +} + +// Tests for reduce_max + +TEST(ConstFloatMaxOpTest, NotKeepDims) { + std::vector<float> data = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, + 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0}; + MaxOpConstModel m({TensorType_FLOAT32, {4, 3, 2}}, {TensorType_FLOAT32, {2}}, + {4}, {1, 0, -3, -3}, false); + m.SetInput(data); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2})); + EXPECT_THAT(m.GetOutput<float>(), ElementsAreArray(ArrayFloatNear({23, 24}))); +} + +TEST(ConstFloatMaxOpTest, KeepDims) { + std::vector<float> data = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, + 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0}; + MaxOpConstModel m({TensorType_FLOAT32, {4, 3, 2}}, {TensorType_FLOAT32, {3}}, + {2}, {0, 2}, true); + m.SetInput(data); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 3, 1})); + EXPECT_THAT(m.GetOutput<float>(), + ElementsAreArray(ArrayFloatNear({20, 22, 24}))); +} + +TEST(DynamicFloatMaxOpTest, NotKeepDims) { + std::vector<float> data = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, + 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0}; + MaxOpDynamicModel m({TensorType_FLOAT32, {4, 3, 2}}, + {TensorType_FLOAT32, {2}}, {TensorType_INT32, {4}}, + false); + std::vector<int> axis = {1, 0, -3, -3}; + m.SetAxis(axis); + m.SetInput(data); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2})); + EXPECT_THAT(m.GetOutput<float>(), ElementsAreArray(ArrayFloatNear({23, 24}))); +} + +TEST(DynamicFloatMaxOpTest, KeepDims) { + std::vector<float> data = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, + 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0}; + MaxOpDynamicModel m({TensorType_FLOAT32, {4, 3, 2}}, + {TensorType_FLOAT32, {3}}, {TensorType_INT32, {2}}, true); + std::vector<int> axis = {0, 2}; + m.SetAxis(axis); + m.SetInput(data); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 3, 1})); + EXPECT_THAT(m.GetOutput<float>(), + ElementsAreArray(ArrayFloatNear({20, 22, 24}))); +} + +TEST(DynamicFloatMaxOpTest, Scale) { + std::vector<float> data = {9.527}; + MaxOpDynamicModel m({TensorType_FLOAT32, {1}}, {TensorType_FLOAT32, {1}}, + {TensorType_INT32, {1}}, true); + std::vector<int> axis = {0}; + m.SetAxis(axis); + m.SetInput(data); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1})); + EXPECT_THAT(m.GetOutput<float>(), 
ElementsAreArray(ArrayFloatNear({9.527}))); +} + +TEST(ConstUint8MaxOpTest, NotKeepDims) { + float kQuantizedTolerance = GetTolerance(-1.0, 1.0); + std::vector<float> data = {0.4, 0.2, 0.3, 0.4, 0.5, 0.6}; + MaxOpConstModel m({TensorType_UINT8, {1, 3, 2}, -1.0, 1.0}, + {TensorType_UINT8, {2}, -1.0, 1.0}, {1}, {1}, false); + m.QuantizeAndPopulate<uint8_t>(m.Input(), data); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2})); + EXPECT_THAT(m.GetDequantizedOutput(), + ElementsAreArray( + ArrayFloatNear({0.501961, 0.603922}, kQuantizedTolerance))); +} + +TEST(ConstUint8MaxOpTest, KeepDims) { + float kQuantizedTolerance = GetTolerance(-1.0, 1.0); + std::vector<float> data = {0.4, 0.2, 0.3, 0.4, 0.5, 0.6}; + MaxOpConstModel m({TensorType_UINT8, {3, 2}, -1.0, 1.0}, + {TensorType_UINT8, {3}, -1.0, 1.0}, {1}, {1}, true); + m.QuantizeAndPopulate<uint8_t>(m.Input(), data); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3, 1})); + EXPECT_THAT(m.GetDequantizedOutput(), + ElementsAreArray( + ArrayFloatNear({0.4, 0.4, 0.603922}, kQuantizedTolerance))); +} + +TEST(DynamicUint8MaxOpTest, NotKeepDims) { + float kQuantizedTolerance = GetTolerance(-5.0, 2.0); + std::vector<float> data = {1.3, -4.8, -3.6, 0.24}; + MaxOpDynamicModel m({TensorType_UINT8, {2, 2}, -5.0, 2.0}, + {TensorType_UINT8, {2}, -5.0, 2.0}, + {TensorType_INT32, {1}}, false); + std::vector<int> axis = {1}; + m.SetAxis(axis); + m.QuantizeAndPopulate<uint8_t>(m.Input(), data); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2})); + EXPECT_THAT(m.GetDequantizedOutput(), + ElementsAreArray( + ArrayFloatNear({1.2902, 0.247059}, kQuantizedTolerance))); +} + +TEST(DynamicUint8MaxOpTest, KeepDims) { + float kQuantizedTolerance = GetTolerance(-10.0, 12.0); + std::vector<float> data = {11.14, -0.14, 7.423, 0.879}; + MaxOpDynamicModel m({TensorType_UINT8, {2, 2}, -10.0, 12.0}, + {TensorType_UINT8, {2}, -10.0, 12.0}, + {TensorType_INT32, {1}}, true); + std::vector<int> axis = {0}; + m.SetAxis(axis); + m.QuantizeAndPopulate<uint8_t>(m.Input(), data); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2})); + EXPECT_THAT(m.GetDequantizedOutput(), + ElementsAreArray( + ArrayFloatNear({11.1294, 0.862745}, kQuantizedTolerance))); +} + +TEST(DynamicUint8MaxOpTest, Scalar) { + float kQuantizedTolerance = GetTolerance(-10.0, 12.0); + std::vector<float> data = {11.14}; + MaxOpDynamicModel m({TensorType_UINT8, {}, -10.0, 12.0}, + {TensorType_UINT8, {}, -10.0, 12.0}, + {TensorType_INT32, {1}}, true); + std::vector<int> axis = {0}; + m.QuantizeAndPopulate<uint8_t>(m.Input(), data); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), IsEmpty()); + EXPECT_THAT(m.GetDequantizedOutput(), + ElementsAreArray(ArrayFloatNear({11.1294}, kQuantizedTolerance))); +} + +// Tests for reduce_min + +TEST(ConstFloatMinOpTest, NotKeepDims) { + std::vector<float> data = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, + 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0}; + MinOpConstModel m({TensorType_FLOAT32, {4, 3, 2}}, {TensorType_FLOAT32, {2}}, + {4}, {1, 0, -3, -3}, false); + m.SetInput(data); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2})); + EXPECT_THAT(m.GetOutput<float>(), ElementsAreArray(ArrayFloatNear({1, 2}))); +} + +TEST(ConstFloatMinOpTest, KeepDims) { + std::vector<float> data = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, + 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0}; + MinOpConstModel 
m({TensorType_FLOAT32, {4, 3, 2}}, {TensorType_FLOAT32, {3}}, + {2}, {0, 2}, true); + m.SetInput(data); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 3, 1})); + EXPECT_THAT(m.GetOutput<float>(), + ElementsAreArray(ArrayFloatNear({1, 3, 5}))); +} + +TEST(DynamicFloatMinOpTest, NotKeepDims) { + std::vector<float> data = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, + 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0}; + MinOpDynamicModel m({TensorType_FLOAT32, {4, 3, 2}}, + {TensorType_FLOAT32, {2}}, {TensorType_INT32, {4}}, + false); + std::vector<int> axis = {1, 0, -3, -3}; + m.SetAxis(axis); + m.SetInput(data); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2})); + EXPECT_THAT(m.GetOutput<float>(), ElementsAreArray(ArrayFloatNear({1, 2}))); +} + +TEST(DynamicFloatMinOpTest, KeepDims) { + std::vector<float> data = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, + 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0}; + MinOpDynamicModel m({TensorType_FLOAT32, {4, 3, 2}}, + {TensorType_FLOAT32, {3}}, {TensorType_INT32, {2}}, true); + std::vector<int> axis = {0, 2}; + m.SetAxis(axis); + m.SetInput(data); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 3, 1})); + EXPECT_THAT(m.GetOutput<float>(), + ElementsAreArray(ArrayFloatNear({1, 3, 5}))); +} + +TEST(DynamicFloatMinOpTest, Scalar) { + std::vector<float> data = {9.527}; + MinOpDynamicModel m({TensorType_FLOAT32, {1}}, {TensorType_FLOAT32, {1}}, + {TensorType_INT32, {1}}, true); + std::vector<int> axis = {0}; + m.SetAxis(axis); + m.SetInput(data); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1})); + EXPECT_THAT(m.GetOutput<float>(), ElementsAreArray(ArrayFloatNear({9.527}))); +} + +TEST(ConstUint8MinOpTest, NotKeepDims) { + float kQuantizedTolerance = GetTolerance(-1.0, 1.0); + std::vector<float> data = {0.4, 0.2, 0.3, 0.4, 0.5, 0.6}; + MinOpConstModel m({TensorType_UINT8, {1, 3, 2}, -1.0, 1.0}, + {TensorType_UINT8, {2}, -1.0, 1.0}, {1}, {1}, false); + m.QuantizeAndPopulate<uint8_t>(m.Input(), data); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2})); + EXPECT_THAT( + m.GetDequantizedOutput(), + ElementsAreArray(ArrayFloatNear({0.294117, 0.2}, kQuantizedTolerance))); +} + +TEST(ConstUint8MinOpTest, KeepDims) { + float kQuantizedTolerance = GetTolerance(-1.0, 1.0); + std::vector<float> data = {0.4, 0.2, 0.3, 0.4, 0.5, 0.6}; + MinOpConstModel m({TensorType_UINT8, {3, 2}, -1.0, 1.0}, + {TensorType_UINT8, {3}, -1.0, 1.0}, {1}, {1}, true); + m.QuantizeAndPopulate<uint8_t>(m.Input(), data); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3, 1})); + EXPECT_THAT( + m.GetDequantizedOutput(), + ElementsAreArray(ArrayFloatNear({0.2, 0.3, 0.5}, kQuantizedTolerance))); +} + +TEST(DynamicUint8MinOpTest, NotKeepDims) { + float kQuantizedTolerance = GetTolerance(-5.0, 2.0); + std::vector<float> data = {1.3, -4.8, -3.6, 0.24}; + MinOpDynamicModel m({TensorType_UINT8, {2, 2}, -5.0, 2.0}, + {TensorType_UINT8, {2}, -5.0, 2.0}, + {TensorType_INT32, {1}}, false); + std::vector<int> axis = {1}; + m.SetAxis(axis); + m.QuantizeAndPopulate<uint8_t>(m.Input(), data); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2})); + EXPECT_THAT( + m.GetDequantizedOutput(), + ElementsAreArray(ArrayFloatNear({-4.807843, -3.6}, kQuantizedTolerance))); +} + +TEST(DynamicUint8MinOpTest, KeepDims) { + float kQuantizedTolerance = GetTolerance(-10.0, 12.0); + 
std::vector<float> data = {11.14, -0.14, 7.423, 0.879}; + MinOpDynamicModel m({TensorType_UINT8, {2, 2}, -10.0, 12.0}, + {TensorType_UINT8, {2}, -10.0, 12.0}, + {TensorType_INT32, {1}}, true); + std::vector<int> axis = {0}; + m.SetAxis(axis); + m.QuantizeAndPopulate<uint8_t>(m.Input(), data); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2})); + EXPECT_THAT(m.GetDequantizedOutput(), + ElementsAreArray( + ArrayFloatNear({7.427451, -0.164706}, kQuantizedTolerance))); +} + +TEST(DynamicUint8MinOpTest, Scalar) { + float kQuantizedTolerance = GetTolerance(-10.0, 12.0); + std::vector<float> data = {11.14}; + MinOpDynamicModel m({TensorType_UINT8, {}, -10.0, 12.0}, + {TensorType_UINT8, {}, -10.0, 12.0}, + {TensorType_INT32, {1}}, true); + std::vector<int> axis = {0}; + m.QuantizeAndPopulate<uint8_t>(m.Input(), data); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), IsEmpty()); + EXPECT_THAT(m.GetDequantizedOutput(), + ElementsAreArray(ArrayFloatNear({11.1294}, kQuantizedTolerance))); +} + +// Tests for reduce_any + +TEST(ConstAnyOpTest, NotKeepDims) { + std::vector<bool> data = {false, false, false, false, false, false, + false, true, false, false, false, true}; + AnyOpConstModel m({TensorType_BOOL, {2, 3, 2}}, {TensorType_BOOL, {2}}, {4}, + {1, 0, -3, -3}, false); + m.SetInput(data); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2})); + EXPECT_THAT(m.GetOutput<bool>(), ElementsAreArray({false, true})); +} + +TEST(ConstAnyOpTest, KeepDims) { + std::vector<bool> data = {false, false, false, false, false, false, + false, true, false, false, false, true}; + AnyOpConstModel m({TensorType_BOOL, {2, 3, 2}}, {TensorType_BOOL, {3}}, {2}, + {0, 2}, true); + m.SetInput(data); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 3, 1})); + EXPECT_THAT(m.GetOutput<bool>(), ElementsAreArray({true, false, true})); +} + +TEST(DynamicAnyOpTest, NotKeepDims) { + std::vector<bool> data = {false, false, false, false, false, false, + false, true, false, false, false, true}; + AnyOpDynamicModel m({TensorType_BOOL, {2, 3, 2}}, {TensorType_BOOL, {2}}, + {TensorType_INT32, {4}}, false); + std::vector<int> axis = {1, 0, -3, -3}; + m.SetAxis(axis); + m.SetInput(data); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2})); + EXPECT_THAT(m.GetOutput<bool>(), ElementsAreArray({false, true})); +} + +TEST(DynamicAnyOpTest, KeepDims) { + std::vector<bool> data = {false, false, false, false, false, false, + false, true, false, false, false, true}; + AnyOpDynamicModel m({TensorType_BOOL, {2, 3, 2}}, {TensorType_BOOL, {3}}, + {TensorType_INT32, {2}}, true); + std::vector<int> axis = {0, 2}; + m.SetAxis(axis); + m.SetInput(data); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 3, 1})); + EXPECT_THAT(m.GetOutput<bool>(), ElementsAreArray({true, false, true})); +} + +TEST(DynamicAnyOpTest, Scalar) { + std::vector<bool> data = {false}; + AnyOpDynamicModel m({TensorType_BOOL, {1}}, {TensorType_BOOL, {1}}, + {TensorType_INT32, {1}}, true); + std::vector<int> axis = {0}; + m.SetAxis(axis); + m.SetInput(data); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1})); + EXPECT_THAT(m.GetOutput<bool>(), ElementsAreArray({false})); +} + +} // namespace +} // namespace tflite + +int main(int argc, char** argv) { + ::tflite::LogToStderr(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/tensorflow/contrib/lite/kernels/register.cc b/tensorflow/contrib/lite/kernels/register.cc index 
21cc185e9f..9402105fa7 100644 --- a/tensorflow/contrib/lite/kernels/register.cc +++ b/tensorflow/contrib/lite/kernels/register.cc @@ -14,6 +14,7 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/contrib/lite/kernels/register.h" +#include "tensorflow/contrib/lite/util.h" namespace tflite { namespace ops { @@ -21,7 +22,10 @@ namespace ops { namespace custom { TfLiteRegistration* Register_AUDIO_SPECTROGRAM(); +TfLiteRegistration* Register_LAYER_NORM_LSTM(); TfLiteRegistration* Register_MFCC(); +TfLiteRegistration* Register_DETECTION_POSTPROCESS(); +TfLiteRegistration* Register_RELU_1(); } // namespace custom @@ -73,6 +77,7 @@ TfLiteRegistration* Register_SQUEEZE(); TfLiteRegistration* Register_STRIDED_SLICE(); TfLiteRegistration* Register_EXP(); TfLiteRegistration* Register_TOPK_V2(); +TfLiteRegistration* Register_LOG(); TfLiteRegistration* Register_LOG_SOFTMAX(); TfLiteRegistration* Register_CAST(); TfLiteRegistration* Register_DEQUANTIZE(); @@ -80,16 +85,68 @@ TfLiteRegistration* Register_PRELU(); TfLiteRegistration* Register_MAXIMUM(); TfLiteRegistration* Register_MINIMUM(); TfLiteRegistration* Register_ARG_MAX(); +TfLiteRegistration* Register_ARG_MIN(); TfLiteRegistration* Register_GREATER(); TfLiteRegistration* Register_GREATER_EQUAL(); TfLiteRegistration* Register_LESS(); TfLiteRegistration* Register_LESS_EQUAL(); TfLiteRegistration* Register_FLOOR(); +TfLiteRegistration* Register_TILE(); TfLiteRegistration* Register_NEG(); +TfLiteRegistration* Register_SUM(); +TfLiteRegistration* Register_REDUCE_PROD(); +TfLiteRegistration* Register_REDUCE_MAX(); +TfLiteRegistration* Register_REDUCE_MIN(); +TfLiteRegistration* Register_REDUCE_ANY(); TfLiteRegistration* Register_SELECT(); TfLiteRegistration* Register_SLICE(); TfLiteRegistration* Register_SIN(); TfLiteRegistration* Register_TRANSPOSE_CONV(); +TfLiteRegistration* Register_EXPAND_DIMS(); +TfLiteRegistration* Register_SPARSE_TO_DENSE(); +TfLiteRegistration* Register_EQUAL(); +TfLiteRegistration* Register_NOT_EQUAL(); +TfLiteRegistration* Register_SQRT(); +TfLiteRegistration* Register_RSQRT(); +TfLiteRegistration* Register_SHAPE(); +TfLiteRegistration* Register_POW(); +TfLiteRegistration* Register_FAKE_QUANT(); +TfLiteRegistration* Register_PACK(); +TfLiteRegistration* Register_ONE_HOT(); +TfLiteRegistration* Register_LOGICAL_OR(); +TfLiteRegistration* Register_LOGICAL_AND(); +TfLiteRegistration* Register_LOGICAL_NOT(); +TfLiteRegistration* Register_UNPACK(); +TfLiteRegistration* Register_FLOOR_DIV(); +TfLiteRegistration* Register_SQUARE(); +TfLiteRegistration* Register_ZEROS_LIKE(); + +TfLiteStatus UnsupportedTensorFlowOp(TfLiteContext* context, TfLiteNode* node) { + context->ReportError( + context, + "Regular TensorFlow ops are not supported by this interpreter. Make sure " + "you invoke the Flex delegate before inference."); + return kTfLiteError; +} + +const TfLiteRegistration* BuiltinOpResolver::FindOp(tflite::BuiltinOperator op, + int version) const { + return MutableOpResolver::FindOp(op, version); +} + +const TfLiteRegistration* BuiltinOpResolver::FindOp(const char* op, + int version) const { + // Return the NULL Op for all ops whose name start with "Flex", allowing + // the interpreter to delegate their execution. 
+ if (IsFlexOp(op)) { + static TfLiteRegistration null_op{ + nullptr, nullptr, &UnsupportedTensorFlowOp, + nullptr, nullptr, BuiltinOperator_CUSTOM, + "Flex", 1}; + return &null_op; + } + return MutableOpResolver::FindOp(op, version); +} BuiltinOpResolver::BuiltinOpResolver() { AddBuiltin(BuiltinOperator_RELU, Register_RELU()); @@ -101,7 +158,9 @@ BuiltinOpResolver::BuiltinOpResolver() { AddBuiltin(BuiltinOperator_MAX_POOL_2D, Register_MAX_POOL_2D()); AddBuiltin(BuiltinOperator_L2_POOL_2D, Register_L2_POOL_2D()); AddBuiltin(BuiltinOperator_CONV_2D, Register_CONV_2D()); - AddBuiltin(BuiltinOperator_DEPTHWISE_CONV_2D, Register_DEPTHWISE_CONV_2D()); + AddBuiltin(BuiltinOperator_DEPTHWISE_CONV_2D, Register_DEPTHWISE_CONV_2D(), + /* min_version */ 1, + /* max_version */ 2); AddBuiltin(BuiltinOperator_SVDF, Register_SVDF()); AddBuiltin(BuiltinOperator_RNN, Register_RNN()); AddBuiltin(BuiltinOperator_BIDIRECTIONAL_SEQUENCE_RNN, @@ -111,7 +170,9 @@ BuiltinOpResolver::BuiltinOpResolver() { AddBuiltin(BuiltinOperator_EMBEDDING_LOOKUP, Register_EMBEDDING_LOOKUP()); AddBuiltin(BuiltinOperator_EMBEDDING_LOOKUP_SPARSE, Register_EMBEDDING_LOOKUP_SPARSE()); - AddBuiltin(BuiltinOperator_FULLY_CONNECTED, Register_FULLY_CONNECTED()); + AddBuiltin(BuiltinOperator_FULLY_CONNECTED, Register_FULLY_CONNECTED(), + /* min_version */ 1, + /* max_version */ 2); AddBuiltin(BuiltinOperator_LSH_PROJECTION, Register_LSH_PROJECTION()); AddBuiltin(BuiltinOperator_HASHTABLE_LOOKUP, Register_HASHTABLE_LOOKUP()); AddBuiltin(BuiltinOperator_SOFTMAX, Register_SOFTMAX()); @@ -123,7 +184,8 @@ BuiltinOpResolver::BuiltinOpResolver() { AddBuiltin(BuiltinOperator_L2_NORMALIZATION, Register_L2_NORMALIZATION()); AddBuiltin(BuiltinOperator_LOCAL_RESPONSE_NORMALIZATION, Register_LOCAL_RESPONSE_NORMALIZATION()); - AddBuiltin(BuiltinOperator_LSTM, Register_LSTM()); + AddBuiltin(BuiltinOperator_LSTM, Register_LSTM(), /* min_version */ 1, + /* max_version */ 2); AddBuiltin(BuiltinOperator_BIDIRECTIONAL_SEQUENCE_LSTM, Register_BIDIRECTIONAL_SEQUENCE_LSTM()); AddBuiltin(BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_LSTM, @@ -144,6 +206,7 @@ BuiltinOpResolver::BuiltinOpResolver() { AddBuiltin(BuiltinOperator_STRIDED_SLICE, Register_STRIDED_SLICE()); AddBuiltin(BuiltinOperator_EXP, Register_EXP()); AddBuiltin(BuiltinOperator_TOPK_V2, Register_TOPK_V2()); + AddBuiltin(BuiltinOperator_LOG, Register_LOG()); AddBuiltin(BuiltinOperator_LOG_SOFTMAX, Register_LOG_SOFTMAX()); AddBuiltin(BuiltinOperator_CAST, Register_CAST()); AddBuiltin(BuiltinOperator_DEQUANTIZE, Register_DEQUANTIZE()); @@ -151,6 +214,7 @@ BuiltinOpResolver::BuiltinOpResolver() { AddBuiltin(BuiltinOperator_MAXIMUM, Register_MAXIMUM()); AddBuiltin(BuiltinOperator_MINIMUM, Register_MINIMUM()); AddBuiltin(BuiltinOperator_ARG_MAX, Register_ARG_MAX()); + AddBuiltin(BuiltinOperator_ARG_MIN, Register_ARG_MIN()); AddBuiltin(BuiltinOperator_GREATER, Register_GREATER()); AddBuiltin(BuiltinOperator_GREATER_EQUAL, Register_GREATER_EQUAL()); AddBuiltin(BuiltinOperator_LESS, Register_LESS()); @@ -161,12 +225,40 @@ BuiltinOpResolver::BuiltinOpResolver() { AddBuiltin(BuiltinOperator_SLICE, Register_SLICE()); AddBuiltin(BuiltinOperator_SIN, Register_SIN()); AddBuiltin(BuiltinOperator_TRANSPOSE_CONV, Register_TRANSPOSE_CONV()); + AddBuiltin(BuiltinOperator_TILE, Register_TILE()); + AddBuiltin(BuiltinOperator_SUM, Register_SUM()); + AddBuiltin(BuiltinOperator_REDUCE_PROD, Register_REDUCE_PROD()); + AddBuiltin(BuiltinOperator_REDUCE_MAX, Register_REDUCE_MAX()); + AddBuiltin(BuiltinOperator_REDUCE_MIN, 
Register_REDUCE_MIN()); + AddBuiltin(BuiltinOperator_REDUCE_ANY, Register_REDUCE_ANY()); + AddBuiltin(BuiltinOperator_EXPAND_DIMS, Register_EXPAND_DIMS()); + AddBuiltin(BuiltinOperator_SPARSE_TO_DENSE, Register_SPARSE_TO_DENSE()); + AddBuiltin(BuiltinOperator_EQUAL, Register_EQUAL()); + AddBuiltin(BuiltinOperator_NOT_EQUAL, Register_NOT_EQUAL()); + AddBuiltin(BuiltinOperator_SQRT, Register_SQRT()); + AddBuiltin(BuiltinOperator_RSQRT, Register_RSQRT()); + AddBuiltin(BuiltinOperator_SHAPE, Register_SHAPE()); + AddBuiltin(BuiltinOperator_POW, Register_POW()); + AddBuiltin(BuiltinOperator_FAKE_QUANT, Register_FAKE_QUANT(), 1, 2); + AddBuiltin(BuiltinOperator_PACK, Register_PACK()); + AddBuiltin(BuiltinOperator_ONE_HOT, Register_ONE_HOT()); + AddBuiltin(BuiltinOperator_LOGICAL_OR, Register_LOGICAL_OR()); + AddBuiltin(BuiltinOperator_LOGICAL_AND, Register_LOGICAL_AND()); + AddBuiltin(BuiltinOperator_LOGICAL_NOT, Register_LOGICAL_NOT()); + AddBuiltin(BuiltinOperator_UNPACK, Register_UNPACK()); + AddBuiltin(BuiltinOperator_FLOOR_DIV, Register_FLOOR_DIV()); + AddBuiltin(BuiltinOperator_SQUARE, Register_SQUARE()); + AddBuiltin(BuiltinOperator_ZEROS_LIKE, Register_ZEROS_LIKE()); // TODO(andrewharp, ahentz): Move these somewhere more appropriate so that // custom ops aren't always included by default. AddCustom("Mfcc", tflite::ops::custom::Register_MFCC()); AddCustom("AudioSpectrogram", tflite::ops::custom::Register_AUDIO_SPECTROGRAM()); + AddCustom("LayerNormLstm", tflite::ops::custom::Register_LAYER_NORM_LSTM()); + AddCustom("Relu1", tflite::ops::custom::Register_RELU_1()); + AddCustom("TFLite_Detection_PostProcess", + tflite::ops::custom::Register_DETECTION_POSTPROCESS()); } } // namespace builtin diff --git a/tensorflow/contrib/lite/kernels/register.h b/tensorflow/contrib/lite/kernels/register.h index b928f1b302..61856ab9de 100644 --- a/tensorflow/contrib/lite/kernels/register.h +++ b/tensorflow/contrib/lite/kernels/register.h @@ -16,8 +16,9 @@ limitations under the License. #define TENSORFLOW_CONTRIB_LITE_KERNELS_REGISTER_H_ #include <unordered_map> -#include "tensorflow/contrib/lite/context.h" +#include "tensorflow/contrib/lite/c/c_api_internal.h" #include "tensorflow/contrib/lite/model.h" +#include "tensorflow/contrib/lite/mutable_op_resolver.h" namespace tflite { namespace ops { @@ -26,10 +27,14 @@ namespace builtin { class BuiltinOpResolver : public MutableOpResolver { public: BuiltinOpResolver(); + + const TfLiteRegistration* FindOp(tflite::BuiltinOperator op, + int version) const override; + const TfLiteRegistration* FindOp(const char* op, int version) const override; }; } // namespace builtin } // namespace ops } // namespace tflite -#endif // TENSORFLOW_CONTRIB_LITE_KERNELS_BUILTIN_KERNELS_H +#endif // TENSORFLOW_CONTRIB_LITE_KERNELS_REGISTER_H_ diff --git a/tensorflow/contrib/lite/kernels/relu1.cc b/tensorflow/contrib/lite/kernels/relu1.cc new file mode 100644 index 0000000000..abafee2d57 --- /dev/null +++ b/tensorflow/contrib/lite/kernels/relu1.cc @@ -0,0 +1,59 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/contrib/lite/context.h" +#include "tensorflow/contrib/lite/kernels/internal/tensor.h" +#include "tensorflow/contrib/lite/kernels/kernel_util.h" + +namespace tflite { +namespace ops { +namespace custom { +namespace relu1 { + +TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { + TF_LITE_ENSURE_EQ(context, NumInputs(node), 1); + TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); + const TfLiteTensor* input = GetInput(context, node, 0); + TF_LITE_ENSURE_EQ(context, input->type, kTfLiteFloat32); + TfLiteTensor* output = GetOutput(context, node, 0); + output->type = input->type; + return context->ResizeTensor(context, output, + TfLiteIntArrayCopy(input->dims)); +} + +// This is derived from lite/kernels/activations.cc. +TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { + const TfLiteTensor* input = GetInput(context, node, 0); + TfLiteTensor* output = GetOutput(context, node, 0); + const int elements = NumElements(input); + const float* in = input->data.f; + const float* in_end = in + elements; + float* out = output->data.f; + for (; in < in_end; ++in, ++out) { + *out = std::min(std::max(0.f, *in), 1.f); + } + return kTfLiteOk; +} + +} // namespace relu1 + +TfLiteRegistration* Register_RELU_1() { + static TfLiteRegistration r = {/*init=*/nullptr, /*free=*/nullptr, + relu1::Prepare, relu1::Eval}; + return &r; +} + +} // namespace custom +} // namespace ops +} // namespace tflite diff --git a/tensorflow/contrib/lite/kernels/relu1_test.cc b/tensorflow/contrib/lite/kernels/relu1_test.cc new file mode 100644 index 0000000000..b1d25a9f50 --- /dev/null +++ b/tensorflow/contrib/lite/kernels/relu1_test.cc @@ -0,0 +1,79 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include <gtest/gtest.h> +#include "flatbuffers/flexbuffers.h" // TF:flatbuffers +#include "tensorflow/contrib/lite/kernels/register.h" +#include "tensorflow/contrib/lite/kernels/test_util.h" + +namespace tflite { +namespace ops { +namespace custom { + +TfLiteRegistration* Register_RELU_1(); + +namespace { + +using ::testing::ElementsAreArray; + +class BaseActivationsOpModel : public SingleOpModel { + public: + explicit BaseActivationsOpModel(const TensorData& input) { + input_ = AddInput(input); + output_ = AddOutput({input.type, {}}); + flexbuffers::Builder fbb; + fbb.Map([&]() {}); + fbb.Finish(); + SetCustomOp("RELU_1", fbb.GetBuffer(), Register_RELU_1); + BuildInterpreter({GetShape(input_)}); + } + + protected: + int input_; + int output_; +}; + +class FloatActivationsOpModel : public BaseActivationsOpModel { + public: + using BaseActivationsOpModel::BaseActivationsOpModel; + + void SetInput(std::initializer_list<float> data) { + PopulateTensor(input_, data); + } + std::vector<float> GetOutput() { return ExtractVector<float>(output_); } +}; + +TEST(FloatActivationsOpTest, Relu1) { + FloatActivationsOpModel m(/*input=*/{TensorType_FLOAT32, {1, 2, 4, 1}}); + m.SetInput({ + 0.0, -0.6, 0.2, -0.4, // + 0.3, -2.0, 1.1, -0.1, // + }); + m.Invoke(); + EXPECT_THAT(m.GetOutput(), ElementsAreArray({ + 0.0, 0.0, 0.2, 0.0, // + 0.3, 0.0, 1.0, 0.0, // + })); +} + +} // namespace +} // namespace custom +} // namespace ops +} // namespace tflite + +int main(int argc, char** argv) { + ::tflite::LogToStderr(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/tensorflow/contrib/lite/kernels/reshape.cc b/tensorflow/contrib/lite/kernels/reshape.cc index 3287040695..f41147b2d6 100644 --- a/tensorflow/contrib/lite/kernels/reshape.cc +++ b/tensorflow/contrib/lite/kernels/reshape.cc @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ #include <string.h> -#include "tensorflow/contrib/lite/builtin_op_data.h" -#include "tensorflow/contrib/lite/context.h" +#include "tensorflow/contrib/lite/c/builtin_op_data.h" +#include "tensorflow/contrib/lite/c/c_api_internal.h" #include "tensorflow/contrib/lite/kernels/internal/tensor.h" #include "tensorflow/contrib/lite/kernels/kernel_util.h" #include "tensorflow/contrib/lite/kernels/op_macros.h" @@ -25,16 +25,11 @@ namespace builtin { namespace reshape { constexpr int kInputTensor = 0; +constexpr int kShapeTensor = 1; constexpr int kOutputTensor = 0; -TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { - auto* params = reinterpret_cast<TfLiteReshapeParams*>(node->builtin_data); - - // TODO(ahentz): we are often given a tensor with the shape but we only pay - // attention to what the shape specified in 'params'. - TF_LITE_ENSURE(context, NumInputs(node) == 1 || NumInputs(node) == 2); - TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); - +TfLiteStatus ResizeOutput(TfLiteContext* context, TfLiteNode* node, + TfLiteIntArray* output_shape) { const TfLiteTensor* input = GetInput(context, node, kInputTensor); TfLiteTensor* output = GetOutput(context, node, kOutputTensor); @@ -42,37 +37,84 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { // special -1 value, meaning it will be calculated automatically based on the // input. 
Here we calculate what that dimension should be so that the number
  // of output elements is the same as the number of input elements.
-  int num_input_elements = 1;
-  for (int i = 0; i < NumDimensions(input); ++i) {
-    num_input_elements *= SizeOfDimension(input, i);
-  }
+  int num_input_elements = NumElements(input);
-  TfLiteIntArray* output_size = TfLiteIntArrayCreate(params->num_dimensions);
   int num_output_elements = 1;
   int stretch_dim = -1;
-  for (int i = 0; i < params->num_dimensions; ++i) {
-    int value = params->shape[i];
+  for (int i = 0; i < output_shape->size; ++i) {
+    int value = output_shape->data[i];
     if (value == -1) {
       TF_LITE_ENSURE_EQ(context, stretch_dim, -1);
       stretch_dim = i;
     } else {
       num_output_elements *= value;
-      output_size->data[i] = value;
     }
   }
   if (stretch_dim != -1) {
-    output_size->data[stretch_dim] = num_input_elements / num_output_elements;
-    num_output_elements *= output_size->data[stretch_dim];
+    output_shape->data[stretch_dim] = num_input_elements / num_output_elements;
+    num_output_elements *= output_shape->data[stretch_dim];
   }
   TF_LITE_ENSURE_EQ(context, num_input_elements, num_output_elements);
-  return context->ResizeTensor(context, output, output_size);
+  return context->ResizeTensor(context, output, output_shape);
+}
+
+TfLiteStatus ResizeOutputWithShapeTensor(TfLiteContext* context,
+                                         TfLiteNode* node) {
+  const TfLiteTensor* shape = GetInput(context, node, kShapeTensor);
+
+  TfLiteIntArray* output_shape = TfLiteIntArrayCreate(shape->dims->data[0]);
+  for (int i = 0; i < output_shape->size; ++i) {
+    output_shape->data[i] = shape->data.i32[i];
+  }
+  return ResizeOutput(context, node, output_shape);
+}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  auto* params = reinterpret_cast<TfLiteReshapeParams*>(node->builtin_data);
+
+  TF_LITE_ENSURE(context, NumInputs(node) == 1 || NumInputs(node) == 2);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  // Attempt to use shape tensor if it exists.
+  if (NumInputs(node) == 2) {
+    const TfLiteTensor* shape = GetInput(context, node, kShapeTensor);
+    // Check if the shape tensor is valid.
+    if (shape->dims->size == 1 && shape->type == kTfLiteInt32) {
+      // Set the output tensor as dynamic if the shape isn't constant.
+      if (!IsConstantTensor(shape)) {
+        TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+        SetTensorToDynamic(output);
+        return kTfLiteOk;
+      }
+      // Shape is constant. Resize now.
+      return ResizeOutputWithShapeTensor(context, node);
+    }
+  }
+  // If the shape tensor was usable, the function has already returned above.
+  // Now fall back to the shape parameter in `TfLiteReshapeParams`.
+  int num_dimensions = params->num_dimensions;
+  if (num_dimensions == 1 && params->shape[0] == 0) {
+    // Legacy tflite models use a shape parameter of [0] to indicate scalars,
+    // so adjust accordingly. TODO(b/111614235): Allow zero-sized buffers during
+    // toco conversion.
+ num_dimensions = 0; + } + TfLiteIntArray* output_shape = TfLiteIntArrayCreate(num_dimensions); + for (int i = 0; i < num_dimensions; ++i) { + output_shape->data[i] = params->shape[i]; + } + return ResizeOutput(context, node, output_shape); } TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { const TfLiteTensor* input = GetInput(context, node, kInputTensor); TfLiteTensor* output = GetOutput(context, node, kOutputTensor); + if (IsDynamicTensor(output)) { + TF_LITE_ENSURE_OK(context, ResizeOutputWithShapeTensor(context, node)); + } + memcpy(output->data.raw, input->data.raw, input->bytes); return kTfLiteOk; diff --git a/tensorflow/contrib/lite/kernels/reshape_test.cc b/tensorflow/contrib/lite/kernels/reshape_test.cc index aecbd0399f..52d71350d3 100644 --- a/tensorflow/contrib/lite/kernels/reshape_test.cc +++ b/tensorflow/contrib/lite/kernels/reshape_test.cc @@ -22,18 +22,27 @@ namespace tflite { namespace { using ::testing::ElementsAreArray; +using ::testing::IsEmpty; class ReshapeOpModel : public SingleOpModel { public: ReshapeOpModel(std::initializer_list<int> input_shape, - std::initializer_list<int> new_shape) { + std::initializer_list<int> new_shape, + bool use_shape_input_tensor = false) { input_ = AddInput(TensorType_FLOAT32); output_ = AddOutput(TensorType_FLOAT32); + int shape_input_tensor = + use_shape_input_tensor ? AddInput(TensorType_INT32) : -1; SetBuiltinOp( BuiltinOperator_RESHAPE, BuiltinOptions_ReshapeOptions, CreateReshapeOptions(builder_, builder_.CreateVector<int>(new_shape)) .Union()); - BuildInterpreter({input_shape}); + if (use_shape_input_tensor) { + BuildInterpreter({input_shape, GetShape(shape_input_tensor)}); + PopulateTensor<int>(shape_input_tensor, new_shape); + } else { + BuildInterpreter({input_shape}); + } } void SetInput(std::initializer_list<float> data) { @@ -71,6 +80,14 @@ TEST(ReshapeOpTest, SimpleTest) { EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 2, 2})); } +TEST(ReshapeOpTest, ShapeTensorInput) { + ReshapeOpModel m({1, 2, 4, 1}, {2, 2, 2}, /*use_shape_input_tensor=*/true); + m.SetInput({1, 2, 3, 4, 5, 6, 7, 8}); + m.Invoke(); + EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 2, 3, 4, 5, 6, 7, 8})); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 2, 2})); +} + TEST(ReshapeOpTest, WithStretchDimension) { ReshapeOpModel m({1, 2, 4, 1}, {2, 1, -1}); m.SetInput({1, 2, 3, 4, 5, 6, 7, 8}); @@ -79,6 +96,22 @@ TEST(ReshapeOpTest, WithStretchDimension) { EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 1, 4})); } +TEST(ReshapeOpTest, ScalarOutput) { + ReshapeOpModel m({1}, {}); + m.SetInput({3}); + m.Invoke(); + EXPECT_THAT(m.GetOutput(), ElementsAreArray({3})); + EXPECT_THAT(m.GetOutputShape(), IsEmpty()); +} + +TEST(ReshapeOpTest, LegacyScalarOutput) { + ReshapeOpModel m({1}, {0}); + m.SetInput({3}); + m.Invoke(); + EXPECT_THAT(m.GetOutput(), ElementsAreArray({3})); + EXPECT_THAT(m.GetOutputShape(), IsEmpty()); +} + } // namespace } // namespace tflite diff --git a/tensorflow/contrib/lite/kernels/resize_bilinear.cc b/tensorflow/contrib/lite/kernels/resize_bilinear.cc index e4bd0f5b85..fb045d15f3 100644 --- a/tensorflow/contrib/lite/kernels/resize_bilinear.cc +++ b/tensorflow/contrib/lite/kernels/resize_bilinear.cc @@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#include "tensorflow/contrib/lite/builtin_op_data.h" -#include "tensorflow/contrib/lite/context.h" +#include "tensorflow/contrib/lite/c/builtin_op_data.h" +#include "tensorflow/contrib/lite/c/c_api_internal.h" #include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h" #include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h" #include "tensorflow/contrib/lite/kernels/internal/tensor.h" @@ -61,12 +61,10 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE_EQ(context, NumDimensions(input), 4); TF_LITE_ENSURE_EQ(context, NumDimensions(size), 1); - // TODO(ahentz): Our current implementations only support float32. - TF_LITE_ENSURE_EQ(context, input->type, kTfLiteFloat32); TF_LITE_ENSURE_EQ(context, size->type, kTfLiteInt32); // ResizeBilinear creates a float tensor even when the input is made of // integers. - output->type = kTfLiteFloat32; + output->type = input->type; if (!IsConstantTensor(size)) { SetTensorToDynamic(output); @@ -90,21 +88,31 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } if (output->type == kTfLiteFloat32) { -#define TF_LITE_RESIZE_BILINEAR(type) \ - type::ResizeBilinear(GetTensorData<float>(input), GetTensorDims(input), \ - GetTensorData<int32>(size), GetTensorDims(size), \ - GetTensorData<float>(output), GetTensorDims(output), \ - params->align_corners) +#define TF_LITE_RESIZE_BILINEAR(type, datatype) \ + tflite::ResizeBilinearParams op_params; \ + op_params.align_corners = params->align_corners; \ + type::ResizeBilinear(op_params, GetTensorShape(input), \ + GetTensorData<datatype>(input), GetTensorShape(size), \ + GetTensorData<int32>(size), GetTensorShape(output), \ + GetTensorData<datatype>(output)) if (kernel_type == kReference) { - TF_LITE_RESIZE_BILINEAR(reference_ops); + TF_LITE_RESIZE_BILINEAR(reference_ops, float); } if (kernel_type == kGenericOptimized || kernel_type == kNeonOptimized) { - TF_LITE_RESIZE_BILINEAR(optimized_ops); + TF_LITE_RESIZE_BILINEAR(optimized_ops, float); + } + } else if (output->type == kTfLiteUInt8) { + if (kernel_type == kReference) { + TF_LITE_RESIZE_BILINEAR(reference_ops, uint8_t); + } + if (kernel_type == kGenericOptimized || kernel_type == kNeonOptimized) { + TF_LITE_RESIZE_BILINEAR(optimized_ops, uint8_t); } #undef TF_LITE_RESIZE_BILINEAR } else { - context->ReportError(context, "Inputs and outputs not all float types."); + context->ReportError(context, "Output type is %d, requires float.", + output->type); return kTfLiteError; } diff --git a/tensorflow/contrib/lite/kernels/resize_bilinear_test.cc b/tensorflow/contrib/lite/kernels/resize_bilinear_test.cc index 4e03f3820a..f4289105f7 100644 --- a/tensorflow/contrib/lite/kernels/resize_bilinear_test.cc +++ b/tensorflow/contrib/lite/kernels/resize_bilinear_test.cc @@ -22,6 +22,7 @@ namespace tflite { namespace { using ::testing::ElementsAreArray; +using uint8 = std::uint8_t; class ResizeBilinearOpModel : public SingleOpModel { public: @@ -34,7 +35,7 @@ class ResizeBilinearOpModel : public SingleOpModel { } else { size_ = AddInput({TensorType_INT32, {2}}); } - output_ = AddOutput(TensorType_FLOAT32); // Always float. 
+ output_ = AddOutput(input.type); SetBuiltinOp(BuiltinOperator_RESIZE_BILINEAR, BuiltinOptions_ResizeBilinearOptions, CreateResizeBilinearOptions(builder_).Union()); @@ -45,12 +46,16 @@ class ResizeBilinearOpModel : public SingleOpModel { } } - void SetInput(std::initializer_list<float> data) { + template <typename T> + void SetInput(std::initializer_list<T> data) { PopulateTensor(input_, data); } void SetSize(std::initializer_list<int> data) { PopulateTensor(size_, data); } - std::vector<float> GetOutput() { return ExtractVector<float>(output_); } + template <typename T> + std::vector<T> GetOutput() { + return ExtractVector<T>(output_); + } private: int input_; @@ -60,60 +65,121 @@ class ResizeBilinearOpModel : public SingleOpModel { TEST(ResizeBilinearOpTest, HorizontalResize) { ResizeBilinearOpModel m({TensorType_FLOAT32, {1, 1, 2, 1}}); - m.SetInput({3, 6}); + m.SetInput<float>({3, 6}); m.SetSize({1, 3}); m.Invoke(); - EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear({3, 5, 6}))); + EXPECT_THAT(m.GetOutput<float>(), + ElementsAreArray(ArrayFloatNear({3, 5, 6}))); ResizeBilinearOpModel const_m({TensorType_FLOAT32, {1, 1, 2, 1}}, {1, 3}); - const_m.SetInput({3, 6}); + const_m.SetInput<float>({3, 6}); + const_m.Invoke(); + EXPECT_THAT(const_m.GetOutput<float>(), + ElementsAreArray(ArrayFloatNear({3, 5, 6}))); +} + +TEST(ResizeBilinearOpTest, HorizontalResize8Bit) { + ResizeBilinearOpModel m({TensorType_UINT8, {1, 1, 2, 1}}); + m.SetInput<uint8>({3, 6}); + m.SetSize({1, 3}); + m.Invoke(); + EXPECT_THAT(m.GetOutput<uint8>(), + ElementsAreArray(ArrayFloatNear({3, 5, 6}))); + + ResizeBilinearOpModel const_m({TensorType_UINT8, {1, 1, 2, 1}}, {1, 3}); + const_m.SetInput<uint8>({3, 6}); const_m.Invoke(); - EXPECT_THAT(const_m.GetOutput(), ElementsAreArray(ArrayFloatNear({3, 5, 6}))); + EXPECT_THAT(const_m.GetOutput<uint8>(), + ElementsAreArray(ArrayFloatNear({3, 5, 6}))); } TEST(ResizeBilinearOpTest, VerticalResize) { ResizeBilinearOpModel m({TensorType_FLOAT32, {1, 2, 1, 1}}); - m.SetInput({3, 9}); + m.SetInput<float>({3, 9}); m.SetSize({3, 1}); m.Invoke(); - EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear({3, 7, 9}))); + EXPECT_THAT(m.GetOutput<float>(), + ElementsAreArray(ArrayFloatNear({3, 7, 9}))); ResizeBilinearOpModel const_m({TensorType_FLOAT32, {1, 2, 1, 1}}, {3, 1}); - const_m.SetInput({3, 9}); + const_m.SetInput<float>({3, 9}); const_m.Invoke(); - EXPECT_THAT(const_m.GetOutput(), ElementsAreArray(ArrayFloatNear({3, 7, 9}))); + EXPECT_THAT(const_m.GetOutput<float>(), + ElementsAreArray(ArrayFloatNear({3, 7, 9}))); +} + +TEST(ResizeBilinearOpTest, VerticalResize8Bit) { + ResizeBilinearOpModel m({TensorType_UINT8, {1, 2, 1, 1}}); + m.SetInput<uint8>({3, 9}); + m.SetSize({3, 1}); + m.Invoke(); + EXPECT_THAT(m.GetOutput<uint8>(), + ElementsAreArray(ArrayFloatNear({3, 7, 9}))); + + ResizeBilinearOpModel const_m({TensorType_UINT8, {1, 2, 1, 1}}, {3, 1}); + const_m.SetInput<uint8>({3, 9}); + const_m.Invoke(); + EXPECT_THAT(const_m.GetOutput<uint8>(), + ElementsAreArray(ArrayFloatNear({3, 7, 9}))); } TEST(ResizeBilinearOpTest, TwoDimensionalResize) { ResizeBilinearOpModel m({TensorType_FLOAT32, {1, 2, 2, 1}}); - m.SetInput({ + m.SetInput<float>({ 3, 6, // 9, 12 // }); m.SetSize({3, 3}); m.Invoke(); - EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear({ - 3, 5, 6, // - 7, 9, 10, // - 9, 11, 12, // - }))); + EXPECT_THAT(m.GetOutput<float>(), ElementsAreArray(ArrayFloatNear({ + 3, 5, 6, // + 7, 9, 10, // + 9, 11, 12, // + }))); ResizeBilinearOpModel 
const_m({TensorType_FLOAT32, {1, 2, 2, 1}}, {3, 3}); - const_m.SetInput({ + const_m.SetInput<float>({ + 3, 6, // + 9, 12 // + }); + const_m.Invoke(); + EXPECT_THAT(const_m.GetOutput<float>(), ElementsAreArray(ArrayFloatNear({ + 3, 5, 6, // + 7, 9, 10, // + 9, 11, 12, // + }))); +} + +TEST(ResizeBilinearOpTest, TwoDimensionalResize8Bit) { + ResizeBilinearOpModel m({TensorType_UINT8, {1, 2, 2, 1}}); + m.SetInput<uint8>({ + 3, 6, // + 9, 12 // + }); + m.SetSize({3, 3}); + m.Invoke(); + EXPECT_THAT(m.GetOutput<uint8>(), ElementsAreArray(ArrayFloatNear({ + 3, 5, 6, // + 7, 9, 10, // + 9, 11, 12, // + }))); + + ResizeBilinearOpModel const_m({TensorType_UINT8, {1, 2, 2, 1}}, {3, 3}); + const_m.SetInput<uint8>({ 3, 6, // 9, 12 // }); const_m.Invoke(); - EXPECT_THAT(const_m.GetOutput(), ElementsAreArray(ArrayFloatNear({ - 3, 5, 6, // - 7, 9, 10, // - 9, 11, 12, // - }))); + EXPECT_THAT(const_m.GetOutput<uint8>(), ElementsAreArray(ArrayFloatNear({ + 3, 5, 6, // + 7, 9, 10, // + 9, 11, 12, // + }))); } TEST(ResizeBilinearOpTest, TwoDimensionalResizeWithTwoBatches) { ResizeBilinearOpModel m({TensorType_FLOAT32, {2, 2, 2, 1}}); - m.SetInput({ + m.SetInput<float>({ 3, 6, // 9, 12, // 4, 10, // @@ -121,60 +187,123 @@ TEST(ResizeBilinearOpTest, TwoDimensionalResizeWithTwoBatches) { }); m.SetSize({3, 3}); m.Invoke(); - EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear({ - 3, 5, 6, // - 7, 9, 10, // - 9, 11, 12, // - 4, 8, 10, // - 8, 12, 14, // - 10, 14, 16, // - }))); + EXPECT_THAT(m.GetOutput<float>(), ElementsAreArray(ArrayFloatNear({ + 3, 5, 6, // + 7, 9, 10, // + 9, 11, 12, // + 4, 8, 10, // + 8, 12, 14, // + 10, 14, 16, // + }))); ResizeBilinearOpModel const_m({TensorType_FLOAT32, {2, 2, 2, 1}}, {3, 3}); - const_m.SetInput({ + const_m.SetInput<float>({ 3, 6, // 9, 12, // 4, 10, // 10, 16 // }); const_m.Invoke(); - EXPECT_THAT(const_m.GetOutput(), ElementsAreArray(ArrayFloatNear({ - 3, 5, 6, // - 7, 9, 10, // - 9, 11, 12, // - 4, 8, 10, // - 8, 12, 14, // - 10, 14, 16, // - }))); + EXPECT_THAT(const_m.GetOutput<float>(), ElementsAreArray(ArrayFloatNear({ + 3, 5, 6, // + 7, 9, 10, // + 9, 11, 12, // + 4, 8, 10, // + 8, 12, 14, // + 10, 14, 16, // + }))); } TEST(ResizeBilinearOpTest, ThreeDimensionalResize) { ResizeBilinearOpModel m({TensorType_FLOAT32, {1, 2, 2, 2}}); - m.SetInput({ + m.SetInput<float>({ 3, 4, 6, 10, // 9, 10, 12, 16, // }); m.SetSize({3, 3}); m.Invoke(); - EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear({ - 3, 4, 5, 8, 6, 10, // - 7, 8, 9, 12, 10, 14, // - 9, 10, 11, 14, 12, 16, // - }))); + EXPECT_THAT(m.GetOutput<float>(), ElementsAreArray(ArrayFloatNear({ + 3, 4, 5, 8, 6, 10, // + 7, 8, 9, 12, 10, 14, // + 9, 10, 11, 14, 12, 16, // + }))); ResizeBilinearOpModel const_m({TensorType_FLOAT32, {1, 2, 2, 2}}, {3, 3}); - const_m.SetInput({ + const_m.SetInput<float>({ 3, 4, 6, 10, // 9, 10, 12, 16, // }); const_m.Invoke(); - EXPECT_THAT(const_m.GetOutput(), ElementsAreArray(ArrayFloatNear({ - 3, 4, 5, 8, 6, 10, // - 7, 8, 9, 12, 10, 14, // - 9, 10, 11, 14, 12, 16, // - }))); + EXPECT_THAT(const_m.GetOutput<float>(), ElementsAreArray(ArrayFloatNear({ + 3, 4, 5, 8, 6, 10, // + 7, 8, 9, 12, 10, 14, // + 9, 10, 11, 14, 12, 16, // + }))); } +TEST(ResizeBilinearOpTest, TwoDimensionalResizeWithTwoBatches8Bit) { + ResizeBilinearOpModel m({TensorType_UINT8, {2, 2, 2, 1}}); + m.SetInput<uint8>({ + 3, 6, // + 9, 12, // + 4, 10, // + 12, 16 // + }); + m.SetSize({3, 3}); + m.Invoke(); + EXPECT_THAT(m.GetOutput<uint8>(), ElementsAreArray(ArrayFloatNear({ + 3, 5, 6, // + 7, 9, 10, // 
+ 9, 11, 12, // + 4, 8, 10, // + 9, 12, 14, // + 12, 14, 16, // + }))); + + ResizeBilinearOpModel const_m({TensorType_UINT8, {2, 2, 2, 1}}, {3, 3}); + const_m.SetInput<uint8>({ + 3, 6, // + 9, 12, // + 4, 10, // + 12, 16 // + }); + const_m.Invoke(); + EXPECT_THAT(const_m.GetOutput<uint8>(), ElementsAreArray(ArrayFloatNear({ + 3, 5, 6, // + 7, 9, 10, // + 9, 11, 12, // + 4, 8, 10, // + 9, 12, 14, // + 12, 14, 16, // + }))); +} + +TEST(ResizeBilinearOpTest, ThreeDimensionalResize8Bit) { + ResizeBilinearOpModel m({TensorType_UINT8, {1, 2, 2, 2}}); + m.SetInput<uint8>({ + 3, 4, 6, 10, // + 10, 12, 14, 16, // + }); + m.SetSize({3, 3}); + m.Invoke(); + EXPECT_THAT(m.GetOutput<uint8>(), ElementsAreArray(ArrayFloatNear({ + 3, 4, 5, 8, 6, 10, // + 7, 9, 10, 12, 11, 14, // + 10, 12, 12, 14, 14, 16, // + }))); + + ResizeBilinearOpModel const_m({TensorType_UINT8, {1, 2, 2, 2}}, {3, 3}); + const_m.SetInput<uint8>({ + 3, 4, 6, 10, // + 10, 12, 14, 16, // + }); + const_m.Invoke(); + EXPECT_THAT(const_m.GetOutput<uint8>(), ElementsAreArray(ArrayFloatNear({ + 3, 4, 5, 8, 6, 10, // + 7, 9, 10, 12, 11, 14, // + 10, 12, 12, 14, 14, 16, // + }))); +} } // namespace } // namespace tflite diff --git a/tensorflow/contrib/lite/kernels/select.cc b/tensorflow/contrib/lite/kernels/select.cc index 9bc8a1a34a..4780a86ee5 100644 --- a/tensorflow/contrib/lite/kernels/select.cc +++ b/tensorflow/contrib/lite/kernels/select.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/contrib/lite/context.h" +#include "tensorflow/contrib/lite/c/c_api_internal.h" #include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h" #include "tensorflow/contrib/lite/kernels/internal/tensor.h" #include "tensorflow/contrib/lite/kernels/kernel_util.h" @@ -70,12 +70,12 @@ TfLiteStatus SelectEval(TfLiteContext* context, TfLiteNode* node) { bool is_rank_one = !HaveSameShapes(input_condition, input_x); -#define TF_LITE_SELECT(type, op) \ - reference_ops::op(GetTensorData<bool>(input_condition), \ - GetTensorDims(input_condition), \ - GetTensorData<type>(input_x), GetTensorDims(input_x), \ - GetTensorData<type>(input_y), GetTensorDims(input_y), \ - GetTensorData<type>(output), GetTensorDims(output)); +#define TF_LITE_SELECT(type, op) \ + reference_ops::op(GetTensorShape(input_condition), \ + GetTensorData<bool>(input_condition), \ + GetTensorShape(input_x), GetTensorData<type>(input_x), \ + GetTensorShape(input_y), GetTensorData<type>(input_y), \ + GetTensorShape(output), GetTensorData<type>(output)); #define TF_LITE_SWITCH(type, op) \ switch (type) { \ @@ -89,6 +89,9 @@ TfLiteStatus SelectEval(TfLiteContext* context, TfLiteNode* node) { case kTfLiteUInt8: \ TF_LITE_SELECT(uint8_t, op); \ break; \ + case kTfLiteInt16: \ + TF_LITE_SELECT(int16_t, op); \ + break; \ case kTfLiteInt32: \ TF_LITE_SELECT(int32_t, op); \ break; \ @@ -97,7 +100,9 @@ TfLiteStatus SelectEval(TfLiteContext* context, TfLiteNode* node) { break; \ default: \ context->ReportError(context, \ - "Does not support type other than bool|float|int"); \ + "Does not support type other than bool|float|int, " \ + "got %d", \ + type); \ return kTfLiteError; \ } diff --git a/tensorflow/contrib/lite/kernels/select_test.cc b/tensorflow/contrib/lite/kernels/select_test.cc index cfe24a5fc9..5b2e61cd29 100644 --- 
a/tensorflow/contrib/lite/kernels/select_test.cc +++ b/tensorflow/contrib/lite/kernels/select_test.cc @@ -88,11 +88,24 @@ TEST(SelectOpTest, SelectUInt8) { TensorType_UINT8); model.PopulateTensor<bool>(model.input1(), {false, true, false, false}); - model.PopulateTensor<uint8>(model.input2(), {1, 2, 3, 4}); - model.PopulateTensor<uint8>(model.input3(), {5, 6, 7, 8}); + model.PopulateTensor<uint8_t>(model.input2(), {1, 2, 3, 4}); + model.PopulateTensor<uint8_t>(model.input3(), {5, 6, 7, 8}); model.Invoke(); - EXPECT_THAT(model.GetOutput<uint8>(), ElementsAreArray({5, 2, 7, 8})); + EXPECT_THAT(model.GetOutput<uint8_t>(), ElementsAreArray({5, 2, 7, 8})); + EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1, 4})); +} + +TEST(SelectOpTest, SelectInt16) { + SelectOpModel model({1, 1, 1, 4}, {1, 1, 1, 4}, {1, 1, 1, 4}, + TensorType_INT16); + + model.PopulateTensor<bool>(model.input1(), {false, true, false, false}); + model.PopulateTensor<int16_t>(model.input2(), {1, 2, 3, 4}); + model.PopulateTensor<int16_t>(model.input3(), {5, 6, 7, 8}); + model.Invoke(); + + EXPECT_THAT(model.GetOutput<int16_t>(), ElementsAreArray({5, 2, 7, 8})); EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1, 4})); } @@ -101,11 +114,11 @@ TEST(SelectOpTest, SelectInt32) { TensorType_INT32); model.PopulateTensor<bool>(model.input1(), {false, true, false, false}); - model.PopulateTensor<int32>(model.input2(), {1, 2, 3, 4}); - model.PopulateTensor<int32>(model.input3(), {5, 6, 7, 8}); + model.PopulateTensor<int32_t>(model.input2(), {1, 2, 3, 4}); + model.PopulateTensor<int32_t>(model.input3(), {5, 6, 7, 8}); model.Invoke(); - EXPECT_THAT(model.GetOutput<int32>(), ElementsAreArray({5, 2, 7, 8})); + EXPECT_THAT(model.GetOutput<int32_t>(), ElementsAreArray({5, 2, 7, 8})); EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1, 4})); } @@ -113,11 +126,11 @@ TEST(SelectOpTest, RankOneSelectInt32) { SelectOpModel model({2}, {2, 1, 2, 1}, {2, 1, 2, 1}, TensorType_INT32); model.PopulateTensor<bool>(model.input1(), {false, true}); - model.PopulateTensor<int32>(model.input2(), {1, 2, 3, 4}); - model.PopulateTensor<int32>(model.input3(), {5, 6, 7, 8}); + model.PopulateTensor<int32_t>(model.input2(), {1, 2, 3, 4}); + model.PopulateTensor<int32_t>(model.input3(), {5, 6, 7, 8}); model.Invoke(); - EXPECT_THAT(model.GetOutput<int32>(), ElementsAreArray({5, 6, 3, 4})); + EXPECT_THAT(model.GetOutput<int32_t>(), ElementsAreArray({5, 6, 3, 4})); EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({2, 1, 2, 1})); } @@ -125,11 +138,11 @@ TEST(SelectOpTest, RankZeroSelectInt32) { SelectOpModel model({1}, {1, 2, 2, 1}, {1, 2, 2, 1}, TensorType_INT32); model.PopulateTensor<bool>(model.input1(), {false}); - model.PopulateTensor<int32>(model.input2(), {1, 2, 3, 4}); - model.PopulateTensor<int32>(model.input3(), {5, 6, 7, 8}); + model.PopulateTensor<int32_t>(model.input2(), {1, 2, 3, 4}); + model.PopulateTensor<int32_t>(model.input3(), {5, 6, 7, 8}); model.Invoke(); - EXPECT_THAT(model.GetOutput<int32>(), ElementsAreArray({5, 6, 7, 8})); + EXPECT_THAT(model.GetOutput<int32_t>(), ElementsAreArray({5, 6, 7, 8})); EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 2, 2, 1})); } diff --git a/tensorflow/contrib/lite/kernels/shape.cc b/tensorflow/contrib/lite/kernels/shape.cc new file mode 100644 index 0000000000..66d4c9e5c1 --- /dev/null +++ b/tensorflow/contrib/lite/kernels/shape.cc @@ -0,0 +1,93 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/contrib/lite/c/builtin_op_data.h" +#include "tensorflow/contrib/lite/c/c_api_internal.h" +#include "tensorflow/contrib/lite/kernels/internal/tensor.h" +#include "tensorflow/contrib/lite/kernels/kernel_util.h" +#include "tensorflow/contrib/lite/kernels/op_macros.h" + +namespace tflite { +namespace ops { +namespace builtin { +namespace shape { + +constexpr int kInputTensor = 0; +constexpr int kOutputTensor = 0; + +template <typename OutType> +void ExtractShape(const TfLiteTensor* input, OutType* output_data) { + for (int i = 0; i < NumDimensions(input); ++i) { + output_data[i] = SizeOfDimension(input, i); + } +} + +TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { + TF_LITE_ENSURE_EQ(context, NumInputs(node), 1); + TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); + + const TfLiteTensor* input = GetInput(context, node, kInputTensor); + TfLiteTensor* output = GetOutput(context, node, kOutputTensor); + + auto* params = reinterpret_cast<TfLiteShapeParams*>(node->builtin_data); + switch (params->out_type) { + case kTfLiteInt32: + output->type = kTfLiteInt32; + break; + case kTfLiteInt64: + output->type = kTfLiteInt64; + break; + default: + context->ReportError(context, "Unknown shape output data type: %d", + params->out_type); + return kTfLiteError; + } + + // Shape always produces a 1-dimensional output tensor, where each output + // element is the length of the corresponding input tensor's dimension. + TfLiteIntArray* output_size = TfLiteIntArrayCreate(1); + output_size->data[0] = NumDimensions(input); + return context->ResizeTensor(context, output, output_size); +} + +TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { + const TfLiteTensor* input = GetInput(context, node, kInputTensor); + TfLiteTensor* output = GetOutput(context, node, kOutputTensor); + TFLITE_DCHECK_EQ(NumDimensions(output), 1); + TFLITE_DCHECK_EQ(SizeOfDimension(output, 0), NumDimensions(input)); + + switch (output->type) { + case kTfLiteInt32: + ExtractShape(input, GetTensorData<int32_t>(output)); + break; + case kTfLiteInt64: + ExtractShape(input, GetTensorData<int64_t>(output)); + break; + default: + return kTfLiteError; + } + + return kTfLiteOk; +} + +} // namespace shape + +TfLiteRegistration* Register_SHAPE() { + static TfLiteRegistration r = {nullptr, nullptr, shape::Prepare, shape::Eval}; + return &r; +} + +} // namespace builtin +} // namespace ops +} // namespace tflite diff --git a/tensorflow/contrib/lite/kernels/shape_test.cc b/tensorflow/contrib/lite/kernels/shape_test.cc new file mode 100644 index 0000000000..27b48f4e99 --- /dev/null +++ b/tensorflow/contrib/lite/kernels/shape_test.cc @@ -0,0 +1,95 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include <initializer_list> + +#include <gtest/gtest.h> +#include "tensorflow/contrib/lite/interpreter.h" +#include "tensorflow/contrib/lite/kernels/register.h" +#include "tensorflow/contrib/lite/kernels/test_util.h" +#include "tensorflow/contrib/lite/model.h" + +namespace tflite { +namespace { + +using ::testing::ElementsAreArray; + +template <typename T> +class ShapeOpModel : public SingleOpModel { + public: + ShapeOpModel(std::initializer_list<int> input_shape, TensorType input_type, + TensorType output_type) { + input_ = AddInput(input_type); + output_ = AddOutput(output_type); + SetBuiltinOp(BuiltinOperator_SHAPE, BuiltinOptions_ShapeOptions, + CreateShapeOptions(builder_, output_type).Union()); + BuildInterpreter({input_shape}); + } + + TfLiteStatus InvokeWithResult() { return interpreter_->Invoke(); } + + int input() { return input_; } + + int32_t GetOutputSize() { return GetTensorSize(output_); } + std::vector<T> GetOutput() { return ExtractVector<T>(output_); } + std::vector<int> GetOutputShape() { return GetTensorShape(output_); } + + private: + int input_; + int output_; +}; + +TEST(ShapeOpTest, OutTypeInt) { + ShapeOpModel<int32_t> model({1, 3, 1, 3, 5}, TensorType_FLOAT32, + TensorType_INT32); + model.Invoke(); + + EXPECT_THAT(model.GetOutput(), ElementsAreArray({1, 3, 1, 3, 5})); + EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({5})); +} + +TEST(ShapeOpTest, OutTypeInt64) { + ShapeOpModel<int64_t> model({1, 3, 1, 3, 5}, TensorType_FLOAT32, + TensorType_INT64); + model.Invoke(); + + EXPECT_THAT(model.GetOutput(), ElementsAreArray({1, 3, 1, 3, 5})); + EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({5})); +} + +TEST(ShapeOpTest, ScalarTensor) { + ShapeOpModel<int32_t> model({}, TensorType_FLOAT32, TensorType_INT32); + model.Invoke(); + + EXPECT_EQ(model.GetOutputSize(), 0); + EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({0})); +} + +TEST(ShapeOpTest, EmptyTensor) { + ShapeOpModel<int32_t> model({1, 0}, TensorType_FLOAT32, TensorType_INT32); + model.Invoke(); + + EXPECT_THAT(model.GetOutput(), ElementsAreArray({1, 0})); + EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({2})); +} + +} // namespace +} // namespace tflite + +int main(int argc, char** argv) { + ::tflite::LogToStderr(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/tensorflow/contrib/lite/kernels/skip_gram.cc b/tensorflow/contrib/lite/kernels/skip_gram.cc index c90a15b3a2..de80a4016e 100644 --- a/tensorflow/contrib/lite/kernels/skip_gram.cc +++ b/tensorflow/contrib/lite/kernels/skip_gram.cc @@ -33,8 +33,8 @@ limitations under the License. 
#include <string> #include <vector> -#include "tensorflow/contrib/lite/builtin_op_data.h" -#include "tensorflow/contrib/lite/context.h" +#include "tensorflow/contrib/lite/c/builtin_op_data.h" +#include "tensorflow/contrib/lite/c/c_api_internal.h" #include "tensorflow/contrib/lite/kernels/kernel_util.h" #include "tensorflow/contrib/lite/kernels/op_macros.h" #include "tensorflow/contrib/lite/string_util.h" diff --git a/tensorflow/contrib/lite/kernels/slice.cc b/tensorflow/contrib/lite/kernels/slice.cc index b28934e2f7..ccfee41b9c 100644 --- a/tensorflow/contrib/lite/kernels/slice.cc +++ b/tensorflow/contrib/lite/kernels/slice.cc @@ -16,8 +16,8 @@ limitations under the License. #include <string.h> #include <cmath> #include <vector> -#include "tensorflow/contrib/lite/builtin_op_data.h" -#include "tensorflow/contrib/lite/context.h" +#include "tensorflow/contrib/lite/c/builtin_op_data.h" +#include "tensorflow/contrib/lite/c/c_api_internal.h" #include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h" #include "tensorflow/contrib/lite/kernels/internal/tensor.h" #include "tensorflow/contrib/lite/kernels/kernel_util.h" @@ -85,7 +85,8 @@ TfLiteStatus ResizeOutputShape(TfLiteContext* context, TF_LITE_ENSURE_STATUS(CalculateOutputShapeVector<int64_t>( context, input, begin, size, &output_shape_vector)); } else { - context->ReportError(context, "Type is currently not supported by Slice."); + context->ReportError( + context, "Type %d is currently not supported by Slice.", begin->type); return kTfLiteError; } @@ -148,7 +149,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { GetBeginAndSizeVectors<int64_t>(NumDimensions(input), begin, size, &begins, &sizes); } else { - context->ReportError(context, "Type is currently not supported by Slice."); + context->ReportError( + context, "Type %d is currently not supported by Slice.", begin->type); return kTfLiteError; } @@ -157,10 +159,28 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { sizes.push_back(1); } -#define TF_LITE_SLICE(data_type) \ - optimized_ops::Slice<data_type>( \ - GetTensorData<data_type>(input), GetTensorDims(input), begins, sizes, \ - GetTensorData<data_type>(output), GetTensorDims(output)) + // The original Slice op implementation only accepted 4-D sizes. That + // constraint is, for the present, maintained here. + // + // The dimensions in the kernel used to be in reverse-order, and TFLite + // arranged the begins and sizes vectors accordingly. This macro incorporates + // the needed reversing. 
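Concretely, `begins` and `sizes` arrive in the legacy innermost-dimension-first order and are flipped back into the batch, height, width, channel order that the updated kernel signature expects. A small illustration with hypothetical values (tflite::SliceParams and its begin/size fields are the real types used by the macro below):

// Legacy-ordered vectors (innermost dimension first), already padded to 4.
std::vector<int> begins = {/*c=*/0, /*w=*/1, /*h=*/2, /*n=*/0};
std::vector<int> sizes = {/*c=*/3, /*w=*/2, /*h=*/1, /*n=*/1};
tflite::SliceParams op_params;
op_params.begin_count = 4;
op_params.size_count = 4;
for (int i = 0; i < 4; ++i) {
  op_params.begin[i] = begins[3 - i];  // -> {0, 2, 1, 0} in N, H, W, C order
  op_params.size[i] = sizes[3 - i];    // -> {1, 1, 2, 3} in N, H, W, C order
}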
+#define TF_LITE_SLICE(data_type) \ + { \ + TF_LITE_ENSURE_EQ(context, begins.size(), 4); \ + TF_LITE_ENSURE_EQ(context, sizes.size(), 4); \ + tflite::SliceParams op_params; \ + op_params.begin_count = 4; \ + op_params.size_count = 4; \ + for (int i = 0; i < 4; ++i) { \ + op_params.begin[i] = begins[3 - i]; \ + op_params.size[i] = sizes[3 - i]; \ + } \ + \ + optimized_ops::Slice<data_type>( \ + op_params, GetTensorShape(input), GetTensorData<data_type>(input), \ + GetTensorShape(output), GetTensorData<data_type>(output)); \ + } switch (input->type) { case kTfLiteFloat32: @@ -179,8 +199,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { TF_LITE_SLICE(bool); break; default: - context->ReportError(context, - "Type is currently not supported by Slice."); + context->ReportError( + context, "Type %d is currently not supported by Slice.", input->type); return kTfLiteError; } #undef TF_LITE_SLICE diff --git a/tensorflow/contrib/lite/kernels/softmax_test.cc b/tensorflow/contrib/lite/kernels/softmax_test.cc index 6c5338ff0f..bd66980226 100644 --- a/tensorflow/contrib/lite/kernels/softmax_test.cc +++ b/tensorflow/contrib/lite/kernels/softmax_test.cc @@ -92,10 +92,11 @@ TEST(SoftmaxOpTest, CompareWithTFminiBetaEq1) { m.Invoke(); std::unique_ptr<float[]> output_buffer(new float[input_size * batch_size]); - static tflite::Dims<4> input_dims = {{input_size, 1, 1, batch_size}, - {1, 0, 0, input_size}}; - tflite::reference_ops::Softmax(input_buffer, input_dims, beta, - output_buffer.get(), input_dims); + auto input_shape = RuntimeShape({batch_size, 1, 1, input_size}); + SoftmaxParams params; + params.beta = beta; + tflite::reference_ops::Softmax(params, input_shape, input_buffer, input_shape, + output_buffer.get()); std::vector<float> expected; expected.insert(expected.end(), output_buffer.get(), @@ -120,10 +121,11 @@ TEST(SoftmaxOpTest, CompareWithTFminiBetaNotEq1) { m.Invoke(); std::unique_ptr<float[]> output_buffer(new float[input_size * batch_size]); - static tflite::Dims<4> input_dims = {{input_size, 1, 1, batch_size}, - {1, 0, 0, input_size}}; - tflite::reference_ops::Softmax(input_buffer, input_dims, beta, - output_buffer.get(), input_dims); + auto input_shape = RuntimeShape({batch_size, 1, 1, input_size}); + SoftmaxParams params; + params.beta = beta; + tflite::reference_ops::Softmax(params, input_shape, input_buffer, input_shape, + output_buffer.get()); std::vector<float> expected; expected.insert(expected.end(), output_buffer.get(), diff --git a/tensorflow/contrib/lite/kernels/space_to_batch_nd.cc b/tensorflow/contrib/lite/kernels/space_to_batch_nd.cc index 1e35869958..3a10d2e60c 100644 --- a/tensorflow/contrib/lite/kernels/space_to_batch_nd.cc +++ b/tensorflow/contrib/lite/kernels/space_to_batch_nd.cc @@ -14,8 +14,8 @@ limitations under the License. 
==============================================================================*/ #include <string.h> #include <vector> -#include "tensorflow/contrib/lite/builtin_op_data.h" -#include "tensorflow/contrib/lite/context.h" +#include "tensorflow/contrib/lite/c/builtin_op_data.h" +#include "tensorflow/contrib/lite/c/c_api_internal.h" #include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h" #include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h" #include "tensorflow/contrib/lite/kernels/internal/tensor.h" @@ -113,47 +113,52 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE_OK(context, ResizeOutputTensor(context, &op_context)); } -#define TF_LITE_SPACE_TO_BATCH_ND(type, scalar) \ - type::SpaceToBatchND(GetTensorData<scalar>(op_context.input), \ - GetTensorDims(op_context.input), \ +#define TF_LITE_SPACE_TO_BATCH_ND(type, scalar, pad_value) \ + tflite::SpaceToBatchParams op_params; \ + op_params.output_offset = pad_value; \ + type::SpaceToBatchND(op_params, GetTensorShape(op_context.input), \ + GetTensorData<scalar>(op_context.input), \ + GetTensorShape(op_context.block_shape), \ GetTensorData<int32_t>(op_context.block_shape), \ - GetTensorDims(op_context.block_shape), \ + GetTensorShape(op_context.paddings), \ GetTensorData<int32_t>(op_context.paddings), \ - GetTensorDims(op_context.paddings), \ - GetTensorData<scalar>(op_context.output), \ - GetTensorDims(op_context.output)) + GetTensorShape(op_context.output), \ + GetTensorData<scalar>(op_context.output)) switch (op_context.input->type) { // Already know in/out types are same. case kTfLiteFloat32: if (kernel_type == kReference) { - TF_LITE_SPACE_TO_BATCH_ND(reference_ops, float); + TF_LITE_SPACE_TO_BATCH_ND(reference_ops, float, 0); } else { - TF_LITE_SPACE_TO_BATCH_ND(optimized_ops, float); + TF_LITE_SPACE_TO_BATCH_ND(optimized_ops, float, 0); } break; case kTfLiteUInt8: if (kernel_type == kReference) { - TF_LITE_SPACE_TO_BATCH_ND(reference_ops, uint8_t); + TF_LITE_SPACE_TO_BATCH_ND(reference_ops, uint8_t, + op_context.output->params.zero_point); } else { - TF_LITE_SPACE_TO_BATCH_ND(optimized_ops, uint8_t); + TF_LITE_SPACE_TO_BATCH_ND(optimized_ops, uint8_t, + op_context.output->params.zero_point); } break; case kTfLiteInt32: if (kernel_type == kReference) { - TF_LITE_SPACE_TO_BATCH_ND(reference_ops, int32_t); + TF_LITE_SPACE_TO_BATCH_ND(reference_ops, int32_t, 0); } else { - TF_LITE_SPACE_TO_BATCH_ND(optimized_ops, int32_t); + TF_LITE_SPACE_TO_BATCH_ND(optimized_ops, int32_t, 0); } break; case kTfLiteInt64: if (kernel_type == kReference) { - TF_LITE_SPACE_TO_BATCH_ND(reference_ops, int64_t); + TF_LITE_SPACE_TO_BATCH_ND(reference_ops, int64_t, 0); } else { - TF_LITE_SPACE_TO_BATCH_ND(optimized_ops, int64_t); + TF_LITE_SPACE_TO_BATCH_ND(optimized_ops, int64_t, 0); } break; default: - context->ReportError(context, - "Type is currently not supported by SpaceToBatch."); + context->ReportError( + context, "Type %d is currently not supported by SpaceToBatch.", + op_context.input->type); return kTfLiteError; } #undef TF_LITE_SPACE_TO_BATCH_ND diff --git a/tensorflow/contrib/lite/kernels/space_to_batch_nd_test.cc b/tensorflow/contrib/lite/kernels/space_to_batch_nd_test.cc index 92a4a037d5..5756573629 100644 --- a/tensorflow/contrib/lite/kernels/space_to_batch_nd_test.cc +++ b/tensorflow/contrib/lite/kernels/space_to_batch_nd_test.cc @@ -23,6 +23,7 @@ namespace tflite { namespace { using ::testing::ElementsAreArray; +using ::testing::Matcher; class SpaceToBatchNDOpModel : public 
SingleOpModel { public: @@ -30,6 +31,10 @@ class SpaceToBatchNDOpModel : public SingleOpModel { PopulateTensor<float>(input_, data); } + void SetQuantizedInput(std::initializer_list<float> data) { + QuantizeAndPopulate<uint8_t>(input_, data); + } + void SetBlockShape(std::initializer_list<int> data) { PopulateTensor<int>(block_shape_, data); } @@ -41,6 +46,11 @@ class SpaceToBatchNDOpModel : public SingleOpModel { std::vector<float> GetOutput() { return ExtractVector<float>(output_); } std::vector<int> GetOutputShape() { return GetTensorShape(output_); } + std::vector<float> GetDequantizedOutput() { + return Dequantize<uint8_t>(ExtractVector<uint8_t>(output_), + GetScale(output_), GetZeroPoint(output_)); + } + protected: int input_; int block_shape_; @@ -56,18 +66,19 @@ class SpaceToBatchNDOpModel : public SingleOpModel { // m.Invoke(); class SpaceToBatchNDOpConstModel : public SpaceToBatchNDOpModel { public: - SpaceToBatchNDOpConstModel(std::initializer_list<int> input_shape, + SpaceToBatchNDOpConstModel(const TensorData& input, std::initializer_list<int> block_shape, - std::initializer_list<int> paddings) { - input_ = AddInput(TensorType_FLOAT32); + std::initializer_list<int> paddings, + const TensorData& output) { + input_ = AddInput(input); block_shape_ = AddConstInput(TensorType_INT32, block_shape, {2}); paddings_ = AddConstInput(TensorType_INT32, paddings, {2, 2}); - output_ = AddOutput(TensorType_FLOAT32); + output_ = AddOutput(output); SetBuiltinOp(BuiltinOperator_SPACE_TO_BATCH_ND, BuiltinOptions_SpaceToBatchNDOptions, CreateSpaceToBatchNDOptions(builder_).Union()); - BuildInterpreter({input_shape}); + BuildInterpreter({input.shape}); } }; @@ -81,26 +92,30 @@ class SpaceToBatchNDOpConstModel : public SpaceToBatchNDOpModel { // m.Invoke(); class SpaceToBatchNDOpDynamicModel : public SpaceToBatchNDOpModel { public: - SpaceToBatchNDOpDynamicModel(std::initializer_list<int> input_shape) { - input_ = AddInput(TensorType_FLOAT32); + SpaceToBatchNDOpDynamicModel(const TensorData& input, + const TensorData& output) { + input_ = AddInput(input); block_shape_ = AddInput(TensorType_INT32); paddings_ = AddInput(TensorType_INT32); - output_ = AddOutput(TensorType_FLOAT32); + output_ = AddOutput(output); SetBuiltinOp(BuiltinOperator_SPACE_TO_BATCH_ND, BuiltinOptions_SpaceToBatchNDOptions, CreateSpaceToBatchNDOptions(builder_).Union()); - BuildInterpreter({input_shape, {2}, {2, 2}}); + BuildInterpreter({input.shape, {2}, {2, 2}}); } }; TEST(SpaceToBatchNDOpTest, InvalidShapeTest) { - EXPECT_DEATH(SpaceToBatchNDOpConstModel({1, 3, 3, 1}, {2, 2}, {0, 0, 0, 0}), - "Cannot allocate tensors"); + EXPECT_DEATH( + SpaceToBatchNDOpConstModel({TensorType_FLOAT32, {1, 3, 3, 1}}, {2, 2}, + {0, 0, 0, 0}, {TensorType_FLOAT32}), + "Cannot allocate tensors"); } TEST(SpaceToBatchNDOpTest, SimpleConstTest) { - SpaceToBatchNDOpConstModel m({1, 4, 4, 1}, {2, 2}, {0, 0, 0, 0}); + SpaceToBatchNDOpConstModel m({TensorType_FLOAT32, {1, 4, 4, 1}}, {2, 2}, + {0, 0, 0, 0}, {TensorType_FLOAT32}); m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}); m.Invoke(); EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({4, 2, 2, 1})); @@ -109,7 +124,8 @@ TEST(SpaceToBatchNDOpTest, SimpleConstTest) { } TEST(SpaceToBatchNDOpTest, SimpleDynamicTest) { - SpaceToBatchNDOpDynamicModel m({1, 4, 4, 1}); + SpaceToBatchNDOpDynamicModel m({TensorType_FLOAT32, {1, 4, 4, 1}}, + {TensorType_FLOAT32}); m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}); m.SetBlockShape({2, 2}); m.SetPaddings({0, 0, 0, 0}); @@ -120,7 
+136,8 @@ TEST(SpaceToBatchNDOpTest, SimpleDynamicTest) { } TEST(SpaceToBatchNDOpTest, MultipleInputBatchesConstTest) { - SpaceToBatchNDOpConstModel m({2, 2, 4, 1}, {2, 2}, {0, 0, 0, 0}); + SpaceToBatchNDOpConstModel m({TensorType_FLOAT32, {2, 2, 4, 1}}, {2, 2}, + {0, 0, 0, 0}, {TensorType_FLOAT32}); m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}); m.Invoke(); EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({8, 1, 2, 1})); @@ -129,7 +146,8 @@ TEST(SpaceToBatchNDOpTest, MultipleInputBatchesConstTest) { } TEST(SpaceToBatchNDOpTest, MultipleInputBatchesDynamicTest) { - SpaceToBatchNDOpDynamicModel m({2, 2, 4, 1}); + SpaceToBatchNDOpDynamicModel m({TensorType_FLOAT32, {2, 2, 4, 1}}, + {TensorType_FLOAT32}); m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}); m.SetBlockShape({2, 2}); m.SetPaddings({0, 0, 0, 0}); @@ -140,7 +158,8 @@ TEST(SpaceToBatchNDOpTest, MultipleInputBatchesDynamicTest) { } TEST(SpaceToBatchNDOpTest, SimplePaddingConstTest) { - SpaceToBatchNDOpConstModel m({1, 5, 2, 1}, {3, 2}, {1, 0, 2, 0}); + SpaceToBatchNDOpConstModel m({TensorType_FLOAT32, {1, 5, 2, 1}}, {3, 2}, + {1, 0, 2, 0}, {TensorType_FLOAT32}); m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10}); m.Invoke(); EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({6, 2, 2, 1})); @@ -151,7 +170,8 @@ TEST(SpaceToBatchNDOpTest, SimplePaddingConstTest) { } TEST(SpaceToBatchNDOpTest, SimplePaddingDynamicTest) { - SpaceToBatchNDOpDynamicModel m({1, 5, 2, 1}); + SpaceToBatchNDOpDynamicModel m({TensorType_FLOAT32, {1, 5, 2, 1}}, + {TensorType_FLOAT32}); m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10}); m.SetBlockShape({3, 2}); m.SetPaddings({1, 0, 2, 0}); @@ -164,7 +184,8 @@ TEST(SpaceToBatchNDOpTest, SimplePaddingDynamicTest) { } TEST(SpaceToBatchNDOpTest, ComplexPaddingConstTest) { - SpaceToBatchNDOpConstModel m({1, 4, 2, 1}, {3, 2}, {1, 1, 2, 4}); + SpaceToBatchNDOpConstModel m({TensorType_FLOAT32, {1, 4, 2, 1}}, {3, 2}, + {1, 1, 2, 4}, {TensorType_FLOAT32}); m.SetInput({1, 2, 3, 4, 5, 6, 7, 8}); m.Invoke(); EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({6, 2, 4, 1})); @@ -176,7 +197,8 @@ TEST(SpaceToBatchNDOpTest, ComplexPaddingConstTest) { } TEST(SpaceToBatchNDOpTest, ComplexPaddingDynamicTest) { - SpaceToBatchNDOpDynamicModel m({1, 4, 2, 1}); + SpaceToBatchNDOpDynamicModel m({TensorType_FLOAT32, {1, 4, 2, 1}}, + {TensorType_FLOAT32}); m.SetInput({1, 2, 3, 4, 5, 6, 7, 8}); m.SetBlockShape({3, 2}); m.SetPaddings({1, 1, 2, 4}); @@ -189,6 +211,88 @@ TEST(SpaceToBatchNDOpTest, ComplexPaddingDynamicTest) { })); } +class QuantizedSpaceToBatchNDOpTest : public ::testing::Test { + protected: + std::vector<Matcher<float>> DequantizedArrayNear( + const std::vector<float>& values, const float min, const float max) { + const float quantization_tolerance = (max - min) / 255.0; + return ArrayFloatNear(values, quantization_tolerance); + } +}; + +TEST_F(QuantizedSpaceToBatchNDOpTest, ZeroNotInQuantizationRange) { + // The test_util and actual quantization code currently ensure that the range + // must include zero, but if that ever changes, this test will catch it. 
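The zero requirement follows from the asymmetric uint8 scheme the helpers rely on: real values are reconstructed as scale * (q - zero_point), so real 0.0 must land exactly on an integer code; it is also why the uint8 SpaceToBatchND path above pads with the output tensor's zero_point rather than the raw byte 0. A rough sketch of the parameter choice; ChooseQuantizationParams is an assumed helper, not the actual test_util implementation (which additionally nudges and clamps):

#include <cmath>

struct QuantizationParams {
  float scale;
  int zero_point;
};

// Picks uint8 parameters for a real range [min, max] that brackets zero;
// real 0.0 then quantizes exactly to zero_point.
QuantizationParams ChooseQuantizationParams(float min, float max) {
  QuantizationParams params;
  params.scale = (max - min) / 255.0f;
  params.zero_point = static_cast<int>(std::round(-min / params.scale));
  return params;
}

The quantization_tolerance used by DequantizedArrayNear above is exactly this scale, i.e. one quantization step.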
+ EXPECT_DEATH(SpaceToBatchNDOpConstModel m( + {TensorType_UINT8, {1, 2, 2, 1}, 1.0, 2.0}, {4, 2}, + {0, 0, 1, 1, 1, 1, 0, 0}, {TensorType_UINT8, {}, 1.0, 2.0}), + ".*Check failed: f_min <= 0.*"); +} + +TEST_F(QuantizedSpaceToBatchNDOpTest, SimplePaddingConstTest) { + SpaceToBatchNDOpConstModel m({TensorType_UINT8, {1, 5, 2, 1}, -1.0, 1.0}, + {3, 2}, {1, 0, 2, 0}, + {TensorType_UINT8, {}, -1.0, 1.0}); + m.SetQuantizedInput({-0.1, 0.2, -0.3, 0.4, -0.5, 0.6, -0.7, 0.8, -0.9, 0.1}); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({6, 2, 2, 1})); + EXPECT_THAT(m.GetDequantizedOutput(), + ElementsAreArray(DequantizedArrayNear( + {0, 0, 0, -0.5, 0, 0, 0, 0.6, 0, -0.1, 0, -0.7, + 0, 0.2, 0, 0.8, 0, -0.3, 0, -0.9, 0, 0.4, 0, 0.1}, + -1.0, 1.0))); +} + +TEST_F(QuantizedSpaceToBatchNDOpTest, SimplePaddingDynamicTest) { + SpaceToBatchNDOpDynamicModel m({TensorType_UINT8, {1, 5, 2, 1}, -1.0, 1.0}, + {TensorType_UINT8, {}, -1.0, 1.0}); + m.SetQuantizedInput({-0.1, 0.2, -0.3, 0.4, -0.5, 0.6, -0.7, 0.8, -0.9, 0.1}); + m.SetBlockShape({3, 2}); + m.SetPaddings({1, 0, 2, 0}); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({6, 2, 2, 1})); + EXPECT_THAT(m.GetDequantizedOutput(), + ElementsAreArray(DequantizedArrayNear( + {0, 0, 0, -0.5, 0, 0, 0, 0.6, 0, -0.1, 0, -0.7, + 0, 0.2, 0, 0.8, 0, -0.3, 0, -0.9, 0, 0.4, 0, 0.1}, + -1.0, 1.0))); +} + +TEST_F(QuantizedSpaceToBatchNDOpTest, ComplexPaddingConstTest) { + SpaceToBatchNDOpConstModel m({TensorType_UINT8, {1, 4, 2, 1}, -1.0, 1.0}, + {3, 2}, {1, 1, 2, 4}, + {TensorType_UINT8, {}, -1.0, 1.0}); + m.SetQuantizedInput({-0.1, 0.2, -0.3, 0.4, -0.5, 0.6, -0.7, 0.8}); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({6, 2, 4, 1})); + EXPECT_THAT(m.GetDequantizedOutput(), + ElementsAreArray(DequantizedArrayNear( + { + 0, 0, 0, 0, 0, -0.5, 0, 0, 0, 0, 0, 0, 0, 0.6, 0, 0, + 0, -0.1, 0, 0, 0, -0.7, 0, 0, 0, 0.2, 0, 0, 0, 0.8, 0, 0, + 0, -0.3, 0, 0, 0, 0, 0, 0, 0, 0.4, 0, 0, 0, 0, 0, 0, + }, + -1.0, 1.0))); +} + +TEST_F(QuantizedSpaceToBatchNDOpTest, ComplexPaddingDynamicTest) { + SpaceToBatchNDOpDynamicModel m({TensorType_UINT8, {1, 4, 2, 1}, -1.0, 1.0}, + {TensorType_UINT8, {}, -1.0, 1.0}); + m.SetQuantizedInput({-0.1, 0.2, -0.3, 0.4, -0.5, 0.6, -0.7, 0.8}); + m.SetBlockShape({3, 2}); + m.SetPaddings({1, 1, 2, 4}); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({6, 2, 4, 1})); + EXPECT_THAT(m.GetDequantizedOutput(), + ElementsAreArray(DequantizedArrayNear( + { + 0, 0, 0, 0, 0, -0.5, 0, 0, 0, 0, 0, 0, 0, 0.6, 0, 0, + 0, -0.1, 0, 0, 0, -0.7, 0, 0, 0, 0.2, 0, 0, 0, 0.8, 0, 0, + 0, -0.3, 0, 0, 0, 0, 0, 0, 0, 0.4, 0, 0, 0, 0, 0, 0, + }, + -1.0, 1.0))); +} + } // namespace } // namespace tflite diff --git a/tensorflow/contrib/lite/kernels/space_to_depth.cc b/tensorflow/contrib/lite/kernels/space_to_depth.cc index aafce89512..64c56c017b 100644 --- a/tensorflow/contrib/lite/kernels/space_to_depth.cc +++ b/tensorflow/contrib/lite/kernels/space_to_depth.cc @@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#include "tensorflow/contrib/lite/builtin_op_data.h" -#include "tensorflow/contrib/lite/context.h" +#include "tensorflow/contrib/lite/c/builtin_op_data.h" +#include "tensorflow/contrib/lite/c/c_api_internal.h" #include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h" #include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h" #include "tensorflow/contrib/lite/kernels/internal/tensor.h" @@ -79,10 +79,12 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { const TfLiteTensor* input = GetInput(context, node, kInputTensor); TfLiteTensor* output = GetOutput(context, node, kOutputTensor); -#define TF_LITE_SPACE_TO_DEPTH(type, scalar) \ - type::SpaceToDepth<scalar>( \ - GetTensorData<scalar>(input), GetTensorDims(input), params->block_size, \ - GetTensorData<scalar>(output), GetTensorDims(output)) +#define TF_LITE_SPACE_TO_DEPTH(type, scalar) \ + tflite::SpaceToDepthParams op_params; \ + op_params.block_size = params->block_size; \ + type::SpaceToDepth(op_params, GetTensorShape(input), \ + GetTensorData<scalar>(input), GetTensorShape(output), \ + GetTensorData<scalar>(output)) switch (input->type) { // Already know in/out types are same. case kTfLiteFloat32: if (kernel_type == kReference) { @@ -113,7 +115,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } break; default: - context->ReportError(context, "Type not currently supported."); + context->ReportError(context, "Type %d not currently supported.", + input->type); return kTfLiteError; } #undef TF_LITE_SPACE_TO_DEPTH diff --git a/tensorflow/contrib/lite/kernels/sparse_output_fully_connected.cc b/tensorflow/contrib/lite/kernels/sparse_output_fully_connected.cc new file mode 100644 index 0000000000..843ed0768c --- /dev/null +++ b/tensorflow/contrib/lite/kernels/sparse_output_fully_connected.cc @@ -0,0 +1,235 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +// SparseOutputFullyConnected is a fully connected layer that uses a single +// row in the weights and bias via a lookup. +#include "tensorflow/contrib/lite/context.h" +#include "tensorflow/contrib/lite/kernels/internal/tensor.h" +#include "tensorflow/contrib/lite/kernels/internal/tensor_utils.h" +#include "tensorflow/contrib/lite/kernels/kernel_util.h" + +namespace tflite { +namespace ops { +namespace custom { +namespace sparse_output_fully_connected { + +// Input tensors of size {n_batch, n_input} +constexpr int kInputTensor = 0; +// Auxiliary input tensor of size { 1 } +constexpr int kInputLookupTensor = 1; + +// Weights tensor of size { n_embeddings , n_input } +constexpr int kWeightsTensor = 2; +// Bias tensor of size { n_embeddings } +constexpr int kBiasTensor = 3; + +// Output tensor. +constexpr int kOutputTensor = 0; + +// Temporary tensors. 
+enum TemporaryTensor { + kInputQuantized = 0, + kScalingFactors = 1, + kNumTemporaryTensors = 2 +}; + +// Struct to hold op data. +struct OpData { + int scratch_tensor_index; +}; + +void* Init(TfLiteContext* context, const char* buffer, size_t length) { + auto* data = new OpData; + context->AddTensors(context, /*tensors_to_add=*/kNumTemporaryTensors, + &data->scratch_tensor_index); + return data; +} + +void Free(TfLiteContext* context, void* buffer) { + delete reinterpret_cast<OpData*>(buffer); +} + +TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { + OpData* op_data = reinterpret_cast<OpData*>(node->user_data); + + TF_LITE_ENSURE_EQ(context, node->inputs->size, 4); + TF_LITE_ENSURE_EQ(context, node->outputs->size, 1); + + const TfLiteTensor* input = GetInput(context, node, kInputTensor); + TF_LITE_ENSURE_EQ(context, input->type, kTfLiteFloat32); + TF_LITE_ENSURE_EQ(context, NumDimensions(input), 2); + const int n_batch = SizeOfDimension(input, 0); + const int n_input = SizeOfDimension(input, 1); + + const TfLiteTensor* lookup = GetInput(context, node, kInputLookupTensor); + TF_LITE_ENSURE_EQ(context, lookup->type, kTfLiteInt32); + TF_LITE_ENSURE_EQ(context, NumDimensions(lookup), 1); + // Only support single lookup. + TF_LITE_ENSURE_EQ(context, SizeOfDimension(lookup, 0), 1); + + const TfLiteTensor* weights = GetInput(context, node, kWeightsTensor); + TF_LITE_ENSURE_EQ(context, NumDimensions(weights), 2); + TF_LITE_ENSURE_EQ(context, SizeOfDimension(weights, 1), n_input); + + const TfLiteTensor* bias = GetInput(context, node, kBiasTensor); + TF_LITE_ENSURE_EQ(context, NumElements(bias), SizeOfDimension(weights, 0)); + + const bool is_hybrid_op = + (weights->type == kTfLiteUInt8 && input->type == kTfLiteFloat32); + + if (is_hybrid_op) { + TfLiteIntArrayFree(node->temporaries); + node->temporaries = TfLiteIntArrayCreate(kNumTemporaryTensors); + + // Allocate temporary tensors to store quantized values of input. + node->temporaries->data[kInputQuantized] = op_data->scratch_tensor_index; + TfLiteTensor* input_quantized = + GetTemporary(context, node, /*index=*/kInputQuantized); + input_quantized->type = kTfLiteUInt8; + input_quantized->allocation_type = kTfLiteArenaRw; + if (!TfLiteIntArrayEqual(input_quantized->dims, input->dims)) { + TfLiteIntArray* input_quantized_size = TfLiteIntArrayCopy(input->dims); + TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, input_quantized, + input_quantized_size)); + } + + // Tell interpreter to allocate temporary tensors to store scaling factors. + node->temporaries->data[kScalingFactors] = + op_data->scratch_tensor_index + kScalingFactors; + TfLiteTensor* scaling_factors = + GetTemporary(context, node, /*index=*/kScalingFactors); + scaling_factors->type = kTfLiteFloat32; + scaling_factors->allocation_type = kTfLiteArenaRw; + TfLiteIntArray* scaling_factors_size = TfLiteIntArrayCreate(1); + scaling_factors_size->data[0] = n_batch; + if (!TfLiteIntArrayEqual(scaling_factors->dims, scaling_factors_size)) { + TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, scaling_factors, + scaling_factors_size)); + } + } + return kTfLiteOk; +} + +TfLiteStatus EvalFloat(const TfLiteTensor* input, const TfLiteTensor* lookup, + const TfLiteTensor* weights, const TfLiteTensor* bias, + TfLiteTensor* output) { + const int n_batch = SizeOfDimension(input, 0); + const int n_input = SizeOfDimension(input, 1); + + const float* input_ptr_batch = input->data.f; + + // Initialize pointer to right row according to lookup value. 
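Stripped of the tensor_utils calls, the float path computes one dot product per batch against the single weight row selected by the lookup. A plain scalar restatement; SparseOutputFullyConnectedReference is a hypothetical reference function, not the kernel code:

// output[b] = bias[lookup] + dot(weights[lookup], input[b]) for each batch,
// where weights has n_embeddings rows of n_input values and output has
// shape {n_batch, 1}.
void SparseOutputFullyConnectedReference(const float* input, int n_batch,
                                         int n_input, const float* weights,
                                         const float* bias, int lookup,
                                         float* output) {
  const float* row = weights + lookup * n_input;
  for (int b = 0; b < n_batch; ++b) {
    float acc = (bias != nullptr) ? bias[lookup] : 0.0f;
    for (int i = 0; i < n_input; ++i) {
      acc += row[i] * input[b * n_input + i];
    }
    output[b] = acc;
  }
}

With the float test values further below (input {-1, 0, 1, 2, 3}, lookup 2 selecting weight row {1, 2, 3, 4, 5} and bias 3), this yields -1 + 0 + 3 + 8 + 15 + 3 = 28, the expected output.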
+ int32 lookup_index = lookup->data.i32[0]; + const float* weights_ptr = weights->data.f + lookup_index * n_input; + + // Initialize output to bias. + if (bias) { + float* bias_ptr = bias->data.f + lookup_index; + tensor_utils::VectorBatchVectorAssign(bias_ptr, 1, n_batch, output->data.f); + } else { + tensor_utils::ZeroVector(output->data.f, n_batch * 1); + } + + tensor_utils::MatrixBatchVectorMultiplyAccumulate( + weights_ptr, /*m_rows=*/1, n_input, input_ptr_batch, n_batch, + output->data.f, /*result_stride=*/1); + + return kTfLiteOk; +} + +TfLiteStatus EvalHybrid(const TfLiteTensor* input, const TfLiteTensor* lookup, + const TfLiteTensor* weights, const TfLiteTensor* bias, + TfLiteTensor* scaling_factors, + TfLiteTensor* input_quantized, TfLiteTensor* output) { + const int n_batch = SizeOfDimension(input, 0); + const int n_input = SizeOfDimension(input, 1); + + const float* input_ptr_batch = input->data.f; + // Initialize the pointer to storage for quantized values and + // scaling factors. + int8_t* quantized_input_ptr_batch = + reinterpret_cast<int8_t*>(input_quantized->data.uint8); + float* scaling_factors_ptr = scaling_factors->data.f; + + // Initialize pointer to right row according to lookup value. + int32 lookup_index = lookup->data.i32[0]; + int8_t* weights_ptr = + reinterpret_cast<int8_t*>(weights->data.uint8) + lookup_index * n_input; + + // Initialize output to bias. + if (bias) { + float* bias_ptr = bias->data.f + lookup_index; + tensor_utils::VectorBatchVectorAssign(bias_ptr, 1, n_batch, output->data.f); + } else { + tensor_utils::ZeroVector(output->data.f, n_batch * 1); + } + + if (!tensor_utils::IsZeroVector(input_ptr_batch, n_batch * n_input)) { + // Quantize input from float to int8. + float unused_min, unused_max; + for (int b = 0; b < n_batch; ++b) { + const int offset = b * n_input; + tensor_utils::SymmetricQuantizeFloats( + input_ptr_batch + offset, n_input, quantized_input_ptr_batch + offset, + &unused_min, &unused_max, &scaling_factors_ptr[b]); + scaling_factors_ptr[b] *= weights->params.scale; + } + + tensor_utils::MatrixBatchVectorMultiplyAccumulate( + weights_ptr, /*m_rows=*/1, n_input, quantized_input_ptr_batch, + scaling_factors_ptr, n_batch, output->data.f, /*result_stride=*/1); + } + return kTfLiteOk; +} + +TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { + const TfLiteTensor* input = GetInput(context, node, kInputTensor); + const TfLiteTensor* lookup = GetInput(context, node, kInputLookupTensor); + const TfLiteTensor* weights = GetInput(context, node, kWeightsTensor); + const TfLiteTensor* bias = GetInput(context, node, kBiasTensor); + TfLiteTensor* output = GetOutput(context, node, kOutputTensor); + + switch (weights->type) { + case kTfLiteFloat32: { + return EvalFloat(input, lookup, weights, bias, output); + } + case kTfLiteUInt8: { + TfLiteTensor* input_quantized = + GetTemporary(context, node, /*index=*/kInputQuantized); + TfLiteTensor* scaling_factors = + GetTemporary(context, node, /*index=*/kScalingFactors); + return EvalHybrid(input, lookup, weights, bias, scaling_factors, + input_quantized, output); + } + default: + context->ReportError(context, "Type %d is not currently supported.", + weights->type); + return kTfLiteError; + } + return kTfLiteOk; +} + +} // namespace sparse_output_fully_connected + +TfLiteRegistration* Register_SPARSE_OUTPUT_FULLY_CONNECTED() { + static TfLiteRegistration r = {sparse_output_fully_connected::Init, + sparse_output_fully_connected::Free, + sparse_output_fully_connected::Prepare, + 
sparse_output_fully_connected::Eval}; + return &r; +} + +} // namespace custom +} // namespace ops +} // namespace tflite diff --git a/tensorflow/contrib/lite/kernels/sparse_output_fully_connected_test.cc b/tensorflow/contrib/lite/kernels/sparse_output_fully_connected_test.cc new file mode 100644 index 0000000000..365986a5c1 --- /dev/null +++ b/tensorflow/contrib/lite/kernels/sparse_output_fully_connected_test.cc @@ -0,0 +1,158 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +// Unit test for TFLite sparse output fully connected op. +#include <iomanip> +#include <random> +#include <vector> + +#include <gtest/gtest.h> +#include "flatbuffers/flexbuffers.h" // TF:flatbuffers +#include "tensorflow/contrib/lite/kernels/register.h" +#include "tensorflow/contrib/lite/kernels/test_util.h" + +namespace tflite { + +namespace ops { +namespace custom { + +TfLiteRegistration* Register_SPARSE_OUTPUT_FULLY_CONNECTED(); + +namespace { + +using ::testing::ElementsAreArray; + +class BaseSparseOutputFullyConnectedOpModel : public SingleOpModel { + public: + BaseSparseOutputFullyConnectedOpModel(const TensorData& input, + const TensorData& weights, + const TensorData& output = { + TensorType_FLOAT32}) { + input_ = AddInput(input); + lookup_ = AddInput({TensorType_INT32, {1}}); + weights_ = AddInput(weights); + int bias_size = GetShape(weights_)[0]; + bias_ = AddInput({TensorType_FLOAT32, {bias_size}}); + output_ = AddOutput(output); + + // Create empty (required) options map. 
+ flexbuffers::Builder fbb; + fbb.Map([&]() {}); + fbb.Finish(); + + SetCustomOp("SPARSE_OUTPUT_FULLY_CONNECTED", fbb.GetBuffer(), + Register_SPARSE_OUTPUT_FULLY_CONNECTED); + BuildInterpreter({GetShape(input_), GetShape(lookup_), GetShape(weights_), + GetShape(bias_)}); + } + + void SetInput(const std::vector<float>& data) { + PopulateTensor(input_, data); + } + + void SetLookup(const std::vector<int32>& f) { PopulateTensor(lookup_, f); } + + void SetBias(const std::vector<float>& f) { PopulateTensor(bias_, f); } + + std::vector<float> GetOutput() { return ExtractVector<float>(output_); } + + protected: + int input_; + int lookup_; + int weights_; + int bias_; + int output_; +}; + +class FloatSparseOutputFullyConnectedOpModel + : public BaseSparseOutputFullyConnectedOpModel { + public: + using BaseSparseOutputFullyConnectedOpModel:: + BaseSparseOutputFullyConnectedOpModel; + + void SetWeights(const std::vector<float>& f) { PopulateTensor(weights_, f); } +}; + +class HybridSparseOutputFullyConnectedOpModel + : public BaseSparseOutputFullyConnectedOpModel { + public: + using BaseSparseOutputFullyConnectedOpModel:: + BaseSparseOutputFullyConnectedOpModel; + + void SetWeights(const std::vector<float>& f) { + SymmetricQuantizeAndPopulate(weights_, f); + } +}; + +TEST(SparseOutputFullyConnectedOpTest, SimpleTestFloat) { + FloatSparseOutputFullyConnectedOpModel m({TensorType_FLOAT32, {1, 5}}, + {TensorType_FLOAT32, {3, 5}}, + {TensorType_FLOAT32, {}}); + + m.SetInput({-1.0, 0.0, 1.0, 2.0, 3.0}); + + m.SetLookup({2}); + + m.SetWeights({ + -1.0, 0.0, 1.0, 2.0, 3.0, // + 0.0, 1.0, 2.0, 3.0, 4.0, // + 1.0, 2.0, 3.0, 4.0, 5.0, // + }); + + m.SetBias({1.0, 2.0, 3.0}); + + m.Invoke(); + + EXPECT_THAT(m.GetOutput(), ElementsAreArray({28})); +} + +TEST(SparseOutputFullyConnectedOpTest, SimpleTestHybrid) { + HybridSparseOutputFullyConnectedOpModel m({TensorType_FLOAT32, {1, 5}}, + {TensorType_UINT8, {3, 5}}, + {TensorType_FLOAT32, {}}); + + m.SetInput({-1.0, 0.0, 1.0, 2.0, 3.0}); + + m.SetLookup({2}); + + m.SetWeights({ + -1.0, 0.0, 1.0, 2.0, 3.0, // + 0.0, 1.0, 2.0, 3.0, 4.0, // + 1.0, 2.0, 3.0, 4.0, 5.0, // + }); + + m.SetBias({1.0, 2.0, 3.0}); + + m.Invoke(); + + // We get 28.0552 instead of 28. + // + // Input -> -42, 0, 42, 85, 127 with scale factor of 127/3. + // Looked up weights -> 25, 51, 76, 102, 127 with scale factor of 127/5. + // + // (-42 * 25 + 0 * 51 + 42 * 76 + 85 * 102 + 127 * 127) * (3*5/127^2) + 3.0 + // gives us the expected result. + EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear({28}, 0.0553))); +} + +} // namespace +} // namespace custom +} // namespace ops +} // namespace tflite + +int main(int argc, char** argv) { + ::tflite::LogToStderr(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/tensorflow/contrib/lite/kernels/sparse_to_dense.cc b/tensorflow/contrib/lite/kernels/sparse_to_dense.cc new file mode 100644 index 0000000000..349fa0bd28 --- /dev/null +++ b/tensorflow/contrib/lite/kernels/sparse_to_dense.cc @@ -0,0 +1,275 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include <cassert> +#include <cmath> +#include <cstdio> +#include <cstdlib> +#include <iostream> +#include <limits> + +#include "tensorflow/contrib/lite/c/builtin_op_data.h" +#include "tensorflow/contrib/lite/c/c_api_internal.h" +#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h" +#include "tensorflow/contrib/lite/kernels/internal/tensor.h" +#include "tensorflow/contrib/lite/kernels/kernel_util.h" +#include "tensorflow/contrib/lite/kernels/op_macros.h" +#include "tensorflow/contrib/lite/kernels/padding.h" + +namespace tflite { +namespace ops { +namespace builtin { +namespace sparse_to_dense { + +constexpr int kIndicesTensor = 0; +constexpr int kOutputShapeTensor = 1; +constexpr int kValueInputTensor = 2; +constexpr int kDefaultValueTensor = 3; +constexpr int kOutputTensor = 0; + +constexpr int kMaxDimensions = 4; + +template <typename T> +TfLiteStatus Resize(TfLiteContext* context, const TfLiteTensor* output_shape, + TfLiteTensor* output) { + const int output_dimensions = NumElements(output_shape); + TfLiteIntArray* output_shape_array = TfLiteIntArrayCreate(output_dimensions); + for (int i = 0; i < output_dimensions; ++i) { + output_shape_array->data[i] = GetTensorData<T>(output_shape)[i]; + } + + return context->ResizeTensor(context, output, output_shape_array); +} + +TfLiteStatus CheckDimensionsMatch(TfLiteContext* context, + const TfLiteTensor* indices, + const TfLiteTensor* output_shape, + const TfLiteTensor* values) { + switch (NumDimensions(indices)) { + case 0: + case 1: { + if (NumDimensions(values) == 0) { + TF_LITE_ENSURE_EQ(context, NumElements(indices), NumElements(values)); + } + TF_LITE_ENSURE_EQ(context, NumElements(output_shape), 1); + break; + } + case 2: { + TF_LITE_ENSURE_EQ(context, SizeOfDimension(indices, 1), + NumElements(output_shape)); + if (NumDimensions(values) == 0) + TF_LITE_ENSURE_EQ(context, SizeOfDimension(indices, 0), + NumElements(values)); + break; + } + default: + context->ReportError( + context, "Wrong indices dimensions %d, should be less than 3.", + NumDimensions(indices)); + return kTfLiteError; + } + return kTfLiteOk; +} + +// Convert indices into a vector of 4-d vectors. +// TODO(renjieliu): Revisit here to improve the performance, since multiple +// allocations of std::vectors will be quite slow on phones. +template <typename T> +TfLiteStatus GetIndicesVector(TfLiteContext* context, + const TfLiteTensor* indices, + const int num_indices, + std::vector<std::vector<T>>* indices_vector) { + // Note because TfLite will reverse the dimensions, so pad zeros upfront. + switch (NumDimensions(indices)) { + case 0: + case 1: { + const auto indices_data = GetTensorData<T>(indices); + for (int i = 0; i < num_indices; ++i) { + std::vector<T> index({0, 0, 0, indices_data[i]}); + indices_vector->push_back(index); + } + break; + } + case 2: { + const int true_dimensions = SizeOfDimension(indices, 1); + TF_LITE_ENSURE(context, true_dimensions <= kMaxDimensions); + for (int i = 0; i < num_indices; ++i) { + std::vector<T> index; + index.reserve(kMaxDimensions); + // Fill the index with 1 up to kMaxDimensions - true_dimensions to + // satisfy the needs for 4-dimension index. 
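The loop that follows pads with zeros: every index is normalized to four dimensions by prepending 0s, so the single 4-D reference SparseToDense implementation can serve 0-D, 1-D and 2-D indices as well. A minimal standalone sketch of that normalization, using hypothetical names and plain std::vector instead of the TfLiteTensor API:

#include <cstdint>
#include <iostream>
#include <vector>

// Standalone sketch: pad an index (rank <= 4, as the kernel enforces) out to
// four dimensions by prepending zeros, mirroring what GetIndicesVector does.
std::vector<int32_t> PadIndexTo4D(const std::vector<int32_t>& index) {
  constexpr int kMaxDimensions = 4;
  std::vector<int32_t> padded(kMaxDimensions - static_cast<int>(index.size()), 0);
  padded.insert(padded.end(), index.begin(), index.end());
  return padded;
}

int main() {
  // A 2-D sparse index (row 1, column 2) becomes {0, 0, 1, 2}.
  for (int32_t d : PadIndexTo4D({1, 2})) std::cout << d << " ";
  std::cout << "\n";
  return 0;
}

With this padding, an index such as {1, 2} into a [3, 3] dense shape is treated as {0, 0, 1, 2} into an effective [1, 1, 3, 3] shape.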
+ for (int j = 0; j < kMaxDimensions - true_dimensions; ++j) { + index.push_back(0); + } + for (int j = 0; j < true_dimensions; ++j) { + index.push_back(GetTensorData<T>(indices)[i * true_dimensions + j]); + } + + indices_vector->push_back(index); + } + break; + } + default: + context->ReportError(context, + "Indices dimensions problem, got %d dimensions", + NumDimensions(indices)); + return kTfLiteError; + } + return kTfLiteOk; +} + +TfLiteStatus ResizeOutputShape(TfLiteContext* context, + const TfLiteTensor* output_shape, + TfLiteTensor* output) { + if (output_shape->type == kTfLiteInt32) { + return Resize<int32_t>(context, output_shape, output); + } else if (output_shape->type == kTfLiteInt64) { + return Resize<int64_t>(context, output_shape, output); + } else { + context->ReportError(context, "Dense shape type %d not supported.", + output_shape->type); + return kTfLiteError; + } +} + +TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { + TF_LITE_ENSURE_EQ(context, NumInputs(node), 4); + TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); + + const TfLiteTensor* indices = GetInput(context, node, kIndicesTensor); + const TfLiteTensor* output_shape = + GetInput(context, node, kOutputShapeTensor); + const TfLiteTensor* values = GetInput(context, node, kValueInputTensor); + const TfLiteTensor* default_value = + GetInput(context, node, kDefaultValueTensor); + + // TODO(renjieliu): Handle validate_indices. + + // Indices can be 0-D, 1-D or 2-D. + TF_LITE_ASSERT(NumDimensions(indices) >= 0); + TF_LITE_ENSURE(context, NumDimensions(indices) < 3); + TF_LITE_ASSERT(NumDimensions(output_shape) >= 0); + TF_LITE_ENSURE_EQ(context, NumDimensions(output_shape), 1); + // Values can be 0-D or 1-D. + TF_LITE_ASSERT(NumDimensions(values) >= 0); + TF_LITE_ENSURE(context, NumDimensions(values) < 2); + + TF_LITE_ENSURE_EQ(context, NumElements(default_value), 1); + + TF_LITE_ENSURE( + context, indices->type == kTfLiteInt32 || indices->type == kTfLiteInt64); + TF_LITE_ENSURE(context, output_shape->type == kTfLiteInt32 || + output_shape->type == kTfLiteInt64); + TF_LITE_ENSURE_EQ(context, values->type, default_value->type); + + // Ensure dimensions match. 
+ TF_LITE_ENSURE_OK( + context, CheckDimensionsMatch(context, indices, output_shape, values)); + + TfLiteTensor* output = GetOutput(context, node, kOutputTensor); + TF_LITE_ENSURE_EQ(context, NumDimensions(output_shape), 1); + + if (!IsConstantTensor(output_shape)) { + SetTensorToDynamic(output); + return kTfLiteOk; + } + return ResizeOutputShape(context, output_shape, output); +} + +template <typename T, typename TI> +TfLiteStatus SparseToDenseImpl(TfLiteContext* context, TfLiteNode* node) { + const TfLiteTensor* indices = GetInput(context, node, kIndicesTensor); + const TfLiteTensor* output_shape = + GetInput(context, node, kOutputShapeTensor); + const TfLiteTensor* values = GetInput(context, node, kValueInputTensor); + const TfLiteTensor* default_value = + GetInput(context, node, kDefaultValueTensor); + TfLiteTensor* output = GetOutput(context, node, kOutputTensor); + + if (IsDynamicTensor(output)) { + TF_LITE_ENSURE_OK(context, + ResizeOutputShape(context, output_shape, output)); + } + + const int num_indices = SizeOfDimension(indices, 0); + const bool value_is_scalar = NumDimensions(values) == 0; + std::vector<std::vector<TI>> indices_vector; + indices_vector.reserve(num_indices); + TF_LITE_ENSURE_OK(context, GetIndicesVector<TI>(context, indices, num_indices, + &indices_vector)); + reference_ops::SparseToDense(indices_vector, GetTensorData<T>(values), + *GetTensorData<T>(default_value), + value_is_scalar, GetTensorShape(output), + GetTensorData<T>(output)); + + return kTfLiteOk; +} + +TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { + const TfLiteTensor* indices = GetInput(context, node, kIndicesTensor); + const TfLiteTensor* values = GetInput(context, node, kValueInputTensor); + + // Currently only supports float32 and int32. + switch (values->type) { + case kTfLiteFloat32: { + switch (indices->type) { + case kTfLiteInt32: { + return SparseToDenseImpl<float, int32_t>(context, node); + } + case kTfLiteInt64: { + return SparseToDenseImpl<float, int64_t>(context, node); + } + default: + context->ReportError( + context, "Type %d is currently not supported by sparse to dense.", + indices->type); + return kTfLiteError; + } + break; + } + case kTfLiteInt32: { + switch (indices->type) { + case kTfLiteInt32: { + return SparseToDenseImpl<int32_t, int32_t>(context, node); + } + case kTfLiteInt64: { + return SparseToDenseImpl<int32_t, int64_t>(context, node); + } + default: + context->ReportError( + context, "Type %d is currently not supported by sparse to dense.", + indices->type); + return kTfLiteError; + } + break; + } + default: + context->ReportError( + context, "Type %d is currently not supported by sparse to dense.", + values->type); + return kTfLiteError; + } +} + +} // namespace sparse_to_dense + +TfLiteRegistration* Register_SPARSE_TO_DENSE() { + static TfLiteRegistration r = {nullptr, nullptr, sparse_to_dense::Prepare, + sparse_to_dense::Eval}; + return &r; +} + +} // namespace builtin +} // namespace ops +} // namespace tflite diff --git a/tensorflow/contrib/lite/kernels/sparse_to_dense_test.cc b/tensorflow/contrib/lite/kernels/sparse_to_dense_test.cc new file mode 100644 index 0000000000..a51ec17afc --- /dev/null +++ b/tensorflow/contrib/lite/kernels/sparse_to_dense_test.cc @@ -0,0 +1,155 @@ + +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include <cstdarg> +#include <gtest/gtest.h> +#include "tensorflow/contrib/lite/interpreter.h" +#include "tensorflow/contrib/lite/kernels/register.h" +#include "tensorflow/contrib/lite/kernels/test_util.h" +#include "tensorflow/contrib/lite/model.h" + +namespace tflite { +namespace { + +using ::testing::ElementsAreArray; + +template <typename T> +class SparseToDenseOpModel : public SingleOpModel { + public: + SparseToDenseOpModel(std::initializer_list<int> indices_shape, + std::initializer_list<int> output_shape_shape, + std::initializer_list<int> values_shape, T default_value, + TensorType tensor_index_type, + TensorType tensor_input_type) { + indices_ = AddInput(tensor_index_type); + output_shape_ = AddInput(TensorType_INT32); + values_ = AddInput(tensor_input_type); + default_value_ = AddInput(tensor_input_type); + output_ = AddOutput(tensor_input_type); + + SetBuiltinOp(BuiltinOperator_SPARSE_TO_DENSE, + BuiltinOptions_SparseToDenseOptions, + CreateSparseToDenseOptions(builder_, false).Union()); + BuildInterpreter({indices_shape, output_shape_shape, values_shape, {1}}); + + PopulateTensor<T>(default_value_, {default_value}); + } + + int indices() { return indices_; } + int output_shape() { return output_shape_; } + int values() { return values_; } + + std::vector<T> GetOutput() { return ExtractVector<T>(output_); } + std::vector<int> GetOutputShape() { return GetTensorShape(output_); } + + private: + int indices_; + int output_shape_; + int values_; + int default_value_; + int output_; +}; + +TEST(SparseToDenseOpModelTest, ZeroDimensionTest) { + SparseToDenseOpModel<float> m({1}, {1}, {1}, 0, TensorType_INT32, + TensorType_FLOAT32); + m.PopulateTensor<int32_t>(m.indices(), {3}); + m.PopulateTensor<int32_t>(m.output_shape(), {5}); + m.PopulateTensor<float>(m.values(), {7}); + m.Invoke(); + + EXPECT_THAT(m.GetOutput(), ElementsAreArray({0, 0, 0, 7, 0})); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({5})); +} + +TEST(SparseToDenseOpModelTest, OneDimensionTest) { + SparseToDenseOpModel<float> m({3}, {1}, {3}, 0, TensorType_INT32, + TensorType_FLOAT32); + m.PopulateTensor<int32_t>(m.indices(), {1, 3, 5}); + m.PopulateTensor<int32_t>(m.output_shape(), {7}); + m.PopulateTensor<float>(m.values(), {2, 4, 6}); + m.Invoke(); + + EXPECT_THAT(m.GetOutput(), ElementsAreArray({0, 2, 0, 4, 0, 6, 0})); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({7})); +} + +TEST(SparseToDenseOpModelTest, TwoDimensionsTest) { + SparseToDenseOpModel<float> m({3, 3}, {3}, {3}, 0, TensorType_INT32, + TensorType_FLOAT32); + m.PopulateTensor<int32_t>(m.indices(), {0, 0, 0, 1, 2, 1, 2, 0, 1}); + m.PopulateTensor<int32_t>(m.output_shape(), {3, 3, 3}); + m.PopulateTensor<float>(m.values(), {2, 4, 6}); + m.Invoke(); + + EXPECT_THAT(m.GetOutput(), + ElementsAreArray({2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 4, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0})); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3, 3, 3})); +} + +TEST(SparseToDenseOpModelTest, DefaultValueTest) { + SparseToDenseOpModel<float> m({3, 3}, {3}, {3}, -1, TensorType_INT32, 
+ TensorType_FLOAT32); + m.PopulateTensor<int32_t>(m.indices(), {0, 0, 0, 1, 2, 1, 2, 0, 1}); + m.PopulateTensor<int32_t>(m.output_shape(), {3, 3, 3}); + m.PopulateTensor<float>(m.values(), {2, 4, 6}); + m.Invoke(); + + EXPECT_THAT( + m.GetOutput(), + ElementsAreArray({2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, 4, -1, -1, 6, -1, -1, -1, -1, -1, -1, -1})); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3, 3, 3})); +} + +TEST(SparseToDenseOpModelTest, IntegerValueTest) { + SparseToDenseOpModel<int32_t> m({3, 3}, {3}, {3}, -1, TensorType_INT32, + TensorType_INT32); + m.PopulateTensor<int32_t>(m.indices(), {0, 0, 0, 1, 2, 1, 2, 0, 1}); + m.PopulateTensor<int32_t>(m.output_shape(), {3, 3, 3}); + m.PopulateTensor<int32_t>(m.values(), {2, 4, 6}); + m.Invoke(); + + EXPECT_THAT( + m.GetOutput(), + ElementsAreArray({2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, 4, -1, -1, 6, -1, -1, -1, -1, -1, -1, -1})); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3, 3, 3})); +} + +TEST(SparseToDenseOpModelTest, Int64IndexTest) { + SparseToDenseOpModel<float> m({3, 3}, {3}, {3}, -1, TensorType_INT64, + TensorType_FLOAT32); + m.PopulateTensor<int64_t>(m.indices(), {0, 0, 0, 1, 2, 1, 2, 0, 1}); + m.PopulateTensor<int32_t>(m.output_shape(), {3, 3, 3}); + m.PopulateTensor<float>(m.values(), {2, 4, 6}); + m.Invoke(); + + EXPECT_THAT( + m.GetOutput(), + ElementsAreArray({2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, 4, -1, -1, 6, -1, -1, -1, -1, -1, -1, -1})); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3, 3, 3})); +} + +} // namespace +} // namespace tflite + +int main(int argc, char** argv) { + ::tflite::LogToStderr(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/tensorflow/contrib/lite/kernels/split.cc b/tensorflow/contrib/lite/kernels/split.cc index c6b94c25be..dab887bf9c 100644 --- a/tensorflow/contrib/lite/kernels/split.cc +++ b/tensorflow/contrib/lite/kernels/split.cc @@ -14,8 +14,8 @@ limitations under the License. ==============================================================================*/ #include <string.h> #include <vector> -#include "tensorflow/contrib/lite/builtin_op_data.h" -#include "tensorflow/contrib/lite/context.h" +#include "tensorflow/contrib/lite/c/builtin_op_data.h" +#include "tensorflow/contrib/lite/c/c_api_internal.h" #include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h" #include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h" #include "tensorflow/contrib/lite/kernels/internal/tensor.h" @@ -76,8 +76,9 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE_EQ(context, NumOutputs(node), op_context.params->num_splits); auto input_type = op_context.input->type; - TF_LITE_ENSURE(context, - input_type == kTfLiteFloat32 || input_type == kTfLiteUInt8); + TF_LITE_ENSURE(context, input_type == kTfLiteFloat32 || + input_type == kTfLiteUInt8 || + input_type == kTfLiteInt16); for (int i = 0; i < NumOutputs(node); ++i) { GetOutput(context, node, i)->type = input_type; } @@ -108,25 +109,24 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { if (axis_value < 0) { axis_value += NumDimensions(op_context.input); } - axis_value = RemapDim(NumDimensions(op_context.input), axis_value); // TODO(ahentz): Our usage of VectorOfTensors could be optimized by // calculating it in Prepare, unless we defer shape calculation. // TODO(ahentz): We can improve the optimized_ops version to handle other // cases too. 
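The TF_LITE_SPLIT macro that follows (in its rewritten form) routes axis 0 to optimized_ops::Split and every other axis to the reference kernel; axis 0 is the case where each output is simply a contiguous block of the row-major input buffer. A standalone sketch of that axis-0 case, with hypothetical names and plain vectors:

#include <iostream>
#include <vector>

// Standalone sketch (not the TFLite implementation): split a flattened
// row-major tensor evenly along its outermost dimension.
std::vector<std::vector<float>> SplitAxis0(const std::vector<float>& input,
                                           int num_splits) {
  std::vector<std::vector<float>> outputs(num_splits);
  const auto chunk = input.size() / num_splits;
  for (int i = 0; i < num_splits; ++i) {
    outputs[i].assign(input.begin() + i * chunk,
                      input.begin() + (i + 1) * chunk);
  }
  return outputs;
}

int main() {
  // A [2, 3] tensor split into two [1, 3] outputs.
  for (const auto& out : SplitAxis0({1, 2, 3, 4, 5, 6}, 2)) {
    for (float v : out) std::cout << v << " ";
    std::cout << "\n";
  }
  return 0;
}

Splitting along an inner axis produces non-contiguous slices and needs strided copies, which is left to the reference path.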
-#define TF_LITE_SPLIT(scalar) \ - VectorOfTensors<scalar> all_outputs(*context, *node->outputs); \ - if (axis_value == NumDimensions(op_context.input)) { \ - optimized_ops::TensorFlowSplit<FusedActivationFunctionType::kNone, \ - scalar>( \ - GetTensorData<scalar>(op_context.input), \ - GetTensorDims(op_context.input), NumOutputs(node), all_outputs.data(), \ - all_outputs.dims()); \ - } else { \ - reference_ops::TensorFlowSplit<scalar>( \ - GetTensorData<scalar>(op_context.input), \ - GetTensorDims(op_context.input), axis_value, NumOutputs(node), \ - all_outputs.data(), all_outputs.dims()); \ +#define TF_LITE_SPLIT(scalar) \ + VectorOfTensors<scalar> all_outputs(*context, *node->outputs); \ + tflite::SplitParams op_params; \ + op_params.num_split = NumOutputs(node); \ + op_params.axis = axis_value; \ + if (axis_value == 0) { \ + optimized_ops::Split(op_params, GetTensorShape(op_context.input), \ + GetTensorData<scalar>(op_context.input), \ + all_outputs.shapes(), all_outputs.data()); \ + } else { \ + reference_ops::Split(op_params, GetTensorShape(op_context.input), \ + GetTensorData<scalar>(op_context.input), \ + all_outputs.shapes(), all_outputs.data()); \ } switch (op_context.input->type) { case kTfLiteFloat32: { @@ -137,9 +137,15 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { TF_LITE_SPLIT(uint8_t); break; } + case kTfLiteInt16: { + TF_LITE_SPLIT(int16_t); + break; + } default: - context->ReportError(context, - "Only float32 and uint8 are currently supported."); + context->ReportError( + context, + "Only float32, uint8 and int16 are currently supported, got %d.", + op_context.input->type); return kTfLiteError; } #undef TF_LITE_SPLIT diff --git a/tensorflow/contrib/lite/kernels/squeeze.cc b/tensorflow/contrib/lite/kernels/squeeze.cc index 09a5662fd9..080c51cd18 100644 --- a/tensorflow/contrib/lite/kernels/squeeze.cc +++ b/tensorflow/contrib/lite/kernels/squeeze.cc @@ -14,8 +14,8 @@ limitations under the License. ==============================================================================*/ #include <string.h> #include <vector> -#include "tensorflow/contrib/lite/builtin_op_data.h" -#include "tensorflow/contrib/lite/context.h" +#include "tensorflow/contrib/lite/c/builtin_op_data.h" +#include "tensorflow/contrib/lite/c/c_api_internal.h" #include "tensorflow/contrib/lite/kernels/internal/tensor.h" #include "tensorflow/contrib/lite/kernels/kernel_util.h" #include "tensorflow/contrib/lite/kernels/op_macros.h" diff --git a/tensorflow/contrib/lite/kernels/strided_slice.cc b/tensorflow/contrib/lite/kernels/strided_slice.cc index 9417be32b3..06b36dd196 100644 --- a/tensorflow/contrib/lite/kernels/strided_slice.cc +++ b/tensorflow/contrib/lite/kernels/strided_slice.cc @@ -15,8 +15,8 @@ limitations under the License. 
#include <string.h> #include <cmath> #include <vector> -#include "tensorflow/contrib/lite/builtin_op_data.h" -#include "tensorflow/contrib/lite/context.h" +#include "tensorflow/contrib/lite/c/builtin_op_data.h" +#include "tensorflow/contrib/lite/c/c_api_internal.h" #include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h" #include "tensorflow/contrib/lite/kernels/internal/tensor.h" #include "tensorflow/contrib/lite/kernels/kernel_util.h" @@ -57,17 +57,6 @@ struct StridedSliceContext { int dims; }; -// Reverse order of bits in the mask to match the expected order in kernel -inline int ReverseMaskBits(int mask, int num_dimensions) { - int out = 0; - for (int dim = 0; dim < num_dimensions; dim++) { - out <<= 1; - out += (mask & 1); - mask >>= 1; - } - return out; -} - // This Op only supports 1-4D cases and since we use the reference 4D // implementation, the 1-3D tensors are mapped to 4D. const int kMaxDim = 4; @@ -121,10 +110,19 @@ TfLiteStatus ResizeOutputTensor(TfLiteContext* context, int32_t begin = GetBeginValueAtIndex(op_context, idx); int32_t end = GetEndValueAtIndex(op_context, idx); + // When shrinking an axis, the end position does not matter (and can be + // incorrect when negative indexing is used, see Issue #19260). Always use + // begin + 1 to generate a length 1 slice, since begin has + // already been adjusted for negative indices by GetBeginValueAtIndex. + const bool shrink_axis = op_context->params->shrink_axis_mask & (1 << idx); + if (shrink_axis) { + end = begin + 1; + } + // This is valid for both positive and negative strides int32_t dim_shape = ceil((end - begin) / static_cast<float>(stride)); dim_shape = dim_shape < 0 ? 0 : dim_shape; - if (!(op_context->params->shrink_axis_mask & (1 << idx))) { + if (!shrink_axis) { output_shape_vector.push_back(dim_shape); } } @@ -189,28 +187,31 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { std::vector<int32_t> stops; std::vector<int32_t> strides; - for (int idx = op_context.dims - 1; idx >= 0; --idx) { - starts.emplace_back(GetTensorData<int32_t>(op_context.begin)[idx]); - stops.emplace_back(GetTensorData<int32_t>(op_context.end)[idx]); - strides.emplace_back(GetTensorData<int32_t>(op_context.strides)[idx]); - } - for (int i = op_context.dims; i < kMaxDim; i++) { starts.emplace_back(0); stops.emplace_back(1); strides.emplace_back(1); } - int begin_mask = - ReverseMaskBits(op_context.params->begin_mask, op_context.dims); - int end_mask = ReverseMaskBits(op_context.params->end_mask, op_context.dims); + for (int idx = 0; idx < op_context.dims; ++idx) { + starts.emplace_back(GetTensorData<int32_t>(op_context.begin)[idx]); + stops.emplace_back(GetTensorData<int32_t>(op_context.end)[idx]); + strides.emplace_back(GetTensorData<int32_t>(op_context.strides)[idx]); + } + + int begin_mask = op_context.params->begin_mask << (4 - op_context.dims); + int end_mask = op_context.params->end_mask << (4 - op_context.dims); + int shrink_axis_mask = op_context.params->shrink_axis_mask + << (4 - op_context.dims); + TF_LITE_ENSURE_EQ(context, starts.size(), 4); + auto op_params = ::tflite::strided_slice::BuildStridedSliceParams( + begin_mask, end_mask, shrink_axis_mask, starts, stops, strides); #define TF_LITE_STRIDED_SLICE(kernel_type, data_type) \ - kernel_type::StridedSlice(GetTensorData<data_type>(op_context.input), \ - GetTensorDims(op_context.input), begin_mask, \ - end_mask, starts, stops, strides, \ - GetTensorData<data_type>(op_context.output), \ - GetTensorDims(op_context.output)) + 
kernel_type::StridedSlice(op_params, GetTensorShape(op_context.input), \ + GetTensorData<data_type>(op_context.input), \ + GetTensorShape(op_context.output), \ + GetTensorData<data_type>(op_context.output)) switch (op_context.input->type) { case kTfLiteFloat32: @@ -235,8 +236,9 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { break; default: context->ReportError(context, - "Type is currently not supported " - "by StridedSlice."); + "Type %d is currently not supported " + "by StridedSlice.", + op_context.input->type); return kTfLiteError; } #undef TF_LITE_STRIDED_SLICE diff --git a/tensorflow/contrib/lite/kernels/strided_slice_test.cc b/tensorflow/contrib/lite/kernels/strided_slice_test.cc index cc39179bc7..c5d4f9affb 100644 --- a/tensorflow/contrib/lite/kernels/strided_slice_test.cc +++ b/tensorflow/contrib/lite/kernels/strided_slice_test.cc @@ -21,7 +21,6 @@ limitations under the License. namespace tflite { namespace { -using ::int32; using ::testing::ElementsAreArray; template <typename input_type = float, @@ -50,14 +49,14 @@ class StridedSliceOpModel : public SingleOpModel { void SetInput(std::initializer_list<input_type> data) { PopulateTensor<input_type>(input_, data); } - void SetBegin(std::initializer_list<int32> data) { - PopulateTensor<int32>(begin_, data); + void SetBegin(std::initializer_list<int32_t> data) { + PopulateTensor<int32_t>(begin_, data); } - void SetEnd(std::initializer_list<int32> data) { - PopulateTensor<int32>(end_, data); + void SetEnd(std::initializer_list<int32_t> data) { + PopulateTensor<int32_t>(end_, data); } - void SetStrides(std::initializer_list<int32> data) { - PopulateTensor<int32>(strides_, data); + void SetStrides(std::initializer_list<int32_t> data) { + PopulateTensor<int32_t>(strides_, data); } std::vector<input_type> GetOutput() { @@ -384,6 +383,45 @@ TEST(StridedSliceOpTest, In1D_ShrinkAxisMask1) { EXPECT_THAT(m.GetOutput(), ElementsAreArray({2})); } +TEST(StridedSliceOpTest, In1D_ShrinkAxisMask1_NegativeSlice) { + // This is equivalent to tf.range(4)[-1]. + StridedSliceOpModel<> m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 1); + m.SetInput({0, 1, 2, 3}); + m.SetBegin({-1}); + m.SetEnd({0}); + m.SetStrides({1}); + + m.Invoke(); + EXPECT_TRUE(m.GetOutputShape().empty()); + EXPECT_THAT(m.GetOutput(), ElementsAreArray({3})); +} + +TEST(StridedSliceOpTest, In2D_ShrinkAxis3_NegativeSlice) { + // This is equivalent to tf.range(4)[:, tf.newaxis][-2, -1]. + StridedSliceOpModel<> m({4, 1}, {2}, {2}, {2}, 0, 0, 0, 0, 3); + m.SetInput({0, 1, 2, 3}); + m.SetBegin({-2, -1}); + m.SetEnd({-1, 0}); + m.SetStrides({1, 1}); + + m.Invoke(); + EXPECT_TRUE(m.GetOutputShape().empty()); + EXPECT_THAT(m.GetOutput(), ElementsAreArray({2})); +} + +TEST(StridedSliceOpTest, In2D_ShrinkAxis2_BeginEndAxis1_NegativeSlice) { + // This is equivalent to tf.range(4)[:, tf.newaxis][:, -1]. 
+ StridedSliceOpModel<> m({4, 1}, {2}, {2}, {2}, 1, 1, 0, 0, 2); + m.SetInput({0, 1, 2, 3}); + m.SetBegin({0, -1}); + m.SetEnd({0, 0}); + m.SetStrides({1, 1}); + + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({4})); + EXPECT_THAT(m.GetOutput(), ElementsAreArray({0, 1, 2, 3})); +} + TEST(StridedSliceOpTest, In1D_BeginMaskShrinkAxisMask1) { StridedSliceOpModel<> m({4}, {1}, {1}, {1}, 1, 0, 0, 0, 1); m.SetInput({1, 2, 3, 4}); @@ -395,17 +433,6 @@ TEST(StridedSliceOpTest, In1D_BeginMaskShrinkAxisMask1) { EXPECT_THAT(m.GetOutput(), ElementsAreArray({1})); } -TEST(StridedSliceOpTest, In1D_NegativeBeginNegativeStrideShrinkAxisMask1) { - StridedSliceOpModel<> m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 1); - m.SetInput({1, 2, 3, 4}); - m.SetBegin({-2}); - m.SetEnd({-3}); - m.SetStrides({-1}); - m.Invoke(); - EXPECT_TRUE(m.GetOutputShape().empty()); - EXPECT_THAT(m.GetOutput(), ElementsAreArray({3})); -} - TEST(StridedSliceOpTest, In2D_ShrinkAxisMask1) { StridedSliceOpModel<> m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 1); m.SetInput({1, 2, 3, 4, 5, 6}); @@ -538,7 +565,7 @@ TEST(StridedSliceOpTest, RunTwice) { } TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis1Uint8) { - StridedSliceOpModel<uint8, TensorType_UINT8> m({2, 3, 2}, {3}, {3}, {3}, 0, 0, + StridedSliceOpModel<uint8_t, TensorType_UINT8> m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 1); m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); m.SetBegin({0, 0, 0}); diff --git a/tensorflow/contrib/lite/kernels/sub.cc b/tensorflow/contrib/lite/kernels/sub.cc index 9531ecba98..1be0c83f17 100644 --- a/tensorflow/contrib/lite/kernels/sub.cc +++ b/tensorflow/contrib/lite/kernels/sub.cc @@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#include "tensorflow/contrib/lite/builtin_op_data.h" -#include "tensorflow/contrib/lite/context.h" +#include "tensorflow/contrib/lite/c/builtin_op_data.h" +#include "tensorflow/contrib/lite/c/c_api_internal.h" #include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h" #include "tensorflow/contrib/lite/kernels/internal/quantization_util.h" #include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h" @@ -78,29 +78,47 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { } template <KernelType kernel_type> -void EvalFloat(TfLiteContext* context, TfLiteNode* node, - TfLiteSubParams* params, const OpData* data, - const TfLiteTensor* input1, const TfLiteTensor* input2, - TfLiteTensor* output) { - float output_activation_min, output_activation_max; - CalculateActivationRangeFloat(params->activation, &output_activation_min, - &output_activation_max); -#define TF_LITE_SUB(type, opname) \ - type::opname(GetTensorData<float>(input1), GetTensorDims(input1), \ - GetTensorData<float>(input2), GetTensorDims(input2), \ - output_activation_min, output_activation_max, \ - GetTensorData<float>(output), GetTensorDims(output)) - if (kernel_type == kReference) { - if (data->requires_broadcast) { - TF_LITE_SUB(reference_ops, BroadcastSub); +void EvalSub(TfLiteContext* context, TfLiteNode* node, TfLiteSubParams* params, + const OpData* data, const TfLiteTensor* input1, + const TfLiteTensor* input2, TfLiteTensor* output) { +#define TF_LITE_SUB(type, opname, data_type) \ + data_type output_activation_min, output_activation_max; \ + CalculateActivationRange(params->activation, &output_activation_min, \ + &output_activation_max); \ + tflite::ArithmeticParams op_params; \ + SetActivationParams(output_activation_min, output_activation_max, \ + &op_params); \ + type::opname(op_params, GetTensorShape(input1), \ + GetTensorData<data_type>(input1), GetTensorShape(input2), \ + GetTensorData<data_type>(input2), GetTensorShape(output), \ + GetTensorData<data_type>(output)) + if (output->type == kTfLiteInt32) { + if (kernel_type == kReference) { + if (data->requires_broadcast) { + TF_LITE_SUB(reference_ops, BroadcastSub4DSlow, int32_t); + } else { + TF_LITE_SUB(reference_ops, SubWithActivation, int32_t); + } } else { - TF_LITE_SUB(reference_ops, Sub); + if (data->requires_broadcast) { + TF_LITE_SUB(optimized_ops, BroadcastSub4DSlow, int32_t); + } else { + TF_LITE_SUB(optimized_ops, SubWithActivation, int32_t); + } } - } else { - if (data->requires_broadcast) { - TF_LITE_SUB(optimized_ops, BroadcastSub); + } else if (output->type == kTfLiteFloat32) { + if (kernel_type == kReference) { + if (data->requires_broadcast) { + TF_LITE_SUB(reference_ops, BroadcastSub4DSlow, float); + } else { + TF_LITE_SUB(reference_ops, SubWithActivation, float); + } } else { - TF_LITE_SUB(optimized_ops, Sub); + if (data->requires_broadcast) { + TF_LITE_SUB(optimized_ops, BroadcastSub4DSlow, float); + } else { + TF_LITE_SUB(optimized_ops, SubWithActivation, float); + } } } #undef TF_LITE_SUB @@ -126,35 +144,45 @@ void EvalQuantized(TfLiteContext* context, TfLiteNode* node, int32 input1_multiplier; int input1_shift; - QuantizeMultiplierSmallerThanOne(real_input1_multiplier, &input1_multiplier, - &input1_shift); + QuantizeMultiplierSmallerThanOneExp(real_input1_multiplier, + &input1_multiplier, &input1_shift); int32 input2_multiplier; int input2_shift; - QuantizeMultiplierSmallerThanOne(real_input2_multiplier, 
&input2_multiplier, - &input2_shift); + QuantizeMultiplierSmallerThanOneExp(real_input2_multiplier, + &input2_multiplier, &input2_shift); int32 output_multiplier; int output_shift; - QuantizeMultiplierSmallerThanOne(real_output_multiplier, &output_multiplier, - &output_shift); + QuantizeMultiplierSmallerThanOneExp(real_output_multiplier, + &output_multiplier, &output_shift); int32 output_activation_min, output_activation_max; CalculateActivationRangeUint8(params->activation, output, &output_activation_min, &output_activation_max); -#define TF_LITE_SUB(type, opname) \ - type::opname(left_shift, GetTensorData<uint8_t>(input1), \ - GetTensorDims(input1), input1_offset, input1_multiplier, \ - input1_shift, GetTensorData<uint8_t>(input2), \ - GetTensorDims(input2), input2_offset, input2_multiplier, \ - input2_shift, output_offset, output_multiplier, output_shift, \ - output_activation_min, output_activation_max, \ - GetTensorData<uint8_t>(output), GetTensorDims(output)); +#define TF_LITE_SUB(type, opname) \ + tflite::ArithmeticParams op_params; \ + op_params.left_shift = left_shift; \ + op_params.input1_offset = input1_offset; \ + op_params.input1_multiplier = input1_multiplier; \ + op_params.input1_shift = input1_shift; \ + op_params.input2_offset = input2_offset; \ + op_params.input2_multiplier = input2_multiplier; \ + op_params.input2_shift = input2_shift; \ + op_params.output_offset = output_offset; \ + op_params.output_multiplier = output_multiplier; \ + op_params.output_shift = output_shift; \ + SetActivationParams(output_activation_min, output_activation_max, \ + &op_params); \ + type::opname(op_params, GetTensorShape(input1), \ + GetTensorData<uint8_t>(input1), GetTensorShape(input2), \ + GetTensorData<uint8_t>(input2), GetTensorShape(output), \ + GetTensorData<uint8_t>(output)) // The quantized version of Sub doesn't support activations, so we // always use BroadcastSub. 
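The offsets, multipliers and shifts packed into op_params above encode the usual uint8 requantization scheme: each operand is mapped back to real values through its scale and zero point, the difference is taken, and the result is rescaled into the output's quantized range and clamped to the activation bounds. A simplified float-domain sketch of that arithmetic (a standalone illustration; the kernel itself works in fixed point through the multiplier/shift pairs):

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <iostream>

// Simplified sketch of quantized subtraction: dequantize, subtract,
// requantize, clamp to the fused-activation range.
uint8_t QuantizedSub(uint8_t q1, float scale1, int32_t zero1,
                     uint8_t q2, float scale2, int32_t zero2,
                     float out_scale, int32_t out_zero,
                     int32_t act_min, int32_t act_max) {
  const float real1 = scale1 * (static_cast<int32_t>(q1) - zero1);
  const float real2 = scale2 * (static_cast<int32_t>(q2) - zero2);
  int32_t q_out =
      static_cast<int32_t>(std::round((real1 - real2) / out_scale)) + out_zero;
  q_out = std::min(std::max(q_out, act_min), act_max);
  return static_cast<uint8_t>(q_out);
}

int main() {
  // Inputs and output quantized over [-1, 1]: 0.5 - 0.0 = 0.5, i.e. 192 again.
  std::cout << static_cast<int>(QuantizedSub(192, 1 / 127.5f, 128,
                                             128, 1 / 127.5f, 128,
                                             1 / 127.5f, 128, 0, 255))
            << "\n";
  return 0;
}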
if (kernel_type == kReference) { - TF_LITE_SUB(reference_ops, BroadcastSub); + TF_LITE_SUB(reference_ops, BroadcastSub4DSlow); } else { - TF_LITE_SUB(optimized_ops, BroadcastSub); + TF_LITE_SUB(optimized_ops, BroadcastSub4DSlow); } #undef TF_LITE_SUB } @@ -168,14 +196,16 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { const TfLiteTensor* input2 = GetInput(context, node, kInputTensor2); TfLiteTensor* output = GetOutput(context, node, kOutputTensor); - if (output->type == kTfLiteFloat32) { - EvalFloat<kernel_type>(context, node, params, data, input1, input2, output); + if (output->type == kTfLiteFloat32 || output->type == kTfLiteInt32) { + EvalSub<kernel_type>(context, node, params, data, input1, input2, output); } else if (output->type == kTfLiteUInt8) { EvalQuantized<kernel_type>(context, node, params, data, input1, input2, output); } else { - context->ReportError(context, - "Inputs and outputs not all float|uint8 types."); + context->ReportError( + context, + "output type %d is not supported, requires float|uint8|int32 types.", + output->type); return kTfLiteError; } diff --git a/tensorflow/contrib/lite/kernels/sub_test.cc b/tensorflow/contrib/lite/kernels/sub_test.cc index ff07aeec49..5978c574d3 100644 --- a/tensorflow/contrib/lite/kernels/sub_test.cc +++ b/tensorflow/contrib/lite/kernels/sub_test.cc @@ -52,6 +52,13 @@ class FloatSubOpModel : public BaseSubOpModel { std::vector<float> GetOutput() { return ExtractVector<float>(output_); } }; +class IntegerSubOpModel : public BaseSubOpModel { + public: + using BaseSubOpModel::BaseSubOpModel; + + std::vector<int32_t> GetOutput() { return ExtractVector<int32_t>(output_); } +}; + class QuantizedSubOpModel : public BaseSubOpModel { public: using BaseSubOpModel::BaseSubOpModel; @@ -125,6 +132,57 @@ TEST(FloatSubOpModel, WithBroadcast) { } } +TEST(IntegerSubOpModel, NoActivation) { + IntegerSubOpModel m({TensorType_INT32, {1, 2, 2, 1}}, + {TensorType_INT32, {1, 2, 2, 1}}, {TensorType_INT32, {}}, + ActivationFunctionType_NONE); + m.PopulateTensor<int32_t>(m.input1(), {-20, 2, 7, 8}); + m.PopulateTensor<int32_t>(m.input2(), {1, 2, 3, 5}); + m.Invoke(); + EXPECT_THAT(m.GetOutput(), ElementsAreArray({-21, 0, 4, 3})); +} + +TEST(IntegerSubOpModel, ActivationRELU_N1_TO_1) { + IntegerSubOpModel m({TensorType_INT32, {1, 2, 2, 1}}, + {TensorType_INT32, {1, 2, 2, 1}}, {TensorType_INT32, {}}, + ActivationFunctionType_RELU_N1_TO_1); + m.PopulateTensor<int32_t>(m.input1(), {-20, 2, 7, 8}); + m.PopulateTensor<int32_t>(m.input2(), {1, 2, 3, 5}); + m.Invoke(); + EXPECT_THAT(m.GetOutput(), ElementsAreArray({-1, 0, 1, 1})); +} + +TEST(IntegerSubOpModel, VariousInputShapes) { + std::vector<std::initializer_list<int>> test_shapes = { + {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}}; + for (int i = 0; i < test_shapes.size(); ++i) { + IntegerSubOpModel m({TensorType_INT32, test_shapes[i]}, + {TensorType_INT32, test_shapes[i]}, + {TensorType_INT32, {}}, ActivationFunctionType_NONE); + m.PopulateTensor<int32_t>(m.input1(), {-20, 2, 7, 8, 11, 20}); + m.PopulateTensor<int32_t>(m.input2(), {1, 2, 3, 5, 11, 1}); + m.Invoke(); + EXPECT_THAT(m.GetOutput(), ElementsAreArray({-21, 0, 4, 3, 0, 19})) + << "With shape number " << i; + } +} + +TEST(IntegerSubOpModel, WithBroadcast) { + std::vector<std::initializer_list<int>> test_shapes = { + {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}}; + for (int i = 0; i < test_shapes.size(); ++i) { + IntegerSubOpModel m({TensorType_INT32, test_shapes[i]}, + {TensorType_INT32, {}}, // always a scalar + {TensorType_INT32, {}}, 
ActivationFunctionType_NONE); + m.PopulateTensor<int32_t>(m.input1(), {-20, 2, 7, 8, 11, 20}); + m.PopulateTensor<int32_t>(m.input2(), {1}); + m.Invoke(); + EXPECT_THAT(m.GetOutput(), + ElementsAreArray(ArrayFloatNear({-21, 1, 6, 7, 10, 19}))) + << "With shape number " << i; + } +} + TEST(QuantizedSubOpModel, QuantizedTestsNoActivation) { float kQuantizedTolerance = GetTolerance(-1.0, 1.0); std::vector<std::initializer_list<float>> inputs1 = { diff --git a/tensorflow/contrib/lite/kernels/svdf.cc b/tensorflow/contrib/lite/kernels/svdf.cc index 308860c299..9903fd5c35 100644 --- a/tensorflow/contrib/lite/kernels/svdf.cc +++ b/tensorflow/contrib/lite/kernels/svdf.cc @@ -12,7 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include <unistd.h> + +// SVDF op that compresses a fully connected op via low-rank matrix +// factorization. See https://research.google.com/pubs/archive/43813.pdf for +// details. #include <cassert> #include <cmath> #include <cstdio> @@ -20,8 +23,8 @@ limitations under the License. #include <iostream> #include <limits> -#include "tensorflow/contrib/lite/builtin_op_data.h" -#include "tensorflow/contrib/lite/context.h" +#include "tensorflow/contrib/lite/c/builtin_op_data.h" +#include "tensorflow/contrib/lite/c/c_api_internal.h" #include "tensorflow/contrib/lite/kernels/activation_functor.h" #include "tensorflow/contrib/lite/kernels/internal/tensor_utils.h" #include "tensorflow/contrib/lite/kernels/kernel_util.h" @@ -32,37 +35,113 @@ namespace ops { namespace builtin { namespace svdf { +namespace { + +struct OpData { + int scratch_tensor_index; + bool float_weights_time_initialized; + + int activation_state_tensor_index; +}; + +static inline void ApplyTimeWeightsBiasAndActivation( + int batch_size, int memory_size, int num_filters, int num_units, int rank, + const TfLiteTensor* weights_time, const TfLiteTensor* bias, + TfLiteFusedActivation activation, TfLiteTensor* activation_state, + TfLiteTensor* scratch, TfLiteTensor* output) { + // Compute matmul(state, weights_time). + // The right most column is used to save temporary output (with the size of + // num_filters). This is achieved by starting at activation_state->data.f, + // and having the stride equal to memory_size. + for (int b = 0; b < batch_size; ++b) { + float* state_ptr_batch = + activation_state->data.f + b * memory_size * num_filters; + float* scratch_ptr_batch = scratch->data.f + b * num_filters; + tensor_utils::BatchVectorBatchVectorDotProduct( + weights_time->data.f, state_ptr_batch, memory_size, num_filters, + scratch_ptr_batch, /*result_stride=*/1); + } + + // Initialize output with bias if provided. + if (bias) { + tensor_utils::VectorBatchVectorAssign(bias->data.f, num_units, batch_size, + output->data.f); + } else { + tensor_utils::ZeroVector(output->data.f, batch_size * num_units); + } + + // Reduction sum. + for (int b = 0; b < batch_size; ++b) { + float* output_ptr_batch = output->data.f + b * num_units; + float* scratch_ptr_batch = scratch->data.f + b * num_filters; + tensor_utils::ReductionSumVector(scratch_ptr_batch, output_ptr_batch, + num_units, rank); + } + + // Apply activation. 
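The loop below finishes the helper by applying the fused activation elementwise; the less obvious step is the reduction just above it, which collapses the num_filters (= rank * num_units) filter responses into num_units outputs by summing each group of rank adjacent values (tensor_utils::ReductionSumVector). A standalone sketch of that reduction, with hypothetical names:

#include <iostream>
#include <vector>

// Standalone sketch of the SVDF rank reduction: every output unit is the sum
// of `rank` adjacent filter responses.
std::vector<float> ReductionSum(const std::vector<float>& filters,
                                int num_units, int rank) {
  std::vector<float> units(num_units, 0.0f);
  for (int u = 0; u < num_units; ++u) {
    for (int r = 0; r < rank; ++r) {
      units[u] += filters[u * rank + r];
    }
  }
  return units;
}

int main() {
  // Four filter responses with rank 2 reduce to two units: {1 + 2, 3 + 4}.
  for (float v : ReductionSum({1, 2, 3, 4}, /*num_units=*/2, /*rank=*/2)) {
    std::cout << v << " ";
  }
  std::cout << "\n";
  return 0;
}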
+ for (int b = 0; b < batch_size; ++b) { + float* output_ptr_batch = output->data.f + b * num_units; + tensor_utils::ApplyActivationToVector(output_ptr_batch, num_units, + activation, output_ptr_batch); + } + + // Left shift the activation_state to make room for next cycle's activation. + // TODO(alanchiao): explore collapsing this into a single loop. + for (int b = 0; b < batch_size; ++b) { + float* state_ptr_batch = + activation_state->data.f + b * memory_size * num_filters; + for (int f = 0; f < num_filters; ++f) { + tensor_utils::VectorShiftLeft(state_ptr_batch, memory_size, + /*shift_value=*/0.0f); + state_ptr_batch += memory_size; + } + } +} + +} // namespace + +// Input tensors. constexpr int kInputTensor = 0; constexpr int kWeightsFeatureTensor = 1; constexpr int kWeightsTimeTensor = 2; constexpr int kBiasTensor = 3; -constexpr int kStateTensor = 0; -constexpr int kOutputTensor = 1; +// This is a variable tensor, and will be modified by this op. +constexpr int kInputActivationStateTensor = 4; + +// Output tensor. +constexpr int kOutputTensor = 0; void* Init(TfLiteContext* context, const char* buffer, size_t length) { - auto* scratch_tensor_index = new int; - context->AddTensors(context, 1, scratch_tensor_index); - return scratch_tensor_index; + auto* op_data = new OpData(); + op_data->float_weights_time_initialized = false; + context->AddTensors(context, /*tensors_to_add=*/4, + &op_data->scratch_tensor_index); + return op_data; } void Free(TfLiteContext* context, void* buffer) { - delete reinterpret_cast<int*>(buffer); + delete reinterpret_cast<OpData*>(buffer); } TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { - auto* params = reinterpret_cast<TfLiteSVDFParams*>(node->builtin_data); - int* scratch_tensor_index = reinterpret_cast<int*>(node->user_data); + const auto* params = reinterpret_cast<TfLiteSVDFParams*>(node->builtin_data); + OpData* op_data = reinterpret_cast<OpData*>(node->user_data); + int scratch_tensor_index = op_data->scratch_tensor_index; // Check we have all the inputs and outputs we need. - TF_LITE_ENSURE_EQ(context, node->inputs->size, 4); - TF_LITE_ENSURE_EQ(context, node->outputs->size, 2); + TF_LITE_ENSURE_EQ(context, node->outputs->size, 1); + TF_LITE_ENSURE_EQ(context, node->inputs->size, 5); + op_data->activation_state_tensor_index = + node->inputs->data[kInputActivationStateTensor]; - TfLiteTensor* input = &context->tensors[node->inputs->data[kInputTensor]]; + const TfLiteTensor* input = GetInput(context, node, kInputTensor); const TfLiteTensor* weights_feature = GetInput(context, node, kWeightsFeatureTensor); const TfLiteTensor* weights_time = GetInput(context, node, kWeightsTimeTensor); + TF_LITE_ENSURE_EQ(context, input->type, kTfLiteFloat32); + // Check all the parameters of tensor match within themselves and match the // input configuration. const int rank = params->rank; @@ -79,22 +158,15 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ASSERT_EQ(bias->dims->data[0], num_units); } - TfLiteTensor* state = GetOutput(context, node, kStateTensor); + TfLiteTensor* activation_state = + &context->tensors[op_data->activation_state_tensor_index]; TfLiteTensor* output = GetOutput(context, node, kOutputTensor); - // Resize state. - // For each batch, the state is a 2-D tensor: memory_size * num_filters - // The left most column is used to save current cycle activation. - // The right most column is used to save temporary output which will be - // reduced to num_units outputs. 
- TfLiteIntArray* state_size_array = TfLiteIntArrayCreate(2); - state_size_array->data[0] = batch_size; - state_size_array->data[1] = memory_size * num_filters; - TF_LITE_ENSURE_OK(context, - context->ResizeTensor(context, state, state_size_array)); - - // Mark state as a persistent tensor. - state->allocation_type = kTfLiteArenaRwPersistent; + // Check the shape of input state tensors. + TF_LITE_ENSURE_EQ(context, NumDimensions(activation_state), 2); + TF_LITE_ENSURE_EQ(context, SizeOfDimension(activation_state, 0), batch_size); + TF_LITE_ENSURE_EQ(context, SizeOfDimension(activation_state, 1), + memory_size * num_filters); // Resize output. TfLiteIntArray* output_size_array = TfLiteIntArrayCreate(2); @@ -103,10 +175,18 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, output, output_size_array)); + // The weights are of consistent type, so it suffices to check one. + const bool is_hybrid_op = + (input->type == kTfLiteFloat32 && weights_feature->type == kTfLiteUInt8); + // Resize scratch. TfLiteIntArrayFree(node->temporaries); - node->temporaries = TfLiteIntArrayCreate(1); - node->temporaries->data[0] = *scratch_tensor_index; + if (is_hybrid_op) { + node->temporaries = TfLiteIntArrayCreate(4); + } else { + node->temporaries = TfLiteIntArrayCreate(1); + } + node->temporaries->data[0] = scratch_tensor_index; TfLiteIntArray* scratch_size_array = TfLiteIntArrayCreate(2); scratch_size_array->data[0] = batch_size; @@ -118,24 +198,56 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, scratch_tensor, scratch_size_array)); - return kTfLiteOk; -} - -TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { - auto* params = reinterpret_cast<TfLiteSVDFParams*>(node->builtin_data); - - const TfLiteTensor* input = GetInput(context, node, kInputTensor); - const TfLiteTensor* weights_feature = - GetInput(context, node, kWeightsFeatureTensor); - const TfLiteTensor* weights_time = - GetInput(context, node, kWeightsTimeTensor); + if (is_hybrid_op) { + // Tell interpreter to allocate temporary tensors to store quantized values + // of input tensors. + node->temporaries->data[1] = scratch_tensor_index + 1; + TfLiteTensor* input_quantized = GetTemporary(context, node, /*index=*/1); + input_quantized->type = kTfLiteUInt8; + input_quantized->allocation_type = kTfLiteArenaRw; + if (!TfLiteIntArrayEqual(input_quantized->dims, input->dims)) { + TfLiteIntArray* input_quantized_size = TfLiteIntArrayCopy(input->dims); + TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, input_quantized, + input_quantized_size)); + } - TfLiteTensor* state = GetOutput(context, node, kStateTensor); - TfLiteTensor* output = GetOutput(context, node, kOutputTensor); - TfLiteTensor* scratch = GetTemporary(context, node, /*index=*/0); + // Tell interpreter to allocate temporary tensors to store scaling factors. 
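These per-batch scaling factors are produced in EvalHybrid by symmetrically quantizing each input row: the row's largest absolute value is mapped to 127, the scale used for that mapping is recorded, and it is then multiplied by the weights' scale so the integer accumulation can be rescaled back to float. A simplified standalone sketch of that quantization step (cf. tensor_utils::SymmetricQuantizeFloats; the helper name here is illustrative):

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <iostream>
#include <vector>

// Standalone sketch of per-row symmetric quantization: map the largest
// absolute value to 127 and report the scale that was used.
std::vector<int8_t> SymmetricQuantize(const std::vector<float>& values,
                                      float* scale) {
  float max_abs = 0.0f;
  for (float v : values) max_abs = std::max(max_abs, std::abs(v));
  if (max_abs == 0.0f) {
    *scale = 1.0f;
    return std::vector<int8_t>(values.size(), 0);
  }
  *scale = max_abs / 127.0f;
  std::vector<int8_t> quantized;
  quantized.reserve(values.size());
  for (float v : values) {
    quantized.push_back(static_cast<int8_t>(std::round(v / *scale)));
  }
  return quantized;
}

int main() {
  float scale = 0.0f;
  // {-1, 0, 1, 2, 3} quantizes to roughly {-42, 0, 42, 85, 127}, scale 3/127.
  auto q = SymmetricQuantize({-1.0f, 0.0f, 1.0f, 2.0f, 3.0f}, &scale);
  std::cout << "scale = " << scale << ", values =";
  for (int v : q) std::cout << " " << v;
  std::cout << "\n";
  return 0;
}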
+ node->temporaries->data[2] = scratch_tensor_index + 2; + TfLiteTensor* scaling_factors = GetTemporary(context, node, /*index=*/2); + scaling_factors->type = kTfLiteFloat32; + scaling_factors->allocation_type = kTfLiteArenaRw; + TfLiteIntArray* scaling_factors_size = TfLiteIntArrayCreate(1); + scaling_factors_size->data[0] = batch_size; + if (!TfLiteIntArrayEqual(scaling_factors->dims, scaling_factors_size)) { + TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, scaling_factors, + scaling_factors_size)); + } - const TfLiteTensor* bias = GetOptionalInputTensor(context, node, kBiasTensor); + // Used to store dequantized weights_time matrix for hybrid computation of + // matmul(activation_state, weights_time), which occurs in floating point. + node->temporaries->data[3] = scratch_tensor_index + 3; + TfLiteTensor* float_weights_time = GetTemporary(context, node, /*index=*/3); + float_weights_time->type = kTfLiteFloat32; + // Persistent so that we can compute the dequantized weights only once. + float_weights_time->allocation_type = kTfLiteArenaRwPersistent; + if (!TfLiteIntArrayEqual(float_weights_time->dims, weights_time->dims)) { + TfLiteIntArray* float_weights_time_size = + TfLiteIntArrayCopy(weights_time->dims); + TF_LITE_ENSURE_OK(context, + context->ResizeTensor(context, float_weights_time, + float_weights_time_size)); + } + } + return kTfLiteOk; +} +TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node, + const TfLiteTensor* input, + const TfLiteTensor* weights_feature, + const TfLiteTensor* weights_time, + const TfLiteTensor* bias, const TfLiteSVDFParams* params, + TfLiteTensor* scratch, TfLiteTensor* state, + TfLiteTensor* output) { const int rank = params->rank; const int batch_size = input->dims->data[0]; const int input_size = input->dims->data[1]; @@ -144,69 +256,156 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { const int memory_size = weights_time->dims->data[1]; // Clear the activation (state left most column). - // TODO(ghodrat): Add a test which initialize state with invalid values in - // left most column and make sure it passes. - for (int b = 0; b < batch_size; b++) { + // TODO(ghodrat): Add a test which initialize activation_state with invalid + // values in left most column and make sure it passes. + for (int b = 0; b < batch_size; ++b) { float* state_ptr_batch = state->data.f + b * memory_size * num_filters; - for (int c = 0; c < num_filters; c++) { + for (int c = 0; c < num_filters; ++c) { float* state_ptr = state_ptr_batch + c * memory_size; - state_ptr[memory_size - 1] = 0.0; + state_ptr[memory_size - 1] = 0.0f; } } // Compute conv1d(inputs, weights_feature). - // The state left most column is used to save current cycle activation. This + // The state right most column is used to save current cycle activation. This // is achieved by starting at state->data.f[memory_size - 1] and having the // stride equal to memory_size. tensor_utils::MatrixBatchVectorMultiplyAccumulate( weights_feature->data.f, num_filters, input_size, input->data.f, batch_size, &state->data.f[memory_size - 1], memory_size); - // Compute matmul(state, weights_time). - // The right most column is used to save temporary output (with the size of - // num_filters). This is achieved by starting at state->data.f and having the - // stride equal to memory_size. 
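The removed loops here and below were folded into ApplyTimeWeightsBiasAndActivation; the per-filter state handling they describe is unchanged: each filter keeps a window of its last memory_size activations, the newest value is written into the right-most slot, and after the time filtering the window is shifted left by one to make room for the next step. A standalone sketch of that window update, with hypothetical names:

#include <cstddef>
#include <iostream>
#include <vector>

// Standalone sketch of one SVDF state-window update for a single filter:
// write the newest activation into the right-most slot, then shift left so
// the oldest value falls off (cf. tensor_utils::VectorShiftLeft).
void UpdateStateWindow(std::vector<float>* window, float new_activation) {
  window->back() = new_activation;  // current cycle's activation
  for (std::size_t i = 0; i + 1 < window->size(); ++i) {
    (*window)[i] = (*window)[i + 1];
  }
  window->back() = 0.0f;  // leave room for the next cycle
}

int main() {
  std::vector<float> window(4, 0.0f);  // memory_size = 4
  UpdateStateWindow(&window, 1.0f);
  UpdateStateWindow(&window, 2.0f);
  for (float v : window) std::cout << v << " ";  // prints: 0 1 2 0
  std::cout << "\n";
  return 0;
}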
- for (int b = 0; b < batch_size; b++) { + ApplyTimeWeightsBiasAndActivation(batch_size, memory_size, num_filters, + num_units, rank, weights_time, bias, + params->activation, state, scratch, output); + return kTfLiteOk; +} + +TfLiteStatus EvalHybrid( + TfLiteContext* context, TfLiteNode* node, const TfLiteTensor* input, + const TfLiteTensor* weights_feature, const TfLiteTensor* weights_time, + const TfLiteTensor* bias, const TfLiteSVDFParams* params, + TfLiteTensor* scratch, TfLiteTensor* scaling_factors, + TfLiteTensor* input_quantized, TfLiteTensor* state, TfLiteTensor* output) { + const int rank = params->rank; + const int batch_size = input->dims->data[0]; + const int input_size = input->dims->data[1]; + const int num_filters = weights_feature->dims->data[0]; + const int num_units = num_filters / rank; + const int memory_size = weights_time->dims->data[1]; + + // Initialize the pointer to input. + const float* input_ptr_batch = input->data.f; + + // Initialize the pointer to storage for quantized values and + // scaling factors. + int8_t* quantized_input_ptr_batch = + reinterpret_cast<int8_t*>(input_quantized->data.uint8); + + float* scaling_factors_ptr = scaling_factors->data.f; + + // Other initializations. + const int8_t* weights_feature_ptr = + reinterpret_cast<int8_t*>(weights_feature->data.uint8); + const float weights_feature_scale = weights_feature->params.scale; + + // Clear the activation (state left most column). + // TODO(ghodrat): Add a test which initialize state with invalid values in + // the left most column and make sure it passes. + for (int b = 0; b < batch_size; ++b) { float* state_ptr_batch = state->data.f + b * memory_size * num_filters; - float* scratch_ptr_batch = scratch->data.f + b * num_filters; - tensor_utils::BatchVectorBatchVectorDotProduct( - weights_time->data.f, state_ptr_batch, memory_size, num_filters, - scratch_ptr_batch, /*result_stride=*/1); + for (int c = 0; c < num_filters; ++c) { + float* state_ptr = state_ptr_batch + c * memory_size; + state_ptr[memory_size - 1] = 0.0; + } } - // Initialize output with bias if provided. - if (bias) { - tensor_utils::VectorBatchVectorAssign(bias->data.f, num_units, batch_size, - output->data.f); - } else { - tensor_utils::ZeroVector(output->data.f, batch_size * num_units); - } + if (!tensor_utils::IsZeroVector(input_ptr_batch, batch_size * input_size)) { + // Quantize input from float to int8. + float unused_min, unused_max; + for (int b = 0; b < batch_size; ++b) { + const int offset = b * input_size; + tensor_utils::SymmetricQuantizeFloats( + input_ptr_batch + offset, input_size, + quantized_input_ptr_batch + offset, &unused_min, &unused_max, + &scaling_factors_ptr[b]); + scaling_factors_ptr[b] *= weights_feature_scale; + } - // Reduction sum - for (int b = 0; b < batch_size; b++) { - float* output_ptr_batch = output->data.f + b * num_units; - float* scratch_ptr_batch = scratch->data.f + b * num_filters; - tensor_utils::ReductionSumVector(scratch_ptr_batch, output_ptr_batch, - num_units, rank); + // Compute conv1d(inputs, weights_feature). + // The rightmost column of state is used to save the current cycle + // activation. + // This is achieved by starting at state->data.f[memory_size - 1] + // and having the stride equal to memory_size. + tensor_utils::MatrixBatchVectorMultiplyAccumulate( + weights_feature_ptr, num_filters, input_size, quantized_input_ptr_batch, + scaling_factors_ptr, batch_size, &state->data.f[memory_size - 1], + memory_size); } - // Apply activation. 
- for (int b = 0; b < batch_size; b++) { - float* output_ptr_batch = output->data.f + b * num_units; - tensor_utils::ApplyActivationToVector(output_ptr_batch, num_units, - params->activation, output_ptr_batch); - } + // TODO(alanchiao): can optimize hybrid case ~5% by unrolling loop in applying + // time weights so that the inner loop multiplies eight elements at a time. + ApplyTimeWeightsBiasAndActivation(batch_size, memory_size, num_filters, + num_units, rank, weights_time, bias, + params->activation, state, scratch, output); + return kTfLiteOk; +} - // Right shift the state. - for (int b = 0; b < batch_size; b++) { - float* state_ptr_batch = state->data.f + b * memory_size * num_filters; - for (int f = 0; f < num_filters; f++) { - tensor_utils::VectorShiftLeft(state_ptr_batch, memory_size, - /*shift_value=*/0.0); - state_ptr_batch += memory_size; +TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { + auto* params = reinterpret_cast<TfLiteSVDFParams*>(node->builtin_data); + OpData* op_data = reinterpret_cast<OpData*>(node->user_data); + + const TfLiteTensor* input = GetInput(context, node, kInputTensor); + const TfLiteTensor* weights_feature = + GetInput(context, node, kWeightsFeatureTensor); + const TfLiteTensor* weights_time = + GetInput(context, node, kWeightsTimeTensor); + const TfLiteTensor* bias = GetOptionalInputTensor(context, node, kBiasTensor); + + TfLiteTensor* scratch = GetTemporary(context, node, /*index=*/0); + + TfLiteTensor* activation_state = + &context->tensors[op_data->activation_state_tensor_index]; + TfLiteTensor* output = GetOutput(context, node, kOutputTensor); + + switch (weights_feature->type) { + case kTfLiteFloat32: { + return EvalFloat(context, node, input, weights_feature, weights_time, + bias, params, scratch, activation_state, output); + break; } + case kTfLiteUInt8: { + TfLiteTensor* input_quantized = GetTemporary(context, node, /*index=*/1); + TfLiteTensor* scaling_factors = GetTemporary(context, node, /*index=*/2); + TfLiteTensor* float_weights_time = + GetTemporary(context, node, /*index=*/3); + + // Dequantize weights time. + // TODO(alanchiao): this dequantization initialization only needs to + // happen once per model and should theoretically be placed in either Init + // or Prepare. However, TFLite doesn't allocate float_weights_time until + // the Eval function. + // TODO(alanchiao): refactor logic out into dequantize function. 
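The block that follows performs that dequantization inline, recovering each float weight as the int8 value times the tensor's scale, and caches the result so the conversion only happens once per model. A standalone sketch of what such a dequantize helper might look like (hypothetical name and signature, not an existing TFLite function):

#include <cstdint>
#include <iostream>
#include <vector>

// Standalone sketch of symmetric dequantization: value = quantized * scale.
std::vector<float> DequantizeSymmetric(const std::vector<int8_t>& quantized,
                                       float scale) {
  std::vector<float> result;
  result.reserve(quantized.size());
  for (int8_t q : quantized) result.push_back(q * scale);
  return result;
}

int main() {
  // With scale 3/127, {-42, 0, 42, 85, 127} maps back to roughly {-1, 0, 1, 2, 3}.
  for (float v : DequantizeSymmetric({-42, 0, 42, 85, 127}, 3.0f / 127.0f)) {
    std::cout << v << " ";
  }
  std::cout << "\n";
  return 0;
}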
+ if (!op_data->float_weights_time_initialized) { + const float dequantization_scale = weights_time->params.scale; + const int8_t* weights_time_ptr = + reinterpret_cast<int8_t*>(weights_time->data.uint8); + for (int i = 0; i < NumElements(float_weights_time); ++i) { + float_weights_time->data.f[i] = + weights_time_ptr[i] * dequantization_scale; + } + op_data->float_weights_time_initialized = true; + } + return EvalHybrid(context, node, input, weights_feature, + float_weights_time, bias, params, scratch, + scaling_factors, input_quantized, activation_state, + output); + break; + } + default: + context->ReportError(context, "Type %d not currently supported.", + weights_feature->type); + return kTfLiteError; } - return kTfLiteOk; } } // namespace svdf diff --git a/tensorflow/contrib/lite/kernels/svdf_test.cc b/tensorflow/contrib/lite/kernels/svdf_test.cc index 0f166dc69b..6d60dc63f4 100644 --- a/tensorflow/contrib/lite/kernels/svdf_test.cc +++ b/tensorflow/contrib/lite/kernels/svdf_test.cc @@ -126,28 +126,35 @@ static float svdf_golden_output_rank_2[] = { }; // Derived class of SingleOpModel, which is used to test SVDF TFLite op. -class SVDFOpModel : public SingleOpModel { +class BaseSVDFOpModel : public SingleOpModel { public: - SVDFOpModel(int batches, int units, int input_size, int memory_size, int rank) + BaseSVDFOpModel(int batches, int units, int input_size, int memory_size, + int rank, + TensorType weights_feature_type = TensorType_FLOAT32, + TensorType weights_time_type = TensorType_FLOAT32) : batches_(batches), units_(units), input_size_(input_size), memory_size_(memory_size), rank_(rank) { input_ = AddInput(TensorType_FLOAT32); - weights_feature_ = AddInput(TensorType_FLOAT32); - weights_time_ = AddInput(TensorType_FLOAT32); + weights_feature_ = AddInput(weights_feature_type); + weights_time_ = AddInput(weights_time_type); bias_ = AddNullInput(); - state_ = AddOutput(TensorType_FLOAT32); + const int num_filters = units * rank; + activation_state_ = AddInput( + TensorData{TensorType_FLOAT32, {batches, memory_size * num_filters}}, + /*is_variable=*/true); output_ = AddOutput(TensorType_FLOAT32); SetBuiltinOp( BuiltinOperator_SVDF, BuiltinOptions_SVDFOptions, CreateSVDFOptions(builder_, rank, ActivationFunctionType_NONE).Union()); BuildInterpreter({ - {batches_, input_size_}, // Input tensor - {units_ * rank, input_size_}, // weights_feature tensor - {units_ * rank, memory_size_}, // weights_time tensor - {units_} // bias tensor + {batches_, input_size_}, // input tensor + {units_ * rank, input_size_}, // weights_feature tensor + {units_ * rank, memory_size_}, // weights_time tensor + {units_}, // bias tensor + {batches, memory_size * num_filters} // activation_state tensor }); } @@ -166,15 +173,6 @@ class SVDFOpModel : public SingleOpModel { PopulateTensor(input_, offset, begin, end); } - // Resets the state of SVDF op by filling it with 0's. - void ResetState() { - const int zero_buffer_size = rank_ * units_ * batches_ * memory_size_; - std::unique_ptr<float[]> zero_buffer(new float[zero_buffer_size]); - memset(zero_buffer.get(), 0, zero_buffer_size * sizeof(float)); - PopulateTensor(state_, 0, zero_buffer.get(), - zero_buffer.get() + zero_buffer_size); - } - // Extracts the output tensor from the SVDF op. 
std::vector<float> GetOutput() { return ExtractVector<float>(output_); } @@ -182,12 +180,12 @@ class SVDFOpModel : public SingleOpModel { int num_units() { return units_; } int num_batches() { return batches_; } - private: + protected: int input_; int weights_feature_; int weights_time_; int bias_; - int state_; + int activation_state_; int output_; int batches_; @@ -197,7 +195,61 @@ class SVDFOpModel : public SingleOpModel { int rank_; }; -TEST(SVDFOpTest, BlackBoxTestRank1) { +class SVDFOpModel : public BaseSVDFOpModel { + public: + using BaseSVDFOpModel::BaseSVDFOpModel; +}; + +class HybridSVDFOpModel : public BaseSVDFOpModel { + public: + HybridSVDFOpModel(int batches, int units, int input_size, int memory_size, + int rank) + : BaseSVDFOpModel(batches, units, input_size, memory_size, rank, + TensorType_UINT8, TensorType_UINT8) {} + + void SetWeightsFeature(std::initializer_list<float> f) { + SymmetricQuantizeAndPopulate(weights_feature_, f); + } + + void SetWeightsTime(std::initializer_list<float> f) { + SymmetricQuantizeAndPopulate(weights_time_, f); + } +}; + +class SVDFOpTest : public ::testing::Test { + protected: + void VerifyGoldens(float golden_input[], float golden_output[], + int golden_size, BaseSVDFOpModel* svdf, + float tolerance = 1e-5) { + const int svdf_num_batches = svdf->num_batches(); + const int svdf_input_size = svdf->input_size(); + const int svdf_num_units = svdf->num_units(); + const int input_sequence_size = + golden_size / sizeof(float) / (svdf_input_size * svdf_num_batches); + // Going over each input batch, setting the input tensor, invoking the SVDF + // op and checking the output with the expected golden values. + for (int i = 0; i < input_sequence_size; i++) { + float* batch_start = + golden_input + i * svdf_input_size * svdf_num_batches; + float* batch_end = batch_start + svdf_input_size * svdf_num_batches; + svdf->SetInput(0, batch_start, batch_end); + + svdf->Invoke(); + + const float* golden_start = + golden_output + i * svdf_num_units * svdf_num_batches; + const float* golden_end = + golden_start + svdf_num_units * svdf_num_batches; + std::vector<float> expected; + expected.insert(expected.end(), golden_start, golden_end); + + EXPECT_THAT(svdf->GetOutput(), + ElementsAreArray(ArrayFloatNear(expected, tolerance))); + } + } +}; + +TEST_F(SVDFOpTest, BlackBoxTestRank1) { SVDFOpModel svdf(/*batches=*/2, /*units=*/4, /*input_size=*/3, /*memory_size=*/10, /*rank=*/1); svdf.SetWeightsFeature({-0.31930989, -0.36118156, 0.0079667, 0.37613347, @@ -217,32 +269,11 @@ TEST(SVDFOpTest, BlackBoxTestRank1) { -0.10781813, 0.27201805, 0.14324132, -0.23681851, -0.27115166, -0.01580888, -0.14943552, 0.15465137, 0.09784451, -0.0337657}); - svdf.ResetState(); - const int svdf_num_batches = svdf.num_batches(); - const int svdf_input_size = svdf.input_size(); - const int svdf_num_units = svdf.num_units(); - const int input_sequence_size = - sizeof(svdf_input) / sizeof(float) / (svdf_input_size * svdf_num_batches); - // Going over each input batch, setting the input tensor, invoking the SVDF op - // and checking the output with the expected golden values. 
- for (int i = 0; i < input_sequence_size; i++) { - float* batch_start = svdf_input + i * svdf_input_size * svdf_num_batches; - float* batch_end = batch_start + svdf_input_size * svdf_num_batches; - svdf.SetInput(0, batch_start, batch_end); - - svdf.Invoke(); - - float* golden_start = - svdf_golden_output_rank_1 + i * svdf_num_units * svdf_num_batches; - float* golden_end = golden_start + svdf_num_units * svdf_num_batches; - std::vector<float> expected; - expected.insert(expected.end(), golden_start, golden_end); - - EXPECT_THAT(svdf.GetOutput(), ElementsAreArray(ArrayFloatNear(expected))); - } + VerifyGoldens(svdf_input, svdf_golden_output_rank_1, sizeof(svdf_input), + &svdf); } -TEST(SVDFOpTest, BlackBoxTestRank2) { +TEST_F(SVDFOpTest, BlackBoxTestRank2) { SVDFOpModel svdf(/*batches=*/2, /*units=*/4, /*input_size=*/3, /*memory_size=*/10, /*rank=*/2); svdf.SetWeightsFeature({-0.31930989, 0.0079667, 0.39296314, 0.37613347, @@ -277,29 +308,73 @@ TEST(SVDFOpTest, BlackBoxTestRank2) { 0.27179423, -0.04710215, 0.31069002, 0.22672787, 0.09580326, 0.08682203, 0.1258215, 0.1851041, 0.29228821, 0.12366763}); - svdf.ResetState(); - const int svdf_num_batches = svdf.num_batches(); - const int svdf_input_size = svdf.input_size(); - const int svdf_num_units = svdf.num_units(); - const int input_sequence_size = - sizeof(svdf_input) / sizeof(float) / (svdf_input_size * svdf_num_batches); - // Going over each input batch, setting the input tensor, invoking the SVDF op - // and checking the output with the expected golden values. - for (int i = 0; i < input_sequence_size; i++) { - float* batch_start = svdf_input + i * svdf_input_size * svdf_num_batches; - float* batch_end = batch_start + svdf_input_size * svdf_num_batches; - svdf.SetInput(0, batch_start, batch_end); - - svdf.Invoke(); - - float* golden_start = - svdf_golden_output_rank_2 + i * svdf_num_units * svdf_num_batches; - float* golden_end = golden_start + svdf_num_units * svdf_num_batches; - std::vector<float> expected; - expected.insert(expected.end(), golden_start, golden_end); - - EXPECT_THAT(svdf.GetOutput(), ElementsAreArray(ArrayFloatNear(expected))); - } + VerifyGoldens(svdf_input, svdf_golden_output_rank_2, sizeof(svdf_input), + &svdf); +} + +TEST_F(SVDFOpTest, BlackBoxTestHybridRank1) { + HybridSVDFOpModel svdf(/*batches=*/2, /*units=*/4, /*input_size=*/3, + /*memory_size=*/10, /*rank=*/1); + svdf.SetWeightsFeature({-0.31930989, -0.36118156, 0.0079667, 0.37613347, + 0.22197971, 0.12416199, 0.27901134, 0.27557442, + 0.3905206, -0.36137494, -0.06634006, -0.10640851}); + + svdf.SetWeightsTime( + {-0.31930989, 0.37613347, 0.27901134, -0.36137494, -0.36118156, + 0.22197971, 0.27557442, -0.06634006, 0.0079667, 0.12416199, + + 0.3905206, -0.10640851, -0.0976817, 0.15294972, 0.39635518, + -0.02702999, 0.39296314, 0.15785322, 0.21931258, 0.31053296, + + -0.36916667, 0.38031587, -0.21580373, 0.27072677, 0.23622236, + 0.34936687, 0.18174365, 0.35907319, -0.17493086, 0.324846, + + -0.10781813, 0.27201805, 0.14324132, -0.23681851, -0.27115166, + -0.01580888, -0.14943552, 0.15465137, 0.09784451, -0.0337657}); + + VerifyGoldens(svdf_input, svdf_golden_output_rank_1, sizeof(svdf_input), + &svdf, + /*tolerance=*/0.002945); +} + +TEST_F(SVDFOpTest, BlackBoxTestHybridRank2) { + HybridSVDFOpModel svdf(/*batches=*/2, /*units=*/4, /*input_size=*/3, + /*memory_size=*/10, /*rank=*/2); + svdf.SetWeightsFeature({-0.31930989, 0.0079667, 0.39296314, 0.37613347, + 0.12416199, 0.15785322, 0.27901134, 0.3905206, + 0.21931258, -0.36137494, -0.10640851, 0.31053296, + 
-0.36118156, -0.0976817, -0.36916667, 0.22197971, + 0.15294972, 0.38031587, 0.27557442, 0.39635518, + -0.21580373, -0.06634006, -0.02702999, 0.27072677}); + + svdf.SetWeightsTime( + {-0.31930989, 0.37613347, 0.27901134, -0.36137494, -0.36118156, + 0.22197971, 0.27557442, -0.06634006, 0.0079667, 0.12416199, + + 0.3905206, -0.10640851, -0.0976817, 0.15294972, 0.39635518, + -0.02702999, 0.39296314, 0.15785322, 0.21931258, 0.31053296, + + -0.36916667, 0.38031587, -0.21580373, 0.27072677, 0.23622236, + 0.34936687, 0.18174365, 0.35907319, -0.17493086, 0.324846, + + -0.10781813, 0.27201805, 0.14324132, -0.23681851, -0.27115166, + -0.01580888, -0.14943552, 0.15465137, 0.09784451, -0.0337657, + + -0.14884081, 0.19931212, -0.36002168, 0.34663299, -0.11405486, + 0.12672701, 0.39463779, -0.07886535, -0.06384811, 0.08249187, + + -0.26816407, -0.19905911, 0.29211238, 0.31264046, -0.28664589, + 0.05698794, 0.11613581, 0.14078894, 0.02187902, -0.21781836, + + -0.15567942, 0.08693647, -0.38256618, 0.36580828, -0.22922277, + -0.0226903, 0.12878349, -0.28122205, -0.10850525, -0.11955214, + + 0.27179423, -0.04710215, 0.31069002, 0.22672787, 0.09580326, + 0.08682203, 0.1258215, 0.1851041, 0.29228821, 0.12366763}); + + VerifyGoldens(svdf_input, svdf_golden_output_rank_2, sizeof(svdf_input), + &svdf, + /*tolerance=*/0.00625109); } } // namespace diff --git a/tensorflow/contrib/lite/kernels/test_util.cc b/tensorflow/contrib/lite/kernels/test_util.cc index 1a01ee0936..05a7c23ba1 100644 --- a/tensorflow/contrib/lite/kernels/test_util.cc +++ b/tensorflow/contrib/lite/kernels/test_util.cc @@ -32,8 +32,8 @@ std::vector<Matcher<float>> ArrayFloatNear(const std::vector<float>& values, return matchers; } -int SingleOpModel::AddInput(const TensorData& t) { - int id = AddTensor<float>(t, {}); +int SingleOpModel::AddInput(const TensorData& t, bool is_variable) { + int id = AddTensor<float>(t, {}, is_variable); inputs_.push_back(id); return id; } @@ -74,8 +74,8 @@ void SingleOpModel::SetCustomOp( CustomOptionsFormat_FLEXBUFFERS)); } -void SingleOpModel::BuildInterpreter( - std::vector<std::vector<int>> input_shapes) { +void SingleOpModel::BuildInterpreter(std::vector<std::vector<int>> input_shapes, + bool allow_fp32_relax_to_fp16) { auto opcodes = builder_.CreateVector(opcodes_); auto operators = builder_.CreateVector(operators_); auto tensors = builder_.CreateVector(tensors_); @@ -112,8 +112,17 @@ void SingleOpModel::BuildInterpreter( if (shape.empty()) continue; CHECK(interpreter_->ResizeInputTensor(input_idx, shape) == kTfLiteOk); } + + interpreter_->SetAllowFp16PrecisionForFp32(allow_fp32_relax_to_fp16); + + // Modify delegate with function. + if (apply_delegate_fn_) { + apply_delegate_fn_(interpreter_.get()); + } + CHECK(interpreter_->AllocateTensors() == kTfLiteOk) << "Cannot allocate tensors"; + interpreter_->ResetVariableTensors(); } void SingleOpModel::Invoke() { CHECK(interpreter_->Invoke() == kTfLiteOk); } diff --git a/tensorflow/contrib/lite/kernels/test_util.h b/tensorflow/contrib/lite/kernels/test_util.h index 55edc97d19..84deb0e0e8 100644 --- a/tensorflow/contrib/lite/kernels/test_util.h +++ b/tensorflow/contrib/lite/kernels/test_util.h @@ -114,13 +114,22 @@ class SingleOpModel { SingleOpModel() {} ~SingleOpModel() {} + // Set a function callback that is run right after graph is prepared + // that allows applying external delegates. This is useful for testing + // other runtimes like NN API or GPU. 
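// A hedged usage sketch ("my_delegate" is a hypothetical TfLiteDelegate*):
// the callback only takes effect when it is registered before
// BuildInterpreter() runs, since that is where it is invoked.
//
//   model.SetApplyDelegate([my_delegate](Interpreter* interpreter) {
//     interpreter->ModifyGraphWithDelegate(my_delegate);
//   });
//   model.BuildInterpreter(input_shapes);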
+ void SetApplyDelegate(std::function<void(Interpreter*)> apply_delegate_fn) { + apply_delegate_fn_ = apply_delegate_fn; + } + // Copying or assignment is disallowed to simplify ownership semantics. SingleOpModel(const SingleOpModel&) = delete; SingleOpModel& operator=(const SingleOpModel&) = delete; // Add a TensorType input tensor and return its index. - int AddInput(TensorType type) { return AddInput(TensorData{type}); } - int AddInput(const TensorData& t); + int AddInput(TensorType type, bool is_variable = false) { + return AddInput(TensorData{type}, is_variable); + } + int AddInput(const TensorData& t, bool is_variable = false); // Templated version of AddConstInput(). template <typename T> @@ -139,20 +148,18 @@ class SingleOpModel { int AddOutput(const TensorData& t); template <typename T> - void QuantizeAndPopulate(int index, std::initializer_list<float> data) { + void QuantizeAndPopulate(int index, const std::vector<float>& data) { TfLiteTensor* t = interpreter_->tensor(index); auto q = Quantize<T>(data, t->params.scale, t->params.zero_point); PopulateTensor(index, 0, q.data(), q.data() + q.size()); } - void SymmetricQuantizeAndPopulate(int index, - std::initializer_list<float> data) { + void SymmetricQuantizeAndPopulate(int index, const std::vector<float>& data) { TfLiteTensor* t = interpreter_->tensor(index); - std::vector<float> values(data); - const int length = values.size(); + const int length = data.size(); std::vector<int8_t> q(length); float min, max, scaling_factor; - tensor_utils::SymmetricQuantizeFloats(values.data(), length, q.data(), &min, + tensor_utils::SymmetricQuantizeFloats(data.data(), length, q.data(), &min, &max, &scaling_factor); // Update quantization params. t->params.scale = scaling_factor; @@ -175,7 +182,8 @@ class SingleOpModel { // Build the interpreter for this model. Also, resize and allocate all // tensors given the shapes of the inputs. - void BuildInterpreter(std::vector<std::vector<int>> input_shapes); + void BuildInterpreter(std::vector<std::vector<int>> input_shapes, + bool allow_fp32_relax_to_fp16 = false); void Invoke(); @@ -189,8 +197,22 @@ class SingleOpModel { } // Populate the tensor given its index. + // TODO(b/110696148) clean up and merge with vector-taking variant below. + template <typename T> + void PopulateTensor(int index, const std::initializer_list<T>& data) { + T* v = interpreter_->typed_tensor<T>(index); + CHECK(v) << "No tensor with index '" << index << "'."; + for (T f : data) { + *v = f; + ++v; + } + } + + // Populate the tensor given its index. + // TODO(b/110696148) clean up and merge with initializer_list-taking variant + // above. 
template <typename T> - void PopulateTensor(int index, std::initializer_list<T> data) { + void PopulateTensor(int index, const std::vector<T>& data) { T* v = interpreter_->typed_tensor<T>(index); CHECK(v) << "No tensor with index '" << index << "'."; for (T f : data) { @@ -253,7 +275,8 @@ class SingleOpModel { } template <typename T> - int AddTensor(TensorData t, std::initializer_list<T> data) { + int AddTensor(TensorData t, std::initializer_list<T> data, + bool is_variable = false) { int id = tensors_.size(); // This is slightly different depending on whether we are adding a @@ -270,6 +293,9 @@ class SingleOpModel { } else if (t.type == TensorType_INT32) { std::tie(t.scale, t.zero_point) = QuantizationParams<int32_t>(t.min, t.max); + } else if (t.type == TensorType_INT16) { + std::tie(t.scale, t.zero_point) = + QuantizationParams<int16_t>(t.min, t.max); } else { LOG(FATAL) << "No support for the requested quantized type"; } @@ -302,7 +328,7 @@ class SingleOpModel { tensors_.push_back(CreateTensor(builder_, builder_.CreateVector<int>(t.shape), t.type, /*buffer=*/buffer_id, - /*name=*/0, q_params)); + /*name=*/0, q_params, is_variable)); tensor_data_[id] = t; @@ -317,6 +343,9 @@ class SingleOpModel { std::vector<flatbuffers::Offset<Operator>> operators_; std::vector<flatbuffers::Offset<Buffer>> buffers_; std::map<string, std::function<TfLiteRegistration*()>> custom_registrations_; + // A function pointer that gets called after the interpreter is created but + // before evaluation happens. This is useful for applying a delegate. + std::function<void(Interpreter*)> apply_delegate_fn_; }; // Base class for single op unit tests. diff --git a/tensorflow/contrib/lite/kernels/test_util_test.cc b/tensorflow/contrib/lite/kernels/test_util_test.cc index 1e10e89061..2365803472 100644 --- a/tensorflow/contrib/lite/kernels/test_util_test.cc +++ b/tensorflow/contrib/lite/kernels/test_util_test.cc @@ -22,22 +22,22 @@ using ::testing::ElementsAreArray; TEST(TestUtilTest, QuantizeVector) { std::vector<float> data = {-1.0, -0.5, 0.0, 0.5, 1.0, 1000.0}; - auto q_data = Quantize<uint8>(data, /*scale=*/1.0, /*zero_point=*/0); - std::vector<uint8> expected = {0, 0, 0, 1, 1, 255}; + auto q_data = Quantize<uint8_t>(data, /*scale=*/1.0, /*zero_point=*/0); + std::vector<uint8_t> expected = {0, 0, 0, 1, 1, 255}; EXPECT_THAT(q_data, ElementsAreArray(expected)); } TEST(TestUtilTest, QuantizeVectorScalingDown) { std::vector<float> data = {-1.0, -0.5, 0.0, 0.5, 1.0, 1000.0}; - auto q_data = Quantize<uint8>(data, /*scale=*/10.0, /*zero_point=*/0); - std::vector<uint8> expected = {0, 0, 0, 0, 0, 100}; + auto q_data = Quantize<uint8_t>(data, /*scale=*/10.0, /*zero_point=*/0); + std::vector<uint8_t> expected = {0, 0, 0, 0, 0, 100}; EXPECT_THAT(q_data, ElementsAreArray(expected)); } TEST(TestUtilTest, QuantizeVectorScalingUp) { std::vector<float> data = {-1.0, -0.5, 0.0, 0.5, 1.0, 1000.0}; - auto q_data = Quantize<uint8>(data, /*scale=*/0.1, /*zero_point=*/0); - std::vector<uint8> expected = {0, 0, 0, 5, 10, 255}; + auto q_data = Quantize<uint8_t>(data, /*scale=*/0.1, /*zero_point=*/0); + std::vector<uint8_t> expected = {0, 0, 0, 5, 10, 255}; EXPECT_THAT(q_data, ElementsAreArray(expected)); } diff --git a/tensorflow/contrib/lite/kernels/tile.cc b/tensorflow/contrib/lite/kernels/tile.cc new file mode 100644 index 0000000000..49421eb870 --- /dev/null +++ b/tensorflow/contrib/lite/kernels/tile.cc @@ -0,0 +1,195 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include <string.h> +#include <vector> +#include "tensorflow/contrib/lite/c/builtin_op_data.h" +#include "tensorflow/contrib/lite/c/c_api_internal.h" +#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h" +#include "tensorflow/contrib/lite/kernels/internal/tensor.h" +#include "tensorflow/contrib/lite/kernels/kernel_util.h" +#include "tensorflow/contrib/lite/kernels/op_macros.h" +namespace tflite { +namespace ops { +namespace builtin { +namespace tile { + +constexpr int kInputTensor = 0; +constexpr int kInputMultipliers = 1; +constexpr int kOutputTensor = 0; + +namespace { +template <typename T> +TfLiteIntArray* MultiplyShapeDims(const TfLiteIntArray& shape, + const TfLiteTensor* multipliers, + int num_dimensions) { + const T* multipliers_v = GetTensorData<T>(multipliers); + + TfLiteIntArray* output_shape = TfLiteIntArrayCreate(num_dimensions); + for (int i = 0; i < num_dimensions; ++i) { + output_shape->data[i] = shape.data[i] * multipliers_v[i]; + } + return output_shape; +} + +TfLiteStatus ResizeOutput(TfLiteContext* context, TfLiteNode* node) { + const TfLiteTensor* input = GetInput(context, node, kInputTensor); + TfLiteTensor* output = GetOutput(context, node, kOutputTensor); + const TfLiteTensor* multipliers = GetInput(context, node, kInputMultipliers); + + const int num_dimensions = NumDimensions(input); + const int num_multipliers = NumElements(multipliers); + TF_LITE_ENSURE_EQ(context, num_dimensions, num_multipliers); + switch (multipliers->type) { + case kTfLiteInt32: + return context->ResizeTensor( + context, output, + MultiplyShapeDims<int32_t>(*input->dims, multipliers, + num_dimensions)); + case kTfLiteInt64: + return context->ResizeTensor( + context, output, + MultiplyShapeDims<int64_t>(*input->dims, multipliers, + num_dimensions)); + default: + context->ReportError(context, "Tile not supported multiply tensor type."); + return kTfLiteError; + } +} + +template <typename T> +void CopyMultipleTimes(const T* in_data, int32_t in_size, int32_t multiplier, + T* out_data) { + for (int i = 0; i < multiplier; ++i) { + const T* in_end = in_data + in_size; + T* new_out_data = std::copy(in_data, in_end, out_data); + in_data = out_data; + out_data = new_out_data; + } +} + +template <typename T, typename M> +std::pair<int, int> TileOneDimension(const TfLiteIntArray& in_dimensions, + const T* in_data, const M* multipliers, + T* out_data, int dimension) { + const int dimension_size = in_dimensions.data[dimension]; + if (dimension == in_dimensions.size - 1) { + CopyMultipleTimes(in_data, dimension_size, multipliers[dimension], + out_data); + return std::make_pair( + dimension_size, + dimension_size * static_cast<int>(multipliers[dimension])); + } + int total_stride_size = 0, total_tiled_stride_size = 0; + const T* copy_from_data = in_data; + T* copy_to_data = out_data; + for (int i = 0; i < dimension_size; ++i) { + int stride_size = 0, tiled_stride_size = 0; + 
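// Worked example of this recursion (illustrative only): tiling a {2, 3}
// input with multipliers {2, 1} recurses into dimension 1 for each of the
// two rows, copying each 3-element row multipliers[1] == 1 time, and then
// CopyMultipleTimes() duplicates the resulting 6-element block
// multipliers[0] - 1 == 1 more time, yielding a {4, 3} output. Every call
// returns {input elements consumed, output elements produced} so the caller
// can advance copy_from_data and copy_to_data.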
std::tie(stride_size, tiled_stride_size) = + TileOneDimension(in_dimensions, copy_from_data, multipliers, + copy_to_data, dimension + 1); + copy_from_data += stride_size; + copy_to_data += tiled_stride_size; + total_stride_size += stride_size; + total_tiled_stride_size += tiled_stride_size; + } + CopyMultipleTimes(out_data, total_tiled_stride_size, + multipliers[dimension] - 1, + out_data + total_tiled_stride_size); + return std::make_pair(total_stride_size, + total_tiled_stride_size * multipliers[dimension]); +} + +template <typename T> +void Tile(const TfLiteIntArray& in_dimensions, const TfLiteTensor* in_data, + const TfLiteTensor* multipliers, TfLiteTensor* out_data) { + // Doing recursively tiling from top to down dimension. + switch (multipliers->type) { + case kTfLiteInt32: + TileOneDimension(in_dimensions, GetTensorData<T>(in_data), + GetTensorData<int32_t>(multipliers), + GetTensorData<T>(out_data), 0); + break; + case kTfLiteInt64: + TileOneDimension(in_dimensions, GetTensorData<T>(in_data), + GetTensorData<int64_t>(multipliers), + GetTensorData<T>(out_data), 0); + break; + default: + break; + } +} +} // namespace + +TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { + TF_LITE_ENSURE_EQ(context, NumInputs(node), 2); + TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); + + const TfLiteTensor* input = GetInput(context, node, kInputTensor); + + TfLiteTensor* output = GetOutput(context, node, kOutputTensor); + TF_LITE_ENSURE_EQ(context, input->type, output->type); + + const TfLiteTensor* multipliers = GetInput(context, node, kInputMultipliers); + // Only int32 and int64 multipliers type is supported. + TF_LITE_ENSURE_MSG(context, + (multipliers->type == kTfLiteInt32) || + (multipliers->type == kTfLiteInt64), + "Tile only supports int32 and int64 mutlipliers."); + + if (IsConstantTensor(multipliers)) { + TF_LITE_ENSURE_OK(context, ResizeOutput(context, node)); + } else { + SetTensorToDynamic(output); + } + return kTfLiteOk; +} + +TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { + const TfLiteTensor* input = GetInput(context, node, kInputTensor); + TfLiteTensor* output = GetOutput(context, node, kOutputTensor); + const TfLiteTensor* multipliers = GetInput(context, node, kInputMultipliers); + + if (IsDynamicTensor(output)) { + TF_LITE_ENSURE_OK(context, ResizeOutput(context, node)); + } + + switch (output->type) { + case kTfLiteFloat32: + Tile<float>(*(input->dims), input, multipliers, output); + break; + case kTfLiteUInt8: + Tile<uint8_t>(*(input->dims), input, multipliers, output); + break; + case kTfLiteInt32: + Tile<int32_t>(*(input->dims), input, multipliers, output); + break; + case kTfLiteInt64: + Tile<int64_t>(*(input->dims), input, multipliers, output); + break; + default: + context->ReportError(context, "Type is currently not supported by Tile."); + return kTfLiteError; + } + return kTfLiteOk; +} + +} // namespace tile +TfLiteRegistration* Register_TILE() { + static TfLiteRegistration r = {nullptr, nullptr, tile::Prepare, tile::Eval}; + return &r; +} +} // namespace builtin +} // namespace ops +} // namespace tflite diff --git a/tensorflow/contrib/lite/kernels/tile_test.cc b/tensorflow/contrib/lite/kernels/tile_test.cc new file mode 100644 index 0000000000..e73ca7b750 --- /dev/null +++ b/tensorflow/contrib/lite/kernels/tile_test.cc @@ -0,0 +1,256 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include <gtest/gtest.h> +#include "tensorflow/contrib/lite/c/builtin_op_data.h" +#include "tensorflow/contrib/lite/interpreter.h" +#include "tensorflow/contrib/lite/kernels/register.h" +#include "tensorflow/contrib/lite/kernels/test_util.h" +#include "tensorflow/contrib/lite/model.h" + +namespace tflite { +namespace { + +using ::testing::ElementsAreArray; +class TileOpModel : public SingleOpModel { + public: + TileOpModel(std::initializer_list<int> input_shape, TensorType input_type, + TensorType multiply_type) { + input_ = AddInput(input_type); + multipliers_ = AddInput(TensorType_INT32); + output_ = AddOutput(input_type); + SetBuiltinOp(BuiltinOperator_TILE, BuiltinOptions_TileOptions, 0); + BuildInterpreter({input_shape, {static_cast<int>(input_shape.size())}}); + } + + void SetInputFloat(std::initializer_list<float> data) { + PopulateTensor<float>(input_, data); + } + + void SetInputUInt8(std::initializer_list<uint8_t> data) { + PopulateTensor<uint8_t>(input_, data); + } + + void SetInputInt32(std::initializer_list<int32_t> data) { + PopulateTensor<int32_t>(input_, data); + } + + void SetInputInt64(std::initializer_list<int64_t> data) { + PopulateTensor<int64_t>(input_, data); + } + + void SetMultipliers(std::initializer_list<int32_t> data) { + PopulateTensor<int32_t>(multipliers_, data); + } + + std::vector<float> GetOutputFloat() { return ExtractVector<float>(output_); } + + std::vector<uint8_t> GetOutputUInt8() { return ExtractVector<uint8_t>(output_); } + + std::vector<int32_t> GetOutputInt32() { return ExtractVector<int32_t>(output_); } + + std::vector<int64_t> GetOutputInt64() { + return ExtractVector<int64_t>(output_); + } + + std::vector<int> GetOutputShape() { return GetTensorShape(output_); } + + protected: + int input_; + int multipliers_; + int output_; +}; + +TEST(TileTest, Float32Vector) { + TileOpModel m({3}, TensorType_FLOAT32, TensorType_INT32); + m.SetInputFloat({1.f, 2.f, 3.f}); + m.SetMultipliers({2}); + m.Invoke(); + EXPECT_THAT(m.GetOutputFloat(), + ElementsAreArray({1.f, 2.f, 3.f, 1.f, 2.f, 3.f})); +} + +TEST(TileTest, Float32Matrix) { + TileOpModel m({2, 3}, TensorType_FLOAT32, TensorType_INT32); + m.SetInputFloat({ + 11.f, + 12.f, + 13.f, + 21.f, + 22.f, + 23.f, + }); + m.SetMultipliers({2, 1}); + m.Invoke(); + EXPECT_THAT(m.GetOutputFloat(), ElementsAreArray({ + 11.f, + 12.f, + 13.f, + 21.f, + 22.f, + 23.f, + 11.f, + 12.f, + 13.f, + 21.f, + 22.f, + 23.f, + })); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({4, 3})); +} + +TEST(TileTest, Float32HighDimension) { + TileOpModel m({1, 2, 3}, TensorType_FLOAT32, TensorType_INT32); + m.SetInputFloat({ + 11.f, + 12.f, + 13.f, + 21.f, + 22.f, + 23.f, + }); + m.SetMultipliers({2, 3, 1}); + m.Invoke(); + EXPECT_THAT( + m.GetOutputFloat(), + ElementsAreArray({11.f, 12.f, 13.f, 21.f, 22.f, 23.f, 11.f, 12.f, 13.f, + 21.f, 22.f, 23.f, 11.f, 12.f, 13.f, 21.f, 22.f, 23.f, + 11.f, 12.f, 13.f, 21.f, 22.f, 23.f, 11.f, 12.f, 13.f, + 21.f, 22.f, 23.f, 11.f, 12.f, 13.f, 21.f, 22.f, 23.f})); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 6, 
3})); +} + +TEST(TileTest, Uint8Matrix) { + TileOpModel m({2, 3}, TensorType_UINT8, TensorType_INT32); + m.SetInputUInt8({ + 11, + 12, + 13, + 21, + 22, + 23, + }); + m.SetMultipliers({2, 1}); + m.Invoke(); + EXPECT_THAT(m.GetOutputUInt8(), ElementsAreArray({ + 11, + 12, + 13, + 21, + 22, + 23, + 11, + 12, + 13, + 21, + 22, + 23, + })); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({4, 3})); +} + +TEST(TileTest, Int32Matrix) { + TileOpModel m({2, 3}, TensorType_INT32, TensorType_INT32); + m.SetInputInt32({ + 11, + 12, + 13, + 21, + 22, + 23, + }); + m.SetMultipliers({2, 1}); + m.Invoke(); + EXPECT_THAT(m.GetOutputInt32(), ElementsAreArray({ + 11, + 12, + 13, + 21, + 22, + 23, + 11, + 12, + 13, + 21, + 22, + 23, + })); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({4, 3})); +} + +TEST(TileTest, Int64Matrix) { + TileOpModel m({2, 3}, TensorType_INT64, TensorType_INT32); + m.SetInputInt64({ + 11, + 12, + 13, + 21, + 22, + 23, + }); + m.SetMultipliers({2, 1}); + m.Invoke(); + EXPECT_THAT(m.GetOutputInt64(), ElementsAreArray({ + 11, + 12, + 13, + 21, + 22, + 23, + 11, + 12, + 13, + 21, + 22, + 23, + })); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({4, 3})); +} + +TEST(TileTest, Int64Matrix64Multipliers) { + TileOpModel m({2, 3}, TensorType_INT64, TensorType_INT64); + m.SetInputInt64({ + 11, + 12, + 13, + 21, + 22, + 23, + }); + m.SetMultipliers({2, 1}); + m.Invoke(); + EXPECT_THAT(m.GetOutputInt64(), ElementsAreArray({ + 11, + 12, + 13, + 21, + 22, + 23, + 11, + 12, + 13, + 21, + 22, + 23, + })); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({4, 3})); +} +} // namespace +} // namespace tflite + +int main(int argc, char** argv) { + ::tflite::LogToStderr(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/tensorflow/contrib/lite/kernels/topk_v2.cc b/tensorflow/contrib/lite/kernels/topk_v2.cc index 0feb42b85b..6c38b6739e 100644 --- a/tensorflow/contrib/lite/kernels/topk_v2.cc +++ b/tensorflow/contrib/lite/kernels/topk_v2.cc @@ -14,8 +14,8 @@ limitations under the License. ==============================================================================*/ #include <algorithm> -#include "tensorflow/contrib/lite/builtin_op_data.h" -#include "tensorflow/contrib/lite/context.h" +#include "tensorflow/contrib/lite/c/builtin_op_data.h" +#include "tensorflow/contrib/lite/c/c_api_internal.h" #include "tensorflow/contrib/lite/kernels/internal/tensor.h" #include "tensorflow/contrib/lite/kernels/kernel_util.h" #include "tensorflow/contrib/lite/kernels/op_macros.h" @@ -56,11 +56,13 @@ TfLiteStatus ResizeOutput(TfLiteContext* context, TfLiteNode* node) { output_values_shape->data[num_dimensions - 1] = k; TfLiteTensor* output_indexes = GetOutput(context, node, kOutputIndexes); TfLiteTensor* output_values = GetOutput(context, node, kOutputValues); + // Force output types. 
+ output_indexes->type = kTfLiteInt32; + output_values->type = input->type; auto resize_tensor = [context](TfLiteTensor* tensor, TfLiteIntArray* new_size, TfLiteIntArray* delete_on_error) { TfLiteStatus status = context->ResizeTensor(context, tensor, new_size); if (status != kTfLiteOk) { - TfLiteIntArrayFree(new_size); if (delete_on_error != nullptr) { TfLiteIntArrayFree(delete_on_error); } @@ -214,7 +216,9 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { output_values->data.i64); break; default: - context->ReportError(context, "Type is currently not supported by TopK."); + context->ReportError(context, + "Type %d is currently not supported by TopK.", + output_values->type); return kTfLiteError; } diff --git a/tensorflow/contrib/lite/kernels/topk_v2_test.cc b/tensorflow/contrib/lite/kernels/topk_v2_test.cc index 212f8acc76..16106fdafe 100644 --- a/tensorflow/contrib/lite/kernels/topk_v2_test.cc +++ b/tensorflow/contrib/lite/kernels/topk_v2_test.cc @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ #include <gtest/gtest.h> -#include "tensorflow/contrib/lite/builtin_op_data.h" +#include "tensorflow/contrib/lite/c/builtin_op_data.h" #include "tensorflow/contrib/lite/interpreter.h" #include "tensorflow/contrib/lite/kernels/register.h" #include "tensorflow/contrib/lite/kernels/test_util.h" @@ -42,32 +42,32 @@ class TopKV2OpModel : public SingleOpModel { PopulateTensor<float>(input_, data); } - void SetInputUInt8(std::initializer_list<uint8> data) { - PopulateTensor<uint8>(input_, data); + void SetInputUInt8(std::initializer_list<uint8_t> data) { + PopulateTensor<uint8_t>(input_, data); } - void SetInputInt32(std::initializer_list<int32> data) { - PopulateTensor<int32>(input_, data); + void SetInputInt32(std::initializer_list<int32_t> data) { + PopulateTensor<int32_t>(input_, data); } void SetInputInt64(std::initializer_list<int64_t> data) { PopulateTensor<int64_t>(input_, data); } - std::vector<int32> GetIndexes() { - return ExtractVector<int32>(output_indexes_); + std::vector<int32_t> GetIndexes() { + return ExtractVector<int32_t>(output_indexes_); } std::vector<float> GetValuesFloat() { return ExtractVector<float>(output_values_); } - std::vector<uint8> GetValuesUInt8() { - return ExtractVector<uint8>(output_values_); + std::vector<uint8_t> GetValuesUInt8() { + return ExtractVector<uint8_t>(output_values_); } - std::vector<int32> GetValuesInt32() { - return ExtractVector<int32>(output_values_); + std::vector<int32_t> GetValuesInt32() { + return ExtractVector<int32_t>(output_values_); } std::vector<int64_t> GetValuesInt64() { @@ -119,7 +119,7 @@ TEST(TopKV2OpTest, VectorFloat) { EXPECT_THAT(m.GetValuesFloat(), ElementsAreArray(ArrayFloatNear({0.8, 0.2}))); } -// Check that uint8 works. +// Check that uint8_t works. TEST(TopKV2OpTest, TypeUint8) { TopKV2OpModel m({2, 3}, TensorType_UINT8, 2); m.SetInputUInt8({1, 2, 3, 251, 250, 249}); @@ -128,7 +128,7 @@ TEST(TopKV2OpTest, TypeUint8) { EXPECT_THAT(m.GetValuesUInt8(), ElementsAreArray({3, 2, 251, 250})); } -// Check that int32 works. +// Check that int32_t works. 
TEST(TopKV2OpTest, TypeInt32) { TopKV2OpModel m({2, 3}, TensorType_INT32, 2); m.SetInputInt32({1, 2, 3, 10251, 10250, 10249}); diff --git a/tensorflow/contrib/lite/kernels/transpose.cc b/tensorflow/contrib/lite/kernels/transpose.cc index 8316a23c18..e42a30420b 100644 --- a/tensorflow/contrib/lite/kernels/transpose.cc +++ b/tensorflow/contrib/lite/kernels/transpose.cc @@ -14,8 +14,8 @@ limitations under the License. ==============================================================================*/ #include <string.h> #include <vector> -#include "tensorflow/contrib/lite/builtin_op_data.h" -#include "tensorflow/contrib/lite/context.h" +#include "tensorflow/contrib/lite/c/builtin_op_data.h" +#include "tensorflow/contrib/lite/c/c_api_internal.h" #include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h" #include "tensorflow/contrib/lite/kernels/internal/tensor.h" #include "tensorflow/contrib/lite/kernels/kernel_util.h" @@ -92,26 +92,19 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE_OK(context, ResizeOutputTensor(context, &op_context)); } - // Reverse the permuted axes and convert to 4D due to the way Dims are - // constructed in GetTensorDims. const int* perm_data = GetTensorData<int32_t>(op_context.perm); const int size = op_context.perm->dims->data[0]; - const int kOutputDimensionNum = 4; - int reversed_perm[kOutputDimensionNum]; - - for (int output_k = 0, input_k = size - 1; output_k < size; - ++output_k, --input_k) { - reversed_perm[output_k] = size - perm_data[input_k] - 1; - } - for (int k = size; k < kOutputDimensionNum; ++k) { - reversed_perm[k] = k; + TransposeParams params; + params.perm_count = size; + for (int i = 0; i < size; ++i) { + params.perm[i] = perm_data[i]; } #define TF_LITE_TRANSPOSE(type, scalar) \ - type::Transpose(GetTensorData<scalar>(op_context.input), \ - GetTensorDims(op_context.input), \ - GetTensorData<scalar>(op_context.output), \ - GetTensorDims(op_context.output), reversed_perm) + type::Transpose(params, GetTensorShape(op_context.input), \ + GetTensorData<scalar>(op_context.input), \ + GetTensorShape(op_context.output), \ + GetTensorData<scalar>(op_context.output)) switch (op_context.input->type) { case kTfLiteFloat32: @@ -136,7 +129,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { break; default: context->ReportError(context, - "Type is currently not supported by Transpose."); + "Type %d is currently not supported by Transpose.", + op_context.input->type); return kTfLiteError; } #undef TF_LITE_TRANSPOSE diff --git a/tensorflow/contrib/lite/kernels/transpose_conv.cc b/tensorflow/contrib/lite/kernels/transpose_conv.cc index 3c99661029..1c4a5ee91d 100644 --- a/tensorflow/contrib/lite/kernels/transpose_conv.cc +++ b/tensorflow/contrib/lite/kernels/transpose_conv.cc @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include <unistd.h> #include <cassert> #include <cmath> #include <cstdio> @@ -20,8 +19,8 @@ limitations under the License. 
#include <iostream> #include <limits> -#include "tensorflow/contrib/lite/builtin_op_data.h" -#include "tensorflow/contrib/lite/context.h" +#include "tensorflow/contrib/lite/c/builtin_op_data.h" +#include "tensorflow/contrib/lite/c/c_api_internal.h" #include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h" #include "tensorflow/contrib/lite/kernels/internal/tensor.h" #include "tensorflow/contrib/lite/kernels/kernel_util.h" @@ -70,7 +69,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE_EQ(context, NumDimensions(input), 4); TF_LITE_ENSURE_EQ(context, NumDimensions(weights), 4); - // Currenlty only supports float32. + // Currently only supports float32. const TfLiteType data_type = input->type; TF_LITE_ENSURE(context, data_type == kTfLiteFloat32); TF_LITE_ENSURE_EQ(context, output->type, data_type); @@ -79,7 +78,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { // Ensure that weights and inputs have the same channel dimension. // Note: TOCO will reorder weights in the following format: OHWI. TF_LITE_ENSURE_EQ(context, SizeOfDimension(input, 3), - SizeOfDimension(weights, 0)); + SizeOfDimension(weights, 3)); if (!IsConstantTensor(output_shape)) { SetTensorToDynamic(output); @@ -118,13 +117,26 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { // Currently only support float32. switch (input->type) { - case kTfLiteFloat32: - optimized_ops::TransposeConv( - GetTensorData<float>(input), GetTensorDims(input), - GetTensorData<float>(weights), GetTensorDims(weights), stride_width, - stride_height, padding_size.width, padding_size.height, - GetTensorData<float>(output), GetTensorDims(output)); + case kTfLiteFloat32: { + tflite::ConvParams op_params; + op_params.padding_type = PaddingType::kSame; + op_params.padding_values.width = padding_size.width; + op_params.padding_values.height = padding_size.height; + op_params.stride_width = stride_width; + op_params.stride_height = stride_height; + + reference_ops::TransposeConv( + op_params, GetTensorShape(input), GetTensorData<float>(input), + GetTensorShape(weights), GetTensorData<float>(weights), + GetTensorShape(output), GetTensorData<float>(output), + // Last two args specify im2col which reference_ops ignores. + // (Note this does not lead to a performance regression, as the + // previous optimized version was just a copy of the reference code.) + // TODO(b/110208176): Allocate im2col tensors and switch to + // optimized_ops. 
+ GetTensorShape(output), GetTensorData<float>(output)); break; + } default: context->ReportError(context, "Type %d, not currently supported.", input->type); diff --git a/tensorflow/contrib/lite/kernels/transpose_conv_test.cc b/tensorflow/contrib/lite/kernels/transpose_conv_test.cc index 52be089349..55df897180 100644 --- a/tensorflow/contrib/lite/kernels/transpose_conv_test.cc +++ b/tensorflow/contrib/lite/kernels/transpose_conv_test.cc @@ -88,10 +88,10 @@ TEST(TransposeConvOpModelTest, SimpleTest) { // And filter value is derived by: // filter = tf.reshape(tf.transpose(filter, perm=[3, 0, 1, 2]), shape=[18, 1]) TEST(TransposeConvOpModelTest, TwoFiltersTest) { - TransposeConvOpModel m({1, 4, 4, 2}, {2, 3, 3, 1}, Padding_SAME, 1, 1); + TransposeConvOpModel m({1, 4, 4, 2}, {1, 3, 3, 2}, Padding_SAME, 1, 1); m.PopulateTensor<int>(m.output_shape(), {1, 4, 4, 1}); - m.PopulateTensor<float>(m.filter(), {1, 3, 5, 7, 9, 11, 13, 15, 17, 2, 4, 6, - 8, 10, 12, 14, 16, 18}); + m.PopulateTensor<float>(m.filter(), {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, + 13, 14, 15, 16, 17, 18}); m.PopulateTensor<float>( m.input(), {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, @@ -117,10 +117,10 @@ TEST(TransposeConvOpModelTest, TwoFiltersTest) { // And filter value is derived by: // filter = tf.reshape(tf.transpose(filter, perm=[3, 0, 1, 2]), shape=[1, 18]) TEST(TransposeConvOpModelTest, PaddingValidTest) { - TransposeConvOpModel m({1, 4, 4, 2}, {2, 3, 3, 1}, Padding_VALID, 1, 1); + TransposeConvOpModel m({1, 4, 4, 2}, {1, 3, 3, 2}, Padding_VALID, 1, 1); m.PopulateTensor<int>(m.output_shape(), {1, 6, 6, 1}); - m.PopulateTensor<float>(m.filter(), {1, 3, 5, 7, 9, 11, 13, 15, 17, 2, 4, 6, - 8, 10, 12, 14, 16, 18}); + m.PopulateTensor<float>(m.filter(), {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, + 13, 14, 15, 16, 17, 18}); m.PopulateTensor<float>( m.input(), {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, @@ -171,10 +171,10 @@ TEST(TransposeConvOpModelTest, StrideValidTest) { // [1, 2, 2, 1 ], // "VALID") TEST(TransposeConvOpModelTest, MultiChannelTest) { - TransposeConvOpModel m({1, 2, 2, 1}, {1, 3, 3, 2}, Padding_VALID, 2, 2); + TransposeConvOpModel m({1, 2, 2, 1}, {2, 3, 3, 1}, Padding_VALID, 2, 2); m.PopulateTensor<int>(m.output_shape(), {1, 5, 5, 2}); - m.PopulateTensor<float>(m.filter(), {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, - 13, 14, 15, 16, 17, 18}); + m.PopulateTensor<float>(m.filter(), {1, 3, 5, 7, 9, 11, 13, 15, 17, 2, 4, 6, + 8, 10, 12, 14, 16, 18}); m.PopulateTensor<float>(m.input(), {1, 2, 3, 4}); m.Invoke(); diff --git a/tensorflow/contrib/lite/kernels/transpose_test.cc b/tensorflow/contrib/lite/kernels/transpose_test.cc index 337bc144b9..79ef0a7c56 100644 --- a/tensorflow/contrib/lite/kernels/transpose_test.cc +++ b/tensorflow/contrib/lite/kernels/transpose_test.cc @@ -51,21 +51,21 @@ void RunTestPermutation(const std::vector<int>& shape, reversed_perms[k] = k; } - // Make input and output dims (i.e. reversed shape and dest_shape). - Dims<4> input_dims = GetTensorDims(shape); - Dims<4> output_dims; - for (int i = 0; i < 4; i++) { - output_dims.sizes[i] = input_dims.sizes[reversed_perms[i]]; + // Make input and output shapes. 
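// Worked example of the relation built below (illustrative only): for an
// input of shape {2, 3, 4} and perms {2, 0, 1}, the output shape is
// {input_shape[2], input_shape[0], input_shape[1]} = {4, 2, 3}; in general
// output_shape[i] = input_shape[perms[i]].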
+ const RuntimeShape input_shape = GetTensorShape(shape); + RuntimeShape output_shape(perms.size()); + for (int i = 0; i < perms.size(); i++) { + output_shape.SetDim(i, input_shape.Dims(perms[i])); } - output_dims.strides[0] = 1; - for (int k = 1; k < 4; k++) { - output_dims.strides[k] = - output_dims.strides[k - 1] * output_dims.sizes[k - 1]; + + TransposeParams params; + params.perm_count = perms.size(); + for (int i = 0; i < perms.size(); ++i) { + params.perm[i] = perms[i]; } - reference_ops::Transpose<float>(input.data(), input_dims, - input_transposed->data(), output_dims, - reversed_perms); + reference_ops::Transpose<float>(params, input_shape, input.data(), + output_shape, input_transposed->data()); } TEST(TransposeTest, TestRefOps1D) { diff --git a/tensorflow/contrib/lite/kernels/unidirectional_sequence_lstm.cc b/tensorflow/contrib/lite/kernels/unidirectional_sequence_lstm.cc index 1c28123a24..89d57e4599 100644 --- a/tensorflow/contrib/lite/kernels/unidirectional_sequence_lstm.cc +++ b/tensorflow/contrib/lite/kernels/unidirectional_sequence_lstm.cc @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include <unistd.h> #include <cassert> #include <cmath> #include <cstdio> @@ -21,12 +20,13 @@ limitations under the License. #include <iostream> #include <limits> -#include "tensorflow/contrib/lite/builtin_op_data.h" -#include "tensorflow/contrib/lite/context.h" +#include "tensorflow/contrib/lite/c/builtin_op_data.h" +#include "tensorflow/contrib/lite/c/c_api_internal.h" #include "tensorflow/contrib/lite/kernels/activation_functor.h" #include "tensorflow/contrib/lite/kernels/internal/kernel_utils.h" #include "tensorflow/contrib/lite/kernels/internal/tensor_utils.h" #include "tensorflow/contrib/lite/kernels/kernel_util.h" +#include "tensorflow/contrib/lite/kernels/lstm_eval.h" #include "tensorflow/contrib/lite/kernels/op_macros.h" namespace tflite { @@ -65,14 +65,30 @@ constexpr int kProjectionWeightsTensor = 16; // Optional // Projection bias tensor of size {n_output} constexpr int kProjectionBiasTensor = 17; // Optional +// Stateful input tensors that are variables and will be modified by the Op. +// Activation state tensor of size {n_batch, n_output} +constexpr int kInputActivationStateTensor = 18; +// Cell state tensor of size {n_batch, n_cell} +constexpr int kInputCellStateTensor = 19; + // Output tensors. 
-constexpr int kOutputStateTensor = 0; -constexpr int kCellStateTensor = 1; -constexpr int kOutputTensor = 2; +constexpr int kOutputTensor = 0; + +// Temporary tensors +enum TemporaryTensor { + kScratchBuffer = 0, + kInputQuantized = 1, + kOutputStateQuantized = 2, + kCellStateQuantized = 3, + kScalingFactors = 4, + kProductScalingFactors = 5, + kRecoveredCellWeights = 6, + kNumTemporaryTensors = 7 +}; void* Init(TfLiteContext* context, const char* buffer, size_t length) { - auto* scratch_tensor_index = new int; - context->AddTensors(context, 1, scratch_tensor_index); + auto* scratch_tensor_index = new int(); + context->AddTensors(context, kNumTemporaryTensors, scratch_tensor_index); return scratch_tensor_index; } @@ -84,7 +100,7 @@ void Free(TfLiteContext* context, void* buffer) { TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context, TfLiteNode* node, int n_input, int n_output, int n_cell) { - auto* params = reinterpret_cast<TfLiteLSTMParams*>(node->builtin_data); + const auto* params = reinterpret_cast<TfLiteLSTMParams*>(node->builtin_data); // Making sure clipping parameters have valid values. // == 0 means no clipping @@ -236,12 +252,13 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { int* scratch_tensor_index = reinterpret_cast<int*>(node->user_data); // Check we have all the inputs and outputs we need. - TF_LITE_ENSURE_EQ(context, node->inputs->size, 18); - TF_LITE_ENSURE_EQ(context, node->outputs->size, 3); + TF_LITE_ENSURE_EQ(context, node->inputs->size, 20); + TF_LITE_ENSURE_EQ(context, node->outputs->size, 1); // Inferring batch size, number of outputs and sequence length and // number of cells from the input tensors. const TfLiteTensor* input = GetInput(context, node, kInputTensor); + TF_LITE_ENSURE_EQ(context, input->type, kTfLiteFloat32); TF_LITE_ENSURE(context, input->dims->size > 1); const int max_time = input->dims->data[0]; const int n_batch = input->dims->data[1]; @@ -261,14 +278,24 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { const int n_output = recurrent_to_output_weights->dims->data[1]; // Check that input tensor dimensions matches with each other. - CheckInputTensorDimensions(context, node, n_input, n_output, n_cell); + TF_LITE_ENSURE_OK(context, CheckInputTensorDimensions(context, node, n_input, + n_output, n_cell)); - // Get the pointer to output, output_state and cell_state buffer tensors. + // Get the pointer to output, activation_state and cell_state buffer tensors. TfLiteTensor* output = GetOutput(context, node, kOutputTensor); - TfLiteTensor* output_state = GetOutput(context, node, kOutputStateTensor); - TfLiteTensor* cell_state = GetOutput(context, node, kCellStateTensor); - // Resize the output, output_state and cell_state tensors. + TfLiteTensor* activation_state = + GetVariableInput(context, node, kInputActivationStateTensor); + TfLiteTensor* cell_state = + GetVariableInput(context, node, kInputCellStateTensor); + + // Check the shape of input state tensors. + // These tensor may be 1D or 2D. It's fine as long as the total size is + // correct. + TF_LITE_ENSURE_EQ(context, NumElements(activation_state), n_batch * n_output); + TF_LITE_ENSURE_EQ(context, NumElements(cell_state), n_batch * n_cell); + + // Resize the output tensors. 
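// Illustrative example for the state-shape check a few lines up: with
// n_batch == 2 and n_output == 16, the activation state may be declared as
// {2, 16} or as a flat {32} tensor; NumElements() == 32 either way, which is
// all the kernel relies on.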
TfLiteIntArray* output_size = TfLiteIntArrayCreate(3); output_size->data[0] = max_time; output_size->data[1] = n_batch; @@ -276,54 +303,137 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, output, output_size)); - TfLiteIntArray* output_state_size = TfLiteIntArrayCreate(2); - output_state_size->data[0] = n_batch; - output_state_size->data[1] = n_output; - TF_LITE_ENSURE_OK( - context, context->ResizeTensor(context, output_state, output_state_size)); - - TfLiteIntArray* cell_size = TfLiteIntArrayCreate(2); - cell_size->data[0] = n_batch; - cell_size->data[1] = n_cell; - TF_LITE_ENSURE_OK(context, - context->ResizeTensor(context, cell_state, cell_size)); + // The weights are of consistent type, so it suffices to check one. + // TODO(mirkov): create a utility/macro for this check, so all Ops can use it. + const bool is_hybrid_op = (input_to_output_weights->type == kTfLiteUInt8 && + input->type == kTfLiteFloat32); - // Create a scratch buffer tensor. TfLiteIntArrayFree(node->temporaries); - node->temporaries = TfLiteIntArrayCreate(1); + if (is_hybrid_op) { + node->temporaries = TfLiteIntArrayCreate(kNumTemporaryTensors); + } else { + node->temporaries = TfLiteIntArrayCreate(1); + } node->temporaries->data[0] = *scratch_tensor_index; - TfLiteTensor* scratch_buffer = GetTemporary(context, node, /*index=*/0); + + // Create a scratch buffer tensor. + TfLiteTensor* scratch_buffer = GetTemporary(context, node, kScratchBuffer); scratch_buffer->type = input->type; scratch_buffer->allocation_type = kTfLiteArenaRw; - // Mark state tensors as persistent tensors. - output_state->allocation_type = kTfLiteArenaRwPersistent; - cell_state->allocation_type = kTfLiteArenaRwPersistent; - const TfLiteTensor* input_to_input_weights = GetOptionalInputTensor(context, node, kInputToInputWeightsTensor); const bool use_cifg = (input_to_input_weights == nullptr); + TfLiteIntArray* scratch_buffer_size = TfLiteIntArrayCreate(2); + scratch_buffer_size->data[0] = n_batch; if (use_cifg) { - TfLiteIntArray* scratch_buffer_size = TfLiteIntArrayCreate(2); - scratch_buffer_size->data[0] = n_batch; // Reserving space for Cell, Forget, Output gates scratch_buffer_size->data[1] = n_cell * 3; - TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, scratch_buffer, - scratch_buffer_size)); } else { - TfLiteIntArray* scratch_buffer_size = TfLiteIntArrayCreate(2); - scratch_buffer_size->data[0] = n_batch; // Reserving space for Input, Cell, Forget, Output gates scratch_buffer_size->data[1] = n_cell * 4; - TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, scratch_buffer, - scratch_buffer_size)); + } + TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, scratch_buffer, + scratch_buffer_size)); + + if (is_hybrid_op) { + // Allocate temporary tensors to store quantized values of input, + // activation_state and cell_state tensors. 
+ node->temporaries->data[kInputQuantized] = + *scratch_tensor_index + kInputQuantized; + TfLiteTensor* input_quantized = + GetTemporary(context, node, kInputQuantized); + input_quantized->type = kTfLiteUInt8; + input_quantized->allocation_type = kTfLiteArenaRw; + if (!TfLiteIntArrayEqual(input_quantized->dims, input->dims)) { + TfLiteIntArray* input_quantized_size = TfLiteIntArrayCopy(input->dims); + TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, input_quantized, + input_quantized_size)); + } + node->temporaries->data[kOutputStateQuantized] = + *scratch_tensor_index + kOutputStateQuantized; + TfLiteTensor* activation_state_quantized = + GetTemporary(context, node, kOutputStateQuantized); + activation_state_quantized->type = kTfLiteUInt8; + activation_state_quantized->allocation_type = kTfLiteArenaRw; + if (!TfLiteIntArrayEqual(activation_state_quantized->dims, + activation_state->dims)) { + TfLiteIntArray* activation_state_quantized_size = + TfLiteIntArrayCopy(activation_state->dims); + TF_LITE_ENSURE_OK( + context, context->ResizeTensor(context, activation_state_quantized, + activation_state_quantized_size)); + } + node->temporaries->data[kCellStateQuantized] = + *scratch_tensor_index + kCellStateQuantized; + TfLiteTensor* cell_state_quantized = + GetTemporary(context, node, kCellStateQuantized); + cell_state_quantized->type = kTfLiteUInt8; + cell_state_quantized->allocation_type = kTfLiteArenaRw; + if (!TfLiteIntArrayEqual(cell_state_quantized->dims, cell_state->dims)) { + TfLiteIntArray* cell_state_quantized_size = + TfLiteIntArrayCopy(cell_state->dims); + TF_LITE_ENSURE_OK(context, + context->ResizeTensor(context, cell_state_quantized, + cell_state_quantized_size)); + } + + // Allocate temporary tensors to store scaling factors and product scaling + // factors. The latter is a convenience storage which allows to quantize + // a vector once (which produces the scaling factors) and multiply it with + // different matrices (which requires multiplying the scaling factors with + // the scaling factor of the matrix). + node->temporaries->data[kScalingFactors] = + *scratch_tensor_index + kScalingFactors; + TfLiteTensor* scaling_factors = + GetTemporary(context, node, kScalingFactors); + scaling_factors->type = kTfLiteFloat32; + scaling_factors->allocation_type = kTfLiteArenaRw; + TfLiteIntArray* scaling_factors_size = TfLiteIntArrayCreate(1); + scaling_factors_size->data[0] = n_batch; + if (!TfLiteIntArrayEqual(scaling_factors->dims, scaling_factors_size)) { + TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, scaling_factors, + scaling_factors_size)); + } + node->temporaries->data[kProductScalingFactors] = + *scratch_tensor_index + kProductScalingFactors; + TfLiteTensor* prod_scaling_factors = + GetTemporary(context, node, kProductScalingFactors); + prod_scaling_factors->type = kTfLiteFloat32; + prod_scaling_factors->allocation_type = kTfLiteArenaRw; + TfLiteIntArray* prod_scaling_factors_size = TfLiteIntArrayCreate(1); + prod_scaling_factors_size->data[0] = n_batch; + if (!TfLiteIntArrayEqual(prod_scaling_factors->dims, + prod_scaling_factors_size)) { + TF_LITE_ENSURE_OK(context, + context->ResizeTensor(context, prod_scaling_factors, + prod_scaling_factors_size)); + } + + // Allocate a temporary tensor to store the recovered cell weights. Since + // this is used for diagonal matrices, only need to store n_cell values. 
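// Illustrative arithmetic for the scaling-factor temporaries allocated above
// (symbol names are hypothetical): if one batch row of the float input is
// quantized with scaling factor s_input (float ≈ s_input * int8) and a weight
// matrix was quantized with scale s_weights, the int32 accumulator of their
// int8 dot product is converted back to float by multiplying with
// s_input * s_weights, which is the per-batch product cached in
// prod_scaling_factors.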
+ node->temporaries->data[kRecoveredCellWeights] = + *scratch_tensor_index + kRecoveredCellWeights; + TfLiteTensor* recovered_cell_weights = + GetTemporary(context, node, kRecoveredCellWeights); + recovered_cell_weights->type = kTfLiteFloat32; + recovered_cell_weights->allocation_type = kTfLiteArenaRw; + TfLiteIntArray* recovered_cell_weights_size = TfLiteIntArrayCreate(1); + recovered_cell_weights_size->data[0] = n_cell; + if (!TfLiteIntArrayEqual(recovered_cell_weights->dims, + recovered_cell_weights_size)) { + TF_LITE_ENSURE_OK(context, + context->ResizeTensor(context, recovered_cell_weights, + recovered_cell_weights_size)); + } } return kTfLiteOk; } -// The LSTM Op engine. TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { - auto* params = reinterpret_cast<TfLiteLSTMParams*>(node->builtin_data); + const auto* params = + reinterpret_cast<TfLiteUnidirectionalSequenceLSTMParams*>( + node->builtin_data); const TfLiteTensor* input = GetInput(context, node, kInputTensor); const TfLiteTensor* input_to_input_weights = @@ -364,94 +474,76 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { const TfLiteTensor* projection_bias = GetOptionalInputTensor(context, node, kProjectionBiasTensor); - TfLiteTensor* output_state = GetOutput(context, node, kOutputStateTensor); - TfLiteTensor* cell_state = GetOutput(context, node, kCellStateTensor); - TfLiteTensor* output = GetOutput(context, node, kOutputTensor); - - const int max_time = input->dims->data[0]; - const int n_batch = input->dims->data[1]; - const int n_input = input->dims->data[2]; - // n_cell and n_output will be the same size when there is no projection. - const int n_cell = input_to_output_weights->dims->data[0]; - const int n_output = recurrent_to_output_weights->dims->data[1]; - - // Since we have already checked that weights are all there or none, we can - // check the existence of only one to get the condition. - const bool use_cifg = (input_to_input_weights == nullptr); - const bool use_peephole = (cell_to_output_weights != nullptr); - // Index the scratch buffers pointers to the global scratch buffer. TfLiteTensor* scratch_buffer = GetTemporary(context, node, /*index=*/0); - float* input_gate_scratch = nullptr; - float* cell_scratch = nullptr; - float* forget_gate_scratch = nullptr; - float* output_gate_scratch = nullptr; - if (use_cifg) { - cell_scratch = scratch_buffer->data.f; - forget_gate_scratch = scratch_buffer->data.f + n_cell * n_batch; - output_gate_scratch = scratch_buffer->data.f + 2 * n_cell * n_batch; - } else { - input_gate_scratch = scratch_buffer->data.f; - cell_scratch = scratch_buffer->data.f + n_cell * n_batch; - forget_gate_scratch = scratch_buffer->data.f + 2 * n_cell * n_batch; - output_gate_scratch = scratch_buffer->data.f + 3 * n_cell * n_batch; - } - // Check optional tensors, the respective pointers can be null. - const float* input_to_input_weights_ptr = - (use_cifg) ? nullptr : input_to_input_weights->data.f; - const float* recurrent_to_input_weights_ptr = - (use_cifg) ? nullptr : recurrent_to_input_weights->data.f; - const float* input_gate_bias_ptr = - (use_cifg) ? nullptr : input_gate_bias->data.f; - const float* cell_to_input_weights_ptr = - (use_peephole && !use_cifg) ? cell_to_input_weights->data.f : nullptr; - const float* cell_to_forget_weights_ptr = - (use_peephole) ? cell_to_forget_weights->data.f : nullptr; - const float* cell_to_output_weights_ptr = - (use_peephole) ? 
cell_to_output_weights->data.f : nullptr; - const float* projection_weights_ptr = - (projection_weights == nullptr) ? nullptr : projection_weights->data.f; - const float* projection_bias_ptr = - (projection_bias == nullptr) ? nullptr : projection_bias->data.f; - - // Required tensors, pointers are non-null. - const float* input_to_forget_weights_ptr = input_to_forget_weights->data.f; - const float* input_to_cell_weights_ptr = input_to_cell_weights->data.f; - const float* input_to_output_weights_ptr = input_to_output_weights->data.f; - const float* recurrent_to_forget_weights_ptr = - recurrent_to_forget_weights->data.f; - const float* recurrent_to_cell_weights_ptr = - recurrent_to_cell_weights->data.f; - const float* recurrent_to_output_weights_ptr = - recurrent_to_output_weights->data.f; - const float* forget_gate_bias_ptr = forget_gate_bias->data.f; - const float* cell_bias_ptr = cell_bias->data.f; - const float* output_gate_bias_ptr = output_gate_bias->data.f; - - float* output_state_ptr = output_state->data.f; - float* cell_state_ptr = cell_state->data.f; - - for (int t = 0; t < max_time; t++) { - const float* input_ptr_batch = input->data.f + t * n_batch * n_input; - float* output_ptr_batch = output->data.f + t * n_batch * n_output; - - kernel_utils::LstmStep( - input_ptr_batch, input_to_input_weights_ptr, - input_to_forget_weights_ptr, input_to_cell_weights_ptr, - input_to_output_weights_ptr, recurrent_to_input_weights_ptr, - recurrent_to_forget_weights_ptr, recurrent_to_cell_weights_ptr, - recurrent_to_output_weights_ptr, cell_to_input_weights_ptr, - cell_to_forget_weights_ptr, cell_to_output_weights_ptr, - input_gate_bias_ptr, forget_gate_bias_ptr, cell_bias_ptr, - output_gate_bias_ptr, projection_weights_ptr, projection_bias_ptr, - params, n_batch, n_cell, n_input, n_output, output_state_ptr, - cell_state_ptr, input_gate_scratch, forget_gate_scratch, cell_scratch, - output_gate_scratch, output_ptr_batch); + TfLiteTensor* activation_state = + GetVariableInput(context, node, kInputActivationStateTensor); + TfLiteTensor* cell_state = + GetVariableInput(context, node, kInputCellStateTensor); + + TfLiteTensor* output = GetOutput(context, node, kOutputTensor); + + // Copy out the LSTM specific params so they can be passed in the function. 
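// (The node's builtin data here is a TfLiteUnidirectionalSequenceLSTMParams,
// while the shared lstm_eval helpers expect a TfLiteLSTMParams, so the common
// fields -- activation, cell_clip, proj_clip -- are copied across below.)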
+ TfLiteLSTMParams lstm_params; + lstm_params.activation = params->activation; + lstm_params.cell_clip = params->cell_clip; + lstm_params.proj_clip = params->proj_clip; + + switch (input_to_output_weights->type) { + case kTfLiteFloat32: { + return lstm_eval::EvalFloat( + input, input_to_input_weights, input_to_forget_weights, + input_to_cell_weights, input_to_output_weights, + recurrent_to_input_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights, + cell_to_input_weights, cell_to_forget_weights, cell_to_output_weights, + /*aux_input=*/nullptr, + /*aux_input_to_input_weights=*/nullptr, + /*aux_input_to_forget_weights=*/nullptr, + /*aux_input_to_cell_weights=*/nullptr, + /*aux_input_to_output_weights=*/nullptr, input_gate_bias, + forget_gate_bias, cell_bias, output_gate_bias, projection_weights, + projection_bias, &lstm_params, /*forward_sequence=*/true, + /*output_offset=*/0, scratch_buffer, activation_state, cell_state, + output); + } + case kTfLiteUInt8: { + TfLiteTensor* input_quantized = GetTemporary(context, node, /*index=*/1); + TfLiteTensor* activation_state_quantized = + GetTemporary(context, node, /*index=*/2); + TfLiteTensor* cell_state_quantized = + GetTemporary(context, node, /*index=*/3); + TfLiteTensor* scaling_factors = GetTemporary(context, node, /*index=*/4); + TfLiteTensor* prod_scaling_factors = + GetTemporary(context, node, /*index=*/5); + TfLiteTensor* recovered_cell_weights = + GetTemporary(context, node, /*index=*/6); + return lstm_eval::EvalHybrid( + input, input_to_input_weights, input_to_forget_weights, + input_to_cell_weights, input_to_output_weights, + recurrent_to_input_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights, + cell_to_input_weights, cell_to_forget_weights, cell_to_output_weights, + /*aux_input=*/nullptr, + /*aux_input_to_input_weights=*/nullptr, + /*aux_input_to_forget_weights=*/nullptr, + /*aux_input_to_cell_weights=*/nullptr, + /*aux_input_to_output_weights=*/nullptr, input_gate_bias, + forget_gate_bias, cell_bias, output_gate_bias, projection_weights, + projection_bias, &lstm_params, /*forward_sequence=*/true, + /*output_offset=*/0, scratch_buffer, scaling_factors, + prod_scaling_factors, recovered_cell_weights, input_quantized, + /*aux_input_quantized=*/nullptr, activation_state_quantized, + cell_state_quantized, activation_state, cell_state, output); + } + default: + context->ReportError(context, "Type %d is not currently supported.", + input_to_output_weights->type); + return kTfLiteError; } return kTfLiteOk; } - } // namespace unidirectional_sequence_lstm TfLiteRegistration* Register_UNIDIRECTIONAL_SEQUENCE_LSTM() { diff --git a/tensorflow/contrib/lite/kernels/unidirectional_sequence_lstm_test.cc b/tensorflow/contrib/lite/kernels/unidirectional_sequence_lstm_test.cc index 5881ced7c7..c97b0fdd61 100644 --- a/tensorflow/contrib/lite/kernels/unidirectional_sequence_lstm_test.cc +++ b/tensorflow/contrib/lite/kernels/unidirectional_sequence_lstm_test.cc @@ -14,7 +14,6 @@ limitations under the License. ==============================================================================*/ // Unit test for TFLite Sequential LSTM op. 
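// The tests below exercise the same golden inputs/outputs against both kernel
// paths: the float path, and the hybrid path that Eval() selects when the
// weight tensors are of type kTfLiteUInt8. Which path runs is determined
// entirely by the TensorType used for the weight inputs of the op model, e.g.
// (hypothetical usage, remaining constructor arguments elided):
//
//   UnidirectionalLSTMOpModel float_lstm(/*...*/, input_shapes);
//   HybridUnidirectionalLSTMOpModel hybrid_lstm(/*...*/, input_shapes);  // UINT8 weights
//
// The hybrid tests reuse the float goldens with a looser tolerance, since
// symmetric weight quantization introduces a small error.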
-#include <iomanip> #include <memory> #include <vector> @@ -37,7 +36,8 @@ class UnidirectionalLSTMOpModel : public SingleOpModel { bool use_peephole, bool use_projection_weights, bool use_projection_bias, float cell_clip, float proj_clip, - const std::vector<std::vector<int>>& input_shapes) + const std::vector<std::vector<int>>& input_shapes, + const TensorType& weights_type = TensorType_FLOAT32) : n_batch_(n_batch), n_input_(n_input), n_cell_(n_cell), @@ -48,31 +48,31 @@ class UnidirectionalLSTMOpModel : public SingleOpModel { if (use_cifg) { input_to_input_weights_ = AddNullInput(); } else { - input_to_input_weights_ = AddInput(TensorType_FLOAT32); + input_to_input_weights_ = AddInput(weights_type); } - input_to_forget_weights_ = AddInput(TensorType_FLOAT32); - input_to_cell_weights_ = AddInput(TensorType_FLOAT32); - input_to_output_weights_ = AddInput(TensorType_FLOAT32); + input_to_forget_weights_ = AddInput(weights_type); + input_to_cell_weights_ = AddInput(weights_type); + input_to_output_weights_ = AddInput(weights_type); if (use_cifg) { recurrent_to_input_weights_ = AddNullInput(); } else { - recurrent_to_input_weights_ = AddInput(TensorType_FLOAT32); + recurrent_to_input_weights_ = AddInput(weights_type); } - recurrent_to_forget_weights_ = AddInput(TensorType_FLOAT32); - recurrent_to_cell_weights_ = AddInput(TensorType_FLOAT32); - recurrent_to_output_weights_ = AddInput(TensorType_FLOAT32); + recurrent_to_forget_weights_ = AddInput(weights_type); + recurrent_to_cell_weights_ = AddInput(weights_type); + recurrent_to_output_weights_ = AddInput(weights_type); if (use_peephole) { if (use_cifg) { cell_to_input_weights_ = AddNullInput(); } else { - cell_to_input_weights_ = AddInput(TensorType_FLOAT32); + cell_to_input_weights_ = AddInput(weights_type); } - cell_to_forget_weights_ = AddInput(TensorType_FLOAT32); - cell_to_output_weights_ = AddInput(TensorType_FLOAT32); + cell_to_forget_weights_ = AddInput(weights_type); + cell_to_output_weights_ = AddInput(weights_type); } else { cell_to_input_weights_ = AddNullInput(); cell_to_forget_weights_ = AddNullInput(); @@ -89,7 +89,7 @@ class UnidirectionalLSTMOpModel : public SingleOpModel { output_gate_bias_ = AddInput(TensorType_FLOAT32); if (use_projection_weights) { - projection_weights_ = AddInput(TensorType_FLOAT32); + projection_weights_ = AddInput(weights_type); if (use_projection_bias) { projection_bias_ = AddInput(TensorType_FLOAT32); } else { @@ -100,15 +100,22 @@ class UnidirectionalLSTMOpModel : public SingleOpModel { projection_bias_ = AddNullInput(); } - output_state_ = AddOutput(TensorType_FLOAT32); - cell_state_ = AddOutput(TensorType_FLOAT32); + // Adding the 2 input state tensors. 
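// (The state is now carried by variable input tensors rather than the former
// output_state/cell_state outputs: variable tensors persist across Invoke()
// calls and start out zero-filled, which is why the old ResetOutputState /
// ResetCellState helpers are gone while the goldens stay unchanged.)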
+ input_activation_state_ = + AddInput(TensorData{TensorType_FLOAT32, {n_output_ * n_batch_}}, + /*is_variable=*/true); + input_cell_state_ = + AddInput(TensorData{TensorType_FLOAT32, {n_cell_ * n_batch_}}, + /*is_variable=*/true); + output_ = AddOutput(TensorType_FLOAT32); - SetBuiltinOp(BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_LSTM, - BuiltinOptions_LSTMOptions, - CreateLSTMOptions(builder_, ActivationFunctionType_TANH, - cell_clip, proj_clip) - .Union()); + SetBuiltinOp( + BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_LSTM, + BuiltinOptions_UnidirectionalSequenceLSTMOptions, + CreateUnidirectionalSequenceLSTMOptions( + builder_, ActivationFunctionType_TANH, cell_clip, proj_clip) + .Union()); BuildInterpreter(input_shapes); } @@ -180,24 +187,9 @@ class UnidirectionalLSTMOpModel : public SingleOpModel { PopulateTensor(projection_bias_, f); } - void ResetOutputState() { - const int zero_buffer_size = n_cell_ * n_batch_; - std::unique_ptr<float[]> zero_buffer(new float[zero_buffer_size]); - memset(zero_buffer.get(), 0, zero_buffer_size * sizeof(float)); - PopulateTensor(output_state_, 0, zero_buffer.get(), - zero_buffer.get() + zero_buffer_size); - } - - void ResetCellState() { - const int zero_buffer_size = n_cell_ * n_batch_; - std::unique_ptr<float[]> zero_buffer(new float[zero_buffer_size]); - memset(zero_buffer.get(), 0, zero_buffer_size * sizeof(float)); - PopulateTensor(cell_state_, 0, zero_buffer.get(), - zero_buffer.get() + zero_buffer_size); - } - - void SetInput(int offset, float* begin, float* end) { - PopulateTensor(input_, offset, begin, end); + void SetInput(int offset, const float* begin, const float* end) { + PopulateTensor(input_, offset, const_cast<float*>(begin), + const_cast<float*>(end)); } std::vector<float> GetOutput() { return ExtractVector<float>(output_); } @@ -208,7 +200,7 @@ class UnidirectionalLSTMOpModel : public SingleOpModel { int num_batches() { return n_batch_; } int sequence_length() { return sequence_length_; } - private: + protected: int input_; int input_to_input_weights_; int input_to_forget_weights_; @@ -232,9 +224,10 @@ class UnidirectionalLSTMOpModel : public SingleOpModel { int projection_weights_; int projection_bias_; + int input_activation_state_; + int input_cell_state_; + int output_; - int output_state_; - int cell_state_; int n_batch_; int n_input_; @@ -243,7 +236,183 @@ class UnidirectionalLSTMOpModel : public SingleOpModel { int sequence_length_; }; -TEST(LSTMOpTest, BlackBoxTestNoCifgNoPeepholeNoProjectionNoClipping) { +// The hybrid model has quantized weights. 
+class HybridUnidirectionalLSTMOpModel : public UnidirectionalLSTMOpModel { + public: + HybridUnidirectionalLSTMOpModel( + int n_batch, int n_input, int n_cell, int n_output, int sequence_length, + bool use_cifg, bool use_peephole, bool use_projection_weights, + bool use_projection_bias, float cell_clip, float proj_clip, + const std::vector<std::vector<int>>& input_shapes) + : UnidirectionalLSTMOpModel( + n_batch, n_input, n_cell, n_output, sequence_length, use_cifg, + use_peephole, use_projection_weights, use_projection_bias, + cell_clip, proj_clip, input_shapes, TensorType_UINT8) {} + + void SetInputToInputWeights(std::initializer_list<float> f) { + SymmetricQuantizeAndPopulate(input_to_input_weights_, f); + } + + void SetInputToForgetWeights(std::initializer_list<float> f) { + SymmetricQuantizeAndPopulate(input_to_forget_weights_, f); + } + + void SetInputToCellWeights(std::initializer_list<float> f) { + SymmetricQuantizeAndPopulate(input_to_cell_weights_, f); + } + + void SetInputToOutputWeights(std::initializer_list<float> f) { + SymmetricQuantizeAndPopulate(input_to_output_weights_, f); + } + + void SetRecurrentToInputWeights(std::initializer_list<float> f) { + SymmetricQuantizeAndPopulate(recurrent_to_input_weights_, f); + } + + void SetRecurrentToForgetWeights(std::initializer_list<float> f) { + SymmetricQuantizeAndPopulate(recurrent_to_forget_weights_, f); + } + + void SetRecurrentToCellWeights(std::initializer_list<float> f) { + SymmetricQuantizeAndPopulate(recurrent_to_cell_weights_, f); + } + + void SetRecurrentToOutputWeights(std::initializer_list<float> f) { + SymmetricQuantizeAndPopulate(recurrent_to_output_weights_, f); + } + + void SetCellToInputWeights(std::initializer_list<float> f) { + SymmetricQuantizeAndPopulate(cell_to_input_weights_, f); + } + + void SetCellToForgetWeights(std::initializer_list<float> f) { + SymmetricQuantizeAndPopulate(cell_to_forget_weights_, f); + } + + void SetCellToOutputWeights(std::initializer_list<float> f) { + SymmetricQuantizeAndPopulate(cell_to_output_weights_, f); + } + + void SetProjectionWeights(std::initializer_list<float> f) { + SymmetricQuantizeAndPopulate(projection_weights_, f); + } +}; + +class BaseLstmTest : public ::testing::Test { + protected: + // Weights of the LSTM model. Some are optional. + std::initializer_list<float> input_to_input_weights_; + std::initializer_list<float> input_to_cell_weights_; + std::initializer_list<float> input_to_forget_weights_; + std::initializer_list<float> input_to_output_weights_; + std::initializer_list<float> input_gate_bias_; + std::initializer_list<float> cell_gate_bias_; + std::initializer_list<float> forget_gate_bias_; + std::initializer_list<float> output_gate_bias_; + std::initializer_list<float> recurrent_to_input_weights_; + std::initializer_list<float> recurrent_to_cell_weights_; + std::initializer_list<float> recurrent_to_forget_weights_; + std::initializer_list<float> recurrent_to_output_weights_; + std::initializer_list<float> cell_to_input_weights_; + std::initializer_list<float> cell_to_forget_weights_; + std::initializer_list<float> cell_to_output_weights_; + std::initializer_list<float> projection_weights_; + + // LSTM input is stored as num_batch x num_inputs vector. + std::vector<std::vector<float>> lstm_input_; + // LSTM output is stored as num_batch x num_outputs vector. + std::vector<std::vector<float>> lstm_golden_output_; + + // Compares output up to tolerance to the result of the lstm given the input. 
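// (The op takes time-major input of shape {sequence_length, n_batch, n_input},
// while each golden vector is stored per batch, hence the offset
// ((i * num_batches) + b) * num_inputs used below. Worked example with
// n_batch = 2, n_input = 5: step 0 / batch 0 starts at offset 0, step 0 /
// batch 1 at 5, step 1 / batch 0 at 10, step 1 / batch 1 at 15, and so on.)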
+ void VerifyGoldens(const std::vector<std::vector<float>>& input, + const std::vector<std::vector<float>>& output, + UnidirectionalLSTMOpModel* lstm, float tolerance = 1e-5) { + const int num_batches = input.size(); + EXPECT_GT(num_batches, 0); + const int num_inputs = lstm->num_inputs(); + EXPECT_GT(num_inputs, 0); + const int input_sequence_size = input[0].size() / num_inputs; + EXPECT_GT(input_sequence_size, 0); + // Feed the whole sequence as input. + for (int i = 0; i < input_sequence_size; ++i) { + for (int b = 0; b < num_batches; ++b) { + const float* batch_start = input[b].data() + i * num_inputs; + const float* batch_end = batch_start + num_inputs; + + lstm->SetInput(((i * num_batches) + b) * lstm->num_inputs(), + batch_start, batch_end); + } + } + + lstm->Invoke(); + + const int num_outputs = lstm->num_outputs(); + EXPECT_GT(num_outputs, 0); + std::vector<float> expected; + for (int i = 0; i < input_sequence_size; ++i) { + for (int b = 0; b < num_batches; ++b) { + const float* golden_start_batch = output[b].data() + i * num_outputs; + const float* golden_end_batch = golden_start_batch + num_outputs; + + expected.insert(expected.end(), golden_start_batch, golden_end_batch); + } + } + + EXPECT_THAT(lstm->GetOutput(), + ElementsAreArray(ArrayFloatNear(expected, tolerance))); + } +}; + +class NoCifgNoPeepholeNoProjectionNoClippingLstmTest : public BaseLstmTest { + void SetUp() override { + input_to_input_weights_ = {-0.45018822, -0.02338299, -0.0870589, + -0.34550029, 0.04266912, -0.15680569, + -0.34856534, 0.43890524}; + input_to_cell_weights_ = {-0.50013041, 0.1370284, 0.11810488, 0.2013163, + -0.20583314, 0.44344562, 0.22077113, -0.29909778}; + input_to_forget_weights_ = {0.09701663, 0.20334584, -0.50592935, + -0.31343272, -0.40032279, 0.44781327, + 0.01387155, -0.35593212}; + input_to_output_weights_ = {-0.25065863, -0.28290087, 0.04613829, + 0.40525138, 0.44272184, 0.03897077, + -0.1556896, 0.19487578}; + input_gate_bias_ = {0., 0., 0., 0.}; + cell_gate_bias_ = {0., 0., 0., 0.}; + forget_gate_bias_ = {1., 1., 1., 1.}; + output_gate_bias_ = {0., 0., 0., 0.}; + + recurrent_to_input_weights_ = { + -0.0063535, -0.2042388, 0.31454784, -0.35746509, + 0.28902304, 0.08183324, -0.16555229, 0.02286911, + -0.13566875, 0.03034258, 0.48091322, -0.12528998, + 0.24077177, -0.51332325, -0.33502164, 0.10629296}; + + recurrent_to_cell_weights_ = { + -0.3407414, 0.24443203, -0.2078532, 0.26320225, + 0.05695659, -0.00123841, -0.4744786, -0.35869038, + -0.06418842, -0.13502428, -0.501764, 0.22830659, + -0.46367589, 0.26016325, -0.03894562, -0.16368064}; + + recurrent_to_forget_weights_ = { + -0.48684245, -0.06655136, 0.42224967, 0.2112639, + 0.27654213, 0.20864892, -0.07646349, 0.45877004, + 0.00141793, -0.14609534, 0.36447752, 0.09196436, + 0.28053468, 0.01560611, -0.20127171, -0.01140004}; + + recurrent_to_output_weights_ = { + 0.43385774, -0.17194885, 0.2718237, 0.09215671, + 0.24107647, -0.39835793, 0.18212086, 0.01301402, + 0.48572797, -0.50656658, 0.20047462, -0.20607421, + -0.51818722, -0.15390486, 0.0468148, 0.39922136}; + + lstm_input_ = {{2., 3., 3., 4., 1., 1.}}; + lstm_golden_output_ = {{-0.02973187, 0.1229473, 0.20885126, -0.15358765, + -0.03716109, 0.12507336, 0.41193449, -0.20860538, + -0.15053082, 0.09120187, 0.24278517, -0.12222792}}; + } +}; + +TEST_F(NoCifgNoPeepholeNoProjectionNoClippingLstmTest, LstmBlackBoxTest) { const int n_batch = 1; const int n_input = 2; // n_cell and n_output have the same size when there is no projection. 
@@ -252,9 +421,11 @@ TEST(LSTMOpTest, BlackBoxTestNoCifgNoPeepholeNoProjectionNoClipping) { const int sequence_length = 3; UnidirectionalLSTMOpModel lstm( - n_batch, n_input, n_cell, n_output, sequence_length, /*use_cifg=*/false, - /*use_peephole=*/false, /*use_projection_weights=*/false, - /*use_projection_bias=*/false, /*cell_clip=*/0.0, /*proj_clip=*/0.0, + n_batch, n_input, n_cell, n_output, sequence_length, + /*use_cifg=*/false, /*use_peephole=*/false, + /*use_projection_weights=*/false, + /*use_projection_bias=*/false, + /*cell_clip=*/0.0, /*proj_clip=*/0.0, { {sequence_length, n_batch, n_input}, // input tensor @@ -279,79 +450,138 @@ TEST(LSTMOpTest, BlackBoxTestNoCifgNoPeepholeNoProjectionNoClipping) { {0, 0}, // projection_weight tensor {0}, // projection_bias tensor + + {n_batch, n_output}, // activation_state tensor + {n_batch, n_cell}, // cell_state tensor }); - lstm.SetInputToInputWeights({-0.45018822, -0.02338299, -0.0870589, - -0.34550029, 0.04266912, -0.15680569, - -0.34856534, 0.43890524}); + lstm.SetInputToInputWeights(input_to_input_weights_); + lstm.SetInputToCellWeights(input_to_cell_weights_); + lstm.SetInputToForgetWeights(input_to_forget_weights_); + lstm.SetInputToOutputWeights(input_to_output_weights_); - lstm.SetInputToCellWeights({-0.50013041, 0.1370284, 0.11810488, 0.2013163, - -0.20583314, 0.44344562, 0.22077113, - -0.29909778}); + lstm.SetInputGateBias(input_gate_bias_); + lstm.SetCellBias(cell_gate_bias_); + lstm.SetForgetGateBias(forget_gate_bias_); + lstm.SetOutputGateBias(output_gate_bias_); - lstm.SetInputToForgetWeights({0.09701663, 0.20334584, -0.50592935, - -0.31343272, -0.40032279, 0.44781327, - 0.01387155, -0.35593212}); + lstm.SetRecurrentToInputWeights(recurrent_to_input_weights_); + lstm.SetRecurrentToCellWeights(recurrent_to_cell_weights_); + lstm.SetRecurrentToForgetWeights(recurrent_to_forget_weights_); + lstm.SetRecurrentToOutputWeights(recurrent_to_output_weights_); - lstm.SetInputToOutputWeights({-0.25065863, -0.28290087, 0.04613829, - 0.40525138, 0.44272184, 0.03897077, -0.1556896, - 0.19487578}); + VerifyGoldens(lstm_input_, lstm_golden_output_, &lstm); +} + +TEST_F(NoCifgNoPeepholeNoProjectionNoClippingLstmTest, HybridLstmBlackBoxTest) { + const int n_batch = 1; + const int n_input = 2; + // n_cell and n_output have the same size when there is no projection. 
+ const int n_cell = 4; + const int n_output = 4; + const int sequence_length = 3; - lstm.SetInputGateBias({0., 0., 0., 0.}); + HybridUnidirectionalLSTMOpModel lstm( + n_batch, n_input, n_cell, n_output, sequence_length, + /*use_cifg=*/false, /*use_peephole=*/false, + /*use_projection_weights=*/false, + /*use_projection_bias=*/false, /*cell_clip=*/0.0, /*proj_clip=*/0.0, + { + {sequence_length, n_batch, n_input}, // input tensor - lstm.SetCellBias({0., 0., 0., 0.}); + {n_cell, n_input}, // input_to_input_weight tensor + {n_cell, n_input}, // input_to_forget_weight tensor + {n_cell, n_input}, // input_to_cell_weight tensor + {n_cell, n_input}, // input_to_output_weight tensor - lstm.SetForgetGateBias({1., 1., 1., 1.}); + {n_cell, n_output}, // recurrent_to_input_weight tensor + {n_cell, n_output}, // recurrent_to_forget_weight tensor + {n_cell, n_output}, // recurrent_to_cell_weight tensor + {n_cell, n_output}, // recurrent_to_output_weight tensor - lstm.SetOutputGateBias({0., 0., 0., 0.}); + {0}, // cell_to_input_weight tensor + {0}, // cell_to_forget_weight tensor + {0}, // cell_to_output_weight tensor - lstm.SetRecurrentToInputWeights( - {-0.0063535, -0.2042388, 0.31454784, -0.35746509, 0.28902304, 0.08183324, - -0.16555229, 0.02286911, -0.13566875, 0.03034258, 0.48091322, - -0.12528998, 0.24077177, -0.51332325, -0.33502164, 0.10629296}); + {n_cell}, // input_gate_bias tensor + {n_cell}, // forget_gate_bias tensor + {n_cell}, // cell_bias tensor + {n_cell}, // output_gate_bias tensor - lstm.SetRecurrentToCellWeights( - {-0.3407414, 0.24443203, -0.2078532, 0.26320225, 0.05695659, -0.00123841, - -0.4744786, -0.35869038, -0.06418842, -0.13502428, -0.501764, 0.22830659, - -0.46367589, 0.26016325, -0.03894562, -0.16368064}); + {0, 0}, // projection_weight tensor + {0}, // projection_bias tensor - lstm.SetRecurrentToForgetWeights( - {-0.48684245, -0.06655136, 0.42224967, 0.2112639, 0.27654213, 0.20864892, - -0.07646349, 0.45877004, 0.00141793, -0.14609534, 0.36447752, 0.09196436, - 0.28053468, 0.01560611, -0.20127171, -0.01140004}); + {n_batch, n_output}, // activation_state tensor + {n_batch, n_cell}, // cell_state tensor + }); - lstm.SetRecurrentToOutputWeights( - {0.43385774, -0.17194885, 0.2718237, 0.09215671, 0.24107647, -0.39835793, - 0.18212086, 0.01301402, 0.48572797, -0.50656658, 0.20047462, -0.20607421, - -0.51818722, -0.15390486, 0.0468148, 0.39922136}); + lstm.SetInputToInputWeights(input_to_input_weights_); + lstm.SetInputToCellWeights(input_to_cell_weights_); + lstm.SetInputToForgetWeights(input_to_forget_weights_); + lstm.SetInputToOutputWeights(input_to_output_weights_); - // Input should have n_input * sequence_length many values. 
- static float lstm_input[] = {2., 3., 3., 4., 1., 1.}; - static float lstm_golden_output[] = {-0.02973187, 0.1229473, 0.20885126, - -0.15358765, -0.03716109, 0.12507336, - 0.41193449, -0.20860538, -0.15053082, - 0.09120187, 0.24278517, -0.12222792}; + lstm.SetInputGateBias(input_gate_bias_); + lstm.SetCellBias(cell_gate_bias_); + lstm.SetForgetGateBias(forget_gate_bias_); + lstm.SetOutputGateBias(output_gate_bias_); - // Resetting cell_state and output_state - lstm.ResetCellState(); - lstm.ResetOutputState(); + lstm.SetRecurrentToInputWeights(recurrent_to_input_weights_); + lstm.SetRecurrentToCellWeights(recurrent_to_cell_weights_); + lstm.SetRecurrentToForgetWeights(recurrent_to_forget_weights_); + lstm.SetRecurrentToOutputWeights(recurrent_to_output_weights_); - float* batch0_start = lstm_input; - float* batch0_end = batch0_start + lstm.num_inputs() * lstm.sequence_length(); + VerifyGoldens(lstm_input_, lstm_golden_output_, &lstm, + /*tolerance=*/0.0157651); +} - lstm.SetInput(0, batch0_start, batch0_end); +class CifgPeepholeNoProjectionNoClippingLstmTest : public BaseLstmTest { + void SetUp() override { + input_to_cell_weights_ = {-0.49770179, -0.27711356, -0.09624726, + 0.05100781, 0.04717243, 0.48944736, + -0.38535351, -0.17212132}; - lstm.Invoke(); + input_to_forget_weights_ = {-0.55291498, -0.42866567, 0.13056988, + -0.3633365, -0.22755712, 0.28253698, + 0.24407166, 0.33826375}; - float* golden_start = lstm_golden_output; - float* golden_end = - golden_start + lstm.num_outputs() * lstm.sequence_length(); - std::vector<float> expected; - expected.insert(expected.end(), golden_start, golden_end); - EXPECT_THAT(lstm.GetOutput(), ElementsAreArray(ArrayFloatNear(expected))); -} + input_to_output_weights_ = {0.10725588, -0.02335852, -0.55932593, + -0.09426838, -0.44257352, 0.54939759, + 0.01533556, 0.42751634}; + cell_gate_bias_ = {0., 0., 0., 0.}; + forget_gate_bias_ = {1., 1., 1., 1.}; + output_gate_bias_ = {0., 0., 0., 0.}; + + recurrent_to_cell_weights_ = { + 0.54066205, -0.32668582, -0.43562764, -0.56094903, + 0.42957711, 0.01841056, -0.32764608, -0.33027974, + -0.10826075, 0.20675004, 0.19069612, -0.03026325, + -0.54532051, 0.33003211, 0.44901288, 0.21193194}; + + recurrent_to_forget_weights_ = { + -0.13832897, -0.0515101, -0.2359007, -0.16661474, + -0.14340827, 0.36986142, 0.23414481, 0.55899, + 0.10798943, -0.41174671, 0.17751795, -0.34484994, + -0.35874045, -0.11352962, 0.27268326, 0.54058349}; + + recurrent_to_output_weights_ = { + 0.41613156, 0.42610586, -0.16495961, -0.5663873, + 0.30579174, -0.05115908, -0.33941799, 0.23364776, + 0.11178309, 0.09481031, -0.26424935, 0.46261835, + 0.50248802, 0.26114327, -0.43736315, 0.33149987}; + + cell_to_forget_weights_ = {0.47485286, -0.51955009, -0.24458408, + 0.31544167}; + cell_to_output_weights_ = {-0.17135078, 0.82760304, 0.85573703, + -0.77109635}; + + lstm_input_ = {{2., 3., 3., 4., 1., 1.}}; + lstm_golden_output_ = {{-0.36444446, -0.00352185, 0.12886585, -0.05163646, + -0.42312205, -0.01218222, 0.24201041, -0.08124574, + -0.358325, -0.04621704, 0.21641694, -0.06471302}}; + } +}; -TEST(LSTMOpTest, BlackBoxTestWithCifgWithPeepholeNoProjectionNoClipping) { +TEST_F(CifgPeepholeNoProjectionNoClippingLstmTest, LstmBlackBoxTest) { const int n_batch = 1; const int n_input = 2; // n_cell and n_output have the same size when there is no projection. 
@@ -360,9 +590,11 @@ TEST(LSTMOpTest, BlackBoxTestWithCifgWithPeepholeNoProjectionNoClipping) { const int sequence_length = 3; UnidirectionalLSTMOpModel lstm( - n_batch, n_input, n_cell, n_output, sequence_length, /*use_cifg=*/true, - /*use_peephole=*/true, /*use_projection_weights=*/false, - /*use_projection_bias=*/false, /*cell_clip=*/0.0, /*proj_clip=*/0.0, + n_batch, n_input, n_cell, n_output, sequence_length, + /*use_cifg=*/true, /*use_peephole=*/true, + /*use_projection_weights=*/false, + /*use_projection_bias=*/false, + /*cell_clip=*/0.0, /*proj_clip=*/0.0, { {sequence_length, n_batch, n_input}, // input tensor @@ -387,73 +619,690 @@ TEST(LSTMOpTest, BlackBoxTestWithCifgWithPeepholeNoProjectionNoClipping) { {0, 0}, // projection_weight tensor {0}, // projection_bias tensor + + {n_batch, n_output}, // activation_state tensor + {n_batch, n_cell}, // cell_state tensor }); - lstm.SetInputToCellWeights({-0.49770179, -0.27711356, -0.09624726, 0.05100781, - 0.04717243, 0.48944736, -0.38535351, - -0.17212132}); + lstm.SetInputToCellWeights(input_to_cell_weights_); + lstm.SetInputToForgetWeights(input_to_forget_weights_); + lstm.SetInputToOutputWeights(input_to_output_weights_); - lstm.SetInputToForgetWeights({-0.55291498, -0.42866567, 0.13056988, - -0.3633365, -0.22755712, 0.28253698, 0.24407166, - 0.33826375}); + lstm.SetCellBias(cell_gate_bias_); + lstm.SetForgetGateBias(forget_gate_bias_); + lstm.SetOutputGateBias(output_gate_bias_); - lstm.SetInputToOutputWeights({0.10725588, -0.02335852, -0.55932593, - -0.09426838, -0.44257352, 0.54939759, - 0.01533556, 0.42751634}); + lstm.SetRecurrentToCellWeights(recurrent_to_cell_weights_); + lstm.SetRecurrentToForgetWeights(recurrent_to_forget_weights_); + lstm.SetRecurrentToOutputWeights(recurrent_to_output_weights_); - lstm.SetCellBias({0., 0., 0., 0.}); + lstm.SetCellToForgetWeights(cell_to_forget_weights_); + lstm.SetCellToOutputWeights(cell_to_output_weights_); - lstm.SetForgetGateBias({1., 1., 1., 1.}); + VerifyGoldens(lstm_input_, lstm_golden_output_, &lstm); +} - lstm.SetOutputGateBias({0., 0., 0., 0.}); +TEST_F(CifgPeepholeNoProjectionNoClippingLstmTest, HybridLstmBlackBoxTest) { + const int n_batch = 1; + const int n_input = 2; + // n_cell and n_output have the same size when there is no projection. 
+ const int n_cell = 4; + const int n_output = 4; + const int sequence_length = 3; - lstm.SetRecurrentToCellWeights( - {0.54066205, -0.32668582, -0.43562764, -0.56094903, 0.42957711, - 0.01841056, -0.32764608, -0.33027974, -0.10826075, 0.20675004, - 0.19069612, -0.03026325, -0.54532051, 0.33003211, 0.44901288, - 0.21193194}); + HybridUnidirectionalLSTMOpModel lstm( + n_batch, n_input, n_cell, n_output, sequence_length, + /*use_cifg=*/true, /*use_peephole=*/true, + /*use_projection_weights=*/false, + /*use_projection_bias=*/false, + /*cell_clip=*/0.0, /*proj_clip=*/0.0, + { + {sequence_length, n_batch, n_input}, // input tensor - lstm.SetRecurrentToForgetWeights( - {-0.13832897, -0.0515101, -0.2359007, -0.16661474, -0.14340827, - 0.36986142, 0.23414481, 0.55899, 0.10798943, -0.41174671, 0.17751795, - -0.34484994, -0.35874045, -0.11352962, 0.27268326, 0.54058349}); + {0, 0}, // input_to_input_weight tensor + {n_cell, n_input}, // input_to_forget_weight tensor + {n_cell, n_input}, // input_to_cell_weight tensor + {n_cell, n_input}, // input_to_output_weight tensor - lstm.SetRecurrentToOutputWeights( - {0.41613156, 0.42610586, -0.16495961, -0.5663873, 0.30579174, -0.05115908, - -0.33941799, 0.23364776, 0.11178309, 0.09481031, -0.26424935, 0.46261835, - 0.50248802, 0.26114327, -0.43736315, 0.33149987}); + {0, 0}, // recurrent_to_input_weight tensor + {n_cell, n_output}, // recurrent_to_forget_weight tensor + {n_cell, n_output}, // recurrent_to_cell_weight tensor + {n_cell, n_output}, // recurrent_to_output_weight tensor - lstm.SetCellToForgetWeights( - {0.47485286, -0.51955009, -0.24458408, 0.31544167}); - lstm.SetCellToOutputWeights( - {-0.17135078, 0.82760304, 0.85573703, -0.77109635}); + {0}, // cell_to_input_weight tensor + {n_cell}, // cell_to_forget_weight tensor + {n_cell}, // cell_to_output_weight tensor - static float lstm_input[] = {2., 3., 3., 4., 1., 1.}; - static float lstm_golden_output[] = {-0.36444446, -0.00352185, 0.12886585, - -0.05163646, -0.42312205, -0.01218222, - 0.24201041, -0.08124574, -0.358325, - -0.04621704, 0.21641694, -0.06471302}; + {0}, // input_gate_bias tensor + {n_cell}, // forget_gate_bias tensor + {n_cell}, // cell_bias tensor + {n_cell}, // output_gate_bias tensor + + {0, 0}, // projection_weight tensor + {0}, // projection_bias tensor + + {n_batch, n_output}, // activation_state tensor + {n_batch, n_cell}, // cell_state tensor + }); - // Resetting cell_state and output_state - lstm.ResetCellState(); - lstm.ResetOutputState(); + lstm.SetInputToCellWeights(input_to_cell_weights_); + lstm.SetInputToForgetWeights(input_to_forget_weights_); + lstm.SetInputToOutputWeights(input_to_output_weights_); - float* batch0_start = lstm_input; - float* batch0_end = batch0_start + lstm.num_inputs() * lstm.sequence_length(); + lstm.SetCellBias(cell_gate_bias_); + lstm.SetForgetGateBias(forget_gate_bias_); + lstm.SetOutputGateBias(output_gate_bias_); - lstm.SetInput(0, batch0_start, batch0_end); + lstm.SetRecurrentToCellWeights(recurrent_to_cell_weights_); + lstm.SetRecurrentToForgetWeights(recurrent_to_forget_weights_); + lstm.SetRecurrentToOutputWeights(recurrent_to_output_weights_); - lstm.Invoke(); + lstm.SetCellToForgetWeights(cell_to_forget_weights_); + lstm.SetCellToOutputWeights(cell_to_output_weights_); - float* golden_start = lstm_golden_output; - float* golden_end = - golden_start + lstm.num_outputs() * lstm.sequence_length(); - std::vector<float> expected; - expected.insert(expected.end(), golden_start, golden_end); - EXPECT_THAT(lstm.GetOutput(), 
ElementsAreArray(ArrayFloatNear(expected))); + VerifyGoldens(lstm_input_, lstm_golden_output_, &lstm, /*tolerance=*/0.03573); } -TEST(LSTMOpTest, BlackBoxTestWithPeepholeWithProjectionNoClipping) { +class NoCifgPeepholeProjectionClippingLstmTest : public BaseLstmTest { + void SetUp() override { + input_to_input_weights_ = { + 0.021393683, 0.06124551, 0.046905167, -0.014657677, -0.03149463, + 0.09171803, 0.14647801, 0.10797193, -0.0057968358, 0.0019193048, + -0.2726754, 0.10154029, -0.018539885, 0.080349885, -0.10262385, + -0.022599787, -0.09121155, -0.008675967, -0.045206103, -0.0821282, + -0.008045952, 0.015478081, 0.055217247, 0.038719587, 0.044153627, + -0.06453243, 0.05031825, -0.046935108, -0.008164439, 0.014574226, + -0.1671009, -0.15519552, -0.16819797, -0.13971269, -0.11953059, + 0.25005487, -0.22790983, 0.009855087, -0.028140958, -0.11200698, + 0.11295408, -0.0035217577, 0.054485075, 0.05184695, 0.064711206, + 0.10989193, 0.11674786, 0.03490607, 0.07727357, 0.11390585, + -0.1863375, -0.1034451, -0.13945189, -0.049401227, -0.18767063, + 0.042483903, 0.14233552, 0.13832581, 0.18350165, 0.14545603, + -0.028545704, 0.024939531, 0.050929718, 0.0076203286, -0.0029723682, + -0.042484224, -0.11827596, -0.09171104, -0.10808628, -0.16327988, + -0.2273378, -0.0993647, -0.017155107, 0.0023917493, 0.049272764, + 0.0038534778, 0.054764505, 0.089753784, 0.06947234, 0.08014476, + -0.04544234, -0.0497073, -0.07135631, -0.048929106, -0.004042012, + -0.009284026, 0.018042054, 0.0036860977, -0.07427302, -0.11434604, + -0.018995456, 0.031487543, 0.012834908, 0.019977754, 0.044256654, + -0.39292613, -0.18519334, -0.11651281, -0.06809892, 0.011373677}; + + input_to_forget_weights_ = { + -0.0018401089, -0.004852237, 0.03698424, 0.014181704, + 0.028273236, -0.016726194, -0.05249759, -0.10204261, + 0.00861066, -0.040979505, -0.009899187, 0.01923892, + -0.028177269, -0.08535103, -0.14585495, 0.10662567, + -0.01909731, -0.017883534, -0.0047269356, -0.045103323, + 0.0030784295, 0.076784775, 0.07463696, 0.094531395, + 0.0814421, -0.12257899, -0.033945758, -0.031303465, + 0.045630626, 0.06843887, -0.13492945, -0.012480007, + -0.0811829, -0.07224499, -0.09628791, 0.045100946, + 0.0012300825, 0.013964662, 0.099372394, 0.02543059, + 0.06958324, 0.034257296, 0.0482646, 0.06267997, + 0.052625068, 0.12784666, 0.07077897, 0.025725935, + 0.04165009, 0.07241905, 0.018668644, -0.037377294, + -0.06277783, -0.08833636, -0.040120605, -0.011405586, + -0.007808335, -0.010301386, -0.005102167, 0.027717464, + 0.05483423, 0.11449111, 0.11289652, 0.10939839, + 0.13396506, -0.08402166, -0.01901462, -0.044678304, + -0.07720565, 0.014350063, -0.11757958, -0.0652038, + -0.08185733, -0.076754324, -0.092614375, 0.10405491, + 0.052960336, 0.035755895, 0.035839386, -0.012540553, + 0.036881298, 0.02913376, 0.03420159, 0.05448447, + -0.054523353, 0.02582715, 0.02327355, -0.011857179, + -0.0011980024, -0.034641717, -0.026125094, -0.17582615, + -0.15923657, -0.27486774, -0.0006143371, 0.0001771948, + -8.470171e-05, 0.02651807, 0.045790765, 0.06956496}; + + input_to_cell_weights_ = { + -0.04580283, -0.09549462, -0.032418985, -0.06454633, + -0.043528453, 0.043018587, -0.049152344, -0.12418144, + -0.078985475, -0.07596889, 0.019484362, -0.11434962, + -0.0074034138, -0.06314844, -0.092981495, 0.0062155537, + -0.025034338, -0.0028890965, 0.048929527, 0.06235075, + 0.10665918, -0.032036792, -0.08505916, -0.10843358, + -0.13002433, -0.036816437, -0.02130134, -0.016518239, + 0.0047691227, -0.0025825808, 0.066017866, 0.029991534, + -0.10652836, 
-0.1037554, -0.13056071, -0.03266643, + -0.033702414, -0.006473424, -0.04611692, 0.014419339, + -0.025174323, 0.0396852, 0.081777506, 0.06157468, + 0.10210095, -0.009658194, 0.046511717, 0.03603906, + 0.0069369148, 0.015960095, -0.06507666, 0.09551598, + 0.053568836, 0.06408714, 0.12835667, -0.008714329, + -0.20211966, -0.12093674, 0.029450472, 0.2849013, + -0.029227901, 0.1164364, -0.08560263, 0.09941786, + -0.036999565, -0.028842626, -0.0033637602, -0.017012902, + -0.09720865, -0.11193351, -0.029155117, -0.017936034, + -0.009768936, -0.04223324, -0.036159635, 0.06505112, + -0.021742892, -0.023377212, -0.07221364, -0.06430552, + 0.05453865, 0.091149814, 0.06387331, 0.007518393, + 0.055960953, 0.069779344, 0.046411168, 0.10509911, + 0.07463894, 0.0075130584, 0.012850982, 0.04555431, + 0.056955688, 0.06555285, 0.050801456, -0.009862683, + 0.00826772, -0.026555609, -0.0073611983, -0.0014897042}; + + input_to_output_weights_ = { + -0.0998932, -0.07201956, -0.052803773, -0.15629593, -0.15001918, + -0.07650751, 0.02359855, -0.075155355, -0.08037709, -0.15093534, + 0.029517552, -0.04751393, 0.010350531, -0.02664851, -0.016839722, + -0.023121163, 0.0077019283, 0.012851257, -0.05040649, -0.0129761, + -0.021737747, -0.038305793, -0.06870586, -0.01481247, -0.001285394, + 0.10124236, 0.083122835, 0.053313006, -0.062235646, -0.075637154, + -0.027833903, 0.029774971, 0.1130802, 0.09218906, 0.09506135, + -0.086665764, -0.037162706, -0.038880914, -0.035832845, -0.014481564, + -0.09825003, -0.12048569, -0.097665586, -0.05287633, -0.0964047, + -0.11366429, 0.035777505, 0.13568819, 0.052451383, 0.050649304, + 0.05798951, -0.021852335, -0.099848844, 0.014740475, -0.078897946, + 0.04974699, 0.014160473, 0.06973932, 0.04964942, 0.033364646, + 0.08190124, 0.025535367, 0.050893165, 0.048514254, 0.06945813, + -0.078907564, -0.06707616, -0.11844508, -0.09986688, -0.07509403, + 0.06263226, 0.14925587, 0.20188436, 0.12098451, 0.14639415, + 0.0015017595, -0.014267382, -0.03417257, 0.012711468, 0.0028300495, + -0.024758482, -0.05098548, -0.0821182, 0.014225672, 0.021544158, + 0.08949725, 0.07505268, -0.0020780868, 0.04908258, 0.06476295, + -0.022907063, 0.027562456, 0.040185735, 0.019567577, -0.015598739, + -0.049097303, -0.017121866, -0.083368234, -0.02332002, -0.0840956}; + + input_gate_bias_ = {0.02234832, 0.14757581, 0.18176508, 0.10380666, + 0.053110216, -0.06928846, -0.13942584, -0.11816189, + 0.19483899, 0.03652339, -0.10250295, 0.036714908, + -0.18426876, 0.036065217, 0.21810818, 0.02383196, + -0.043370757, 0.08690144, -0.04444982, 0.00030581196}; + + forget_gate_bias_ = {0.035185695, -0.042891346, -0.03032477, 0.23027696, + 0.11098921, 0.15378423, 0.09263801, 0.09790885, + 0.09508917, 0.061199076, 0.07665568, -0.015443159, + -0.03499149, 0.046190713, 0.08895977, 0.10899629, + 0.40694186, 0.06030037, 0.012413437, -0.06108739}; + + cell_gate_bias_ = {-0.024379363, 0.0055531194, 0.23377132, 0.033463873, + -0.1483596, -0.10639995, -0.091433935, 0.058573797, + -0.06809782, -0.07889636, -0.043246906, -0.09829136, + -0.4279842, 0.034901652, 0.18797937, 0.0075234566, + 0.016178843, 0.1749513, 0.13975595, 0.92058027}; + + output_gate_bias_ = {0.046159424, -0.0012809046, 0.03563469, 0.12648113, + 0.027195795, 0.35373217, -0.018957434, 0.008907322, + -0.0762701, 0.12018895, 0.04216877, 0.0022856654, + 0.040952638, 0.3147856, 0.08225149, -0.057416286, + -0.14995944, -0.008040261, 0.13208859, 0.029760877}; + + recurrent_to_input_weights_ = { + -0.001374326, -0.078856036, 0.10672688, 0.029162422, + -0.11585556, 0.02557986, 
-0.13446963, -0.035785314, + -0.01244275, 0.025961924, -0.02337298, -0.044228926, + -0.055839065, -0.046598054, -0.010546039, -0.06900766, + 0.027239809, 0.022582639, -0.013296484, -0.05459212, + 0.08981, -0.045407712, 0.08682226, -0.06867011, + -0.14390695, -0.02916037, 0.000996957, 0.091420636, + 0.14283475, -0.07390571, -0.06402044, 0.062524505, + -0.093129106, 0.04860203, -0.08364217, -0.08119002, + 0.009352075, 0.22920375, 0.0016303885, 0.11583097, + -0.13732095, 0.012405723, -0.07551853, 0.06343048, + 0.12162708, -0.031923793, -0.014335606, 0.01790974, + -0.10650317, -0.0724401, 0.08554849, -0.05727212, + 0.06556731, -0.042729504, -0.043227166, 0.011683251, + -0.013082158, -0.029302018, -0.010899579, -0.062036745, + -0.022509435, -0.00964907, -0.01567329, 0.04260106, + -0.07787477, -0.11576462, 0.017356863, 0.048673786, + -0.017577527, -0.05527947, -0.082487635, -0.040137455, + -0.10820036, -0.04666372, 0.022746278, -0.07851417, + 0.01068115, 0.032956902, 0.022433773, 0.0026891115, + 0.08944216, -0.0685835, 0.010513544, 0.07228705, + 0.02032331, -0.059686817, -0.0005566496, -0.086984694, + 0.040414046, -0.1380399, 0.094208956, -0.05722982, + 0.012092817, -0.04989123, -0.086576, -0.003399834, + -0.04696032, -0.045747425, 0.10091314, 0.048676282, + -0.029037097, 0.031399418, -0.0040285117, 0.047237843, + 0.09504992, 0.041799378, -0.049185462, -0.031518843, + -0.10516937, 0.026374253, 0.10058866, -0.0033195973, + -0.041975245, 0.0073591834, 0.0033782164, -0.004325073, + -0.10167381, 0.042500053, -0.01447153, 0.06464186, + -0.017142897, 0.03312627, 0.009205989, 0.024138335, + -0.011337001, 0.035530265, -0.010912711, 0.0706555, + -0.005894094, 0.051841937, -0.1401738, -0.02351249, + 0.0365468, 0.07590991, 0.08838724, 0.021681072, + -0.10086113, 0.019608743, -0.06195883, 0.077335775, + 0.023646897, -0.095322326, 0.02233014, 0.09756986, + -0.048691444, -0.009579111, 0.07595467, 0.11480546, + -0.09801813, 0.019894179, 0.08502348, 0.004032281, + 0.037211012, 0.068537936, -0.048005626, -0.091520436, + -0.028379958, -0.01556313, 0.06554592, -0.045599163, + -0.01672207, -0.020169014, -0.011877351, -0.20212261, + 0.010889619, 0.0047078193, 0.038385306, 0.08540671, + -0.017140968, -0.0035865551, 0.016678626, 0.005633034, + 0.015963363, 0.00871737, 0.060130805, 0.028611384, + 0.10109069, -0.015060172, -0.07894427, 0.06401885, + 0.011584063, -0.024466386, 0.0047652307, -0.09041358, + 0.030737216, -0.0046374933, 0.14215417, -0.11823516, + 0.019899689, 0.006106124, -0.027092824, 0.0786356, + 0.05052217, -0.058925, -0.011402121, -0.024987547, + -0.0013661642, -0.06832946, -0.015667673, -0.1083353, + -0.00096863037, -0.06988685, -0.053350925, -0.027275559, + -0.033664223, -0.07978348, -0.025200296, -0.017207067, + -0.058403496, -0.055697463, 0.005798788, 0.12965427, + -0.062582195, 0.0013350133, -0.10482091, 0.0379771, + 0.072521195, -0.0029455067, -0.13797039, -0.03628521, + 0.013806405, -0.017858358, -0.01008298, -0.07700066, + -0.017081132, 0.019358726, 0.0027079724, 0.004635139, + 0.062634714, -0.02338735, -0.039547626, -0.02050681, + 0.03385117, -0.083611414, 0.002862572, -0.09421313, + 0.058618143, -0.08598433, 0.00972939, 0.023867095, + -0.053934585, -0.023203006, 0.07452513, -0.048767887, + -0.07314807, -0.056307215, -0.10433547, -0.06440842, + 0.04328182, 0.04389765, -0.020006588, -0.09076438, + -0.11652589, -0.021705797, 0.03345259, -0.010329105, + -0.025767034, 0.013057034, -0.07316461, -0.10145612, + 0.06358255, 0.18531723, 0.07759293, 0.12006465, + 0.1305557, 0.058638252, -0.03393652, 
0.09622831, + -0.16253184, -2.4580743e-06, 0.079869635, -0.070196845, + -0.005644518, 0.06857898, -0.12598175, -0.035084512, + 0.03156317, -0.12794146, -0.031963028, 0.04692781, + 0.030070418, 0.0071660685, -0.095516115, -0.004643372, + 0.040170413, -0.062104587, -0.0037324072, 0.0554317, + 0.08184801, -0.019164372, 0.06791302, 0.034257166, + -0.10307039, 0.021943003, 0.046745934, 0.0790918, + -0.0265588, -0.007824208, 0.042546265, -0.00977924, + -0.0002440307, -0.017384544, -0.017990116, 0.12252321, + -0.014512694, -0.08251313, 0.08861942, 0.13589665, + 0.026351685, 0.012641483, 0.07466548, 0.044301085, + -0.045414884, -0.051112458, 0.03444247, -0.08502782, + -0.04106223, -0.028126027, 0.028473156, 0.10467447}; + + recurrent_to_cell_weights_ = { + -0.037322544, 0.018592842, 0.0056175636, -0.06253426, + 0.055647098, -0.05713207, -0.05626563, 0.005559383, + 0.03375411, -0.025757805, -0.088049285, 0.06017052, + -0.06570978, 0.007384076, 0.035123326, -0.07920549, + 0.053676967, 0.044480428, -0.07663568, 0.0071805613, + 0.08089997, 0.05143358, 0.038261272, 0.03339287, + -0.027673481, 0.044746667, 0.028349208, 0.020090483, + -0.019443132, -0.030755889, -0.0040000007, 0.04465846, + -0.021585021, 0.0031670958, 0.0053199246, -0.056117613, + -0.10893326, 0.076739706, -0.08509834, -0.027997585, + 0.037871376, 0.01449768, -0.09002357, -0.06111149, + -0.046195522, 0.0422062, -0.005683705, -0.1253618, + -0.012925729, -0.04890792, 0.06985068, 0.037654128, + 0.03398274, -0.004781977, 0.007032333, -0.031787455, + 0.010868644, -0.031489216, 0.09525667, 0.013939797, + 0.0058680447, 0.0167067, 0.02668468, -0.04797466, + -0.048885044, -0.12722108, 0.035304096, 0.06554885, + 0.00972396, -0.039238118, -0.05159735, -0.11329045, + 0.1613692, -0.03750952, 0.06529313, -0.071974665, + -0.11769596, 0.015524369, -0.0013754242, -0.12446318, + 0.02786344, -0.014179351, 0.005264273, 0.14376344, + 0.015983658, 0.03406988, -0.06939408, 0.040699873, + 0.02111075, 0.09669095, 0.041345075, -0.08316494, + -0.07684199, -0.045768797, 0.032298047, -0.041805092, + 0.0119405, 0.0061010392, 0.12652606, 0.0064572375, + -0.024950314, 0.11574242, 0.04508852, -0.04335324, + 0.06760663, -0.027437469, 0.07216407, 0.06977076, + -0.05438599, 0.034033038, -0.028602652, 0.05346137, + 0.043184172, -0.037189785, 0.10420091, 0.00882477, + -0.054019816, -0.074273005, -0.030617684, -0.0028467078, + 0.024302477, -0.0038869337, 0.005332455, 0.0013399826, + 0.04361412, -0.007001822, 0.09631092, -0.06702025, + -0.042049985, -0.035070654, -0.04103342, -0.10273396, + 0.0544271, 0.037184782, -0.13150354, -0.0058036847, + -0.008264958, 0.042035464, 0.05891794, 0.029673764, + 0.0063542654, 0.044788733, 0.054816857, 0.062257513, + -0.00093483756, 0.048938446, -0.004952862, -0.007730018, + -0.04043371, -0.017094059, 0.07229206, -0.023670016, + -0.052195564, -0.025616996, -0.01520939, 0.045104615, + -0.007376126, 0.003533447, 0.006570588, 0.056037236, + 0.12436656, 0.051817212, 0.028532185, -0.08686856, + 0.11868599, 0.07663395, -0.07323171, 0.03463402, + -0.050708205, -0.04458982, -0.11590894, 0.021273347, + 0.1251325, -0.15313013, -0.12224372, 0.17228661, + 0.023029093, 0.086124025, 0.006445803, -0.03496501, + 0.028332196, 0.04449512, -0.042436164, -0.026587414, + -0.006041347, -0.09292539, -0.05678812, 0.03897832, + 0.09465633, 0.008115513, -0.02171956, 0.08304309, + 0.071401566, 0.019622514, 0.032163795, -0.004167056, + 0.02295182, 0.030739572, 0.056506045, 0.004612461, + 0.06524936, 0.059999723, 0.046395954, -0.0045512207, + -0.1335546, -0.030136576, 
0.11584653, -0.014678886, + 0.0020118146, -0.09688814, -0.0790206, 0.039770417, + -0.0329582, 0.07922767, 0.029322514, 0.026405897, + 0.04207835, -0.07073373, 0.063781224, 0.0859677, + -0.10925287, -0.07011058, 0.048005477, 0.03438226, + -0.09606514, -0.006669445, -0.043381985, 0.04240257, + -0.06955775, -0.06769346, 0.043903265, -0.026784198, + -0.017840602, 0.024307009, -0.040079936, -0.019946516, + 0.045318738, -0.12233574, 0.026170589, 0.0074471775, + 0.15978073, 0.10185836, 0.10298046, -0.015476589, + -0.039390966, -0.072174534, 0.0739445, -0.1211869, + -0.0347889, -0.07943156, 0.014809798, -0.12412325, + -0.0030663363, 0.039695457, 0.0647603, -0.08291318, + -0.018529687, -0.004423833, 0.0037507233, 0.084633216, + -0.01514876, -0.056505352, -0.012800942, -0.06994386, + 0.012962922, -0.031234352, 0.07029052, 0.016418684, + 0.03618972, 0.055686004, -0.08663945, -0.017404709, + -0.054761406, 0.029065743, 0.052404847, 0.020238016, + 0.0048197987, -0.0214882, 0.07078733, 0.013016777, + 0.06262858, 0.009184685, 0.020785125, -0.043904778, + -0.0270329, -0.03299152, -0.060088247, -0.015162964, + -0.001828936, 0.12642565, -0.056757294, 0.013586685, + 0.09232601, -0.035886683, 0.06000002, 0.05229691, + -0.052580316, -0.082029596, -0.010794592, 0.012947712, + -0.036429964, -0.085508935, -0.13127148, -0.017744139, + 0.031502828, 0.036232427, -0.031581745, 0.023051167, + -0.05325106, -0.03421577, 0.028793324, -0.034633752, + -0.009881397, -0.043551125, -0.018609839, 0.0019097115, + -0.008799762, 0.056595087, 0.0022273948, 0.055752404}; + + recurrent_to_forget_weights_ = { + -0.057784554, -0.026057621, -0.068447545, -0.022581743, + 0.14811787, 0.10826372, 0.09471067, 0.03987225, + -0.0039523416, 0.00030638507, 0.053185795, 0.10572994, + 0.08414449, -0.022036452, -0.00066928595, -0.09203576, + 0.032950465, -0.10985798, -0.023809856, 0.0021431844, + -0.02196096, -0.00326074, 0.00058621005, -0.074678116, + -0.06193199, 0.055729095, 0.03736828, 0.020123724, + 0.061878487, -0.04729229, 0.034919553, -0.07585433, + -0.04421272, -0.044019096, 0.085488975, 0.04058006, + -0.06890133, -0.030951202, -0.024628663, -0.07672815, + 0.034293607, 0.08556707, -0.05293577, -0.033561368, + -0.04899627, 0.0241671, 0.015736353, -0.095442444, + -0.029564252, 0.016493602, -0.035026584, 0.022337519, + -0.026871363, 0.004780428, 0.0077918363, -0.03601621, + 0.016435321, -0.03263031, -0.09543275, -0.047392778, + 0.013454138, 0.028934088, 0.01685226, -0.086110644, + -0.046250615, -0.01847454, 0.047608484, 0.07339695, + 0.034546845, -0.04881143, 0.009128804, -0.08802852, + 0.03761666, 0.008096139, -0.014454086, 0.014361001, + -0.023502491, -0.0011840804, -0.07607001, 0.001856849, + -0.06509276, -0.006021153, -0.08570962, -0.1451793, + 0.060212336, 0.055259194, 0.06974018, 0.049454916, + -0.027794661, -0.08077226, -0.016179763, 0.1169753, + 0.17213494, -0.0056326236, -0.053934924, -0.0124349, + -0.11520337, 0.05409887, 0.088759385, 0.0019655675, + 0.0042065294, 0.03881498, 0.019844765, 0.041858196, + -0.05695512, 0.047233116, 0.038937137, -0.06542224, + 0.014429736, -0.09719407, 0.13908425, -0.05379757, + 0.012321099, 0.082840554, -0.029899208, 0.044217527, + 0.059855383, 0.07711018, -0.045319796, 0.0948846, + -0.011724666, -0.0033288454, -0.033542685, -0.04764985, + -0.13873616, 0.040668588, 0.034832682, -0.015319203, + -0.018715994, 0.046002675, 0.0599172, -0.043107376, + 0.0294216, -0.002314414, -0.022424703, 0.0030315618, + 0.0014641669, 0.0029166266, -0.11878115, 0.013738511, + 0.12375372, -0.0006038222, 0.029104086, 
0.087442465, + 0.052958444, 0.07558703, 0.04817258, 0.044462286, + -0.015213451, -0.08783778, -0.0561384, -0.003008196, + 0.047060397, -0.002058388, 0.03429439, -0.018839769, + 0.024734668, 0.024614193, -0.042046934, 0.09597743, + -0.0043254104, 0.04320769, 0.0064070094, -0.0019131786, + -0.02558259, -0.022822596, -0.023273505, -0.02464396, + -0.10991725, -0.006240552, 0.0074488563, 0.024044557, + 0.04383914, -0.046476185, 0.028658995, 0.060410924, + 0.050786525, 0.009452605, -0.0073054377, -0.024810238, + 0.0052906186, 0.0066939713, -0.0020913032, 0.014515517, + 0.015898481, 0.021362653, -0.030262267, 0.016587038, + -0.011442813, 0.041154444, -0.007631438, -0.03423484, + -0.010977775, 0.036152758, 0.0066366293, 0.11915515, + 0.02318443, -0.041350313, 0.021485701, -0.10906167, + -0.028218046, -0.00954771, 0.020531068, -0.11995105, + -0.03672871, 0.024019798, 0.014255957, -0.05221243, + -0.00661567, -0.04630967, 0.033188973, 0.10107534, + -0.014027541, 0.030796422, -0.10270911, -0.035999842, + 0.15443139, 0.07684145, 0.036571592, -0.035900835, + -0.0034699554, 0.06209149, 0.015920248, -0.031122351, + -0.03858649, 0.01849943, 0.13872518, 0.01503974, + 0.069941424, -0.06948533, -0.0088794185, 0.061282158, + -0.047401894, 0.03100163, -0.041533746, -0.10430945, + 0.044574402, -0.01425562, -0.024290353, 0.034563623, + 0.05866852, 0.023947537, -0.09445152, 0.035450947, + 0.02247216, -0.0042998926, 0.061146557, -0.10250651, + 0.020881841, -0.06747029, 0.10062043, -0.0023941975, + 0.03532124, -0.016341697, 0.09685456, -0.016764693, + 0.051808182, 0.05875331, -0.04536488, 0.001626336, + -0.028892258, -0.01048663, -0.009793449, -0.017093895, + 0.010987891, 0.02357273, -0.00010856845, 0.0099760275, + -0.001845119, -0.03551521, 0.0018358806, 0.05763657, + -0.01769146, 0.040995963, 0.02235177, -0.060430344, + 0.11475477, -0.023854522, 0.10071741, 0.0686208, + -0.014250481, 0.034261297, 0.047418304, 0.08562733, + -0.030519066, 0.0060542435, 0.014653856, -0.038836084, + 0.04096551, 0.032249358, -0.08355519, -0.026823482, + 0.056386515, -0.010401743, -0.028396193, 0.08507674, + 0.014410365, 0.020995233, 0.17040324, 0.11511526, + 0.02459721, 0.0066619175, 0.025853224, -0.023133837, + -0.081302024, 0.017264642, -0.009585969, 0.09491168, + -0.051313367, 0.054532815, -0.014298593, 0.10657464, + 0.007076659, 0.10964551, 0.0409152, 0.008275321, + -0.07283536, 0.07937492, 0.04192024, -0.1075027}; + + recurrent_to_output_weights_ = { + 0.025825322, -0.05813119, 0.09495884, -0.045984812, + -0.01255415, -0.0026479573, -0.08196161, -0.054914974, + -0.0046604523, -0.029587349, -0.044576716, -0.07480124, + -0.082868785, 0.023254942, 0.027502948, -0.0039728214, + -0.08683098, -0.08116779, -0.014675607, -0.037924774, + -0.023314456, -0.007401714, -0.09255757, 0.029460307, + -0.08829125, -0.005139627, -0.08989442, -0.0555066, + 0.13596267, -0.025062224, -0.048351806, -0.03850004, + 0.07266485, -0.022414139, 0.05940088, 0.075114764, + 0.09597592, -0.010211725, -0.0049794707, -0.011523867, + -0.025980417, 0.072999895, 0.11091378, -0.081685916, + 0.014416728, 0.043229222, 0.034178585, -0.07530371, + 0.035837382, -0.085607, -0.007721233, -0.03287832, + -0.043848954, -0.06404588, -0.06632928, -0.073643476, + 0.008214239, -0.045984086, 0.039764922, 0.03474462, + 0.060612556, -0.080590084, 0.049127717, 0.04151091, + -0.030063879, 0.008801774, -0.023021035, -0.019558564, + 0.05158114, -0.010947698, -0.011825728, 0.0075720972, + 0.0699727, -0.0039981045, 0.069350146, 0.08799282, + 0.016156472, 0.035502106, 0.11695009, 0.006217345, 
+ 0.13392477, -0.037875112, 0.025745004, 0.08940699, + -0.00924166, 0.0046702605, -0.036598757, -0.08811812, + 0.10522024, -0.032441203, 0.008176899, -0.04454919, + 0.07058152, 0.0067963637, 0.039206743, 0.03259838, + 0.03725492, -0.09515802, 0.013326398, -0.052055415, + -0.025676316, 0.03198509, -0.015951829, -0.058556724, + 0.036879618, 0.043357447, 0.028362012, -0.05908629, + 0.0059240665, -0.04995891, -0.019187413, 0.0276265, + -0.01628143, 0.0025863599, 0.08800015, 0.035250366, + -0.022165963, -0.07328642, -0.009415526, -0.07455109, + 0.11690406, 0.0363299, 0.07411125, 0.042103454, + -0.009660886, 0.019076364, 0.018299393, -0.046004917, + 0.08891175, 0.0431396, -0.026327137, -0.051502608, + 0.08979574, -0.051670972, 0.04940282, -0.07491107, + -0.021240504, 0.022596184, -0.034280192, 0.060163025, + -0.058211457, -0.051837247, -0.01349775, -0.04639988, + -0.035936575, -0.011681591, 0.064818054, 0.0073146066, + -0.021745546, -0.043124277, -0.06471268, -0.07053354, + -0.029321948, -0.05330136, 0.016933719, -0.053782392, + 0.13747959, -0.1361751, -0.11569455, 0.0033329215, + 0.05693899, -0.053219706, 0.063698, 0.07977434, + -0.07924483, 0.06936997, 0.0034815092, -0.007305279, + -0.037325785, -0.07251102, -0.033633437, -0.08677009, + 0.091591336, -0.14165086, 0.021752775, 0.019683983, + 0.0011612234, -0.058154266, 0.049996935, 0.0288841, + -0.0024567875, -0.14345716, 0.010955264, -0.10234828, + 0.1183656, -0.0010731248, -0.023590032, -0.072285876, + -0.0724771, -0.026382286, -0.0014920527, 0.042667855, + 0.0018776858, 0.02986552, 0.009814309, 0.0733756, + 0.12289186, 0.018043943, -0.0458958, 0.049412545, + 0.033632483, 0.05495232, 0.036686596, -0.013781798, + -0.010036754, 0.02576849, -0.08307328, 0.010112348, + 0.042521734, -0.05869831, -0.071689695, 0.03876447, + -0.13275425, -0.0352966, -0.023077697, 0.10285965, + 0.084736146, 0.15568255, -0.00040734606, 0.027835453, + -0.10292561, -0.032401145, 0.10053256, -0.026142767, + -0.08271222, -0.0030240538, -0.016368777, 0.1070414, + 0.042672627, 0.013456989, -0.0437609, -0.022309763, + 0.11576483, 0.04108048, 0.061026827, -0.0190714, + -0.0869359, 0.037901703, 0.0610107, 0.07202949, + 0.01675338, 0.086139716, -0.08795751, -0.014898893, + -0.023771819, -0.01965048, 0.007955471, -0.043740474, + 0.03346837, -0.10549954, 0.090567775, 0.042013682, + -0.03176985, 0.12569028, -0.02421228, -0.029526481, + 0.023851605, 0.031539805, 0.05292009, -0.02344001, + -0.07811758, -0.08834428, 0.10094801, 0.16594367, + -0.06861939, -0.021256343, -0.041093912, -0.06669611, + 0.035498552, 0.021757556, -0.09302526, -0.015403468, + -0.06614931, -0.051798206, -0.013874718, 0.03630673, + 0.010412845, -0.08077351, 0.046185967, 0.0035662893, + 0.03541868, -0.094149634, -0.034814864, 0.003128424, + -0.020674974, -0.03944324, -0.008110165, -0.11113267, + 0.08484226, 0.043586485, 0.040582247, 0.0968012, + -0.065249965, -0.028036479, 0.0050708856, 0.0017462453, + 0.0326779, 0.041296225, 0.09164146, -0.047743853, + -0.015952192, -0.034451712, 0.084197424, -0.05347844, + -0.11768019, 0.085926116, -0.08251791, -0.045081906, + 0.0948852, 0.068401024, 0.024856757, 0.06978981, + -0.057309967, -0.012775832, -0.0032452994, 0.01977615, + -0.041040014, -0.024264973, 0.063464895, 0.05431621, + }; + + cell_to_input_weights_ = { + 0.040369894, 0.030746894, 0.24704495, 0.018586371, -0.037586458, + -0.15312155, -0.11812848, -0.11465643, 0.20259799, 0.11418174, + -0.10116027, -0.011334949, 0.12411352, -0.076769054, -0.052169047, + 0.21198851, -0.38871562, -0.09061183, -0.09683246, 
-0.21929175}; + + cell_to_forget_weights_ = { + -0.01998659, -0.15568835, -0.24248174, -0.012770197, 0.041331276, + -0.072311886, -0.052123554, -0.0066330447, -0.043891653, 0.036225766, + -0.047248036, 0.021479502, 0.033189066, 0.11952997, -0.020432774, + 0.64658105, -0.06650122, -0.03467612, 0.095340036, 0.23647355}; + + cell_to_output_weights_ = { + 0.08286371, -0.08261836, -0.51210177, 0.002913762, 0.17764764, + -0.5495371, -0.08460716, -0.24552552, 0.030037103, 0.04123544, + -0.11940523, 0.007358328, 0.1890978, 0.4833202, -0.34441817, + 0.36312827, -0.26375428, 0.1457655, -0.19724406, 0.15548733}; + + projection_weights_ = { + -0.009802181, 0.09401916, 0.0717386, -0.13895074, + 0.09641832, 0.060420845, 0.08539281, 0.054285463, + 0.061395317, 0.034448683, -0.042991187, 0.019801661, + -0.16840284, -0.015726732, -0.23041931, -0.024478018, + -0.10959692, -0.013875541, 0.18600968, -0.061274476, + 0.0138165, -0.08160894, -0.07661644, 0.032372914, + 0.16169067, 0.22465782, -0.03993472, -0.004017731, + 0.08633481, -0.28869787, 0.08682067, 0.17240396, + 0.014975425, 0.056431185, 0.031037588, 0.16702051, + 0.0077946745, 0.15140012, 0.29405436, 0.120285, + -0.188994, -0.027265169, 0.043389652, -0.022061434, + 0.014777949, -0.20203483, 0.094781205, 0.19100232, + 0.13987629, -0.036132768, -0.06426278, -0.05108664, + 0.13221376, 0.009441198, -0.16715929, 0.15859416, + -0.040437475, 0.050779544, -0.022187516, 0.012166504, + 0.027685808, -0.07675938, -0.0055694645, -0.09444123, + 0.0046453946, 0.050794356, 0.10770313, -0.20790008, + -0.07149004, -0.11425117, 0.008225835, -0.035802525, + 0.14374903, 0.15262283, 0.048710253, 0.1847461, + -0.007487823, 0.11000021, -0.09542012, 0.22619456, + -0.029149994, 0.08527916, 0.009043713, 0.0042746216, + 0.016261552, 0.022461696, 0.12689082, -0.043589946, + -0.12035478, -0.08361797, -0.050666027, -0.1248618, + -0.1275799, -0.071875185, 0.07377272, 0.09944291, + -0.18897448, -0.1593054, -0.06526116, -0.040107165, + -0.004618631, -0.067624845, -0.007576253, 0.10727444, + 0.041546922, -0.20424393, 0.06907816, 0.050412357, + 0.00724631, 0.039827548, 0.12449835, 0.10747581, + 0.13708383, 0.09134148, -0.12617786, -0.06428341, + 0.09956831, 0.1208086, -0.14676677, -0.0727722, + 0.1126304, 0.010139365, 0.015571211, -0.038128063, + 0.022913318, -0.042050496, 0.16842307, -0.060597885, + 0.10531834, -0.06411776, -0.07451711, -0.03410368, + -0.13393489, 0.06534304, 0.003620307, 0.04490757, + 0.05970546, 0.05197996, 0.02839995, 0.10434969, + -0.013699693, -0.028353551, -0.07260381, 0.047201227, + -0.024575593, -0.036445823, 0.07155557, 0.009672501, + -0.02328883, 0.009533515, -0.03606021, -0.07421458, + -0.028082801, -0.2678904, -0.13221288, 0.18419984, + -0.13012612, -0.014588381, -0.035059117, -0.04824723, + 0.07830115, -0.056184657, 0.03277091, 0.025466874, + 0.14494097, -0.12522776, -0.098633975, -0.10766018, + -0.08317623, 0.08594209, 0.07749552, 0.039474737, + 0.1776665, -0.07409566, -0.0477268, 0.29323658, + 0.10801441, 0.1154011, 0.013952499, 0.10739139, + 0.10708251, -0.051456142, 0.0074137426, -0.10430189, + 0.10034707, 0.045594677, 0.0635285, -0.0715442, + -0.089667566, -0.10811871, 0.00026344223, 0.08298446, + -0.009525053, 0.006585689, -0.24567553, -0.09450807, + 0.09648481, 0.026996298, -0.06419476, -0.04752702, + -0.11063944, -0.23441927, -0.17608605, -0.052156363, + 0.067035615, 0.19271925, -0.0032889997, -0.043264326, + 0.09663576, -0.057112187, -0.10100678, 0.0628376, + 0.04447668, 0.017961001, -0.10094388, -0.10190601, + 0.18335468, 0.10494553, -0.052095775, 
-0.0026118709, + 0.10539724, -0.04383912, -0.042349473, 0.08438151, + -0.1947263, 0.02251204, 0.11216432, -0.10307853, + 0.17351969, -0.039091777, 0.08066188, -0.00561982, + 0.12633002, 0.11335965, -0.0088127935, -0.019777594, + 0.06864014, -0.059751723, 0.016233567, -0.06894641, + -0.28651384, -0.004228674, 0.019708522, -0.16305895, + -0.07468996, -0.0855457, 0.099339016, -0.07580735, + -0.13775392, 0.08434318, 0.08330512, -0.12131499, + 0.031935584, 0.09180414, -0.08876437, -0.08049874, + 0.008753825, 0.03498998, 0.030215185, 0.03907079, + 0.089751154, 0.029194152, -0.03337423, -0.019092513, + 0.04331237, 0.04299654, -0.036394123, -0.12915532, + 0.09793732, 0.07512415, -0.11319543, -0.032502122, + 0.15661901, 0.07671967, -0.005491124, -0.19379048, + -0.218606, 0.21448623, 0.017840758, 0.1416943, + -0.07051762, 0.19488361, 0.02664691, -0.18104725, + -0.09334311, 0.15026465, -0.15493552, -0.057762887, + -0.11604192, -0.262013, -0.01391798, 0.012185008, + 0.11156489, -0.07483202, 0.06693364, -0.26151478, + 0.046425626, 0.036540434, -0.16435726, 0.17338543, + -0.21401681, -0.11385144, -0.08283257, -0.069031075, + 0.030635102, 0.010969227, 0.11109743, 0.010919218, + 0.027526086, 0.13519906, 0.01891392, -0.046839405, + -0.040167913, 0.017953383, -0.09700955, 0.0061885654, + -0.07000971, 0.026893595, -0.038844477, 0.14543656}; + + lstm_input_ = { + {// Batch0: 4 (input_sequence_size) * 5 (n_input) + 0.787926, 0.151646, 0.071352, 0.118426, 0.458058, // step 0 + 0.596268, 0.998386, 0.568695, 0.864524, 0.571277, // step 1 + 0.073204, 0.296072, 0.743333, 0.069199, 0.045348, // step 2 + 0.867394, 0.291279, 0.013714, 0.482521, 0.626339}, // step 3 + + {// Batch1: 4 (input_sequence_size) * 5 (n_input) + 0.295743, 0.544053, 0.690064, 0.858138, 0.497181, // step 0 + 0.642421, 0.524260, 0.134799, 0.003639, 0.162482, // step 1 + 0.640394, 0.930399, 0.050782, 0.432485, 0.988078, // step 2 + 0.082922, 0.563329, 0.865614, 0.333232, 0.259916} // step 3 + }; + + lstm_golden_output_ = { + {// Batch0: 4 (input_sequence_size) * 16 (n_output) + -0.00396806, 0.029352, -0.00279226, 0.0159977, -0.00835576, + -0.0211779, 0.0283512, -0.0114597, 0.00907307, -0.0244004, + -0.0152191, -0.0259063, 0.00914318, 0.00415118, 0.017147, + 0.0134203, -0.0166936, 0.0381209, 0.000889694, 0.0143363, + -0.0328911, -0.0234288, 0.0333051, -0.012229, 0.0110322, + -0.0457725, -0.000832209, -0.0202817, 0.0327257, 0.0121308, + 0.0155969, 0.0312091, -0.0213783, 0.0350169, 0.000324794, + 0.0276012, -0.0263374, -0.0371449, 0.0446149, -0.0205474, + 0.0103729, -0.0576349, -0.0150052, -0.0292043, 0.0376827, + 0.0136115, 0.0243435, 0.0354492, -0.0189322, 0.0464512, + -0.00251373, 0.0225745, -0.0308346, -0.0317124, 0.0460407, + -0.0189395, 0.0149363, -0.0530162, -0.0150767, -0.0340193, + 0.0286833, 0.00824207, 0.0264887, 0.0305169}, + {// Batch1: 4 (input_sequence_size) * 16 (n_output) + -0.013869, 0.0287268, -0.00334693, 0.00733398, -0.0287926, + -0.0186926, 0.0193662, -0.0115437, 0.00422612, -0.0345232, + 0.00223253, -0.00957321, 0.0210624, 0.013331, 0.0150954, + 0.02168, -0.0141913, 0.0322082, 0.00227024, 0.0260507, + -0.0188721, -0.0296489, 0.0399134, -0.0160509, 0.0116039, + -0.0447318, -0.0150515, -0.0277406, 0.0316596, 0.0118233, + 0.0214762, 0.0293641, -0.0204549, 0.0450315, -0.00117378, + 0.0167673, -0.0375007, -0.0238314, 0.038784, -0.0174034, + 0.0131743, -0.0506589, -0.0048447, -0.0240239, 0.0325789, + 0.00790065, 0.0220157, 0.0333314, -0.0264787, 0.0387855, + -0.000764675, 0.0217599, -0.037537, -0.0335206, 0.0431679, + -0.0211424, 
0.010203, -0.062785, -0.00832363, -0.025181, + 0.0412031, 0.0118723, 0.0239643, 0.0394009}}; + } +}; + +TEST_F(NoCifgPeepholeProjectionClippingLstmTest, LstmBlackBoxTest) { const int n_batch = 2; const int n_input = 5; const int n_cell = 20; @@ -461,8 +1310,9 @@ TEST(LSTMOpTest, BlackBoxTestWithPeepholeWithProjectionNoClipping) { const int sequence_length = 4; UnidirectionalLSTMOpModel lstm( - n_batch, n_input, n_cell, n_output, sequence_length, /*use_cifg=*/false, - /*use_peephole=*/true, /*use_projection_weights=*/true, + n_batch, n_input, n_cell, n_output, sequence_length, + /*use_cifg=*/false, /*use_peephole=*/true, + /*use_projection_weights=*/true, /*use_projection_bias=*/false, /*cell_clip=*/0.0, /*proj_clip=*/0.0, { @@ -489,590 +1339,99 @@ TEST(LSTMOpTest, BlackBoxTestWithPeepholeWithProjectionNoClipping) { {n_output, n_cell}, // projection_weight tensor {0}, // projection_bias tensor + + {n_batch, n_output}, // activation_state tensor + {n_batch, n_cell}, // cell_state tensor }); - lstm.SetInputToInputWeights( - {0.021393683, 0.06124551, 0.046905167, -0.014657677, -0.03149463, - 0.09171803, 0.14647801, 0.10797193, -0.0057968358, 0.0019193048, - -0.2726754, 0.10154029, -0.018539885, 0.080349885, -0.10262385, - -0.022599787, -0.09121155, -0.008675967, -0.045206103, -0.0821282, - -0.008045952, 0.015478081, 0.055217247, 0.038719587, 0.044153627, - -0.06453243, 0.05031825, -0.046935108, -0.008164439, 0.014574226, - -0.1671009, -0.15519552, -0.16819797, -0.13971269, -0.11953059, - 0.25005487, -0.22790983, 0.009855087, -0.028140958, -0.11200698, - 0.11295408, -0.0035217577, 0.054485075, 0.05184695, 0.064711206, - 0.10989193, 0.11674786, 0.03490607, 0.07727357, 0.11390585, - -0.1863375, -0.1034451, -0.13945189, -0.049401227, -0.18767063, - 0.042483903, 0.14233552, 0.13832581, 0.18350165, 0.14545603, - -0.028545704, 0.024939531, 0.050929718, 0.0076203286, -0.0029723682, - -0.042484224, -0.11827596, -0.09171104, -0.10808628, -0.16327988, - -0.2273378, -0.0993647, -0.017155107, 0.0023917493, 0.049272764, - 0.0038534778, 0.054764505, 0.089753784, 0.06947234, 0.08014476, - -0.04544234, -0.0497073, -0.07135631, -0.048929106, -0.004042012, - -0.009284026, 0.018042054, 0.0036860977, -0.07427302, -0.11434604, - -0.018995456, 0.031487543, 0.012834908, 0.019977754, 0.044256654, - -0.39292613, -0.18519334, -0.11651281, -0.06809892, 0.011373677}); - - lstm.SetInputToForgetWeights( - {-0.0018401089, -0.004852237, 0.03698424, 0.014181704, 0.028273236, - -0.016726194, -0.05249759, -0.10204261, 0.00861066, -0.040979505, - -0.009899187, 0.01923892, -0.028177269, -0.08535103, -0.14585495, - 0.10662567, -0.01909731, -0.017883534, -0.0047269356, -0.045103323, - 0.0030784295, 0.076784775, 0.07463696, 0.094531395, 0.0814421, - -0.12257899, -0.033945758, -0.031303465, 0.045630626, 0.06843887, - -0.13492945, -0.012480007, -0.0811829, -0.07224499, -0.09628791, - 0.045100946, 0.0012300825, 0.013964662, 0.099372394, 0.02543059, - 0.06958324, 0.034257296, 0.0482646, 0.06267997, 0.052625068, - 0.12784666, 0.07077897, 0.025725935, 0.04165009, 0.07241905, - 0.018668644, -0.037377294, -0.06277783, -0.08833636, -0.040120605, - -0.011405586, -0.007808335, -0.010301386, -0.005102167, 0.027717464, - 0.05483423, 0.11449111, 0.11289652, 0.10939839, 0.13396506, - -0.08402166, -0.01901462, -0.044678304, -0.07720565, 0.014350063, - -0.11757958, -0.0652038, -0.08185733, -0.076754324, -0.092614375, - 0.10405491, 0.052960336, 0.035755895, 0.035839386, -0.012540553, - 0.036881298, 0.02913376, 0.03420159, 0.05448447, -0.054523353, - 
0.02582715, 0.02327355, -0.011857179, -0.0011980024, -0.034641717, - -0.026125094, -0.17582615, -0.15923657, -0.27486774, -0.0006143371, - 0.0001771948, -8.470171e-05, 0.02651807, 0.045790765, 0.06956496}); - - lstm.SetInputToCellWeights( - {-0.04580283, -0.09549462, -0.032418985, -0.06454633, - -0.043528453, 0.043018587, -0.049152344, -0.12418144, - -0.078985475, -0.07596889, 0.019484362, -0.11434962, - -0.0074034138, -0.06314844, -0.092981495, 0.0062155537, - -0.025034338, -0.0028890965, 0.048929527, 0.06235075, - 0.10665918, -0.032036792, -0.08505916, -0.10843358, - -0.13002433, -0.036816437, -0.02130134, -0.016518239, - 0.0047691227, -0.0025825808, 0.066017866, 0.029991534, - -0.10652836, -0.1037554, -0.13056071, -0.03266643, - -0.033702414, -0.006473424, -0.04611692, 0.014419339, - -0.025174323, 0.0396852, 0.081777506, 0.06157468, - 0.10210095, -0.009658194, 0.046511717, 0.03603906, - 0.0069369148, 0.015960095, -0.06507666, 0.09551598, - 0.053568836, 0.06408714, 0.12835667, -0.008714329, - -0.20211966, -0.12093674, 0.029450472, 0.2849013, - -0.029227901, 0.1164364, -0.08560263, 0.09941786, - -0.036999565, -0.028842626, -0.0033637602, -0.017012902, - -0.09720865, -0.11193351, -0.029155117, -0.017936034, - -0.009768936, -0.04223324, -0.036159635, 0.06505112, - -0.021742892, -0.023377212, -0.07221364, -0.06430552, - 0.05453865, 0.091149814, 0.06387331, 0.007518393, - 0.055960953, 0.069779344, 0.046411168, 0.10509911, - 0.07463894, 0.0075130584, 0.012850982, 0.04555431, - 0.056955688, 0.06555285, 0.050801456, -0.009862683, - 0.00826772, -0.026555609, -0.0073611983, -0.0014897042}); - - lstm.SetInputToOutputWeights( - {-0.0998932, -0.07201956, -0.052803773, -0.15629593, -0.15001918, - -0.07650751, 0.02359855, -0.075155355, -0.08037709, -0.15093534, - 0.029517552, -0.04751393, 0.010350531, -0.02664851, -0.016839722, - -0.023121163, 0.0077019283, 0.012851257, -0.05040649, -0.0129761, - -0.021737747, -0.038305793, -0.06870586, -0.01481247, -0.001285394, - 0.10124236, 0.083122835, 0.053313006, -0.062235646, -0.075637154, - -0.027833903, 0.029774971, 0.1130802, 0.09218906, 0.09506135, - -0.086665764, -0.037162706, -0.038880914, -0.035832845, -0.014481564, - -0.09825003, -0.12048569, -0.097665586, -0.05287633, -0.0964047, - -0.11366429, 0.035777505, 0.13568819, 0.052451383, 0.050649304, - 0.05798951, -0.021852335, -0.099848844, 0.014740475, -0.078897946, - 0.04974699, 0.014160473, 0.06973932, 0.04964942, 0.033364646, - 0.08190124, 0.025535367, 0.050893165, 0.048514254, 0.06945813, - -0.078907564, -0.06707616, -0.11844508, -0.09986688, -0.07509403, - 0.06263226, 0.14925587, 0.20188436, 0.12098451, 0.14639415, - 0.0015017595, -0.014267382, -0.03417257, 0.012711468, 0.0028300495, - -0.024758482, -0.05098548, -0.0821182, 0.014225672, 0.021544158, - 0.08949725, 0.07505268, -0.0020780868, 0.04908258, 0.06476295, - -0.022907063, 0.027562456, 0.040185735, 0.019567577, -0.015598739, - -0.049097303, -0.017121866, -0.083368234, -0.02332002, -0.0840956}); - - lstm.SetInputGateBias( - {0.02234832, 0.14757581, 0.18176508, 0.10380666, 0.053110216, - -0.06928846, -0.13942584, -0.11816189, 0.19483899, 0.03652339, - -0.10250295, 0.036714908, -0.18426876, 0.036065217, 0.21810818, - 0.02383196, -0.043370757, 0.08690144, -0.04444982, 0.00030581196}); - - lstm.SetForgetGateBias({0.035185695, -0.042891346, -0.03032477, 0.23027696, - 0.11098921, 0.15378423, 0.09263801, 0.09790885, - 0.09508917, 0.061199076, 0.07665568, -0.015443159, - -0.03499149, 0.046190713, 0.08895977, 0.10899629, - 0.40694186, 0.06030037, 
0.012413437, -0.06108739}); - - lstm.SetCellBias({-0.024379363, 0.0055531194, 0.23377132, 0.033463873, - -0.1483596, -0.10639995, -0.091433935, 0.058573797, - -0.06809782, -0.07889636, -0.043246906, -0.09829136, - -0.4279842, 0.034901652, 0.18797937, 0.0075234566, - 0.016178843, 0.1749513, 0.13975595, 0.92058027}); - - lstm.SetOutputGateBias( - {0.046159424, -0.0012809046, 0.03563469, 0.12648113, 0.027195795, - 0.35373217, -0.018957434, 0.008907322, -0.0762701, 0.12018895, - 0.04216877, 0.0022856654, 0.040952638, 0.3147856, 0.08225149, - -0.057416286, -0.14995944, -0.008040261, 0.13208859, 0.029760877}); - - lstm.SetRecurrentToInputWeights( - {-0.001374326, -0.078856036, 0.10672688, 0.029162422, - -0.11585556, 0.02557986, -0.13446963, -0.035785314, - -0.01244275, 0.025961924, -0.02337298, -0.044228926, - -0.055839065, -0.046598054, -0.010546039, -0.06900766, - 0.027239809, 0.022582639, -0.013296484, -0.05459212, - 0.08981, -0.045407712, 0.08682226, -0.06867011, - -0.14390695, -0.02916037, 0.000996957, 0.091420636, - 0.14283475, -0.07390571, -0.06402044, 0.062524505, - -0.093129106, 0.04860203, -0.08364217, -0.08119002, - 0.009352075, 0.22920375, 0.0016303885, 0.11583097, - -0.13732095, 0.012405723, -0.07551853, 0.06343048, - 0.12162708, -0.031923793, -0.014335606, 0.01790974, - -0.10650317, -0.0724401, 0.08554849, -0.05727212, - 0.06556731, -0.042729504, -0.043227166, 0.011683251, - -0.013082158, -0.029302018, -0.010899579, -0.062036745, - -0.022509435, -0.00964907, -0.01567329, 0.04260106, - -0.07787477, -0.11576462, 0.017356863, 0.048673786, - -0.017577527, -0.05527947, -0.082487635, -0.040137455, - -0.10820036, -0.04666372, 0.022746278, -0.07851417, - 0.01068115, 0.032956902, 0.022433773, 0.0026891115, - 0.08944216, -0.0685835, 0.010513544, 0.07228705, - 0.02032331, -0.059686817, -0.0005566496, -0.086984694, - 0.040414046, -0.1380399, 0.094208956, -0.05722982, - 0.012092817, -0.04989123, -0.086576, -0.003399834, - -0.04696032, -0.045747425, 0.10091314, 0.048676282, - -0.029037097, 0.031399418, -0.0040285117, 0.047237843, - 0.09504992, 0.041799378, -0.049185462, -0.031518843, - -0.10516937, 0.026374253, 0.10058866, -0.0033195973, - -0.041975245, 0.0073591834, 0.0033782164, -0.004325073, - -0.10167381, 0.042500053, -0.01447153, 0.06464186, - -0.017142897, 0.03312627, 0.009205989, 0.024138335, - -0.011337001, 0.035530265, -0.010912711, 0.0706555, - -0.005894094, 0.051841937, -0.1401738, -0.02351249, - 0.0365468, 0.07590991, 0.08838724, 0.021681072, - -0.10086113, 0.019608743, -0.06195883, 0.077335775, - 0.023646897, -0.095322326, 0.02233014, 0.09756986, - -0.048691444, -0.009579111, 0.07595467, 0.11480546, - -0.09801813, 0.019894179, 0.08502348, 0.004032281, - 0.037211012, 0.068537936, -0.048005626, -0.091520436, - -0.028379958, -0.01556313, 0.06554592, -0.045599163, - -0.01672207, -0.020169014, -0.011877351, -0.20212261, - 0.010889619, 0.0047078193, 0.038385306, 0.08540671, - -0.017140968, -0.0035865551, 0.016678626, 0.005633034, - 0.015963363, 0.00871737, 0.060130805, 0.028611384, - 0.10109069, -0.015060172, -0.07894427, 0.06401885, - 0.011584063, -0.024466386, 0.0047652307, -0.09041358, - 0.030737216, -0.0046374933, 0.14215417, -0.11823516, - 0.019899689, 0.006106124, -0.027092824, 0.0786356, - 0.05052217, -0.058925, -0.011402121, -0.024987547, - -0.0013661642, -0.06832946, -0.015667673, -0.1083353, - -0.00096863037, -0.06988685, -0.053350925, -0.027275559, - -0.033664223, -0.07978348, -0.025200296, -0.017207067, - -0.058403496, -0.055697463, 0.005798788, 0.12965427, - -0.062582195, 
0.0013350133, -0.10482091, 0.0379771, - 0.072521195, -0.0029455067, -0.13797039, -0.03628521, - 0.013806405, -0.017858358, -0.01008298, -0.07700066, - -0.017081132, 0.019358726, 0.0027079724, 0.004635139, - 0.062634714, -0.02338735, -0.039547626, -0.02050681, - 0.03385117, -0.083611414, 0.002862572, -0.09421313, - 0.058618143, -0.08598433, 0.00972939, 0.023867095, - -0.053934585, -0.023203006, 0.07452513, -0.048767887, - -0.07314807, -0.056307215, -0.10433547, -0.06440842, - 0.04328182, 0.04389765, -0.020006588, -0.09076438, - -0.11652589, -0.021705797, 0.03345259, -0.010329105, - -0.025767034, 0.013057034, -0.07316461, -0.10145612, - 0.06358255, 0.18531723, 0.07759293, 0.12006465, - 0.1305557, 0.058638252, -0.03393652, 0.09622831, - -0.16253184, -2.4580743e-06, 0.079869635, -0.070196845, - -0.005644518, 0.06857898, -0.12598175, -0.035084512, - 0.03156317, -0.12794146, -0.031963028, 0.04692781, - 0.030070418, 0.0071660685, -0.095516115, -0.004643372, - 0.040170413, -0.062104587, -0.0037324072, 0.0554317, - 0.08184801, -0.019164372, 0.06791302, 0.034257166, - -0.10307039, 0.021943003, 0.046745934, 0.0790918, - -0.0265588, -0.007824208, 0.042546265, -0.00977924, - -0.0002440307, -0.017384544, -0.017990116, 0.12252321, - -0.014512694, -0.08251313, 0.08861942, 0.13589665, - 0.026351685, 0.012641483, 0.07466548, 0.044301085, - -0.045414884, -0.051112458, 0.03444247, -0.08502782, - -0.04106223, -0.028126027, 0.028473156, 0.10467447}); - - lstm.SetRecurrentToForgetWeights( - {-0.057784554, -0.026057621, -0.068447545, -0.022581743, - 0.14811787, 0.10826372, 0.09471067, 0.03987225, - -0.0039523416, 0.00030638507, 0.053185795, 0.10572994, - 0.08414449, -0.022036452, -0.00066928595, -0.09203576, - 0.032950465, -0.10985798, -0.023809856, 0.0021431844, - -0.02196096, -0.00326074, 0.00058621005, -0.074678116, - -0.06193199, 0.055729095, 0.03736828, 0.020123724, - 0.061878487, -0.04729229, 0.034919553, -0.07585433, - -0.04421272, -0.044019096, 0.085488975, 0.04058006, - -0.06890133, -0.030951202, -0.024628663, -0.07672815, - 0.034293607, 0.08556707, -0.05293577, -0.033561368, - -0.04899627, 0.0241671, 0.015736353, -0.095442444, - -0.029564252, 0.016493602, -0.035026584, 0.022337519, - -0.026871363, 0.004780428, 0.0077918363, -0.03601621, - 0.016435321, -0.03263031, -0.09543275, -0.047392778, - 0.013454138, 0.028934088, 0.01685226, -0.086110644, - -0.046250615, -0.01847454, 0.047608484, 0.07339695, - 0.034546845, -0.04881143, 0.009128804, -0.08802852, - 0.03761666, 0.008096139, -0.014454086, 0.014361001, - -0.023502491, -0.0011840804, -0.07607001, 0.001856849, - -0.06509276, -0.006021153, -0.08570962, -0.1451793, - 0.060212336, 0.055259194, 0.06974018, 0.049454916, - -0.027794661, -0.08077226, -0.016179763, 0.1169753, - 0.17213494, -0.0056326236, -0.053934924, -0.0124349, - -0.11520337, 0.05409887, 0.088759385, 0.0019655675, - 0.0042065294, 0.03881498, 0.019844765, 0.041858196, - -0.05695512, 0.047233116, 0.038937137, -0.06542224, - 0.014429736, -0.09719407, 0.13908425, -0.05379757, - 0.012321099, 0.082840554, -0.029899208, 0.044217527, - 0.059855383, 0.07711018, -0.045319796, 0.0948846, - -0.011724666, -0.0033288454, -0.033542685, -0.04764985, - -0.13873616, 0.040668588, 0.034832682, -0.015319203, - -0.018715994, 0.046002675, 0.0599172, -0.043107376, - 0.0294216, -0.002314414, -0.022424703, 0.0030315618, - 0.0014641669, 0.0029166266, -0.11878115, 0.013738511, - 0.12375372, -0.0006038222, 0.029104086, 0.087442465, - 0.052958444, 0.07558703, 0.04817258, 0.044462286, - -0.015213451, -0.08783778, -0.0561384, 
-0.003008196, - 0.047060397, -0.002058388, 0.03429439, -0.018839769, - 0.024734668, 0.024614193, -0.042046934, 0.09597743, - -0.0043254104, 0.04320769, 0.0064070094, -0.0019131786, - -0.02558259, -0.022822596, -0.023273505, -0.02464396, - -0.10991725, -0.006240552, 0.0074488563, 0.024044557, - 0.04383914, -0.046476185, 0.028658995, 0.060410924, - 0.050786525, 0.009452605, -0.0073054377, -0.024810238, - 0.0052906186, 0.0066939713, -0.0020913032, 0.014515517, - 0.015898481, 0.021362653, -0.030262267, 0.016587038, - -0.011442813, 0.041154444, -0.007631438, -0.03423484, - -0.010977775, 0.036152758, 0.0066366293, 0.11915515, - 0.02318443, -0.041350313, 0.021485701, -0.10906167, - -0.028218046, -0.00954771, 0.020531068, -0.11995105, - -0.03672871, 0.024019798, 0.014255957, -0.05221243, - -0.00661567, -0.04630967, 0.033188973, 0.10107534, - -0.014027541, 0.030796422, -0.10270911, -0.035999842, - 0.15443139, 0.07684145, 0.036571592, -0.035900835, - -0.0034699554, 0.06209149, 0.015920248, -0.031122351, - -0.03858649, 0.01849943, 0.13872518, 0.01503974, - 0.069941424, -0.06948533, -0.0088794185, 0.061282158, - -0.047401894, 0.03100163, -0.041533746, -0.10430945, - 0.044574402, -0.01425562, -0.024290353, 0.034563623, - 0.05866852, 0.023947537, -0.09445152, 0.035450947, - 0.02247216, -0.0042998926, 0.061146557, -0.10250651, - 0.020881841, -0.06747029, 0.10062043, -0.0023941975, - 0.03532124, -0.016341697, 0.09685456, -0.016764693, - 0.051808182, 0.05875331, -0.04536488, 0.001626336, - -0.028892258, -0.01048663, -0.009793449, -0.017093895, - 0.010987891, 0.02357273, -0.00010856845, 0.0099760275, - -0.001845119, -0.03551521, 0.0018358806, 0.05763657, - -0.01769146, 0.040995963, 0.02235177, -0.060430344, - 0.11475477, -0.023854522, 0.10071741, 0.0686208, - -0.014250481, 0.034261297, 0.047418304, 0.08562733, - -0.030519066, 0.0060542435, 0.014653856, -0.038836084, - 0.04096551, 0.032249358, -0.08355519, -0.026823482, - 0.056386515, -0.010401743, -0.028396193, 0.08507674, - 0.014410365, 0.020995233, 0.17040324, 0.11511526, - 0.02459721, 0.0066619175, 0.025853224, -0.023133837, - -0.081302024, 0.017264642, -0.009585969, 0.09491168, - -0.051313367, 0.054532815, -0.014298593, 0.10657464, - 0.007076659, 0.10964551, 0.0409152, 0.008275321, - -0.07283536, 0.07937492, 0.04192024, -0.1075027}); - - lstm.SetRecurrentToCellWeights( - {-0.037322544, 0.018592842, 0.0056175636, -0.06253426, - 0.055647098, -0.05713207, -0.05626563, 0.005559383, - 0.03375411, -0.025757805, -0.088049285, 0.06017052, - -0.06570978, 0.007384076, 0.035123326, -0.07920549, - 0.053676967, 0.044480428, -0.07663568, 0.0071805613, - 0.08089997, 0.05143358, 0.038261272, 0.03339287, - -0.027673481, 0.044746667, 0.028349208, 0.020090483, - -0.019443132, -0.030755889, -0.0040000007, 0.04465846, - -0.021585021, 0.0031670958, 0.0053199246, -0.056117613, - -0.10893326, 0.076739706, -0.08509834, -0.027997585, - 0.037871376, 0.01449768, -0.09002357, -0.06111149, - -0.046195522, 0.0422062, -0.005683705, -0.1253618, - -0.012925729, -0.04890792, 0.06985068, 0.037654128, - 0.03398274, -0.004781977, 0.007032333, -0.031787455, - 0.010868644, -0.031489216, 0.09525667, 0.013939797, - 0.0058680447, 0.0167067, 0.02668468, -0.04797466, - -0.048885044, -0.12722108, 0.035304096, 0.06554885, - 0.00972396, -0.039238118, -0.05159735, -0.11329045, - 0.1613692, -0.03750952, 0.06529313, -0.071974665, - -0.11769596, 0.015524369, -0.0013754242, -0.12446318, - 0.02786344, -0.014179351, 0.005264273, 0.14376344, - 0.015983658, 0.03406988, -0.06939408, 0.040699873, - 0.02111075, 
0.09669095, 0.041345075, -0.08316494, - -0.07684199, -0.045768797, 0.032298047, -0.041805092, - 0.0119405, 0.0061010392, 0.12652606, 0.0064572375, - -0.024950314, 0.11574242, 0.04508852, -0.04335324, - 0.06760663, -0.027437469, 0.07216407, 0.06977076, - -0.05438599, 0.034033038, -0.028602652, 0.05346137, - 0.043184172, -0.037189785, 0.10420091, 0.00882477, - -0.054019816, -0.074273005, -0.030617684, -0.0028467078, - 0.024302477, -0.0038869337, 0.005332455, 0.0013399826, - 0.04361412, -0.007001822, 0.09631092, -0.06702025, - -0.042049985, -0.035070654, -0.04103342, -0.10273396, - 0.0544271, 0.037184782, -0.13150354, -0.0058036847, - -0.008264958, 0.042035464, 0.05891794, 0.029673764, - 0.0063542654, 0.044788733, 0.054816857, 0.062257513, - -0.00093483756, 0.048938446, -0.004952862, -0.007730018, - -0.04043371, -0.017094059, 0.07229206, -0.023670016, - -0.052195564, -0.025616996, -0.01520939, 0.045104615, - -0.007376126, 0.003533447, 0.006570588, 0.056037236, - 0.12436656, 0.051817212, 0.028532185, -0.08686856, - 0.11868599, 0.07663395, -0.07323171, 0.03463402, - -0.050708205, -0.04458982, -0.11590894, 0.021273347, - 0.1251325, -0.15313013, -0.12224372, 0.17228661, - 0.023029093, 0.086124025, 0.006445803, -0.03496501, - 0.028332196, 0.04449512, -0.042436164, -0.026587414, - -0.006041347, -0.09292539, -0.05678812, 0.03897832, - 0.09465633, 0.008115513, -0.02171956, 0.08304309, - 0.071401566, 0.019622514, 0.032163795, -0.004167056, - 0.02295182, 0.030739572, 0.056506045, 0.004612461, - 0.06524936, 0.059999723, 0.046395954, -0.0045512207, - -0.1335546, -0.030136576, 0.11584653, -0.014678886, - 0.0020118146, -0.09688814, -0.0790206, 0.039770417, - -0.0329582, 0.07922767, 0.029322514, 0.026405897, - 0.04207835, -0.07073373, 0.063781224, 0.0859677, - -0.10925287, -0.07011058, 0.048005477, 0.03438226, - -0.09606514, -0.006669445, -0.043381985, 0.04240257, - -0.06955775, -0.06769346, 0.043903265, -0.026784198, - -0.017840602, 0.024307009, -0.040079936, -0.019946516, - 0.045318738, -0.12233574, 0.026170589, 0.0074471775, - 0.15978073, 0.10185836, 0.10298046, -0.015476589, - -0.039390966, -0.072174534, 0.0739445, -0.1211869, - -0.0347889, -0.07943156, 0.014809798, -0.12412325, - -0.0030663363, 0.039695457, 0.0647603, -0.08291318, - -0.018529687, -0.004423833, 0.0037507233, 0.084633216, - -0.01514876, -0.056505352, -0.012800942, -0.06994386, - 0.012962922, -0.031234352, 0.07029052, 0.016418684, - 0.03618972, 0.055686004, -0.08663945, -0.017404709, - -0.054761406, 0.029065743, 0.052404847, 0.020238016, - 0.0048197987, -0.0214882, 0.07078733, 0.013016777, - 0.06262858, 0.009184685, 0.020785125, -0.043904778, - -0.0270329, -0.03299152, -0.060088247, -0.015162964, - -0.001828936, 0.12642565, -0.056757294, 0.013586685, - 0.09232601, -0.035886683, 0.06000002, 0.05229691, - -0.052580316, -0.082029596, -0.010794592, 0.012947712, - -0.036429964, -0.085508935, -0.13127148, -0.017744139, - 0.031502828, 0.036232427, -0.031581745, 0.023051167, - -0.05325106, -0.03421577, 0.028793324, -0.034633752, - -0.009881397, -0.043551125, -0.018609839, 0.0019097115, - -0.008799762, 0.056595087, 0.0022273948, 0.055752404}); - - lstm.SetRecurrentToOutputWeights({ - 0.025825322, -0.05813119, 0.09495884, -0.045984812, -0.01255415, - -0.0026479573, -0.08196161, -0.054914974, -0.0046604523, -0.029587349, - -0.044576716, -0.07480124, -0.082868785, 0.023254942, 0.027502948, - -0.0039728214, -0.08683098, -0.08116779, -0.014675607, -0.037924774, - -0.023314456, -0.007401714, -0.09255757, 0.029460307, -0.08829125, - -0.005139627, 
-0.08989442, -0.0555066, 0.13596267, -0.025062224, - -0.048351806, -0.03850004, 0.07266485, -0.022414139, 0.05940088, - 0.075114764, 0.09597592, -0.010211725, -0.0049794707, -0.011523867, - -0.025980417, 0.072999895, 0.11091378, -0.081685916, 0.014416728, - 0.043229222, 0.034178585, -0.07530371, 0.035837382, -0.085607, - -0.007721233, -0.03287832, -0.043848954, -0.06404588, -0.06632928, - -0.073643476, 0.008214239, -0.045984086, 0.039764922, 0.03474462, - 0.060612556, -0.080590084, 0.049127717, 0.04151091, -0.030063879, - 0.008801774, -0.023021035, -0.019558564, 0.05158114, -0.010947698, - -0.011825728, 0.0075720972, 0.0699727, -0.0039981045, 0.069350146, - 0.08799282, 0.016156472, 0.035502106, 0.11695009, 0.006217345, - 0.13392477, -0.037875112, 0.025745004, 0.08940699, -0.00924166, - 0.0046702605, -0.036598757, -0.08811812, 0.10522024, -0.032441203, - 0.008176899, -0.04454919, 0.07058152, 0.0067963637, 0.039206743, - 0.03259838, 0.03725492, -0.09515802, 0.013326398, -0.052055415, - -0.025676316, 0.03198509, -0.015951829, -0.058556724, 0.036879618, - 0.043357447, 0.028362012, -0.05908629, 0.0059240665, -0.04995891, - -0.019187413, 0.0276265, -0.01628143, 0.0025863599, 0.08800015, - 0.035250366, -0.022165963, -0.07328642, -0.009415526, -0.07455109, - 0.11690406, 0.0363299, 0.07411125, 0.042103454, -0.009660886, - 0.019076364, 0.018299393, -0.046004917, 0.08891175, 0.0431396, - -0.026327137, -0.051502608, 0.08979574, -0.051670972, 0.04940282, - -0.07491107, -0.021240504, 0.022596184, -0.034280192, 0.060163025, - -0.058211457, -0.051837247, -0.01349775, -0.04639988, -0.035936575, - -0.011681591, 0.064818054, 0.0073146066, -0.021745546, -0.043124277, - -0.06471268, -0.07053354, -0.029321948, -0.05330136, 0.016933719, - -0.053782392, 0.13747959, -0.1361751, -0.11569455, 0.0033329215, - 0.05693899, -0.053219706, 0.063698, 0.07977434, -0.07924483, - 0.06936997, 0.0034815092, -0.007305279, -0.037325785, -0.07251102, - -0.033633437, -0.08677009, 0.091591336, -0.14165086, 0.021752775, - 0.019683983, 0.0011612234, -0.058154266, 0.049996935, 0.0288841, - -0.0024567875, -0.14345716, 0.010955264, -0.10234828, 0.1183656, - -0.0010731248, -0.023590032, -0.072285876, -0.0724771, -0.026382286, - -0.0014920527, 0.042667855, 0.0018776858, 0.02986552, 0.009814309, - 0.0733756, 0.12289186, 0.018043943, -0.0458958, 0.049412545, - 0.033632483, 0.05495232, 0.036686596, -0.013781798, -0.010036754, - 0.02576849, -0.08307328, 0.010112348, 0.042521734, -0.05869831, - -0.071689695, 0.03876447, -0.13275425, -0.0352966, -0.023077697, - 0.10285965, 0.084736146, 0.15568255, -0.00040734606, 0.027835453, - -0.10292561, -0.032401145, 0.10053256, -0.026142767, -0.08271222, - -0.0030240538, -0.016368777, 0.1070414, 0.042672627, 0.013456989, - -0.0437609, -0.022309763, 0.11576483, 0.04108048, 0.061026827, - -0.0190714, -0.0869359, 0.037901703, 0.0610107, 0.07202949, - 0.01675338, 0.086139716, -0.08795751, -0.014898893, -0.023771819, - -0.01965048, 0.007955471, -0.043740474, 0.03346837, -0.10549954, - 0.090567775, 0.042013682, -0.03176985, 0.12569028, -0.02421228, - -0.029526481, 0.023851605, 0.031539805, 0.05292009, -0.02344001, - -0.07811758, -0.08834428, 0.10094801, 0.16594367, -0.06861939, - -0.021256343, -0.041093912, -0.06669611, 0.035498552, 0.021757556, - -0.09302526, -0.015403468, -0.06614931, -0.051798206, -0.013874718, - 0.03630673, 0.010412845, -0.08077351, 0.046185967, 0.0035662893, - 0.03541868, -0.094149634, -0.034814864, 0.003128424, -0.020674974, - -0.03944324, -0.008110165, -0.11113267, 0.08484226, 0.043586485, 
- 0.040582247, 0.0968012, -0.065249965, -0.028036479, 0.0050708856, - 0.0017462453, 0.0326779, 0.041296225, 0.09164146, -0.047743853, - -0.015952192, -0.034451712, 0.084197424, -0.05347844, -0.11768019, - 0.085926116, -0.08251791, -0.045081906, 0.0948852, 0.068401024, - 0.024856757, 0.06978981, -0.057309967, -0.012775832, -0.0032452994, - 0.01977615, -0.041040014, -0.024264973, 0.063464895, 0.05431621, - }); - - lstm.SetCellToInputWeights( - {0.040369894, 0.030746894, 0.24704495, 0.018586371, -0.037586458, - -0.15312155, -0.11812848, -0.11465643, 0.20259799, 0.11418174, - -0.10116027, -0.011334949, 0.12411352, -0.076769054, -0.052169047, - 0.21198851, -0.38871562, -0.09061183, -0.09683246, -0.21929175}); - - lstm.SetCellToForgetWeights( - {-0.01998659, -0.15568835, -0.24248174, -0.012770197, 0.041331276, - -0.072311886, -0.052123554, -0.0066330447, -0.043891653, 0.036225766, - -0.047248036, 0.021479502, 0.033189066, 0.11952997, -0.020432774, - 0.64658105, -0.06650122, -0.03467612, 0.095340036, 0.23647355}); - - lstm.SetCellToOutputWeights( - {0.08286371, -0.08261836, -0.51210177, 0.002913762, 0.17764764, - -0.5495371, -0.08460716, -0.24552552, 0.030037103, 0.04123544, - -0.11940523, 0.007358328, 0.1890978, 0.4833202, -0.34441817, - 0.36312827, -0.26375428, 0.1457655, -0.19724406, 0.15548733}); - - lstm.SetProjectionWeights( - {-0.009802181, 0.09401916, 0.0717386, -0.13895074, 0.09641832, - 0.060420845, 0.08539281, 0.054285463, 0.061395317, 0.034448683, - -0.042991187, 0.019801661, -0.16840284, -0.015726732, -0.23041931, - -0.024478018, -0.10959692, -0.013875541, 0.18600968, -0.061274476, - 0.0138165, -0.08160894, -0.07661644, 0.032372914, 0.16169067, - 0.22465782, -0.03993472, -0.004017731, 0.08633481, -0.28869787, - 0.08682067, 0.17240396, 0.014975425, 0.056431185, 0.031037588, - 0.16702051, 0.0077946745, 0.15140012, 0.29405436, 0.120285, - -0.188994, -0.027265169, 0.043389652, -0.022061434, 0.014777949, - -0.20203483, 0.094781205, 0.19100232, 0.13987629, -0.036132768, - -0.06426278, -0.05108664, 0.13221376, 0.009441198, -0.16715929, - 0.15859416, -0.040437475, 0.050779544, -0.022187516, 0.012166504, - 0.027685808, -0.07675938, -0.0055694645, -0.09444123, 0.0046453946, - 0.050794356, 0.10770313, -0.20790008, -0.07149004, -0.11425117, - 0.008225835, -0.035802525, 0.14374903, 0.15262283, 0.048710253, - 0.1847461, -0.007487823, 0.11000021, -0.09542012, 0.22619456, - -0.029149994, 0.08527916, 0.009043713, 0.0042746216, 0.016261552, - 0.022461696, 0.12689082, -0.043589946, -0.12035478, -0.08361797, - -0.050666027, -0.1248618, -0.1275799, -0.071875185, 0.07377272, - 0.09944291, -0.18897448, -0.1593054, -0.06526116, -0.040107165, - -0.004618631, -0.067624845, -0.007576253, 0.10727444, 0.041546922, - -0.20424393, 0.06907816, 0.050412357, 0.00724631, 0.039827548, - 0.12449835, 0.10747581, 0.13708383, 0.09134148, -0.12617786, - -0.06428341, 0.09956831, 0.1208086, -0.14676677, -0.0727722, - 0.1126304, 0.010139365, 0.015571211, -0.038128063, 0.022913318, - -0.042050496, 0.16842307, -0.060597885, 0.10531834, -0.06411776, - -0.07451711, -0.03410368, -0.13393489, 0.06534304, 0.003620307, - 0.04490757, 0.05970546, 0.05197996, 0.02839995, 0.10434969, - -0.013699693, -0.028353551, -0.07260381, 0.047201227, -0.024575593, - -0.036445823, 0.07155557, 0.009672501, -0.02328883, 0.009533515, - -0.03606021, -0.07421458, -0.028082801, -0.2678904, -0.13221288, - 0.18419984, -0.13012612, -0.014588381, -0.035059117, -0.04824723, - 0.07830115, -0.056184657, 0.03277091, 0.025466874, 0.14494097, - -0.12522776, 
-0.098633975, -0.10766018, -0.08317623, 0.08594209, - 0.07749552, 0.039474737, 0.1776665, -0.07409566, -0.0477268, - 0.29323658, 0.10801441, 0.1154011, 0.013952499, 0.10739139, - 0.10708251, -0.051456142, 0.0074137426, -0.10430189, 0.10034707, - 0.045594677, 0.0635285, -0.0715442, -0.089667566, -0.10811871, - 0.00026344223, 0.08298446, -0.009525053, 0.006585689, -0.24567553, - -0.09450807, 0.09648481, 0.026996298, -0.06419476, -0.04752702, - -0.11063944, -0.23441927, -0.17608605, -0.052156363, 0.067035615, - 0.19271925, -0.0032889997, -0.043264326, 0.09663576, -0.057112187, - -0.10100678, 0.0628376, 0.04447668, 0.017961001, -0.10094388, - -0.10190601, 0.18335468, 0.10494553, -0.052095775, -0.0026118709, - 0.10539724, -0.04383912, -0.042349473, 0.08438151, -0.1947263, - 0.02251204, 0.11216432, -0.10307853, 0.17351969, -0.039091777, - 0.08066188, -0.00561982, 0.12633002, 0.11335965, -0.0088127935, - -0.019777594, 0.06864014, -0.059751723, 0.016233567, -0.06894641, - -0.28651384, -0.004228674, 0.019708522, -0.16305895, -0.07468996, - -0.0855457, 0.099339016, -0.07580735, -0.13775392, 0.08434318, - 0.08330512, -0.12131499, 0.031935584, 0.09180414, -0.08876437, - -0.08049874, 0.008753825, 0.03498998, 0.030215185, 0.03907079, - 0.089751154, 0.029194152, -0.03337423, -0.019092513, 0.04331237, - 0.04299654, -0.036394123, -0.12915532, 0.09793732, 0.07512415, - -0.11319543, -0.032502122, 0.15661901, 0.07671967, -0.005491124, - -0.19379048, -0.218606, 0.21448623, 0.017840758, 0.1416943, - -0.07051762, 0.19488361, 0.02664691, -0.18104725, -0.09334311, - 0.15026465, -0.15493552, -0.057762887, -0.11604192, -0.262013, - -0.01391798, 0.012185008, 0.11156489, -0.07483202, 0.06693364, - -0.26151478, 0.046425626, 0.036540434, -0.16435726, 0.17338543, - -0.21401681, -0.11385144, -0.08283257, -0.069031075, 0.030635102, - 0.010969227, 0.11109743, 0.010919218, 0.027526086, 0.13519906, - 0.01891392, -0.046839405, -0.040167913, 0.017953383, -0.09700955, - 0.0061885654, -0.07000971, 0.026893595, -0.038844477, 0.14543656}); - - static float lstm_input[][20] = { - {// Batch0: 4 (input_sequence_size) * 5 (n_input) - 0.787926, 0.151646, 0.071352, 0.118426, 0.458058, 0.596268, 0.998386, - 0.568695, 0.864524, 0.571277, 0.073204, 0.296072, 0.743333, 0.069199, - 0.045348, 0.867394, 0.291279, 0.013714, 0.482521, 0.626339}, - - {// Batch1: 4 (input_sequence_size) * 5 (n_input) - 0.295743, 0.544053, 0.690064, 0.858138, 0.497181, 0.642421, 0.524260, - 0.134799, 0.003639, 0.162482, 0.640394, 0.930399, 0.050782, 0.432485, - 0.988078, 0.082922, 0.563329, 0.865614, 0.333232, 0.259916}}; - - static float lstm_golden_output[][64] = { - {// Batch0: 4 (input_sequence_size) * 16 (n_output) - -0.00396806, 0.029352, -0.00279226, 0.0159977, -0.00835576, - -0.0211779, 0.0283512, -0.0114597, 0.00907307, -0.0244004, - -0.0152191, -0.0259063, 0.00914318, 0.00415118, 0.017147, - 0.0134203, -0.0166936, 0.0381209, 0.000889694, 0.0143363, - -0.0328911, -0.0234288, 0.0333051, -0.012229, 0.0110322, - -0.0457725, -0.000832209, -0.0202817, 0.0327257, 0.0121308, - 0.0155969, 0.0312091, -0.0213783, 0.0350169, 0.000324794, - 0.0276012, -0.0263374, -0.0371449, 0.0446149, -0.0205474, - 0.0103729, -0.0576349, -0.0150052, -0.0292043, 0.0376827, - 0.0136115, 0.0243435, 0.0354492, -0.0189322, 0.0464512, - -0.00251373, 0.0225745, -0.0308346, -0.0317124, 0.0460407, - -0.0189395, 0.0149363, -0.0530162, -0.0150767, -0.0340193, - 0.0286833, 0.00824207, 0.0264887, 0.0305169}, - {// Batch1: 4 (input_sequence_size) * 16 (n_output) - -0.013869, 0.0287268, -0.00334693, 
0.00733398, -0.0287926, - -0.0186926, 0.0193662, -0.0115437, 0.00422612, -0.0345232, - 0.00223253, -0.00957321, 0.0210624, 0.013331, 0.0150954, - 0.02168, -0.0141913, 0.0322082, 0.00227024, 0.0260507, - -0.0188721, -0.0296489, 0.0399134, -0.0160509, 0.0116039, - -0.0447318, -0.0150515, -0.0277406, 0.0316596, 0.0118233, - 0.0214762, 0.0293641, -0.0204549, 0.0450315, -0.00117378, - 0.0167673, -0.0375007, -0.0238314, 0.038784, -0.0174034, - 0.0131743, -0.0506589, -0.0048447, -0.0240239, 0.0325789, - 0.00790065, 0.0220157, 0.0333314, -0.0264787, 0.0387855, - -0.000764675, 0.0217599, -0.037537, -0.0335206, 0.0431679, - -0.0211424, 0.010203, -0.062785, -0.00832363, -0.025181, - 0.0412031, 0.0118723, 0.0239643, 0.0394009}}; - - // Resetting cell_state and output_state - lstm.ResetCellState(); - lstm.ResetOutputState(); - - for (int i = 0; i < lstm.sequence_length(); i++) { - float* batch0_start = lstm_input[0] + i * lstm.num_inputs(); - float* batch0_end = batch0_start + lstm.num_inputs(); - - lstm.SetInput(2 * i * lstm.num_inputs(), batch0_start, batch0_end); - - float* batch1_start = lstm_input[1] + i * lstm.num_inputs(); - float* batch1_end = batch1_start + lstm.num_inputs(); - lstm.SetInput((2 * i + 1) * lstm.num_inputs(), batch1_start, batch1_end); - } + lstm.SetInputToInputWeights(input_to_input_weights_); + lstm.SetInputToCellWeights(input_to_cell_weights_); + lstm.SetInputToForgetWeights(input_to_forget_weights_); + lstm.SetInputToOutputWeights(input_to_output_weights_); - lstm.Invoke(); + lstm.SetInputGateBias(input_gate_bias_); + lstm.SetCellBias(cell_gate_bias_); + lstm.SetForgetGateBias(forget_gate_bias_); + lstm.SetOutputGateBias(output_gate_bias_); - std::vector<float> expected; - for (int i = 0; i < lstm.sequence_length(); i++) { - float* golden_start_batch0 = lstm_golden_output[0] + i * lstm.num_outputs(); - float* golden_end_batch0 = golden_start_batch0 + lstm.num_outputs(); - float* golden_start_batch1 = lstm_golden_output[1] + i * lstm.num_outputs(); - float* golden_end_batch1 = golden_start_batch1 + lstm.num_outputs(); - expected.insert(expected.end(), golden_start_batch0, golden_end_batch0); - expected.insert(expected.end(), golden_start_batch1, golden_end_batch1); - } - EXPECT_THAT(lstm.GetOutput(), ElementsAreArray(ArrayFloatNear(expected))); + lstm.SetRecurrentToInputWeights(recurrent_to_input_weights_); + lstm.SetRecurrentToCellWeights(recurrent_to_cell_weights_); + lstm.SetRecurrentToForgetWeights(recurrent_to_forget_weights_); + lstm.SetRecurrentToOutputWeights(recurrent_to_output_weights_); + + lstm.SetCellToInputWeights(cell_to_input_weights_); + lstm.SetCellToForgetWeights(cell_to_forget_weights_); + lstm.SetCellToOutputWeights(cell_to_output_weights_); + + lstm.SetProjectionWeights(projection_weights_); + + VerifyGoldens(lstm_input_, lstm_golden_output_, &lstm); +} + +TEST_F(NoCifgPeepholeProjectionClippingLstmTest, HybridLstmBlackBoxTest) { + const int n_batch = 2; + const int n_input = 5; + const int n_cell = 20; + const int n_output = 16; + const int sequence_length = 4; + + HybridUnidirectionalLSTMOpModel lstm( + n_batch, n_input, n_cell, n_output, sequence_length, + /*use_cifg=*/false, /*use_peephole=*/true, + /*use_projection_weights=*/true, + /*use_projection_bias=*/false, + /*cell_clip=*/0.0, /*proj_clip=*/0.0, + { + {sequence_length, n_batch, n_input}, // input tensor + + {n_cell, n_input}, // input_to_input_weight tensor + {n_cell, n_input}, // input_to_forget_weight tensor + {n_cell, n_input}, // input_to_cell_weight tensor + {n_cell, n_input}, // 
input_to_output_weight tensor + + {n_cell, n_output}, // recurrent_to_input_weight tensor + {n_cell, n_output}, // recurrent_to_forget_weight tensor + {n_cell, n_output}, // recurrent_to_cell_weight tensor + {n_cell, n_output}, // recurrent_to_output_weight tensor + + {n_cell}, // cell_to_input_weight tensor + {n_cell}, // cell_to_forget_weight tensor + {n_cell}, // cell_to_output_weight tensor + + {n_cell}, // input_gate_bias tensor + {n_cell}, // forget_gate_bias tensor + {n_cell}, // cell_bias tensor + {n_cell}, // output_gate_bias tensor + + {n_output, n_cell}, // projection_weight tensor + {0}, // projection_bias tensor + + {n_batch, n_output}, // activation_state tensor + {n_batch, n_cell}, // cell_state tensor + }); + + lstm.SetInputToInputWeights(input_to_input_weights_); + lstm.SetInputToCellWeights(input_to_cell_weights_); + lstm.SetInputToForgetWeights(input_to_forget_weights_); + lstm.SetInputToOutputWeights(input_to_output_weights_); + + lstm.SetInputGateBias(input_gate_bias_); + lstm.SetCellBias(cell_gate_bias_); + lstm.SetForgetGateBias(forget_gate_bias_); + lstm.SetOutputGateBias(output_gate_bias_); + + lstm.SetRecurrentToInputWeights(recurrent_to_input_weights_); + lstm.SetRecurrentToCellWeights(recurrent_to_cell_weights_); + lstm.SetRecurrentToForgetWeights(recurrent_to_forget_weights_); + lstm.SetRecurrentToOutputWeights(recurrent_to_output_weights_); + + lstm.SetCellToInputWeights(cell_to_input_weights_); + lstm.SetCellToForgetWeights(cell_to_forget_weights_); + lstm.SetCellToOutputWeights(cell_to_output_weights_); + + lstm.SetProjectionWeights(projection_weights_); + + VerifyGoldens(lstm_input_, lstm_golden_output_, &lstm, /*tolerance=*/0.00467); } } // namespace diff --git a/tensorflow/contrib/lite/kernels/unidirectional_sequence_rnn.cc b/tensorflow/contrib/lite/kernels/unidirectional_sequence_rnn.cc index 22c80df19c..744ee7c109 100644 --- a/tensorflow/contrib/lite/kernels/unidirectional_sequence_rnn.cc +++ b/tensorflow/contrib/lite/kernels/unidirectional_sequence_rnn.cc @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include <unistd.h> #include <cassert> #include <cmath> #include <cstdio> @@ -20,8 +19,8 @@ limitations under the License. #include <iostream> #include <limits> -#include "tensorflow/contrib/lite/builtin_op_data.h" -#include "tensorflow/contrib/lite/context.h" +#include "tensorflow/contrib/lite/c/builtin_op_data.h" +#include "tensorflow/contrib/lite/c/c_api_internal.h" #include "tensorflow/contrib/lite/kernels/activation_functor.h" #include "tensorflow/contrib/lite/kernels/internal/kernel_utils.h" #include "tensorflow/contrib/lite/kernels/kernel_util.h" @@ -32,16 +31,19 @@ namespace ops { namespace builtin { namespace unidirectional_sequence_rnn { +// Input tensors. constexpr int kInputTensor = 0; constexpr int kWeightsTensor = 1; constexpr int kRecurrentWeightsTensor = 2; constexpr int kBiasTensor = 3; -constexpr int kHiddenStateTensor = 0; -constexpr int kOutputTensor = 1; +constexpr int kHiddenStateTensor = 4; + +// Output tensor. 
+constexpr int kOutputTensor = 0; void* Init(TfLiteContext* context, const char* buffer, size_t length) { auto* scratch_tensor_index = new int; - context->AddTensors(context, /*tensors_to_add=*/2, scratch_tensor_index); + context->AddTensors(context, /*tensors_to_add=*/3, scratch_tensor_index); return scratch_tensor_index; } @@ -51,14 +53,16 @@ void Free(TfLiteContext* context, void* buffer) { TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { // Check we have all the inputs and outputs we need. - TF_LITE_ENSURE_EQ(context, node->inputs->size, 4); - TF_LITE_ENSURE_EQ(context, node->outputs->size, 2); + TF_LITE_ENSURE_EQ(context, node->inputs->size, 5); + TF_LITE_ENSURE_EQ(context, node->outputs->size, 1); const TfLiteTensor* input = GetInput(context, node, kInputTensor); const TfLiteTensor* input_weights = GetInput(context, node, kWeightsTensor); const TfLiteTensor* recurrent_weights = GetInput(context, node, kRecurrentWeightsTensor); const TfLiteTensor* bias = GetInput(context, node, kBiasTensor); + const TfLiteTensor* hidden_state = + GetInput(context, node, kHiddenStateTensor); // Check all the parameters of tensor match within themselves and match the // input configuration. @@ -75,20 +79,12 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ASSERT_EQ(recurrent_weights->dims->data[1], bias->dims->data[0]); TF_LITE_ENSURE_EQ(context, input->type, kTfLiteFloat32); TF_LITE_ENSURE_EQ(context, input_weights->type, recurrent_weights->type); + TF_LITE_ENSURE_EQ(context, NumDimensions(hidden_state), 2); + TF_LITE_ENSURE_EQ(context, hidden_state->dims->data[0], batch_size); + TF_LITE_ENSURE_EQ(context, hidden_state->dims->data[1], num_units); - TfLiteTensor* hidden_state = GetOutput(context, node, kHiddenStateTensor); TfLiteTensor* output = GetOutput(context, node, kOutputTensor); - // Resize state. - TfLiteIntArray* hidden_state_size_array = TfLiteIntArrayCreate(2); - hidden_state_size_array->data[0] = batch_size; - hidden_state_size_array->data[1] = num_units; - TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, hidden_state, - hidden_state_size_array)); - - // Mark hidden state as a persistent tensor. - hidden_state->allocation_type = kTfLiteArenaRwPersistent; - // Resize output. TfLiteIntArray* output_size_array = TfLiteIntArrayCreate(3); output_size_array->data[0] = (time_major) ? 
max_time : batch_size; @@ -102,7 +98,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { if (input->type == kTfLiteFloat32 && input_weights->type == kTfLiteUInt8) { int* scratch_tensor_index = reinterpret_cast<int*>(node->user_data); TfLiteIntArrayFree(node->temporaries); - node->temporaries = TfLiteIntArrayCreate(2); + node->temporaries = TfLiteIntArrayCreate(3); node->temporaries->data[0] = *scratch_tensor_index; TfLiteTensor* input_quantized = GetTemporary(context, node, /*index=*/0); input_quantized->type = kTfLiteUInt8; @@ -125,6 +121,16 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { context->ResizeTensor(context, hidden_state_quantized, hidden_state_quantized_size)); } + node->temporaries->data[2] = *scratch_tensor_index + 2; + TfLiteTensor* scaling_factors = GetTemporary(context, node, /*index=*/2); + scaling_factors->type = kTfLiteFloat32; + scaling_factors->allocation_type = kTfLiteArenaRw; + TfLiteIntArray* scaling_factors_size = TfLiteIntArrayCreate(1); + scaling_factors_size->data[0] = batch_size; + if (!TfLiteIntArrayEqual(scaling_factors->dims, scaling_factors_size)) { + TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, scaling_factors, + scaling_factors_size)); + } } return kTfLiteOk; } @@ -187,14 +193,12 @@ TfLiteStatus EvalFloat(const TfLiteTensor* input, return kTfLiteOk; } -TfLiteStatus EvalQuantized(const TfLiteTensor* input, - const TfLiteTensor* input_weights, - const TfLiteTensor* recurrent_weights, - const TfLiteTensor* bias, - const TfLiteSequenceRNNParams* params, - TfLiteTensor* input_scratch, - TfLiteTensor* hidden_state_scratch, - TfLiteTensor* hidden_state, TfLiteTensor* output) { +TfLiteStatus EvalHybrid( + const TfLiteTensor* input, const TfLiteTensor* input_weights, + const TfLiteTensor* recurrent_weights, const TfLiteTensor* bias, + const TfLiteSequenceRNNParams* params, TfLiteTensor* input_scratch, + TfLiteTensor* hidden_state_scratch, TfLiteTensor* scaling_factors, + TfLiteTensor* hidden_state, TfLiteTensor* output) { const bool time_major = params->time_major; const int batch_size = (time_major) ? input->dims->data[1] : input->dims->data[0]; @@ -218,6 +222,7 @@ TfLiteStatus EvalQuantized(const TfLiteTensor* input, reinterpret_cast<int8_t*>(input_scratch->data.uint8); int8_t* quantized_hidden_state_ptr = reinterpret_cast<int8_t*>(hidden_state_scratch->data.uint8); + float* scaling_factors_ptr = scaling_factors->data.f; if (time_major) { // Initialize the pointer to hidden state. 
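The hybrid RNN path in this hunk quantizes the float activations on the fly and keeps one scaling factor per batch row (hence the new scaling_factors temporary sized to batch_size). For orientation, a minimal sketch of that per-row symmetric quantization — using the hypothetical helper name SymmetricQuantizeRow, not the actual tensor_utils/kernel_utils routine invoked by the kernel — could look like this:

#include <algorithm>
#include <cmath>
#include <cstdint>

// Illustrative only: symmetrically quantize one row of float activations to
// int8 and record the scale needed to undo the quantization later. The real
// hybrid kernels use TFLite's internal tensor_utils routines; this sketch
// just shows the idea behind the per-batch scaling_factors tensor.
void SymmetricQuantizeRow(const float* values, int size, int8_t* quantized,
                          float* scaling_factor) {
  float max_abs = 0.f;
  for (int i = 0; i < size; ++i) {
    max_abs = std::max(max_abs, std::fabs(values[i]));
  }
  if (max_abs == 0.f) {
    // All-zero row: any scale works; pick 1 and emit zeros.
    *scaling_factor = 1.f;
    std::fill(quantized, quantized + size, static_cast<int8_t>(0));
    return;
  }
  const float scale = max_abs / 127.f;  // Map the row into [-127, 127].
  *scaling_factor = scale;
  for (int i = 0; i < size; ++i) {
    const int32_t q = static_cast<int32_t>(std::round(values[i] / scale));
    quantized[i] = static_cast<int8_t>(std::min(127, std::max(-127, q)));
  }
}

One factor per batch row lets the kernel rescale the result of the int8 matrix multiply back to float, which is why scaling_factors_ptr is threaded through the per-batch and time-major loops below.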
@@ -233,7 +238,8 @@ TfLiteStatus EvalQuantized(const TfLiteTensor* input, input_ptr_batch, input_weights_ptr, input_weights_scale, recurrent_weights_ptr, recurrent_weights_scale, bias_ptr, input_size, num_units, batch_size, params->activation, quantized_input_ptr, - quantized_hidden_state_ptr, hidden_state_ptr_batch, output_ptr_batch); + quantized_hidden_state_ptr, scaling_factors_ptr, + hidden_state_ptr_batch, output_ptr_batch); } } else { // For each batch @@ -252,7 +258,7 @@ TfLiteStatus EvalQuantized(const TfLiteTensor* input, recurrent_weights_ptr, recurrent_weights_scale, bias_ptr, input_size, num_units, /*batch_size=*/1, params->activation, quantized_input_ptr, quantized_hidden_state_ptr, - hidden_state_ptr_batch, output_ptr_batch); + scaling_factors_ptr, hidden_state_ptr_batch, output_ptr_batch); } } } @@ -267,7 +273,9 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { const TfLiteTensor* recurrent_weights = GetInput(context, node, kRecurrentWeightsTensor); const TfLiteTensor* bias = GetInput(context, node, kBiasTensor); - TfLiteTensor* hidden_state = GetOutput(context, node, kHiddenStateTensor); + // The hidden_state is a variable input tensor that can be modified. + TfLiteTensor* hidden_state = + const_cast<TfLiteTensor*>(GetInput(context, node, kHiddenStateTensor)); TfLiteTensor* output = GetOutput(context, node, kOutputTensor); switch (input_weights->type) { @@ -278,12 +286,14 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { // TODO(mirkov): implement eval with quantized inputs as well. TfLiteTensor* input_quantized = GetTemporary(context, node, 0); TfLiteTensor* hidden_state_quantized = GetTemporary(context, node, 1); - return EvalQuantized(input, input_weights, recurrent_weights, bias, - params, input_quantized, hidden_state_quantized, - hidden_state, output); + TfLiteTensor* scaling_factors = GetTemporary(context, node, 2); + return EvalHybrid(input, input_weights, recurrent_weights, bias, params, + input_quantized, hidden_state_quantized, + scaling_factors, hidden_state, output); } default: - context->ReportError(context, "Type not currently supported."); + context->ReportError(context, "Type %d not currently supported.", + input_weights->type); return kTfLiteError; } return kTfLiteOk; diff --git a/tensorflow/contrib/lite/kernels/unidirectional_sequence_rnn_test.cc b/tensorflow/contrib/lite/kernels/unidirectional_sequence_rnn_test.cc index 0adab837b0..6b48e3fff7 100644 --- a/tensorflow/contrib/lite/kernels/unidirectional_sequence_rnn_test.cc +++ b/tensorflow/contrib/lite/kernels/unidirectional_sequence_rnn_test.cc @@ -183,7 +183,7 @@ class UnidirectionalRNNOpModel : public SingleOpModel { weights_ = AddInput(weights); recurrent_weights_ = AddInput(recurrent_weights); bias_ = AddInput(TensorType_FLOAT32); - hidden_state_ = AddOutput(TensorType_FLOAT32); + hidden_state_ = AddInput(TensorType_FLOAT32, true); output_ = AddOutput(TensorType_FLOAT32); SetBuiltinOp(BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_RNN, BuiltinOptions_SequenceRNNOptions, @@ -194,12 +194,14 @@ class UnidirectionalRNNOpModel : public SingleOpModel { BuildInterpreter({{sequence_len_, batches_, input_size_}, {units_, input_size_}, {units_, units_}, - {units_}}); + {units_}, + {batches_, units}}); } else { BuildInterpreter({{batches_, sequence_len_, input_size_}, {units_, input_size_}, {units_, units_}, - {units_}}); + {units_}, + {batches_, units_}}); } } @@ -221,14 +223,6 @@ class UnidirectionalRNNOpModel : public SingleOpModel { PopulateTensor(input_, offset, begin, end); } - void 
ResetHiddenState() { - const int zero_buffer_size = units_ * batches_; - std::unique_ptr<float[]> zero_buffer(new float[zero_buffer_size]); - memset(zero_buffer.get(), 0, zero_buffer_size * sizeof(float)); - PopulateTensor(hidden_state_, 0, zero_buffer.get(), - zero_buffer.get() + zero_buffer_size); - } - std::vector<float> GetOutput() { return ExtractVector<float>(output_); } int input_size() { return input_size_; } @@ -273,7 +267,6 @@ TEST(UnidirectionalRNNOpTest, BlackBoxTest) { rnn.SetWeights(rnn_weights); rnn.SetBias(rnn_bias); rnn.SetRecurrentWeights(rnn_recurrent_weights); - rnn.ResetHiddenState(); const int input_sequence_size = rnn.input_size() * rnn.sequence_len(); float* batch_start = rnn_input; @@ -299,7 +292,6 @@ TEST(HybridUnidirectionalRNNOpModelOpTest, BlackBoxTest) { rnn.SetWeights(rnn_weights); rnn.SetBias(rnn_bias); rnn.SetRecurrentWeights(rnn_recurrent_weights); - rnn.ResetHiddenState(); const int input_sequence_size = rnn.input_size() * rnn.sequence_len(); float* batch_start = rnn_input; @@ -326,7 +318,6 @@ TEST(UnidirectionalRNNOpTest, TimeMajorBlackBoxTest) { rnn.SetWeights(rnn_weights); rnn.SetBias(rnn_bias); rnn.SetRecurrentWeights(rnn_recurrent_weights); - rnn.ResetHiddenState(); for (int i = 0; i < rnn.sequence_len(); i++) { float* batch_start = rnn_input + i * rnn.input_size(); @@ -356,7 +347,6 @@ TEST(HybridUnidirectionalRNNOpModelOpTest, TimeMajorBlackBoxTest) { rnn.SetWeights(rnn_weights); rnn.SetBias(rnn_bias); rnn.SetRecurrentWeights(rnn_recurrent_weights); - rnn.ResetHiddenState(); for (int i = 0; i < rnn.sequence_len(); i++) { float* batch_start = rnn_input + i * rnn.input_size(); diff --git a/tensorflow/contrib/lite/kernels/unpack.cc b/tensorflow/contrib/lite/kernels/unpack.cc new file mode 100644 index 0000000000..a7d3a9bc76 --- /dev/null +++ b/tensorflow/contrib/lite/kernels/unpack.cc @@ -0,0 +1,133 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/contrib/lite/c/builtin_op_data.h" +#include "tensorflow/contrib/lite/c/c_api_internal.h" +#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h" +#include "tensorflow/contrib/lite/kernels/internal/tensor.h" +#include "tensorflow/contrib/lite/kernels/kernel_util.h" + +namespace tflite { +namespace ops { +namespace builtin { +namespace unpack { +namespace { + +constexpr int kInputTensor = 0; + +// Op data for unpack op. 
+struct OpData { + int num; + int axis; +}; + +void* Init(TfLiteContext* context, const char* buffer, size_t length) { + auto* data = new OpData; + data->axis = 0; + return data; +} + +void Free(TfLiteContext* context, void* buffer) { + delete reinterpret_cast<OpData*>(buffer); +} + +TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { + const OpData* data = reinterpret_cast<OpData*>(node->builtin_data); + + TF_LITE_ENSURE_EQ(context, NumInputs(node), 1); + TF_LITE_ENSURE_EQ(context, NumOutputs(node), data->num); + + const TfLiteTensor* input = GetInput(context, node, kInputTensor); + TF_LITE_ENSURE(context, NumDimensions(input) <= 4); + TF_LITE_ENSURE(context, NumDimensions(input) > 1); + TF_LITE_ENSURE(context, NumDimensions(input) > data->axis); + // TODO(renjieliu): Support negative axis. + TF_LITE_ENSURE(context, data->axis >= 0); + if (input->type != kTfLiteInt32 && input->type != kTfLiteFloat32) { + context->ReportError(context, + "Currently pack only supports int32 and float32."); + return kTfLiteError; + } + + const TfLiteIntArray* input_shape = input->dims; + // Num should be equal to the shape[axis]. + // Resize outputs. rank will be R - 1. + TfLiteIntArray* output_shape = TfLiteIntArrayCreate(NumDimensions(input) - 1); + int o = 0; + for (int index = 0; index < NumDimensions(input); ++index) { + if (index != data->axis) { + output_shape->data[o++] = input_shape->data[index]; + } + } + + TF_LITE_ENSURE_EQ(context, data->num, input_shape->data[data->axis]); + for (int i = 0; i < data->num; ++i) { + TfLiteIntArray* copied_output_shape = TfLiteIntArrayCopy(output_shape); + TfLiteTensor* output = GetOutput(context, node, i); + TF_LITE_ENSURE_EQ(context, output->type, input->type); + TF_LITE_ENSURE_OK( + context, context->ResizeTensor(context, output, copied_output_shape)); + } + + TfLiteIntArrayFree(output_shape); + return kTfLiteOk; +} + +template <typename T> +void UnpackImpl(TfLiteContext* context, TfLiteNode* node, + const TfLiteTensor* input, int output_count, int axis) { + tflite::UnpackParams op_params; + op_params.axis = axis; + op_params.num_split = output_count; + VectorOfTensors<T> all_outputs(*context, *node->outputs); + reference_ops::Unpack<T>(op_params, GetTensorShape(input), + GetTensorData<T>(input), **all_outputs.shapes(), + all_outputs.data()); +} + +TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { + const OpData* data = reinterpret_cast<OpData*>(node->builtin_data); + + const TfLiteTensor* input = GetInput(context, node, kInputTensor); + switch (input->type) { + case kTfLiteFloat32: { + UnpackImpl<float>(context, node, input, data->num, data->axis); + break; + } + case kTfLiteInt32: { + UnpackImpl<int32_t>(context, node, input, data->num, data->axis); + break; + } + default: { + context->ReportError(context, + "Currently pack only supports int32 and float32."); + return kTfLiteError; + } + } + + return kTfLiteOk; +} +} // namespace +} // namespace unpack + +TfLiteRegistration* Register_UNPACK() { + static TfLiteRegistration r = {unpack::Init, unpack::Free, unpack::Prepare, + unpack::Eval}; + return &r; +} + +} // namespace builtin +} // namespace ops +} // namespace tflite diff --git a/tensorflow/contrib/lite/kernels/unpack_test.cc b/tensorflow/contrib/lite/kernels/unpack_test.cc new file mode 100644 index 0000000000..4efc92a0fd --- /dev/null +++ b/tensorflow/contrib/lite/kernels/unpack_test.cc @@ -0,0 +1,225 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. 
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <vector>
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAre;
+
+template <typename T>
+class UnpackOpModel : public SingleOpModel {
+ public:
+  UnpackOpModel(const TensorData& input, int axis) {
+    CHECK_LE(axis, input.shape.size());
+    const int num_outputs = input.shape[axis];
+    input_ = AddInput(input);
+    for (int i = 0; i < num_outputs; ++i) {
+      outputs_.push_back(AddOutput(input.type));
+    }
+    SetBuiltinOp(BuiltinOperator_UNPACK, BuiltinOptions_UnpackOptions,
+                 CreateUnpackOptions(builder_, num_outputs, axis).Union());
+    BuildInterpreter({GetShape(input_)});
+  }
+
+  void SetInput(std::initializer_list<T> data) {
+    PopulateTensor<T>(input_, data);
+  }
+
+  std::vector<std::vector<T>> GetOutputDatas() {
+    std::vector<std::vector<T>> output_datas;
+    for (const int output : outputs_) {
+      std::cerr << "the output is " << output << std::endl;
+      output_datas.push_back(ExtractVector<T>(output));
+    }
+    return output_datas;
+  }
+
+  std::vector<std::vector<int>> GetOutputShapes() {
+    std::vector<std::vector<int>> output_shapes;
+    for (const int output : outputs_) {
+      output_shapes.push_back(GetTensorShape(output));
+    }
+    return output_shapes;
+  }
+
+ private:
+  int input_;
+  std::vector<int> outputs_;
+};
+
+// float32 tests.
+TEST(UnpackOpTest, FloatThreeOutputs) {
+  UnpackOpModel<float> model({TensorType_FLOAT32, {3, 2}}, 0);
+  model.SetInput({1, 2, 3, 4, 5, 6});
+  model.Invoke();
+
+  // Check outputs shapes.
+  const std::vector<std::vector<int>>& output_shapes = model.GetOutputShapes();
+  EXPECT_EQ(output_shapes.size(), 3);
+  EXPECT_THAT(output_shapes[0], ElementsAre(2));
+  EXPECT_THAT(output_shapes[1], ElementsAre(2));
+  EXPECT_THAT(output_shapes[2], ElementsAre(2));
+
+  // Check outputs values.
+  const std::vector<std::vector<float>>& output_datas = model.GetOutputDatas();
+  EXPECT_EQ(output_datas.size(), 3);
+  EXPECT_THAT(output_datas[0], ElementsAre(1, 2));
+  EXPECT_THAT(output_datas[1], ElementsAre(3, 4));
+  EXPECT_THAT(output_datas[2], ElementsAre(5, 6));
+}
+
+TEST(UnpackOpTest, FloatThreeOutputsAxisOne) {
+  UnpackOpModel<float> model({TensorType_FLOAT32, {3, 2}}, 1);
+  model.SetInput({1, 2, 3, 4, 5, 6});
+  model.Invoke();
+
+  // Check outputs shapes.
+  const std::vector<std::vector<int>>& output_shapes = model.GetOutputShapes();
+  EXPECT_EQ(output_shapes.size(), 2);
+  EXPECT_THAT(output_shapes[0], ElementsAre(3));
+  EXPECT_THAT(output_shapes[1], ElementsAre(3));
+
+  // Check outputs values.
+  const std::vector<std::vector<float>>& output_datas = model.GetOutputDatas();
+  EXPECT_EQ(output_datas.size(), 2);
+  EXPECT_THAT(output_datas[0], ElementsAre(1, 3, 5));
+  EXPECT_THAT(output_datas[1], ElementsAre(2, 4, 6));
+}
+
+TEST(UnpackOpTest, FloatOneOutput) {
+  UnpackOpModel<float> model({TensorType_FLOAT32, {1, 6}}, 0);
+  model.SetInput({1, 2, 3, 4, 5, 6});
+  model.Invoke();
+
+  // Check outputs shapes.
+  const std::vector<std::vector<int>>& output_shapes = model.GetOutputShapes();
+  EXPECT_EQ(output_shapes.size(), 1);
+  EXPECT_THAT(output_shapes[0], ElementsAre(6));
+
+  // Check outputs values.
+  const std::vector<std::vector<float>>& output_datas = model.GetOutputDatas();
+  EXPECT_EQ(output_datas.size(), 1);
+  EXPECT_THAT(output_datas[0], ElementsAre(1, 2, 3, 4, 5, 6));
+}
+
+TEST(UnpackOpTest, FloatThreeDimensionsOutputs) {
+  UnpackOpModel<float> model({TensorType_FLOAT32, {2, 2, 2}}, 2);
+  model.SetInput({1, 2, 3, 4, 5, 6, 7, 8});
+  model.Invoke();
+
+  // Check outputs shapes.
+  const std::vector<std::vector<int>>& output_shapes = model.GetOutputShapes();
+  EXPECT_EQ(output_shapes.size(), 2);
+  EXPECT_THAT(output_shapes[0], ElementsAre(2, 2));
+  EXPECT_THAT(output_shapes[1], ElementsAre(2, 2));
+
+  // Check outputs values.
+  const std::vector<std::vector<float>>& output_datas = model.GetOutputDatas();
+  EXPECT_EQ(output_datas.size(), 2);
+  EXPECT_THAT(output_datas[0], ElementsAre(1, 3, 5, 7));
+  EXPECT_THAT(output_datas[1], ElementsAre(2, 4, 6, 8));
+}
+
+// int32 tests.
+TEST(UnpackOpTest, IntThreeOutputs) {
+  UnpackOpModel<int32_t> model({TensorType_INT32, {3, 2}}, 0);
+  model.SetInput({1, 2, 3, 4, 5, 6});
+  model.Invoke();
+
+  // Check outputs shapes.
+  const std::vector<std::vector<int>>& output_shapes = model.GetOutputShapes();
+  EXPECT_EQ(output_shapes.size(), 3);
+  EXPECT_THAT(output_shapes[0], ElementsAre(2));
+  EXPECT_THAT(output_shapes[1], ElementsAre(2));
+  EXPECT_THAT(output_shapes[2], ElementsAre(2));
+
+  // Check outputs values.
+  const std::vector<std::vector<int32_t>>& output_datas =
+      model.GetOutputDatas();
+  EXPECT_EQ(output_datas.size(), 3);
+  EXPECT_THAT(output_datas[0], ElementsAre(1, 2));
+  EXPECT_THAT(output_datas[1], ElementsAre(3, 4));
+  EXPECT_THAT(output_datas[2], ElementsAre(5, 6));
+}
+
+TEST(UnpackOpTest, IntThreeOutputsAxisOne) {
+  UnpackOpModel<int32_t> model({TensorType_INT32, {3, 2}}, 1);
+  model.SetInput({1, 2, 3, 4, 5, 6});
+  model.Invoke();
+
+  // Check outputs shapes.
+  const std::vector<std::vector<int>>& output_shapes = model.GetOutputShapes();
+  EXPECT_EQ(output_shapes.size(), 2);
+  EXPECT_THAT(output_shapes[0], ElementsAre(3));
+  EXPECT_THAT(output_shapes[1], ElementsAre(3));
+
+  // Check outputs values.
+  const std::vector<std::vector<int32_t>>& output_datas =
+      model.GetOutputDatas();
+  EXPECT_EQ(output_datas.size(), 2);
+  EXPECT_THAT(output_datas[0], ElementsAre(1, 3, 5));
+  EXPECT_THAT(output_datas[1], ElementsAre(2, 4, 6));
+}
+
+TEST(UnpackOpTest, IntOneOutput) {
+  UnpackOpModel<int32_t> model({TensorType_INT32, {1, 6}}, 0);
+  model.SetInput({1, 2, 3, 4, 5, 6});
+  model.Invoke();
+
+  // Check outputs shapes.
+  const std::vector<std::vector<int>>& output_shapes = model.GetOutputShapes();
+  EXPECT_EQ(output_shapes.size(), 1);
+  EXPECT_THAT(output_shapes[0], ElementsAre(6));
+
+  // Check outputs values.
+  const std::vector<std::vector<int32_t>>& output_datas =
+      model.GetOutputDatas();
+  EXPECT_EQ(output_datas.size(), 1);
+  EXPECT_THAT(output_datas[0], ElementsAre(1, 2, 3, 4, 5, 6));
+}
+
+TEST(UnpackOpTest, IntThreeDimensionsOutputs) {
+  UnpackOpModel<int32_t> model({TensorType_INT32, {2, 2, 2}}, 2);
+  model.SetInput({1, 2, 3, 4, 5, 6, 7, 8});
+  model.Invoke();
+
+  // Check outputs shapes.
+  const std::vector<std::vector<int>>& output_shapes = model.GetOutputShapes();
+  EXPECT_EQ(output_shapes.size(), 2);
+  EXPECT_THAT(output_shapes[0], ElementsAre(2, 2));
+  EXPECT_THAT(output_shapes[1], ElementsAre(2, 2));
+
+  // Check outputs values.
+  const std::vector<std::vector<int32_t>>& output_datas =
+      model.GetOutputDatas();
+  EXPECT_EQ(output_datas.size(), 2);
+  EXPECT_THAT(output_datas[0], ElementsAre(1, 3, 5, 7));
+  EXPECT_THAT(output_datas[1], ElementsAre(2, 4, 6, 8));
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/zeros_like.cc b/tensorflow/contrib/lite/kernels/zeros_like.cc
new file mode 100644
index 0000000000..cce5240a9b
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/zeros_like.cc
@@ -0,0 +1,73 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/lite/c/c_api_internal.h"
+#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace zeros_like {
+
+constexpr int kInputTensor = 0;
+constexpr int kOutputTensor = 0;
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+  output->type = input->type;
+
+  return context->ResizeTensor(context, output,
+                               TfLiteIntArrayCopy(input->dims));
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+  const int num_elements = NumElements(input);
+  switch (input->type) {
+    case kTfLiteInt64:
+      memset(GetTensorData<int64_t>(output), 0, num_elements * sizeof(int64_t));
+      break;
+    case kTfLiteInt32:
+      memset(GetTensorData<int32_t>(output), 0, num_elements * sizeof(int32_t));
+      break;
+    case kTfLiteFloat32:
+      memset(GetTensorData<float>(output), 0, num_elements * sizeof(float));
+      break;
+    default:
+      context->ReportError(context,
+                           "ZerosLike only currently supports int64, int32, "
+                           "and float32, got %d.",
+                           input->type);
+      return kTfLiteError;
+  }
+  return kTfLiteOk;
+}
+
+}  // namespace zeros_like
+
+TfLiteRegistration* Register_ZEROS_LIKE() {
+  static TfLiteRegistration r = {/*init=*/nullptr, /*free=*/nullptr,
+                                 zeros_like::Prepare, zeros_like::Eval};
+  return &r;
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/zeros_like_test.cc b/tensorflow/contrib/lite/kernels/zeros_like_test.cc
new file mode 100644
index 0000000000..d3382d1d5b
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/zeros_like_test.cc
@@ -0,0 +1,78 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+class ZerosLikeOpModel : public SingleOpModel {
+ public:
+  explicit ZerosLikeOpModel(const TensorData& input) {
+    input_ = AddInput(input);
+    output_ = AddOutput(input);
+    SetBuiltinOp(BuiltinOperator_ZEROS_LIKE, BuiltinOptions_ZerosLikeOptions,
+                 CreateZerosLikeOptions(builder_).Union());
+    BuildInterpreter({GetShape(input_)});
+  }
+
+  int input() { return input_; }
+  int output() { return output_; }
+
+ protected:
+  int input_;
+  int output_;
+};
+
+TEST(ZerosLikeOpModel, ZerosLikeFloat) {
+  ZerosLikeOpModel m({TensorType_FLOAT32, {2, 3}});
+  m.PopulateTensor<float>(m.input(), {-2.0, -1.0, 0.0, 1.0, 2.0, 3.0});
+  m.Invoke();
+  EXPECT_THAT(m.ExtractVector<float>(m.output()),
+              ElementsAreArray({0.0, 0.0, 0.0, 0.0, 0.0, 0.0}));
+  EXPECT_THAT(m.GetTensorShape(m.output()), ElementsAreArray({2, 3}));
+}
+
+TEST(ZerosLikeOpModel, ZerosLikeInt32) {
+  ZerosLikeOpModel m({TensorType_INT32, {1, 2, 2, 1}});
+  m.PopulateTensor<int32_t>(m.input(), {-2, -1, 0, 3});
+  m.Invoke();
+  EXPECT_THAT(m.ExtractVector<int32_t>(m.output()),
+              ElementsAreArray({0, 0, 0, 0}));
+  EXPECT_THAT(m.GetTensorShape(m.output()), ElementsAreArray({1, 2, 2, 1}));
+}
+
+TEST(ZerosLikeOpModel, ZerosLikeInt64) {
+  ZerosLikeOpModel m({TensorType_INT64, {1, 2, 2, 1}});
+  m.PopulateTensor<int64_t>(m.input(), {-2, -1, 0, 3});
+  m.Invoke();
+  EXPECT_THAT(m.ExtractVector<int64_t>(m.output()),
+              ElementsAreArray({0, 0, 0, 0}));
+  EXPECT_THAT(m.GetTensorShape(m.output()), ElementsAreArray({1, 2, 2, 1}));
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
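For reference, a minimal standalone sketch (not part of the diff above, plain C++ with no TF Lite headers; the helper name UnpackedShape is hypothetical) of the shape rule that unpack::Prepare applies: every one of the `num` outputs takes the input shape with the `axis` dimension dropped, so a [3, 2] input unpacked along axis 0 yields three outputs of shape [2], matching UnpackOpTest.FloatThreeOutputs.

    // unpack_shape_sketch.cc -- illustrative only, assumes nothing beyond the
    // standard library.
    #include <cassert>
    #include <cstddef>
    #include <iostream>
    #include <vector>

    // Shape shared by all UNPACK outputs: the input shape with the `axis`
    // dimension removed (rank R becomes R - 1), as in unpack::Prepare above.
    std::vector<int> UnpackedShape(const std::vector<int>& input_shape, int axis) {
      assert(axis >= 0 && static_cast<std::size_t>(axis) < input_shape.size());
      std::vector<int> output_shape;
      for (std::size_t i = 0; i < input_shape.size(); ++i) {
        if (static_cast<int>(i) != axis) output_shape.push_back(input_shape[i]);
      }
      return output_shape;
    }

    int main() {
      const std::vector<int> input_shape = {3, 2};
      const int axis = 0;
      // The number of outputs equals the size of the dimension being split.
      const int num_outputs = input_shape[axis];
      const std::vector<int> out = UnpackedShape(input_shape, axis);
      std::cout << "num outputs: " << num_outputs
                << ", output rank: " << out.size()
                << ", output dim 0: " << out[0] << "\n";  // 3, 1, 2
      return 0;
    }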