author     Andrew Selle <aselle@google.com>      2017-11-10 10:35:35 -0800
committer  Andrew Selle <aselle@andyselle.com>   2017-11-10 16:14:42 -0800
commit     0b15439f8f0f2d4755587f4096c3ea04cb199d23 (patch)
tree       9aa4fc8162bf9b4ee50112a7b85703f70ca4df08 /tensorflow/contrib/lite/kernels/conv.cc
parent     7ac140a5845553275427162aabd9d54987144b4a (diff)
Internal Change.
PiperOrigin-RevId: 175307445
Diffstat (limited to 'tensorflow/contrib/lite/kernels/conv.cc')
-rw-r--r--  tensorflow/contrib/lite/kernels/conv.cc  425
1 file changed, 425 insertions(+), 0 deletions(-)
diff --git a/tensorflow/contrib/lite/kernels/conv.cc b/tensorflow/contrib/lite/kernels/conv.cc
new file mode 100644
index 0000000000..c75c04baea
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/conv.cc
@@ -0,0 +1,425 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <unistd.h>
+#include <algorithm>
+#include <cassert>
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <iostream>
+#include <limits>
+
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/gemm_support.h"
+#include "tensorflow/contrib/lite/kernels/internal/optimized/multithreaded_conv.h"
+#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
+#include "tensorflow/contrib/lite/kernels/padding.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace conv {
+
+// This file has three implementations of Conv.
+enum KernelType {
+ kReference,
+ kGenericOptimized, // Neon-free
+ kNeonOptimized,
+};
+
+struct OpData {
+ // IDs are the arbitrary identifiers used by TF Lite to identify and access
+ // memory buffers.
+ int im2col_id;
+ int hwcn_weights_id;
+
+ TfLitePaddingValues padding;
+  // The scaling factor from input to output (aka the 'real multiplier') can
+  // be represented as a fixed point multiplier plus a left shift.
+ int32_t output_multiplier;
+ int output_shift;
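+  // For instance (illustrative values only): a real multiplier of 0.05 can be
+  // encoded as output_multiplier ~= 1717986918 (0.8 in Q0.31 fixed point)
+  // together with a shift of 4, since 0.8 * 2^-4 = 0.05.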
+ // The range of the fused activation layer. For example for kNone and
+ // uint8_t these would be 0 and 255.
+ int32_t output_activation_min;
+ int32_t output_activation_max;
+  // Indexes are the offsets into the node's 'temporaries' array for the
+  // allocated scratch buffers.
+ int32_t im2col_index;
+ int32_t hwcn_weights_index;
+ bool need_hwcn_weights;
+ bool have_weights_been_transposed;
+ bool need_im2col;
+};
+
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+ // This is a builtin op, so we don't use the contents in 'buffer', if any.
+ // Instead, we allocate a new object to use as scratch space for im2col, and
+ // to carry information from Prepare() to Eval().
+ auto* data = new OpData;
+ context->AddTensors(context, 1, &data->im2col_id);
+ context->AddTensors(context, 1, &data->hwcn_weights_id);
+ gemm_support::IncrementUsageCounter(context);
+ return data;
+}
+
+void Free(TfLiteContext* context, void* buffer) {
+ gemm_support::DecrementUsageCounter(context);
+ delete reinterpret_cast<OpData*>(buffer);
+}
+
+// Naive implementation of transpose for floats. Could be optimized to be more
+// cache friendly, but for now it's a one-time cost on first run, and we would
+// prefer to remove the need to do this at all eventually.
+void TransposeFloatTensor(TfLiteTensor* input, TfLiteTensor* output) {
+ const int rows = output->dims->data[1];
+ const int cols = output->dims->data[0];
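+  // 'rows' and 'cols' describe the input matrix; they are read from the
+  // output dims, which are assumed to already hold the transposed shape.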
+ const float* input_data = GetTensorData<float>(input);
+ float* output_data = GetTensorData<float>(output);
+ for (int i = 0; i < rows; ++i) {
+ for (int j = 0; j < cols; ++j) {
+ const float in_value = input_data[i * cols + j];
+ output_data[j * rows + i] = in_value;
+ }
+ }
+}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+ auto* params = reinterpret_cast<TfLiteConvParams*>(node->builtin_data);
+ OpData* data = reinterpret_cast<OpData*>(node->user_data);
+
+ bool hasBias = node->inputs->size == 3;
+ // Check number of inputs/outputs
+ TF_LITE_ENSURE(context, hasBias || node->inputs->size == 2);
+ TF_LITE_ENSURE_EQ(context, node->outputs->size, 1);
+ TfLiteTensor* output = &context->tensors[node->outputs->data[0]];
+ TfLiteTensor* input = &context->tensors[node->inputs->data[0]];
+ TfLiteTensor* filter = &context->tensors[node->inputs->data[1]];
+ // Check dimensionality of input, filter
+ TF_LITE_ENSURE_EQ(context, input->dims->size, 4);
+ TF_LITE_ENSURE_EQ(context, filter->dims->size, 4);
+ // Check input channels matching filter
+ TF_LITE_ENSURE_EQ(context, input->dims->data[3], filter->dims->data[3]);
+
+ // Check types. (We assume that UINT8 refers to quantized tensors)
+ TfLiteType data_type = input->type;
+ TF_LITE_ENSURE(context,
+ data_type == kTfLiteFloat32 || data_type == kTfLiteUInt8);
+ TF_LITE_ENSURE_EQ(context, output->type, data_type);
+ TF_LITE_ENSURE_EQ(context, filter->type, data_type);
+
+ TfLiteTensor* bias = nullptr;
+
+ // TODO(ahentz): At this point the optimized versions require 'bias'. We can
+ // either change that or document that convolution requires it.
+ TF_LITE_ENSURE(context, hasBias);
+
+ if (hasBias) {
+ bias = &context->tensors[node->inputs->data[2]];
+ if (data_type == kTfLiteUInt8) {
+ TF_LITE_ENSURE_EQ(context, bias->type, kTfLiteInt32);
+ TF_LITE_ENSURE_EQ(context, bias->params.zero_point, 0);
+ } else {
+ TF_LITE_ENSURE_EQ(context, bias->type, data_type);
+ }
+ TF_LITE_ENSURE_EQ(context, bias->dims->size, 1);
+ TF_LITE_ENSURE_EQ(context, bias->dims->data[0], filter->dims->data[0]);
+ }
+
+ int channels_out = filter->dims->data[0];
+ int width = input->dims->data[2];
+ int height = input->dims->data[1];
+ int filter_width = filter->dims->data[2];
+ int filter_height = filter->dims->data[1];
+ int batches = input->dims->data[0];
+
+ // Matching GetWindowedOutputSize in TensorFlow.
+ auto padding = params->padding;
+ auto computeOutSize = [padding](int imageSize, int filterSize,
+ int stride) -> int {
+ return padding == kTfLitePaddingSame
+ ? (imageSize + stride - 1) / stride
+ : padding == kTfLitePaddingValid
+ ? (imageSize - filterSize + stride) / stride
+ : 0;
+ };
+
+ int outWidth = computeOutSize(width, filter_width, params->stride_width);
+ int outHeight = computeOutSize(height, filter_height, params->stride_height);
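+  // For example (illustrative numbers): with width = 5, filter_width = 3 and
+  // stride_width = 2, SAME padding gives outWidth = (5 + 2 - 1) / 2 = 3 while
+  // VALID padding gives (5 - 3 + 2) / 2 = 2.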
+
+ data->padding.height =
+ ComputePadding(params->stride_height, height, filter_height, outHeight);
+ data->padding.width =
+ ComputePadding(params->stride_width, width, filter_width, outWidth);
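+  // Continuing the illustration above, and assuming ComputePadding uses the
+  // usual formula ((out - 1) * stride + filter - in) / 2 clamped at zero, the
+  // SAME case yields a width padding of ((3 - 1) * 2 + 3 - 5) / 2 = 1.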
+
+ TF_LITE_ENSURE(context, hasBias);
+
+ // Note that quantized inference requires that all tensors have their
+ // parameters set. This is usually done during quantized training.
+ if (data_type != kTfLiteFloat32) {
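+    // The 'real multiplier' is expected to be
+    // input_scale * filter_scale / output_scale, which for well-calibrated
+    // models is a positive value smaller than one, hence the use of
+    // QuantizeMultiplierSmallerThanOne below.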
+ double real_multiplier = 0.0;
+ TF_LITE_ENSURE_STATUS(GetQuantizedConvolutionMultipler(
+ context, input, filter, bias, output, &real_multiplier));
+ QuantizeMultiplierSmallerThanOne(real_multiplier, &data->output_multiplier,
+ &data->output_shift);
+ CalculateActivationRangeUint8(params->activation, output,
+ &data->output_activation_min,
+ &data->output_activation_max);
+ }
+
+ TfLiteIntArray* output_size = TfLiteIntArrayCreate(4);
+ output_size->data[0] = batches;
+ output_size->data[1] = outHeight;
+ output_size->data[2] = outWidth;
+ output_size->data[3] = channels_out;
+ auto output_status = context->ResizeTensor(context, output, output_size);
+
+ if (output_status != kTfLiteOk) return output_status;
+
+ // We don't always need to allocate im2col. It is only used in some versions
+ // of the optimized Conv. This test just mimics something that happens inside
+ // optimized_ops.h, in order to avoid a DCHECK(!im2col_data).
+ data->need_im2col =
+ (params->stride_width != 1 || params->stride_height != 1 ||
+ filter_width != 1 || filter_height != 1);
+ // If we're using the optimized multithreaded EigenTensor implementation of
+ // convolution, it expects the filter weights to be transposed compared to
+ // the normal TF Lite buffer format. Typical TF Lite weights are
+ // [filter_count, filter_height, filter_width, input_depth], but for the float
+ // implementation we need them as [filter_height, filter_width, input_depth,
+ // filter_count]. We get to that format by transposing, and create a temporary
+ // buffer to store the results.
+ // This path is only used for float processing, so only create the buffer if
+ // we're running with that data type.
+ data->need_hwcn_weights = (data_type == kTfLiteFloat32);
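+  // As a concrete illustration: 16 filters of size 3x3 over 3 input channels
+  // are stored as [16, 3, 3, 3] in the TF Lite format and become a
+  // [3 * 3 * 3, 16] = [27, 16] matrix in the transposed HWCN buffer below.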
+
+ int temporaries_count = 0;
+ if (data->need_im2col) {
+ data->im2col_index = temporaries_count;
+ ++temporaries_count;
+ }
+ if (data->need_hwcn_weights) {
+ data->hwcn_weights_index = temporaries_count;
+ ++temporaries_count;
+ }
+
+ TfLiteIntArrayFree(node->temporaries);
+ node->temporaries = TfLiteIntArrayCreate(temporaries_count);
+
+ if (data->need_im2col) {
+ node->temporaries->data[data->im2col_index] = data->im2col_id;
+
+ TfLiteIntArray* im2col_size = TfLiteIntArrayCreate(4);
+
+ int input_depth = input->dims->data[3];
+ im2col_size->data[0] = output_size->data[0];
+ im2col_size->data[1] = output_size->data[1];
+ im2col_size->data[2] = output_size->data[2];
+ im2col_size->data[3] = input_depth * filter_height * filter_width;
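+    // Illustration: a 1x8x8x3 input with 3x3 filters, stride 1 and SAME
+    // padding keeps the 1x8x8 spatial extent, so im2col is resized to
+    // 1x8x8x(3 * 3 * 3) = 1x8x8x27, i.e. one flattened patch per output
+    // position.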
+
+ TfLiteTensor* im2col =
+ &context->tensors[node->temporaries->data[data->im2col_index]];
+ im2col->type = data_type;
+ im2col->allocation_type = kTfLiteArenaRw;
+ auto im2col_status = context->ResizeTensor(context, im2col, im2col_size);
+ if (im2col_status != kTfLiteOk) return im2col_status;
+ }
+
+ if (data->need_hwcn_weights) {
+ node->temporaries->data[data->hwcn_weights_index] = data->hwcn_weights_id;
+ TfLiteIntArray* hwcn_weights_size = TfLiteIntArrayCreate(2);
+
+ // Because we're treating the filter weights as a matrix when we do the
+ // transpose, we allocate the buffer with a two-dimensional shape, where one
+ // dimension is the number of elements in each filter, and the second is the
+ // total number of filters.
+ int input_depth = input->dims->data[3];
+ hwcn_weights_size->data[0] = (filter_height * filter_width * input_depth);
+ hwcn_weights_size->data[1] = channels_out;
+
+ TfLiteTensor* hwcn_weights =
+ &context->tensors[node->temporaries->data[data->hwcn_weights_index]];
+ hwcn_weights->type = data_type;
+ hwcn_weights->allocation_type = kTfLiteDynamic;
+ // Make sure we release any previous allocations before we reallocate.
+ // TODO(petewarden): Persistent arenas would be a better fit for this, but
+ // they aren't fully implemented yet.
+ if (hwcn_weights->data.raw) {
+ free(hwcn_weights->data.raw);
+ hwcn_weights->data.raw = nullptr;
+ }
+ auto hwcn_weights_status =
+ context->ResizeTensor(context, hwcn_weights, hwcn_weights_size);
+ if (hwcn_weights_status != kTfLiteOk) return hwcn_weights_status;
+ hwcn_weights->data.raw = static_cast<char*>(malloc(hwcn_weights->bytes));
+
+ // TODO(petewarden): If Resize() is called when the size hasn't actually
+ // changed, this will do extra redundant work.
+ data->have_weights_been_transposed = false;
+ }
+
+ return kTfLiteOk;
+}
+
+template <KernelType kernel_type>
+void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
+ TfLiteConvParams* params, OpData* data, TfLiteTensor* input,
+ TfLiteTensor* filter, TfLiteTensor* bias,
+ TfLiteTensor* im2col, TfLiteTensor* hwcn_weights,
+ TfLiteTensor* output) {
+ gemmlowp::GemmContext* gemm_context = gemm_support::GetFromContext(context);
+
+ auto input_offset = -input->params.zero_point;
+ auto filter_offset = -filter->params.zero_point;
+ auto output_offset = output->params.zero_point;
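+  // In the affine quantization scheme real_value ~= scale * (quantized_value -
+  // zero_point), which is why the negated zero points of the input and filter
+  // are handed to the kernels as offsets.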
+
+ if (kernel_type == kReference) {
+ reference_ops::Conv(
+ GetTensorData<uint8_t>(input), GetTensorDims(input), input_offset,
+ GetTensorData<uint8_t>(filter), GetTensorDims(filter), filter_offset,
+ GetTensorData<int32_t>(bias), GetTensorDims(bias), params->stride_width,
+ params->stride_height, data->padding.width, data->padding.height,
+ output_offset, data->output_multiplier, data->output_shift,
+ data->output_activation_min, data->output_activation_max,
+ GetTensorData<uint8_t>(output), GetTensorDims(output),
+ GetTensorData<uint8_t>(im2col), GetTensorDims(im2col), gemm_context);
+ } else {
+ optimized_ops::Conv(
+ GetTensorData<uint8_t>(input), GetTensorDims(input), input_offset,
+ GetTensorData<uint8_t>(filter), GetTensorDims(filter), filter_offset,
+ GetTensorData<int32_t>(bias), GetTensorDims(bias), params->stride_width,
+ params->stride_height, data->padding.width, data->padding.height,
+ output_offset, data->output_multiplier, data->output_shift,
+ data->output_activation_min, data->output_activation_max,
+ GetTensorData<uint8_t>(output), GetTensorDims(output),
+ GetTensorData<uint8_t>(im2col), GetTensorDims(im2col), gemm_context);
+ }
+}
+
+template <KernelType kernel_type>
+void EvalFloat(TfLiteContext* context, TfLiteNode* node,
+ TfLiteConvParams* params, OpData* data, TfLiteTensor* input,
+ TfLiteTensor* filter, TfLiteTensor* bias, TfLiteTensor* im2col,
+ TfLiteTensor* hwcn_weights, TfLiteTensor* output) {
+ float output_activation_min, output_activation_max;
+ CalculateActivationRangeFloat(params->activation, &output_activation_min,
+ &output_activation_max);
+
+ const float* filter_data;
+ if (data->need_hwcn_weights) {
+ filter_data = GetTensorData<float>(hwcn_weights);
+ } else {
+ filter_data = GetTensorData<float>(filter);
+ }
+
+ if (kernel_type == kReference) {
+ reference_ops::Conv(
+ GetTensorData<float>(input), GetTensorDims(input), filter_data,
+ GetTensorDims(filter), GetTensorData<float>(bias), GetTensorDims(bias),
+ params->stride_width, params->stride_height, data->padding.width,
+ data->padding.height, output_activation_min, output_activation_max,
+ GetTensorData<float>(output), GetTensorDims(output),
+ GetTensorData<float>(im2col), GetTensorDims(im2col));
+ } else {
+ multithreaded_ops::Conv(
+ GetTensorData<float>(input), GetTensorDims(input), filter_data,
+ GetTensorDims(filter), GetTensorData<float>(bias), GetTensorDims(bias),
+ params->stride_width, params->stride_height, data->padding.width,
+ data->padding.height, params->padding, output_activation_min,
+ output_activation_max, GetTensorData<float>(output),
+ GetTensorDims(output), GetTensorData<float>(im2col),
+ GetTensorDims(im2col));
+ }
+}
+
+template <KernelType kernel_type>
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+ auto* params = reinterpret_cast<TfLiteConvParams*>(node->builtin_data);
+ OpData* data = reinterpret_cast<OpData*>(node->user_data);
+
+ TfLiteTensor* output = &context->tensors[node->outputs->data[0]];
+ TfLiteTensor* input = &context->tensors[node->inputs->data[0]];
+ TfLiteTensor* filter = &context->tensors[node->inputs->data[1]];
+ bool hasBias = node->inputs->size == 3;
+ TfLiteTensor* bias =
+ hasBias ? &context->tensors[node->inputs->data[2]] : nullptr;
+ TfLiteTensor* im2col =
+ data->need_im2col
+ ? &context->tensors[node->temporaries->data[data->im2col_index]]
+ : nullptr;
+ TfLiteTensor* hwcn_weights =
+ data->need_hwcn_weights
+ ? &context->tensors[node->temporaries->data[data->hwcn_weights_index]]
+ : nullptr;
+
+ if (data->need_hwcn_weights && !data->have_weights_been_transposed) {
+ TransposeFloatTensor(filter, hwcn_weights);
+ data->have_weights_been_transposed = true;
+ }
+
+ // TODO(aselle): Consider whether float conv and quantized conv should be
+ // separate ops to avoid dispatch overhead here.
+  switch (input->type) {  // Already know in/out types are the same.
+ case kTfLiteFloat32:
+ EvalFloat<kernel_type>(context, node, params, data, input, filter, bias,
+ im2col, hwcn_weights, output);
+ break;
+ case kTfLiteUInt8:
+ EvalQuantized<kernel_type>(context, node, params, data, input, filter,
+ bias, im2col, hwcn_weights, output);
+ break;
+ default:
+ context->ReportError(context, "Type not currently supported.");
+ return kTfLiteError;
+ }
+ return kTfLiteOk;
+}
+
+} // namespace conv
+
+TfLiteRegistration* Register_CONVOLUTION_REF() {
+ static TfLiteRegistration r = {conv::Init, conv::Free, conv::Prepare,
+ conv::Eval<conv::kReference>};
+ return &r;
+}
+
+TfLiteRegistration* Register_CONVOLUTION_GENERIC_OPT() {
+ static TfLiteRegistration r = {conv::Init, conv::Free, conv::Prepare,
+ conv::Eval<conv::kGenericOptimized>};
+ return &r;
+}
+
+TfLiteRegistration* Register_CONVOLUTION_NEON_OPT() {
+ static TfLiteRegistration r = {conv::Init, conv::Free, conv::Prepare,
+ conv::Eval<conv::kNeonOptimized>};
+ return &r;
+}
+
+TfLiteRegistration* Register_CONV_2D() {
+#ifdef USE_NEON
+ return Register_CONVOLUTION_NEON_OPT();
+#else
+ return Register_CONVOLUTION_GENERIC_OPT();
+#endif
+}
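+
+// Illustrative usage (hypothetical caller code, not part of this kernel): a
+// MutableOpResolver would typically map the CONV_2D builtin to this
+// registration, e.g.
+//   resolver->AddBuiltin(BuiltinOperator_CONV_2D, Register_CONV_2D());
+// so the interpreter picks up the NEON or generic kernel selected above.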
+
+} // namespace builtin
+} // namespace ops
+} // namespace tflite