Diffstat (limited to 'tensorflow/contrib/quantization/kernels/quantized_conv_ops.cc')
-rw-r--r--   tensorflow/contrib/quantization/kernels/quantized_conv_ops.cc   526
1 file changed, 0 insertions, 526 deletions
diff --git a/tensorflow/contrib/quantization/kernels/quantized_conv_ops.cc b/tensorflow/contrib/quantization/kernels/quantized_conv_ops.cc
deleted file mode 100644
index b25bff45a1..0000000000
--- a/tensorflow/contrib/quantization/kernels/quantized_conv_ops.cc
+++ /dev/null
@@ -1,526 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// Implements quantized eight-bit versions of the convolution operations.
-
-#include <algorithm>
-#include <vector>
-
-#include "public/gemmlowp.h"
-#include "tensorflow/contrib/quantization/kernels/quantization_utils.h"
-#include "tensorflow/contrib/quantization/kernels/reference_gemm.h"
-#include "tensorflow/core/framework/op_kernel.h"
-#include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/kernels/ops_util.h"
-#include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/core/util/padding.h"
-
-namespace tensorflow {
-
-// This functor implements the convolution operation in as simple a form as
-// possible. It won't give great performance, but it is very useful for
-// stepping through and instrumenting for debugging, creating minimal benchmarks
-// to prototype with, and sharing with teams that want to run this outside of
-// our environment.
-// With that in mind, I've avoided using anything except pretty standard C++
-// types. This is especially noticeable in the data access through raw array
-// indexing. It's deliberate in this case though, since it makes the underlying
-// memory order very explicit, which is important for both inspecting memory
-// contents during debugging and for specifying what we expect to others.
-// The memory layout of the data is, from biggest stride to smallest:
-// input_data = [input_batches, input_height, input_width, input_depth]
-// filter_data = [filter_height, filter_width, input_depth, filter_count]
-// output_data = [input_batches, output_height, output_width, filter_count]
-template <class T1, class T2, class T3>
-class ReferenceConvFunctor {
- public:
- void operator()(OpKernelContext* op_context, const T1* input_data,
- int input_batches, int input_height, int input_width,
- int input_depth, int input_offset, const T2* filter_data,
- int filter_height, int filter_width, int filter_count,
- int filter_offset, int stride, Padding padding,
- T3* output_data, int output_height, int output_width,
- int output_shift, int output_offset, int output_mult) {
- // Set up some constants we need for the output down-shifting and
- // saturation.
- const int32 highest = static_cast<int32>(Eigen::NumTraits<T3>::highest());
- const int32 lowest = static_cast<int32>(Eigen::NumTraits<T3>::lowest());
-
- // When we're converting the 32 bit accumulator to a lower bit depth, we
- // need to add on 0.5 in fixed-point terms to make the operation round half
- // up towards positive infinity, rather than a floor.
- // We also need to watch out for the case when there's no down shift,
- // because a left shift by a negative number gives undefined results.
- const int32 rounding = (output_shift < 1) ? 0 : (1 << (output_shift - 1));
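- // For example, with output_offset == 0, output_mult == 1, and
- // output_shift == 3, the rounding term is 4, so a raw total of 12 becomes
- // (12 + 4) >> 3 == 2, i.e. 1.5 rounded half up to 2.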
-
- // The two different padding modes we support can be a bit confusing. SAME
- // means we're trying to produce an output image that's the same size as the
- // input. It's complicated by stride, which shrinks the output image by a
- // factor, but it means we end up sampling from outside the borders of the
- // input. These out-of-bounds values are read as zeroes. VALID means only
- // produce output values where the filters can read all their values from
- // within the input image. It effectively removes the margins of the output
- // image compared to the one produced by SAME. Stride complicates this
- // definition though, because it can result in the right and bottom filter
- // patches sampling from outside the borders if it's greater than 1.
- // Most of the logic for sorting this all out is done before this function,
- // when we calculate the output size, but the positioning of the origin of
- // the filters is different between the two modes, since SAME positions the
- // first filter off the edge of the input.
- int filter_left_offset;
- int filter_top_offset;
- if (padding == VALID) {
- filter_left_offset =
- ((output_width - 1) * stride + filter_width - input_width) / 2;
- filter_top_offset =
- ((output_height - 1) * stride + filter_height - input_height) / 2;
- } else {
- filter_left_offset =
- ((output_width - 1) * stride + filter_width - input_width) / 2;
- filter_top_offset =
- ((output_height - 1) * stride + filter_height - input_height) / 2;
- }
-
- // If we've got multiple images in our input, work through each of them.
- for (int batch = 0; batch < input_batches; ++batch) {
- // Walk through all the output image values, sliding the filter to
- // different positions in the input.
- for (int out_y = 0; out_y < output_height; ++out_y) {
- for (int out_x = 0; out_x < output_width; ++out_x) {
- // Each filter kernel produces one output channel.
- for (int out_channel = 0; out_channel < filter_count; ++out_channel) {
- // We're going to calculate a single output value, which means we
- // need to multiply a three dimensional kernel of weights against
- // the current location within the input image.
- /*
- *-------------------------------...
- |\ ^
- | \in_depth
- | \ v
- | *-------------------------------...
- | | ^
- | | in_y_origin
- | | v \
- | |<in_x_origin>*---*^
- | | \| |filter_height
- . | *---*v
- . | <--->
- . filter_width
- .
- */
- const int in_x_origin = (out_x * stride) - filter_left_offset;
- const int in_y_origin = (out_y * stride) - filter_top_offset;
- int32 total = 0;
- for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
- for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
- for (int in_channel = 0; in_channel < input_depth;
- ++in_channel) {
- const int in_x = in_x_origin + filter_x;
- const int in_y = in_y_origin + filter_y;
- int32 input_value;
- // If the location is outside the bounds of the input image,
- // use zero as a default value.
- if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
- (in_y < input_height)) {
- const T1 input_source_value =
- input_data[(batch * input_height * input_width *
- input_depth) +
- (in_y * input_width * input_depth) +
- (in_x * input_depth) + in_channel];
- // We're promoting the T1 type to a higher bit depth here as
- // we do the subtraction.
- input_value =
- static_cast<int32>(input_source_value) - input_offset;
- } else {
- input_value = 0;
- }
- const T2 filter_source_value =
- filter_data[(filter_y * filter_width * input_depth *
- filter_count) +
- (filter_x * input_depth * filter_count) +
- (in_channel * filter_count) + out_channel];
- // Another promotion to 32 bit, as above.
- const int32 filter_value =
- static_cast<int32>(filter_source_value) - filter_offset;
- total += (input_value * filter_value);
- }
- }
- }
- // Here we're applying scale factors to compress the 32 bit
- // accumulated total to a potentially lower bit depth.
- const int32_t output =
- ((((total + output_offset) * output_mult) + rounding) >>
- output_shift);
- // We need to saturate the results against the largest and smallest
- // values that can be represented in this type.
- const int32 top_clamped_output = std::min(output, highest);
- const int32 clamped_output = std::max(top_clamped_output, lowest);
- output_data[(batch * output_height * output_width * filter_count) +
- (out_y * output_width * filter_count) +
- (out_x * filter_count) + out_channel] = clamped_output;
- }
- }
- }
- }
- }
-};
-
-// Implements convolution as a two stage process, first packing the patches of
-// the input image into columns (im2col) and then running GEMM to produce the
-// final result.
-// TODO(petewarden) - We need to update gemmlowp to support 32-bit outputs
-// before we can re-enable this path.
-template <class T1, class T2, class T3>
-class Im2ColConvFunctor {
- public:
- void operator()(OpKernelContext* op_context, const T1* input_data,
- int input_batches, int input_height, int input_width,
- int input_depth, int input_offset, const T2* filter_data,
- int filter_height, int filter_width, int filter_count,
- int filter_offset, int stride, Padding padding,
- T3* output_data, int output_height, int output_width,
- int output_shift, int output_offset, int output_mult) {
- if (input_offset < 0) {
- // Only log the first few occurrences of this warning.
- static int warning_count = 0;
- if (warning_count < 10) {
- ++warning_count;
- LOG(WARNING)
- << "Zero is not representable in the quantized range used by the"
- << " input. This means QuantizedConv2d has to fall back to a slow"
- << " implementation, since the border of zero values can't be"
- << " represented easily. You should try to construct graphs that"
- << " avoid this situation.";
- }
- ReferenceConvFunctor<T1, T2, T3> conv_functor;
- conv_functor(op_context, input_data, input_batches, input_height,
- input_width, input_depth, input_offset, filter_data,
- filter_height, filter_width, filter_count, filter_offset,
- stride, padding, output_data, output_height, output_width,
- output_shift, output_offset, output_mult);
- return;
- }
-
- CHECK_GT(output_width, 0);
- CHECK_GT(output_height, 0);
- int filter_left_offset;
- int filter_top_offset;
- if (padding == VALID) {
- filter_left_offset =
- ((output_width - 1) * stride + filter_width - input_width) / 2;
- filter_top_offset =
- ((output_height - 1) * stride + filter_height - input_height) / 2;
- } else {
- filter_left_offset =
- ((output_width - 1) * stride + filter_width - input_width) / 2;
- filter_top_offset =
- ((output_height - 1) * stride + filter_height - input_height) / 2;
- }
-
- // The im2col buffer has (# of patches) rows and (# of filter values) cols.
- // It's laid out like this, in row major order in memory:
- // < filter value count >
- // ^ +---------------------+
- // patch | |
- // count | |
- // v +---------------------+
- // Each patch row contains a filter_width x filter_height patch of the
- // input, with the depth channel as the most contiguous in memory, followed
- // by the width, then the height. This is the standard memory order in the
- // image world if it helps to visualize it.
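- // Concretely, the value for (patch, filter_y, filter_x, in_channel) lives
- // at row-major index
- // patch * filter_value_count +
- // (filter_y * filter_width + filter_x) * input_depth + in_channel.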
- const int filter_value_count = filter_width * filter_height * input_depth;
- const int patch_count = input_batches * output_width * output_height;
- const int im2col_size = patch_count * filter_value_count;
- // TODO(petewarden) - Memory allocation can be very slow on Android. Can we
- // optimize this by keeping the scratch buffer around?
- std::unique_ptr<T1[]> im2col_buffer(new T1[im2col_size]);
-
- for (int batch = 0; batch < input_batches; ++batch) {
- const T1* input_batch_start =
- input_data + (batch * input_height * input_width * input_depth);
- for (int out_y = 0; out_y < output_height; ++out_y) {
- const int in_y_origin = (out_y * stride) - filter_top_offset;
- for (int out_x = 0; out_x < output_width; ++out_x) {
- const int in_x_origin = (out_x * stride) - filter_left_offset;
- const int patch_index = (batch * output_width * output_height) +
- (out_y * output_width) + out_x;
- T1* im2col_patch_start =
- im2col_buffer.get() + (patch_index * filter_value_count);
- for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
- const int in_y = in_y_origin + filter_y;
- T1* im2col_row_start =
- im2col_patch_start + (filter_y * filter_width * input_depth);
- // If we're off the top or the bottom of the input, fill the whole
- // row with zeroes.
- if ((in_y < 0) || (in_y >= input_height)) {
- T1* im2col_row_end =
- im2col_row_start + (filter_width * input_depth);
- // We'll be subtracting this offset during the calculations,
- // so to get an actual zero after that bias we need to set
- // it to input_offset here.
- std::fill(im2col_row_start, im2col_row_end, input_offset);
- } else {
- // What we're doing here is trying to copy and fill the im2col
- // buffer as efficiently as possible, using functions to set or
- // duplicate values en masse. We know we don't have to worry about
- // vertical edges because we dealt with that case above, so we
- // just need to handle filters that overlap the left or right
- // edges. Here's what that looks like:
- //
- // < left_zero_count > < center_copy_count > < right_zero_count >
- // +------------------+---------------------+--------------------+
- // | (filter) | (image) | (filter) |
- // +------------------+---------------------+--------------------+
- // in_x_origin 0 input_width in_x_end
- //
- // In reality it's unlikely that a filter patch will be wider
- // than an input, but this shows all the edge cases.
- // We use std::fill() to set the left and right sections to zeroes
- // and std::copy() to copy over the input data for the center.
- const int in_x_end = in_x_origin + filter_width;
- const int left_zero_count = std::max(0, 0 - in_x_origin);
- const int right_zero_count = std::max(0, in_x_end - input_width);
- const int center_copy_count =
- filter_width - (left_zero_count + right_zero_count);
- if (left_zero_count > 0) {
- T1* im2col_left_start = im2col_row_start;
- T1* im2col_left_end =
- im2col_left_start + (left_zero_count * input_depth);
- std::fill(im2col_left_start, im2col_left_end, input_offset);
- }
- if (center_copy_count > 0) {
- const T1* input_row_start =
- input_batch_start + (in_y * input_width * input_depth) +
- (std::max(0, in_x_origin) * input_depth);
- const T1* input_row_end =
- input_row_start + (center_copy_count * input_depth);
- T1* im2col_center_start =
- im2col_row_start + (left_zero_count * input_depth);
- std::copy(input_row_start, input_row_end, im2col_center_start);
- }
- if (right_zero_count > 0) {
- T1* im2col_right_start =
- im2col_row_start +
- ((left_zero_count + center_copy_count) * input_depth);
- T1* im2col_right_end =
- im2col_right_start + (right_zero_count * input_depth);
- std::fill(im2col_right_start, im2col_right_end, input_offset);
- }
- }
- }
- }
- }
- }
-
- CHECK_GT(patch_count, 0);
- CHECK_GT(filter_count, 0);
- CHECK_GT(filter_value_count, 0);
-
- const bool transpose_a = false;
- const bool transpose_b = false;
- const bool transpose_c = false;
- const int m = patch_count;
- const int n = filter_count;
- const int k = filter_value_count;
- const int lda = filter_value_count;
- const int ldb = filter_count;
- const int ldc = filter_count;
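- // Viewed as a matrix product, this multiplies the
- // [patch_count x filter_value_count] im2col buffer by the
- // [filter_value_count x filter_count] filter matrix to produce the
- // [patch_count x filter_count] output.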
- // The gemmlowp optimized library only works for a particular set of data
- // types, so check if we meet those requirements and
- // fall back to a slower reference implementation if not.
- if (std::is_same<T1, quint8>() && std::is_same<T2, quint8>() &&
- std::is_same<T3, qint32>() && (output_offset == 0) &&
- (output_mult == 1) && (output_shift == 0)) {
- const uint8* im2col_data_as_uint8 = &(im2col_buffer.get()->value);
- const uint8* filter_data_as_uint8 = &(filter_data->value);
- int32* output_data_as_int32 = &(output_data->value);
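- // quint8 and qint32 are one-member wrappers around uint8 and int32, so
- // taking the address of their value field exposes the raw buffers that
- // gemmlowp expects.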
- // All of the transpose_* variables are currently compile-time consts, so
- // we could just hard-code these values too, but that would break if
- // anybody changed those values in the future (e.g. to match the ability
- // of MatMul to specify them as attributes). We're using a verbose
- // approach of deriving the order values from the transpose variables to
- // be able to catch any changes like that.
- static const gemmlowp::MapOrder ResultOrder =
- !transpose_c ? gemmlowp::MapOrder::RowMajor
- : gemmlowp::MapOrder::ColMajor;
- static const gemmlowp::MapOrder LhsOrder =
- !transpose_a ? gemmlowp::MapOrder::RowMajor
- : gemmlowp::MapOrder::ColMajor;
- static const gemmlowp::MapOrder RhsOrder =
- !transpose_b ? gemmlowp::MapOrder::RowMajor
- : gemmlowp::MapOrder::ColMajor;
- gemmlowp::MatrixMap<const std::uint8_t, LhsOrder> lhs(
- im2col_data_as_uint8, m, k, lda);
- gemmlowp::MatrixMap<const std::uint8_t, RhsOrder> rhs(
- filter_data_as_uint8, k, n, ldb);
- gemmlowp::MatrixMap<std::int32_t, ResultOrder> result(
- output_data_as_int32, m, n, ldc);
- const std::tuple<> empty_pipeline = {};
-
- auto& worker_threads =
- *(op_context->device()->tensorflow_cpu_worker_threads());
- TensorflowGemmContext context(worker_threads.num_threads,
- worker_threads.workers);
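- // gemmlowp adds the given lhs/rhs offsets to each matrix entry before
- // multiplying, so passing the negated input and filter offsets reproduces
- // the (value - offset) arithmetic used by the reference path.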
- gemmlowp::GemmWithOutputPipeline<std::uint8_t, std::int32_t,
- gemmlowp::DefaultL8R8BitDepthParams>(
- &context, lhs, rhs, &result, -input_offset, -filter_offset,
- empty_pipeline);
- } else {
- ReferenceGemm<T1, T2, T3>(transpose_a, transpose_b, transpose_c, m, n, k,
- im2col_buffer.get(), input_offset, lda,
- filter_data, filter_offset, ldb, output_data,
- output_shift, output_offset, output_mult, ldc);
- }
- }
-};
-
-template <class T1, class T2, class T3,
- template <class TF1, class TF2, class TF3> class ConvFunctor>
-class QuantizedConv2DOp : public OpKernel {
- public:
- explicit QuantizedConv2DOp(OpKernelConstruction* context)
- : OpKernel(context) {
- OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
- OP_REQUIRES(context, strides_.size() == 4,
- errors::InvalidArgument("Sliding window strides field must "
- "specify 4 dimensions"));
- OP_REQUIRES(context, strides_[1] == strides_[2],
- errors::InvalidArgument(
- "Current implementation only supports equal length "
- "strides in the row and column dimensions."));
- OP_REQUIRES(
- context, (strides_[0] == 1 && strides_[3] == 1),
- errors::InvalidArgument("Current implementation does not yet support "
- "strides in the batch and depth dimensions."));
- OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
- }
-
- void Compute(OpKernelContext* context) override {
- // Input tensor is of the following dimensions:
- // [ batch, in_rows, in_cols, in_depth ]
- const Tensor& input = context->input(0);
-
- // Input filter is of the following dimensions:
- // [ filter_rows, filter_cols, in_depth, out_depth]
- const Tensor& filter = context->input(1);
-
- // For 2D convolution, there should be 4 dimensions.
- OP_REQUIRES(context, input.dims() == 4,
- errors::InvalidArgument("input must be 4-dimensional",
- input.shape().DebugString()));
- OP_REQUIRES(context, filter.dims() == 4,
- errors::InvalidArgument("filter must be 4-dimensional: ",
- filter.shape().DebugString()));
-
- const float min_input = context->input(2).flat<float>()(0);
- const float max_input = context->input(3).flat<float>()(0);
- const float min_filter = context->input(4).flat<float>()(0);
- const float max_filter = context->input(5).flat<float>()(0);
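- // offset_input and offset_filter are the quantized values that represent
- // 0.0f in each tensor's float range; the convolution functors subtract
- // them so that the accumulation is done on zero-centered values. The
- // output is produced as raw 32-bit accumulators, so no additional offset,
- // multiplier, or shift is applied at this stage.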
- const int32 offset_input =
- FloatToQuantizedUnclamped<T1>(0.0f, min_input, max_input);
- const int32 offset_filter =
- FloatToQuantizedUnclamped<T2>(0.0f, min_filter, max_filter);
- const int32 offset_output = 0;
- const int32 mult_output = 1;
- const int32 shift_output = 0;
-
- // The last dimension for input is in_depth. It must be the same as the
- // filter's in_depth.
- const int64 in_depth = input.dim_size(3);
- OP_REQUIRES(
- context, in_depth == filter.dim_size(2),
- errors::InvalidArgument("input and filter must have the same depth: ",
- in_depth, " vs ", filter.dim_size(2)));
-
- // The last dimension for filter is out_depth.
- const int64 out_depth = filter.dim_size(3);
-
- // The second dimension for input is rows/height.
- // The first dimension for filter is rows/height.
- const int64 input_rows = input.dim_size(1);
- const int64 filter_rows = filter.dim_size(0);
-
- // The third dimension for input is columns/width.
- // The second dimension for filter is columns/width.
- const int64 input_cols = input.dim_size(2);
- const int64 filter_cols = filter.dim_size(1);
-
- // The first dimension for input is batch.
- const int64 batch = input.dim_size(0);
-
- // For now we take the stride from the second dimension only (we
- // assume row = col stride, and do not support striding on the
- // batch or depth dimension).
- const int stride = strides_[1];
-
- int64 out_rows = 0, out_cols = 0, pad_rows = 0, pad_cols = 0;
- OP_REQUIRES_OK(context,
- GetWindowedOutputSize(input_rows, filter_rows, stride,
- padding_, &out_rows, &pad_rows));
- OP_REQUIRES_OK(context,
- GetWindowedOutputSize(input_cols, filter_cols, stride,
- padding_, &out_cols, &pad_cols));
- CHECK_GT(batch, 0);
- CHECK_GT(out_rows, 0);
- CHECK_GT(out_cols, 0);
- CHECK_GT(out_depth, 0);
- TensorShape out_shape({batch, out_rows, out_cols, out_depth});
-
- // Output tensor is of the following dimensions:
- // [ in_batch, out_rows, out_cols, out_depth ]
- Tensor* output = nullptr;
- OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));
-
- // This will call different implementations (e.g. reference or optimized)
- // depending on the template parameter.
- ConvFunctor<T1, T2, T3> conv_functor;
- conv_functor(context, input.flat<T1>().data(), batch, input_rows,
- input_cols, in_depth, offset_input, filter.flat<T2>().data(),
- filter_rows, filter_cols, out_depth, offset_filter, stride,
- padding_, output->flat<T3>().data(), out_rows, out_cols,
- shift_output, offset_output, mult_output);
-
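- // The float range of the 32-bit accumulated output is implied by the
- // input and filter ranges (it is the range of their product), so it is
- // derived from those min/max values rather than measured from the data.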
- float min_output_value;
- float max_output_value;
- QuantizationRangeForMultiplication<T1, T2, T3>(
- min_input, max_input, min_filter, max_filter, &min_output_value,
- &max_output_value);
-
- Tensor* output_min = nullptr;
- OP_REQUIRES_OK(context, context->allocate_output(1, {}, &output_min));
- output_min->flat<float>()(0) = min_output_value;
-
- Tensor* output_max = nullptr;
- OP_REQUIRES_OK(context, context->allocate_output(2, {}, &output_max));
- output_max->flat<float>()(0) = max_output_value;
- }
-
- private:
- std::vector<int32> strides_;
- Padding padding_;
-};
-
-// Right now we only support taking two eight-bit inputs, and returning the
-// results as signed 32-bit integers.
-REGISTER_KERNEL_BUILDER(
- Name("QuantizedConv2D")
- .Device(DEVICE_CPU)
- .TypeConstraint<quint8>("Tinput")
- .TypeConstraint<quint8>("Tfilter")
- .TypeConstraint<qint32>("out_type"),
- QuantizedConv2DOp<quint8, quint8, qint32, Im2ColConvFunctor>);
-
-} // namespace tensorflow