diff options
Diffstat (limited to 'tensorflow/contrib/quantization/kernels/quantized_conv_ops.cc')
-rw-r--r-- | tensorflow/contrib/quantization/kernels/quantized_conv_ops.cc | 526 |
1 files changed, 0 insertions, 526 deletions
diff --git a/tensorflow/contrib/quantization/kernels/quantized_conv_ops.cc b/tensorflow/contrib/quantization/kernels/quantized_conv_ops.cc deleted file mode 100644 index b25bff45a1..0000000000 --- a/tensorflow/contrib/quantization/kernels/quantized_conv_ops.cc +++ /dev/null @@ -1,526 +0,0 @@ -/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -// Implements quantized eight-bit versions of the convolution operations. - -#include <algorithm> -#include <vector> - -#include "public/gemmlowp.h" -#include "tensorflow/contrib/quantization/kernels/quantization_utils.h" -#include "tensorflow/contrib/quantization/kernels/reference_gemm.h" -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/framework/tensor.h" -#include "tensorflow/core/kernels/ops_util.h" -#include "tensorflow/core/lib/core/errors.h" -#include "tensorflow/core/util/padding.h" - -namespace tensorflow { - -// This functor implements the convolution operation in as simple a form as -// possible. It won't give great performance, but it is very useful for -// stepping through and instrumenting for debugging, creating minimal benchmarks -// to prototype with, and sharing with teams that want to run this outside of -// our environment. -// With that in mind, I've avoided using anything except pretty standard C++ -// types. This is especially noticeable in the data access through raw array -// indexing. It's deliberate in this case though, since it makes the underlying -// memory order very explicit, which is important for both inspecting memory -// contents during debugging and for specifying what we expect to others. -// The memory layout of the data is, from biggest stride to smallest: -// input_data = [input_batches, input_height, input_width, input_depth] -// filter_data = [filter_height, filter_width, input_depth, filter_count] -// output_data = [input_batches, output_height, output_width, filter_count] -template <class T1, class T2, class T3> -class ReferenceConvFunctor { - public: - void operator()(OpKernelContext* op_context, const T1* input_data, - int input_batches, int input_height, int input_width, - int input_depth, int input_offset, const T2* filter_data, - int filter_height, int filter_width, int filter_count, - int filter_offset, int stride, Padding padding, - T3* output_data, int output_height, int output_width, - int output_shift, int output_offset, int output_mult) { - // Set up some constants we need for the output down-shifting and - // saturation. - const int32 highest = static_cast<int32>(Eigen::NumTraits<T3>::highest()); - const int32 lowest = static_cast<int32>(Eigen::NumTraits<T3>::lowest()); - - // When we're converting the 32 bit accumulator to a lower bit depth, we - // need to add on 0.5 in fixed-point terms to make the operation round half - // up towards positive infinity, rather than a floor. - // We also need to watch out for the case when there's no down shift, - // because a left shift by a negative number gives undefined results. - const int32 rounding = (output_shift < 1) ? 0 : (1 << (output_shift - 1)); - - // The two different padding modes we support can be a bit confusing. SAME - // means we're trying to produce an output image that's the same size as the - // input. It's complicated by stride, which shrinks the output image by a - // a factor, but it means we end up sampling from outside the borders of the - // input. These out-of-bounds values are read as zeroes. VALID means only - // produce output values where the filters can read all their values from - // within the input image. It effectively removes the margins of the output - // image compared to the one produced by SAME. Stride complicates this - // definition though, because it can result in the right and bottom filter - // patches sampling from outside the borders if it's greater than 1. - // Most of the logic for sorting this all out is done before this function, - // when we calculate the output size, but the positioning of the origin of - // the filters is different between the two modes, since SAME positions the - // first filter off the edge of the input. - int filter_left_offset; - int filter_top_offset; - if (padding == VALID) { - filter_left_offset = - ((output_width - 1) * stride + filter_width - input_width) / 2; - filter_top_offset = - ((output_height - 1) * stride + filter_height - input_height) / 2; - } else { - filter_left_offset = - ((output_width - 1) * stride + filter_width - input_width) / 2; - filter_top_offset = - ((output_height - 1) * stride + filter_height - input_height) / 2; - } - - // If we've got multiple images in our input, work through each of them. - for (int batch = 0; batch < input_batches; ++batch) { - // Walk through all the output image values, sliding the filter to - // different - // positions in the input. - for (int out_y = 0; out_y < output_height; ++out_y) { - for (int out_x = 0; out_x < output_width; ++out_x) { - // Each filter kernel produces one output channel. - for (int out_channel = 0; out_channel < filter_count; ++out_channel) { - // We're going to calculate a single output value, which means we - // need to multiply a three dimensional kernel of weights against - // the current location within the input image. - /* - *-------------------------------... - |\ ^ - | \in_depth - | \ v - | *-------------------------------... - | | ^ - | | in_y_origin - | | v \ - | |<in_x_origin>*---*^ - | | \| |filter_height - . | *---*v - . | <---> - . filter_width - . - */ - const int in_x_origin = (out_x * stride) - filter_left_offset; - const int in_y_origin = (out_y * stride) - filter_top_offset; - int32 total = 0; - for (int filter_y = 0; filter_y < filter_height; ++filter_y) { - for (int filter_x = 0; filter_x < filter_width; ++filter_x) { - for (int in_channel = 0; in_channel < input_depth; - ++in_channel) { - const int in_x = in_x_origin + filter_x; - const int in_y = in_y_origin + filter_y; - int32 input_value; - // If the location is outside the bounds of the input image, - // use zero as a default value. - if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) && - (in_y < input_height)) { - const T1 input_source_value = - input_data[(batch * input_height * input_width * - input_depth) + - (in_y * input_width * input_depth) + - (in_x * input_depth) + in_channel]; - // We're promoting the T1 type to a higher bit depth here as - // we do the subtraction. - input_value = - static_cast<int32>(input_source_value) - input_offset; - } else { - input_value = 0; - } - const T2 filter_source_value = - filter_data[(filter_y * filter_width * input_depth * - filter_count) + - (filter_x * input_depth * filter_count) + - (in_channel * filter_count) + out_channel]; - // Another promotion to 32 bit, as above. - const int32 filter_value = - static_cast<int32>(filter_source_value) - filter_offset; - total += (input_value * filter_value); - } - } - } - // Here we're applying scale factors to compress the 32 bit - // accumulated total to a potentially lower bit depth. - const int32_t output = - ((((total + output_offset) * output_mult) + rounding) >> - output_shift); - // We need to saturate the results against the largest and smallest - // values that can be represented in this type. - const int32 top_clamped_output = std::min(output, highest); - const int32 clamped_output = std::max(top_clamped_output, lowest); - output_data[(batch * output_height * output_width * filter_count) + - (out_y * output_width * filter_count) + - (out_x * filter_count) + out_channel] = clamped_output; - } - } - } - } - } -}; - -// Implements convolution as a two stage process, first packing the patches of -// the input image into columns (im2col) and then running GEMM to produce the -// final result. -// TODO(petewarden) - We need to update gemmlowp to support 32-bit outputs -// before we can re-enable this path. -template <class T1, class T2, class T3> -class Im2ColConvFunctor { - public: - void operator()(OpKernelContext* op_context, const T1* input_data, - int input_batches, int input_height, int input_width, - int input_depth, int input_offset, const T2* filter_data, - int filter_height, int filter_width, int filter_count, - int filter_offset, int stride, Padding padding, - T3* output_data, int output_height, int output_width, - int output_shift, int output_offset, int output_mult) { - if (input_offset < 0) { - // Only log the first few occurrences of this warning. - static int warning_count = 0; - if (warning_count < 10) { - ++warning_count; - LOG(WARNING) - << "Zero is not representable in the quantized range used by the" - << " input. This means QuantizedConv2d has to fall back to a slow" - << " implementation, since the border of zero values can't be" - << " represented easily. You should try to construct graphs that" - << " avoid this situation."; - } - ReferenceConvFunctor<T1, T2, T3> conv_functor; - conv_functor(op_context, input_data, input_batches, input_height, - input_width, input_depth, input_offset, filter_data, - filter_height, filter_width, filter_count, filter_offset, - stride, padding, output_data, output_height, output_width, - output_shift, output_offset, output_mult); - return; - } - - CHECK_GT(output_width, 0); - CHECK_GT(output_height, 0); - int filter_left_offset; - int filter_top_offset; - if (padding == VALID) { - filter_left_offset = - ((output_width - 1) * stride + filter_width - input_width) / 2; - filter_top_offset = - ((output_height - 1) * stride + filter_height - input_height) / 2; - } else { - filter_left_offset = - ((output_width - 1) * stride + filter_width - input_width) / 2; - filter_top_offset = - ((output_height - 1) * stride + filter_height - input_height) / 2; - } - - // The im2col buffer has # of patches rows, and # of filters cols. - // It's laid out like this, in row major order in memory: - // < filter value count > - // ^ +---------------------+ - // patch | | - // count | | - // v +---------------------+ - // Each patch row contains a filter_width x filter_height patch of the - // input, with the depth channel as the most contiguous in memory, followed - // by the width, then the height. This is the standard memory order in the - // image world if it helps to visualize it. - const int filter_value_count = filter_width * filter_height * input_depth; - const int patch_count = input_batches * output_width * output_height; - const int im2col_size = patch_count * filter_value_count; - // TODO(petewarden) - Memory allocation can be very slow on Android. Can we - // optimize this by keeping the scratch buffer around? - std::unique_ptr<T1[]> im2col_buffer(new T1[im2col_size]); - - for (int batch = 0; batch < input_batches; ++batch) { - const T1* input_batch_start = - input_data + (batch * input_height * input_width * input_depth); - for (int out_y = 0; out_y < output_height; ++out_y) { - const int in_y_origin = (out_y * stride) - filter_top_offset; - for (int out_x = 0; out_x < output_width; ++out_x) { - const int in_x_origin = (out_x * stride) - filter_left_offset; - const int patch_index = (batch * output_width * output_height) + - (out_y * output_width) + out_x; - T1* im2col_patch_start = - im2col_buffer.get() + (patch_index * filter_value_count); - for (int filter_y = 0; filter_y < filter_height; ++filter_y) { - const int in_y = in_y_origin + filter_y; - T1* im2col_row_start = - im2col_patch_start + (filter_y * filter_width * input_depth); - // If we're off the top or the bottom of the input, fill the whole - // row with zeroes. - if ((in_y < 0) || (in_y >= input_height)) { - T1* im2col_row_end = - im2col_row_start + (filter_width * input_depth); - // We'll be subtracting this offset during the calculations - // so to get an actual zero after that bias we need to set - // it to input_offset here. - std::fill(im2col_row_start, im2col_row_end, input_offset); - } else { - // What we're doing here is trying to copy and fill the im2col - // buffer as efficiently as possible, using functions to set or - // duplicate values en masse. We know we don't have to worry about - // vertical edges because we dealt with that case above, so we - // just need to handle filters that overlap the left or right - // edges. Here's what that looks like: - // - // < left_zero_count > < center_copy_count > < right_zero_count > - // +------------------+---------------------+--------------------+ - // | (filter) | (image) | (filter) | - // +------------------+---------------------+--------------------+ - // in_x_origin 0 input_width in_x_end - // - // In reality it's unlikely that a filter patch will be wider - // than an input, but this shows all the edge cases. - // We use std::fill() to set the left and right sections to zeroes - // and std::copy() to copy over the input data for the center. - const int in_x_end = in_x_origin + filter_width; - const int left_zero_count = std::max(0, 0 - in_x_origin); - const int right_zero_count = std::max(0, in_x_end - input_width); - const int center_copy_count = - filter_width - (left_zero_count + right_zero_count); - if (left_zero_count > 0) { - T1* im2col_left_start = im2col_row_start; - T1* im2col_left_end = - im2col_left_start + (left_zero_count * input_depth); - std::fill(im2col_left_start, im2col_left_end, input_offset); - } - if (center_copy_count > 0) { - const T1* input_row_start = - input_batch_start + (in_y * input_width * input_depth) + - (std::max(0, in_x_origin) * input_depth); - const T1* input_row_end = - input_row_start + (center_copy_count * input_depth); - T1* im2col_center_start = - im2col_row_start + (left_zero_count * input_depth); - std::copy(input_row_start, input_row_end, im2col_center_start); - } - if (right_zero_count > 0) { - T1* im2col_right_start = - im2col_row_start + - ((left_zero_count + center_copy_count) * input_depth); - T1* im2col_right_end = - im2col_right_start + (right_zero_count * input_depth); - std::fill(im2col_right_start, im2col_right_end, input_offset); - } - } - } - } - } - } - - CHECK_GT(patch_count, 0); - CHECK_GT(filter_count, 0); - CHECK_GT(filter_value_count, 0); - - const bool transpose_a = false; - const bool transpose_b = false; - const bool transpose_c = false; - const int m = patch_count; - const int n = filter_count; - const int k = filter_value_count; - const int lda = filter_value_count; - const int ldb = filter_count; - const int ldc = filter_count; - // The gemmlowp optimized library only works for a particular set of data - // types, so check if we meet those requirements and - // fall back to a slower reference implementation if not. - if (std::is_same<T1, quint8>() && std::is_same<T2, quint8>() && - std::is_same<T3, qint32>() && (output_offset == 0) && - (output_mult == 1) && (output_shift == 0)) { - const uint8* im2col_data_as_uint8 = &(im2col_buffer.get()->value); - const uint8* filter_data_as_uint8 = &(filter_data->value); - int32* output_data_as_int32 = &(output_data->value); - // All of the transpose_* variables are currently compile-time consts, so - // we could just hard-code these values too, but that would break if - // anybody changed those values in the future (e.g. to match the ability - // of MatMul to specify them as attributes). We're using a verbose - // approach of deriving the order values from the transpose variables to - // be able to catch any changes like that. - static const gemmlowp::MapOrder ResultOrder = - !transpose_c ? gemmlowp::MapOrder::RowMajor - : gemmlowp::MapOrder::ColMajor; - static const gemmlowp::MapOrder LhsOrder = - !transpose_a ? gemmlowp::MapOrder::RowMajor - : gemmlowp::MapOrder::ColMajor; - static const gemmlowp::MapOrder RhsOrder = - !transpose_b ? gemmlowp::MapOrder::RowMajor - : gemmlowp::MapOrder::ColMajor; - gemmlowp::MatrixMap<const std::uint8_t, LhsOrder> lhs( - im2col_data_as_uint8, m, k, lda); - gemmlowp::MatrixMap<const std::uint8_t, RhsOrder> rhs( - filter_data_as_uint8, k, n, ldb); - gemmlowp::MatrixMap<std::int32_t, ResultOrder> result( - output_data_as_int32, m, n, ldc); - const std::tuple<> empty_pipeline = {}; - - auto& worker_threads = - *(op_context->device()->tensorflow_cpu_worker_threads()); - TensorflowGemmContext context(worker_threads.num_threads, - worker_threads.workers); - gemmlowp::GemmWithOutputPipeline<std::uint8_t, std::int32_t, - gemmlowp::DefaultL8R8BitDepthParams>( - &context, lhs, rhs, &result, -input_offset, -filter_offset, - empty_pipeline); - } else { - ReferenceGemm<T1, T2, T3>(transpose_a, transpose_b, transpose_c, m, n, k, - im2col_buffer.get(), input_offset, lda, - filter_data, filter_offset, ldb, output_data, - output_shift, output_offset, output_mult, ldc); - } - } -}; - -template <class T1, class T2, class T3, - template <class TF1, class TF2, class TF3> class ConvFunctor> -class QuantizedConv2DOp : public OpKernel { - public: - explicit QuantizedConv2DOp(OpKernelConstruction* context) - : OpKernel(context) { - OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_)); - OP_REQUIRES(context, strides_.size() == 4, - errors::InvalidArgument("Sliding window strides field must " - "specify 4 dimensions")); - OP_REQUIRES(context, strides_[1] == strides_[2], - errors::InvalidArgument( - "Current implementation only supports equal length " - "strides in the row and column dimensions.")); - OP_REQUIRES( - context, (strides_[0] == 1 && strides_[3] == 1), - errors::InvalidArgument("Current implementation does not yet support " - "strides in the batch and depth dimensions.")); - OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_)); - } - - void Compute(OpKernelContext* context) override { - // Input tensor is of the following dimensions: - // [ batch, in_rows, in_cols, in_depth ] - const Tensor& input = context->input(0); - - // Input filter is of the following dimensions: - // [ filter_rows, filter_cols, in_depth, out_depth] - const Tensor& filter = context->input(1); - - // For 2D convolution, there should be 4 dimensions. - OP_REQUIRES(context, input.dims() == 4, - errors::InvalidArgument("input must be 4-dimensional", - input.shape().DebugString())); - OP_REQUIRES(context, filter.dims() == 4, - errors::InvalidArgument("filter must be 4-dimensional: ", - filter.shape().DebugString())); - - const float min_input = context->input(2).flat<float>()(0); - const float max_input = context->input(3).flat<float>()(0); - const float min_filter = context->input(4).flat<float>()(0); - const float max_filter = context->input(5).flat<float>()(0); - const int32 offset_input = - FloatToQuantizedUnclamped<T1>(0.0f, min_input, max_input); - const int32 offset_filter = - FloatToQuantizedUnclamped<T2>(0.0f, min_filter, max_filter); - const int32 offset_output = 0; - const int32 mult_output = 1; - const int32 shift_output = 0; - - // The last dimension for input is in_depth. It must be the same as the - // filter's in_depth. - const int64 in_depth = input.dim_size(3); - OP_REQUIRES( - context, in_depth == filter.dim_size(2), - errors::InvalidArgument("input and filter must have the same depth: ", - in_depth, " vs ", filter.dim_size(2))); - - // The last dimension for filter is out_depth. - const int64 out_depth = filter.dim_size(3); - - // The second dimension for input is rows/height. - // The first dimension for filter is rows/height. - const int64 input_rows = input.dim_size(1); - const int64 filter_rows = filter.dim_size(0); - - // The third dimension for input is columns/width. - // The second dimension for filter is columns/width. - const int64 input_cols = input.dim_size(2); - const int64 filter_cols = filter.dim_size(1); - - // The first dimension for input is batch. - const int64 batch = input.dim_size(0); - - // For now we take the stride from the second dimension only (we - // assume row = col stride, and do not support striding on the - // batch or depth dimension). - const int stride = strides_[1]; - - int64 out_rows = 0, out_cols = 0, pad_rows = 0, pad_cols = 0; - OP_REQUIRES_OK(context, - GetWindowedOutputSize(input_rows, filter_rows, stride, - padding_, &out_rows, &pad_rows)); - OP_REQUIRES_OK(context, - GetWindowedOutputSize(input_cols, filter_cols, stride, - padding_, &out_cols, &pad_cols)); - CHECK_GT(batch, 0); - CHECK_GT(out_rows, 0); - CHECK_GT(out_cols, 0); - CHECK_GT(out_depth, 0); - TensorShape out_shape({batch, out_rows, out_cols, out_depth}); - - // Output tensor is of the following dimensions: - // [ in_batch, out_rows, out_cols, out_depth ] - Tensor* output = nullptr; - OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output)); - - // This will call different implementations (e.g. reference or optimized) - // depending on the template parameter. - ConvFunctor<T1, T2, T3> conv_functor; - conv_functor(context, input.flat<T1>().data(), batch, input_rows, - input_cols, in_depth, offset_input, filter.flat<T2>().data(), - filter_rows, filter_cols, out_depth, offset_filter, stride, - padding_, output->flat<T3>().data(), out_rows, out_cols, - shift_output, offset_output, mult_output); - - float min_output_value; - float max_output_value; - QuantizationRangeForMultiplication<T1, T2, T3>( - min_input, max_input, min_filter, max_filter, &min_output_value, - &max_output_value); - - Tensor* output_min = nullptr; - OP_REQUIRES_OK(context, context->allocate_output(1, {}, &output_min)); - output_min->flat<float>()(0) = min_output_value; - - Tensor* output_max = nullptr; - OP_REQUIRES_OK(context, context->allocate_output(2, {}, &output_max)); - output_max->flat<float>()(0) = max_output_value; - } - - private: - std::vector<int32> strides_; - Padding padding_; -}; - -// Right now we only support taking two eight bit inputs, and returning the -// results as signed 32-bit integers. -REGISTER_KERNEL_BUILDER( - Name("QuantizedConv2D") - .Device(DEVICE_CPU) - .TypeConstraint<quint8>("Tinput") - .TypeConstraint<quint8>("Tfilter") - .TypeConstraint<qint32>("out_type"), - QuantizedConv2DOp<quint8, quint8, qint32, Im2ColConvFunctor>); - -} // namespace tensorflow |