diff options
author | Suharsh Sivakumar <suharshs@google.com> | 2018-06-01 18:30:32 -0700 |
---|---|---|
committer | TensorFlower Gardener <gardener@tensorflow.org> | 2018-06-01 18:33:35 -0700 |
commit | dbdd276a05c417963b3f06f71e801540bde9ab7c (patch) | |
tree | 7a3d8c875fb393026266603b293eb59869ea7268 | |
parent | d81328115bd10de70570c46dbfc683cd0238d779 (diff) |
Quantize weights transformation for toco.
Finds float weight tensors, quantizes them to 8 bits, and adds Dequantize operations after them.
PiperOrigin-RevId: 198955123
11 files changed, 319 insertions, 8 deletions
diff --git a/tensorflow/contrib/lite/toco/BUILD b/tensorflow/contrib/lite/toco/BUILD index b8acc9a8e0..7ea4f32ef6 100644 --- a/tensorflow/contrib/lite/toco/BUILD +++ b/tensorflow/contrib/lite/toco/BUILD @@ -245,6 +245,7 @@ cc_library( "graph_transformations/quantization_util.cc", "graph_transformations/quantization_util.h", "graph_transformations/quantize.cc", + "graph_transformations/quantize_weights.cc", "graph_transformations/read_fake_quant_min_max.cc", "graph_transformations/remove_final_dequantize_op.cc", "graph_transformations/remove_tensorflow_assert.cc", diff --git a/tensorflow/contrib/lite/toco/args.h b/tensorflow/contrib/lite/toco/args.h index 77bc54f191..9f5ca66d05 100644 --- a/tensorflow/contrib/lite/toco/args.h +++ b/tensorflow/contrib/lite/toco/args.h @@ -234,6 +234,7 @@ struct ParsedTocoFlags { Arg<bool> drop_fake_quant = Arg<bool>(false); Arg<bool> reorder_across_fake_quant = Arg<bool>(false); Arg<bool> allow_custom_ops = Arg<bool>(false); + Arg<bool> quantize_weights = Arg<bool>(false); // Deprecated flags Arg<string> input_type; Arg<string> input_types; diff --git a/tensorflow/contrib/lite/toco/g3doc/cmdline_reference.md b/tensorflow/contrib/lite/toco/g3doc/cmdline_reference.md index 9e99287f82..a8381169b8 100644 --- a/tensorflow/contrib/lite/toco/g3doc/cmdline_reference.md +++ b/tensorflow/contrib/lite/toco/g3doc/cmdline_reference.md @@ -203,6 +203,10 @@ have. graph transformations on them, at the cost of no longer faithfully matching inference and training arithmetic. +* `--quantize_weights`. Type: boolean. Default: false. Store weights as + quantized weights followed by dequantize operations. Computation is still + done in float, but reduces model size (at the cost of accuracy and latency). 
+ ## Logging flags The following are standard Google logging flags: diff --git a/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h b/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h index 8da242aa9c..1bc7557d46 100644 --- a/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h +++ b/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h @@ -139,6 +139,7 @@ DECLARE_GRAPH_TRANSFORMATION(PropagateFakeQuantNumBits); DECLARE_GRAPH_TRANSFORMATION(PropagateFixedSizes) DECLARE_GRAPH_TRANSFORMATION(HardcodeMinMax) DECLARE_GRAPH_TRANSFORMATION(Quantize) +DECLARE_GRAPH_TRANSFORMATION(QuantizeWeights) DECLARE_GRAPH_TRANSFORMATION(RemoveFinalDequantizeOp) DECLARE_GRAPH_TRANSFORMATION(RemoveTensorFlowAssert) DECLARE_GRAPH_TRANSFORMATION(RemoveTensorFlowIdentity) diff --git a/tensorflow/contrib/lite/toco/graph_transformations/quantize_weights.cc b/tensorflow/contrib/lite/toco/graph_transformations/quantize_weights.cc new file mode 100644 index 0000000000..88ea0945e7 --- /dev/null +++ b/tensorflow/contrib/lite/toco/graph_transformations/quantize_weights.cc @@ -0,0 +1,108 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include <iterator> +#include <string> +#include <vector> + +#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h" +#include "tensorflow/contrib/lite/toco/graph_transformations/quantization_util.h" +#include "tensorflow/contrib/lite/toco/model.h" +#include "tensorflow/contrib/lite/toco/tooling_util.h" + +namespace toco { + +namespace { + +// The minimum number of elements a weights array must have to be quantized +// by this transformation. +// TODO(suharshs): Make this minimum size configurable. +const int kWeightsMinSize = 1024; + +// Gets the quantization params from the float array. +void GetQuantizationParamsFromArray(const Array& array, + QuantizationParams* params) { + const std::vector<float>& float_vals = + array.GetBuffer<ArrayDataType::kFloat>().data; + auto minmax = std::minmax_element(float_vals.begin(), float_vals.end()); + MinMax toco_minmax; + toco_minmax.min = *minmax.first; + toco_minmax.max = *minmax.second; + GetQuantizationParams(ArrayDataType::kUint8, toco_minmax, params); +} + +} // namespace + +bool QuantizeWeights::Run(Model* model, std::size_t op_index) { + const auto op_it = model->operators.begin() + op_index; + Operator* op = op_it->get(); + + // Get the weights tensor, if the current operator has one. + int weights_index; + if (op->type == OperatorType::kConv || + op->type == OperatorType::kDepthwiseConv || + op->type == OperatorType::kFullyConnected) { + weights_index = 1; + } else if (op->type == OperatorType::kLstmCell) { + weights_index = LstmCellOperator::WEIGHTS_INPUT; + } else { + return false; + } + + // Return early if the array isn't a constant param, this can happen in early + // transformation passes until transpose operations following the weight array + // are resolved. 
+ const string weights = op->inputs[weights_index]; + if (!IsConstantParameterArray(*model, weights)) { + return false; + } + + // Return early if the weight tensor is not type float. + Array& weights_array = model->GetArray(weights); + if (weights_array.data_type != ArrayDataType::kFloat) { + return false; + } + + // Return early if the tensor is too small. Small tensors don't take up too + // much space and can result in bad quantization results. + if (weights_array.GetBuffer<ArrayDataType::kFloat>().data.size() < + kWeightsMinSize) { + return false; + } + + // Quantize the weight tensor to type kUint8. + QuantizationParams params; + GetQuantizationParamsFromArray(weights_array, &params); + QuantizeArray(this, model, weights, ArrayDataType::kUint8, params); + + // Insert a Dequantize operation after the quantized weights tensor. + auto* dequantize_op = new DequantizeOperator; + model->operators.emplace(op_it, dequantize_op); + + // Create a new intermediate tensor to connect the Dequantize op to the + // original op. + const string dequantized_output = + AvailableArrayName(*model, weights + "_dequantized"); + Array& dequantized_output_array = model->GetOrCreateArray(dequantized_output); + dequantized_output_array.data_type = ArrayDataType::kFloat; + + // Connect up the new Dequantize op with the weights and original op. 
+ op->inputs[weights_index] = dequantized_output; + dequantize_op->inputs = {weights}; + dequantize_op->outputs = {dequantized_output}; + + return true; +} + +} // namespace toco diff --git a/tensorflow/contrib/lite/toco/graph_transformations/tests/BUILD b/tensorflow/contrib/lite/toco/graph_transformations/tests/BUILD index 8dcd4adc90..95e8433be2 100644 --- a/tensorflow/contrib/lite/toco/graph_transformations/tests/BUILD +++ b/tensorflow/contrib/lite/toco/graph_transformations/tests/BUILD @@ -8,8 +8,8 @@ load( ) tf_cc_test( - name = "resolve_constant_concatenation_test", - srcs = ["resolve_constant_concatenation_test.cc"], + name = "lstm_utils_test", + srcs = ["lstm_utils_test.cc"], deps = [ "//tensorflow/contrib/lite/toco:graph_transformations", "//tensorflow/contrib/lite/toco:model", @@ -19,8 +19,20 @@ tf_cc_test( ) tf_cc_test( - name = "lstm_utils_test", - srcs = ["lstm_utils_test.cc"], + name = "quantize_weights_test", + srcs = ["quantize_weights_test.cc"], + deps = [ + "//tensorflow/contrib/lite/toco:graph_transformations", + "//tensorflow/contrib/lite/toco:model", + "//tensorflow/contrib/lite/toco:tooling_util", + "@com_google_absl//absl/memory", + "@com_google_googletest//:gtest_main", + ], +) + +tf_cc_test( + name = "resolve_constant_concatenation_test", + srcs = ["resolve_constant_concatenation_test.cc"], deps = [ "//tensorflow/contrib/lite/toco:graph_transformations", "//tensorflow/contrib/lite/toco:model", diff --git a/tensorflow/contrib/lite/toco/graph_transformations/tests/quantize_weights_test.cc b/tensorflow/contrib/lite/toco/graph_transformations/tests/quantize_weights_test.cc new file mode 100644 index 0000000000..c05eb0929f --- /dev/null +++ b/tensorflow/contrib/lite/toco/graph_transformations/tests/quantize_weights_test.cc @@ -0,0 +1,167 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include <math.h> +#include <string> +#include <vector> + +#include <gmock/gmock.h> +#include <gtest/gtest.h> +#include "absl/memory/memory.h" +#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h" +#include "tensorflow/contrib/lite/toco/model.h" +#include "tensorflow/contrib/lite/toco/tooling_util.h" + +namespace toco { + +class QuantizeWeightsTest : public ::testing::Test { + protected: + QuantizeWeightsTest() {} + + // The name of the weights input array. + const string kWeightsName = "weights"; + // The zero_point of the values in the input array. + const int kZeroPoint = 128; + + // Prepare a hypothetical TOCO model of a quantizable fully connected float + // layer. + void PrepareModel(Model* model, int elements_per_dim) { + std::vector<string> fc_input_names = {"inputs", kWeightsName}; + + const int kDim = 4; + const int buf_size = std::pow(elements_per_dim, static_cast<double>(kDim)); + auto in_buf = absl::make_unique<float[]>(buf_size); + // Initialize the array with values from -128.0 to 127.0, since these values + // should be exactly representable by quantization. + for (int i = 0; i < buf_size; i++) { + in_buf[i] = static_cast<float>(i % 256 - kZeroPoint); + } + + for (const string& fc_input_name : fc_input_names) { + Array& in_array = model->GetOrCreateArray(fc_input_name); + in_array.data_type = ArrayDataType::kFloat; + + // Initialize shape for the input array. 
+ Shape* in_array_shape = in_array.mutable_shape(); + std::vector<int>* in_array_shape_dim = in_array_shape->mutable_dims(); + in_array_shape_dim->resize(kDim, elements_per_dim); + auto& in_array_buffer = + in_array.GetMutableBuffer<ArrayDataType::kFloat>(); + in_array_buffer.data.resize(buf_size); + float* buf_ptr = + in_array.GetMutableBuffer<ArrayDataType::kFloat>().data.data(); + std::copy(in_buf.get(), in_buf.get() + buf_size, buf_ptr); + } + + auto* fc_op = new FullyConnectedOperator; + fc_op->inputs = fc_input_names; + fc_op->outputs = {"fc_op_outputs"}; + Array& out_array = model->GetOrCreateArray(fc_op->outputs[0]); + out_array.data_type = ArrayDataType::kFloat; + Shape* out_array_shape = out_array.mutable_shape(); + std::vector<int>* out_array_shape_dim = out_array_shape->mutable_dims(); + out_array_shape_dim->resize(kDim, elements_per_dim); + model->operators.push_back(std::unique_ptr<Operator>(fc_op)); + } +}; + +TEST_F(QuantizeWeightsTest, QuantizedFullyConnected) { + // Test that weight arrays that are large enough are quantized. + Model model; + // 6 elements per dim gives us 1296 elements, which is sufficient to be + // quantized. + PrepareModel(&model, 6); + + // Check the state of the graph before the transformation. + const auto& float_array_map = model.GetArrayMap(); + EXPECT_EQ(float_array_map.size(), 3); + // Before the transformation, all arrays should be type float. + for (const auto& element : float_array_map) { + EXPECT_EQ(element.second->data_type, ArrayDataType::kFloat); + } + const std::vector<float> float_weight_vals = + model.GetArray(kWeightsName).GetBuffer<ArrayDataType::kFloat>().data; + + // Invoke the transformation. + GraphTransformationsSet graph_transformation_set; + graph_transformation_set.Add(new toco::QuantizeWeights); + (*graph_transformation_set.begin())->Run(&model, /*op_index=*/0); + + // Check the state of the graph after the transformation. 
+ const auto& quantized_array_map = model.GetArrayMap(); + EXPECT_EQ(quantized_array_map.size(), 4); + // After the transformation, three arrays should be type float and one array + // should be uint8. + int num_float = 0; + int num_uint8 = 0; + for (const auto& element : quantized_array_map) { + if (element.second->data_type == ArrayDataType::kFloat) { + num_float++; + } else if (element.second->data_type == ArrayDataType::kUint8) { + num_uint8++; + } else { + FAIL() << "Unexpected array type."; + } + } + EXPECT_EQ(num_float, 3); + EXPECT_EQ(num_uint8, 1); + // Ensure that the values were quantized correctly. + const std::vector<uint8>& quantized_weight_vals = + model.GetArray(kWeightsName).GetBuffer<ArrayDataType::kUint8>().data; + for (int i = 0; i < quantized_weight_vals.size(); i++) { + EXPECT_EQ(quantized_weight_vals[i], float_weight_vals[i] + kZeroPoint); + } + + // Ensure that a Dequantize operator has been inserted before the + // FullyConnectedLayer. + EXPECT_EQ(model.operators[0]->type, OperatorType::kDequantize); +} + +TEST_F(QuantizeWeightsTest, NotQuantizedFullyConnected) { + // Test that weight arrays that are too small are left untouched. + Model model; + // 5 elements per dim gives us 625 elements, which is NOT sufficient to be + // quantized. + PrepareModel(&model, 5); + + // Check the state of the graph before the transformation. + const auto& float_array_map = model.GetArrayMap(); + EXPECT_EQ(float_array_map.size(), 3); + // Before the transformation, all arrays should be type float. + for (auto it = float_array_map.begin(); it != float_array_map.end(); it++) { + EXPECT_EQ(it->second->data_type, ArrayDataType::kFloat); + } + std::vector<float> float_weight_vals = + model.GetArray(kWeightsName).GetBuffer<ArrayDataType::kFloat>().data; + + // Invoke the transformation. 
+ GraphTransformationsSet graph_transformation_set; + graph_transformation_set.Add(new toco::QuantizeWeights); + (*graph_transformation_set.begin())->Run(&model, /*op_index=*/0); + + // Check the state of the graph after the transformation. + const auto& post_array_map = model.GetArrayMap(); + EXPECT_EQ(post_array_map.size(), 3); + for (auto it = post_array_map.begin(); it != post_array_map.end(); it++) { + EXPECT_EQ(it->second->data_type, ArrayDataType::kFloat); + } + // Ensure that the values remain unchanged. + std::vector<float> const& quantized_weight_vals = + model.GetArray(kWeightsName).GetBuffer<ArrayDataType::kFloat>().data; + for (int i = 0; i < quantized_weight_vals.size(); i++) { + EXPECT_EQ(quantized_weight_vals[i], float_weight_vals[i]); + } +} + +} // namespace toco diff --git a/tensorflow/contrib/lite/toco/graph_transformations/tests/resolve_constant_concatenation_test.cc b/tensorflow/contrib/lite/toco/graph_transformations/tests/resolve_constant_concatenation_test.cc index 3a1d175b98..66cfed4ac2 100644 --- a/tensorflow/contrib/lite/toco/graph_transformations/tests/resolve_constant_concatenation_test.cc +++ b/tensorflow/contrib/lite/toco/graph_transformations/tests/resolve_constant_concatenation_test.cc @@ -12,9 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include <memory> #include <string> -#include <unordered_map> #include <vector> #include <gmock/gmock.h> @@ -126,7 +124,7 @@ class ResolveConstantConcatenationTest : public ::testing::Test { Array& in_array = model->GetOrCreateArray(concat_input_name); in_array.data_type = ArrayDataType::kFloat; - // Initialize shape for the input array. + // Initialize shape for the input array. 
Shape* in_array_shape = in_array.mutable_shape(); std::vector<int>* in_array_shape_dim = in_array_shape->mutable_dims(); for (int i = 0; i < kDim; i++) { diff --git a/tensorflow/contrib/lite/toco/toco_cmdline_flags.cc b/tensorflow/contrib/lite/toco/toco_cmdline_flags.cc index 9c6ad673ab..87a1e429b9 100644 --- a/tensorflow/contrib/lite/toco/toco_cmdline_flags.cc +++ b/tensorflow/contrib/lite/toco/toco_cmdline_flags.cc @@ -158,6 +158,11 @@ bool ParseTocoFlagsFromCommandLineFlags( parsed_flags.split_tflite_lstm_inputs.default_value(), "Split the LSTM inputs from 5 tensors to 18 tensors for TFLite. " "Ignored if the output format is not TFLite."), + Flag("quantize_weights", parsed_flags.quantize_weights.bind(), + parsed_flags.quantize_weights.default_value(), + "Store weights as quantized weights followed by dequantize " + "operations. Computation is still done in float, but reduces model " + "size (at the cost of accuracy and latency)."), }; bool asked_for_help = *argc == 2 && (!strcmp(argv[1], "--help") || !strcmp(argv[1], "-help")); @@ -251,6 +256,7 @@ void ReadTocoFlagsFromCommandLineFlags(const ParsedTocoFlags& parsed_toco_flags, FlagRequirement::kNone); READ_TOCO_FLAG(dedupe_array_min_size_bytes, FlagRequirement::kNone); READ_TOCO_FLAG(split_tflite_lstm_inputs, FlagRequirement::kNone); + READ_TOCO_FLAG(quantize_weights, FlagRequirement::kNone); // Deprecated flag handling. 
if (parsed_toco_flags.input_type.specified()) { @@ -284,6 +290,11 @@ void ReadTocoFlagsFromCommandLineFlags(const ParsedTocoFlags& parsed_toco_flags, QCHECK(toco::IODataType_Parse(input_types[0], &input_type)); toco_flags->set_inference_input_type(input_type); } + if (parsed_toco_flags.quantize_weights.value()) { + QCHECK_NE(toco_flags->inference_type(), IODataType::QUANTIZED_UINT8) + << "quantize_weights is not supported with inference_type " + "QUANTIZED_UINT8."; + } #undef READ_TOCO_FLAG #undef PARSE_TOCO_FLAG diff --git a/tensorflow/contrib/lite/toco/toco_flags.proto b/tensorflow/contrib/lite/toco/toco_flags.proto index 15f755c104..4fe57879fb 100644 --- a/tensorflow/contrib/lite/toco/toco_flags.proto +++ b/tensorflow/contrib/lite/toco/toco_flags.proto @@ -37,7 +37,7 @@ enum FileFormat { // of as properties of models, instead describing how models are to be // processed in the context of the present tooling job. // -// Next ID to use: 20. +// Next ID to use: 21. message TocoFlags { // Input file format optional FileFormat input_format = 1; @@ -169,4 +169,9 @@ message TocoFlags { // Split the LSTM inputs from 5 tensors to 18 tensors for TFLite. // Ignored if the output format is not TFLite. optional bool split_tflite_lstm_inputs = 19 [default = true]; + + // Store weights as quantized weights followed by dequantize operations. + // Computation is still done in float, but reduces model size (at the cost of + // accuracy and latency). 
+ optional bool quantize_weights = 20 [default = false]; } diff --git a/tensorflow/contrib/lite/toco/toco_tooling.cc b/tensorflow/contrib/lite/toco/toco_tooling.cc index a648883d1f..1fe76f8163 100644 --- a/tensorflow/contrib/lite/toco/toco_tooling.cc +++ b/tensorflow/contrib/lite/toco/toco_tooling.cc @@ -269,6 +269,9 @@ void Transform(const TocoFlags& toco_flags, Model* model) { transformations.Add(new toco::MergeLstmCellInputs); } } + if (toco_flags.quantize_weights()) { + transformations.Add(new QuantizeWeights); + } transformations.Add(new ResolveConstantConcatenation); RunGraphTransformations(model, "general graph transformations", transformations); |