diff options
-rw-r--r-- | tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h | 33
-rw-r--r-- | tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h | 2
-rw-r--r-- | tensorflow/contrib/lite/kernels/internal/types.h | 61
-rw-r--r-- | tensorflow/contrib/lite/toco/BUILD | 2
-rw-r--r-- | tensorflow/contrib/lite/toco/graph_transformations/ensure_uint8_weights_safe_for_fast_int8_kernels.cc | 2
-rw-r--r-- | tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h | 2
-rw-r--r-- | tensorflow/contrib/lite/toco/graph_transformations/identify_lstm.cc | 4
-rw-r--r-- | tensorflow/contrib/lite/toco/graph_transformations/shuffle_fc_weights.cc (renamed from tensorflow/contrib/lite/toco/graph_transformations/experimental_shuffle_fc_weights.cc) | 6
-rw-r--r-- | tensorflow/contrib/lite/toco/model.h | 3
-rw-r--r-- | tensorflow/contrib/lite/toco/runtime/types.h | 1
10 files changed, 90 insertions, 26 deletions
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h index 868269477e..6b5d35f21e 100644 --- a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h +++ b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h @@ -1292,11 +1292,11 @@ void FullyConnected(const uint8* input_data, const Dims<4>& input_dims, } // Internal function doing the actual arithmetic work for -// ExperimentalShuffledFullyConnected. +// ShuffledFullyConnected. // May be called either directly by it (single-threaded case) or may be used // as the 'task' for worker threads to run (multi-threaded case, see -// ExperimentalShuffledFullyConnectedWorkerTask below). -inline void ExperimentalShuffledFullyConnectedWorkerImpl( +// ShuffledFullyConnectedWorkerTask below). +inline void ShuffledFullyConnectedWorkerImpl( const uint8* shuffled_input_workspace_data, const int8* shuffled_weights_data, int batches, int output_depth, int output_stride, int accum_depth, const int32* bias_data, @@ -1570,14 +1570,16 @@ inline void ExperimentalShuffledFullyConnectedWorkerImpl( #endif } -// Wraps ExperimentalShuffledFullyConnectedWorkerImpl into a Task class +// Wraps ShuffledFullyConnectedWorkerImpl into a Task class // to allow using gemmlowp's threadpool. 
-struct ExperimentalShuffledFullyConnectedWorkerTask : gemmlowp::Task { - ExperimentalShuffledFullyConnectedWorkerTask( - const uint8* input_data, const int8* shuffled_weights_data, int batches, - int output_depth, int output_stride, int accum_depth, - const int32* bias_data, int32 output_multiplier, int output_shift, - int16* output_data) +struct ShuffledFullyConnectedWorkerTask : gemmlowp::Task { + ShuffledFullyConnectedWorkerTask(const uint8* input_data, + const int8* shuffled_weights_data, + int batches, int output_depth, + int output_stride, int accum_depth, + const int32* bias_data, + int32 output_multiplier, int output_shift, + int16* output_data) : input_data_(input_data), shuffled_weights_data_(shuffled_weights_data), batches_(batches), @@ -1590,7 +1592,7 @@ struct ExperimentalShuffledFullyConnectedWorkerTask : gemmlowp::Task { output_data_(output_data) {} void Run() override { - ExperimentalShuffledFullyConnectedWorkerImpl( + ShuffledFullyConnectedWorkerImpl( input_data_, shuffled_weights_data_, batches_, output_depth_, output_stride_, accum_depth_, bias_data_, output_multiplier_, output_shift_, output_data_); @@ -1608,15 +1610,14 @@ struct ExperimentalShuffledFullyConnectedWorkerTask : gemmlowp::Task { int16* output_data_; }; -inline void ExperimentalShuffledFullyConnected( +inline void ShuffledFullyConnected( const uint8* input_data, const Dims<4>& input_dims, const uint8* shuffled_weights_data, const Dims<4>& weights_dims, const int32* bias_data, const Dims<4>& bias_dims, int32 output_multiplier, int output_shift, int32 output_activation_min, int32 output_activation_max, int16* output_data, const Dims<4>& output_dims, uint8* shuffled_input_workspace_data, gemmlowp::GemmContext* gemm_context) { - gemmlowp::ScopedProfilingLabel label( - "ExperimentalShuffledFullyConnected/8bit"); + gemmlowp::ScopedProfilingLabel label("ShuffledFullyConnected/8bit"); (void)gemm_context; // only used in optimized code. 
TFLITE_DCHECK_EQ(output_activation_min, -32768); TFLITE_DCHECK_EQ(output_activation_max, 32767); @@ -1700,7 +1701,7 @@ inline void ExperimentalShuffledFullyConnected( if (thread_count == 1) { // Single-thread case: do the computation on the current thread, don't // use a threadpool - ExperimentalShuffledFullyConnectedWorkerImpl( + ShuffledFullyConnectedWorkerImpl( shuffled_input_workspace_data, int8_shuffled_weights_data, batches, output_depth, output_depth, accum_depth, bias_data, output_multiplier, output_shift, output_data); @@ -1715,7 +1716,7 @@ inline void ExperimentalShuffledFullyConnected( int row_start = 0; for (int i = 0; i < thread_count; i++) { int row_end = std::min(output_depth, row_start + kRowsPerWorker); - tasks[i] = new ExperimentalShuffledFullyConnectedWorkerTask( + tasks[i] = new ShuffledFullyConnectedWorkerTask( shuffled_input_workspace_data, int8_shuffled_weights_data + row_start * accum_depth, batches, row_end - row_start, output_depth, accum_depth, bias_data + row_start, diff --git a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h index 89ec0eb266..7b8a56a524 100644 --- a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h +++ b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h @@ -697,7 +697,7 @@ inline void FullyConnected(const uint8* input_data, const Dims<4>& input_dims, } } -inline void ExperimentalShuffledFullyConnected( +inline void ShuffledFullyConnected( const uint8* input_data, const Dims<4>& input_dims, const uint8* shuffled_weights_data, const Dims<4>& weights_dims, const int32* bias_data, const Dims<4>& bias_dims, int32 output_multiplier, diff --git a/tensorflow/contrib/lite/kernels/internal/types.h b/tensorflow/contrib/lite/kernels/internal/types.h index 707d2d261a..fa2420713f 100644 --- a/tensorflow/contrib/lite/kernels/internal/types.h +++ b/tensorflow/contrib/lite/kernels/internal/types.h @@ -25,6 +25,67 @@ 
namespace tflite { enum class FusedActivationFunctionType : uint8 { kNone, kRelu6, kRelu1, kRelu }; enum class PaddingType { kNone, kSame, kValid }; +// This enumeration allows for non-default formats for the weights array +// of a fully-connected operator, allowing the use of special optimized +// runtime paths. +enum class FullyConnectedWeightsFormat : uint8 { + // Default format (flat 2D layout, the inner contiguous dimension + // is input_depth, the outer non-contiguous dimension is output_depth) + kDefault, + // Summary: optimized layout for fast CPU runtime implementation, + // aimed specifically at ARM CPUs at the moment, and specialized for + // 8-bit quantized layers. + // + // The use case we're concerned with here is: 8-bit quantization, + // large weights matrix that doesn't fit in cache (e.g. 4096x2048 in + // a key application that drove this), very small batch size (e.g. 1 -- 4). + // + // Even with 8-bit quantization of weights, the performance of memory + // accesses to the weights can become the dominant issue when + // the batch size is small, so each weight value is used in only a few + // arithmetic ops, i.e. the fully-connected node has a low arithmetic + // intensity. The specific issues that arise are of three kinds: + // (1) One may, ideally, max out DRAM bandwidth, i.e. be truly memory + // bound. That's the "good" issue to run into. + // (2) One may run into sub-optimal pre-fetching: the data hasn't been + // prefetched into the cache by the time we need it. + // (3) One may run into cache aliasing: multiple values that are + // pre-fetched, alias each other in the L1 cache (which typically + // has only 4-way set associativity in ARM CPUs) and thus evict + // each other before we get to using them. + // + // The point of this shuffling is to avoid issues (2) and (3) so that + // we get as fast as possible given only the hard constraint (1). 
+ // This is achieved by turning the difficulty into a solution: the + // difficulty, that each value loaded from memory is used only in + // one kernel iteration, making this operation memory-intensive, hints at + // the solution, of shuffling the weights so that they are stored in the + // exact order as the kernel needs to load them, so that the memory + // accesses made by the kernel are trivial. This solves (2) because the + // trivial memory access pattern allows the CPU's automatic prefetching + // to perform very well (no need even for preload instructions), and this + // solves (3) because the values being loaded concurrently are now + // contiguous in the address space, thus don't alias each other in the cache. + // + // On ARM, we typically want our kernel to process a 4x16 block of weights + // at a time, because: + // - 16 is the number of bytes in a NEON register. + // - 4 is how many rows we need to handle concurrently in the kernel in + // order to have sufficient mutual independence of instructions to + // maximize arithmetic throughput. + // + // Finally, the 'Int8' part in the name refers to the fact that this + // weights format has each weights value encoded as a signed int8 value, + // even if the data type of the weights buffer is uint8. This is intended + // to save runtime kernels the effort to have to XOR the top bit of these + // bytes before using them in signed arithmetic, see this file for more + // explanations on the 'signed int8 trick' in matrix multiplication kernels: + // + // tensorflow/contrib/lite/toco/graph_transformations/ensure_uint8_weights_safe_for_fast_int8_kernels.cc + // + kShuffled4x16Int8, +}; + // Quantization parameters, determining the mapping of quantized values // to real values (i.e. determining how quantized values are mathematically // interpreted). 
diff --git a/tensorflow/contrib/lite/toco/BUILD b/tensorflow/contrib/lite/toco/BUILD index dd05c484fa..be102faa4c 100644 --- a/tensorflow/contrib/lite/toco/BUILD +++ b/tensorflow/contrib/lite/toco/BUILD @@ -221,7 +221,6 @@ cc_library( "graph_transformations/drop_im2col_arrays.cc", "graph_transformations/ensure_bias_vectors.cc", "graph_transformations/ensure_uint8_weights_safe_for_fast_int8_kernels.cc", - "graph_transformations/experimental_shuffle_fc_weights.cc", "graph_transformations/fuse_activation_functions.cc", "graph_transformations/fuse_binary_into_following_affine.cc", "graph_transformations/fuse_binary_into_preceding_affine.cc", @@ -296,6 +295,7 @@ cc_library( "graph_transformations/resolve_tensorflow_merge.cc", "graph_transformations/resolve_tensorflow_switch.cc", "graph_transformations/resolve_transpose_attributes.cc", + "graph_transformations/shuffle_fc_weights.cc", "graph_transformations/unfuse_activation_functions.cc", "graph_transformations/unpartition_embedding_lookup.cc", "graph_transformations/unroll_batch_matmul.cc", diff --git a/tensorflow/contrib/lite/toco/graph_transformations/ensure_uint8_weights_safe_for_fast_int8_kernels.cc b/tensorflow/contrib/lite/toco/graph_transformations/ensure_uint8_weights_safe_for_fast_int8_kernels.cc index 394fa349e2..75642bbc37 100644 --- a/tensorflow/contrib/lite/toco/graph_transformations/ensure_uint8_weights_safe_for_fast_int8_kernels.cc +++ b/tensorflow/contrib/lite/toco/graph_transformations/ensure_uint8_weights_safe_for_fast_int8_kernels.cc @@ -122,7 +122,7 @@ bool EnsureUint8WeightsSafeForFastInt8Kernels::Run(Model* model, case OperatorType::kFullyConnected: { weights_index = 1; const auto& fc_op = static_cast<const toco::FullyConnectedOperator&>(op); - CHECK(!fc_op.experimental_shuffled_weights) + CHECK(fc_op.weights_format == FullyConnectedWeightsFormat::kDefault) << "This graph transformation expects to run before FC weights get " "shuffled."; break; diff --git 
a/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h b/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h index 62a09acdfb..4025fede6f 100644 --- a/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h +++ b/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h @@ -192,7 +192,7 @@ DECLARE_GRAPH_TRANSFORMATION(ResolveConstantGather) DECLARE_GRAPH_TRANSFORMATION(ResolveMultiplyByZero) DECLARE_GRAPH_TRANSFORMATION(Dequantize) DECLARE_GRAPH_TRANSFORMATION(UnpartitionEmbeddingLookup) -DECLARE_GRAPH_TRANSFORMATION(ExperimentalShuffleFCWeights) +DECLARE_GRAPH_TRANSFORMATION(ShuffleFCWeights) class PropagateDefaultMinMax : public GraphTransformation { public: diff --git a/tensorflow/contrib/lite/toco/graph_transformations/identify_lstm.cc b/tensorflow/contrib/lite/toco/graph_transformations/identify_lstm.cc index 910e38a6ba..685353a846 100644 --- a/tensorflow/contrib/lite/toco/graph_transformations/identify_lstm.cc +++ b/tensorflow/contrib/lite/toco/graph_transformations/identify_lstm.cc @@ -306,8 +306,8 @@ bool IdentifyLstmCell::Run(Model* model, std::size_t op_index) { return false; } - if (static_cast<FullyConnectedOperator*>(fully_connected) - ->experimental_shuffled_weights) { + if (static_cast<FullyConnectedOperator*>(fully_connected)->weights_format != + FullyConnectedWeightsFormat::kDefault) { // Not yet implemented: experimental shuffled weights in fused LSTM cell. return false; } diff --git a/tensorflow/contrib/lite/toco/graph_transformations/experimental_shuffle_fc_weights.cc b/tensorflow/contrib/lite/toco/graph_transformations/shuffle_fc_weights.cc index c00cdcb944..22c258cec5 100644 --- a/tensorflow/contrib/lite/toco/graph_transformations/experimental_shuffle_fc_weights.cc +++ b/tensorflow/contrib/lite/toco/graph_transformations/shuffle_fc_weights.cc @@ -24,14 +24,14 @@ limitations under the License. 
namespace toco { -bool ExperimentalShuffleFCWeights::Run(Model* model, std::size_t op_index) { +bool ShuffleFCWeights::Run(Model* model, std::size_t op_index) { Operator* op = model->operators[op_index].get(); if (op->type != OperatorType::kFullyConnected) { return false; } FullyConnectedOperator* fc_op = static_cast<FullyConnectedOperator*>(op); // Exit if this FC op already has shuffled weights - if (fc_op->experimental_shuffled_weights) { + if (fc_op->weights_format != FullyConnectedWeightsFormat::kDefault) { return false; } const Array& input_array = model->GetArray(fc_op->inputs[0]); @@ -135,7 +135,7 @@ bool ExperimentalShuffleFCWeights::Run(Model* model, std::size_t op_index) { CHECK_EQ(shuffled_data_ptr, shuffled_data.data() + rows * cols); // Switch this FC op to using the shuffled weights. weights_data = std::move(shuffled_data); - fc_op->experimental_shuffled_weights = true; + fc_op->weights_format = FullyConnectedWeightsFormat::kShuffled4x16Int8; AddMessageF("Applied experimental shuffling to the weights of %s", LogName(*op)); // Add a second output array to this FC op, serving as a workspace to perform diff --git a/tensorflow/contrib/lite/toco/model.h b/tensorflow/contrib/lite/toco/model.h index ef170b3884..89cb061499 100644 --- a/tensorflow/contrib/lite/toco/model.h +++ b/tensorflow/contrib/lite/toco/model.h @@ -433,7 +433,8 @@ struct SpaceToDepthOperator : Operator { // input activations as a matrix, followed by a MatMul node. 
struct FullyConnectedOperator : Operator { FullyConnectedOperator() : Operator(OperatorType::kFullyConnected) {} - bool experimental_shuffled_weights = false; + FullyConnectedWeightsFormat weights_format = + FullyConnectedWeightsFormat::kDefault; }; // Dequantization operator, converting a quantized array of integers with diff --git a/tensorflow/contrib/lite/toco/runtime/types.h b/tensorflow/contrib/lite/toco/runtime/types.h index f5de5a5781..207f2c1706 100644 --- a/tensorflow/contrib/lite/toco/runtime/types.h +++ b/tensorflow/contrib/lite/toco/runtime/types.h @@ -24,6 +24,7 @@ namespace toco { // TODO(ahentz): These are just stopgaps for now, untils we move all // the code over to tflite. using tflite::Dims; +using tflite::FullyConnectedWeightsFormat; using tflite::FusedActivationFunctionType; using tflite::RequiredBufferSizeForDims; |