author     A. Unique TensorFlower <gardener@tensorflow.org>    2018-06-21 12:41:24 -0700
committer  TensorFlower Gardener <gardener@tensorflow.org>     2018-06-21 12:44:26 -0700
commit  846520326327d5eb8e1be13cad7d5526adf67db2 (patch)
tree    77f3f0a266306c8d8591a5ecf617d13a1eb94440
parent  fc4484c359cab66bd5bfdfaab936b1a5128850be (diff)
Graduate the 'experimental' fully connected weights shuffling feature (which enables faster runtime kernels): it is no longer experimental and no longer behind a flag, and the bool experimental_shuffled_weights field on FullyConnectedOperator is replaced with an enum.

PiperOrigin-RevId: 201569224
-rw-r--r--  tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h | 33
-rw-r--r--  tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h | 2
-rw-r--r--  tensorflow/contrib/lite/kernels/internal/types.h | 61
-rw-r--r--  tensorflow/contrib/lite/toco/BUILD | 2
-rw-r--r--  tensorflow/contrib/lite/toco/graph_transformations/ensure_uint8_weights_safe_for_fast_int8_kernels.cc | 2
-rw-r--r--  tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h | 2
-rw-r--r--  tensorflow/contrib/lite/toco/graph_transformations/identify_lstm.cc | 4
-rw-r--r--  tensorflow/contrib/lite/toco/graph_transformations/shuffle_fc_weights.cc (renamed from tensorflow/contrib/lite/toco/graph_transformations/experimental_shuffle_fc_weights.cc) | 6
-rw-r--r--  tensorflow/contrib/lite/toco/model.h | 3
-rw-r--r--  tensorflow/contrib/lite/toco/runtime/types.h | 1
10 files changed, 90 insertions, 26 deletions
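For orientation before the per-file hunks: the net effect on the toco model API, condensed into a short C++ sketch. This is assembled from the model.h and types.h hunks below and is not a complete copy of either file; the *Sketch struct name and the omission of the Operator base class are simplifications for illustration.

#include <cstdint>

// Condensed sketch of the API change (illustrative, not the real model.h).
enum class FullyConnectedWeightsFormat : std::uint8_t {
  kDefault,           // plain row-major 2D weights
  kShuffled4x16Int8,  // weights reordered into 4x16 blocks, stored as signed int8
};

struct FullyConnectedOperatorSketch {
  // Previously: bool experimental_shuffled_weights = false;
  FullyConnectedWeightsFormat weights_format =
      FullyConnectedWeightsFormat::kDefault;
};

// Call sites change from a boolean test to an enum comparison, for example:
//   if (fc_op->weights_format != FullyConnectedWeightsFormat::kDefault) return false;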
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
index 868269477e..6b5d35f21e 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
@@ -1292,11 +1292,11 @@ void FullyConnected(const uint8* input_data, const Dims<4>& input_dims,
}
// Internal function doing the actual arithmetic work for
-// ExperimentalShuffledFullyConnected.
+// ShuffledFullyConnected.
// May be called either directly by it (single-threaded case) or may be used
// as the 'task' for worker threads to run (multi-threaded case, see
-// ExperimentalShuffledFullyConnectedWorkerTask below).
-inline void ExperimentalShuffledFullyConnectedWorkerImpl(
+// ShuffledFullyConnectedWorkerTask below).
+inline void ShuffledFullyConnectedWorkerImpl(
const uint8* shuffled_input_workspace_data,
const int8* shuffled_weights_data, int batches, int output_depth,
int output_stride, int accum_depth, const int32* bias_data,
@@ -1570,14 +1570,16 @@ inline void ExperimentalShuffledFullyConnectedWorkerImpl(
#endif
}
-// Wraps ExperimentalShuffledFullyConnectedWorkerImpl into a Task class
+// Wraps ShuffledFullyConnectedWorkerImpl into a Task class
// to allow using gemmlowp's threadpool.
-struct ExperimentalShuffledFullyConnectedWorkerTask : gemmlowp::Task {
- ExperimentalShuffledFullyConnectedWorkerTask(
- const uint8* input_data, const int8* shuffled_weights_data, int batches,
- int output_depth, int output_stride, int accum_depth,
- const int32* bias_data, int32 output_multiplier, int output_shift,
- int16* output_data)
+struct ShuffledFullyConnectedWorkerTask : gemmlowp::Task {
+ ShuffledFullyConnectedWorkerTask(const uint8* input_data,
+ const int8* shuffled_weights_data,
+ int batches, int output_depth,
+ int output_stride, int accum_depth,
+ const int32* bias_data,
+ int32 output_multiplier, int output_shift,
+ int16* output_data)
: input_data_(input_data),
shuffled_weights_data_(shuffled_weights_data),
batches_(batches),
@@ -1590,7 +1592,7 @@ struct ExperimentalShuffledFullyConnectedWorkerTask : gemmlowp::Task {
output_data_(output_data) {}
void Run() override {
- ExperimentalShuffledFullyConnectedWorkerImpl(
+ ShuffledFullyConnectedWorkerImpl(
input_data_, shuffled_weights_data_, batches_, output_depth_,
output_stride_, accum_depth_, bias_data_, output_multiplier_,
output_shift_, output_data_);
@@ -1608,15 +1610,14 @@ struct ExperimentalShuffledFullyConnectedWorkerTask : gemmlowp::Task {
int16* output_data_;
};
-inline void ExperimentalShuffledFullyConnected(
+inline void ShuffledFullyConnected(
const uint8* input_data, const Dims<4>& input_dims,
const uint8* shuffled_weights_data, const Dims<4>& weights_dims,
const int32* bias_data, const Dims<4>& bias_dims, int32 output_multiplier,
int output_shift, int32 output_activation_min, int32 output_activation_max,
int16* output_data, const Dims<4>& output_dims,
uint8* shuffled_input_workspace_data, gemmlowp::GemmContext* gemm_context) {
- gemmlowp::ScopedProfilingLabel label(
- "ExperimentalShuffledFullyConnected/8bit");
+ gemmlowp::ScopedProfilingLabel label("ShuffledFullyConnected/8bit");
(void)gemm_context; // only used in optimized code.
TFLITE_DCHECK_EQ(output_activation_min, -32768);
TFLITE_DCHECK_EQ(output_activation_max, 32767);
@@ -1700,7 +1701,7 @@ inline void ExperimentalShuffledFullyConnected(
if (thread_count == 1) {
// Single-thread case: do the computation on the current thread, don't
// use a threadpool
- ExperimentalShuffledFullyConnectedWorkerImpl(
+ ShuffledFullyConnectedWorkerImpl(
shuffled_input_workspace_data, int8_shuffled_weights_data, batches,
output_depth, output_depth, accum_depth, bias_data, output_multiplier,
output_shift, output_data);
@@ -1715,7 +1716,7 @@ inline void ExperimentalShuffledFullyConnected(
int row_start = 0;
for (int i = 0; i < thread_count; i++) {
int row_end = std::min(output_depth, row_start + kRowsPerWorker);
- tasks[i] = new ExperimentalShuffledFullyConnectedWorkerTask(
+ tasks[i] = new ShuffledFullyConnectedWorkerTask(
shuffled_input_workspace_data,
int8_shuffled_weights_data + row_start * accum_depth, batches,
row_end - row_start, output_depth, accum_depth, bias_data + row_start,
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
index 89ec0eb266..7b8a56a524 100644
--- a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
@@ -697,7 +697,7 @@ inline void FullyConnected(const uint8* input_data, const Dims<4>& input_dims,
}
}
-inline void ExperimentalShuffledFullyConnected(
+inline void ShuffledFullyConnected(
const uint8* input_data, const Dims<4>& input_dims,
const uint8* shuffled_weights_data, const Dims<4>& weights_dims,
const int32* bias_data, const Dims<4>& bias_dims, int32 output_multiplier,
diff --git a/tensorflow/contrib/lite/kernels/internal/types.h b/tensorflow/contrib/lite/kernels/internal/types.h
index 707d2d261a..fa2420713f 100644
--- a/tensorflow/contrib/lite/kernels/internal/types.h
+++ b/tensorflow/contrib/lite/kernels/internal/types.h
@@ -25,6 +25,67 @@ namespace tflite {
enum class FusedActivationFunctionType : uint8 { kNone, kRelu6, kRelu1, kRelu };
enum class PaddingType { kNone, kSame, kValid };
+// This enumeration allows for non-default formats for the weights array
+// of a fully-connected operator, allowing the use of special optimized
+// runtime paths.
+enum class FullyConnectedWeightsFormat : uint8 {
+ // Default format (flat 2D layout, the inner contiguous dimension
+ // is input_depth, the outer non-contiguous dimension is output_depth)
+ kDefault,
+ // Summary: optimized layout for fast CPU runtime implementation,
+ // aimed specifically at ARM CPUs at the moment, and specialized for
+ // 8-bit quantized layers.
+ //
+ // The use case we're concerned with here is: 8-bit quantization,
+ // large weights matrix that doesn't fit in cache (e.g. 4096x2048 in
+ // a key application that drove this), very small batch size (e.g. 1 -- 4).
+ //
+ // Even with 8-bit quantization of weights, the performance of memory
+ // accesses to the weights can become the dominant issue when
+ // the batch size is small, so each weight value is used in only a few
+ // arithmetic ops, i.e. the fully-connected node has a low arithmetic
+ // intensity. The specific issues that arise are of three kinds:
+ // (1) One may, ideally, max out DRAM bandwidth, i.e. be truly memory
+ // bound. That's the "good" issue to run into.
+ // (2) One may run into sub-optimal pre-fetching: the data hasn't been
+ // prefetched into the cache by the time we need it.
+ // (3) One may run into cache aliasing: multiple values that are
+ // pre-fetched, alias each other in the L1 cache (which typically
+ // has only 4-way set associativity in ARM CPUs) and thus evict
+ // each other before we get to using them.
+ //
+ // The point of this shuffling is to avoid issues (2) and (3) so that
+ // we get as fast as possible given only the hard constraint (1).
+ // This is achieved by turning the difficulty into a solution: the
+ // difficulty, that each value loaded from memory is used only in
+ // one kernel iteration, making this operation memory-intensive, hints at
+ // the solution, of shuffling the weights so that they are stored in the
+ // exact order as the kernel needs to load them, so that the memory
+ // accesses made by the kernel are trivial. This solves (2) because the
+ // trivial memory access pattern allows the CPU's automatic prefetching
+ // to perform very well (no need even for preload instructions), and this
+ // solves (3) because the values being loaded concurrently are now
+ // contiguous in the address space, thus don't alias each other in the cache.
+ //
+ // On ARM, we typically want our kernel to process a 4x16 block of weights
+ // at a time, because:
+ // - 16 is the number of bytes in a NEON register.
+ // - 4 is how many rows we need to handle concurrently in the kernel in
+ // order to have sufficient mutual independence of instructions to
+ // maximize arithmetic throughput.
+ //
+ // Finally, the 'Int8' part in the name refers to the fact that this
+ // weights format has each weights value encoded as a signed int8 value,
+ // even if the data type of the weights buffer is uint8. This is intended
+ // to save runtime kernels the effort to have to XOR the top bit of these
+ // bytes before using them in signed arithmetic, see this file for more
+ // explanations on the 'signed int8 trick' in matrix multiplication kernels:
+ //
+ // tensorflow/contrib/lite/toco/graph_transformations/ensure_uint8_weights_safe_for_fast_int8_kernels.cc
+ //
+ kShuffled4x16Int8,
+};
+
// Quantization parameters, determining the mapping of quantized values
// to real values (i.e. determining how quantized values are mathematically
// interpreted).
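The comment above describes the storage layout; the point of that layout is the access pattern it allows at runtime. Below is a minimal scalar sketch, written from the description above and not taken from the TFLite kernels (the real ones are the NEON/gemmlowp paths renamed in optimized_ops.h earlier in this diff), of how a batch-of-1 kernel walks kShuffled4x16Int8 weights strictly sequentially; requantization and activation clamping are deliberately left out.

#include <cstdint>

// Illustrative sketch only: batch size 1, output_depth % 4 == 0,
// accum_depth % 16 == 0, weights already stored in the kShuffled4x16Int8
// order described above, input already converted to int8 (top bit flipped).
void ShuffledFullyConnectedSketch(const std::int8_t* shuffled_weights,
                                  const std::int8_t* input,  // accum_depth values
                                  const std::int32_t* bias,  // output_depth values
                                  int output_depth, int accum_depth,
                                  std::int32_t* raw_accumulators) {
  const std::int8_t* w = shuffled_weights;
  for (int c = 0; c < output_depth; c += 4) {
    std::int32_t accum[4] = {0, 0, 0, 0};
    for (int d = 0; d < accum_depth; d += 16) {
      // One 4x16 block: 4 output rows x 16 depth values stored contiguously,
      // so the weights pointer only ever advances sequentially.
      for (int i = 0; i < 4; ++i) {
        for (int j = 0; j < 16; ++j) {
          accum[i] += static_cast<std::int32_t>(*w++) *
                      static_cast<std::int32_t>(input[d + j]);
        }
      }
    }
    for (int i = 0; i < 4; ++i) {
      // A real kernel would now rescale to the 16-bit output format and clamp.
      raw_accumulators[c + i] = accum[i] + bias[c + i];
    }
  }
}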
diff --git a/tensorflow/contrib/lite/toco/BUILD b/tensorflow/contrib/lite/toco/BUILD
index dd05c484fa..be102faa4c 100644
--- a/tensorflow/contrib/lite/toco/BUILD
+++ b/tensorflow/contrib/lite/toco/BUILD
@@ -221,7 +221,6 @@ cc_library(
"graph_transformations/drop_im2col_arrays.cc",
"graph_transformations/ensure_bias_vectors.cc",
"graph_transformations/ensure_uint8_weights_safe_for_fast_int8_kernels.cc",
- "graph_transformations/experimental_shuffle_fc_weights.cc",
"graph_transformations/fuse_activation_functions.cc",
"graph_transformations/fuse_binary_into_following_affine.cc",
"graph_transformations/fuse_binary_into_preceding_affine.cc",
@@ -296,6 +295,7 @@ cc_library(
"graph_transformations/resolve_tensorflow_merge.cc",
"graph_transformations/resolve_tensorflow_switch.cc",
"graph_transformations/resolve_transpose_attributes.cc",
+ "graph_transformations/shuffle_fc_weights.cc",
"graph_transformations/unfuse_activation_functions.cc",
"graph_transformations/unpartition_embedding_lookup.cc",
"graph_transformations/unroll_batch_matmul.cc",
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/ensure_uint8_weights_safe_for_fast_int8_kernels.cc b/tensorflow/contrib/lite/toco/graph_transformations/ensure_uint8_weights_safe_for_fast_int8_kernels.cc
index 394fa349e2..75642bbc37 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/ensure_uint8_weights_safe_for_fast_int8_kernels.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/ensure_uint8_weights_safe_for_fast_int8_kernels.cc
@@ -122,7 +122,7 @@ bool EnsureUint8WeightsSafeForFastInt8Kernels::Run(Model* model,
case OperatorType::kFullyConnected: {
weights_index = 1;
const auto& fc_op = static_cast<const toco::FullyConnectedOperator&>(op);
- CHECK(!fc_op.experimental_shuffled_weights)
+ CHECK(fc_op.weights_format == FullyConnectedWeightsFormat::kDefault)
<< "This graph transformation expects to run before FC weights get "
"shuffled.";
break;
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h b/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h
index 62a09acdfb..4025fede6f 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h
+++ b/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h
@@ -192,7 +192,7 @@ DECLARE_GRAPH_TRANSFORMATION(ResolveConstantGather)
DECLARE_GRAPH_TRANSFORMATION(ResolveMultiplyByZero)
DECLARE_GRAPH_TRANSFORMATION(Dequantize)
DECLARE_GRAPH_TRANSFORMATION(UnpartitionEmbeddingLookup)
-DECLARE_GRAPH_TRANSFORMATION(ExperimentalShuffleFCWeights)
+DECLARE_GRAPH_TRANSFORMATION(ShuffleFCWeights)
class PropagateDefaultMinMax : public GraphTransformation {
public:
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/identify_lstm.cc b/tensorflow/contrib/lite/toco/graph_transformations/identify_lstm.cc
index 910e38a6ba..685353a846 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/identify_lstm.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/identify_lstm.cc
@@ -306,8 +306,8 @@ bool IdentifyLstmCell::Run(Model* model, std::size_t op_index) {
return false;
}
- if (static_cast<FullyConnectedOperator*>(fully_connected)
- ->experimental_shuffled_weights) {
+ if (static_cast<FullyConnectedOperator*>(fully_connected)->weights_format !=
+ FullyConnectedWeightsFormat::kDefault) {
// Not yet implemented: experimental shuffled weights in fused LSTM cell.
return false;
}
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/experimental_shuffle_fc_weights.cc b/tensorflow/contrib/lite/toco/graph_transformations/shuffle_fc_weights.cc
index c00cdcb944..22c258cec5 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/experimental_shuffle_fc_weights.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/shuffle_fc_weights.cc
@@ -24,14 +24,14 @@ limitations under the License.
namespace toco {
-bool ExperimentalShuffleFCWeights::Run(Model* model, std::size_t op_index) {
+bool ShuffleFCWeights::Run(Model* model, std::size_t op_index) {
Operator* op = model->operators[op_index].get();
if (op->type != OperatorType::kFullyConnected) {
return false;
}
FullyConnectedOperator* fc_op = static_cast<FullyConnectedOperator*>(op);
// Exit if this FC op already has shuffled weights
- if (fc_op->experimental_shuffled_weights) {
+ if (fc_op->weights_format != FullyConnectedWeightsFormat::kDefault) {
return false;
}
const Array& input_array = model->GetArray(fc_op->inputs[0]);
@@ -135,7 +135,7 @@ bool ExperimentalShuffleFCWeights::Run(Model* model, std::size_t op_index) {
CHECK_EQ(shuffled_data_ptr, shuffled_data.data() + rows * cols);
// Switch this FC op to using the shuffled weights.
weights_data = std::move(shuffled_data);
- fc_op->experimental_shuffled_weights = true;
+ fc_op->weights_format = FullyConnectedWeightsFormat::kShuffled4x16Int8;
AddMessageF("Applied experimental shuffling to the weights of %s",
LogName(*op));
// Add a second output array to this FC op, serving as a workspace to perform
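The hunk above shows the renamed transformation and the flag flip to kShuffled4x16Int8, but the shuffling loop itself falls between the two hunks and is not shown. As a rough illustration of what producing that layout involves (a sketch written from the format comment in types.h above, not copied from shuffle_fc_weights.cc): regroup the row-major weights into 4x16 blocks and flip each byte's top bit so the buffer can be reinterpreted as signed int8.

#include <cstdint>
#include <vector>

// Illustrative sketch only: reorder row-major uint8 weights of shape
// [rows, cols] into the kShuffled4x16Int8 layout, assuming rows % 4 == 0 and
// cols % 16 == 0; handling of other shapes is out of scope here. Flipping the
// top bit means that reinterpreting a byte as int8 yields (value - 128), so
// the zero-point offset of 128 is already folded in ("signed int8 trick").
std::vector<std::uint8_t> ShuffleFCWeightsSketch(const std::uint8_t* weights,
                                                 int rows, int cols) {
  std::vector<std::uint8_t> shuffled(static_cast<std::size_t>(rows) * cols);
  std::uint8_t* dst = shuffled.data();
  for (int r = 0; r < rows; r += 4) {      // 4 output rows per block
    for (int c = 0; c < cols; c += 16) {   // 16 depth values per block
      for (int i = 0; i < 4; ++i) {
        const std::uint8_t* src = weights + (r + i) * cols + c;
        for (int j = 0; j < 16; ++j) {
          *dst++ = src[j] ^ 0x80;          // flip sign bit: uint8 -> int8
        }
      }
    }
  }
  return shuffled;
}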
diff --git a/tensorflow/contrib/lite/toco/model.h b/tensorflow/contrib/lite/toco/model.h
index ef170b3884..89cb061499 100644
--- a/tensorflow/contrib/lite/toco/model.h
+++ b/tensorflow/contrib/lite/toco/model.h
@@ -433,7 +433,8 @@ struct SpaceToDepthOperator : Operator {
// input activations as a matrix, followed by a MatMul node.
struct FullyConnectedOperator : Operator {
FullyConnectedOperator() : Operator(OperatorType::kFullyConnected) {}
- bool experimental_shuffled_weights = false;
+ FullyConnectedWeightsFormat weights_format =
+ FullyConnectedWeightsFormat::kDefault;
};
// Dequantization operator, converting a quantized array of integers with
diff --git a/tensorflow/contrib/lite/toco/runtime/types.h b/tensorflow/contrib/lite/toco/runtime/types.h
index f5de5a5781..207f2c1706 100644
--- a/tensorflow/contrib/lite/toco/runtime/types.h
+++ b/tensorflow/contrib/lite/toco/runtime/types.h
@@ -24,6 +24,7 @@ namespace toco {
// TODO(ahentz): These are just stopgaps for now, untils we move all
// the code over to tflite.
using tflite::Dims;
+using tflite::FullyConnectedWeightsFormat;
using tflite::FusedActivationFunctionType;
using tflite::RequiredBufferSizeForDims;