author    Yu-Cheng Ling <ycling@google.com>  2018-02-02 13:49:35 -0800
committer TensorFlower Gardener <gardener@tensorflow.org>  2018-02-02 13:53:43 -0800
commit    1a92f45677ee66af24f2219c6b1cbaeee87056b7 (patch)
tree      1d983d8f089bda2676d3dd80e78725e860c1212f /tensorflow/contrib
parent    cf330efa207d635efecfb1b703ee65ccfbc3f98c (diff)
TFLite: Conv CBLAS kernel
PiperOrigin-RevId: 184328848
Diffstat (limited to 'tensorflow/contrib')
-rw-r--r--  tensorflow/contrib/lite/kernels/conv.cc                              | 32
-rw-r--r--  tensorflow/contrib/lite/kernels/conv_test.cc                         |  2
-rw-r--r--  tensorflow/contrib/lite/kernels/internal/BUILD                       |  2
-rw-r--r--  tensorflow/contrib/lite/kernels/internal/optimized/cblas_conv.h      | 92
-rw-r--r--  tensorflow/contrib/lite/kernels/internal/optimized/cblas_reference.h | 66
5 files changed, 193 insertions(+), 1 deletion(-)
diff --git a/tensorflow/contrib/lite/kernels/conv.cc b/tensorflow/contrib/lite/kernels/conv.cc
index a5095e1e64..7a45647434 100644
--- a/tensorflow/contrib/lite/kernels/conv.cc
+++ b/tensorflow/contrib/lite/kernels/conv.cc
@@ -24,6 +24,7 @@ limitations under the License.
#include "tensorflow/contrib/lite/builtin_op_data.h"
#include "tensorflow/contrib/lite/context.h"
#include "tensorflow/contrib/lite/kernels/gemm_support.h"
+#include "tensorflow/contrib/lite/kernels/internal/optimized/cblas_conv.h"
#include "tensorflow/contrib/lite/kernels/internal/optimized/multithreaded_conv.h"
#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
#include "tensorflow/contrib/lite/kernels/internal/quantization_util.h"
@@ -38,11 +39,16 @@ namespace ops {
namespace builtin {
namespace conv {
-// This file has three implementation of Conv.
+// This file has four implementations of Conv.
enum KernelType {
kReference,
kGenericOptimized, // Neon-free
kMultithreadOptimized,
+ // This kernel uses the CBLAS interface for matrix multiplication.
+ // It's fast when an optimized CBLAS implementation is available (e.g. the
+ // Apple Accelerate Framework), and slow when falling back to the naive
+ // reference implementation.
+ kCblasOptimized,
};
struct OpData {
@@ -305,6 +311,7 @@ void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
break;
case kGenericOptimized:
case kMultithreadOptimized:
+ case kCblasOptimized:
// There is only one optimized implementation for Quantized Conv.
optimized_ops::Conv(
GetTensorData<uint8_t>(input), GetTensorDims(input), input_offset,
@@ -369,6 +376,17 @@ void EvalFloat(TfLiteContext* context, TfLiteNode* node,
GetTensorData<float>(im2col), GetTensorDims(im2col));
break;
}
+ case kCblasOptimized: {
+ cblas_ops::Conv(GetTensorData<float>(input), GetTensorDims(input),
+ GetTensorData<float>(filter), GetTensorDims(filter),
+ GetTensorData<float>(bias), GetTensorDims(bias),
+ params->stride_width, params->stride_height,
+ data->padding.width, data->padding.height,
+ output_activation_min, output_activation_max,
+ GetTensorData<float>(output), GetTensorDims(output),
+ GetTensorData<float>(im2col), GetTensorDims(im2col));
+ break;
+ }
}
}
@@ -435,8 +453,20 @@ TfLiteRegistration* Register_CONVOLUTION_MULTITHREADED_OPT() {
return &r;
}
+TfLiteRegistration* Register_CONVOLUTION_CBLAS_OPT() {
+ static TfLiteRegistration r = {conv::Init, conv::Free, conv::Prepare,
+ conv::Eval<conv::kCblasOptimized>};
+ return &r;
+}
+
TfLiteRegistration* Register_CONV_2D() {
+// TODO(ycling): Define a compilation flag and use the CBLAS kernel when a
+// fast CBLAS implementation is available.
+#ifdef TFLITE_USE_CBLAS_CONVOLUTION_KERNEL
+ return Register_CONVOLUTION_CBLAS_OPT();
+#else
return Register_CONVOLUTION_MULTITHREADED_OPT();
+#endif
}
} // namespace builtin
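
Until that compilation flag exists, the CBLAS kernel can still be selected explicitly through its registration function, the same way conv_test.cc wires it up below. A minimal sketch, not part of this commit: the helper name UseCblasConv is hypothetical, and it assumes MutableOpResolver is reachable through the contrib-era model.h include path.

    #include "tensorflow/contrib/lite/context.h"
    #include "tensorflow/contrib/lite/model.h"  // For tflite::MutableOpResolver.

    namespace tflite {
    namespace ops {
    namespace builtin {
    // Declared in conv.cc (see the hunk above).
    TfLiteRegistration* Register_CONVOLUTION_CBLAS_OPT();
    }  // namespace builtin
    }  // namespace ops
    }  // namespace tflite

    // Hypothetical helper: route the builtin CONV_2D op to the CBLAS kernel.
    void UseCblasConv(tflite::MutableOpResolver* resolver) {
      resolver->AddBuiltin(tflite::BuiltinOperator_CONV_2D,
                           tflite::ops::builtin::Register_CONVOLUTION_CBLAS_OPT());
    }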
diff --git a/tensorflow/contrib/lite/kernels/conv_test.cc b/tensorflow/contrib/lite/kernels/conv_test.cc
index 7550f7cc0d..d2393c3c97 100644
--- a/tensorflow/contrib/lite/kernels/conv_test.cc
+++ b/tensorflow/contrib/lite/kernels/conv_test.cc
@@ -29,6 +29,7 @@ namespace builtin {
TfLiteRegistration* Register_CONVOLUTION_REF();
TfLiteRegistration* Register_CONVOLUTION_GENERIC_OPT();
TfLiteRegistration* Register_CONVOLUTION_MULTITHREADED_OPT();
+TfLiteRegistration* Register_CONVOLUTION_CBLAS_OPT();
} // namespace builtin
} // namespace ops
@@ -105,6 +106,7 @@ const auto kKernelMap = new std::map<string, TfLiteRegistration*>({
{"GenericOptimized", ops::builtin::Register_CONVOLUTION_GENERIC_OPT()},
{"MultithreadedOptimized",
ops::builtin::Register_CONVOLUTION_MULTITHREADED_OPT()},
+ {"CblasOptimized", ops::builtin::Register_CONVOLUTION_CBLAS_OPT()},
});
class ConvolutionOpTest : public SingleOpTest {
diff --git a/tensorflow/contrib/lite/kernels/internal/BUILD b/tensorflow/contrib/lite/kernels/internal/BUILD
index de635cfb4a..288f1f8bbc 100644
--- a/tensorflow/contrib/lite/kernels/internal/BUILD
+++ b/tensorflow/contrib/lite/kernels/internal/BUILD
@@ -170,6 +170,8 @@ cc_library(
cc_library(
name = "optimized",
hdrs = [
+ "optimized/cblas_conv.h",
+ "optimized/cblas_reference.h",
"optimized/eigen_spatial_convolutions.h",
"optimized/eigen_tensor_reduced_instantiations_oss.h",
"optimized/multithreaded_conv.h",
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/cblas_conv.h b/tensorflow/contrib/lite/kernels/internal/optimized/cblas_conv.h
new file mode 100644
index 0000000000..a9b6663122
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/cblas_conv.h
@@ -0,0 +1,92 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_CBLAS_CONV_H_
+#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_CBLAS_CONV_H_
+
+// A Conv implementation based on the CBLAS interface. This is only used on
+// iOS for now, utilizing Apple's Accelerate framework.
+
+#if defined(__APPLE__)
+#include <Accelerate/Accelerate.h>
+#else
+#include "tensorflow/contrib/lite/kernels/internal/optimized/cblas_reference.h"
+#endif // __APPLE__
+
+#include "tensorflow/contrib/lite/kernels/internal/optimized/multithreaded_conv.h"
+#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
+
+namespace tflite {
+namespace cblas_ops {
+
+inline void Conv(const float* input_data, const Dims<4>& input_dims,
+ const float* filter_data, const Dims<4>& filter_dims,
+ const float* bias_data, const Dims<4>& bias_dims,
+ int stride_width, int stride_height, int pad_width,
+ int pad_height, float output_activation_min,
+ float output_activation_max, float* output_data,
+ const Dims<4>& output_dims, float* im2col_data,
+ const Dims<4>& im2col_dims) {
+ gemmlowp::ScopedProfilingLabel label("Conv/cblas");
+
+ const float* gemm_input_data = nullptr;
+ const Dims<4>* gemm_input_dims = nullptr;
+ const int filter_width = ArraySize(filter_dims, 1);
+ const int filter_height = ArraySize(filter_dims, 2);
+ const bool need_im2col = stride_width != 1 || stride_height != 1 ||
+ filter_width != 1 || filter_height != 1;
+ if (need_im2col) {
+ TFLITE_DCHECK(im2col_data);
+ optimized_ops::Im2col(input_data, input_dims, stride_width, stride_height,
+ pad_width, pad_height, filter_height, filter_width, 0,
+ im2col_data, im2col_dims);
+ gemm_input_data = im2col_data;
+ gemm_input_dims = &im2col_dims;
+ } else {
+ TFLITE_DCHECK(!im2col_data);
+ gemm_input_data = input_data;
+ gemm_input_dims = &input_dims;
+ }
+
+ // The following code computes the matrix multiplication c = a * transpose(b)
+ // with CBLAS, where:
+ // * `a` is a matrix with dimensions (m, k).
+ // * `b` is a matrix with dimensions (n, k), so transpose(b) is (k, n).
+ // * `c` is a matrix with dimensions (m, n).
+ // The naming of variables is aligned with the CBLAS specification here.
+ const float* a = gemm_input_data;
+ const float* b = filter_data;
+ float* c = output_data;
+ int m = gemm_input_dims->sizes[1] * gemm_input_dims->sizes[2] *
+ gemm_input_dims->sizes[3];
+ int n = output_dims.sizes[0];
+ int k = gemm_input_dims->sizes[0];
+ // The strides of matrices a, b, and c, respectively.
+ int stride_a = k;
+ int stride_b = k;
+ int stride_c = n;
+
+ cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, m, n, k, 1.0f, a,
+ stride_a, b, stride_b, 0.0f, c, stride_c);
+
+ optimized_ops::AddBiasAndEvalActivationFunction(
+ bias_data, bias_dims, output_data, output_dims, output_activation_min,
+ output_activation_max);
+}
+
+} // namespace cblas_ops
+} // namespace tflite
+
+#endif // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_CBLAS_CONV_H_
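
To make the m/n/k mapping above concrete, here is a worked example with hypothetical layer shapes (not from this commit). It relies on the Dims<4> convention used by the sizes[] indexing above, where sizes = {depth, width, height, batch}:

    // Hypothetical conv layer: 1x8x8x16 input (batch, height, width, channels),
    // 3x3 filter, 32 output channels, stride 1, "same" padding.
    //   im2col output: sizes = {3 * 3 * 16, 8, 8, 1} = {144, 8, 8, 1}
    //   conv output:   sizes = {32, 8, 8, 1}
    // which yields:
    //   m = 8 * 8 * 1 = 64   // one GEMM row per output spatial position
    //   k = 144              // one GEMM column per filter tap
    //   n = 32               // one output column per filter
    // so a single cblas_sgemm call computes the whole layer as a 64x144 times
    // 144x32 product (the filter is stored as 32x144 and passed with CblasTrans).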
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/cblas_reference.h b/tensorflow/contrib/lite/kernels/internal/optimized/cblas_reference.h
new file mode 100644
index 0000000000..6578915743
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/cblas_reference.h
@@ -0,0 +1,66 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_CBLAS_REFERENCE_H_
+#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_CBLAS_REFERENCE_H_
+
+#include "tensorflow/contrib/lite/kernels/internal/compatibility.h"
+
+// A reference implementation for a small subset of the CBLAS interface.
+// This is only used for testing the CBLAS-based Conv kernel, and should never
+// be used in production code.
+
+namespace tflite {
+namespace cblas_ops {
+
+// The following code follows the original CBLAS specification, and it might
+// conflict with the TensorFlow naming convention.
+// TODO(ycling): Find another way to test CBLAS with bazel, without writing
+// a reference implementation by ourselves.
+enum CBLAS_ORDER { CblasRowMajor = 0, CblasColMajor = 1 };
+
+enum CBLAS_TRANSPOSE { CblasNoTrans = 0, CblasTrans = 1, CblasConjTrans = 2 };
+
+// A reference implementation for matrix multiplication.
+// The following code computes the matrix multiplication c = a * transpose(b),
+// where:
+// * `a` is a matrix with dimensions (m, k).
+// * `b` is a matrix with dimensions (n, k), so transpose(b) is (k, n).
+// * `c` is a matrix with dimensions (m, n).
+// The naming of variables is aligned with the CBLAS specification here.
+void cblas_sgemm(const enum CBLAS_ORDER order,
+ const enum CBLAS_TRANSPOSE trans_a,
+ const enum CBLAS_TRANSPOSE trans_b, const int m, const int n,
+ const int k, const float alpha, const float *a,
+ const int stride_a, const float *b, const int stride_b,
+ const float beta, float *c, const int stride_c) {
+ TFLITE_DCHECK(order == CblasRowMajor);
+ TFLITE_DCHECK(trans_a == CblasNoTrans);
+ TFLITE_DCHECK(trans_b == CblasTrans);
+ for (int row = 0; row < m; ++row) {
+ for (int col = 0; col < n; ++col) {
+ float value = beta * c[stride_c * row + col];
+ for (int idx = 0; idx < k; ++idx) {
+ value += alpha * a[stride_a * row + idx] * b[stride_b * col + idx];
+ }
+ c[stride_c * row + col] = value;
+ }
+ }
+}
+
+} // namespace cblas_ops
+} // namespace tflite
+
+#endif // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_CBLAS_REFERENCE_H_
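
A quick way to sanity-check the reference sgemm above is a tiny standalone program. A minimal sketch (hypothetical test code, not part of this commit), assuming the header path from this tree; the matrix values are arbitrary test data:

    #include <cstdio>

    #include "tensorflow/contrib/lite/kernels/internal/optimized/cblas_reference.h"

    int main() {
      const int m = 2, n = 2, k = 3;
      // a is 2x3; b is 2x3, so transpose(b) is 3x2; c holds the 2x2 product.
      const float a[] = {1, 2, 3, 4, 5, 6};
      const float b[] = {1, 0, 1, 0, 1, 0};
      float c[] = {0, 0, 0, 0};  // beta == 0, but keep c initialized anyway.
      tflite::cblas_ops::cblas_sgemm(
          tflite::cblas_ops::CblasRowMajor, tflite::cblas_ops::CblasNoTrans,
          tflite::cblas_ops::CblasTrans, m, n, k, /*alpha=*/1.0f, a,
          /*stride_a=*/k, b, /*stride_b=*/k, /*beta=*/0.0f, c, /*stride_c=*/n);
      // Expected output: 4.0 2.0 10.0 5.0   (i.e. c = [[4, 2], [10, 5]]).
      for (int i = 0; i < m * n; ++i) printf("%.1f ", c[i]);
      printf("\n");
      return 0;
    }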