author     TensorFlower Gardener <gardener@tensorflow.org>    2018-07-13 09:35:53 -0700
committer  TensorFlower Gardener <gardener@tensorflow.org>    2018-07-13 09:36:16 -0700
commit     2a0f8f140acde8262624de847ee06db77c48a99b (patch)
tree       426427345b9b965585a6a969fde7bc1dd8651ed7
parent     23e26ec2a7521c924b2a9fc435c04761f97fe6ba (diff)
parent     2de343c329ff252ed0cb419f29c3ce3765b3da84 (diff)
Merge pull request #20350 from jjsjann123:trt4_pr
PiperOrigin-RevId: 204483181
-rw-r--r--  tensorflow/contrib/tensorrt/convert/convert_graph.cc     61
-rw-r--r--  tensorflow/contrib/tensorrt/convert/convert_nodes.cc   1354
-rw-r--r--  tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc     11
-rw-r--r--  tensorflow/contrib/tensorrt/ops/trt_engine_op.cc         10
-rw-r--r--  tensorflow/contrib/tensorrt/shape_fn/trt_shfn.cc         62
5 files changed, 1009 insertions, 489 deletions
diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph.cc b/tensorflow/contrib/tensorrt/convert/convert_graph.cc index 5bfc7f9109..6ebc30ca82 100644 --- a/tensorflow/contrib/tensorrt/convert/convert_graph.cc +++ b/tensorflow/contrib/tensorrt/convert/convert_graph.cc @@ -86,27 +86,48 @@ bool IsTensorRTCandidate(const tensorflow::Node* node) { // TODO(jie): Segmentation shouldn't associated with op name. // Split it into a registration for each kernel. static const std::set<string> candidate_ops = { - "Identity", - "Snapshot", - "Const", - "Conv2D", - "MaxPool", - "BiasAdd", - "Relu", - "Add", - "Mul", - "Sub", - "Rsqrt", - "Pad", - "Mean", - "AvgPool", - "ConcatV2", - "DepthwiseConv2dNative", - "FusedBatchNorm", - "FusedBatchNormV2", - // TODO(ben,jie): ... + "Identity", + "Snapshot", + "Const", + "Conv2D", + "MaxPool", + "BiasAdd", + "Relu", + "Add", + "Mul", + "Sub", + "Rsqrt", + "Pad", + "Mean", + "AvgPool", + "ConcatV2", + "DepthwiseConv2dNative", + "FusedBatchNorm", + "FusedBatchNormV2", + "Div", + "RealDiv", + "Rsqrt", + "Reciprocal", + "Exp", + "Log", + "Sqrt", + "Abs", + "Neg", +#if NV_TENSORRT_MAJOR > 3 + "MatMul", + "BatchMatMul", + "Softmax", + "Minimum", + "Maximum", + "TopKV2", + "Sum", + "Prod", + "Max", + "Min", +#endif + // TODO(ben,jie): ... }; - // LINT.ThenChange(//tensorflow/contrib/tensorrt/convert/convert_nodes.h) + // LINT.ThenChange(//tensorflow/contrib/tensorrt/convert/convert_nodes.cc) return (candidate_ops.count(node->type_string()) || PluginFactoryTensorRT::GetInstance()->IsPlugin(node->type_string())); } diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc index 146b9c7344..0ee708bc1c 100644 --- a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc +++ b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc @@ -49,9 +49,29 @@ limitations under the License. #if GOOGLE_TENSORRT #include "tensorrt/include/NvInfer.h" -// Check if the types are equal. Cast to int first so that failure log message -// would work! -#define CHECK_EQ_TYPE(val1, val2) CHECK_EQ((int)val1, (int)val2) +// Check if the types are equal. Cast to int first so that failure log message +// would work! 
+#define TFTRT_CHECK_EQ_TYPE(val1, val2) CHECK_EQ((int)val1, (int)val2) + +#define TFTRT_INTERNAL_ERROR_AT_NODE(node) \ + do { \ + return tensorflow::errors::Internal( \ + "TFTRT::", __FUNCTION__, "failed to add TRT layer, at: ", node); \ + } while (0) + +#define TFTRT_RETURN_ERROR_IF_FALSE(status, node) \ + do { \ + if (status == false) { \ + TFTRT_INTERNAL_ERROR_AT_NODE(node); \ + } \ + } while (0) + +#define TFTRT_RETURN_ERROR_IF_NULLPTR(ptr, node) \ + do { \ + if (ptr == nullptr) { \ + TFTRT_INTERNAL_ERROR_AT_NODE(node); \ + } \ + } while (0) namespace tensorflow { namespace tensorrt { @@ -75,13 +95,110 @@ inline tensorflow::Status ConvertDType(tensorflow::DataType tf_dtype, case tensorflow::DataType::DT_HALF: *trt_dtype = nvinfer1::DataType::kHALF; break; +#if NV_TENSORRT_MAJOR > 3 + case tensorflow::DataType::DT_INT32: + *trt_dtype = nvinfer1::DataType::kINT32; + break; +#endif default: return tensorflow::errors::InvalidArgument( - "Unsupported data type " + tensorflow::DataTypeString(tf_dtype)); + "Unsupported data type ", tensorflow::DataTypeString(tf_dtype)); } return tensorflow::Status::OK(); } +// Return whether or not the broadcast is feasible; +bool TensorRTGetBroadcastShape(const nvinfer1::Dims& operand_l, + const bool operand_l_is_tensor, + const nvinfer1::Dims& operand_r, + const bool operand_r_is_tensor, + nvinfer1::Dims* operand_l_new_shape, + nvinfer1::Dims* operand_r_new_shape) { + // *************************************************************************** + // TensorRT Elementwise op supports broadcast but requires both tensor to be + // of Identical rank + // + // We consider case of: + // 1. operand_l to be a Tensor & operand_r to be a Const; + // 2. operand_l to be a Tensor & operand_r to be a Tensor; + // note: const op const (constant folding) should fallback to TensorFlow + // + // broadcast scheme: + // T: 1 3 5 (tensor would not have batch dimension) + // W: 1 1 3 1 (weight would have all explicit dimensions) + // i. fill in explicit dimensions + // -> T: -1 1 3 5 (we put a -1 for batch dimension) + // -> W: 1 1 3 1 + // ii. compare broadcast feasibility + // + // We cannot support the following since TensorRT does not allow manipulation + // on batch dimension, we cannot generate output with proper shape + // T: 3 5 1 + // W: 1 1 1 1 3 5 1 + // -> T: 1 1 1 -1 3 5 1 + // -> W: 1 1 1 1 3 5 1 + // *************************************************************************** + const int max_nb_dims = nvinfer1::Dims::MAX_DIMS + 1; + const size_t element_size = sizeof(operand_l.d[0]); + + // fill in dimensions + int l_s[max_nb_dims]; + std::fill(l_s, l_s + max_nb_dims, 1); + int l_d = operand_l_is_tensor ? operand_l.nbDims + 1 : operand_l.nbDims; + int r_s[max_nb_dims]; + std::fill(r_s, r_s + max_nb_dims, 1); + int r_d = operand_r_is_tensor ? 
operand_r.nbDims + 1 : operand_r.nbDims; + + int max_d = std::max(l_d, r_d); + std::memcpy(l_s + max_d - operand_l.nbDims, operand_l.d, + operand_l.nbDims * element_size); + std::memcpy(r_s + max_d - operand_r.nbDims, operand_r.d, + operand_r.nbDims * element_size); + + // set -1 for batch dimension, since batch size is not supposed to be + // broadcasted + if (operand_l_is_tensor) { + if (max_d != l_d) { // if broadcast beyond batch dimension, fail + return false; + } + l_s[0] = -1; + } + if (operand_r_is_tensor) { + if (max_d != r_d) { // if broadcast beyond batch dimension, fail + return false; + } + r_s[0] = -1; + } + + // compare broadcast feasibility + for (int i = max_d - 1; i >= 0; i--) { + if ((l_s[i] != r_s[i]) && (l_s[i] != 1) && (r_s[i] != 1)) { + return false; + } + } + + // output new TensorRT Dimension (stripping the batch dimension) + operand_l_new_shape->nbDims = max_d - 1; + std::memcpy(operand_l_new_shape->d, l_s + 1, (max_d - 1) * element_size); + operand_r_new_shape->nbDims = max_d - 1; + std::memcpy(operand_r_new_shape->d, r_s + 1, (max_d - 1) * element_size); + + return true; +} + +inline bool DimsEqual(const nvinfer1::Dims& dim_l, + const nvinfer1::Dims& dim_r) { + if (dim_l.nbDims != dim_r.nbDims) { + return false; + } + for (int i = 0; i < dim_l.nbDims; i++) { + if (dim_l.d[i] != dim_r.d[i]) { + return false; + } + } + return true; +} + inline nvinfer1::Dims GetTensorShape(const tensorflow::Tensor& tensor) { nvinfer1::Dims dims; dims.nbDims = tensor.dims(); @@ -91,7 +208,7 @@ inline nvinfer1::Dims GetTensorShape(const tensorflow::Tensor& tensor) { return dims; } -inline int64_t GetShapeSize(nvinfer1::Dims shape) { +inline int64_t GetShapeSize(const nvinfer1::Dims& shape) { // Returns total number of elements in shape int64_t count = 1; for (int d = 0; d < shape.nbDims; ++d) { @@ -104,7 +221,7 @@ static std::vector<std::pair<int, int>> CreateSamePadding( const nvinfer1::DimsHW& stride, const nvinfer1::DimsHW& kernel, const std::vector<int64_t>& input_dims) { std::vector<std::pair<int, int>> padding(input_dims.size()); - CHECK_EQ((size_t)stride.nbDims, input_dims.size()); // TODO(jie): N+C? NC+? + CHECK_EQ(stride.nbDims, input_dims.size()); // TODO(jie): N+C? NC+? for (size_t i = 0; i < input_dims.size(); ++i) { // Formula to calculate the padding @@ -134,6 +251,7 @@ string GetCommonNameScope(const string& op_name_a, const string& op_name_b) { return op_name_a.substr(0, last_scope_separator); } +// Class to convert TF weight to TRT weight. class TRT_ShapedWeights { public: TRT_ShapedWeights(tensorflow::DataType type, const void* values, @@ -145,12 +263,14 @@ class TRT_ShapedWeights { explicit TRT_ShapedWeights(tensorflow::DataType type) : shape_(), type_(type), values_(nullptr), empty_weight_flag_(true) {} + // TODO(aaroey): use rvalue reference. TRT_ShapedWeights(const TRT_ShapedWeights& rhs) : shape_(rhs.shape_), type_(rhs.type_), values_(rhs.values_), empty_weight_flag_(rhs.empty_weight_flag_) {} + // TODO(aaroey): use GetShapeSize() instead. int64_t count() const { int64_t c = 1; for (int i = 0; i < shape_.nbDims; i++) c *= shape_.d[i]; @@ -168,6 +288,7 @@ class TRT_ShapedWeights { const void* GetValues() const { return values_; } + // TODO(aaroey): get rid of this method. void SetValues(const void* values) { values_ = values; } size_t size_bytes() const { @@ -178,10 +299,12 @@ class TRT_ShapedWeights { // Default converter operator nvinfer1::Weights() const { return GetWeightsForTRT(); } + // TODO(aaroey): make these private. 
nvinfer1::Dims shape_; tensorflow::DataType type_; private: + // TODO(aaroey): this should not be const as it's always from TRTWeightStore. const void* values_; bool empty_weight_flag_; }; @@ -192,6 +315,7 @@ class TRT_TensorOrWeights { : tensor_(tensor), weights_(DT_FLOAT), variant_(TRT_NODE_TENSOR) {} explicit TRT_TensorOrWeights(const TRT_ShapedWeights& weights) : tensor_(nullptr), weights_(weights), variant_(TRT_NODE_WEIGHTS) {} + // TODO(aaroey): use rvalue reference. TRT_TensorOrWeights(const TRT_TensorOrWeights& rhs) : tensor_(rhs.tensor_), weights_(rhs.weights_), variant_(rhs.variant_) {} ~TRT_TensorOrWeights() {} @@ -200,19 +324,19 @@ class TRT_TensorOrWeights { bool is_weights() const { return variant_ == TRT_NODE_WEIGHTS; } nvinfer1::ITensor* tensor() { - CHECK_EQ(is_tensor(), true); + CHECK(is_tensor()); return tensor_; } const nvinfer1::ITensor* tensor() const { - CHECK_EQ(is_tensor(), true); + CHECK(is_tensor()); return tensor_; } TRT_ShapedWeights& weights() { - CHECK_EQ(is_weights(), true); + CHECK(is_weights()); return weights_; } const TRT_ShapedWeights& weights() const { - CHECK_EQ(is_weights(), true); + CHECK(is_weights()); return weights_; } nvinfer1::Dims shape() const { @@ -236,21 +360,25 @@ class TFAttrs { attrs_.insert({attr.first, &attr.second}); } } - bool count(string key) const { return attrs_.count(key); } - tensorflow::AttrValue const* at(string key) const { + + bool count(const string& key) const { return attrs_.count(key); } + + tensorflow::AttrValue const* at(const string& key) const { if (!attrs_.count(key)) { LOG(FATAL) << "Attribute not found: " << key; } return attrs_.at(key); } + template <typename T> T get(const string& key) const; + template <typename T> T get(const string& key, const T& default_value) const { return attrs_.count(key) ? this->get<T>(key) : default_value; } - std::vector<string> GetAllAttrKey() { + std::vector<string> GetAllAttrKeys() const { std::vector<string> attr_list; for (const auto& attr_item : attrs_) { attr_list.emplace_back(attr_item.first); @@ -285,15 +413,6 @@ std::vector<string> TFAttrs::get<std::vector<string>>(const string& key) const { auto attr = this->at(key)->list().s(); return std::vector<string>(attr.begin(), attr.end()); } -template <> -nvinfer1::Dims TFAttrs::get<nvinfer1::Dims>(const string& key) const { - auto values = this->get<std::vector<int>>(key); - nvinfer1::Dims dims; - dims.nbDims = values.size(); - std::copy(values.begin(), values.end(), dims.d); - // Note: No dimension type information is included - return dims; -} template <> nvinfer1::DataType TFAttrs::get<nvinfer1::DataType>(const string& key) const { @@ -319,10 +438,11 @@ bool TFAttrs::get<bool>(const string& key) const { } // TODO(jie): reorder4 & reorder2 should be merged? +// TODO(aaroey): fix the order of parameters. 
template <typename T> -void Reorder4(nvinfer1::DimsNCHW shape, const T* idata, - nvinfer1::DimsNCHW istrides, T* odata, - nvinfer1::DimsNCHW ostrides) { +void Reorder4(const nvinfer1::DimsNCHW& shape, const T* idata, + const nvinfer1::DimsNCHW& istrides, T* odata, + const nvinfer1::DimsNCHW& ostrides) { for (int n = 0; n < shape.n(); ++n) { for (int c = 0; c < shape.c(); ++c) { for (int h = 0; h < shape.h(); ++h) { @@ -337,12 +457,13 @@ void Reorder4(nvinfer1::DimsNCHW shape, const T* idata, } template <typename T> -void Reorder2(nvinfer1::DimsHW shape, const T* idata, nvinfer1::DimsHW istrides, - T* odata, nvinfer1::DimsHW ostrides) { +void Reorder2(const nvinfer1::DimsHW& shape, const T* idata, + const nvinfer1::DimsHW& istrides, T* odata, + const nvinfer1::DimsHW& ostrides) { for (int h = 0; h < shape.h(); ++h) { for (int w = 0; w < shape.w(); ++w) { odata[h * ostrides.h() + w * ostrides.w()] = - idata[h * ostrides.h() + w * ostrides.w()]; + idata[h * istrides.h() + w * istrides.w()]; } } } @@ -350,16 +471,17 @@ void Reorder2(nvinfer1::DimsHW shape, const T* idata, nvinfer1::DimsHW istrides, // TODO(jie): fallback to tensorflow!! void ReorderCKtoKC(const TRT_ShapedWeights& iweights, TRT_ShapedWeights* oweights) { - int c = iweights.shape_.d[0]; - int k = iweights.shape_.d[1]; + const int c = iweights.shape_.d[0]; + const int k = iweights.shape_.d[1]; oweights->shape_.d[0] = k; oweights->shape_.d[1] = c; - nvinfer1::DimsHW istrides = {1, k}; - nvinfer1::DimsHW ostrides = {c, 1}; + const nvinfer1::DimsHW istrides = {1, k}; + const nvinfer1::DimsHW ostrides = {c, 1}; switch (iweights.type_) { case tensorflow::DataType::DT_FLOAT: { Reorder2({k, c}, static_cast<float const*>(iweights.GetValues()), istrides, + // TODO(aaroey): get rid of all the const_cast like this. 
static_cast<float*>(const_cast<void*>(oweights->GetValues())), ostrides); break; @@ -382,21 +504,24 @@ void ReorderRSCKToKCRS(const TRT_ShapedWeights& iweights, TRT_ShapedWeights* oweights, int num_groups) { CHECK_EQ(iweights.type_, oweights->type_); CHECK_EQ(iweights.size_bytes(), oweights->size_bytes()); - int r = iweights.shape_.d[0]; - int s = iweights.shape_.d[1]; - // TRT requires GKcRS, while TF depthwise has RSCK - // where c=1, C=G + // K indexes over output channels, C over input channels, and R and S over the + // height and width of the convolution + const int r = iweights.shape_.d[0]; + const int s = iweights.shape_.d[1]; + // TRT requires GKcRS, while TF depthwise has RSCK where c=1, C=G VLOG(2) << "num_groups: " << num_groups; - int c = iweights.shape_.d[2] / num_groups; + const int c = iweights.shape_.d[2] / num_groups; VLOG(2) << "c" << iweights.shape_.d[2] << " then " << c; - int k = iweights.shape_.d[3] * num_groups; + const int k = iweights.shape_.d[3] * num_groups; VLOG(2) << "k" << iweights.shape_.d[3] << " then " << k; + VLOG(2) << "r" << iweights.shape_.d[0] << " then " << r; + VLOG(2) << "s" << iweights.shape_.d[1] << " then " << s; oweights->shape_.d[0] = k / num_groups; oweights->shape_.d[1] = c * num_groups; oweights->shape_.d[2] = r; oweights->shape_.d[3] = s; - nvinfer1::DimsNCHW istrides = {1, k, s * k * c, c * k}; - nvinfer1::DimsNCHW ostrides = {c * r * s, r * s, s, 1}; + const nvinfer1::DimsNCHW istrides = {1, k, s * k * c, c * k}; + const nvinfer1::DimsNCHW ostrides = {c * r * s, r * s, s, 1}; switch (iweights.type_) { case tensorflow::DataType::DT_FLOAT: { Reorder4({k, c, r, s}, static_cast<float const*>(iweights.GetValues()), @@ -428,11 +553,14 @@ using OpConverter = std::vector<TRT_TensorOrWeights>*)>; class Converter { + // TODO(aaroey): fix the order of members. std::unordered_map<string, TRT_TensorOrWeights> trt_tensors_; std::unordered_map<string, OpConverter> op_registry_; OpConverter plugin_converter_; nvinfer1::INetworkDefinition* trt_network_; std::list<std::vector<uint8_t>> temp_bufs_; + // TODO(aaroey): inline the definition of TRTWeightStore here, and add APIs to + // operate the stored weights instead of operating it directly. TRTWeightStore* weight_store_; bool fp16_; void register_op_converters(); @@ -440,7 +568,7 @@ class Converter { std::vector<TRT_TensorOrWeights>* inputs) { for (auto const& input_name : node_def.input()) { /************************************************************************* - * TODO(jie) handle case 1) here + * TODO(jie): handle case 1) here. * Normalizes the inputs and extracts associated metadata: * 1) Inputs can contain a colon followed by a suffix of characters. * That suffix may be a single number (e.g. inputName:1) or several @@ -454,6 +582,7 @@ class Converter { if (input_name[0] == '^') continue; string name = input_name; auto first = name.find_first_of(':'); + // TODO(aaroey): why removing the colon but not the zero? A bug? if (first != string::npos && first + 2 == name.size() && name[first + 1] == '0') name.erase(first); @@ -462,12 +591,13 @@ class Converter { if (trt_tensors_.count(name)) { inputs->push_back(trt_tensors_.at(name)); } else { - string str("Node "); - StrAppend(&str, node_def.name(), " should have an input named '", name, + // TODO(aaroey): this should not happen, make it a CHECK. + // TODO(aaroey): use StrCat for pattern like this. 
+ string msg("Node "); + StrAppend(&msg, node_def.name(), " should have an input named '", name, "' but it is not available"); - LOG(WARNING) << "input: " << name << " not available for node at " - << node_def.name(); - return tensorflow::errors::InvalidArgument(str); + LOG(ERROR) << msg; + return tensorflow::errors::InvalidArgument(msg); } } return tensorflow::Status::OK(); @@ -488,6 +618,7 @@ class Converter { weights.SetValues(weight_store_->store_.back().data()); return weights; } + // TODO(aaroey): fix all the namings. bool isFP16() { return fp16_; } TRT_ShapedWeights get_temp_weights_like(const TRT_ShapedWeights& weights) { return this->get_temp_weights(weights.type_, weights.shape_); @@ -496,7 +627,7 @@ class Converter { tensorflow::Status convert_node(const tensorflow::NodeDef& node_def) { std::vector<TRT_TensorOrWeights> inputs; TF_RETURN_IF_ERROR(this->get_inputs(node_def, &inputs)); - string op = node_def.op(); + const string& op = node_def.op(); std::vector<TRT_TensorOrWeights> outputs; if (PluginFactoryTensorRT::GetInstance()->IsPlugin(op)) { TF_RETURN_IF_ERROR(plugin_converter_(*this, node_def, inputs, &outputs)); @@ -509,7 +640,7 @@ class Converter { TF_RETURN_IF_ERROR(op_converter(*this, node_def, inputs, &outputs)); } for (size_t i = 0; i < outputs.size(); ++i) { - TRT_TensorOrWeights output = outputs.at(i); + TRT_TensorOrWeights& output = outputs[i]; // TODO(jie): tf protobuf seems to be omitting the :0 suffix string output_name = node_def.name(); if (i != 0) output_name = StrCat(output_name, ":", i); @@ -527,26 +658,29 @@ class Converter { nvinfer1::INetworkDefinition* network() { return trt_network_; } - TRT_TensorOrWeights get_tensor(string name) { + TRT_TensorOrWeights get_tensor(const string& name) { if (!trt_tensors_.count(name)) { return TRT_TensorOrWeights(nullptr); } return trt_tensors_.at(name); } - bool insert_input_tensor(string name, nvinfer1::ITensor* tensor) { + bool insert_input_tensor(const string& name, nvinfer1::ITensor* tensor) { return trt_tensors_.insert({name, TRT_TensorOrWeights(tensor)}).second; } nvinfer1::ITensor* TransposeTensor(nvinfer1::ITensor* input_tensor, - std::vector<int> order) { - auto dims = input_tensor->getDimensions(); + const std::vector<int>& order) { + const auto dims = input_tensor->getDimensions(); // TODO(jie): change the return to status and properly exit if (order.size() - 1 != size_t(dims.nbDims)) LOG(ERROR) << "Dimension does not match, fail gracefully"; nvinfer1::IShuffleLayer* layer = this->network()->addShuffle(*input_tensor); + if (layer == nullptr) { + return nullptr; + } nvinfer1::Permutation permutation; for (int32_t i = 0; i < dims.nbDims; ++i) { permutation.order[i] = order[i + 1] - 1; @@ -577,13 +711,14 @@ TRT_ShapedWeights ConvertFP32ToFP16(Converter& ctx, } return weights; } + // **************************************************************************** // Constant folding functions // TODO(jie): once optimizer kicks in, we should have done constant folding // there. 
-//*****************************************************************************/ +// ***************************************************************************** struct LambdaFactory { - enum class OP_CATEGORY : int { RSQRT = 0, NEG, ADD, MUL, SUB }; + enum class OP_CATEGORY : int { RSQRT = 0, NEG, ADD, MUL, SUB, RECIP }; OP_CATEGORY op; template <typename T> @@ -595,6 +730,8 @@ struct LambdaFactory { } case OP_CATEGORY::NEG: return [](T t) -> T { return -t; }; + case OP_CATEGORY::RECIP: + return [](T t) -> T { return 1.0 / t; }; default: VLOG(2) << "Not supported op for unary: " << static_cast<int>(op); return nullptr; @@ -628,7 +765,6 @@ struct LambdaFactory { VLOG(2) << "LAMBDA VAL : " << val; return l + val; }; - // Return [val](T l)-> T {return l+val;}; case OP_CATEGORY::SUB: return [val](T l) -> T { VLOG(2) << "LAMBDA VAL : " << val; @@ -688,11 +824,13 @@ std::function<Eigen::half(Eigen::half)> LambdaFactory::unary<Eigen::half>() { } case OP_CATEGORY::NEG: return [](Eigen::half t) -> Eigen::half { return -t; }; + // TODO(aaroey): can we support RECIP? default: VLOG(2) << "Not supported op for unary: " << static_cast<int>(op); return nullptr; } } + tensorflow::Status UnaryCompute(const TRT_ShapedWeights& iweights, TRT_ShapedWeights* oweights, LambdaFactory unary_op) { @@ -738,6 +876,7 @@ tensorflow::Status BinaryCompute(const TRT_ShapedWeights& iweights_l, if (iweights_l.count() != iweights_r.count()) { // We only supports broadcast of RankZero if (iweights_l.count() == 1) { + // TODO(aaroey): Remove loggings like this. VLOG(2) << "I bet it is not working!" << (*inp_l); std::transform(inp_r, inp_r + iweights_r.count(), oup, binary_op.broadcast_l<float>(*inp_l)); @@ -790,117 +929,21 @@ tensorflow::Status BinaryCompute(const TRT_ShapedWeights& iweights_l, return tensorflow::Status::OK(); } -tensorflow::Status ConstantFoldUnary( - Converter& ctx, const tensorflow::NodeDef& node_def, - const std::vector<TRT_TensorOrWeights>& inputs, - std::vector<TRT_TensorOrWeights>* outputs) { - TRT_ShapedWeights weights_input = inputs.at(0).weights(); - - // Allocate output weights - TRT_ShapedWeights weights_output = ctx.get_temp_weights_like(weights_input); - - // FIXME assume type matches input weights - // Get trt type & shape - // Maybe this part has to be moved into the block of rsqrt later - // Check type consistency - CHECK_EQ(weights_input.type_, - TFAttrs(node_def).get<tensorflow::DataType>("T")); - - LambdaFactory unary_op; - if (node_def.op() == "Rsqrt") { - // Compute rsqrt - unary_op.op = LambdaFactory::OP_CATEGORY::RSQRT; - auto ret = UnaryCompute(weights_input, &weights_output, unary_op); - // Pass the output - if (ret == tensorflow::Status::OK()) { - outputs->push_back(TRT_TensorOrWeights(weights_output)); - } - return ret; - } else { - return tensorflow::errors::Unimplemented("Binary op not supported: " + - node_def.op()); - } -} - -// TODO(jie,ben) broadcast is needed yet not implemented -// Let's get the simple stuff working first. 
Maybe we should fall back to TF -// approach for constant folding -tensorflow::Status ConstantFoldBinary( - Converter& ctx, const tensorflow::NodeDef& node_def, - const std::vector<TRT_TensorOrWeights>& inputs, - std::vector<TRT_TensorOrWeights>* outputs) { - TRT_ShapedWeights weights_input_l = inputs.at(0).weights(); - TRT_ShapedWeights weights_input_r = inputs.at(1).weights(); - - // Check type consistency - CHECK_EQ(weights_input_l.type_, weights_input_r.type_); - - if (weights_input_l.shape_.nbDims != weights_input_r.shape_.nbDims) - return tensorflow::errors::Unimplemented( - "Binary op implicit broadcast not supported: " + node_def.op()); - - // TODO(jie): constant fold should really fall back to TF. - int num_dims = weights_input_l.shape_.nbDims; - nvinfer1::Dims output_shape; - output_shape.nbDims = num_dims; - VLOG(2) << "nb_dims: " << num_dims - << ", the other: " << weights_input_r.shape_.nbDims; - for (int i = 0; i < num_dims; i++) { - if (weights_input_l.shape_.d[i] == weights_input_r.shape_.d[i]) { - output_shape.d[i] = weights_input_l.shape_.d[i]; - } else if (weights_input_l.shape_.d[i] == 1 || - weights_input_r.shape_.d[i] == 1) { - output_shape.d[i] = - std::max(weights_input_l.shape_.d[i], weights_input_r.shape_.d[i]); - } else { - return tensorflow::errors::Unimplemented( - "Binary op with incompatible shape at, " + node_def.op()); - } - VLOG(2) << "left: " << weights_input_l.shape_.d[i] - << "right: " << weights_input_r.shape_.d[i] - << "output: " << output_shape.d[i]; - } - - // FIXME assume type matches input weights - // Get trt type & shape - TFAttrs attrs(node_def); - // Maybe this part has to be moved into the block of rsqrt later - tensorflow::DataType dtype = attrs.get<tensorflow::DataType>("T"); - - // Allocate output weights - TRT_ShapedWeights weights_output = ctx.get_temp_weights(dtype, output_shape); - - LambdaFactory binary_op; - if (node_def.op() == "Sub") { - binary_op.op = LambdaFactory::OP_CATEGORY::SUB; - } else if (node_def.op() == "Mul") { - binary_op.op = LambdaFactory::OP_CATEGORY::MUL; - } else if (node_def.op() == "Add") { - binary_op.op = LambdaFactory::OP_CATEGORY::ADD; - } else { - return tensorflow::errors::Unimplemented("Binary op not supported: " + - node_def.op()); - } - auto ret = BinaryCompute(weights_input_l, weights_input_r, &weights_output, - binary_op); - - // Pass the output - if (ret == tensorflow::Status::OK()) { - outputs->push_back(TRT_TensorOrWeights(weights_output)); - } - - return ret; -} - // TODO(jie): broadcast is needed yet not implemented. // Only implemented channel wise for the time being tensorflow::Status BinaryTensorOpWeight( Converter& ctx, const tensorflow::NodeDef& node_def, const nvinfer1::ITensor* tensor, TRT_ShapedWeights weights, - std::vector<TRT_TensorOrWeights>* outputs) { - // FIXME assume type matches input weights - // Get trt type & shape - // Maybe this part has to be moved into the block of rsqrt later + bool swapped_inputs, std::vector<TRT_TensorOrWeights>* outputs) { + // tensor is the left operand while weights is the right operand; + // when swapped_inputs set to true, those two are swapped. + // TODO(aaroey): use a set. 
+ if (node_def.op() != "Sub" && node_def.op() != "Add" && + node_def.op() != "Mul" && node_def.op() != "Div" && + node_def.op() != "RealDiv") { + return tensorflow::errors::Unimplemented( + "op not supported: " + node_def.op() + ", at: " + node_def.name()); + } // Check type consistency nvinfer1::DataType ttype; @@ -910,6 +953,12 @@ tensorflow::Status BinaryTensorOpWeight( auto dims_w = weights.shape_; auto dims_t = tensor->getDimensions(); + // TODO(jie): addScale checks for input tensor dimension + if (dims_t.nbDims != 3) { + return tensorflow::errors::InvalidArgument( + "addScale requires tensor with rank 3, " + node_def.name()); + } + // default to element-wise auto scale_mode = nvinfer1::ScaleMode::kELEMENTWISE; @@ -980,6 +1029,7 @@ tensorflow::Status BinaryTensorOpWeight( permutation[dims_t.nbDims] = 1; tensor = ctx.TransposeTensor(const_cast<nvinfer1::ITensor*>(tensor), permutation); + TFTRT_RETURN_ERROR_IF_NULLPTR(tensor, node_def.name()); } else { return tensorflow::errors::InvalidArgument( "Transpose cannot be applied, " + node_def.name()); @@ -997,11 +1047,35 @@ tensorflow::Status BinaryTensorOpWeight( // Maybe I should do a switch if (node_def.op() == "Sub") { - TRT_ShapedWeights neg_weights = ctx.get_temp_weights_like(weights); - LambdaFactory unary_op; - unary_op.op = LambdaFactory::OP_CATEGORY::NEG; - TF_RETURN_IF_ERROR(UnaryCompute(weights, &neg_weights, unary_op)); - shift_weights = neg_weights; + if (swapped_inputs) { + shift_weights = weights; + nvinfer1::IUnaryLayer* layer = + ctx.network()->addUnary(*const_cast<nvinfer1::ITensor*>(tensor), + nvinfer1::UnaryOperation::kNEG); + TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); + tensor = layer->getOutput(0); + } else { + TRT_ShapedWeights neg_weights = ctx.get_temp_weights_like(weights); + LambdaFactory unary_op; + unary_op.op = LambdaFactory::OP_CATEGORY::NEG; + TF_RETURN_IF_ERROR(UnaryCompute(weights, &neg_weights, unary_op)); + shift_weights = neg_weights; + } + } else if (node_def.op() == "Div" || node_def.op() == "RealDiv") { + if (swapped_inputs) { + scale_weights = weights; + nvinfer1::IUnaryLayer* layer = + ctx.network()->addUnary(*const_cast<nvinfer1::ITensor*>(tensor), + nvinfer1::UnaryOperation::kRECIP); + TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); + tensor = layer->getOutput(0); + } else { + TRT_ShapedWeights recip_weights = ctx.get_temp_weights_like(weights); + LambdaFactory unary_op; + unary_op.op = LambdaFactory::OP_CATEGORY::RECIP; + TF_RETURN_IF_ERROR(UnaryCompute(weights, &recip_weights, unary_op)); + scale_weights = recip_weights; + } } else if (node_def.op() == "Mul") { scale_weights = weights; } else if (node_def.op() == "Add") { @@ -1014,11 +1088,13 @@ tensorflow::Status BinaryTensorOpWeight( nvinfer1::IScaleLayer* layer = ctx.network()->addScale( *const_cast<nvinfer1::ITensor*>(tensor), scale_mode, shift_weights, scale_weights, power_weights); + TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); nvinfer1::ITensor* output_tensor = layer->getOutput(0); // transpose back dimension if (permutation_flag) { output_tensor = ctx.TransposeTensor(output_tensor, permutation); + TFTRT_RETURN_ERROR_IF_NULLPTR(output_tensor, node_def.name()); } // Pass the output @@ -1042,20 +1118,31 @@ tensorflow::Status ConvertConv2DHelper( if (data_format == "NHWC") { tensor = ctx.TransposeTensor(const_cast<nvinfer1::ITensor*>(tensor), {0, 3, 1, 2}); + TFTRT_RETURN_ERROR_IF_NULLPTR(tensor, node_def.name()); h_index = 1; w_index = 2; // TODO(jie): transpose it } // tensor after transpose (NCHW) - auto 
tensor_dim = tensor->getDimensions(); + const auto tensor_dim = tensor->getDimensions(); int num_groups = group; - if (num_groups == 0) // depthwise convolution - num_groups = tensor_dim.d[0]; + if (num_groups == 0) num_groups = tensor_dim.d[0]; // depthwise convolution VLOG(2) << "groups count: " << num_groups; TRT_ShapedWeights weights_rsck = inputs.at(1).weights(); + + VLOG(2) << "weight shape: " << weights_rsck.shape_.nbDims; + for (int i = 0; i < weights_rsck.shape_.nbDims; i++) { + VLOG(2) << weights_rsck.shape_.d[i]; + } + + if (weights_rsck.shape_.nbDims != 4) { + return tensorflow::errors::Internal( + "Conv2D expects kernel of dimension 4, at: " + node_def.name()); + } + if (ctx.isFP16()) { weights_rsck = ConvertFP32ToFP16(ctx, inputs.at(1).weights()); } @@ -1063,18 +1150,22 @@ tensorflow::Status ConvertConv2DHelper( TRT_ShapedWeights weights = ctx.get_temp_weights_like(weights_rsck); ReorderRSCKToKCRS(weights_rsck, &weights, num_groups); TRT_ShapedWeights biases(weights.type_); - int noutput = weights.shape_.d[0] * num_groups; + const int noutput = weights.shape_.d[0] * num_groups; nvinfer1::DimsHW kernel_size; kernel_size.h() = weights.shape_.d[2]; kernel_size.w() = weights.shape_.d[3]; + VLOG(2) << "RSCK: "; + for (int i = 0; i < 4; i++) { + VLOG(2) << " " << weights.shape_.d[i]; + } VLOG(2) << "kernel size: " << kernel_size.h() << ", " << kernel_size.w(); // TODO(jie): stride. (NHWC/NCHW) - auto tf_stride = attrs.get<std::vector<int>>("strides"); + const auto tf_stride = attrs.get<std::vector<int>>("strides"); VLOG(2) << "h_INDEX" << h_index << ", w_index " << w_index; VLOG(2) << "stride!!!: " << tf_stride[0] << tf_stride[1] << tf_stride[2] << tf_stride[3]; - nvinfer1::DimsHW stride(tf_stride[h_index], tf_stride[w_index]); + const nvinfer1::DimsHW stride(tf_stride[h_index], tf_stride[w_index]); std::vector<std::pair<int, int>> padding; // TODO(jie): padding. @@ -1102,6 +1193,7 @@ tensorflow::Status ConvertConv2DHelper( *const_cast<nvinfer1::ITensor*>(tensor), nvinfer1::DimsHW(padding[0].first, padding[1].first), nvinfer1::DimsHW(padding[0].second, padding[1].second)); + TFTRT_RETURN_ERROR_IF_NULLPTR(pad_layer, node_def.name()); padding = {{0, 0}, {0, 0}}; tensor = pad_layer->getOutput(0); auto dim_after = tensor->getDimensions(); @@ -1112,6 +1204,7 @@ tensorflow::Status ConvertConv2DHelper( nvinfer1::IConvolutionLayer* layer = ctx.network()->addConvolution(*const_cast<nvinfer1::ITensor*>(tensor), noutput, kernel_size, weights, biases); + TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); layer->setStride(stride); layer->setPadding({padding[0].first, padding[1].first}); @@ -1126,6 +1219,7 @@ tensorflow::Status ConvertConv2DHelper( if (data_format == "NHWC") { // TODO(jie): transpose it back! output_tensor = ctx.TransposeTensor(output_tensor, {0, 2, 3, 1}); + TFTRT_RETURN_ERROR_IF_NULLPTR(output_tensor, node_def.name()); } else { VLOG(2) << "NCHW !!!!"; } @@ -1147,35 +1241,91 @@ tensorflow::Status ConvertConv2DHelper( node_def.name()); } +// Helper function converts input into tensor with shape specified by dims. 
+bool PrepareTensorForShape(Converter& ctx, const TRT_TensorOrWeights& input, + const nvinfer1::Dims& dims, + const nvinfer1::ITensor** tensor) { + if (input.is_tensor()) { + if (DimsEqual(input.shape(), dims)) { + *tensor = input.tensor(); + } else { + nvinfer1::IShuffleLayer* layer = ctx.network()->addShuffle( + *const_cast<nvinfer1::ITensor*>(input.tensor())); + if (layer != nullptr) { + layer->setReshapeDimensions(dims); + *tensor = layer->getOutput(0); + } else { + return false; + } + } + } else { +#if NV_TENSORRT_MAJOR > 3 + nvinfer1::IConstantLayer* layer = + ctx.network()->addConstant(dims, input.weights()); + if (layer != nullptr) { + *tensor = layer->getOutput(0); + } else { + return false; + } +#else + return false; +#endif + } + return true; +} + tensorflow::Status BinaryTensorOpTensor( Converter& ctx, const tensorflow::NodeDef& node_def, - const nvinfer1::ITensor* tensor_l, const nvinfer1::ITensor* tensor_r, + const TRT_TensorOrWeights& operand_l, const TRT_TensorOrWeights& operand_r, std::vector<TRT_TensorOrWeights>* outputs) { static const std::unordered_map<string, nvinfer1::ElementWiseOperation> ops{ {"Add", nvinfer1::ElementWiseOperation::kSUM}, {"Mul", nvinfer1::ElementWiseOperation::kPROD}, {"Sub", nvinfer1::ElementWiseOperation::kSUB}, {"Div", nvinfer1::ElementWiseOperation::kDIV}, + {"RealDiv", nvinfer1::ElementWiseOperation::kDIV}, + {"Minimum", nvinfer1::ElementWiseOperation::kMIN}, + {"Maximum", nvinfer1::ElementWiseOperation::kMAX}, }; - // FIXME assume type matches input weights + const nvinfer1::ITensor* tensor_l; + const nvinfer1::ITensor* tensor_r; + + nvinfer1::Dims dim_l; + nvinfer1::Dims dim_r; + + if (!TensorRTGetBroadcastShape(operand_l.shape(), operand_l.is_tensor(), + operand_r.shape(), operand_r.is_tensor(), + &dim_l, &dim_r)) { + return tensorflow::errors::InvalidArgument( + "Binary op broadcast scheme not supported by TensorRT op: " + + node_def.op() + ", at: " + node_def.name()); + } + + TFTRT_RETURN_ERROR_IF_FALSE( + PrepareTensorForShape(ctx, operand_l, dim_l, &tensor_l), node_def.name()); + TFTRT_RETURN_ERROR_IF_FALSE( + PrepareTensorForShape(ctx, operand_r, dim_r, &tensor_r), node_def.name()); + // get trt type & shape TFAttrs attrs(node_def); // maybe this part has to be moved into the block of rsqrt later nvinfer1::DataType dtype = attrs.get<nvinfer1::DataType>("T"); // check type consistency - CHECK_EQ_TYPE(tensor_l->getType(), dtype); - CHECK_EQ_TYPE(tensor_r->getType(), dtype); + TFTRT_CHECK_EQ_TYPE(tensor_l->getType(), dtype); + TFTRT_CHECK_EQ_TYPE(tensor_r->getType(), dtype); auto op_pair = ops.find(node_def.op()); - if (op_pair == ops.end()) + if (op_pair == ops.end()) { return tensorflow::errors::Unimplemented( - "binary op: " + node_def.op() + - " not supported at: " + node_def.name()); + "binary op: ", node_def.op(), " not supported at: ", node_def.name()); + } nvinfer1::IElementWiseLayer* layer = ctx.network()->addElementWise( + // TODO(aaroey): will tensor_l/tensor_r get modified? 
*const_cast<nvinfer1::ITensor*>(tensor_l), *const_cast<nvinfer1::ITensor*>(tensor_r), op_pair->second); + TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); nvinfer1::ITensor* output_tensor = layer->getOutput(0); @@ -1202,7 +1352,7 @@ tensorflow::Status ConvertPlugin(Converter& ctx, // passing attributes // TODO(jie): support more general attribute TFAttrs attrs(node_def); - auto attr_key_vector = attrs.GetAllAttrKey(); + auto attr_key_vector = attrs.GetAllAttrKeys(); for (auto attr_key : attr_key_vector) { // TODO(jie): support only list of float for toy example here. auto data = attrs.get<std::vector<float>>(attr_key); @@ -1223,29 +1373,6 @@ tensorflow::Status ConvertPlugin(Converter& ctx, return tensorflow::Status::OK(); } -tensorflow::Status ConvertPlaceholder( - Converter& ctx, const tensorflow::NodeDef& node_def, - const std::vector<TRT_TensorOrWeights>& inputs, - std::vector<TRT_TensorOrWeights>* outputs) { - VLOG(2) << "Placeholder should have been replace already"; - return tensorflow::errors::Unimplemented("cannot convert Placeholder op"); - // OK this make sense since we are supposed to replace it with input - TFAttrs attrs(node_def); - nvinfer1::DataType dtype = attrs.get<nvinfer1::DataType>("dtype"); - nvinfer1::Dims dims = attrs.get<nvinfer1::Dims>("shape"); - - dims.nbDims--; - for (int i = 0; i < dims.nbDims; i++) dims.d[i] = dims.d[i + 1]; - - nvinfer1::ITensor* output = - ctx.network()->addInput(node_def.name().c_str(), dtype, dims); - if (!output) { - return tensorflow::errors::InvalidArgument("Failed to create Input layer"); - } - outputs->push_back(TRT_TensorOrWeights(output)); - return tensorflow::Status::OK(); -} - tensorflow::Status ConvertConv2D(Converter& ctx, const tensorflow::NodeDef& node_def, const std::vector<TRT_TensorOrWeights>& inputs, @@ -1271,65 +1398,64 @@ tensorflow::Status ConvertPool(Converter& ctx, int h_index = 2; int w_index = 3; - auto data_format = attrs.get<string>("data_format"); + const auto data_format = attrs.get<string>("data_format"); if (data_format == "NHWC") { h_index = 1; w_index = 2; tensor = ctx.TransposeTensor(const_cast<nvinfer1::ITensor*>(tensor), {0, 3, 1, 2}); - } else { - VLOG(2) << "NCHW !!!!"; + TFTRT_RETURN_ERROR_IF_NULLPTR(tensor, node_def.name()); } + nvinfer1::PoolingType type; - // TODO(jie): support other pooling type - if (node_def.op() == "MaxPool") + if (node_def.op() == "MaxPool") { type = nvinfer1::PoolingType::kMAX; - else if (node_def.op() == "AvgPool") + } else if (node_def.op() == "AvgPool") { type = nvinfer1::PoolingType::kAVERAGE; - else - return tensorflow::errors::Unimplemented("Only supports Max pool"); + } else { + return tensorflow::errors::Unimplemented("Unsupported pool type: ", + node_def.op()); + } - // TODO(jie): NCHW - auto tf_stride = attrs.get<std::vector<int>>("strides"); - nvinfer1::DimsHW stride(tf_stride[h_index], tf_stride[w_index]); + const auto tf_stride = attrs.get<std::vector<int>>("strides"); + const nvinfer1::DimsHW stride(tf_stride[h_index], tf_stride[w_index]); - auto tf_kernel = attrs.get<std::vector<int>>("ksize"); - nvinfer1::DimsHW ksize(tf_kernel[h_index], tf_kernel[w_index]); + const auto tf_kernel = attrs.get<std::vector<int>>("ksize"); + const nvinfer1::DimsHW ksize(tf_kernel[h_index], tf_kernel[w_index]); auto tensor_dim = tensor->getDimensions(); std::vector<std::pair<int, int>> padding; - // TODO(jie): padding. 
- if (attrs.get<string>("padding") == "SAME") { + const string padding_type = attrs.get<string>("padding"); + if (padding_type == "SAME") { // This is NCHW tensor with no batch dimension. // 1 -> h // 2 -> w padding = CreateSamePadding( stride, ksize, {static_cast<int>(tensor_dim.d[1]), static_cast<int>(tensor_dim.d[2])}); - } else if (attrs.get<string>("padding") == "VALID") { - // No padding for valid padding here - VLOG(2) << "No padding added for VALID padding in pool" << node_def.name(); + } else if (padding_type == "VALID") { padding = {{0, 0}, {0, 0}}; } else { - return tensorflow::errors::Unimplemented( - "Current MaxPool cannot support padding other than SAME"); + return tensorflow::errors::Unimplemented("Unsupported padding type: ", + padding_type); } if (padding[0].first != padding[0].second || padding[1].first != padding[1].second) { - // TODO(jie): handle asymmetric padding VLOG(2) << "Padding!!!: " << padding[0].first << padding[0].second << padding[1].first << padding[1].second; auto pad_layer = ctx.network()->addPadding( *const_cast<nvinfer1::ITensor*>(tensor), nvinfer1::DimsHW(padding[0].first, padding[1].first), nvinfer1::DimsHW(padding[0].second, padding[1].second)); + TFTRT_RETURN_ERROR_IF_NULLPTR(pad_layer, node_def.name()); padding = {{0, 0}, {0, 0}}; tensor = pad_layer->getOutput(0); } nvinfer1::IPoolingLayer* layer = ctx.network()->addPooling( *const_cast<nvinfer1::ITensor*>(tensor), type, ksize); + TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); layer->setStride(stride); layer->setPadding({padding[0].first, padding[1].first}); @@ -1337,10 +1463,8 @@ tensorflow::Status ConvertPool(Converter& ctx, nvinfer1::ITensor* output_tensor = layer->getOutput(0); if (data_format == "NHWC") { - // TODO(jie): transpose it back! output_tensor = ctx.TransposeTensor(output_tensor, {0, 2, 3, 1}); - } else { - VLOG(2) << "NCHW !!!!"; + TFTRT_RETURN_ERROR_IF_NULLPTR(output_tensor, node_def.name()); } outputs->push_back(TRT_TensorOrWeights(output_tensor)); return tensorflow::Status::OK(); @@ -1353,6 +1477,7 @@ tensorflow::Status ConvertActivation( const nvinfer1::ITensor* tensor = inputs.at(0).tensor(); nvinfer1::IActivationLayer* layer = ctx.network()->addActivation( *const_cast<nvinfer1::ITensor*>(tensor), nvinfer1::ActivationType::kRELU); + TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); nvinfer1::ITensor* output_tensor = layer->getOutput(0); outputs->push_back(TRT_TensorOrWeights(output_tensor)); return tensorflow::Status::OK(); @@ -1363,40 +1488,61 @@ tensorflow::Status ConvertScale(Converter& ctx, const std::vector<TRT_TensorOrWeights>& inputs, std::vector<TRT_TensorOrWeights>* outputs) { if (inputs.size() != 2 || !inputs.at(0).is_tensor() || - !inputs.at(1).is_weights()) + !inputs.at(1).is_weights()) { return tensorflow::errors::Unimplemented( - "Only supports tensor op weight for now, at " + node_def.name()); - // Implement tensor binaryOp weight [channel wise] for now; - const nvinfer1::ITensor* tensor = inputs.at(0).tensor(); + "ConvertScale only supports tensor<op>weight: ", node_def.name()); + } + const nvinfer1::ITensor* tensor = inputs.at(0).tensor(); TRT_ShapedWeights weights = inputs.at(1).weights(); if (ctx.isFP16()) { weights = ConvertFP32ToFP16(ctx, inputs.at(1).weights()); } TRT_ShapedWeights empty_weights(weights.type_); - TFAttrs attrs(node_def); - // Transpose NHWC - auto data_format = attrs.get<string>("data_format"); + const auto data_format = attrs.get<string>("data_format"); + int channel_index; + const auto dims = tensor->getDimensions(); if 
(data_format == "NHWC") { - tensor = ctx.TransposeTensor(const_cast<nvinfer1::ITensor*>(tensor), - {0, 3, 1, 2}); - // TODO(jie): transpose it + // 1). NHWC is really N+C + channel_index = dims.nbDims - 1; // batch dimension is implicit here! } else { - VLOG(2) << "NCHW !!!!"; + // 2). NCHW is really N+CHW + channel_index = dims.nbDims - 3; // batch dimension is implicit here! } - auto dims = tensor->getDimensions(); - VLOG(2) << "tensor dimensions: " << dims.nbDims; - for (int i = 0; i < dims.nbDims; i++) { - VLOG(2) << "i: " << dims.d[i]; + nvinfer1::Permutation permutation; + for (int32_t i = 0; i < dims.nbDims; ++i) { + permutation.order[i] = i; } - dims = weights.shape_; - VLOG(2) << "tensor dimensions: " << dims.nbDims; - for (int i = 0; i < dims.nbDims; i++) { - VLOG(2) << "i: " << dims.d[i]; + + if (channel_index >= 0) { + permutation.order[0] = channel_index; + permutation.order[channel_index] = 0; + } else { + return tensorflow::errors::Unimplemented( + "TFTRT::BiasAdd cannot apply on batch dimension, at ", node_def.name()); + } + + // TensorRT addScale requires input to be of rank 3, we need to apply + // transpose as well as reshape + if (channel_index != 0 || dims.nbDims != 3) { + nvinfer1::IShuffleLayer* shuffle_layer = + ctx.network()->addShuffle(*const_cast<nvinfer1::ITensor*>(tensor)); + TFTRT_RETURN_ERROR_IF_NULLPTR(shuffle_layer, node_def.name()); + nvinfer1::Dims reshape_dims; + reshape_dims.nbDims = 3; + reshape_dims.d[0] = 0; // 0 copy from the input + reshape_dims.d[1] = dims.nbDims >= 2 ? 0 : 1; // 0 copy from the input + reshape_dims.d[2] = dims.nbDims >= 3 ? -1 : 1; // -1 infer from the rest + if (channel_index != 0) { + // maybe we do not need this check. concerned about TRT optimization + shuffle_layer->setFirstTranspose(permutation); + } + shuffle_layer->setReshapeDimensions(reshape_dims); + tensor = shuffle_layer->getOutput(0); } nvinfer1::ScaleMode mode = nvinfer1::ScaleMode::kCHANNEL; @@ -1407,14 +1553,26 @@ tensorflow::Status ConvertScale(Converter& ctx, nvinfer1::IScaleLayer* layer = ctx.network()->addScale(*const_cast<nvinfer1::ITensor*>(tensor), mode, weights, empty_weights, empty_weights); + TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); nvinfer1::ITensor* output_tensor = layer->getOutput(0); - if (data_format == "NHWC") { - // TODO(jie): transpose it back! 
- output_tensor = ctx.TransposeTensor(output_tensor, {0, 2, 3, 1}); - } else { - VLOG(2) << "NCHW !!!!"; + + // restore transpose & reshape + if (channel_index != 0 || dims.nbDims != 3) { + nvinfer1::IShuffleLayer* shuffle_layer = ctx.network()->addShuffle( + *const_cast<nvinfer1::ITensor*>(output_tensor)); + TFTRT_RETURN_ERROR_IF_NULLPTR(shuffle_layer, node_def.name()); + nvinfer1::Dims reshape_dims = dims; + int tmp = reshape_dims.d[channel_index]; + reshape_dims.d[channel_index] = reshape_dims.d[0]; + reshape_dims.d[0] = tmp; + shuffle_layer->setReshapeDimensions(reshape_dims); + if (channel_index != 0) { + shuffle_layer->setSecondTranspose(permutation); + } + output_tensor = shuffle_layer->getOutput(0); } + outputs->push_back(TRT_TensorOrWeights(output_tensor)); return tensorflow::Status::OK(); } @@ -1431,11 +1589,13 @@ tensorflow::Status ConvertConst(Converter& ctx, // Create shaped weights as output tensorflow::Tensor tensor; - if (!tensor.FromProto(weights_tensor)) - return tensorflow::errors::Internal("Cannot parse weight tensor proto: " + + if (!tensor.FromProto(weights_tensor)) { + return tensorflow::errors::Internal("Cannot parse weight tensor proto: ", node_def.name()); + } TRT_ShapedWeights weights(dtype); + // TODO(aaroey): we should choose the array using dtype and shape. if (!weights_tensor.float_val().empty()) { VLOG(2) << "SCALAR!!!" << node_def.name(); nvinfer1::Dims scalar_shape; @@ -1443,22 +1603,16 @@ tensorflow::Status ConvertConst(Converter& ctx, VLOG(2) << "dimensions: " << tensor.dims(); VLOG(2) << "size: " << weights_tensor.float_val_size(); scalar_shape = GetTensorShape(tensor); + VLOG(2) << "details: "; for (int i = 0; i < scalar_shape.nbDims; i++) VLOG(2) << scalar_shape.d[i]; - if (GetShapeSize(scalar_shape) != weights_tensor.float_val_size()) { - if (weights_tensor.float_val_size() == 1 || - scalar_shape.d[0] == weights_tensor.float_val_size()) { - scalar_shape.nbDims = 1; - // no dimension provided. flatten it - scalar_shape.d[0] = weights_tensor.float_val_size(); - scalar_shape.type[0] = nvinfer1::DimensionType::kSPATIAL; - } else { - LOG(WARNING) << "Broadcast on weights only supports kCHANNEL and" - << " kUNIFORM, at: " << node_def.name(); - string err_str("Broadcast method is not supported for '"); - StrAppend(&err_str, node_def.name(), "' of type ", node_def.op()); - return tensorflow::errors::InvalidArgument(err_str); - } + if (GetShapeSize(scalar_shape) != weights_tensor.float_val_size() && + weights_tensor.float_val_size() != 1) { + LOG(ERROR) << "Broadcast on weights only supports kCHANNEL and" + << " kUNIFORM, at: " << node_def.name(); + string err_str("Broadcast method is not supported for '"); + StrAppend(&err_str, node_def.name(), "' of type ", node_def.op()); + return tensorflow::errors::InvalidArgument(err_str); } } else { VLOG(2) << "Dimensions: " << tensor.dims(); @@ -1468,39 +1622,42 @@ tensorflow::Status ConvertConst(Converter& ctx, scalar_shape.type[0] = nvinfer1::DimensionType::kSPATIAL; for (int i = 1; i < nvinfer1::Dims::MAX_DIMS; i++) { scalar_shape.d[i] = 0; - scalar_shape.type[i] = nvinfer1::DimensionType::kSPATIAL; } } + // TODO(aaroey): use GetShapeSize(). 
size_t len_data = tensorflow::DataTypeSize(dtype); for (int i = 0; i < scalar_shape.nbDims; i++) len_data *= scalar_shape.d[i]; ctx.weight_store()->store_.push_back(std::vector<uint8_t>(len_data)); void* dst = static_cast<void*>(&(ctx.weight_store()->store_.back()[0])); - std::vector<float> tensor_data( - weights_tensor.float_val().begin(), - weights_tensor.float_val() - .end()); // make a local copy first to flatten - memcpy(dst, tensor_data.data(), len_data); // store into weight store + if (weights_tensor.float_val_size() == 1) { + std::fill_n((float*)dst, GetShapeSize(scalar_shape), + *weights_tensor.float_val().begin()); + } else { + // TODO(aaroey): get rid of this copy as RepeatedField is always + // contiguous make a local copy first to flatten doesn't have to be + // contiguous + std::vector<float> tensor_data(weights_tensor.float_val().begin(), + weights_tensor.float_val().end()); + memcpy(dst, tensor_data.data(), len_data); // store into weight store + } + VLOG(2) << "create shape details: "; + for (int i = 0; i < scalar_shape.nbDims; i++) VLOG(2) << scalar_shape.d[i]; weights = TRT_ShapedWeights(dtype, dst, scalar_shape); } else if (!weights_tensor.int_val().empty()) { + // TODO(aaroey): this is very similar to the above code for float, merge + // them. VLOG(2) << "int!!!" << node_def.name(); nvinfer1::Dims scalar_shape; if (tensor.dims() > 0) { VLOG(2) << "dimensions: " << tensor.dims(); scalar_shape = GetTensorShape(tensor); - if (GetShapeSize(scalar_shape) != weights_tensor.int_val_size()) { - if (weights_tensor.int_val_size() == 1 || - scalar_shape.d[0] == weights_tensor.int_val_size()) { - scalar_shape.nbDims = 1; - // no dimension provided. flatten it - scalar_shape.d[0] = weights_tensor.int_val_size(); - scalar_shape.type[0] = nvinfer1::DimensionType::kSPATIAL; - } else { - LOG(WARNING) << "Broadcast on weights only supports kCHANNEL and" - << " kUNIFORM, at: " << node_def.name(); - string err_str("Broadcast method is not supported for '"); - StrAppend(&err_str, node_def.name(), "' of type ", node_def.op()); - return tensorflow::errors::InvalidArgument(err_str); - } + if (GetShapeSize(scalar_shape) != weights_tensor.int_val_size() && + weights_tensor.int_val_size() != 1) { + LOG(WARNING) << "Broadcast on weights only supports kCHANNEL and" + << " kUNIFORM, at: " << node_def.name(); + string err_str("Broadcast method is not supported for '"); + StrAppend(&err_str, node_def.name(), "' of type ", node_def.op()); + return tensorflow::errors::InvalidArgument(err_str); } } else { VLOG(2) << "dimensions: " << tensor.dims(); @@ -1513,23 +1670,30 @@ tensorflow::Status ConvertConst(Converter& ctx, scalar_shape.type[i] = nvinfer1::DimensionType::kSPATIAL; } } - // we should not have converted //if (ctx.isFP16()) { + // we should not have converted size_t len_data = tensorflow::DataTypeSize(dtype); for (int i = 0; i < scalar_shape.nbDims; i++) len_data *= scalar_shape.d[i]; size_t len_tensor = weights_tensor.int_val_size() * sizeof(int32); len_data = std::max(len_data, len_tensor); ctx.weight_store()->store_.push_back(std::vector<uint8_t>(len_data)); void* dst = static_cast<void*>(&(ctx.weight_store()->store_.back()[0])); - std::vector<int32> tensor_data( - weights_tensor.int_val().begin(), - weights_tensor.int_val().end()); // make a local copy first to flatten - // doesn't have to be contigous - memcpy(dst, tensor_data.data(), len_tensor); // store into weight store + if (weights_tensor.int_val_size() == 1) { + std::fill_n((int*)dst, GetShapeSize(scalar_shape), + 
*weights_tensor.int_val().begin()); + } else { + // TODO(aaroey): get rid of this copy as RepeatedField is always + // contiguous make a local copy first to flatten doesn't have to be + // contiguous + std::vector<int32> tensor_data(weights_tensor.int_val().begin(), + weights_tensor.int_val().end()); + memcpy(dst, tensor_data.data(), len_tensor); // store into weight store + } weights = TRT_ShapedWeights(dtype, dst, scalar_shape); } else if (!weights_tensor.tensor_content().empty()) { - // obsolete method. - // After optimization path, we do not see weights in this format. - // fp16 conversion technically should be needed here. + // obsolete method. + // After optimization path, we do not see weights in this format. + // TODO(aaroey): why? + // fp16 conversion technically should be needed here. VLOG(2) << "TENSOR!!!" << node_def.name(); const auto& content = weights_tensor.tensor_content(); @@ -1543,8 +1707,8 @@ tensorflow::Status ConvertConst(Converter& ctx, content, static_cast<char*>(const_cast<void*>(weights.GetValues()))); } } else { - return tensorflow::errors::Unimplemented( - "Not supported constant type, at " + node_def.name()); + return tensorflow::errors::Unimplemented("Not supported constant type, at ", + node_def.name()); } // Pass the output outputs->push_back(TRT_TensorOrWeights(weights)); @@ -1563,96 +1727,144 @@ tensorflow::Status ConvertBinary(Converter& ctx, const tensorflow::NodeDef& node_def, const std::vector<TRT_TensorOrWeights>& inputs, std::vector<TRT_TensorOrWeights>* outputs) { - if (inputs.size() != 2) + if (inputs.size() != 2) { return tensorflow::errors::FailedPrecondition( - "Binary ops require two tensor input, at " + node_def.name()); - - if (inputs.at(0).is_weights() && inputs.at(1).is_weights()) - return ConstantFoldBinary(ctx, node_def, inputs, outputs); - - if (inputs.at(0).is_tensor() && inputs.at(1).is_weights()) - return BinaryTensorOpWeight(ctx, node_def, inputs.at(0).tensor(), - inputs.at(1).weights(), outputs); + "Binary ops require two tensor input, at ", node_def.name()); + } - if (inputs.at(0).is_weights() && inputs.at(1).is_tensor()) - return BinaryTensorOpWeight(ctx, node_def, inputs.at(1).tensor(), - inputs.at(0).weights(), outputs); + // Constant folding should have been done by TensorFlow - if (inputs.at(0).is_tensor() && inputs.at(1).is_tensor()) - return BinaryTensorOpTensor(ctx, node_def, inputs.at(0).tensor(), - inputs.at(1).tensor(), outputs); + if (inputs.at(0).is_weights() && inputs.at(1).is_weights()) { + return tensorflow::errors::Unimplemented( + "Constant folding is falled back to TensorFlow, binary op received " + "both input as constant at: ", + node_def.name()); + } - return tensorflow::errors::Unknown("Binary op input error, at " + - node_def.name()); + // Try to convert into Scale layer first (for better performance) + // Since scale layer supports restricted broadcast policy and op types, we + // allow failure and try to handle it through Elementwise op + // (BinaryTensorOpTensor) + Status status = tensorflow::Status::OK(); + if (inputs.at(0).is_tensor() && inputs.at(1).is_weights()) { + status = BinaryTensorOpWeight(ctx, node_def, inputs.at(0).tensor(), + inputs.at(1).weights(), false, outputs); + } else if (inputs.at(0).is_weights() && inputs.at(1).is_tensor()) { + status = BinaryTensorOpWeight(ctx, node_def, inputs.at(1).tensor(), + inputs.at(0).weights(), true, outputs); +#if NV_TENSORRT_MAJOR == 3 + } else { +#else + } + if (inputs.at(0).is_tensor() && inputs.at(1).is_tensor() || !status.ok()) { +#endif + status = 
BinaryTensorOpTensor(ctx, node_def, inputs.at(0), inputs.at(1), + outputs); + } + return status; } tensorflow::Status ConvertUnary(Converter& ctx, const tensorflow::NodeDef& node_def, const std::vector<TRT_TensorOrWeights>& inputs, std::vector<TRT_TensorOrWeights>* outputs) { - if (inputs.size() != 1) + static const std::unordered_map<string, nvinfer1::UnaryOperation> ops{ + {"Neg", nvinfer1::UnaryOperation::kNEG}, + {"Exp", nvinfer1::UnaryOperation::kEXP}, + {"Log", nvinfer1::UnaryOperation::kLOG}, + {"Sqrt", nvinfer1::UnaryOperation::kSQRT}, + {"Abs", nvinfer1::UnaryOperation::kABS}, + {"Reciprocal", nvinfer1::UnaryOperation::kRECIP}, + }; + + if (inputs.size() != 1) { return tensorflow::errors::FailedPrecondition( - "Unary ops require single tensor input, at " + node_def.name()); + "Unary ops require single tensor input, at ", node_def.name()); + } - if (inputs.at(0).is_weights()) - return ConstantFoldUnary(ctx, node_def, inputs, outputs); - else if (inputs.at(0).is_tensor()) +#if NV_TENSORRT_MAJOR == 3 + if (inputs.at(0).is_weights()) { return tensorflow::errors::Unimplemented( - "Unary op for tensor not supported, at " + node_def.name()); + "Constant folding for unary op is not supported", node_def.name()); + } +#endif + + // TODO(jie): check type + const nvinfer1::ITensor* tensor; + TFTRT_RETURN_ERROR_IF_FALSE( + PrepareTensorForShape(ctx, inputs.at(0), inputs.at(0).shape(), &tensor), + node_def.name()); + + nvinfer1::IUnaryLayer* layer; + if (node_def.op() == "Rsqrt") { + layer = ctx.network()->addUnary(*const_cast<nvinfer1::ITensor*>(tensor), + nvinfer1::UnaryOperation::kSQRT); + TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); + tensor = layer->getOutput(0); + layer = ctx.network()->addUnary(*const_cast<nvinfer1::ITensor*>(tensor), + nvinfer1::UnaryOperation::kRECIP); + } else if (ops.count(node_def.op()) != 0) { + layer = ctx.network()->addUnary(*const_cast<nvinfer1::ITensor*>(tensor), + ops.at(node_def.op())); + } else { + return tensorflow::errors::InvalidArgument( + "Binary op: ", node_def.op(), " not supported, at ", node_def.name()); + } - return tensorflow::errors::Unknown("Binary op input error, at " + - node_def.name()); + TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); + nvinfer1::ITensor* output_tensor = layer->getOutput(0); + outputs->push_back(TRT_TensorOrWeights(output_tensor)); + return tensorflow::Status::OK(); } -tensorflow::Status ConvertReduce(Converter& ctx, - const tensorflow::NodeDef& node_def, - const std::vector<TRT_TensorOrWeights>& inputs, - std::vector<TRT_TensorOrWeights>* outputs) { +#if NV_TENSORRT_MAJOR == 3 +tensorflow::Status ConvertReducePool( + Converter& ctx, const tensorflow::NodeDef& node_def, + const std::vector<TRT_TensorOrWeights>& inputs, + std::vector<TRT_TensorOrWeights>* outputs) { if (inputs.size() != 2 || !inputs.at(0).is_tensor() || - !inputs.at(1).is_weights()) + !inputs.at(1).is_weights()) { return tensorflow::errors::InvalidArgument( - "Input expects tensor and weights, at" + node_def.name()); + "Input expects tensor and weights, at", node_def.name()); + } // Implement tensor binaryOp weight [channel wise] for now; const nvinfer1::ITensor* tensor = inputs.at(0).tensor(); - auto dims = tensor->getDimensions(); + const auto dims = tensor->getDimensions(); // Restore implicit batch dimension - int nb_dims = dims.nbDims + 1; + const int nb_dims = dims.nbDims + 1; TRT_ShapedWeights index_list = inputs.at(1).weights(); - TFAttrs attrs(node_def); - // TODO(jie): handle data type. 
-tensorflow::Status ConvertReduce(Converter& ctx,
-                                 const tensorflow::NodeDef& node_def,
-                                 const std::vector<TRT_TensorOrWeights>& inputs,
-                                 std::vector<TRT_TensorOrWeights>* outputs) {
+#if NV_TENSORRT_MAJOR == 3
+tensorflow::Status ConvertReducePool(
+    Converter& ctx, const tensorflow::NodeDef& node_def,
+    const std::vector<TRT_TensorOrWeights>& inputs,
+    std::vector<TRT_TensorOrWeights>* outputs) {
   if (inputs.size() != 2 || !inputs.at(0).is_tensor() ||
-      !inputs.at(1).is_weights())
+      !inputs.at(1).is_weights()) {
     return tensorflow::errors::InvalidArgument(
-        "Input expects tensor and weights, at" + node_def.name());
+        "Input expects tensor and weights, at ", node_def.name());
+  }

   // Implement tensor binaryOp weight [channel wise] for now;
   const nvinfer1::ITensor* tensor = inputs.at(0).tensor();
-  auto dims = tensor->getDimensions();
+  const auto dims = tensor->getDimensions();
   // Restore implicit batch dimension
-  int nb_dims = dims.nbDims + 1;
+  const int nb_dims = dims.nbDims + 1;

   TRT_ShapedWeights index_list = inputs.at(1).weights();
-
   TFAttrs attrs(node_def);
-  // TODO(jie): handle data type.
-  // Index type here is done through TF type, so I can leverage their
-  // EnumToDataType for my cast
   auto index_type = attrs.get<tensorflow::DataType>("Tidx");

   // Only expect to handle INT32 as attributes for now
-  if (index_type != tensorflow::DataType::DT_INT32)
+  if (index_type != tensorflow::DataType::DT_INT32) {
     return tensorflow::errors::Unimplemented("Tidx supports only DT_INT32");
-  auto index_list_data =
+  }
+  const auto index_list_data =
       static_cast<int*>(const_cast<void*>(index_list.GetValues()));

-  // Hack warning: have to fall back to pool layer since reduce is not in
-  // public TRT yet.
-  if (nb_dims != 4)
+  if (nb_dims != 4) {
     return tensorflow::errors::InvalidArgument(
-        "TRT only support reduce on 4 dimensional tensors, at" +
+        "TRT only supports reduce on 4 dimensional tensors, at ",
         node_def.name());
-  if (index_list.count() > 2)
+  }
+  if (index_list.count() > 2) {
     return tensorflow::errors::InvalidArgument(
-        "TRT cannot support reduce on more than 2 dimensions, at" +
+        "TRT cannot support reduce on more than 2 dimensions, at ",
         node_def.name());
+  }

   std::set<int> idx_set;
   // We cannot operate on Channel. permutation flag used to transpose tensor
   int permuted_index = -1;
   for (int i = 0; i < index_list.count(); i++) {
-    if (index_list_data[i] == 0)
-      return tensorflow::errors::InvalidArgument("TRT cannot reduce at 0, at" +
+    if (index_list_data[i] == 0) {
+      return tensorflow::errors::InvalidArgument("TRT cannot reduce at 0, at ",
                                                  node_def.name());
+    }
     if (index_list_data[i] == 1) permuted_index = 1;
-
     idx_set.emplace(index_list_data[i]);
   }
@@ -1673,6 +1885,7 @@ tensorflow::Status ConvertReduce(Converter& ctx,
     // Apply permutation before extracting dimension for pool_kernel
     tensor = ctx.TransposeTensor(const_cast<nvinfer1::ITensor*>(tensor),
                                  permutation_order);
+    TFTRT_RETURN_ERROR_IF_NULLPTR(tensor, node_def.name());
   }

   // Apply permutation before extracting dimension for pool_kernel
@@ -1685,34 +1898,104 @@ tensorflow::Status ConvertReduce(Converter& ctx,
     nvinfer1::IPoolingLayer* layer =
         ctx.network()->addPooling(*const_cast<nvinfer1::ITensor*>(tensor),
                                   nvinfer1::PoolingType::kAVERAGE, pool_kernel);
+    TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
     output_tensor = layer->getOutput(0);
   } else {
-    return tensorflow::errors::Unimplemented(
-        "Op not supported " + node_def.op() + " , at " + node_def.name());
+    return tensorflow::errors::Unimplemented("Op not supported ", node_def.op(),
+                                             ", at ", node_def.name());
   }
   if (permuted_index != -1) {
     // Apply permutation before extracting dimension for pool_kernel
     output_tensor = ctx.TransposeTensor(
         const_cast<nvinfer1::ITensor*>(output_tensor), permutation_order);
+    TFTRT_RETURN_ERROR_IF_NULLPTR(output_tensor, node_def.name());
   }
   outputs->push_back(TRT_TensorOrWeights(output_tensor));
   return tensorflow::Status::OK();
 }
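The TRT3-only path above survives from before IReduceLayer was public: a Mean over the spatial axes is emulated by one average-pooling window covering the whole plane. A toy check of that equivalence (plain arrays, no TensorRT):

#include <array>
#include <iostream>

int main() {
  constexpr int H = 2, W = 3;
  const std::array<float, H * W> plane = {1, 2, 3, 4, 5, 6};
  float sum = 0;
  for (float v : plane) sum += v;
  const float mean = sum / (H * W);      // reduce_mean over {H, W} ...
  const float avg_pool = sum / (H * W);  // ... equals one kAVERAGE window {H, W}
  std::cout << mean << " == " << avg_pool << "\n";  // 3.5 == 3.5
  return 0;
}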
+#elif NV_TENSORRT_MAJOR > 3
+tensorflow::Status ConvertReduce(Converter& ctx,
+                                 const tensorflow::NodeDef& node_def,
+                                 const std::vector<TRT_TensorOrWeights>& inputs,
+                                 std::vector<TRT_TensorOrWeights>* outputs) {
+  if (inputs.size() != 2 || !inputs.at(0).is_tensor() ||
+      !inputs.at(1).is_weights()) {
+    return tensorflow::errors::InvalidArgument(
+        "Input expects tensor and weights, at ", node_def.name());
+  }
+
+  const nvinfer1::ITensor* tensor = inputs.at(0).tensor();
+  TRT_ShapedWeights index_list = inputs.at(1).weights();
+
+  TFAttrs attrs(node_def);
+  auto index_type = attrs.get<tensorflow::DataType>("Tidx");
+
+  // Only expect to handle INT32 as attributes for now
+  if (index_type != tensorflow::DataType::DT_INT32) {
+    return tensorflow::errors::Unimplemented("Tidx supports only DT_INT32");
+  }
+
+  const auto keep_dims = attrs.get<bool>("keep_dims");
+  auto index_list_data =
+      static_cast<int*>(const_cast<void*>(index_list.GetValues()));
+
+  int axes = 0;
+  if (index_list.count() == 0) {
+    return tensorflow::errors::InvalidArgument(
+        "TRT cannot support reduce on all (batch) dimensions, at ",
+        node_def.name());
+  } else {
+    for (int i = 0; i < index_list.count(); i++) {
+      if (index_list_data[i] == 0) {
+        return tensorflow::errors::InvalidArgument(
+            "TRT cannot reduce at batch dimension, at ", node_def.name());
+      }
+      axes |= (1 << (index_list_data[i] - 1));
+    }
+  }
+
+  nvinfer1::ReduceOperation reduce_operation;
+  if (node_def.op() == "Sum") {
+    reduce_operation = nvinfer1::ReduceOperation::kSUM;
+  } else if (node_def.op() == "Prod") {
+    reduce_operation = nvinfer1::ReduceOperation::kPROD;
+  } else if (node_def.op() == "Max") {
+    reduce_operation = nvinfer1::ReduceOperation::kMAX;
+  } else if (node_def.op() == "Min") {
+    reduce_operation = nvinfer1::ReduceOperation::kMIN;
+  } else if (node_def.op() == "Mean") {
+    reduce_operation = nvinfer1::ReduceOperation::kAVG;
+  } else {
+    return tensorflow::errors::Unimplemented("Op not supported ", node_def.op(),
+                                             ", at ", node_def.name());
+  }
+
+  nvinfer1::ILayer* layer =
+      ctx.network()->addReduce(*const_cast<nvinfer1::ITensor*>(tensor),
+                               reduce_operation, axes, keep_dims);
+  TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
+
+  outputs->push_back(TRT_TensorOrWeights(layer->getOutput(0)));
+  return tensorflow::Status::OK();
+}
+#endif
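The axes encoding in the TRT4 path deserves a note: TensorFlow reduction axes count the batch dimension at index 0, while addReduce takes a bitmask over TensorRT dimensions, which exclude the implicit batch. Hence bit (k - 1) for TF axis k:

#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  // TF axes {1, 3} on a 4-D tensor (batch at axis 0) become TRT bits 0 and 2.
  const std::vector<int> tf_axes = {1, 3};
  uint32_t axes = 0;
  for (int a : tf_axes) axes |= 1u << (a - 1);
  std::cout << axes << "\n";  // 5, i.e. 0b101
  return 0;
}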
 tensorflow::Status ConvertPad(Converter& ctx,
                               const tensorflow::NodeDef& node_def,
                               const std::vector<TRT_TensorOrWeights>& inputs,
                               std::vector<TRT_TensorOrWeights>* outputs) {
+  // TODO(aaroey): make a routine for this check and reuse it.
   if (inputs.size() != 2 || !inputs.at(0).is_tensor() ||
-      !inputs.at(1).is_weights())
+      !inputs.at(1).is_weights()) {
     return tensorflow::errors::InvalidArgument(
-        "Input expects tensor and weights, at" + node_def.name());
+        "Input expects tensor and weights, at ", node_def.name());
+  }

   // Implement tensor binaryOp weight [channel wise] for now;
   const nvinfer1::ITensor* tensor = inputs.at(0).tensor();
-  auto dims = tensor->getDimensions();
+  const auto dims = tensor->getDimensions();
   // Restore implicit batch dimension
-  int nb_dims = dims.nbDims + 1;
+  const int nb_dims = dims.nbDims + 1;

   TRT_ShapedWeights pads = inputs.at(1).weights();
@@ -1722,21 +2005,24 @@ tensorflow::Status ConvertPad(Converter& ctx,
   auto padding_type = attrs.get<tensorflow::DataType>("Tpaddings");
   // TODO(jie): handle data type conversion for TRT?

-  if (pads.shape_.d[0] != nb_dims || pads.shape_.d[1] != 2)
+  if (pads.shape_.d[0] != nb_dims || pads.shape_.d[1] != 2) {
     return tensorflow::errors::InvalidArgument(
-        "Pad only supports explicit padding on 4 dimensional tensor, at " +
+        "Pad only supports explicit padding on 4 dimensional tensor, at ",
         node_def.name());
+  }

   // Only expect to handle INT32 as attributes for now
-  if (padding_type != tensorflow::DataType::DT_INT32)
+  if (padding_type != tensorflow::DataType::DT_INT32) {
     return tensorflow::errors::Unimplemented(
         "Tpaddings supports only DT_INT32");
+  }

   auto pad_data = static_cast<int*>(const_cast<void*>(pads.GetValues()));

   std::vector<int32_t> pad_index;
   for (int i = 0; i < nb_dims; i++) {
-    if (pad_data[2 * i] != 0 || pad_data[2 * i + 1] != 0)
+    if (pad_data[2 * i] != 0 || pad_data[2 * i + 1] != 0) {
       pad_index.push_back(i);
+    }
   }

   // No padding at all, we should exit
@@ -1746,20 +2032,23 @@ tensorflow::Status ConvertPad(Converter& ctx,
   }

   // Only supports padding on at most 2 axes (GIE-2579)
-  if (pad_index.size() > 2)
+  if (pad_index.size() > 2) {
     return tensorflow::errors::InvalidArgument(
         "Padding layer does not support padding on > 2");
+  }

   // Padding on batch dimension is not supported
-  if (pad_index[0] == 0)
+  if (pad_index[0] == 0) {
     return tensorflow::errors::InvalidArgument(
         "Padding layer does not support padding on batch dimension");
+  }

   // Not doing the legit thing here: ignoring padding on dims 1 and 3.
   // TODO(jie): implement pad as uff parser
-  if (pad_index.size() == 2 && pad_index[0] == 0 && pad_index[1] == 3)
+  if (pad_index.size() == 2 && pad_index[0] == 0 && pad_index[1] == 3) {
     return tensorflow::errors::Unimplemented(
         "Padding layer does not support padding on dimension 1 and 3 yet");
+  }

   bool legit_pad = true;
   nvinfer1::DimsHW pre_padding(0, 0);
@@ -1770,6 +2059,7 @@ tensorflow::Status ConvertPad(Converter& ctx,
     legit_pad = false;
     tensor = ctx.TransposeTensor(const_cast<nvinfer1::ITensor*>(tensor),
                                  {0, 3, 2, 1});
+    TFTRT_RETURN_ERROR_IF_NULLPTR(tensor, node_def.name());
     permuted_pad_index[0] = 3;
   }
@@ -1786,11 +2076,14 @@ tensorflow::Status ConvertPad(Converter& ctx,

   nvinfer1::IPaddingLayer* layer = ctx.network()->addPadding(
       *const_cast<nvinfer1::ITensor*>(tensor), pre_padding, post_padding);
+  TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
   nvinfer1::ITensor* output_tensor = layer->getOutput(0);

-  if (!legit_pad)
+  if (!legit_pad) {
     output_tensor = ctx.TransposeTensor(
         const_cast<nvinfer1::ITensor*>(output_tensor), {0, 3, 2, 1});
+    TFTRT_RETURN_ERROR_IF_NULLPTR(output_tensor, node_def.name());
+  }

   outputs->push_back(TRT_TensorOrWeights(output_tensor));
   return tensorflow::Status::OK();
 }
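A worked example of the happy path through ConvertPad, assuming an NCHW layout where only H and W carry padding (values are illustrative):

#include <array>
#include <iostream>
#include <vector>

int main() {
  // TF "paddings" input of shape [4, 2]: {before, after} per dimension.
  const std::array<std::array<int, 2>, 4> paddings = {
      {{0, 0}, {0, 0}, {1, 1}, {2, 2}}};
  std::vector<int> pad_index;
  for (int i = 0; i < 4; ++i) {
    if (paddings[i][0] != 0 || paddings[i][1] != 0) pad_index.push_back(i);
  }
  // pad_index == {2, 3}: not the batch axis and at most two axes, so this
  // maps directly to pre_padding = (1, 2) and post_padding = (1, 2) with no
  // transpose ("legit_pad" stays true).
  std::cout << pad_index.size() << "\n";  // 2
  return 0;
}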
@@ -1803,9 +2096,10 @@ tensorflow::Status ConvertConcat(Converter& ctx,
   // not including the last input (axis) here
   int input_size = static_cast<int>(inputs.size()) - 1;

-  if (!inputs.at(0).is_tensor())
+  if (!inputs.at(0).is_tensor()) {
     return tensorflow::errors::InvalidArgument(
-        "Concat in TRT support only Tensor input, at " + node_def.name());
+        "Concat in TRT supports only Tensor input, at ", node_def.name());
+  }

   // We are retrieving the axis
   TRT_ShapedWeights axis = inputs.at(input_size).weights();
@@ -1816,8 +2110,8 @@ tensorflow::Status ConvertConcat(Converter& ctx,
   // TODO(jie): handle data type
   // Only expect to handle INT32 as index attributes for now
   if (index_type != tensorflow::DataType::DT_INT32)
-    return tensorflow::errors::Unimplemented(
-        "Tidx supports only DT_INT32, at " + node_def.name());
+    return tensorflow::errors::Unimplemented("Tidx supports only DT_INT32, at ",
+                                             node_def.name());

   int index = *(static_cast<int*>(const_cast<void*>(axis.GetValues())));
@@ -1825,23 +2119,29 @@ tensorflow::Status ConvertConcat(Converter& ctx,
   auto dim = inputs.at(0).tensor()->getDimensions();
   // dimension check
-  if (index > dim.nbDims + 1)
+  if (index > dim.nbDims + 1) {
     return tensorflow::errors::InvalidArgument(
-        "Concatenate on axis out of dimension range, at " + node_def.name());
-
-  if (index == 0)
+        "Concatenate on axis out of dimension range, at ", node_def.name());
+  }
+  if (index == 0) {
     return tensorflow::errors::InvalidArgument(
-        "Concatenate on batch dimension not supported, at " + node_def.name());
+        "Concatenate on batch dimension not supported, at ", node_def.name());
+  }
+  if (index < 0) {
+    index = dim.nbDims + index + 1;
+  }

+#if NV_TENSORRT_MAJOR == 3
   // in case we need permutation;
   std::vector<int> permutation_order(dim.nbDims + 1);

   for (int i = 0; i < dim.nbDims + 1; i++) permutation_order[i] = i;

   if (index != 1) {
-    permutation_order[1] = index - 1;
-    permutation_order[index - 1] = 1;
+    permutation_order[1] = index;
+    permutation_order[index] = 1;
   }
+#endif

   std::vector<nvinfer1::ITensor const*> inputs_vec;
   // Shape check (all input tensors should have the same shape)
@@ -1849,24 +2149,28 @@
   for (int i = 0; i < input_size; i++) {
     auto tensor_i = inputs.at(i).tensor();
     auto dim_i = tensor_i->getDimensions();
-    if (dim_i.nbDims != dim.nbDims)
+    if (dim_i.nbDims != dim.nbDims) {
       return tensorflow::errors::InvalidArgument(
-          "Concatenate receives inputs with inconsistent dimensions, at " +
+          "Concatenate receives inputs with inconsistent dimensions, at ",
           node_def.name());
-
+    }
     for (int j = 0; j < dim.nbDims; j++) {
       // check dimension consistency on non-concatenate axis
-      if (j != index - 1 && dim_i.d[j] != dim.d[j])
+      if (j != index - 1 && dim_i.d[j] != dim.d[j]) {
         return tensorflow::errors::InvalidArgument(
-            "Concatenate receives inputs with inconsistent shape, at" +
+            "Concatenate receives inputs with inconsistent shape, at ",
             node_def.name());
+      }
     }

-    // TRT does concatenation only on channel!
-    if (index != 1)
+#if NV_TENSORRT_MAJOR == 3
+    // TRT3 does concatenation only on channel!
+    if (index != 1) {
       tensor_i = ctx.TransposeTensor(const_cast<nvinfer1::ITensor*>(tensor_i),
                                      permutation_order);
-
+      TFTRT_RETURN_ERROR_IF_NULLPTR(tensor_i, node_def.name());
+    }
+#endif
     inputs_vec.push_back(tensor_i);
   }

@@ -1874,11 +2178,18 @@ tensorflow::Status ConvertConcat(Converter& ctx,
   nvinfer1::IConcatenationLayer* layer = ctx.network()->addConcatenation(
       const_cast<nvinfer1::ITensor* const*>(inputs_vec.data()),
       inputs_vec.size());
+  TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
+#if NV_TENSORRT_MAJOR > 3
+  layer->setAxis(index - 1);
+#endif
   nvinfer1::ITensor* output_tensor = layer->getOutput(0);

+#if NV_TENSORRT_MAJOR == 3
   if (index != 1) {
     output_tensor = ctx.TransposeTensor(output_tensor, permutation_order);
+    TFTRT_RETURN_ERROR_IF_NULLPTR(output_tensor, node_def.name());
   }
+#endif
   outputs->push_back(TRT_TensorOrWeights(output_tensor));
   return tensorflow::Status::OK();
 }
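One subtlety in the new ConvertConcat: a negative TF concat axis is normalized against the rank that still includes the implicit batch dimension, so index = nbDims + index + 1 rather than the usual nbDims + index:

#include <iostream>

int main() {
  const int nb_dims = 3;  // TRT rank (batch excluded); the TF tensor is 4-D
  int index = -1;         // TF axis: last dimension
  if (index < 0) index = nb_dims + index + 1;
  std::cout << index << "\n";  // 3; on TRT>3 this feeds setAxis(index - 1)
  return 0;
}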
@@ -1997,112 +2308,249 @@ tensorflow::Status ConvertFusedBatchNorm(
       combined_offset_weights.GetWeightsForTRT(),
       combined_scale_weights.GetWeightsForTRT(),
       dummy_power_weights.GetWeightsForTRT());
+  TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
   nvinfer1::ITensor* output_tensor = layer->getOutput(0);
   outputs->push_back(TRT_TensorOrWeights(output_tensor));
   return tensorflow::Status::OK();
 }

+#if NV_TENSORRT_MAJOR > 3
+tensorflow::Status ConvertMatMulHelper(
+    Converter& ctx, TRT_TensorOrWeights tensor_input,
+    TRT_ShapedWeights weights_raw, bool transpose_weight, string node_name,
+    std::vector<TRT_TensorOrWeights>* outputs) {
+  nvinfer1::ITensor* output_tensor;
+  if (!tensor_input.is_tensor()) {
+    return tensorflow::errors::InvalidArgument("Input 0 expects tensor");
+  }
+  const nvinfer1::ITensor* tensor = tensor_input.tensor();
+
+  TRT_ShapedWeights weights(weights_raw.type_);
+  if (transpose_weight) {
+    weights = weights_raw;
+  } else {
+    TRT_ShapedWeights weights_ck = weights_raw;
+    weights = ctx.get_temp_weights_like(weights_ck);
+    ReorderCKtoKC(weights_raw, &weights);
+  }
+  TRT_ShapedWeights biases(weights.type_);
+
+  int noutput = weights.shape_.d[0];
+
+  auto input_dim = tensor->getDimensions();
+  while (input_dim.nbDims != 3) {
+    input_dim.d[input_dim.nbDims++] = 1;
+  }
+  TFTRT_RETURN_ERROR_IF_FALSE(
+      PrepareTensorForShape(ctx, tensor_input, input_dim, &tensor), node_name);
+
+  nvinfer1::IFullyConnectedLayer* layer = ctx.network()->addFullyConnected(
+      *const_cast<nvinfer1::ITensor*>(tensor), noutput, weights, biases);
+  TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_name);
+  output_tensor = layer->getOutput(0);
+
+  const nvinfer1::ITensor* temp_tensor;
+  auto output_dim = output_tensor->getDimensions();
+  output_dim.nbDims = 1;
+  TFTRT_RETURN_ERROR_IF_FALSE(
+      PrepareTensorForShape(ctx, TRT_TensorOrWeights(output_tensor),
+                            output_dim, &temp_tensor),
+      node_name);
+  output_tensor = const_cast<nvinfer1::ITensor*>(temp_tensor);
+  outputs->push_back(TRT_TensorOrWeights(output_tensor));
+  return tensorflow::Status::OK();
+}
+
+// inputs are both two dimensional (tensorflow::ops::MatMul)
 tensorflow::Status ConvertMatMul(Converter& ctx,
                                  const tensorflow::NodeDef& node_def,
                                  const std::vector<TRT_TensorOrWeights>& inputs,
                                  std::vector<TRT_TensorOrWeights>* outputs) {
+  if (!inputs.at(0).is_tensor()) {
+    return tensorflow::errors::InvalidArgument("Input 0 expects tensor, at " +
+                                               node_def.name());
+  }
+
   const nvinfer1::ITensor* tensor = inputs.at(0).tensor();

-  // TODO(jie): transpose!
   TFAttrs attrs(node_def);

-  TRT_ShapedWeights weights_ck = inputs.at(1).weights();
-  TRT_ShapedWeights weights = ctx.get_temp_weights_like(weights_ck);
-  ReorderCKtoKC(weights_ck, &weights);
-  TRT_ShapedWeights biases(weights.type_);
+  // TODO(jie): INT32 should be converted?
+  tensorflow::DataType tf_dtype = attrs.get<tensorflow::DataType>("T");
+  if (tf_dtype != tensorflow::DataType::DT_FLOAT &&
+      tf_dtype != tensorflow::DataType::DT_HALF) {
+    return tensorflow::errors::Unimplemented(
+        "Data type is not supported, for node " + node_def.name() + " got " +
+        tensorflow::DataTypeString(tf_dtype));
+  }

-  int noutput = weights.shape_.d[0];
+  bool transpose_a = attrs.get<bool>("transpose_a");
+  bool transpose_b = attrs.get<bool>("transpose_b");

-  nvinfer1::IFullyConnectedLayer* layer = ctx.network()->addFullyConnected(
-      *const_cast<nvinfer1::ITensor*>(tensor), noutput, weights, biases);
+  nvinfer1::ITensor* output_tensor;

-  nvinfer1::ITensor* output_tensor = layer->getOutput(0);
-  outputs->push_back(TRT_TensorOrWeights(output_tensor));
-  return tensorflow::Status::OK();
+  // FullyConnected:
+  if (transpose_a) {
+    return tensorflow::errors::Internal(
+        "Transpose_a is not supported for TensorRT FullyConnected (op: " +
+        node_def.op() + "), at: " + node_def.name());
+  }
+  if (inputs.at(1).is_tensor()) {
+    return tensorflow::errors::Internal(
+        "Operand 1 must be constant for TensorRT FullyConnected (op: " +
+        node_def.op() + "), at: " + node_def.name());
+  }
+  return ConvertMatMulHelper(ctx, inputs.at(0), inputs.at(1).weights(),
+                             transpose_b, node_def.name(), outputs);
 }
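ConvertMatMulHelper leans on IFullyConnectedLayer, which expects a CHW input, so the (batch-less) input shape is padded with trailing 1s up to rank 3 before the layer is added. In isolation (assuming rank <= 3 on entry, which the helper also assumes):

#include <iostream>
#include <vector>

int main() {
  std::vector<int> dims = {256};  // e.g. a rank-1 activation of 256 channels
  while (dims.size() < 3) dims.push_back(1);  // the helper tests nbDims != 3
  for (int d : dims) std::cout << d << " ";   // 256 1 1
  std::cout << "\n";
  return 0;
}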
-tensorflow::Status ConvertReshape(
+tensorflow::Status ConvertBatchMatMul(
     Converter& ctx, const tensorflow::NodeDef& node_def,
     const std::vector<TRT_TensorOrWeights>& inputs,
     std::vector<TRT_TensorOrWeights>* outputs) {
-  if (inputs.size() != 2 || !inputs.at(0).is_tensor() ||
-      !inputs.at(1).is_weights())
-    return tensorflow::errors::InvalidArgument(
-        "Input expects tensor and weights, at" + node_def.name());
+  TFAttrs attrs(node_def);

-  // implement tensor binaryOp weight [channel wise] for now;
-  const nvinfer1::ITensor* tensor = inputs.at(0).tensor();
-  auto dims = tensor->getDimensions();
-  // restore implicit batch dimension
+  // TODO(jie): INT32 should be converted?
+  tensorflow::DataType tf_dtype = attrs.get<tensorflow::DataType>("T");
+  if (tf_dtype != tensorflow::DataType::DT_FLOAT &&
+      tf_dtype != tensorflow::DataType::DT_HALF) {
+    return tensorflow::errors::Unimplemented(
+        "Data type is not supported, for node " + node_def.name() + " got " +
+        tensorflow::DataTypeString(tf_dtype));
+  }

-  TRT_ShapedWeights shape = inputs.at(1).weights();
+  bool transpose_a = attrs.get<bool>("adj_x");
+  bool transpose_b = attrs.get<bool>("adj_y");

-  TFAttrs attrs(node_def);
+  auto dims = inputs.at(0).shape();
+  if (dims.nbDims == 1) {  // NC * CK is only supported through fully connected
+    if (transpose_a == false && inputs.at(0).is_tensor() &&
+        inputs.at(1).is_weights()) {
+      return ConvertMatMulHelper(ctx, inputs.at(0), inputs.at(1).weights(),
+                                 transpose_b, node_def.name(), outputs);
+    } else {
+      return tensorflow::errors::InvalidArgument(
+          "Invalid configuration for MatMul, at: " + node_def.name());
+    }
+  }

-  auto padding_type = attrs.get<tensorflow::DataType>("Tshape");
+  const nvinfer1::ITensor* tensor_l;
+  const nvinfer1::ITensor* tensor_r;
+  auto dims_l = inputs.at(0).shape();
+  auto dims_r = inputs.at(1).shape();
+  if (inputs.at(0).is_weights()) {
+    if (inputs.at(0).shape().d[0] != 1) {
+      return tensorflow::errors::InvalidArgument(
+          "Input 0 as weight assumes broadcast across batch for MatMul, at: " +
+          node_def.name());
+    } else {
+      for (int i = 0; i < dims_l.nbDims - 1; i++) {
+        dims_l.d[i] = dims_l.d[i + 1];
+      }
+      dims_l.nbDims--;
+    }
+  }
+  if (inputs.at(1).is_weights()) {
+    if (inputs.at(1).shape().d[0] != 1) {
+      return tensorflow::errors::InvalidArgument(
+          "Input 1 as weight assumes broadcast across batch for MatMul, at: " +
+          node_def.name());
+    } else {
+      for (int i = 0; i < dims_r.nbDims - 1; i++) {
+        dims_r.d[i] = dims_r.d[i + 1];
+      }
+      dims_r.nbDims--;
+    }
+  }

-  if (shape.shape_.nbDims != 1)
-    return tensorflow::errors::InvalidArgument(
-        "reshape new shape is not 1 dimensional, at " + node_def.name());
+  TFTRT_RETURN_ERROR_IF_FALSE(
+      PrepareTensorForShape(ctx, inputs.at(0), dims_l, &tensor_l),
+      node_def.name());
+  TFTRT_RETURN_ERROR_IF_FALSE(
+      PrepareTensorForShape(ctx, inputs.at(1), dims_r, &tensor_r),
+      node_def.name());

-  // Only expect to handle INT32 as attributes for now
-  if (padding_type != tensorflow::DataType::DT_INT32)
-    return tensorflow::errors::Unimplemented(
-        "reshape new shape supports only DT_INT32, at " + node_def.name());
+  nvinfer1::IMatrixMultiplyLayer* layer = ctx.network()->addMatrixMultiply(
+      *const_cast<nvinfer1::ITensor*>(tensor_l), transpose_a,
+      *const_cast<nvinfer1::ITensor*>(tensor_r), transpose_b);
+  TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
+  nvinfer1::ITensor* output_tensor = layer->getOutput(0);
+  outputs->push_back(TRT_TensorOrWeights(output_tensor));
+  return tensorflow::Status::OK();
+}
+#endif
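For BatchMatMul, a constant operand can only broadcast across the batch, so its leading dimension must be 1 and is squeezed away by the shift-left loop above. The same dimension surgery with a plain vector:

#include <iostream>
#include <vector>

int main() {
  std::vector<int> dims_r = {1, 5, 7};  // weight shape; leading 1 = batch
  if (dims_r[0] != 1) return 1;         // anything else is rejected above
  dims_r.erase(dims_r.begin());         // equivalent of the d[i] = d[i+1] loop
  for (int d : dims_r) std::cout << d << " ";  // 5 7
  std::cout << "\n";
  return 0;
}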
-  auto shape_data = static_cast<int*>(const_cast<void*>(shape.GetValues()));
+#if NV_TENSORRT_MAJOR > 3
+tensorflow::Status ConvertSoftmax(
+    Converter& ctx, const tensorflow::NodeDef& node_def,
+    const std::vector<TRT_TensorOrWeights>& inputs,
+    std::vector<TRT_TensorOrWeights>* outputs) {
+  const nvinfer1::ITensor* tensor = inputs.at(0).tensor();

-  if (shape_data[0] != -1)
+  int nbDims = tensor->getDimensions().nbDims;
+  if (nbDims == 0) {
     return tensorflow::errors::InvalidArgument(
-        "reshape new shape first dimension is not -1, at " + node_def.name());
+        "TensorRT Softmax cannot apply on batch dimension, at " +
+        node_def.name());
+  }
+  nvinfer1::ISoftMaxLayer* layer =
+      ctx.network()->addSoftMax(*const_cast<nvinfer1::ITensor*>(tensor));
+  TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
+  // TensorFlow SoftMax applies softmax on the last dimension.
+  layer->setAxes(1 << (nbDims - 1));

-  auto shape_num_dims = shape.shape_.d[0];
-  VLOG(2) << "shape dimensions: " << shape_num_dims;
-  int volume_w = 1;
-  for (int i = 1; i < shape.shape_.d[0]; i++) volume_w *= shape_data[i];
+  nvinfer1::ITensor* output_tensor = layer->getOutput(0);
+  outputs->push_back(TRT_TensorOrWeights(output_tensor));
+  return tensorflow::Status::OK();
+}
+#endif

-  int volume_t = 1;
-  for (int i = 0; i < dims.nbDims; i++) volume_t *= dims.d[i];
+#if NV_TENSORRT_MAJOR > 3
+tensorflow::Status ConvertTopK(Converter& ctx,
+                               const tensorflow::NodeDef& node_def,
+                               const std::vector<TRT_TensorOrWeights>& inputs,
+                               std::vector<TRT_TensorOrWeights>* outputs) {
+  const nvinfer1::ITensor* tensor = inputs.at(0).tensor();

-  VLOG(2) << "volume: " << volume_t << " volume weights: " << volume_w;
-  if (volume_w != volume_t)
+  int nbDims = tensor->getDimensions().nbDims;
+  if (nbDims == 0) {
     return tensorflow::errors::InvalidArgument(
-        "volume does not agree between tensor and new shape, at " +
-        node_def.name());
+        "TensorRT TopK cannot apply on batch dimension, at " +
+        node_def.name());
+  }

-  nvinfer1::IShuffleLayer* layer =
-      ctx.network()->addShuffle(*const_cast<nvinfer1::ITensor*>(tensor));
+  TRT_ShapedWeights k_w = inputs.at(1).weights();
+  int k = *(static_cast<int*>(const_cast<void*>(k_w.GetValues())));

-  nvinfer1::Dims reshape_dims;
-  VLOG(2) << "new dimension: " << shape_num_dims - 1;
-  reshape_dims.nbDims = shape_num_dims - 1;
-  for (int32_t i = 0; i < reshape_dims.nbDims; ++i) {
-    reshape_dims.d[i] = shape_data[i + 1];
+  nvinfer1::TopKOperation op;
+  uint32_t reducedAxes = 0;
+  if (node_def.op() == "TopKV2") {
+    op = nvinfer1::TopKOperation::kMAX;
+    reducedAxes |= 1 << (nbDims - 1);
+  } else {
+    return tensorflow::errors::Unimplemented(
+        "Operation: " + node_def.op() +
+        " not implemented, at: " + node_def.name());
   }
-  layer->setReshapeDimensions(reshape_dims);
-  VLOG(2) << "new dimension: " << shape_num_dims - 1;

-  nvinfer1::ITensor* output_tensor = layer->getOutput(0);
-  auto dims_output = output_tensor->getDimensions();
-  VLOG(2) << "output tensor dimension:" << dims_output.nbDims;
-  outputs->push_back(TRT_TensorOrWeights(output_tensor));
+  nvinfer1::ITopKLayer* layer = ctx.network()->addTopK(
+      *const_cast<nvinfer1::ITensor*>(tensor), op, k, reducedAxes);
+  TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
+
+  nvinfer1::ITensor* output_value_tensor = layer->getOutput(0);
+  nvinfer1::ITensor* output_indices_tensor = layer->getOutput(1);
+  outputs->push_back(TRT_TensorOrWeights(output_value_tensor));
+  outputs->push_back(TRT_TensorOrWeights(output_indices_tensor));
   return tensorflow::Status::OK();
 }
+#endif

 void Converter::register_op_converters() {
   // vgg_16 slim implementation
-  op_registry_["Placeholder"] = ConvertPlaceholder;
   op_registry_["Conv2D"] = ConvertConv2D;
   op_registry_["DepthwiseConv2dNative"] = ConvertConv2DDepthwise;
   op_registry_["Relu"] = ConvertActivation;
   op_registry_["MaxPool"] = ConvertPool;
   op_registry_["AvgPool"] = ConvertPool;
-  // This could be really handled as ConvertBinary
   op_registry_["BiasAdd"] = ConvertScale;
   op_registry_["Const"] = ConvertConst;
   // TODO(ben,jie): this is a temp hack.
@@ -2113,18 +2561,38 @@ void Converter::register_op_converters() {
   op_registry_["Add"] = ConvertBinary;
   op_registry_["Mul"] = ConvertBinary;
   op_registry_["Sub"] = ConvertBinary;
-  op_registry_["Rsqrt"] = ConvertUnary;
-  op_registry_["Mean"] = ConvertReduce;
   op_registry_["Pad"] = ConvertPad;
-  // TODO(ben,jie): Add more ops
   op_registry_["ConcatV2"] = ConvertConcat;
-  op_registry_["MatMul"] = ConvertMatMul;
-  op_registry_["Reshape"] = ConvertReshape;
   op_registry_["FusedBatchNorm"] = ConvertFusedBatchNorm;
   op_registry_["FusedBatchNormV2"] = ConvertFusedBatchNorm;
-  plugin_converter_ = ConvertPlugin;
+  op_registry_["Div"] = ConvertBinary;
+  op_registry_["RealDiv"] = ConvertBinary;
+
+  op_registry_["Rsqrt"] = ConvertUnary;
+  op_registry_["Reciprocal"] = ConvertUnary;
+  op_registry_["Exp"] = ConvertUnary;
+  op_registry_["Log"] = ConvertUnary;
+  op_registry_["Sqrt"] = ConvertUnary;
+  op_registry_["Abs"] = ConvertUnary;
+  op_registry_["Neg"] = ConvertUnary;
+#if NV_TENSORRT_MAJOR == 3
+  op_registry_["Mean"] = ConvertReducePool;
+#endif
+#if NV_TENSORRT_MAJOR > 3
+  op_registry_["Sum"] = ConvertReduce;
+  op_registry_["Prod"] = ConvertReduce;
+  op_registry_["Max"] = ConvertReduce;
+  op_registry_["Min"] = ConvertReduce;
+  op_registry_["Mean"] = ConvertReduce;
+  op_registry_["Maximum"] = ConvertBinary;
+  op_registry_["Minimum"] = ConvertBinary;
+  op_registry_["Softmax"] = ConvertSoftmax;
+  op_registry_["MatMul"] = ConvertMatMul;
+  op_registry_["BatchMatMul"] = ConvertBatchMatMul;
+  op_registry_["TopKV2"] = ConvertTopK;
+#endif
 }

 }  // namespace
diff --git a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc
index 8a17eb02f1..04d072f5d9 100644
--- a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc
+++ b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc
@@ -316,6 +316,11 @@ void TRTEngineOp::ComputeAsync(tensorflow::OpKernelContext* ctx,
       ctx->SetStatus(tensorflow::errors::InvalidArgument(
           "INT8 inputs are not supported!"));
       return;
+#if NV_TENSORRT_MAJOR > 3
+    case nvinfer1::DataType::kINT32:
+      buffers[binding_index] = (void*)(input_tensor.flat<int32>().data());
+      break;
+#endif
     default:
       LOG(ERROR) << "Unknown TRT data type: " << int(dtype);
       ctx->SetStatus(tensorflow::errors::InvalidArgument(
@@ -368,6 +373,12 @@ void TRTEngineOp::ComputeAsync(tensorflow::OpKernelContext* ctx,
       ctx->SetStatus(tensorflow::errors::InvalidArgument(
           "INT8 outputs are not supported!"));
       return;
+#if NV_TENSORRT_MAJOR > 3
+    case nvinfer1::DataType::kINT32:
+      buffers[binding_index] =
+          reinterpret_cast<void*>(output_tensor->flat<int32>().data());
+      break;
+#endif
     default:
       LOG(ERROR) << "Unknown TRT data type: " << static_cast<int>(dtype);
       ctx->SetStatus(tensorflow::errors::InvalidArgument(
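The kernel changes above extend the dtype switch that fills TensorRT's flat binding-buffer array; INT32 joins FP32/FP16 as a raw pointer per binding index. A compressed sketch of that dispatch (hypothetical stand-in types; the real code branches on nvinfer1::DataType and TensorFlow tensors):

#include <cstdint>
#include <vector>

enum class TrtType { kFLOAT, kINT32 };

// Each binding slot receives the raw data pointer of the tensor bound at
// that index, whatever its element type.
void BindBuffer(std::vector<void*>& buffers, int binding_index, TrtType dtype,
                float* f32_data, int32_t* i32_data) {
  switch (dtype) {
    case TrtType::kFLOAT:
      buffers[binding_index] = static_cast<void*>(f32_data);
      break;
    case TrtType::kINT32:  // newly supported when NV_TENSORRT_MAJOR > 3
      buffers[binding_index] = static_cast<void*>(i32_data);
      break;
  }
}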
diff --git a/tensorflow/contrib/tensorrt/ops/trt_engine_op.cc b/tensorflow/contrib/tensorrt/ops/trt_engine_op.cc
index 383635f428..e0c7b62723 100644
--- a/tensorflow/contrib/tensorrt/ops/trt_engine_op.cc
+++ b/tensorflow/contrib/tensorrt/ops/trt_engine_op.cc
@@ -42,8 +42,14 @@ REGISTER_OP("TRTEngineOp")
     .Attr("precision_mode: {'FP32', 'FP16', 'INT8', 'INT8CALIB'}")
     .Attr("calibration_data: string = ''")
     .Input("in_tensor: InT")
-    .Output("out_tensor: OutT")
-    .SetShapeFn(shape_inference::TRTEngineOpShapeInference);
+    .Output("out_tensor: OutT");
+// TODO(jie): TF requires a concrete output shape for concrete input shapes.
+// This is tricky for the batch dimension, since we cannot ensure which input
+// would carry the correct batch dimension (at the current stage of the
+// implementation we do require all input tensors to carry the same batch
+// size, but this could change in the future). Hence we disable the shape
+// inference function as a workaround.
+// .SetShapeFn(shape_inference::TRTEngineOpShapeInference);

 }  // namespace tensorflow
diff --git a/tensorflow/contrib/tensorrt/shape_fn/trt_shfn.cc b/tensorflow/contrib/tensorrt/shape_fn/trt_shfn.cc
index 227ac120dd..f30dba59ad 100644
--- a/tensorflow/contrib/tensorrt/shape_fn/trt_shfn.cc
+++ b/tensorflow/contrib/tensorrt/shape_fn/trt_shfn.cc
@@ -28,36 +28,50 @@ limitations under the License.
 namespace tensorflow {
 namespace shape_inference {

-tensorflow::Status TRTEngineOpShapeInference(InferenceContext* context) {
-  std::vector<tensorflow::TensorShape> shapes;
-  for (int i = 0; i < context->num_outputs(); ++i) {
-    context->set_output(i, context->UnknownShape());
+tensorflow::Status TRTEngineOpShapeInference(InferenceContext* c) {
+  for (int i = 0; i < c->num_outputs(); ++i) {
+    c->set_output(i, c->UnknownShape());
   }
-  auto status = context->GetAttr("input_shapes", &shapes);
-  // it is ok not to have shapes
-  if (!status.ok()) return Status::OK();
-  if ((int)shapes.size() != context->num_inputs()) return Status::OK();
-  bool different_input = false;
-  for (int i = 0; i < context->num_inputs(); ++i) {
-    if (shapes.at(i) != context->input_tensor(i)->shape())
-      different_input = true;
+
+  // Check the sanity of the input shapes.
+  std::vector<tensorflow::TensorShape> input_shapes;
+  TF_RETURN_IF_ERROR(c->GetAttr("input_shapes", &input_shapes));
+  if (input_shapes.size() != c->num_inputs()) {
+    return tensorflow::errors::InvalidArgument(
+        "The actual number of inputs doesn't match the number of input "
+        "shapes set in the attr: ",
+        c->num_inputs(), " vs ", input_shapes.size());
+  }
+  bool input_match = true;
+  for (int i = 0; i < c->num_inputs(); ++i) {
+    ShapeHandle handle;
+    TF_RETURN_IF_ERROR(
+        c->MakeShapeFromTensorShape(input_shapes.at(i), &handle));
+    ShapeHandle merged;
+    if (!c->Merge(c->input(i), handle, &merged).ok()) {
+      // The input shape doesn't match what was set in the attr; that's fine.
+      input_match = false;
+    }
   }
-  if (different_input) return Status::OK();
-  shapes.resize(0);
-  status = context->GetAttr("output_shapes", &shapes);
-  if (!status.ok()) return Status::OK();
-  if ((int)shapes.size() != context->num_outputs()) return Status::OK();
-  std::vector<ShapeHandle> shape_handles(shapes.size());
-  for (size_t i = 0; i < shapes.size(); ++i) {
-    status =
-        context->MakeShapeFromTensorShape(shapes.at(i), &shape_handles.at(i));
-    if (!status.ok()) return Status::OK();
+
+  // Check the sanity of the output shapes.
+  std::vector<tensorflow::TensorShape> output_shapes;
+  TF_RETURN_IF_ERROR(c->GetAttr("output_shapes", &output_shapes));
+  if (output_shapes.size() != c->num_outputs()) {
+    return tensorflow::errors::InvalidArgument(
+        "The actual number of outputs doesn't match the number of output "
+        "shapes set in the attr: ",
+        c->num_outputs(), " vs ", output_shapes.size());
   }
-  for (int i = 0; i < context->num_outputs(); ++i) {
-    context->set_output(i, shape_handles.at(i));
+  for (size_t i = 0; i < output_shapes.size(); ++i) {
+    ShapeHandle handle;
+    TF_RETURN_IF_ERROR(
+        c->MakeShapeFromTensorShape(output_shapes.at(i), &handle));
+    if (input_match) c->set_output(i, handle);
   }
   return Status::OK();
 }
+
 }  // namespace shape_inference
 }  // namespace tensorflow
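The rewritten shape function follows a merge-then-set policy: recorded output shapes are published only when every recorded input shape is compatible with the actual input. A minimal sketch of that policy with plain vectors, where -1 stands in for an unknown dimension:

#include <cstddef>
#include <iostream>
#include <vector>

bool Compatible(const std::vector<int>& a, const std::vector<int>& b) {
  if (a.size() != b.size()) return false;
  for (std::size_t i = 0; i < a.size(); ++i) {
    if (a[i] != b[i] && a[i] != -1 && b[i] != -1) return false;
  }
  return true;
}

int main() {
  const std::vector<int> recorded = {1, 3, 224, 224};  // from the attr
  const std::vector<int> actual = {1, 3, 224, 224};    // from the graph
  const bool input_match = Compatible(recorded, actual);
  std::cout << (input_match ? "publish recorded output shapes"
                            : "leave outputs unknown")
            << "\n";
  return 0;
}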