// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
#include <algorithm>
#include <limits>
#include <string>
#include <vector>

#include "tensorflow/contrib/boosted_trees/lib/learner/common/stats/node-stats.h"
#include "tensorflow/contrib/boosted_trees/proto/split_info.pb.h"
#include "tensorflow/contrib/boosted_trees/proto/tree_config.pb.h"
#include "tensorflow/core/framework/device_base.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/lib/random/random.h"
#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/platform/protobuf.h"
#include "tensorflow/core/platform/types.h"

namespace tensorflow {

using boosted_trees::learner::LearnerConfig;
using boosted_trees::learner::LearnerConfig_MultiClassStrategy;
using boosted_trees::learner::ObliviousSplitInfo;
using boosted_trees::learner::SplitInfo;
using boosted_trees::learner::stochastic::GradientStats;
using boosted_trees::learner::stochastic::NodeStats;

namespace {
const int32 DUMMY_FEATURE_DIMENSION = -1;
}  // namespace

class SplitBuilderState {
 public:
  explicit SplitBuilderState(OpKernelContext* const context) {
    const Tensor* l1_regularization_t;
    OP_REQUIRES_OK(context,
                   context->input("l1_regularization", &l1_regularization_t));
    const Tensor* l2_regularization_t;
    OP_REQUIRES_OK(context,
                   context->input("l2_regularization", &l2_regularization_t));
    const Tensor* tree_complexity_regularization_t;
    OP_REQUIRES_OK(context, context->input("tree_complexity_regularization",
                                           &tree_complexity_regularization_t));
    const Tensor* min_node_weight_t;
    OP_REQUIRES_OK(context,
                   context->input("min_node_weight", &min_node_weight_t));

    const Tensor* feature_column_group_id_t;
    OP_REQUIRES_OK(context, context->input("feature_column_group_id",
                                           &feature_column_group_id_t));

    const Tensor* multiclass_strategy_t;
    OP_REQUIRES_OK(
        context, context->input("multiclass_strategy", &multiclass_strategy_t));
    int strategy = multiclass_strategy_t->scalar<int32>()();
    OP_REQUIRES(
        context,
        boosted_trees::learner::LearnerConfig_MultiClassStrategy_IsValid(
            strategy),
        errors::InvalidArgument("Wrong multiclass strategy passed."));
    multiclass_strategy_ = LearnerConfig_MultiClassStrategy(strategy);

    const Tensor* class_id_t;
    OP_REQUIRES_OK(context, context->input("class_id", &class_id_t));
    OP_REQUIRES(context, TensorShapeUtils::IsScalar(class_id_t->shape()),
                errors::InvalidArgument("class_id must be a scalar."));
    class_id_ = class_id_t->scalar<int32>()();

    l1_regularization_ = l1_regularization_t->scalar<float>()();
    l2_regularization_ = l2_regularization_t->scalar<float>()();
    tree_complexity_regularization_ =
        tree_complexity_regularization_t->scalar<float>()();
    min_node_weight_ = min_node_weight_t->scalar<float>()();
    feature_column_group_id_ = feature_column_group_id_t->scalar<int32>()();
  }

  NodeStats ComputeNodeStats(const GradientStats& grad_stats) {
    return NodeStats(l1_regularization_, l2_regularization_, min_node_weight_,
                     multiclass_strategy_, grad_stats);
  }

  void FillLeaf(const NodeStats& best_node_stats,
                boosted_trees::trees::Leaf* leaf) const {
    if (class_id_ == -1) {
      // This would be the case either for TREE_PER_CLASS with only 2 classes,
      // or for other multiclass strategies.
      for (float f : best_node_stats.weight_contribution) {
        leaf->mutable_vector()->add_value(f);
      }
    } else {
      CHECK(best_node_stats.weight_contribution.size() == 1)
          << "Weight contribution size = "
          << best_node_stats.weight_contribution.size();
      leaf->mutable_sparse_vector()->add_index(class_id_);
      leaf->mutable_sparse_vector()->add_value(
          best_node_stats.weight_contribution[0]);
    }
  }

  int32 feature_column_group_id() { return feature_column_group_id_; }
  float tree_complexity_regularization() {
    return tree_complexity_regularization_;
  }

 private:
  LearnerConfig_MultiClassStrategy multiclass_strategy_;
  float l1_regularization_;
  float l2_regularization_;
  float tree_complexity_regularization_;
  float min_node_weight_;
  int32 class_id_;
  int32 feature_column_group_id_;
};

class BuildDenseInequalitySplitsOp : public OpKernel {
 public:
  explicit BuildDenseInequalitySplitsOp(OpKernelConstruction* const context)
      : OpKernel(context) {}

  void Compute(OpKernelContext* const context) override {
    const Tensor* num_minibatches_t;
    OP_REQUIRES_OK(context,
                   context->input("num_minibatches", &num_minibatches_t));
    const int64 num_minibatches = num_minibatches_t->scalar<int64>()();
    const float normalizer_ratio = (1.0f / num_minibatches);

    const Tensor* bucket_boundaries_t;
    OP_REQUIRES_OK(context,
                   context->input("bucket_boundaries", &bucket_boundaries_t));
    const auto& bucket_boundaries = bucket_boundaries_t->vec<float>();

    const Tensor* partition_ids_t;
    OP_REQUIRES_OK(context,
                   context->input("partition_ids", &partition_ids_t));
    const auto& partition_ids = partition_ids_t->vec<int32>();

    const Tensor* bucket_ids_t;
    OP_REQUIRES_OK(context, context->input("bucket_ids", &bucket_ids_t));
    const auto& bucket_ids = bucket_ids_t->matrix<int64>();

    const Tensor* gradients_t;
    OP_REQUIRES_OK(context, context->input("gradients", &gradients_t));

    const Tensor* hessians_t;
    OP_REQUIRES_OK(context, context->input("hessians", &hessians_t));

    const Tensor* weak_learner_type_t;
    OP_REQUIRES_OK(context,
                   context->input("weak_learner_type", &weak_learner_type_t));
    const int32 weak_learner_type = weak_learner_type_t->scalar<int32>()();

    // Find the number of unique partitions before we allocate the output.
    std::vector<int32> partition_boundaries;
    partition_boundaries.push_back(0);
    for (int i = 1; i < partition_ids.size(); ++i) {
      if (partition_ids(i) != partition_ids(i - 1)) {
        // Make sure the input is sorted by partition_ids.
        OP_REQUIRES(context, partition_ids(i) >= partition_ids(i - 1),
                    errors::InvalidArgument("Input should be sorted."));
        partition_boundaries.push_back(i);
      }
    }
    if (partition_ids.size() > 0) {
      partition_boundaries.push_back(partition_ids.size());
    }
    int32 num_elements = partition_boundaries.size() - 1;
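    // Illustrative example: for partition_ids = [0, 0, 1, 1, 2], the loop
    // above records boundaries [0, 2, 4] and the sentinel push appends 5,
    // giving partition ranges [0, 2), [2, 4), [4, 5) and num_elements = 3.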
    // When the handler is inactive, no bucket boundaries are built for it.
    if (bucket_boundaries.size() == 0) {
      num_elements = 0;
    }

    Tensor* output_partition_ids_t = nullptr;
    OP_REQUIRES_OK(context,
                   context->allocate_output("output_partition_ids",
                                            TensorShape({num_elements}),
                                            &output_partition_ids_t));
    tensorflow::TTypes<int32>::Vec output_partition_ids =
        output_partition_ids_t->vec<int32>();

    // For a normal tree, we output a split per partition. For an oblivious
    // tree, we output one split for all partitions of the layer.
    int32 size_output = num_elements;
    if (weak_learner_type == LearnerConfig::OBLIVIOUS_DECISION_TREE &&
        num_elements > 0) {
      size_output = 1;
    }

    Tensor* gains_t = nullptr;
    OP_REQUIRES_OK(context,
                   context->allocate_output(
                       "gains", TensorShape({size_output}), &gains_t));
    tensorflow::TTypes<float>::Vec gains = gains_t->vec<float>();

    Tensor* output_splits_t = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(
                                "split_infos", TensorShape({size_output}),
                                &output_splits_t));
    tensorflow::TTypes<string>::Vec output_splits =
        output_splits_t->vec<string>();

    if (num_elements == 0) {
      return;
    }
    SplitBuilderState state(context);
    switch (weak_learner_type) {
      case LearnerConfig::NORMAL_DECISION_TREE: {
        ComputeNormalDecisionTree(
            &state, normalizer_ratio, num_elements, partition_boundaries,
            bucket_boundaries, partition_ids, bucket_ids, gradients_t,
            hessians_t, &output_partition_ids, &gains, &output_splits);
        break;
      }
      case LearnerConfig::OBLIVIOUS_DECISION_TREE: {
        ComputeObliviousDecisionTree(
            &state, normalizer_ratio, num_elements, partition_boundaries,
            bucket_boundaries, partition_ids, bucket_ids, gradients_t,
            hessians_t, &output_partition_ids, &gains, &output_splits);
        break;
      }
    }
  }

 private:
  void ComputeNormalDecisionTree(
      SplitBuilderState* state, const float normalizer_ratio,
      const int num_elements, const std::vector<int32>& partition_boundaries,
      const tensorflow::TTypes<float>::ConstVec& bucket_boundaries,
      const tensorflow::TTypes<int32>::ConstVec& partition_ids,
      const tensorflow::TTypes<int64>::ConstMatrix& bucket_ids,
      const Tensor* gradients_t, const Tensor* hessians_t,
      tensorflow::TTypes<int32>::Vec* output_partition_ids,
      tensorflow::TTypes<float>::Vec* gains,
      tensorflow::TTypes<string>::Vec* output_splits) {
    for (int root_idx = 0; root_idx < num_elements; ++root_idx) {
      float best_gain = std::numeric_limits<float>::lowest();
      int start_index = partition_boundaries[root_idx];
      int end_index = partition_boundaries[root_idx + 1];
      GradientStats root_gradient_stats;
      for (int64 bucket_idx = start_index; bucket_idx < end_index;
           ++bucket_idx) {
        root_gradient_stats +=
            GradientStats(*gradients_t, *hessians_t, bucket_idx);
      }
      root_gradient_stats *= normalizer_ratio;
      NodeStats root_stats = state->ComputeNodeStats(root_gradient_stats);
      int32 best_bucket_idx = 0;
      NodeStats best_right_node_stats(0);
      NodeStats best_left_node_stats(0);
      GradientStats left_gradient_stats;
      for (int64 bucket_idx = start_index; bucket_idx < end_index;
           ++bucket_idx) {
        GradientStats g(*gradients_t, *hessians_t, bucket_idx);
        g *= normalizer_ratio;
        left_gradient_stats += g;
        NodeStats left_stats = state->ComputeNodeStats(left_gradient_stats);
        GradientStats right_gradient_stats =
            root_gradient_stats - left_gradient_stats;
        NodeStats right_stats = state->ComputeNodeStats(right_gradient_stats);
        if (left_stats.gain + right_stats.gain > best_gain) {
          best_gain = left_stats.gain + right_stats.gain;
          best_left_node_stats = left_stats;
          best_right_node_stats = right_stats;
          best_bucket_idx = bucket_idx;
        }
      }
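      // The gain reported below follows the usual boosted-trees split score:
      //   split_gain = left.gain + right.gain - root.gain - tree_complexity,
      // where each NodeStats gain is computed by ComputeNodeStats under the
      // configured L1/L2 regularization (for a single logit, roughly
      // g^2 / (h + l2) after L1 shrinkage of the gradient sum g).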
      SplitInfo split_info;
      auto* dense_split =
          split_info.mutable_split_node()->mutable_dense_float_binary_split();
      dense_split->set_feature_column(state->feature_column_group_id());
      dense_split->set_threshold(
          bucket_boundaries(bucket_ids(best_bucket_idx, 0)));

      auto* left_child = split_info.mutable_left_child();
      auto* right_child = split_info.mutable_right_child();

      state->FillLeaf(best_left_node_stats, left_child);
      state->FillLeaf(best_right_node_stats, right_child);

      split_info.SerializeToString(&(*output_splits)(root_idx));
      (*gains)(root_idx) = best_gain - root_stats.gain -
                           state->tree_complexity_regularization();
      (*output_partition_ids)(root_idx) = partition_ids(start_index);
    }
  }

  void ComputeObliviousDecisionTree(
      SplitBuilderState* state, const float normalizer_ratio,
      const int num_elements, const std::vector<int32>& partition_boundaries,
      const tensorflow::TTypes<float>::ConstVec& bucket_boundaries,
      const tensorflow::TTypes<int32>::ConstVec& partition_ids,
      const tensorflow::TTypes<int64>::ConstMatrix& bucket_ids,
      const Tensor* gradients_t, const Tensor* hessians_t,
      tensorflow::TTypes<int32>::Vec* output_partition_ids,
      tensorflow::TTypes<float>::Vec* gains,
      tensorflow::TTypes<string>::Vec* output_splits) {
    // Holds the root stats per each node to be split.
    std::vector<GradientStats> current_layer_stats;
    current_layer_stats.reserve(num_elements);
    for (int root_idx = 0; root_idx < num_elements; root_idx++) {
      const int start_index = partition_boundaries[root_idx];
      const int end_index = partition_boundaries[root_idx + 1];
      GradientStats root_gradient_stats;
      for (int64 bucket_idx = start_index; bucket_idx < end_index;
           ++bucket_idx) {
        root_gradient_stats +=
            GradientStats(*gradients_t, *hessians_t, bucket_idx);
      }
      root_gradient_stats *= normalizer_ratio;
      current_layer_stats.push_back(root_gradient_stats);
    }

    float best_gain = std::numeric_limits<float>::lowest();
    int64 best_bucket_id = 0;
    std::vector<NodeStats> best_right_node_stats(num_elements, NodeStats(0));
    std::vector<NodeStats> best_left_node_stats(num_elements, NodeStats(0));
    std::vector<NodeStats> current_left_node_stats(num_elements, NodeStats(0));
    std::vector<NodeStats> current_right_node_stats(num_elements,
                                                    NodeStats(0));
    int64 current_bucket_id = std::numeric_limits<int64>::max();
    int64 last_bucket_id = -1;
    // Find the lowest bucket id; this is going to be the first bucket id to
    // try.
    for (int root_idx = 0; root_idx < num_elements; root_idx++) {
      const int start_index = partition_boundaries[root_idx];
      if (bucket_ids(start_index, 0) < current_bucket_id) {
        current_bucket_id = bucket_ids(start_index, 0);
      }
    }
    // Index offsets for each of the partitions that can be used to access
    // gradients of a partition for the current bucket we consider.
    std::vector<int> current_layer_offsets(num_elements, 0);
    std::vector<GradientStats> left_gradient_stats(num_elements);
    // The idea is to try every bucket id in increasing order. In each
    // iteration we calculate the gain of the layer using the current bucket
    // id as the split value, and we also obtain the next bucket id to try.
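    // Illustrative walk-through: with two partitions whose bucket ids are
    // [0, 2] and [1, 2], the loop below visits bucket ids 0, 1, 2 in
    // increasing order. At each id it advances only the partitions whose next
    // bucket matches current_bucket_id, and next_bucket_id becomes the
    // smallest not-yet-visited bucket across all partitions; the scan stops
    // once next_bucket_id stays at -1.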
    while (current_bucket_id > last_bucket_id) {
      last_bucket_id = current_bucket_id;
      int64 next_bucket_id = -1;
      for (int root_idx = 0; root_idx < num_elements; root_idx++) {
        int idx =
            current_layer_offsets[root_idx] + partition_boundaries[root_idx];
        const int end_index = partition_boundaries[root_idx + 1];
        if (idx < end_index && bucket_ids(idx, 0) == current_bucket_id) {
          GradientStats g(*gradients_t, *hessians_t, idx);
          g *= normalizer_ratio;
          left_gradient_stats[root_idx] += g;
          current_layer_offsets[root_idx]++;
          idx++;
        }
        if (idx < end_index &&
            (bucket_ids(idx, 0) < next_bucket_id || next_bucket_id == -1)) {
          next_bucket_id = bucket_ids(idx, 0);
        }
      }
      float gain_of_split = 0.0;
      for (int root_idx = 0; root_idx < num_elements; root_idx++) {
        GradientStats right_gradient_stats =
            current_layer_stats[root_idx] - left_gradient_stats[root_idx];
        NodeStats left_stat =
            state->ComputeNodeStats(left_gradient_stats[root_idx]);
        NodeStats right_stat = state->ComputeNodeStats(right_gradient_stats);
        gain_of_split += left_stat.gain + right_stat.gain;
        current_left_node_stats[root_idx] = left_stat;
        current_right_node_stats[root_idx] = right_stat;
      }
      if (gain_of_split > best_gain) {
        best_gain = gain_of_split;
        best_left_node_stats = current_left_node_stats;
        best_right_node_stats = current_right_node_stats;
        best_bucket_id = current_bucket_id;
      }
      current_bucket_id = next_bucket_id;
    }

    for (int root_idx = 0; root_idx < num_elements; root_idx++) {
      best_gain -= state->ComputeNodeStats(current_layer_stats[root_idx]).gain;
    }
    best_gain -= num_elements * state->tree_complexity_regularization();

    ObliviousSplitInfo oblivious_split_info;
    auto* oblivious_dense_split =
        oblivious_split_info.mutable_split_node()
            ->mutable_oblivious_dense_float_binary_split();
    oblivious_dense_split->set_feature_column(
        state->feature_column_group_id());
    oblivious_dense_split->set_threshold(bucket_boundaries(best_bucket_id));
    (*gains)(0) = best_gain;

    for (int root_idx = 0; root_idx < num_elements; root_idx++) {
      auto* left_child = oblivious_split_info.add_children();
      auto* right_child = oblivious_split_info.add_children();

      state->FillLeaf(best_left_node_stats[root_idx], left_child);
      state->FillLeaf(best_right_node_stats[root_idx], right_child);

      const int start_index = partition_boundaries[root_idx];
      (*output_partition_ids)(root_idx) = partition_ids(start_index);
      oblivious_split_info.add_children_parent_id(partition_ids(start_index));
    }
    oblivious_split_info.SerializeToString(&(*output_splits)(0));
  }
};

REGISTER_KERNEL_BUILDER(Name("BuildDenseInequalitySplits").Device(DEVICE_CPU),
                        BuildDenseInequalitySplitsOp);

class BuildSparseInequalitySplitsOp : public OpKernel {
 public:
  explicit BuildSparseInequalitySplitsOp(OpKernelConstruction* const context)
      : OpKernel(context) {}

  void Compute(OpKernelContext* const context) override {
    const Tensor* num_minibatches_t;
    OP_REQUIRES_OK(context,
                   context->input("num_minibatches", &num_minibatches_t));
    const int64 num_minibatches = num_minibatches_t->scalar<int64>()();
    const float normalizer_ratio = (1.0f / num_minibatches);

    const Tensor* bucket_boundaries_t;
    OP_REQUIRES_OK(context,
                   context->input("bucket_boundaries", &bucket_boundaries_t));
    const auto& bucket_boundaries = bucket_boundaries_t->vec<float>();

    const Tensor* partition_ids_t;
    OP_REQUIRES_OK(context,
                   context->input("partition_ids", &partition_ids_t));
    const auto& partition_ids = partition_ids_t->vec<int32>();

    const Tensor* bucket_ids_t;
    OP_REQUIRES_OK(context, context->input("bucket_ids", &bucket_ids_t));
    const auto& bucket_ids_and_dimensions = bucket_ids_t->matrix<int64>();
    const int32 tensor_elements = partition_ids.size();

    const Tensor* gradients_t;
    OP_REQUIRES_OK(context, context->input("gradients", &gradients_t));

    const Tensor* hessians_t;
    OP_REQUIRES_OK(context, context->input("hessians", &hessians_t));

    const Tensor* bias_feature_id_t;
    OP_REQUIRES_OK(context,
                   context->input("bias_feature_id", &bias_feature_id_t));
    int64 bias_feature_id = bias_feature_id_t->scalar<int64>()();

    // For each partition (tree node), store the starting index for each
    // dimension.
    PartitionAndDimensionBoundaries partition_boundaries;
    // Stores indices in partition_boundaries for those partitions that are
    // not empty (have at least one dimension and a bucket apart from the
    // catch-all bucket with bucket id -1 and dimension 0).
    std::vector<int32> non_empty_partitions;
    bool non_empty_partition = false;
    for (int i = 0; i < partition_ids.size(); ++i) {
      // Make sure the input is sorted by partition_ids.
      if (i > 0) {
        CHECK_LE(partition_ids(i - 1), partition_ids(i))
            << "Partition ids should be sorted. Not sorted for " << i;
      }
      const int32 dimension = bucket_ids_and_dimensions(i, 1);
      if (i == 0 || (partition_ids(i) != partition_ids(i - 1))) {
        if (i != 0) {
          // Not the first entry, so the partition has changed.
          if (non_empty_partition) {
            // Save the id of the previous partition in the list of non-empty
            // partitions, since it was non-empty (it had more than just the
            // bias bucket -1).
            non_empty_partitions.push_back(partition_boundaries.size() - 1);
          }
          // Add a dummy dimension to signify the end of the previous
          // dimension.
          partition_boundaries.back().emplace_back(DUMMY_FEATURE_DIMENSION, i);
        }
        // Allocate for a new partition.
        partition_boundaries.emplace_back();
        // Save info about the first dimension for a new partition.
        partition_boundaries.back().emplace_back(dimension, i);
        // Each partition has a dummy -1 bucket with all gradients and then
        // info for all other dimensions -> if we have >1 elements for a
        // partition, then it is not empty.
        non_empty_partition = (i < partition_ids.size() - 1) &&
                              (partition_ids(i) == partition_ids(i + 1));
      } else if (bucket_ids_and_dimensions(i, 1) !=
                 bucket_ids_and_dimensions(i - 1, 1)) {
        // Dimension changed.
        partition_boundaries.back().emplace_back(dimension, i);
      }
    }
    if (tensor_elements > 0) {
      if (non_empty_partition) {
        non_empty_partitions.push_back(partition_boundaries.size() - 1);
      }
      // Add a dummy dimension to signify the end of the previous dimension.
      partition_boundaries.back().emplace_back(DUMMY_FEATURE_DIMENSION,
                                               partition_ids.size());
    }
    int num_elements = non_empty_partitions.size();
    Tensor* output_partition_ids_t = nullptr;
    OP_REQUIRES_OK(context,
                   context->allocate_output("output_partition_ids",
                                            TensorShape({num_elements}),
                                            &output_partition_ids_t));
    tensorflow::TTypes<int32>::Vec output_partition_ids =
        output_partition_ids_t->vec<int32>();

    Tensor* gains_t = nullptr;
    OP_REQUIRES_OK(
        context, context->allocate_output("gains", TensorShape({num_elements}),
                                          &gains_t));
    tensorflow::TTypes<float>::Vec gains = gains_t->vec<float>();

    Tensor* output_splits_t = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(
                                "split_infos", TensorShape({num_elements}),
                                &output_splits_t));
    tensorflow::TTypes<string>::Vec output_splits =
        output_splits_t->vec<string>();
    SplitBuilderState state(context);
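    // Layout reminder (illustrative): each entry of partition_boundaries is a
    // list of (dimension_id, start_index) pairs, beginning with the bias
    // bucket at dimension 0 and terminated by a DUMMY_FEATURE_DIMENSION
    // sentinel, so dimension j of a partition spans
    // [pairs[j].start_index, pairs[j + 1].start_index).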
    // For each tree node that needs to be split.
    for (int root_idx = 0; root_idx < num_elements; ++root_idx) {
      const auto& dimension_boundaries =
          partition_boundaries[non_empty_partitions[root_idx]];

      float best_gain = std::numeric_limits<float>::lowest();
      int32 best_dimension_idx = 0;
      bool default_right = false;
      int32 best_element_idx = 0;

      NodeStats best_right_node_stats(0);
      NodeStats best_left_node_stats(0);

      // For each partition, the first bucket is the dummy catch-all.
      int32 bias_start_index = dimension_boundaries[0].start_index;
      OP_REQUIRES(
          context,
          bucket_ids_and_dimensions(bias_start_index, 0) == bias_feature_id,
          errors::InvalidArgument("Bias feature ID missing."));
      // The dimension for the bias feature is always 0.
      OP_REQUIRES(
          context, bucket_ids_and_dimensions(bias_start_index, 1) == 0,
          errors::InvalidArgument("Bias feature ID must be with dimension 0."));

      // For each root, we do two passes over the quantized feature buckets,
      // accumulating gradients on one side and using the root aggregate
      // gradients to get the gradients for the other side.
      // Split gains are evaluated for each pass at every threshold and the
      // best split is picked.
      GradientStats root_gradient_stats(*gradients_t, *hessians_t,
                                        bias_start_index);
      root_gradient_stats *= normalizer_ratio;
      NodeStats root_stats = state.ComputeNodeStats(root_gradient_stats);

      // Iterate through dimensions.
      for (int j = 0; j < dimension_boundaries.size() - 1; ++j) {
        const DimensionBoundary& dimension_and_start = dimension_boundaries[j];
        const int32 dimension_id = dimension_and_start.dimension_id;

        int start_index = dimension_and_start.start_index;
        // Even for the last dimension, we always have an additional dummy
        // dimension that we can use to find the end index.
        const int end_index =
            partition_boundaries[non_empty_partitions[root_idx]][j + 1]
                .start_index;
        if (bucket_ids_and_dimensions(start_index, 0) == bias_feature_id) {
          // 0-dimension case which has a first bucket for the catch-all
          // feature.
          CHECK(bucket_ids_and_dimensions(start_index, 1) == 0)
              << "Dimension of bias feature should be 0";
          ++start_index;
        }

        GradientStats present_gradient_stats;
        for (int64 bucket_idx = start_index; bucket_idx < end_index;
             ++bucket_idx) {
          present_gradient_stats +=
              GradientStats(*gradients_t, *hessians_t, bucket_idx);
        }
        present_gradient_stats *= normalizer_ratio;
        GradientStats not_present =
            root_gradient_stats - present_gradient_stats;
        // If there was (almost) no sparsity, fix the default direction to
        // LEFT.
        bool fixed_default_direction = not_present.IsAlmostZero();

        GradientStats left_gradient_stats;
        for (int64 element_idx = start_index; element_idx < end_index;
             ++element_idx) {
          // Check that bucket ids are sorted.
          if (element_idx != start_index) {
            CHECK(bucket_ids_and_dimensions(element_idx - 1, 0) <
                  bucket_ids_and_dimensions(element_idx, 0))
                << "Bucket ids must be sorted."
                << ", problem on " << element_idx << " and dimension is " << j;
          }

          GradientStats g(*gradients_t, *hessians_t, element_idx);
          g *= normalizer_ratio;
          left_gradient_stats += g;

          // We have the sum of all present gradients. Use that to compute the
          // backward-pass gradients.
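          // Two candidate splits are scored at every threshold. "Default
          // left" routes missing (not-present) examples to the left child:
          //   left = root - right_present, right = right_present;
          // "default right" routes them to the right child:
          //   left = left_present, right = root - left_present.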
          GradientStats right_gradient_stats =
              present_gradient_stats - left_gradient_stats;

          {
            NodeStats left_stats_default_left = state.ComputeNodeStats(
                root_gradient_stats - right_gradient_stats);
            NodeStats right_stats_default_left =
                state.ComputeNodeStats(right_gradient_stats);
            if (left_stats_default_left.gain + right_stats_default_left.gain >
                best_gain) {
              best_gain = left_stats_default_left.gain +
                          right_stats_default_left.gain;
              best_left_node_stats = left_stats_default_left;
              best_right_node_stats = right_stats_default_left;
              best_element_idx = element_idx;
              default_right = false;
              best_dimension_idx = dimension_id;
            }
          }

          // Consider calculating the default direction only when there were
          // enough missing examples.
          if (!fixed_default_direction) {
            NodeStats left_stats_default_right =
                state.ComputeNodeStats(left_gradient_stats);
            NodeStats right_stats_default_right = state.ComputeNodeStats(
                root_gradient_stats - left_gradient_stats);
            if (left_stats_default_right.gain +
                    right_stats_default_right.gain >
                best_gain) {
              best_gain = left_stats_default_right.gain +
                          right_stats_default_right.gain;
              best_left_node_stats = left_stats_default_right;
              best_right_node_stats = right_stats_default_right;
              best_element_idx = element_idx;
              default_right = true;
              best_dimension_idx = dimension_id;
            }
          }
        }
      }
      SplitInfo split_info;
      boosted_trees::trees::DenseFloatBinarySplit* dense_split = nullptr;
      if (default_right) {
        dense_split =
            split_info.mutable_split_node()
                ->mutable_sparse_float_binary_split_default_right()
                ->mutable_split();
      } else {
        dense_split = split_info.mutable_split_node()
                          ->mutable_sparse_float_binary_split_default_left()
                          ->mutable_split();
      }
      dense_split->set_feature_column(state.feature_column_group_id());
      // Set the feature index for the best feature column.
      const int64 best_dimension_id =
          bucket_ids_and_dimensions(best_element_idx, 1);
      const int32 best_bucket_id =
          bucket_ids_and_dimensions(best_element_idx, 0);
      dense_split->set_dimension_id(best_dimension_id);
      dense_split->set_threshold(bucket_boundaries(best_bucket_id));

      auto* left_child = split_info.mutable_left_child();
      auto* right_child = split_info.mutable_right_child();
      state.FillLeaf(best_left_node_stats, left_child);
      state.FillLeaf(best_right_node_stats, right_child);
      split_info.SerializeToString(&output_splits(root_idx));
      gains(root_idx) =
          best_gain - root_stats.gain - state.tree_complexity_regularization();
      output_partition_ids(root_idx) = partition_ids(bias_start_index);
    }
  }

 private:
  struct DimensionBoundary {
    DimensionBoundary(const int32 dimension_id, const int32 start_index)
        : dimension_id(dimension_id), start_index(start_index) {}

    int32 dimension_id;
    int32 start_index;
  };

  // For each partition, stores the start indices of feature column
  // dimensions.
  typedef std::vector<std::vector<DimensionBoundary>>
      PartitionAndDimensionBoundaries;
};

REGISTER_KERNEL_BUILDER(Name("BuildSparseInequalitySplits").Device(DEVICE_CPU),
                        BuildSparseInequalitySplitsOp);

class BuildCategoricalEqualitySplitsOp : public OpKernel {
 public:
  explicit BuildCategoricalEqualitySplitsOp(
      OpKernelConstruction* const context)
      : OpKernel(context) {}

  void Compute(OpKernelContext* const context) override {
    const Tensor* num_minibatches_t;
    OP_REQUIRES_OK(context,
                   context->input("num_minibatches", &num_minibatches_t));
    const int64 num_minibatches = num_minibatches_t->scalar<int64>()();
    const float normalizer_ratio = (1.0f / num_minibatches);

    const Tensor* partition_ids_t;
    OP_REQUIRES_OK(context,
                   context->input("partition_ids", &partition_ids_t));
    const auto& partition_ids = partition_ids_t->vec<int32>();

    const Tensor* feature_ids_t;
    OP_REQUIRES_OK(context, context->input("feature_ids", &feature_ids_t));
    const auto& feature_ids = feature_ids_t->matrix<int64>();

    const Tensor* gradients_t;
    OP_REQUIRES_OK(context, context->input("gradients", &gradients_t));

    const Tensor* hessians_t;
    OP_REQUIRES_OK(context, context->input("hessians", &hessians_t));

    const Tensor* bias_feature_id_t;
    OP_REQUIRES_OK(context,
                   context->input("bias_feature_id", &bias_feature_id_t));
    int64 bias_feature_id = bias_feature_id_t->scalar<int64>()();

    const Tensor* weak_learner_type_t;
    OP_REQUIRES_OK(context,
                   context->input("weak_learner_type", &weak_learner_type_t));
    const int32 weak_learner_type = weak_learner_type_t->scalar<int32>()();

    // Find the number of unique partitions before we allocate the output.
    std::vector<int32> partition_boundaries;
    partition_boundaries.push_back(0);
    for (int i = 1; i < partition_ids.size(); ++i) {
      // Make sure the input is sorted by partition_ids.
      OP_REQUIRES(context, partition_ids(i - 1) <= partition_ids(i),
                  errors::InvalidArgument("Partition IDs must be sorted."));
      if (partition_ids(i) != partition_ids(i - 1)) {
        partition_boundaries.push_back(i);
      }
    }
    std::vector<int32> non_empty_partitions;
    partition_boundaries.push_back(partition_ids.size());
    for (int i = 0; i < partition_boundaries.size() - 1; ++i) {
      // We want to ignore partitions with only the bias term.
      if (partition_boundaries[i + 1] - partition_boundaries[i] >= 2) {
        non_empty_partitions.push_back(i);
      }
    }
    int num_elements = non_empty_partitions.size();
    Tensor* output_partition_ids_t = nullptr;
    OP_REQUIRES_OK(context,
                   context->allocate_output("output_partition_ids",
                                            TensorShape({num_elements}),
                                            &output_partition_ids_t));
    tensorflow::TTypes<int32>::Vec output_partition_ids =
        output_partition_ids_t->vec<int32>();

    // For a normal tree, we output a split per partition. For an oblivious
    // tree, we output one split for all partitions of the layer.
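    // For example, with 4 non-empty partitions a normal tree emits 4
    // SplitInfo protos (one per partition), while an oblivious tree emits a
    // single ObliviousSplitInfo that shares one feature id and carries
    // left/right leaves for every partition of the layer.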
    int size_output = num_elements;
    if (weak_learner_type == LearnerConfig::OBLIVIOUS_DECISION_TREE &&
        num_elements > 0) {
      size_output = 1;
    }

    Tensor* gains_t = nullptr;
    OP_REQUIRES_OK(context,
                   context->allocate_output(
                       "gains", TensorShape({size_output}), &gains_t));
    tensorflow::TTypes<float>::Vec gains = gains_t->vec<float>();

    Tensor* output_splits_t = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(
                                "split_infos", TensorShape({size_output}),
                                &output_splits_t));
    tensorflow::TTypes<string>::Vec output_splits =
        output_splits_t->vec<string>();
    if (num_elements == 0) {
      return;
    }
    SplitBuilderState state(context);
    switch (weak_learner_type) {
      case LearnerConfig::NORMAL_DECISION_TREE: {
        ComputeNormalDecisionTree(
            context, &state, normalizer_ratio, num_elements,
            partition_boundaries, non_empty_partitions, bias_feature_id,
            partition_ids, feature_ids, gradients_t, hessians_t,
            &output_partition_ids, &gains, &output_splits);
        break;
      }
      case LearnerConfig::OBLIVIOUS_DECISION_TREE: {
        ComputeObliviousDecisionTree(
            context, &state, normalizer_ratio, num_elements,
            partition_boundaries, non_empty_partitions, bias_feature_id,
            partition_ids, feature_ids, gradients_t, hessians_t,
            &output_partition_ids, &gains, &output_splits);
        break;
      }
    }
  }

 private:
  void ComputeNormalDecisionTree(
      OpKernelContext* const context, SplitBuilderState* state,
      const float normalizer_ratio, const int num_elements,
      const std::vector<int32>& partition_boundaries,
      const std::vector<int32>& non_empty_partitions,
      const int64 bias_feature_id,
      const tensorflow::TTypes<int32>::ConstVec& partition_ids,
      const tensorflow::TTypes<int64>::ConstMatrix& feature_ids,
      const Tensor* gradients_t, const Tensor* hessians_t,
      tensorflow::TTypes<int32>::Vec* output_partition_ids,
      tensorflow::TTypes<float>::Vec* gains,
      tensorflow::TTypes<string>::Vec* output_splits) {
    for (int root_idx = 0; root_idx < num_elements; ++root_idx) {
      float best_gain = std::numeric_limits<float>::lowest();
      int start_index = partition_boundaries[non_empty_partitions[root_idx]];
      int end_index = partition_boundaries[non_empty_partitions[root_idx] + 1];
      // The first feature ID in each partition should be the bias feature.
      OP_REQUIRES(context, feature_ids(start_index, 0) == bias_feature_id,
                  errors::InvalidArgument("Bias feature ID missing."));
      GradientStats root_gradient_stats(*gradients_t, *hessians_t,
                                        start_index);
      root_gradient_stats *= normalizer_ratio;
      NodeStats root_stats = state->ComputeNodeStats(root_gradient_stats);
      int32 best_feature_idx = 0;
      NodeStats best_right_node_stats(0);
      NodeStats best_left_node_stats(0);
      for (int64 feature_idx = start_index + 1; feature_idx < end_index;
           ++feature_idx) {
        GradientStats left_gradient_stats(*gradients_t, *hessians_t,
                                          feature_idx);
        left_gradient_stats *= normalizer_ratio;
        GradientStats right_gradient_stats =
            root_gradient_stats - left_gradient_stats;
        NodeStats left_stats = state->ComputeNodeStats(left_gradient_stats);
        NodeStats right_stats = state->ComputeNodeStats(right_gradient_stats);
        if (left_stats.gain + right_stats.gain > best_gain) {
          best_gain = left_stats.gain + right_stats.gain;
          best_left_node_stats = left_stats;
          best_right_node_stats = right_stats;
          best_feature_idx = feature_idx;
        }
      }
      SplitInfo split_info;
      auto* equality_split = split_info.mutable_split_node()
                                 ->mutable_categorical_id_binary_split();
      equality_split->set_feature_column(state->feature_column_group_id());
      CHECK(feature_ids(best_feature_idx, 0) != bias_feature_id)
          << "Unexpected feature ID selected. "
" << "Start feature ID: [" << start_index << "] " << feature_ids(start_index, 0) << ", " << feature_ids(start_index, 1) << "\nBest feature ID: [" << best_feature_idx << "] " << feature_ids(best_feature_idx, 0) << ", " << feature_ids(best_feature_idx, 1) << "\nPartition IDS: " << partition_ids(start_index) << " " << partition_ids(best_feature_idx); equality_split->set_feature_id(feature_ids(best_feature_idx, 0)); auto* left_child = split_info.mutable_left_child(); auto* right_child = split_info.mutable_right_child(); state->FillLeaf(best_left_node_stats, left_child); state->FillLeaf(best_right_node_stats, right_child); split_info.SerializeToString(&(*output_splits)(root_idx)); (*gains)(root_idx) = best_gain - root_stats.gain - state->tree_complexity_regularization(); (*output_partition_ids)(root_idx) = partition_ids(start_index); } } void ComputeObliviousDecisionTree( OpKernelContext* const context, SplitBuilderState* state, const float normalizer_ratio, const int num_elements, const std::vector& partition_boundaries, const std::vector& non_empty_partitions, const int64 bias_feature_id, const tensorflow::TTypes::ConstVec& partition_ids, const tensorflow::TTypes::ConstMatrix& feature_ids, const Tensor* gradients_t, const Tensor* hessians_t, tensorflow::TTypes::Vec* output_partition_ids, tensorflow::TTypes::Vec* gains, tensorflow::TTypes::Vec* output_splits) { // Holds the root stats per each node to be split. std::vector current_layer_stats; current_layer_stats.reserve(num_elements); for (int root_idx = 0; root_idx < num_elements; root_idx++) { const int start_index = partition_boundaries[root_idx]; // First feature ID in each partition should be the bias feature. OP_REQUIRES(context, feature_ids(start_index, 0) == bias_feature_id, errors::InvalidArgument("Bias feature ID missing.")); GradientStats root_gradient_stats(*gradients_t, *hessians_t, start_index); root_gradient_stats *= normalizer_ratio; current_layer_stats.push_back(root_gradient_stats); } float best_gain = std::numeric_limits::lowest(); int64 best_feature_id = 0; std::vector best_right_node_stats(num_elements, NodeStats(0)); std::vector best_left_node_stats(num_elements, NodeStats(0)); std::vector current_left_node_stats(num_elements, NodeStats(0)); std::vector current_right_node_stats(num_elements, NodeStats(0)); int64 current_feature_id = std::numeric_limits::max(); int64 last_feature_id = -1; // Find the lowest feature id, this is going to be the first feature id to // try. for (int root_idx = 0; root_idx < num_elements; root_idx++) { const int start_index = partition_boundaries[root_idx]; if (feature_ids(start_index + 1, 0) < current_feature_id) { current_feature_id = feature_ids(start_index + 1, 0); } } // Indexes offsets for each of the partitions that can be used to access // gradients of a partition for a current feature we consider. Start at one // beacuse the zero index is for the bias. std::vector current_layer_offsets(num_elements, 1); // The idea is to try every feature id in increasing order. In each // iteration we calculate the gain of the layer using the current feature id // as split value, and we also obtain the following feature id to try. while (current_feature_id > last_feature_id) { last_feature_id = current_feature_id; int64 next_feature_id = -1; // Left gradient stats per node. 
      std::vector<GradientStats> left_gradient_stats(num_elements);
      for (int root_idx = 0; root_idx < num_elements; root_idx++) {
        int idx =
            current_layer_offsets[root_idx] + partition_boundaries[root_idx];
        const int end_index = partition_boundaries[root_idx + 1];
        if (idx < end_index && feature_ids(idx, 0) == current_feature_id) {
          GradientStats g(*gradients_t, *hessians_t, idx);
          g *= normalizer_ratio;
          left_gradient_stats[root_idx] = g;
          current_layer_offsets[root_idx]++;
          idx++;
        }
        if (idx < end_index &&
            (feature_ids(idx, 0) < next_feature_id || next_feature_id == -1)) {
          next_feature_id = feature_ids(idx, 0);
        }
      }
      float gain_of_split = 0.0;
      for (int root_idx = 0; root_idx < num_elements; root_idx++) {
        GradientStats right_gradient_stats =
            current_layer_stats[root_idx] - left_gradient_stats[root_idx];
        NodeStats left_stat =
            state->ComputeNodeStats(left_gradient_stats[root_idx]);
        NodeStats right_stat = state->ComputeNodeStats(right_gradient_stats);
        gain_of_split += left_stat.gain + right_stat.gain;
        current_left_node_stats[root_idx] = left_stat;
        current_right_node_stats[root_idx] = right_stat;
      }
      if (gain_of_split > best_gain) {
        best_gain = gain_of_split;
        best_left_node_stats = current_left_node_stats;
        best_right_node_stats = current_right_node_stats;
        best_feature_id = current_feature_id;
      }
      current_feature_id = next_feature_id;
    }
    for (int root_idx = 0; root_idx < num_elements; root_idx++) {
      best_gain -= state->ComputeNodeStats(current_layer_stats[root_idx]).gain;
    }
    best_gain -= num_elements * state->tree_complexity_regularization();

    ObliviousSplitInfo oblivious_split_info;
    auto* equality_split =
        oblivious_split_info.mutable_split_node()
            ->mutable_oblivious_categorical_id_binary_split();
    equality_split->set_feature_column(state->feature_column_group_id());
    equality_split->set_feature_id(best_feature_id);
    (*gains)(0) = best_gain;

    for (int root_idx = 0; root_idx < num_elements; root_idx++) {
      auto* left_child = oblivious_split_info.add_children();
      auto* right_child = oblivious_split_info.add_children();

      state->FillLeaf(best_left_node_stats[root_idx], left_child);
      state->FillLeaf(best_right_node_stats[root_idx], right_child);

      const int start_index = partition_boundaries[root_idx];
      (*output_partition_ids)(root_idx) = partition_ids(start_index);
      oblivious_split_info.add_children_parent_id(partition_ids(start_index));
    }
    oblivious_split_info.SerializeToString(&(*output_splits)(0));
  }
};

REGISTER_KERNEL_BUILDER(
    Name("BuildCategoricalEqualitySplits").Device(DEVICE_CPU),
    BuildCategoricalEqualitySplitsOp);

}  // namespace tensorflow