// Copyright 2016 The TensorFlow Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
#include "tensorflow/contrib/tensor_forest/kernels/tree_utils.h"

#include <algorithm>
#include <cfloat>
#include <cmath>
#include <functional>
#include <vector>

#include "tensorflow/core/lib/random/philox_random.h"
#include "tensorflow/core/platform/logging.h"

namespace tensorflow {
namespace tensorforest {

using tensorflow::Tensor;

DataColumnTypes FindDenseFeatureSpec(
    int32 input_feature, const tensorforest::TensorForestDataSpec& spec) {
  return static_cast<DataColumnTypes>(spec.GetDenseFeatureType(input_feature));
}

DataColumnTypes FindSparseFeatureSpec(
    int32 input_feature, const tensorforest::TensorForestDataSpec& spec) {
  // TODO(thomaswc): Binary search here, especially when we start using more
  // than one sparse column.
  int32 size_sum = spec.sparse(0).size();
  int32 column_num = 0;
  while (input_feature >= size_sum && column_num < spec.sparse_size()) {
    ++column_num;
    size_sum += spec.sparse(column_num).size();
  }
  return static_cast<DataColumnTypes>(spec.sparse(column_num).original_type());
}

// Scans scores 0 .. max-1 and records the lowest and second-lowest scores
// and their indices.
void GetTwoBest(int max, const std::function<float(int)>& score_fn,
                float* best_score, int* best_index, float* second_best_score,
                int* second_best_index) {
  *best_index = -1;
  *second_best_index = -1;
  *best_score = FLT_MAX;
  *second_best_score = FLT_MAX;
  for (int i = 0; i < max; i++) {
    float score = score_fn(i);
    if (score < *best_score) {
      *second_best_score = *best_score;
      *second_best_index = *best_index;
      *best_score = score;
      *best_index = i;
    } else if (score < *second_best_score) {
      *second_best_score = score;
      *second_best_index = i;
    }
  }
}
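// Example (a minimal sketch, not part of the library): how a caller might use
// GetTwoBest with an ad-hoc score function. The lambda is a hypothetical
// stand-in for ClassificationSplitScore / RegressionSplitScore below.
//
//   std::vector<float> scores = {0.4f, 0.1f, 0.9f, 0.2f};
//   float best, second_best;
//   int best_i, second_best_i;
//   GetTwoBest(static_cast<int>(scores.size()),
//              [&scores](int i) { return scores[i]; },
//              &best, &best_i, &second_best, &second_best_i);
//   // Lower is better: best == 0.1f (index 1), second_best == 0.2f (index 3).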
float ClassificationSplitScore(
    const Eigen::Tensor<float, 1, Eigen::RowMajor>& splits,
    const Eigen::Tensor<float, 1, Eigen::RowMajor>& rights, int32 num_classes,
    int i) {
  Eigen::array<int, 1> offsets;
  // Class counts are stored with the total in [0], so the length of each
  // count vector is num_classes + 1.
  offsets[0] = i * (num_classes + 1) + 1;
  Eigen::array<int, 1> extents;
  extents[0] = num_classes;
  return WeightedGiniImpurity(splits.slice(offsets, extents)) +
         WeightedGiniImpurity(rights.slice(offsets, extents));
}

void GetTwoBestClassification(const Tensor& total_counts,
                              const Tensor& split_counts, int32 accumulator,
                              float* best_score, int* best_index,
                              float* second_best_score,
                              int* second_best_index) {
  const int32 num_splits =
      static_cast<int32>(split_counts.shape().dim_size(1));
  const int32 num_classes =
      static_cast<int32>(split_counts.shape().dim_size(2)) - 1;

  // Ideally, Eigen::Tensor::chip would be best to use here but it results
  // in seg faults, so we have to go with flat views of these tensors.
  // However, it is still pretty efficient because we put off evaluation
  // until the score is actually returned.
  const auto tc =
      total_counts.Slice(accumulator, accumulator + 1).unaligned_flat<float>();

  // TODO(gilberth): See if we can delay evaluation here by templating the
  // arguments to ClassificationSplitScore.
  const Eigen::Tensor<float, 1, Eigen::RowMajor> splits =
      split_counts.Slice(accumulator, accumulator + 1).unaligned_flat<float>();

  Eigen::array<int, 1> bcast;
  bcast[0] = num_splits;
  const Eigen::Tensor<float, 1, Eigen::RowMajor> rights =
      tc.broadcast(bcast) - splits;

  std::function<float(int)> score_fn =
      std::bind(ClassificationSplitScore, splits, rights, num_classes,
                std::placeholders::_1);

  GetTwoBest(num_splits, score_fn, best_score, best_index, second_best_score,
             second_best_index);
}

int32 BestFeatureClassification(const Tensor& total_counts,
                                const Tensor& split_counts,
                                int32 accumulator) {
  float best_score;
  float second_best_score;
  int best_feature_index;
  int second_best_index;
  GetTwoBestClassification(total_counts, split_counts, accumulator,
                           &best_score, &best_feature_index,
                           &second_best_score, &second_best_index);
  return best_feature_index;
}
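// Worked example (hypothetical numbers, for illustration only): with
// num_classes = 2, every count vector is laid out as [total, count_0,
// count_1]. If the accumulator's totals are {30, 18, 12} and one candidate
// split's left counts are {10, 6, 4}, the broadcast subtraction above gives
// that split's right counts as {20, 12, 8}. ClassificationSplitScore then
// sums the weighted Gini impurities of {6, 4} and {12, 8}, skipping the
// totals at offset 0.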
float RegressionSplitScore(
    const Eigen::Tensor<float, 3, Eigen::RowMajor>& splits_count_accessor,
    const Eigen::Tensor<float, 2, Eigen::RowMajor>& totals_count_accessor,
    const Eigen::Tensor<float, 1, Eigen::RowMajor>& splits_sum,
    const Eigen::Tensor<float, 1, Eigen::RowMajor>& splits_square,
    const Eigen::Tensor<float, 1, Eigen::RowMajor>& right_sums,
    const Eigen::Tensor<float, 1, Eigen::RowMajor>& right_squares,
    int32 accumulator, int32 num_regression_dims, int i) {
  Eigen::array<int, 1> offsets = {i * num_regression_dims + 1};
  Eigen::array<int, 1> extents = {num_regression_dims - 1};
  float left_count = splits_count_accessor(accumulator, i, 0);
  float right_count = totals_count_accessor(accumulator, 0) - left_count;

  float score = 0;

  // Guard against divide-by-zero.
  if (left_count > 0) {
    score +=
        WeightedVariance(splits_sum.slice(offsets, extents),
                         splits_square.slice(offsets, extents), left_count);
  }

  if (right_count > 0) {
    score +=
        WeightedVariance(right_sums.slice(offsets, extents),
                         right_squares.slice(offsets, extents), right_count);
  }
  return score;
}

void GetTwoBestRegression(const Tensor& total_sums, const Tensor& total_squares,
                          const Tensor& split_sums, const Tensor& split_squares,
                          int32 accumulator, float* best_score, int* best_index,
                          float* second_best_score, int* second_best_index) {
  const int32 num_splits = static_cast<int32>(split_sums.shape().dim_size(1));
  const int32 num_regression_dims =
      static_cast<int32>(split_sums.shape().dim_size(2));
  // Ideally, Eigen::Tensor::chip would be best to use here but it results
  // in seg faults, so we have to go with flat views of these tensors.
  // However, it is still pretty efficient because we put off evaluation
  // until the score is actually returned.
  const auto tc_sum =
      total_sums.Slice(accumulator, accumulator + 1).unaligned_flat<float>();
  const auto tc_square =
      total_squares.Slice(accumulator, accumulator + 1).unaligned_flat<float>();
  const auto splits_sum =
      split_sums.Slice(accumulator, accumulator + 1).unaligned_flat<float>();
  const auto splits_square =
      split_squares.Slice(accumulator, accumulator + 1).unaligned_flat<float>();
  // Eigen is infuriating to work with, usually resulting in all kinds of
  // unhelpful compiler errors when trying something that seems sane. This
  // helps us do a simple thing like access the first element (the counts)
  // of these tensors so we can calculate expected value in Variance().
  const auto splits_count_accessor = split_sums.tensor<float, 3>();
  const auto totals_count_accessor = total_sums.tensor<float, 2>();

  Eigen::array<int, 1> bcast;
  bcast[0] = num_splits;
  const auto right_sums = tc_sum.broadcast(bcast) - splits_sum;
  const auto right_squares = tc_square.broadcast(bcast) - splits_square;

  GetTwoBest(num_splits,
             std::bind(RegressionSplitScore, splits_count_accessor,
                       totals_count_accessor, splits_sum, splits_square,
                       right_sums, right_squares, accumulator,
                       num_regression_dims, std::placeholders::_1),
             best_score, best_index, second_best_score, second_best_index);
}

int32 BestFeatureRegression(const Tensor& total_sums,
                            const Tensor& total_squares,
                            const Tensor& split_sums,
                            const Tensor& split_squares, int32 accumulator) {
  float best_score;
  float second_best_score;
  int best_feature_index;
  int second_best_index;
  GetTwoBestRegression(total_sums, total_squares, split_sums, split_squares,
                       accumulator, &best_score, &best_feature_index,
                       &second_best_score, &second_best_index);
  return best_feature_index;
}

bool BestSplitDominatesRegression(const Tensor& total_sums,
                                  const Tensor& total_squares,
                                  const Tensor& split_sums,
                                  const Tensor& split_squares,
                                  int32 accumulator) {
  // TODO(thomaswc): Implement this, probably as part of v3.
  return false;
}
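// Worked example (hypothetical numbers; assumes WeightedVariance computes
// count * (E[x^2] - E[x]^2) summed over dimensions, as its use above
// suggests): for one regression output, a left branch with sum = 6,
// sum-of-squares = 14 and count = 3 contributes 3 * (14/3 - (6/3)^2) =
// 14 - 12 = 2 to the score. The right branch is scored from the
// complementary (total - left) statistics, and the split with the smallest
// combined score wins in GetTwoBest.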
int BootstrapGini(int n, int s, const random::DistributionSampler& ds,
                  random::SimplePhilox* rand) {
  std::vector<int> counts(s, 0);
  for (int i = 0; i < n; i++) {
    int j = ds.Sample(rand);
    counts[j] += 1;
  }
  int g = 0;
  for (int j = 0; j < s; j++) {
    g += counts[j] * counts[j];
  }
  // The true Gini is 1 + (-g) / n^2.
  return -g;
}

// Populate *weights with the smoothed per-class frequencies needed to
// initialize a DistributionSampler. Returns the total number of samples
// seen by this accumulator.
int MakeBootstrapWeights(const Tensor& total_counts, const Tensor& split_counts,
                         int32 accumulator, int index,
                         std::vector<float>* weights) {
  const int32 num_classes =
      static_cast<int32>(split_counts.shape().dim_size(2)) - 1;

  auto tc = total_counts.tensor<float, 2>();
  auto lc = split_counts.tensor<float, 3>();

  int n = tc(accumulator, 0);

  float denom = static_cast<float>(n) + static_cast<float>(num_classes);

  weights->resize(num_classes * 2);
  for (int i = 0; i < num_classes; i++) {
    // Use the Laplace smoothed per-class probabilities when generating the
    // bootstrap samples.
    float left_count = lc(accumulator, index, i + 1);
    (*weights)[i] = (left_count + 1.0) / denom;
    float right_count = tc(accumulator, i + 1) - left_count;
    (*weights)[num_classes + i] = (right_count + 1.0) / denom;
  }

  return n;
}
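// Worked example (hypothetical numbers): a bootstrap draw of n = 4 samples
// over s = 2 classes that lands as counts {2, 2} gives g = 2*2 + 2*2 = 8,
// i.e. a true Gini impurity of 1 - g/n^2 = 0.5, and BootstrapGini returns
// -8. Both splits compared below share the same n (they read the same
// accumulator total), so comparing the raw -g values preserves the ordering
// of the true Gini impurities.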
bool BestSplitDominatesClassificationBootstrap(const Tensor& total_counts,
                                               const Tensor& split_counts,
                                               int32 accumulator,
                                               float dominate_fraction,
                                               random::SimplePhilox* rand) {
  float best_score;
  float second_best_score;
  int best_feature_index;
  int second_best_index;
  GetTwoBestClassification(total_counts, split_counts, accumulator,
                           &best_score, &best_feature_index,
                           &second_best_score, &second_best_index);

  std::vector<float> weights1;
  int n1 = MakeBootstrapWeights(total_counts, split_counts, accumulator,
                                best_feature_index, &weights1);
  random::DistributionSampler ds1(weights1);

  std::vector<float> weights2;
  int n2 = MakeBootstrapWeights(total_counts, split_counts, accumulator,
                                second_best_index, &weights2);
  random::DistributionSampler ds2(weights2);

  const int32 num_classes =
      static_cast<int32>(split_counts.shape().dim_size(2)) - 1;

  float p = 1.0 - dominate_fraction;
  if (p <= 0 || p > 1.0) {
    LOG(FATAL) << "Invalid dominate fraction " << dominate_fraction;
  }

  // The number of bootstrap samples grows as dominate_fraction approaches 1:
  // one plus ceil(log2(1 / p)) doublings of p.
  int bootstrap_samples = 1;
  while (p < 1.0) {
    bootstrap_samples += 1;
    p = p * 2;
  }

  int worst_g1 = 0;
  for (int i = 0; i < bootstrap_samples; i++) {
    int g1 = BootstrapGini(n1, 2 * num_classes, ds1, rand);
    worst_g1 = std::max(worst_g1, g1);
  }

  // BootstrapGini returns -g <= 0, so 99 is effectively +infinity here.
  int best_g2 = 99;
  for (int i = 0; i < bootstrap_samples; i++) {
    int g2 = BootstrapGini(n2, 2 * num_classes, ds2, rand);
    best_g2 = std::min(best_g2, g2);
  }

  return worst_g1 < best_g2;
}

bool BestSplitDominatesClassificationHoeffding(const Tensor& total_counts,
                                               const Tensor& split_counts,
                                               int32 accumulator,
                                               float dominate_fraction) {
  float best_score;
  float second_best_score;
  int best_feature_index;
  int second_best_index;
  VLOG(1) << "BSDC for accumulator " << accumulator;
  GetTwoBestClassification(total_counts, split_counts, accumulator,
                           &best_score, &best_feature_index,
                           &second_best_score, &second_best_index);
  VLOG(1) << "Best score = " << best_score;
  VLOG(1) << "2nd best score = " << second_best_score;

  const int32 num_classes =
      static_cast<int32>(split_counts.shape().dim_size(2)) - 1;
  const float n = total_counts.Slice(accumulator, accumulator + 1)
                      .unaligned_flat<float>()(0);

  // Each term in the Gini impurity can range from 0 to 0.5 * 0.5.
  float range = 0.25 * static_cast<float>(num_classes) * n;

  float hoeffding_bound =
      range * sqrt(log(1.0 / (1.0 - dominate_fraction)) / (2.0 * n));

  VLOG(1) << "num_classes = " << num_classes;
  VLOG(1) << "n = " << n;
  VLOG(1) << "range = " << range;
  VLOG(1) << "hoeffding_bound = " << hoeffding_bound;
  return (second_best_score - best_score) > hoeffding_bound;
}
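// Worked example (hypothetical numbers): with num_classes = 2, n = 1000 and
// dominate_fraction = 0.99, range = 0.25 * 2 * 1000 = 500 and
// hoeffding_bound = 500 * sqrt(ln(100) / 2000) ~= 24. The best split is
// declared dominant only if its (weighted) Gini score beats the runner-up
// by more than that margin.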
double DirichletCovarianceTrace(const Tensor& total_counts,
                                const Tensor& split_counts, int32 accumulator,
                                int index) {
  const int32 num_classes =
      static_cast<int32>(split_counts.shape().dim_size(2)) - 1;

  auto tc = total_counts.tensor<float, 2>();
  auto lc = split_counts.tensor<float, 3>();

  double leftc = 0.0;
  double leftc2 = 0.0;
  double rightc = 0.0;
  double rightc2 = 0.0;
  for (int i = 1; i <= num_classes; i++) {
    double l = lc(accumulator, index, i) + 1.0;
    leftc += l;
    leftc2 += l * l;

    double r = tc(accumulator, i) - lc(accumulator, index, i) + 1.0;
    rightc += r;
    rightc2 += r * r;
  }

  double left_trace = (1.0 - leftc2 / (leftc * leftc)) / (leftc + 1.0);
  double right_trace = (1.0 - rightc2 / (rightc * rightc)) / (rightc + 1.0);
  return left_trace + right_trace;
}

void getDirichletMean(const Tensor& total_counts, const Tensor& split_counts,
                      int32 accumulator, int index, std::vector<float>* mu) {
  const int32 num_classes =
      static_cast<int32>(split_counts.shape().dim_size(2)) - 1;

  mu->resize(num_classes * 2);
  auto tc = total_counts.tensor<float, 2>();
  auto lc = split_counts.tensor<float, 3>();

  double total = tc(accumulator, 0);

  for (int i = 0; i < num_classes; i++) {
    double l = lc(accumulator, index, i + 1);
    mu->at(i) = (l + 1.0) / (total + num_classes);

    // Class counts are stored starting at [1], so the right count for class
    // i comes from tc(accumulator, i + 1).
    double r = tc(accumulator, i + 1) - l;
    mu->at(i + num_classes) = (r + 1.0) / (total + num_classes);
  }
}

// Given lambda3, returns the distance from (mu1, mu2) to the surface.
double getDistanceFromLambda3(double lambda3, const std::vector<float>& mu1,
                              const std::vector<float>& mu2) {
  if (fabs(lambda3) == 1.0) {
    return 0.0;
  }

  int n = mu1.size();
  double lambda1 = -2.0 * lambda3 / n;
  double lambda2 = 2.0 * lambda3 / n;

  // From below,
  //   x = (lambda_1 1 + 2 mu1) / (2 - 2 lambda_3)
  //   y = (lambda_2 1 + 2 mu2) / (2 + 2 lambda_3)
  double dist = 0.0;
  for (size_t i = 0; i < mu1.size(); i++) {
    double diff = (lambda1 + 2.0 * mu1[i]) / (2.0 - 2.0 * lambda3) - mu1[i];
    dist += diff * diff;

    diff = (lambda2 + 2.0 * mu2[i]) / (2.0 + 2.0 * lambda3) - mu2[i];
    dist += diff * diff;
  }
  return dist;
}
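// Worked example (hypothetical numbers): with num_classes = 2 and an
// accumulator total of 10, left counts {6, 2} and right counts {1, 1} make
// getDirichletMean produce mu = {7/12, 3/12, 2/12, 2/12}: each count gets +1
// and the denominator is total + num_classes = 12, mirroring the mean of a
// Dirichlet posterior with add-one (Laplace) smoothing.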
// Returns the distance between (mu1, mu2) and (x, y), where (x, y) is the
// nearest point that lies on the surface defined by
// {x dot 1 = 1, y dot 1 = 1, x dot x - y dot y = 0}.
double getChebyshevEpsilon(const std::vector<float>& mu1,
                           const std::vector<float>& mu2) {
  // Math time!!
  // We are trying to minimize d = |mu1 - x|^2 + |mu2 - y|^2 over the surface.
  // Using Lagrange multipliers, we get
  //   partial d / partial x = -2 mu1 + 2 x = lambda_1 1 + 2 lambda_3 x
  //   partial d / partial y = -2 mu2 + 2 y = lambda_2 1 - 2 lambda_3 y
  // or
  //   x = (lambda_1 1 + 2 mu1) / (2 - 2 lambda_3)
  //   y = (lambda_2 1 + 2 mu2) / (2 + 2 lambda_3)
  // which implies
  //   2 - 2 lambda_3 = lambda_1 1 dot 1 + 2 mu1 dot 1
  //   2 + 2 lambda_3 = lambda_2 1 dot 1 + 2 mu2 dot 1
  //   |lambda_1 1 + 2 mu1|^2 (2 + 2 lambda_3)^2 =
  //     |lambda_2 1 + 2 mu2|^2 (2 - 2 lambda_3)^2
  // So solving for the lambdas and using the fact that
  // mu1 dot 1 = 1 and mu2 dot 1 = 1,
  //   lambda_1 = -2 lambda_3 / (1 dot 1)
  //   lambda_2 = 2 lambda_3 / (1 dot 1)
  // and (letting n = 1 dot 1)
  //   |-lambda_3 1 + n mu1|^2 (1 + lambda_3)^2 =
  //     |lambda_3 1 + n mu2|^2 (1 - lambda_3)^2
  // or
  //   (lambda_3^2 n - 2 n lambda_3 + n^2 mu1 dot mu1)(1 + lambda_3)^2 =
  //     (lambda_3^2 n + 2 n lambda_3 + n^2 mu2 dot mu2)(1 - lambda_3)^2
  // or
  //   (lambda_3^2 - 2 lambda_3 + n mu1 dot mu1)(1 + 2 lambda_3 + lambda_3^2) =
  //     (lambda_3^2 + 2 lambda_3 + n mu2 dot mu2)(1 - 2 lambda_3 + lambda_3^2)
  // or
  //   lambda_3^2 - 2 lambda_3 + n mu1 dot mu1
  //     + 2 lambda_3^3 - 2 lambda_3^2 + 2n lambda_3 mu1 dot mu1
  //     + lambda_3^4 - 2 lambda_3^3 + n lambda_3^2 mu1 dot mu1
  //   =
  //   lambda_3^2 + 2 lambda_3 + n mu2 dot mu2
  //     - 2 lambda_3^3 - 4 lambda_3^2 - 2n lambda_3 mu2 dot mu2
  //     + lambda_3^4 + 2 lambda_3^3 + n lambda_3^2 mu2 dot mu2
  // or
  //   - 2 lambda_3 + n mu1 dot mu1
  //     - 2 lambda_3^2 + 2n lambda_3 mu1 dot mu1
  //     + n lambda_3^2 mu1 dot mu1
  //   =
  //   + 2 lambda_3 + n mu2 dot mu2
  //     - 4 lambda_3^2 - 2n lambda_3 mu2 dot mu2
  //     + n lambda_3^2 mu2 dot mu2
  // or
  //   lambda_3^2 (2 + n mu1 dot mu1 + n mu2 dot mu2)
  //     + lambda_3 (2n mu1 dot mu1 + 2n mu2 dot mu2 - 4)
  //     + n mu1 dot mu1 - n mu2 dot mu2 = 0
  // which can be solved using the quadratic formula.
  int n = mu1.size();
  double len1 = 0.0;
  for (float m : mu1) {
    len1 += m * m;
  }
  double len2 = 0.0;
  for (float m : mu2) {
    len2 += m * m;
  }
  double a = 2 + n * (len1 + len2);
  double b = 2 * n * (len1 + len2) - 4;
  double c = n * (len1 - len2);
  double discrim = b * b - 4 * a * c;
  if (discrim < 0.0) {
    LOG(WARNING) << "Negative discriminant " << discrim;
    return 0.0;
  }

  double sdiscrim = sqrt(discrim);
  // TODO(thomaswc): Analyze whether one of these is always closer.
  double v1 = (-b + sdiscrim) / (2 * a);
  double v2 = (-b - sdiscrim) / (2 * a);
  double dist1 = getDistanceFromLambda3(v1, mu1, mu2);
  double dist2 = getDistanceFromLambda3(v2, mu1, mu2);
  return std::min(dist1, dist2);
}

bool BestSplitDominatesClassificationChebyshev(const Tensor& total_counts,
                                               const Tensor& split_counts,
                                               int32 accumulator,
                                               float dominate_fraction) {
  float best_score;
  float second_best_score;
  int best_feature_index;
  int second_best_index;
  VLOG(1) << "BSDC for accumulator " << accumulator;
  GetTwoBestClassification(total_counts, split_counts, accumulator,
                           &best_score, &best_feature_index,
                           &second_best_score, &second_best_index);
  VLOG(1) << "Best score = " << best_score;
  VLOG(1) << "2nd best score = " << second_best_score;

  const int32 num_classes =
      static_cast<int32>(split_counts.shape().dim_size(2)) - 1;
  const float n = total_counts.Slice(accumulator, accumulator + 1)
                      .unaligned_flat<float>()(0);

  VLOG(1) << "num_classes = " << num_classes;
  VLOG(1) << "n = " << n;
  double trace = DirichletCovarianceTrace(total_counts, split_counts,
                                          accumulator, best_feature_index) +
                 DirichletCovarianceTrace(total_counts, split_counts,
                                          accumulator, second_best_index);

  std::vector<float> mu1;
  getDirichletMean(total_counts, split_counts, accumulator, best_feature_index,
                   &mu1);
  std::vector<float> mu2;
  getDirichletMean(total_counts, split_counts, accumulator, second_best_index,
                   &mu2);
  double epsilon = getChebyshevEpsilon(mu1, mu2);

  if (epsilon == 0.0) {
    return false;
  }

  double dirichlet_bound = 1.0 - trace / (epsilon * epsilon);
  return dirichlet_bound > dominate_fraction;
}

GetFeatureFnType GetDenseFunctor(const Tensor& dense) {
  if (dense.shape().dims() == 2) {
    const auto dense_features = dense.matrix<float>();
    // Here we capture by value, which shouldn't incur a copy of the data
    // because of the underlying use of Eigen::TensorMap.
    return [dense_features](int32 i, int32 feature) {
      return dense_features(i, feature);
    };
  } else {
    return [](int32 i, int32 feature) {
      LOG(ERROR) << "trying to access nonexistent dense features.";
      return 0;
    };
  }
}

GetFeatureFnType GetSparseFunctor(const Tensor& sparse_indices,
                                  const Tensor& sparse_values) {
  if (sparse_indices.shape().dims() == 2) {
    const auto indices = sparse_indices.matrix<int64>();
    const auto values = sparse_values.vec<float>();
    // Here we capture by value, which shouldn't incur a copy of the data
    // because of the underlying use of Eigen::TensorMap.
    return [indices, values](int32 i, int32 feature) {
      return tensorforest::FindSparseValue(indices, values, i, feature);
    };
  } else {
    return [](int32 i, int32 feature) {
      LOG(ERROR) << "trying to access nonexistent sparse features.";
      return 0;
    };
  }
}

bool DecideNode(const GetFeatureFnType& get_dense,
                const GetFeatureFnType& get_sparse, int32 i, int32 feature,
                float bias, const tensorforest::TensorForestDataSpec& spec) {
  if (feature < spec.dense_features_size()) {
    return Decide(get_dense(i, feature), bias,
                  FindDenseFeatureSpec(feature, spec));
  } else {
    const int32 sparse_feature = feature - spec.dense_features_size();
    return Decide(get_sparse(i, sparse_feature), bias,
                  FindSparseFeatureSpec(sparse_feature, spec));
  }
}

bool Decide(float value, float bias, DataColumnTypes type) {
  switch (type) {
    case kDataFloat:
      return value >= bias;

    case kDataCategorical:
      // We arbitrarily define categorical equality as going left.
      return value != bias;

    default:
      LOG(ERROR) << "Got unknown column type: " << type;
      return false;
  }
}
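// Example (a minimal sketch, not part of the library): routing one input row
// through a node's test. The tensor and spec names are placeholders.
//
//   const auto get_dense = GetDenseFunctor(dense_tensor);
//   const auto get_sparse = GetSparseFunctor(sparse_indices, sparse_values);
//   // For a float column this tests feature_value >= bias; true sends the
//   // input right (categorical equality goes left, per Decide above).
//   bool go_right = DecideNode(get_dense, get_sparse, row, feature, bias,
//                              spec);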
void GetParentWeightedMean(float leaf_sum, const float* leaf_data,
                           float parent_sum, const float* parent_data,
                           float valid_leaf_threshold, int num_outputs,
                           std::vector<float>* mean) {
  float parent_weight = 0.0;
  if (leaf_sum < valid_leaf_threshold && parent_sum > 0) {
    VLOG(1) << "not enough samples at leaf, including parent counts."
            << " child sum = " << leaf_sum;
    // Weight the parent's counts just enough so that the new sum is
    // valid_leaf_threshold, but never give any counts a weight of
    // more than 1.
    parent_weight =
        std::min(1.0f, (valid_leaf_threshold - leaf_sum) / parent_sum);
    leaf_sum += parent_weight * parent_sum;
    VLOG(1) << "Sum w/ parent included = " << leaf_sum;
  }

  for (int c = 0; c < num_outputs; c++) {
    float w = leaf_data[c];
    if (parent_weight > 0.0) {
      w += parent_weight * parent_data[c];
    }
    (*mean)[c] = w / leaf_sum;
  }
}

}  // namespace tensorforest
}  // namespace tensorflow
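// Worked example (hypothetical numbers) for GetParentWeightedMean: with
// valid_leaf_threshold = 10, a leaf holding leaf_sum = 4 samples and a parent
// holding parent_sum = 20, parent_weight = min(1, (10 - 4) / 20) = 0.3, so
// the effective sum becomes 4 + 0.3 * 20 = 10 and each output c is averaged
// as (leaf_data[c] + 0.3 * parent_data[c]) / 10. The leaf thus backs off
// smoothly to the parent's distribution when it has too few samples of its
// own.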