// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
#include <limits>
#include <memory>
#include <string>
#include <vector>
#include "tensorflow/contrib/boosted_trees/lib/learner/common/stats/node-stats.h"
#include "tensorflow/contrib/boosted_trees/proto/split_info.pb.h"
#include "tensorflow/contrib/boosted_trees/proto/tree_config.pb.h"
#include "tensorflow/core/framework/device_base.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/lib/random/random.h"
#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/platform/protobuf.h"
#include "tensorflow/core/platform/types.h"
namespace tensorflow {
using boosted_trees::learner::LearnerConfig;
using boosted_trees::learner::LearnerConfig_MultiClassStrategy;
using boosted_trees::learner::ObliviousSplitInfo;
using boosted_trees::learner::SplitInfo;
using boosted_trees::learner::stochastic::GradientStats;
using boosted_trees::learner::stochastic::NodeStats;
namespace {
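// Sentinel dimension id appended after the last real dimension of each
// partition in the sparse split handler below, so that end indices can be
// read uniformly from the following entry.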
const int32 DUMMY_FEATURE_DIMENSION = -1;
} // namespace
class SplitBuilderState {
public:
explicit SplitBuilderState(OpKernelContext* const context) {
const Tensor* l1_regularization_t;
OP_REQUIRES_OK(context,
context->input("l1_regularization", &l1_regularization_t));
const Tensor* l2_regularization_t;
OP_REQUIRES_OK(context,
context->input("l2_regularization", &l2_regularization_t));
const Tensor* tree_complexity_regularization_t;
OP_REQUIRES_OK(context, context->input("tree_complexity_regularization",
&tree_complexity_regularization_t));
const Tensor* min_node_weight_t;
OP_REQUIRES_OK(context,
context->input("min_node_weight", &min_node_weight_t));
const Tensor* feature_column_group_id_t;
OP_REQUIRES_OK(context, context->input("feature_column_group_id",
&feature_column_group_id_t));
const Tensor* multiclass_strategy_t;
OP_REQUIRES_OK(
context, context->input("multiclass_strategy", &multiclass_strategy_t));
int strategy = multiclass_strategy_t->scalar<int32>()();
OP_REQUIRES(
context,
boosted_trees::learner::LearnerConfig_MultiClassStrategy_IsValid(
strategy),
errors::InvalidArgument("Wrong multiclass strategy passed."));
multiclass_strategy_ = LearnerConfig_MultiClassStrategy(strategy);
const Tensor* class_id_t;
OP_REQUIRES_OK(context, context->input("class_id", &class_id_t));
OP_REQUIRES(context, TensorShapeUtils::IsScalar(class_id_t->shape()),
errors::InvalidArgument("class_id must be a scalar."));
class_id_ = class_id_t->scalar<int32>()();
l1_regularization_ = l1_regularization_t->scalar<float>()();
l2_regularization_ = l2_regularization_t->scalar<float>()();
tree_complexity_regularization_ =
tree_complexity_regularization_t->scalar<float>()();
min_node_weight_ = min_node_weight_t->scalar<float>()();
feature_column_group_id_ = feature_column_group_id_t->scalar<int32>()();
}
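// Computes the regularized node statistics (weight contribution and gain)
// for a set of accumulated gradients. Roughly, for a single class the gain
// behaves like g^2 / (h + l2) with g shrunk toward zero by the l1 penalty;
// see node-stats.h for the exact multiclass handling.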
NodeStats ComputeNodeStats(const GradientStats& grad_stats) {
return NodeStats(l1_regularization_, l2_regularization_, min_node_weight_,
multiclass_strategy_, grad_stats);
}
void FillLeaf(const NodeStats& best_node_stats,
boosted_trees::trees::Leaf* leaf) const {
if (class_id_ == -1) {
// This would be the case either for TREE_PER_CLASS with only 2 classes,
// or for other multiclass strategies.
for (float f : best_node_stats.weight_contribution) {
leaf->mutable_vector()->add_value(f);
}
} else {
CHECK(best_node_stats.weight_contribution.size() == 1)
<< "Weight contribution size = "
<< best_node_stats.weight_contribution.size();
leaf->mutable_sparse_vector()->add_index(class_id_);
leaf->mutable_sparse_vector()->add_value(
best_node_stats.weight_contribution[0]);
}
}
int32 feature_column_group_id() { return feature_column_group_id_; }
float tree_complexity_regularization() {
return tree_complexity_regularization_;
}
private:
LearnerConfig_MultiClassStrategy multiclass_strategy_;
float l1_regularization_;
float l2_regularization_;
float tree_complexity_regularization_;
float min_node_weight_;
int32 class_id_;
int32 feature_column_group_id_;
};
class BuildDenseInequalitySplitsOp : public OpKernel {
public:
explicit BuildDenseInequalitySplitsOp(OpKernelConstruction* const context)
: OpKernel(context) {}
void Compute(OpKernelContext* const context) override {
const Tensor* num_minibatches_t;
OP_REQUIRES_OK(context,
context->input("num_minibatches", &num_minibatches_t));
const int64 num_minibatches = num_minibatches_t->scalar<int64>()();
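// Gradients and hessians below are sums accumulated across num_minibatches
// minibatches; normalizer_ratio rescales them to per-pass averages before
// gains are computed.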
const float normalizer_ratio = (1.0f / num_minibatches);
const Tensor* bucket_boundaries_t;
OP_REQUIRES_OK(context,
context->input("bucket_boundaries", &bucket_boundaries_t));
const auto& bucket_boundaries = bucket_boundaries_t->vec<float>();
const Tensor* partition_ids_t;
OP_REQUIRES_OK(context, context->input("partition_ids", &partition_ids_t));
const auto& partition_ids = partition_ids_t->vec<int32>();
const Tensor* bucket_ids_t;
OP_REQUIRES_OK(context, context->input("bucket_ids", &bucket_ids_t));
const auto& bucket_ids = bucket_ids_t->matrix<int64>();
const Tensor* gradients_t;
OP_REQUIRES_OK(context, context->input("gradients", &gradients_t));
const Tensor* hessians_t;
OP_REQUIRES_OK(context, context->input("hessians", &hessians_t));
const Tensor* weak_learner_type_t;
OP_REQUIRES_OK(context,
context->input("weak_learner_type", &weak_learner_type_t));
const int32 weak_learner_type = weak_learner_type_t->scalar<int32>()();
// Find the number of unique partitions before we allocate the output.
std::vector<int32> partition_boundaries;
partition_boundaries.push_back(0);
for (int i = 1; i < partition_ids.size(); ++i) {
if (partition_ids(i) != partition_ids(i - 1)) {
// Make sure the input is sorted by partition_ids.
OP_REQUIRES(context, partition_ids(i) >= partition_ids(i - 1),
errors::InvalidArgument("Input should be sorted."));
partition_boundaries.push_back(i);
}
}
if (partition_ids.size() > 0) {
partition_boundaries.push_back(partition_ids.size());
}
int32 num_elements = partition_boundaries.size() - 1;
// When the handler is inactive, no bucket boundaries are built for it.
if (bucket_boundaries.size() == 0) {
num_elements = 0;
}
Tensor* output_partition_ids_t = nullptr;
OP_REQUIRES_OK(context,
context->allocate_output("output_partition_ids",
TensorShape({num_elements}),
&output_partition_ids_t));
tensorflow::TTypes<int32>::Vec output_partition_ids =
output_partition_ids_t->vec<int32>();
// For a normal tree, we output a split per partition. For an oblivious
// tree, we output one split for all partitions of the layer.
int32 size_output = num_elements;
if (weak_learner_type == LearnerConfig::OBLIVIOUS_DECISION_TREE &&
num_elements > 0) {
size_output = 1;
}
Tensor* gains_t = nullptr;
OP_REQUIRES_OK(context, context->allocate_output(
"gains", TensorShape({size_output}), &gains_t));
tensorflow::TTypes<float>::Vec gains = gains_t->vec<float>();
Tensor* output_splits_t = nullptr;
OP_REQUIRES_OK(context, context->allocate_output("split_infos",
TensorShape({size_output}),
&output_splits_t));
tensorflow::TTypes<string>::Vec output_splits =
output_splits_t->vec<string>();
if (num_elements == 0) {
return;
}
SplitBuilderState state(context);
switch (weak_learner_type) {
case LearnerConfig::NORMAL_DECISION_TREE: {
ComputeNormalDecisionTree(
&state, normalizer_ratio, num_elements, partition_boundaries,
bucket_boundaries, partition_ids, bucket_ids, gradients_t,
hessians_t, &output_partition_ids, &gains, &output_splits);
break;
}
case LearnerConfig::OBLIVIOUS_DECISION_TREE: {
ComputeObliviousDecisionTree(
&state, normalizer_ratio, num_elements, partition_boundaries,
bucket_boundaries, partition_ids, bucket_ids, gradients_t,
hessians_t, &output_partition_ids, &gains, &output_splits);
break;
}
}
}
private:
void ComputeNormalDecisionTree(
SplitBuilderState* state, const float normalizer_ratio,
const int num_elements, const std::vector<int32>& partition_boundaries,
const tensorflow::TTypes<float>::ConstVec& bucket_boundaries,
const tensorflow::TTypes<int32>::ConstVec& partition_ids,
const tensorflow::TTypes<int64>::ConstMatrix& bucket_ids,
const Tensor* gradients_t, const Tensor* hessians_t,
tensorflow::TTypes<int32>::Vec* output_partition_ids,
tensorflow::TTypes<float>::Vec* gains,
tensorflow::TTypes<string>::Vec* output_splits) {
for (int root_idx = 0; root_idx < num_elements; ++root_idx) {
float best_gain = std::numeric_limits<float>::lowest();
int start_index = partition_boundaries[root_idx];
int end_index = partition_boundaries[root_idx + 1];
GradientStats root_gradient_stats;
for (int64 bucket_idx = start_index; bucket_idx < end_index;
++bucket_idx) {
root_gradient_stats +=
GradientStats(*gradients_t, *hessians_t, bucket_idx);
}
root_gradient_stats *= normalizer_ratio;
NodeStats root_stats = state->ComputeNodeStats(root_gradient_stats);
int32 best_bucket_idx = 0;
NodeStats best_right_node_stats(0);
NodeStats best_left_node_stats(0);
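// Sweep the buckets left to right, accumulating stats for the left side;
// the right side follows as root - left, so every candidate threshold is
// scored in constant time.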
GradientStats left_gradient_stats;
for (int64 bucket_idx = start_index; bucket_idx < end_index;
++bucket_idx) {
GradientStats g(*gradients_t, *hessians_t, bucket_idx);
g *= normalizer_ratio;
left_gradient_stats += g;
NodeStats left_stats = state->ComputeNodeStats(left_gradient_stats);
GradientStats right_gradient_stats =
root_gradient_stats - left_gradient_stats;
NodeStats right_stats = state->ComputeNodeStats(right_gradient_stats);
if (left_stats.gain + right_stats.gain > best_gain) {
best_gain = left_stats.gain + right_stats.gain;
best_left_node_stats = left_stats;
best_right_node_stats = right_stats;
best_bucket_idx = bucket_idx;
}
}
SplitInfo split_info;
auto* dense_split =
split_info.mutable_split_node()->mutable_dense_float_binary_split();
dense_split->set_feature_column(state->feature_column_group_id());
dense_split->set_threshold(
bucket_boundaries(bucket_ids(best_bucket_idx, 0)));
auto* left_child = split_info.mutable_left_child();
auto* right_child = split_info.mutable_right_child();
state->FillLeaf(best_left_node_stats, left_child);
state->FillLeaf(best_right_node_stats, right_child);
split_info.SerializeToString(&(*output_splits)(root_idx));
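// Report the gain as the improvement over leaving the node unsplit, minus
// the per-split complexity penalty.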
(*gains)(root_idx) =
best_gain - root_stats.gain - state->tree_complexity_regularization();
(*output_partition_ids)(root_idx) = partition_ids(start_index);
}
}
void ComputeObliviousDecisionTree(
SplitBuilderState* state, const float normalizer_ratio,
const int num_elements, const std::vector<int32>& partition_boundaries,
const tensorflow::TTypes<float>::ConstVec& bucket_boundaries,
const tensorflow::TTypes<int32>::ConstVec& partition_ids,
const tensorflow::TTypes<int64>::ConstMatrix& bucket_ids,
const Tensor* gradients_t, const Tensor* hessians_t,
tensorflow::TTypes<int32>::Vec* output_partition_ids,
tensorflow::TTypes<float>::Vec* gains,
tensorflow::TTypes<string>::Vec* output_splits) {
// Holds the root stats for each node to be split.
std::vector<GradientStats> current_layer_stats;
current_layer_stats.reserve(num_elements);
for (int root_idx = 0; root_idx < num_elements; root_idx++) {
const int start_index = partition_boundaries[root_idx];
const int end_index = partition_boundaries[root_idx + 1];
GradientStats root_gradient_stats;
for (int64 bucket_idx = start_index; bucket_idx < end_index;
++bucket_idx) {
root_gradient_stats +=
GradientStats(*gradients_t, *hessians_t, bucket_idx);
}
root_gradient_stats *= normalizer_ratio;
current_layer_stats.push_back(root_gradient_stats);
}
float best_gain = std::numeric_limits<float>::lowest();
int64 best_bucket_id = 0;
std::vector<NodeStats> best_right_node_stats(num_elements, NodeStats(0));
std::vector<NodeStats> best_left_node_stats(num_elements, NodeStats(0));
std::vector<NodeStats> current_left_node_stats(num_elements, NodeStats(0));
std::vector<NodeStats> current_right_node_stats(num_elements, NodeStats(0));
int64 current_bucket_id = std::numeric_limits<int64>::max();
int64 last_bucket_id = -1;
// Find the lowest bucket id; this is the first bucket id to try.
for (int root_idx = 0; root_idx < num_elements; root_idx++) {
const int start_index = partition_boundaries[root_idx];
if (bucket_ids(start_index, 0) < current_bucket_id) {
current_bucket_id = bucket_ids(start_index, 0);
}
}
// Index offsets into each partition, used to access the gradients of that
// partition for the bucket currently under consideration.
std::vector<int> current_layer_offsets(num_elements, 0);
std::vector<GradientStats> left_gradient_stats(num_elements);
// The idea is to try every bucket id in increasing order. Each iteration
// computes the gain of the layer using the current bucket id as the split
// value and also finds the next bucket id to try.
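// For example, with two partitions whose sorted bucket ids are {1, 3} and
// {2, 3}, the sweep visits bucket ids 1, 2, and 3: at each id, every
// partition that contains it folds that bucket's gradients into its left
// side, and the layer gain is the sum of left+right gains over all
// partitions.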
while (current_bucket_id > last_bucket_id) {
last_bucket_id = current_bucket_id;
int64 next_bucket_id = -1;
for (int root_idx = 0; root_idx < num_elements; root_idx++) {
int idx =
current_layer_offsets[root_idx] + partition_boundaries[root_idx];
const int end_index = partition_boundaries[root_idx + 1];
if (idx < end_index && bucket_ids(idx, 0) == current_bucket_id) {
GradientStats g(*gradients_t, *hessians_t, idx);
g *= normalizer_ratio;
left_gradient_stats[root_idx] += g;
current_layer_offsets[root_idx]++;
idx++;
}
if (idx < end_index &&
(bucket_ids(idx, 0) < next_bucket_id || next_bucket_id == -1)) {
next_bucket_id = bucket_ids(idx, 0);
}
}
float gain_of_split = 0.0;
for (int root_idx = 0; root_idx < num_elements; root_idx++) {
GradientStats right_gradient_stats =
current_layer_stats[root_idx] - left_gradient_stats[root_idx];
NodeStats left_stat =
state->ComputeNodeStats(left_gradient_stats[root_idx]);
NodeStats right_stat = state->ComputeNodeStats(right_gradient_stats);
gain_of_split += left_stat.gain + right_stat.gain;
current_left_node_stats[root_idx] = left_stat;
current_right_node_stats[root_idx] = right_stat;
}
if (gain_of_split > best_gain) {
best_gain = gain_of_split;
best_left_node_stats = current_left_node_stats;
best_right_node_stats = current_right_node_stats;
best_bucket_id = current_bucket_id;
}
current_bucket_id = next_bucket_id;
}
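// Turn the summed left+right gains into an improvement over the unsplit
// layer: subtract each node's root gain and one complexity penalty per
// node in the layer.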
for (int root_idx = 0; root_idx < num_elements; root_idx++) {
best_gain -= state->ComputeNodeStats(current_layer_stats[root_idx]).gain;
}
best_gain -= num_elements * state->tree_complexity_regularization();
ObliviousSplitInfo oblivious_split_info;
auto* oblivious_dense_split =
oblivious_split_info.mutable_split_node()
->mutable_oblivious_dense_float_binary_split();
oblivious_dense_split->set_feature_column(state->feature_column_group_id());
oblivious_dense_split->set_threshold(bucket_boundaries(best_bucket_id));
(*gains)(0) = best_gain;
for (int root_idx = 0; root_idx < num_elements; root_idx++) {
auto* left_child = oblivious_split_info.add_children();
auto* right_child = oblivious_split_info.add_children();
state->FillLeaf(best_left_node_stats[root_idx], left_child);
state->FillLeaf(best_right_node_stats[root_idx], right_child);
const int start_index = partition_boundaries[root_idx];
(*output_partition_ids)(root_idx) = partition_ids(start_index);
oblivious_split_info.add_children_parent_id(partition_ids(start_index));
}
oblivious_split_info.SerializeToString(&(*output_splits)(0));
}
};
REGISTER_KERNEL_BUILDER(Name("BuildDenseInequalitySplits").Device(DEVICE_CPU),
BuildDenseInequalitySplitsOp);
class BuildSparseInequalitySplitsOp : public OpKernel {
public:
explicit BuildSparseInequalitySplitsOp(OpKernelConstruction* const context)
: OpKernel(context) {}
void Compute(OpKernelContext* const context) override {
const Tensor* num_minibatches_t;
OP_REQUIRES_OK(context,
context->input("num_minibatches", &num_minibatches_t));
const int64 num_minibatches = num_minibatches_t->scalar<int64>()();
const float normalizer_ratio = (1.0f / num_minibatches);
const Tensor* bucket_boundaries_t;
OP_REQUIRES_OK(context,
context->input("bucket_boundaries", &bucket_boundaries_t));
const auto& bucket_boundaries = bucket_boundaries_t->vec<float>();
const Tensor* partition_ids_t;
OP_REQUIRES_OK(context, context->input("partition_ids", &partition_ids_t));
const auto& partition_ids = partition_ids_t->vec<int32>();
const Tensor* bucket_ids_t;
OP_REQUIRES_OK(context, context->input("bucket_ids", &bucket_ids_t));
const auto& bucket_ids_and_dimensions = bucket_ids_t->matrix<int64>();
const int32 tensor_elements = partition_ids.size();
const Tensor* gradients_t;
OP_REQUIRES_OK(context, context->input("gradients", &gradients_t));
const Tensor* hessians_t;
OP_REQUIRES_OK(context, context->input("hessians", &hessians_t));
const Tensor* bias_feature_id_t;
OP_REQUIRES_OK(context,
context->input("bias_feature_id", &bias_feature_id_t));
int64 bias_feature_id = bias_feature_id_t->scalar<int64>()();
// For each partition (tree node), store starting index for each dimension.
PartitionAndDimensionBoundaries partition_boundaries;
// Stores indices into partition_boundaries for partitions that are not
// empty (i.e. have at least one bucket besides the catch-all bias bucket
// with bucket id -1 and dimension 0).
std::vector<int32> non_empty_partitions;
bool non_empty_partition = false;
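// Illustrative layout (assuming the bias bucket id is -1): rows arrive
// sorted by partition id, then dimension, then bucket id, with each
// partition starting at its bias row, e.g.
//   row  partition  bucket  dimension
//    0       0        -1        0      <- bias (catch-all) bucket
//    1       0         4        0
//    2       0         2        1
//    3       1        -1        0      <- bias only: empty partition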
for (int i = 0; i < partition_ids.size(); ++i) {
// Make sure the input is sorted by partition_ids.
if (i > 0) {
CHECK_LE(partition_ids(i - 1), partition_ids(i))
<< "Partition ids should be sorted. Not sorted for " << i;
}
const int32 dimension = bucket_ids_and_dimensions(i, 1);
if (i == 0 || (partition_ids(i) != partition_ids(i - 1))) {
if (i != 0) {
// Not the first entry, so partition has changed.
if (non_empty_partition) {
// Save the id of the previous partition in the list of non-empty
// partitions, since it had more than just the bias bucket (-1).
non_empty_partitions.push_back(partition_boundaries.size() - 1);
}
// Add a dummy dimension entry to mark the end of the previous dimension.
partition_boundaries.back().emplace_back(DUMMY_FEATURE_DIMENSION, i);
}
// Allocate for a new partition.
partition_boundaries.emplace_back();
// Save info about the first dimension for a new partition.
partition_boundaries.back().emplace_back(dimension, i);
// Each partition has a dummy bias bucket (-1) holding all gradients,
// followed by entries for the real dimensions, so a partition with more
// than one element is non-empty.
non_empty_partition = (i < partition_ids.size() - 1) &&
(partition_ids(i) == partition_ids(i + 1));
} else if (bucket_ids_and_dimensions(i, 1) !=
bucket_ids_and_dimensions(i - 1, 1)) {
// Dimension changed.
partition_boundaries.back().emplace_back(dimension, i);
}
}
if (tensor_elements > 0) {
if (non_empty_partition) {
non_empty_partitions.push_back(partition_boundaries.size() - 1);
}
// Add a dummy dimension entry to mark the end of the previous dimension.
partition_boundaries.back().emplace_back(DUMMY_FEATURE_DIMENSION,
partition_ids.size());
}
int num_elements = non_empty_partitions.size();
Tensor* output_partition_ids_t = nullptr;
OP_REQUIRES_OK(context,
context->allocate_output("output_partition_ids",
TensorShape({num_elements}),
&output_partition_ids_t));
tensorflow::TTypes<int32>::Vec output_partition_ids =
output_partition_ids_t->vec<int32>();
Tensor* gains_t = nullptr;
OP_REQUIRES_OK(
context, context->allocate_output("gains", TensorShape({num_elements}),
&gains_t));
tensorflow::TTypes<float>::Vec gains = gains_t->vec<float>();
Tensor* output_splits_t = nullptr;
OP_REQUIRES_OK(context, context->allocate_output(
"split_infos", TensorShape({num_elements}),
&output_splits_t));
tensorflow::TTypes<string>::Vec output_splits =
output_splits_t->vec<string>();
SplitBuilderState state(context);
// For each tree node that needs to be split.
for (int root_idx = 0; root_idx < num_elements; ++root_idx) {
const auto& dimension_boundaries =
partition_boundaries[non_empty_partitions[root_idx]];
float best_gain = std::numeric_limits<float>::lowest();
int32 best_dimension_idx = 0;
bool default_right = false;
int32 best_element_idx = 0;
NodeStats best_right_node_stats(0);
NodeStats best_left_node_stats(0);
// For each partition, the first bucket is a dummy catch-all.
int32 bias_start_index = dimension_boundaries[0].start_index;
OP_REQUIRES(
context,
bucket_ids_and_dimensions(bias_start_index, 0) == bias_feature_id,
errors::InvalidArgument("Bias feature ID missing."));
// Dimension for the bias feature is always 0.
OP_REQUIRES(
context, bucket_ids_and_dimensions(bias_start_index, 1) == 0,
errors::InvalidArgument("Bias feature ID must have dimension 0."));
// For each root we sweep the quantized feature buckets once, accumulating
// gradients for the buckets seen so far and deriving the other side from
// the aggregate ("present") gradients. Split gains are evaluated for both
// default directions at every threshold, and the best split is picked.
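// E.g. if the examples missing this feature carry gradient stats m, then
// at a given threshold with present-left stats l and present-right stats
// r, default LEFT scores the split (l + m | r) while default RIGHT scores
// (l | r + m).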
GradientStats root_gradient_stats(*gradients_t, *hessians_t,
bias_start_index);
root_gradient_stats *= normalizer_ratio;
NodeStats root_stats = state.ComputeNodeStats(root_gradient_stats);
// Iterate through dimensions.
for (int j = 0; j < dimension_boundaries.size() - 1; ++j) {
const DimensionBoundary& dimension_and_start = dimension_boundaries[j];
const int32 dimension_id = dimension_and_start.dimension_id;
int start_index = dimension_and_start.start_index;
// Even for the last dimension, we always have an additional dummy
// dimension entry that we can use to find the end index.
const int end_index =
partition_boundaries[non_empty_partitions[root_idx]][j + 1]
.start_index;
if (bucket_ids_and_dimensions(start_index, 0) == bias_feature_id) {
// Dimension 0 starts with the catch-all bias bucket; skip it.
CHECK(bucket_ids_and_dimensions(start_index, 1) == 0)
<< "Dimension of the bias feature should be 0.";
++start_index;
}
GradientStats present_gradient_stats;
for (int64 bucket_idx = start_index; bucket_idx < end_index;
++bucket_idx) {
present_gradient_stats +=
GradientStats(*gradients_t, *hessians_t, bucket_idx);
}
present_gradient_stats *= normalizer_ratio;
GradientStats not_present =
root_gradient_stats - present_gradient_stats;
// If there was (almost) no sparsity, fix the default direction to LEFT.
bool fixed_default_direction = not_present.IsAlmostZero();
GradientStats left_gradient_stats;
for (int64 element_idx = start_index; element_idx < end_index;
++element_idx) {
// Check that bucket ids are sorted.
if (element_idx != start_index) {
CHECK(bucket_ids_and_dimensions(element_idx - 1, 0) <
bucket_ids_and_dimensions(element_idx, 0))
<< "Bucket ids must be sorted."
<< ", problem on " << element_idx << " and dimension is " << j;
}
GradientStats g(*gradients_t, *hessians_t, element_idx);
g *= normalizer_ratio;
left_gradient_stats += g;
// We have the sum of all present gradients. Use that to compute the
// backward pass gradients.
GradientStats right_gradient_stats =
present_gradient_stats - left_gradient_stats;
{
NodeStats left_stats_default_left = state.ComputeNodeStats(
root_gradient_stats - right_gradient_stats);
NodeStats right_stats_default_left =
state.ComputeNodeStats(right_gradient_stats);
if (left_stats_default_left.gain + right_stats_default_left.gain >
best_gain) {
best_gain =
left_stats_default_left.gain + right_stats_default_left.gain;
best_left_node_stats = left_stats_default_left;
best_right_node_stats = right_stats_default_left;
best_element_idx = element_idx;
default_right = false;
best_dimension_idx = dimension_id;
}
}
// Only consider the default-right direction when there were enough
// missing examples.
if (!fixed_default_direction) {
NodeStats left_stats_default_right =
state.ComputeNodeStats(left_gradient_stats);
NodeStats right_stats_default_right = state.ComputeNodeStats(
root_gradient_stats - left_gradient_stats);
if (left_stats_default_right.gain + right_stats_default_right.gain >
best_gain) {
best_gain = left_stats_default_right.gain +
right_stats_default_right.gain;
best_left_node_stats = left_stats_default_right;
best_right_node_stats = right_stats_default_right;
best_element_idx = element_idx;
default_right = true;
best_dimension_idx = dimension_id;
}
}
}
}
SplitInfo split_info;
boosted_trees::trees::DenseFloatBinarySplit* dense_split = nullptr;
if (default_right) {
dense_split = split_info.mutable_split_node()
->mutable_sparse_float_binary_split_default_right()
->mutable_split();
} else {
dense_split = split_info.mutable_split_node()
->mutable_sparse_float_binary_split_default_left()
->mutable_split();
}
dense_split->set_feature_column(state.feature_column_group_id());
// Set the dimension and threshold for the best split.
const int64 best_dimension_id =
bucket_ids_and_dimensions(best_element_idx, 1);
const int32 best_bucket_id =
bucket_ids_and_dimensions(best_element_idx, 0);
dense_split->set_dimension_id(best_dimension_id);
dense_split->set_threshold(bucket_boundaries(best_bucket_id));
auto* left_child = split_info.mutable_left_child();
auto* right_child = split_info.mutable_right_child();
state.FillLeaf(best_left_node_stats, left_child);
state.FillLeaf(best_right_node_stats, right_child);
split_info.SerializeToString(&output_splits(root_idx));
gains(root_idx) =
best_gain - root_stats.gain - state.tree_complexity_regularization();
output_partition_ids(root_idx) = partition_ids(bias_start_index);
}
}
private:
struct DimensionBoundary {
DimensionBoundary(const int32 dimension_id, const int32 start_index)
: dimension_id(dimension_id), start_index(start_index) {}
int32 dimension_id;
int32 start_index;
};
// For each partition, store start indices of feature column dimensions.
typedef std::vector<std::vector<DimensionBoundary>>
PartitionAndDimensionBoundaries;
};
REGISTER_KERNEL_BUILDER(Name("BuildSparseInequalitySplits").Device(DEVICE_CPU),
BuildSparseInequalitySplitsOp);
class BuildCategoricalEqualitySplitsOp : public OpKernel {
public:
explicit BuildCategoricalEqualitySplitsOp(OpKernelConstruction* const context)
: OpKernel(context) {}
void Compute(OpKernelContext* const context) override {
const Tensor* num_minibatches_t;
OP_REQUIRES_OK(context,
context->input("num_minibatches", &num_minibatches_t));
const int64 num_minibatches = num_minibatches_t->scalar<int64>()();
const float normalizer_ratio = (1.0f / num_minibatches);
const Tensor* partition_ids_t;
OP_REQUIRES_OK(context, context->input("partition_ids", &partition_ids_t));
const auto& partition_ids = partition_ids_t->vec<int32>();
const Tensor* feature_ids_t;
OP_REQUIRES_OK(context, context->input("feature_ids", &feature_ids_t));
const auto& feature_ids = feature_ids_t->matrix<int64>();
const Tensor* gradients_t;
OP_REQUIRES_OK(context, context->input("gradients", &gradients_t));
const Tensor* hessians_t;
OP_REQUIRES_OK(context, context->input("hessians", &hessians_t));
const Tensor* bias_feature_id_t;
OP_REQUIRES_OK(context,
context->input("bias_feature_id", &bias_feature_id_t));
int64 bias_feature_id = bias_feature_id_t->scalar<int64>()();
const Tensor* weak_learner_type_t;
OP_REQUIRES_OK(context,
context->input("weak_learner_type", &weak_learner_type_t));
const int32 weak_learner_type = weak_learner_type_t->scalar<int32>()();
// Find the number of unique partitions before we allocate the output.
std::vector<int32> partition_boundaries;
std::vector<int32> non_empty_partitions;
for (int i = 0; i < partition_ids.size() - 1; ++i) {
// Make sure the input is sorted by partition_ids.
CHECK_LE(partition_ids(i), partition_ids(i + 1));
if (i == 0 || partition_ids(i) != partition_ids(i - 1)) {
partition_boundaries.push_back(i);
// Some partitions might only have the bias feature. We don't want to
// split those, so check that the partition has at least two features.
if (partition_ids(i) == partition_ids(i + 1)) {
non_empty_partitions.push_back(partition_boundaries.size() - 1);
}
}
}
if (partition_ids.size() > 0) {
partition_boundaries.push_back(partition_ids.size());
}
int num_elements = non_empty_partitions.size();
Tensor* output_partition_ids_t = nullptr;
OP_REQUIRES_OK(context,
context->allocate_output("output_partition_ids",
TensorShape({num_elements}),
&output_partition_ids_t));
tensorflow::TTypes<int32>::Vec output_partition_ids =
output_partition_ids_t->vec<int32>();
// For a normal tree, we output a split per partition. For an oblivious
// tree, we output one split for all partitions of the layer.
int size_output = num_elements;
if (weak_learner_type == LearnerConfig::OBLIVIOUS_DECISION_TREE &&
num_elements > 0) {
size_output = 1;
}
Tensor* gains_t = nullptr;
OP_REQUIRES_OK(context, context->allocate_output(
"gains", TensorShape({size_output}), &gains_t));
tensorflow::TTypes<float>::Vec gains = gains_t->vec<float>();
Tensor* output_splits_t = nullptr;
OP_REQUIRES_OK(context, context->allocate_output("split_infos",
TensorShape({size_output}),
&output_splits_t));
tensorflow::TTypes<string>::Vec output_splits =
output_splits_t->vec<string>();
if (num_elements == 0) {
return;
}
SplitBuilderState state(context);
switch (weak_learner_type) {
case LearnerConfig::NORMAL_DECISION_TREE: {
ComputeNormalDecisionTree(
context, &state, normalizer_ratio, num_elements,
partition_boundaries, non_empty_partitions, bias_feature_id,
partition_ids, feature_ids, gradients_t, hessians_t,
&output_partition_ids, &gains, &output_splits);
break;
}
case LearnerConfig::OBLIVIOUS_DECISION_TREE: {
ComputeObliviousDecisionTree(
context, &state, normalizer_ratio, num_elements,
partition_boundaries, non_empty_partitions, bias_feature_id,
partition_ids, feature_ids, gradients_t, hessians_t,
&output_partition_ids, &gains, &output_splits);
break;
}
}
}
private:
void ComputeNormalDecisionTree(
OpKernelContext* const context, SplitBuilderState* state,
const float normalizer_ratio, const int num_elements,
const std::vector<int32>& partition_boundaries,
const std::vector<int32>& non_empty_partitions,
const int64 bias_feature_id,
const tensorflow::TTypes<int32>::ConstVec& partition_ids,
const tensorflow::TTypes<int64>::ConstMatrix& feature_ids,
const Tensor* gradients_t, const Tensor* hessians_t,
tensorflow::TTypes<int32>::Vec* output_partition_ids,
tensorflow::TTypes<float>::Vec* gains,
tensorflow::TTypes<string>::Vec* output_splits) {
for (int root_idx = 0; root_idx < num_elements; ++root_idx) {
float best_gain = std::numeric_limits<float>::lowest();
int start_index = partition_boundaries[non_empty_partitions[root_idx]];
int end_index = partition_boundaries[non_empty_partitions[root_idx] + 1];
// First feature ID in each partition should be the bias feature.
OP_REQUIRES(context, feature_ids(start_index, 0) == bias_feature_id,
errors::InvalidArgument("Bias feature ID missing."));
GradientStats root_gradient_stats(*gradients_t, *hessians_t, start_index);
root_gradient_stats *= normalizer_ratio;
NodeStats root_stats = state->ComputeNodeStats(root_gradient_stats);
int32 best_feature_idx = 0;
NodeStats best_right_node_stats(0);
NodeStats best_left_node_stats(0);
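// Each non-bias feature id is a candidate equality split: the stats of the
// matching examples form one child and right = root - left forms the other.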
for (int64 feature_idx = start_index + 1; feature_idx < end_index;
++feature_idx) {
GradientStats left_gradient_stats(*gradients_t, *hessians_t,
feature_idx);
left_gradient_stats *= normalizer_ratio;
GradientStats right_gradient_stats =
root_gradient_stats - left_gradient_stats;
NodeStats left_stats = state->ComputeNodeStats(left_gradient_stats);
NodeStats right_stats = state->ComputeNodeStats(right_gradient_stats);
if (left_stats.gain + right_stats.gain > best_gain) {
best_gain = left_stats.gain + right_stats.gain;
best_left_node_stats = left_stats;
best_right_node_stats = right_stats;
best_feature_idx = feature_idx;
}
}
SplitInfo split_info;
auto* equality_split = split_info.mutable_split_node()
->mutable_categorical_id_binary_split();
equality_split->set_feature_column(state->feature_column_group_id());
CHECK(feature_ids(best_feature_idx, 0) != bias_feature_id)
<< "Unexpected feature ID selected. "
<< "Start feature ID: [" << start_index << "] "
<< feature_ids(start_index, 0) << ", " << feature_ids(start_index, 1)
<< "\nBest feature ID: [" << best_feature_idx << "] "
<< feature_ids(best_feature_idx, 0) << ", "
<< feature_ids(best_feature_idx, 1)
<< "\nPartition IDS: " << partition_ids(start_index) << " "
<< partition_ids(best_feature_idx);
equality_split->set_feature_id(feature_ids(best_feature_idx, 0));
auto* left_child = split_info.mutable_left_child();
auto* right_child = split_info.mutable_right_child();
state->FillLeaf(best_left_node_stats, left_child);
state->FillLeaf(best_right_node_stats, right_child);
split_info.SerializeToString(&(*output_splits)(root_idx));
(*gains)(root_idx) =
best_gain - root_stats.gain - state->tree_complexity_regularization();
(*output_partition_ids)(root_idx) = partition_ids(start_index);
}
}
void ComputeObliviousDecisionTree(
OpKernelContext* const context, SplitBuilderState* state,
const float normalizer_ratio, const int num_elements,
const std::vector<int32>& partition_boundaries,
const std::vector<int32>& non_empty_partitions,
const int64 bias_feature_id,
const tensorflow::TTypes<int32>::ConstVec& partition_ids,
const tensorflow::TTypes<int64>::ConstMatrix& feature_ids,
const Tensor* gradients_t, const Tensor* hessians_t,
tensorflow::TTypes<int32>::Vec* output_partition_ids,
tensorflow::TTypes<float>::Vec* gains,
tensorflow::TTypes<string>::Vec* output_splits) {
// Holds the root stats for each node to be split.
std::vector<GradientStats> current_layer_stats;
current_layer_stats.reserve(num_elements);
for (int root_idx = 0; root_idx < num_elements; root_idx++) {
const int start_index = partition_boundaries[root_idx];
// First feature ID in each partition should be the bias feature.
OP_REQUIRES(context, feature_ids(start_index, 0) == bias_feature_id,
errors::InvalidArgument("Bias feature ID missing."));
GradientStats root_gradient_stats(*gradients_t, *hessians_t, start_index);
root_gradient_stats *= normalizer_ratio;
current_layer_stats.push_back(root_gradient_stats);
}
float best_gain = std::numeric_limits<float>::lowest();
int64 best_feature_id = 0;
std::vector<NodeStats> best_right_node_stats(num_elements, NodeStats(0));
std::vector<NodeStats> best_left_node_stats(num_elements, NodeStats(0));
std::vector<NodeStats> current_left_node_stats(num_elements, NodeStats(0));
std::vector<NodeStats> current_right_node_stats(num_elements, NodeStats(0));
int64 current_feature_id = std::numeric_limits<int64>::max();
int64 last_feature_id = -1;
// Find the lowest feature id; this is the first feature id to try.
for (int root_idx = 0; root_idx < num_elements; root_idx++) {
const int start_index = partition_boundaries[root_idx];
if (feature_ids(start_index + 1, 0) < current_feature_id) {
current_feature_id = feature_ids(start_index + 1, 0);
}
}
// Index offsets into each partition, used to access the gradients of that
// partition for the feature currently under consideration. Start at one
// because index zero is the bias feature.
std::vector<int> current_layer_offsets(num_elements, 1);
// The idea is to try every feature id in increasing order. Each iteration
// computes the gain of the layer using the current feature id as the split
// value and also finds the next feature id to try.
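// E.g. with two partitions whose candidate feature ids are {5, 7} and
// {6, 7}, the sweep tries ids 5, 6, and 7: a partition containing the
// current id uses that row's stats as its left side for the round, while
// partitions without it send everything to the right.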
while (current_feature_id > last_feature_id) {
last_feature_id = current_feature_id;
int64 next_feature_id = -1;
// Left gradient stats per node.
std::vector<GradientStats> left_gradient_stats(num_elements);
for (int root_idx = 0; root_idx < num_elements; root_idx++) {
int idx =
current_layer_offsets[root_idx] + partition_boundaries[root_idx];
const int end_index = partition_boundaries[root_idx + 1];
if (idx < end_index && feature_ids(idx, 0) == current_feature_id) {
GradientStats g(*gradients_t, *hessians_t, idx);
g *= normalizer_ratio;
left_gradient_stats[root_idx] = g;
current_layer_offsets[root_idx]++;
idx++;
}
if (idx < end_index &&
(feature_ids(idx, 0) < next_feature_id || next_feature_id == -1)) {
next_feature_id = feature_ids(idx, 0);
}
}
float gain_of_split = 0.0;
for (int root_idx = 0; root_idx < num_elements; root_idx++) {
GradientStats right_gradient_stats =
current_layer_stats[root_idx] - left_gradient_stats[root_idx];
NodeStats left_stat =
state->ComputeNodeStats(left_gradient_stats[root_idx]);
NodeStats right_stat = state->ComputeNodeStats(right_gradient_stats);
gain_of_split += left_stat.gain + right_stat.gain;
current_left_node_stats[root_idx] = left_stat;
current_right_node_stats[root_idx] = right_stat;
}
if (gain_of_split > best_gain) {
best_gain = gain_of_split;
best_left_node_stats = current_left_node_stats;
best_right_node_stats = current_right_node_stats;
best_feature_id = current_feature_id;
}
current_feature_id = next_feature_id;
}
for (int root_idx = 0; root_idx < num_elements; root_idx++) {
best_gain -= state->ComputeNodeStats(current_layer_stats[root_idx]).gain;
}
best_gain -= num_elements * state->tree_complexity_regularization();
ObliviousSplitInfo oblivious_split_info;
auto* equality_split =
oblivious_split_info.mutable_split_node()
->mutable_oblivious_categorical_id_binary_split();
equality_split->set_feature_column(state->feature_column_group_id());
equality_split->set_feature_id(best_feature_id);
(*gains)(0) = best_gain;
for (int root_idx = 0; root_idx < num_elements; root_idx++) {
auto* left_child = oblivious_split_info.add_children();
auto* right_child = oblivious_split_info.add_children();
state->FillLeaf(best_left_node_stats[root_idx], left_child);
state->FillLeaf(best_right_node_stats[root_idx], right_child);
const int start_index = partition_boundaries[root_idx];
(*output_partition_ids)(root_idx) = partition_ids(start_index);
oblivious_split_info.add_children_parent_id(partition_ids(start_index));
}
oblivious_split_info.SerializeToString(&(*output_splits)(0));
}
};
REGISTER_KERNEL_BUILDER(
Name("BuildCategoricalEqualitySplits").Device(DEVICE_CPU),
BuildCategoricalEqualitySplitsOp);
} // namespace tensorflow