diff options
Diffstat (limited to 'tensorflow/core/ops/candidate_sampling_ops.cc')
-rw-r--r-- | tensorflow/core/ops/candidate_sampling_ops.cc | 273 |
1 files changed, 266 insertions, 7 deletions
diff --git a/tensorflow/core/ops/candidate_sampling_ops.cc b/tensorflow/core/ops/candidate_sampling_ops.cc index 6e4d100b04..18700be67a 100644 --- a/tensorflow/core/ops/candidate_sampling_ops.cc +++ b/tensorflow/core/ops/candidate_sampling_ops.cc @@ -55,7 +55,42 @@ REGISTER_OP("UniformCandidateSampler") .Attr("seed: int = 0") .Attr("seed2: int = 0") .SetShapeFn(CandidateSamplerShapeFn) - .SetIsStateful(); + .SetIsStateful() + .Doc(R"doc( +Generates labels for candidate sampling with a uniform distribution. + +See explanations of candidate sampling and the data formats at +go/candidate-sampling. + +For each batch, this op picks a single set of sampled candidate labels. + +The advantages of sampling candidates per-batch are simplicity and the +possibility of efficient dense matrix multiplication. The disadvantage is that +the sampled candidates must be chosen independently of the context and of the +true labels. + +true_classes: A batch_size * num_true matrix, in which each row contains the + IDs of the num_true target_classes in the corresponding original label. +sampled_candidates: A vector of length num_sampled, in which each element is + the ID of a sampled candidate. +true_expected_count: A batch_size * num_true matrix, representing + the number of times each candidate is expected to occur in a batch + of sampled candidates. If unique=true, then this is a probability. +sampled_expected_count: A vector of length num_sampled, for each sampled + candidate representing the number of times the candidate is expected + to occur in a batch of sampled candidates. If unique=true, then this is a + probability. +num_true: Number of true labels per context. +num_sampled: Number of candidates to randomly sample. +unique: If unique is true, we sample with rejection, so that all sampled + candidates in a batch are unique. This requires some approximation to + estimate the post-rejection sampling probabilities. +range_max: The sampler will sample integers from the interval [0, range_max). +seed: If either seed or seed2 are set to be non-zero, the random number + generator is seeded by the given seed. Otherwise, it is seeded by a + random seed. +seed2: An second seed to avoid seed collision. +)doc"); REGISTER_OP("LogUniformCandidateSampler") .Input("true_classes: int64") @@ -69,7 +104,43 @@ REGISTER_OP("LogUniformCandidateSampler") .Attr("seed: int = 0") .Attr("seed2: int = 0") .SetShapeFn(CandidateSamplerShapeFn) - .SetIsStateful(); + .SetIsStateful() + .Doc(R"doc( +Generates labels for candidate sampling with a log-uniform distribution. + +See explanations of candidate sampling and the data formats at +go/candidate-sampling. + +For each batch, this op picks a single set of sampled candidate labels. + +The advantages of sampling candidates per-batch are simplicity and the +possibility of efficient dense matrix multiplication. The disadvantage is that +the sampled candidates must be chosen independently of the context and of the +true labels. + + +true_classes: A batch_size * num_true matrix, in which each row contains the + IDs of the num_true target_classes in the corresponding original label. +sampled_candidates: A vector of length num_sampled, in which each element is + the ID of a sampled candidate. +true_expected_count: A batch_size * num_true matrix, representing + the number of times each candidate is expected to occur in a batch + of sampled candidates. If unique=true, then this is a probability. +sampled_expected_count: A vector of length num_sampled, for each sampled + candidate representing the number of times the candidate is expected + to occur in a batch of sampled candidates. If unique=true, then this is a + probability. +num_true: Number of true labels per context. +num_sampled: Number of candidates to randomly sample. +unique: If unique is true, we sample with rejection, so that all sampled + candidates in a batch are unique. This requires some approximation to + estimate the post-rejection sampling probabilities. +range_max: The sampler will sample integers from the interval [0, range_max). +seed: If either seed or seed2 are set to be non-zero, the random number + generator is seeded by the given seed. Otherwise, it is seeded by a + random seed. +seed2: An second seed to avoid seed collision. +)doc"); REGISTER_OP("LearnedUnigramCandidateSampler") .Input("true_classes: int64") @@ -83,7 +154,42 @@ REGISTER_OP("LearnedUnigramCandidateSampler") .Attr("seed: int = 0") .Attr("seed2: int = 0") .SetShapeFn(CandidateSamplerShapeFn) - .SetIsStateful(); + .SetIsStateful() + .Doc(R"doc( +Generates labels for candidate sampling with a learned unigram distribution. + +See explanations of candidate sampling and the data formats at +go/candidate-sampling. + +For each batch, this op picks a single set of sampled candidate labels. + +The advantages of sampling candidates per-batch are simplicity and the +possibility of efficient dense matrix multiplication. The disadvantage is that +the sampled candidates must be chosen independently of the context and of the +true labels. + +true_classes: A batch_size * num_true matrix, in which each row contains the + IDs of the num_true target_classes in the corresponding original label. +sampled_candidates: A vector of length num_sampled, in which each element is + the ID of a sampled candidate. +true_expected_count: A batch_size * num_true matrix, representing + the number of times each candidate is expected to occur in a batch + of sampled candidates. If unique=true, then this is a probability. +sampled_expected_count: A vector of length num_sampled, for each sampled + candidate representing the number of times the candidate is expected + to occur in a batch of sampled candidates. If unique=true, then this is a + probability. +num_true: Number of true labels per context. +num_sampled: Number of candidates to randomly sample. +unique: If unique is true, we sample with rejection, so that all sampled + candidates in a batch are unique. This requires some approximation to + estimate the post-rejection sampling probabilities. +range_max: The sampler will sample integers from the interval [0, range_max). +seed: If either seed or seed2 are set to be non-zero, the random number + generator is seeded by the given seed. Otherwise, it is seeded by a + random seed. +seed2: An second seed to avoid seed collision. +)doc"); REGISTER_OP("ThreadUnsafeUnigramCandidateSampler") .Input("true_classes: int64") @@ -97,7 +203,42 @@ REGISTER_OP("ThreadUnsafeUnigramCandidateSampler") .Attr("seed: int = 0") .Attr("seed2: int = 0") .SetShapeFn(CandidateSamplerShapeFn) - .SetIsStateful(); + .SetIsStateful() + .Doc(R"doc( +Generates labels for candidate sampling with a learned unigram distribution. + +See explanations of candidate sampling and the data formats at +go/candidate-sampling. + +For each batch, this op picks a single set of sampled candidate labels. + +The advantages of sampling candidates per-batch are simplicity and the +possibility of efficient dense matrix multiplication. The disadvantage is that +the sampled candidates must be chosen independently of the context and of the +true labels. + +true_classes: A batch_size * num_true matrix, in which each row contains the + IDs of the num_true target_classes in the corresponding original label. +sampled_candidates: A vector of length num_sampled, in which each element is + the ID of a sampled candidate. +true_expected_count: A batch_size * num_true matrix, representing + the number of times each candidate is expected to occur in a batch + of sampled candidates. If unique=true, then this is a probability. +sampled_expected_count: A vector of length num_sampled, for each sampled + candidate representing the number of times the candidate is expected + to occur in a batch of sampled candidates. If unique=true, then this is a + probability. +num_true: Number of true labels per context. +num_sampled: Number of candidates to randomly sample. +unique: If unique is true, we sample with rejection, so that all sampled + candidates in a batch are unique. This requires some approximation to + estimate the post-rejection sampling probabilities. +range_max: The sampler will sample integers from the interval [0, range_max). +seed: If either seed or seed2 are set to be non-zero, the random number + generator is seeded by the given seed. Otherwise, it is seeded by a + random seed. +seed2: An second seed to avoid seed collision. +)doc"); REGISTER_OP("FixedUnigramCandidateSampler") .Input("true_classes: int64") @@ -117,7 +258,70 @@ REGISTER_OP("FixedUnigramCandidateSampler") .Attr("seed: int = 0") .Attr("seed2: int = 0") .SetShapeFn(CandidateSamplerShapeFn) - .SetIsStateful(); + .SetIsStateful() + .Doc(R"doc( +Generates labels for candidate sampling with a learned unigram distribution. + +A unigram sampler could use a fixed unigram distribution read from a +file or passed in as an in-memory array instead of building up the distribution +from data on the fly. There is also an option to skew the distribution by +applying a distortion power to the weights. + +The vocabulary file should be in CSV-like format, with the last field +being the weight associated with the word. + +For each batch, this op picks a single set of sampled candidate labels. + +The advantages of sampling candidates per-batch are simplicity and the +possibility of efficient dense matrix multiplication. The disadvantage is that +the sampled candidates must be chosen independently of the context and of the +true labels. + +true_classes: A batch_size * num_true matrix, in which each row contains the + IDs of the num_true target_classes in the corresponding original label. +sampled_candidates: A vector of length num_sampled, in which each element is + the ID of a sampled candidate. +true_expected_count: A batch_size * num_true matrix, representing + the number of times each candidate is expected to occur in a batch + of sampled candidates. If unique=true, then this is a probability. +sampled_expected_count: A vector of length num_sampled, for each sampled + candidate representing the number of times the candidate is expected + to occur in a batch of sampled candidates. If unique=true, then this is a + probability. +num_true: Number of true labels per context. +num_sampled: Number of candidates to randomly sample. +unique: If unique is true, we sample with rejection, so that all sampled + candidates in a batch are unique. This requires some approximation to + estimate the post-rejection sampling probabilities. +range_max: The sampler will sample integers from the interval [0, range_max). +vocab_file: Each valid line in this file (which should have a CSV-like format) + corresponds to a valid word ID. IDs are in sequential order, starting from + num_reserved_ids. The last entry in each line is expected to be a value + corresponding to the count or relative probability. Exactly one of vocab_file + and unigrams needs to be passed to this op. +distortion: The distortion is used to skew the unigram probability distribution. + Each weight is first raised to the distortion's power before adding to the + internal unigram distribution. As a result, distortion = 1.0 gives regular + unigram sampling (as defined by the vocab file), and distortion = 0.0 gives + a uniform distribution. +num_reserved_ids: Optionally some reserved IDs can be added in the range [0, + ..., num_reserved_ids) by the users. One use case is that a special unknown + word token is used as ID 0. These IDs will have a sampling probability of 0. +num_shards: A sampler can be used to sample from a subset of the original range + in order to speed up the whole computation through parallelism. This parameter + (together with 'shard') indicates the number of partitions that are being + used in the overall computation. +shard: A sampler can be used to sample from a subset of the original range + in order to speed up the whole computation through parallelism. This parameter + (together with 'num_shards') indicates the particular partition number of a + sampler op, when partitioning is being used. +unigrams: A list of unigram counts or probabilities, one per ID in sequential + order. Exactly one of vocab_file and unigrams should be passed to this op. +seed: If either seed or seed2 are set to be non-zero, the random number + generator is seeded by the given seed. Otherwise, it is seeded by a + random seed. +seed2: An second seed to avoid seed collision. +)doc"); REGISTER_OP("AllCandidateSampler") .Input("true_classes: int64") @@ -130,7 +334,41 @@ REGISTER_OP("AllCandidateSampler") .Attr("seed: int = 0") .Attr("seed2: int = 0") .SetShapeFn(CandidateSamplerShapeFn) - .SetIsStateful(); + .SetIsStateful() + .Doc(R"doc( +Generates labels for candidate sampling with a learned unigram distribution. + +See explanations of candidate sampling and the data formats at +go/candidate-sampling. + +For each batch, this op picks a single set of sampled candidate labels. + +The advantages of sampling candidates per-batch are simplicity and the +possibility of efficient dense matrix multiplication. The disadvantage is that +the sampled candidates must be chosen independently of the context and of the +true labels. + +true_classes: A batch_size * num_true matrix, in which each row contains the + IDs of the num_true target_classes in the corresponding original label. +sampled_candidates: A vector of length num_sampled, in which each element is + the ID of a sampled candidate. +true_expected_count: A batch_size * num_true matrix, representing + the number of times each candidate is expected to occur in a batch + of sampled candidates. If unique=true, then this is a probability. +sampled_expected_count: A vector of length num_sampled, for each sampled + candidate representing the number of times the candidate is expected + to occur in a batch of sampled candidates. If unique=true, then this is a + probability. +num_true: Number of true labels per context. +num_sampled: Number of candidates to produce. +unique: If unique is true, we sample with rejection, so that all sampled + candidates in a batch are unique. This requires some approximation to + estimate the post-rejection sampling probabilities. +seed: If either seed or seed2 are set to be non-zero, the random number + generator is seeded by the given seed. Otherwise, it is seeded by a + random seed. +seed2: An second seed to avoid seed collision. +)doc"); REGISTER_OP("ComputeAccidentalHits") .Input("true_classes: int64") @@ -158,6 +396,27 @@ REGISTER_OP("ComputeAccidentalHits") c->set_output(1, v); c->set_output(2, v); return Status::OK(); - }); + }) + .Doc(R"doc( +Computes the ids of the positions in sampled_candidates that match true_labels. + +When doing log-odds NCE, the result of this op should be passed through a +SparseToDense op, then added to the logits of the sampled candidates. This has +the effect of 'removing' the sampled labels that match the true labels by +making the classifier sure that they are sampled labels. + +true_classes: The true_classes output of UnpackSparseLabels. +sampled_candidates: The sampled_candidates output of CandidateSampler. +indices: A vector of indices corresponding to rows of true_candidates. +ids: A vector of IDs of positions in sampled_candidates that match a true_label + for the row with the corresponding index in indices. +weights: A vector of the same length as indices and ids, in which each element + is -FLOAT_MAX. +num_true: Number of true labels per context. +seed: If either seed or seed2 are set to be non-zero, the random number + generator is seeded by the given seed. Otherwise, it is seeded by a + random seed. +seed2: An second seed to avoid seed collision. +)doc"); } // namespace tensorflow |