Skip to content

Commit

Permalink
Add parameters for candidate checker
Browse files Browse the repository at this point in the history
- Cost max diff for first segment candidates
- Character-coverage-based filtering for the realtime conversion candidate checker.
  - We can provide full-match realtime conversion results for short queries.
    Example: query "のとき", candidates: "の時", "のとき"

PiperOrigin-RevId: 685974932
  • Loading branch information
Toshiyuki Hanaoka authored and hiroyuki-komatsu committed Oct 15, 2024
1 parent 1d74ac4 commit 1a88b2f
Show file tree
Hide file tree
Showing 3 changed files with 123 additions and 26 deletions.
49 changes: 26 additions & 23 deletions src/converter/immutable_converter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -36,12 +36,12 @@
#include <cstdint>
#include <limits>
#include <memory>
#include <optional>
#include <string>
#include <utility>
#include <vector>

#include "absl/algorithm/container.h"
#include "absl/container/flat_hash_map.h"
#include "absl/container/flat_hash_set.h"
#include "absl/log/check.h"
#include "absl/log/log.h"
Expand Down Expand Up @@ -297,8 +297,9 @@ std::vector<absl::string_view> GetBoundaryInfo(const Segment::Candidate &c) {
// Here,"渡しの" will be filtered if there is a cost gap from "私の"
class FirstInnerSegmentCandidateChecker {
public:
explicit FirstInnerSegmentCandidateChecker(const Segment &target_segment)
: target_segment_(target_segment) {}
explicit FirstInnerSegmentCandidateChecker(const Segment &target_segment,
int cost_max_diff)
: target_segment_(target_segment), cost_max_diff_(cost_max_diff) {}

bool IsGoodCandidate(const Segment::Candidate &c) {
if (c.key.size() != target_segment_.key().size() &&
Expand All @@ -308,15 +309,7 @@ class FirstInnerSegmentCandidateChecker {
return false;
}

if (!Util::ContainsScriptType(c.value, Util::KANJI)) {
// Do not filter non-kanji candidate.
// It may have unusual candidate cost.
return true;
}

constexpr int kCostDiff = 3107; // 500*log(500)
if (const auto &f = min_cost_for_key_.find(c.key);
f != min_cost_for_key_.end() && c.cost - f->second > kCostDiff) {
if (min_cost_.has_value() && c.cost - *min_cost_ > cost_max_diff_) {
return false;
}

Expand All @@ -329,9 +322,10 @@ class FirstInnerSegmentCandidateChecker {

if (Util::ContainsScriptType(c.value, Util::KANJI)) {
// Do not use non-kanji entry's cost. Sometimes it is too small.
auto [it, inserted] = min_cost_for_key_.try_emplace(c.key, c.cost);
if (!inserted) {
it->second = std::min(it->second, c.cost);
if (!min_cost_.has_value()) {
min_cost_ = c.cost;
} else {
*min_cost_ = std::min(*min_cost_, c.cost);
}
}
}
Expand All @@ -345,7 +339,8 @@ class FirstInnerSegmentCandidateChecker {
}

const Segment &target_segment_;
absl::flat_hash_map<std::string, int> min_cost_for_key_;
int cost_max_diff_;
std::optional<int> min_cost_;
Trie<bool> trie_;
};

Expand Down Expand Up @@ -2112,37 +2107,43 @@ void ImmutableConverter::InsertCandidatesForRealtime(
void ImmutableConverter::InsertCandidatesForRealtimeWithCandidateChecker(
const ConversionRequest &request, const Lattice &lattice,
absl::Span<const uint16_t> group, Segments *segments) const {
const commands::DecoderExperimentParams params =
request.request().decoder_experiment_params();
Segment *target_segment = segments->mutable_conversion_segment(0);
absl::flat_hash_set<std::string> added;

Segments tmp_segments = *segments;
{
// Candidates for the whole path
constexpr int kMaxSize = 3;
InsertCandidates(request, &tmp_segments, lattice, group, kMaxSize,
SINGLE_SEGMENT);

// InsertCandidates for SINGLE_SEGMENT should insert at least one candidate.
// At least one candidate should be added.
// Skip to add the similar candidates unless the char coverage is still
// available.
DCHECK_GT(tmp_segments.conversion_segment(0).candidates_size(), 0);
const auto &top_cand = tmp_segments.conversion_segment(0).candidate(0);
const std::vector<absl::string_view> top_boundary =
GetBoundaryInfo(top_cand);
int remaining_char_coverage =
params.realtime_conversion_single_segment_char_coverage();
for (int i = 0; i < tmp_segments.conversion_segment(0).candidates_size();
++i) {
const auto &c = tmp_segments.conversion_segment(0).candidate(i);
constexpr int kCostDiff = 2302; // 500*log(100)
if (c.cost - top_cand.cost > kCostDiff) {
continue;
}
const std::vector<absl::string_view> boundary = GetBoundaryInfo(c);
if (boundary.size() > 2 && i != 0 && boundary == top_boundary) {
// Skip to add the similar candidates excepting the case that the
// top candidate has the simple structure (i.e., "のXX", etc)
if (i != 0 && GetBoundaryInfo(c) == top_boundary &&
remaining_char_coverage < 0) {
// Skip to add the similar candidates when there is no remaining
// coverage.
continue;
}
Segment::Candidate *candidate = target_segment->add_candidate();
*candidate = c;
added.insert(c.value);
remaining_char_coverage -= Util::CharsLen(c.value);
}
}
tmp_segments.mutable_conversion_segment(0)->clear_candidates();
Expand All @@ -2153,7 +2154,9 @@ void ImmutableConverter::InsertCandidatesForRealtimeWithCandidateChecker(
request.max_conversion_candidates_size() -
target_segment->candidates_size(),
FIRST_INNER_SEGMENT);
FirstInnerSegmentCandidateChecker checker(*target_segment);
FirstInnerSegmentCandidateChecker checker(
*target_segment,
params.realtime_conversion_candidate_checker_cost_max_diff());
for (int i = 0; i < tmp_segments.conversion_segment(0).candidates_size();
++i) {
Segment::Candidate *c =
Expand Down
85 changes: 83 additions & 2 deletions src/converter/immutable_converter_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,6 @@
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <iterator>
#include <memory>
#include <string>
#include <utility>
Expand Down Expand Up @@ -531,8 +530,89 @@ TEST(ImmutableConverterTest, FirstInnerSegmentFiltering) {
EXPECT_TRUE(data_and_converter->GetConverter()->ConvertForRequest(
conversion_request, &segments));

EXPECT_THAT(*segment, ContainsCandidate(ValueIs("したとき")));
EXPECT_THAT(*segment, ContainsCandidate(ValueIs("した時")));
// The same segment structure
EXPECT_THAT(*segment, Not(ContainsCandidate(ValueIs("したとき"))));
}
{
Segments segments;
Segment *segment = segments.add_segment();
segment->set_key("のとき");
EXPECT_TRUE(data_and_converter->GetConverter()->ConvertForRequest(
conversion_request, &segments));

EXPECT_THAT(*segment, ContainsCandidate(ValueIs("の時")));
// The same segment structure
EXPECT_THAT(*segment, Not(ContainsCandidate(ValueIs("のとき"))));
}
{
Segments segments;
Segment *segment = segments.add_segment();
segment->set_key("かえる");
EXPECT_TRUE(data_and_converter->GetConverter()->ConvertForRequest(
conversion_request, &segments));

EXPECT_THAT(*segment, ContainsCandidate(ValueIs("換える")));
EXPECT_THAT(*segment, ContainsCandidate(ValueIs("代える")));
EXPECT_THAT(*segment, ContainsCandidate(ValueIs("買える")));
// Filtered by cost diff
EXPECT_THAT(*segment, Not(ContainsCandidate(ValueIs("飼える"))));
}
{
Segments segments;
Segment *segment = segments.add_segment();
segment->set_key("くるまでこうどうした");
EXPECT_TRUE(data_and_converter->GetConverter()->ConvertForRequest(
conversion_request, &segments));

EXPECT_THAT(*segment, ContainsCandidate(ValueIs("車で行動した")));
EXPECT_THAT(*segment, ContainsCandidate(ValueIs("車で")));
EXPECT_THAT(*segment, ContainsCandidate(ValueIs("来るまで")));
EXPECT_THAT(*segment, ContainsCandidate(ValueIs("くるまで")));
}
}

TEST(ImmutableConverterTest, FirstInnerSegmentFilteringParams) {
commands::Request request;
request_test_util::FillMobileRequest(&request);
request.mutable_decoder_experiment_params()
->set_enable_realtime_conversion_candidate_checker(true);
request.mutable_decoder_experiment_params()
->set_realtime_conversion_single_segment_char_coverage(2);
request.mutable_decoder_experiment_params()
->set_realtime_conversion_candidate_checker_cost_max_diff(
4605); // 500*log(10000);
ConversionRequest conversion_request;
conversion_request.set_request_type(ConversionRequest::PREDICTION);
conversion_request.set_request(&request);
conversion_request.set_create_partial_candidates(true);
conversion_request.set_max_conversion_candidates_size(100);

auto data_and_converter = std::make_unique<MockDataAndImmutableConverter>();
constexpr auto ValueIs = [](const auto &value) {
return Field(&Segment::Candidate::value, StrEq(value));
};

{
Segments segments;
Segment *segment = segments.add_segment();
segment->set_key("したとき");
EXPECT_TRUE(data_and_converter->GetConverter()->ConvertForRequest(
conversion_request, &segments));

EXPECT_THAT(*segment, ContainsCandidate(ValueIs("した時")));
// Not enough char coverage
EXPECT_THAT(*segment, Not(ContainsCandidate(ValueIs("したとき"))));
}
{
Segments segments;
Segment *segment = segments.add_segment();
segment->set_key("のとき");
EXPECT_TRUE(data_and_converter->GetConverter()->ConvertForRequest(
conversion_request, &segments));

EXPECT_THAT(*segment, ContainsCandidate(ValueIs("の時")));
EXPECT_THAT(*segment, ContainsCandidate(ValueIs("のとき")));
}
{
Segments segments;
Expand All @@ -544,6 +624,7 @@ TEST(ImmutableConverterTest, FirstInnerSegmentFiltering) {
EXPECT_THAT(*segment, ContainsCandidate(ValueIs("換える")));
EXPECT_THAT(*segment, ContainsCandidate(ValueIs("代える")));
EXPECT_THAT(*segment, ContainsCandidate(ValueIs("買える")));
// cost diff < cost_max_diff
EXPECT_THAT(*segment, ContainsCandidate(ValueIs("飼える")));
}
{
Expand Down
15 changes: 14 additions & 1 deletion src/protocol/commands.proto
Original file line number Diff line number Diff line change
Expand Up @@ -573,7 +573,7 @@ message Capability {
[default = NO_TEXT_DELETION_CAPABILITY];
}

// Next ID: 98
// Next ID: 100
// Bundles together some Android experiment flags so that they can be easily
// retrieved throughout the native code. These flags are generally specific to
// the decoder, and are made available when the decoder is initialized.
Expand Down Expand Up @@ -659,6 +659,19 @@ message DecoderExperimentParams {
// <= katakana_override_min_per_char_cost
optional int32 katakana_override_min_per_char_cost = 91 [default = 0];

// Parameters for realtime conversion
// (go/mozc-src/converter/immutable_converter.cc for details)
//
// - Character coverage for single segment
// If the total length of the included candidates' values does not exceed
// this value, the target candidate will not be filtered even when its
// boundary is the same as the top candidate's.
optional int32 realtime_conversion_single_segment_char_coverage = 98
[default = 0];
// Cost max diff for first segment to filter candidates.
// default_value: 500*log(500)
optional int32 realtime_conversion_candidate_checker_cost_max_diff = 99
[default = 3107];
}

// Clients' request to the server.
Expand Down

0 comments on commit 1a88b2f

Please sign in to comment.