From 1a88b2f5eda24409053e405af2e8690ddb6396d1 Mon Sep 17 00:00:00 2001 From: Toshiyuki Hanaoka Date: Tue, 15 Oct 2024 06:25:41 +0000 Subject: [PATCH] =?UTF-8?q?Add=20parameters=20for=20candidate=20checker=20?= =?UTF-8?q?-=20Cost=20max=20diff=20for=20first=20segment=20candidates=20-?= =?UTF-8?q?=20Character=20coverage=20based=20filtering=20for=20realtime=20?= =?UTF-8?q?conversion=20candidate=20checker.=20=20=20-=20We=20can=20provid?= =?UTF-8?q?e=20full=20match=20realtime=20conversion=20result=20for=20short?= =?UTF-8?q?=20query.=20=20=20=20=20Example:=20query=20"=E3=81=AE=E3=81=A8?= =?UTF-8?q?=E3=81=8D",=20candidates:=20"=E3=81=AE=E6=99=82",=20"=E3=81=AE?= =?UTF-8?q?=E3=81=A8=E3=81=8D"?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PiperOrigin-RevId: 685974932 --- src/converter/immutable_converter.cc | 49 +++++++------ src/converter/immutable_converter_test.cc | 85 ++++++++++++++++++++++- src/protocol/commands.proto | 15 +++- 3 files changed, 123 insertions(+), 26 deletions(-) diff --git a/src/converter/immutable_converter.cc b/src/converter/immutable_converter.cc index e799c1035..6cdae6823 100644 --- a/src/converter/immutable_converter.cc +++ b/src/converter/immutable_converter.cc @@ -36,12 +36,12 @@ #include #include #include +#include #include #include #include #include "absl/algorithm/container.h" -#include "absl/container/flat_hash_map.h" #include "absl/container/flat_hash_set.h" #include "absl/log/check.h" #include "absl/log/log.h" @@ -297,8 +297,9 @@ std::vector GetBoundaryInfo(const Segment::Candidate &c) { // Here,"渡しの" will be filtered if there is a cost gap from "私の" class FirstInnerSegmentCandidateChecker { public: - explicit FirstInnerSegmentCandidateChecker(const Segment &target_segment) - : target_segment_(target_segment) {} + explicit FirstInnerSegmentCandidateChecker(const Segment &target_segment, + int cost_max_diff) + : target_segment_(target_segment), cost_max_diff_(cost_max_diff) {} bool 
IsGoodCandidate(const Segment::Candidate &c) { if (c.key.size() != target_segment_.key().size() && @@ -308,15 +309,7 @@ class FirstInnerSegmentCandidateChecker { return false; } - if (!Util::ContainsScriptType(c.value, Util::KANJI)) { - // Do not filter non-kanji candidate. - // It may have unusual candidate cost. - return true; - } - - constexpr int kCostDiff = 3107; // 500*log(500) - if (const auto &f = min_cost_for_key_.find(c.key); - f != min_cost_for_key_.end() && c.cost - f->second > kCostDiff) { + if (min_cost_.has_value() && c.cost - *min_cost_ > cost_max_diff_) { return false; } @@ -329,9 +322,10 @@ class FirstInnerSegmentCandidateChecker { if (Util::ContainsScriptType(c.value, Util::KANJI)) { // Do not use non-kanji entry's cost. Sometimes it is too small. - auto [it, inserted] = min_cost_for_key_.try_emplace(c.key, c.cost); - if (!inserted) { - it->second = std::min(it->second, c.cost); + if (!min_cost_.has_value()) { + min_cost_ = c.cost; + } else { + *min_cost_ = std::min(*min_cost_, c.cost); } } } @@ -345,7 +339,8 @@ class FirstInnerSegmentCandidateChecker { } const Segment &target_segment_; - absl::flat_hash_map min_cost_for_key_; + int cost_max_diff_; + std::optional min_cost_; Trie trie_; }; @@ -2112,9 +2107,10 @@ void ImmutableConverter::InsertCandidatesForRealtime( void ImmutableConverter::InsertCandidatesForRealtimeWithCandidateChecker( const ConversionRequest &request, const Lattice &lattice, absl::Span group, Segments *segments) const { + const commands::DecoderExperimentParams params = + request.request().decoder_experiment_params(); Segment *target_segment = segments->mutable_conversion_segment(0); absl::flat_hash_set added; - Segments tmp_segments = *segments; { // Candidates for the whole path @@ -2122,11 +2118,15 @@ void ImmutableConverter::InsertCandidatesForRealtimeWithCandidateChecker( InsertCandidates(request, &tmp_segments, lattice, group, kMaxSize, SINGLE_SEGMENT); - // InsertCandidates for SINGLE_SEGMENT should insert at least one 
candidate. + // At least one candidate should be added. + // Skip adding similar candidates unless char coverage is still + // available. DCHECK_GT(tmp_segments.conversion_segment(0).candidates_size(), 0); const auto &top_cand = tmp_segments.conversion_segment(0).candidate(0); const std::vector top_boundary = GetBoundaryInfo(top_cand); + int remaining_char_coverage = + params.realtime_conversion_single_segment_char_coverage(); for (int i = 0; i < tmp_segments.conversion_segment(0).candidates_size(); ++i) { const auto &c = tmp_segments.conversion_segment(0).candidate(i); @@ -2134,15 +2134,16 @@ void ImmutableConverter::InsertCandidatesForRealtimeWithCandidateChecker( if (c.cost - top_cand.cost > kCostDiff) { continue; } - const std::vector boundary = GetBoundaryInfo(c); - if (boundary.size() > 2 && i != 0 && boundary == top_boundary) { - // Skip to add the similar candidates excepting the case that the - // top candidate has the simple structure (i.e., "のXX", etc) + if (i != 0 && GetBoundaryInfo(c) == top_boundary && + remaining_char_coverage < 0) { + // Skip adding similar candidates when there is no remaining + // coverage. 
continue; } Segment::Candidate *candidate = target_segment->add_candidate(); *candidate = c; added.insert(c.value); + remaining_char_coverage -= Util::CharsLen(c.value); } } tmp_segments.mutable_conversion_segment(0)->clear_candidates(); @@ -2153,7 +2154,9 @@ void ImmutableConverter::InsertCandidatesForRealtimeWithCandidateChecker( request.max_conversion_candidates_size() - target_segment->candidates_size(), FIRST_INNER_SEGMENT); - FirstInnerSegmentCandidateChecker checker(*target_segment); + FirstInnerSegmentCandidateChecker checker( + *target_segment, + params.realtime_conversion_candidate_checker_cost_max_diff()); for (int i = 0; i < tmp_segments.conversion_segment(0).candidates_size(); ++i) { Segment::Candidate *c = diff --git a/src/converter/immutable_converter_test.cc b/src/converter/immutable_converter_test.cc index d3c9882db..b3a330c91 100644 --- a/src/converter/immutable_converter_test.cc +++ b/src/converter/immutable_converter_test.cc @@ -32,7 +32,6 @@ #include #include #include -#include #include #include #include @@ -531,8 +530,89 @@ TEST(ImmutableConverterTest, FirstInnerSegmentFiltering) { EXPECT_TRUE(data_and_converter->GetConverter()->ConvertForRequest( conversion_request, &segments)); - EXPECT_THAT(*segment, ContainsCandidate(ValueIs("したとき"))); EXPECT_THAT(*segment, ContainsCandidate(ValueIs("した時"))); + // The same segment structure + EXPECT_THAT(*segment, Not(ContainsCandidate(ValueIs("したとき")))); + } + { + Segments segments; + Segment *segment = segments.add_segment(); + segment->set_key("のとき"); + EXPECT_TRUE(data_and_converter->GetConverter()->ConvertForRequest( + conversion_request, &segments)); + + EXPECT_THAT(*segment, ContainsCandidate(ValueIs("の時"))); + // The same segment structure + EXPECT_THAT(*segment, Not(ContainsCandidate(ValueIs("のとき")))); + } + { + Segments segments; + Segment *segment = segments.add_segment(); + segment->set_key("かえる"); + EXPECT_TRUE(data_and_converter->GetConverter()->ConvertForRequest( + conversion_request, 
&segments)); + + EXPECT_THAT(*segment, ContainsCandidate(ValueIs("換える"))); + EXPECT_THAT(*segment, ContainsCandidate(ValueIs("代える"))); + EXPECT_THAT(*segment, ContainsCandidate(ValueIs("買える"))); + // Filtered by cost diff + EXPECT_THAT(*segment, Not(ContainsCandidate(ValueIs("飼える")))); + } + { + Segments segments; + Segment *segment = segments.add_segment(); + segment->set_key("くるまでこうどうした"); + EXPECT_TRUE(data_and_converter->GetConverter()->ConvertForRequest( + conversion_request, &segments)); + + EXPECT_THAT(*segment, ContainsCandidate(ValueIs("車で行動した"))); + EXPECT_THAT(*segment, ContainsCandidate(ValueIs("車で"))); + EXPECT_THAT(*segment, ContainsCandidate(ValueIs("来るまで"))); + EXPECT_THAT(*segment, ContainsCandidate(ValueIs("くるまで"))); + } +} + +TEST(ImmutableConverterTest, FirstInnerSegmentFilteringParams) { + commands::Request request; + request_test_util::FillMobileRequest(&request); + request.mutable_decoder_experiment_params() + ->set_enable_realtime_conversion_candidate_checker(true); + request.mutable_decoder_experiment_params() + ->set_realtime_conversion_single_segment_char_coverage(2); + request.mutable_decoder_experiment_params() + ->set_realtime_conversion_candidate_checker_cost_max_diff( + 4605); // 500*log(10000); + ConversionRequest conversion_request; + conversion_request.set_request_type(ConversionRequest::PREDICTION); + conversion_request.set_request(&request); + conversion_request.set_create_partial_candidates(true); + conversion_request.set_max_conversion_candidates_size(100); + + auto data_and_converter = std::make_unique(); + constexpr auto ValueIs = [](const auto &value) { + return Field(&Segment::Candidate::value, StrEq(value)); + }; + + { + Segments segments; + Segment *segment = segments.add_segment(); + segment->set_key("したとき"); + EXPECT_TRUE(data_and_converter->GetConverter()->ConvertForRequest( + conversion_request, &segments)); + + EXPECT_THAT(*segment, ContainsCandidate(ValueIs("した時"))); + // Not enough char coverage + 
EXPECT_THAT(*segment, Not(ContainsCandidate(ValueIs("したとき")))); + } + { + Segments segments; + Segment *segment = segments.add_segment(); + segment->set_key("のとき"); + EXPECT_TRUE(data_and_converter->GetConverter()->ConvertForRequest( + conversion_request, &segments)); + + EXPECT_THAT(*segment, ContainsCandidate(ValueIs("の時"))); + EXPECT_THAT(*segment, ContainsCandidate(ValueIs("のとき"))); } { Segments segments; @@ -544,6 +624,7 @@ TEST(ImmutableConverterTest, FirstInnerSegmentFiltering) { EXPECT_THAT(*segment, ContainsCandidate(ValueIs("換える"))); EXPECT_THAT(*segment, ContainsCandidate(ValueIs("代える"))); EXPECT_THAT(*segment, ContainsCandidate(ValueIs("買える"))); + // cost diff < cost_max_diff EXPECT_THAT(*segment, ContainsCandidate(ValueIs("飼える"))); } { diff --git a/src/protocol/commands.proto b/src/protocol/commands.proto index 9ff401be0..ef7099a80 100644 --- a/src/protocol/commands.proto +++ b/src/protocol/commands.proto @@ -573,7 +573,7 @@ message Capability { [default = NO_TEXT_DELETION_CAPABILITY]; } -// Next ID: 98 +// Next ID: 100 // Bundles together some Android experiment flags so that they can be easily // retrieved throughout the native code. These flags are generally specific to // the decoder, and are made available when the decoder is initialized. @@ -659,6 +659,19 @@ message DecoderExperimentParams { // <= katakana_override_min_per_char_cost optional int32 katakana_override_min_per_char_cost = 91 [default = 0]; + // Parameters for realtime conversion + // (go/mozc-src/converter/immutable_converter.cc for details) + // + // - Character coverage for single segment + // If the total length of the included candidates' values does not exceed this + // value, the target candidate will not be filtered even when the boundary is + // the same as the top candidate's. + optional int32 realtime_conversion_single_segment_char_coverage = 98 + [default = 0]; + // Maximum cost difference allowed when filtering first-segment candidates. 
+ // default_value: 500*log(500) + optional int32 realtime_conversion_candidate_checker_cost_max_diff = 99 + [default = 3107]; } // Clients' request to the server.