Skip to content

Commit

Permalink
Add a feature to enable findability oriented candidate order.
Browse files Browse the repository at this point in the history
PiperOrigin-RevId: 576789852
  • Loading branch information
Toshiyuki Hanaoka authored and hiroyuki-komatsu committed Oct 26, 2023
1 parent 18d1bca commit 053db3b
Show file tree
Hide file tree
Showing 18 changed files with 525 additions and 4 deletions.
8 changes: 8 additions & 0 deletions src/converter/segments.h
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,12 @@ class Segment final {
USER_HISTORY_PREDICTOR = 1 << 6,
};

// Coarse kind of a candidate. Fields of this type default to
// DEFAULT_CATEGORY; rewriters that generate special candidates overwrite it.
enum Category {
DEFAULT_CATEGORY, // Realtime conversion, history prediction, etc.
SYMBOL, // Symbol, emoji
OTHER, // Misc candidate
};

// LINT.IfChange
std::string key; // reading
std::string value; // surface form
Expand Down Expand Up @@ -202,6 +208,8 @@ class Segment final {
// Candidate's source info which will be used for usage stats.
uint32_t source_info = SOURCE_INFO_NONE;

Category category = DEFAULT_CATEGORY;

// Candidate style. This is not a bit-field.
// The style is defined in enum |Style|.
NumberUtil::NumberString::Style style =
Expand Down
1 change: 1 addition & 0 deletions src/converter/segments_matchers.h
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ MATCHER_P(EqualsCandidate, candidate, "") {
COMPARE_FIELD(rid);
COMPARE_FIELD(attributes);
COMPARE_FIELD(source_info);
COMPARE_FIELD(category);
COMPARE_FIELD(style);
COMPARE_FIELD(command);
COMPARE_FIELD(inner_segment_boundary);
Expand Down
7 changes: 4 additions & 3 deletions src/protocol/commands.proto
Original file line number Diff line number Diff line change
Expand Up @@ -556,7 +556,7 @@ message Capability {
[default = NO_TEXT_DELETION_CAPABILITY];
}

// Next ID: 42
// Next ID: 43
// Bundles together some Android experiment flags so that they can be easily
// retrieved throughout the native code. These flags are generally specific to
// the decoder, and are made available when the decoder is initialized.
Expand Down Expand Up @@ -619,12 +619,13 @@ message DecoderExperimentParams {
reserved 20; // Deprecated cancel_content_word_suffix_penalty
reserved 23; // Deprecated typing_correction_score_offset
reserved 24; // Deprecated typing_correction_move_literal_candidate_to_top
reserved 30; // Deprecated enable_number_style_learning

optional bool disable_zero_query_suffix_prediction = 36 [default = false];

reserved 30; // Deprecated enable_number_style_learning

optional bool enable_realtime_conversion_v2 = 37 [default = false];

optional bool enable_findability_oriented_order = 42 [default = false];
}

// Clients' request to the server.
Expand Down
32 changes: 32 additions & 0 deletions src/rewriter/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -1414,6 +1414,7 @@ mozc_cc_library(
":language_aware_rewriter",
":merger_rewriter",
":number_rewriter",
":order_rewriter",
":remove_redundant_candidate_rewriter",
":rewriter_interface",
":single_kanji_rewriter",
Expand Down Expand Up @@ -1606,3 +1607,34 @@ mozc_cc_test(
"//testing:gunit_main",
],
)

# Rewriter that reorders candidates; see order_rewriter.h.
# Each dep below corresponds to a header included by order_rewriter.cc or
# order_rewriter.h.
mozc_cc_library(
    name = "order_rewriter",
    srcs = ["order_rewriter.cc"],
    hdrs = ["order_rewriter.h"],
    deps = [
        ":rewriter_interface",
        "//base:util",
        "//converter:segments",
        "//request:conversion_request",
        "@com_google_absl//absl/algorithm:container",
        "@com_google_absl//absl/container:flat_hash_map",
        "@com_google_absl//absl/container:flat_hash_set",
    ],
)

# Unit tests for the :order_rewriter library.
mozc_cc_test(
name = "order_rewriter_test",
srcs = ["order_rewriter_test.cc"],
deps = [
":order_rewriter",
":rewriter_interface",
"//converter:segments",
"//converter:segments_matchers",
"//request:conversion_request",
"//session:request_test_util",
"//testing:gunit_main",
"//testing:mozctest",
],
)
1 change: 1 addition & 0 deletions src/rewriter/date_rewriter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -512,6 +512,7 @@ std::unique_ptr<Segment::Candidate> CreateCandidate(
candidate->content_key = base_candidate.content_key;
candidate->attributes |= (Segment::Candidate::NO_LEARNING |
Segment::Candidate::NO_VARIANTS_EXPANSION);
candidate->category = Segment::Candidate::OTHER;
candidate->description = std::move(description);
return candidate;
}
Expand Down
1 change: 1 addition & 0 deletions src/rewriter/emoji_rewriter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@ std::unique_ptr<Segment::Candidate> CreateCandidate(
}
candidate->attributes |= Segment::Candidate::NO_VARIANTS_EXPANSION;
candidate->attributes |= Segment::Candidate::CONTEXT_SENSITIVE;
candidate->category = Segment::Candidate::SYMBOL;

return candidate;
}
Expand Down
1 change: 1 addition & 0 deletions src/rewriter/emoticon_rewriter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,7 @@ void InsertCandidates(SerializedDictionary::const_iterator begin,
sorted_value[i].description().size());
c->description = description;
}
c->category = Segment::Candidate::SYMBOL;
}
}

Expand Down
222 changes: 222 additions & 0 deletions src/rewriter/order_rewriter.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,222 @@
// Copyright 2010-2021, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
// * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include "rewriter/order_rewriter.h"

#include <cstddef>
#include <deque>
#include <optional>
#include <string>
#include <utility>
#include <vector>

#include "base/util.h"
#include "converter/segments.h"
#include "request/conversion_request.h"
#include "rewriter/rewriter_interface.h"
#include "absl/algorithm/container.h"
#include "absl/container/flat_hash_map.h"
#include "absl/container/flat_hash_set.h"

namespace mozc {

namespace {

class CandidateGroup {
public:
CandidateGroup() = default;
explicit CandidateGroup(Segment::Candidate::Category category)
: category_(category) {}
~CandidateGroup() = default;

const std::deque<Segment::Candidate> &candidates() const {
return candidates_;
}

void AppendToSegment(Segment &segment) const {
for (const Segment::Candidate &c : candidates_) {
Segment::Candidate *candidate = segment.add_candidate();
*candidate = c;
}
}

std::deque<Segment::Candidate> *mutable_candidates() { return &candidates_; }
size_t size() const { return candidates_.size(); }

void AddCandidate(const Segment::Candidate &candidate) {
if (const auto [_, inserted] =
added_.emplace(candidate.key, candidate.value);
inserted) {
candidates_.push_back(candidate);
if (category_.has_value()) {
candidates_.back().category = *category_;
}
}
}

void AddHiragnaCandidates() {
// (Key, Base candidate)
absl::flat_hash_map<std::string, const Segment::Candidate *> keys;
for (const Segment::Candidate &c : candidates_) {
keys.insert({c.key, &c});
}
for (const auto &itr : keys) {
Segment::Candidate c = *itr.second;
c.value = c.key;
c.content_key = c.key;
c.content_value = c.key;
c.description.clear();
c.inner_segment_boundary.clear();

candidates_.push_front(std::move(c));
}
}

void SortCandidates() {
// key length -> value length
const auto cmp = [](const Segment::Candidate &lhs,
const Segment::Candidate &rhs) {
if (lhs.key.size() != rhs.key.size()) {
if (lhs.content_key.size() != rhs.content_key.size()) {
return lhs.content_key.size() > rhs.content_key.size();
}
return lhs.key.size() > rhs.key.size();
}
if (lhs.key != rhs.key) {
return lhs.key < rhs.key;
}
const size_t lhs_len = Util::CharsLen(lhs.value);
const size_t rhs_len = Util::CharsLen(rhs.value);
return lhs_len > rhs_len;
};
absl::c_stable_sort(candidates_, cmp);
}

private:
const std::optional<Segment::Candidate::Category> category_ = std::nullopt;
std::deque<Segment::Candidate> candidates_;
absl::flat_hash_set<std::pair<std::string, std::string>> added_;
};

} // namespace

// Reports when this rewriter should run: for prediction and suggestion when
// the request asks for mixed conversion, and never otherwise.
int OrderRewriter::capability(const ConversionRequest &request) const {
  const bool is_mixed_conversion =
      request.request().mixed_conversion();  // For mobile
  return is_mixed_conversion
             ? (RewriterInterface::PREDICTION | RewriterInterface::SUGGESTION)
             : RewriterInterface::NOT_AVAILABLE;
}

bool OrderRewriter::Rewrite(const ConversionRequest &request,
Segments *segments) const {
if (!request.request()
.decoder_experiment_params()
.enable_findability_oriented_order()) {
return false;
}
if (segments->conversion_segments_size() != 1) {
return false;
}

Segment *segment = segments->mutable_conversion_segment(0);

// Candidates in the same category will be deduped.
CandidateGroup top, normal, partial, t13n;
CandidateGroup single_kanji(Segment::Candidate::OTHER),
single_kanji_partial(Segment::Candidate::OTHER),
symbol(Segment::Candidate::OTHER), other(Segment::Candidate::OTHER);

constexpr int kTopCandidatesSize = 5;
for (size_t i = 0; i < segment->candidates_size(); ++i) {
const Segment::Candidate &candidate = segment->candidate(i);
if (top.size() < kTopCandidatesSize &&
candidate.category != Segment::Candidate::OTHER) {
top.AddCandidate(candidate);
continue;
}
if (candidate.category == Segment::Candidate::SYMBOL) {
symbol.AddCandidate(candidate);
} else if (candidate.category == Segment::Candidate::OTHER) {
other.AddCandidate(candidate);
} else if (candidate.category == Segment::Candidate::DEFAULT_CATEGORY) {
// TODO(toshiyuki): Use better way to check single kanji entries.
// - There are single characters with multiple code points
// (e.g. SVS, IVS). Grapheme treats those multiple code points as a single
// character, but CharsLen() treats them as multiple characters.
// - Or, we may be able to set candidate category when we generate single
// kanji candidates.
const bool is_single_kanji =
Util::CharsLen(candidate.value) == 1 &&
Util::IsScriptType(candidate.value, Util::KANJI);
const bool is_partial =
candidate.attributes & Segment::Candidate::PARTIALLY_KEY_CONSUMED;
if (is_partial && is_single_kanji) {
single_kanji_partial.AddCandidate(candidate);
} else if (is_partial) {
partial.AddCandidate(candidate);
} else if (is_single_kanji) {
single_kanji.AddCandidate(candidate);
} else {
normal.AddCandidate(candidate);
}
}
}
for (size_t i = 0; i < segment->meta_candidates_size(); ++i) {
const Segment::Candidate &c = segment->meta_candidate(i);
t13n.AddCandidate(c);
}

single_kanji.AddHiragnaCandidates();
single_kanji_partial.AddHiragnaCandidates();

// The following candidates are originally sorted in LM based order.
// Reorder these candidates based on the length of key and value so that
// users can find the expected candidate easily.
normal.SortCandidates();
partial.SortCandidates();
single_kanji.SortCandidates();
single_kanji_partial.SortCandidates();

segment->clear_candidates();
segment->clear_meta_candidates();

top.AppendToSegment(*segment);
normal.AppendToSegment(*segment);
t13n.AppendToSegment(*segment);
other.AppendToSegment(*segment);
single_kanji.AppendToSegment(*segment);
symbol.AppendToSegment(*segment);
partial.AppendToSegment(*segment);
single_kanji_partial.AppendToSegment(*segment);

return true;
}

} // namespace mozc
51 changes: 51 additions & 0 deletions src/rewriter/order_rewriter.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
// Copyright 2010-2021, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
// * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#ifndef MOZC_REWRITER_ORDER_REWRITER_H_
#define MOZC_REWRITER_ORDER_REWRITER_H_

#include "converter/segments.h"
#include "request/conversion_request.h"
#include "rewriter/rewriter_interface.h"

namespace mozc {

// Rewriter that regroups and reorders the candidates of a segment
// ("findability oriented" candidate order). The behavior is implemented in
// order_rewriter.cc.
class OrderRewriter : public RewriterInterface {
public:
OrderRewriter() = default;
~OrderRewriter() override = default;

// Returns PREDICTION | SUGGESTION when the request enables mixed conversion,
// and NOT_AVAILABLE otherwise.
int capability(const ConversionRequest &request) const override;
// Reorders the candidates of the conversion segment in |segments|. Returns
// false, leaving |segments| untouched, when the
// enable_findability_oriented_order experiment flag is off or when there is
// not exactly one conversion segment; returns true otherwise.
bool Rewrite(const ConversionRequest &request,
Segments *segments) const override;
};

} // namespace mozc

#endif // MOZC_REWRITER_ORDER_REWRITER_H_
Loading

0 comments on commit 053db3b

Please sign in to comment.