Skip to content

Commit 73a9e37

Browse files
author
Noriyuki Takahashi
committed
Fix #379: 1-char user dictionary item may not work
User dictionary entries were sometimes dropped from prediction because their costs remained relatively high even after promotion. This CL puts an upper limit on the cost of user dictionary entries. Also, when the value of a user dictionary entry shared a common prefix with that of a system dictionary entry, it was detected as "maybe redundant" and discarded from prediction. This CL stops applying this mechanism to user dictionary entries. BUG=##379 TEST= REF_BUG=19940231,30374836 REF_CL=91386248,128960625,129404563 REF_TIME=2016-08-01T15:56:48+09:00 REF_TIME_RAW=1470034608 +0900
1 parent 3b25cde commit 73a9e37

File tree

4 files changed

+179
-17
lines changed

4 files changed

+179
-17
lines changed

src/data/version/mozc_version_template.bzl

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
MAJOR=2
22
MINOR=18
3-
BUILD=2582
3+
BUILD=2583
44
REVISION=102
55
# CAUTION: NACL_DICTIONARY_VERSION is going to be migrated to ENGINE_VERSION.
66
# NACL_DICTIONARY_VERSION is the target version of the system dictionary to be

src/prediction/dictionary_predictor.cc

+32-8
Original file line numberDiff line numberDiff line change
@@ -506,7 +506,7 @@ bool DictionaryPredictor::AddPredictionToCandidates(
506506
continue;
507507
}
508508

509-
// don't suggest exactly the same candidate as key.
509+
// Don't suggest exactly the same candidate as key.
510510
// if |mixed_conversion| is true, that's not the case.
511511
if (!mixed_conversion &&
512512
!(result.types & REALTIME) &&
@@ -765,6 +765,10 @@ void DictionaryPredictor::Result::SetSourceInfoForZeroQuery(
765765
}
766766
}
767767

768+
bool DictionaryPredictor::Result::IsUserDictionaryResult() const {
769+
return (candidate_attributes & Segment::Candidate::USER_DICTIONARY) != 0;
770+
}
771+
768772
bool DictionaryPredictor::GetHistoryKeyAndValue(
769773
const Segments &segments, string *key, string *value) const {
770774
DCHECK(key);
@@ -916,9 +920,7 @@ void DictionaryPredictor::SetLMCost(const Segments &segments,
916920

917921
const size_t input_key_len = Util::CharsLen(
918922
segments.conversion_segment(0).key());
919-
for (size_t i = 0; i < results->size(); ++i) {
920-
const Result &result = results->at(i);
921-
923+
for (Result &result : *results) {
922924
int cost = GetLMCost(result, rid);
923925
// Demote filtered word here, because they are not filtered for exact match.
924926
// Even for exact match, we don't want to show aggressive words
@@ -960,7 +962,19 @@ void DictionaryPredictor::SetLMCost(const Segments &segments,
960962
const int kBigramBonus = 800; // ~= 500*ln(5)
961963
cost += (kDefaultTransitionCost - kBigramBonus - prev_cost);
962964
}
963-
results->at(i).cost = cost;
965+
if (result.candidate_attributes & Segment::Candidate::USER_DICTIONARY) {
966+
// Decrease cost for words from user dictionary in order to promote them.
967+
// Currently user dictionary words are evaluated 5 times bigger in
968+
// frequency, being capped by 1000 (this number is adhoc, so feel free to
969+
// adjust).
970+
const int kUserDictionaryPromotionFactor = 804; // 804 = 500 * log(5)
971+
const int kUserDictionaryCostUpperLimit = 1000;
972+
cost = min(cost - kUserDictionaryPromotionFactor,
973+
kUserDictionaryCostUpperLimit);
974+
}
975+
// Note that the cost is defined as -500 * log(prob).
976+
// Even after the ad hoc manipulations, cost must remain larger than 0.
977+
result.cost = max(1, cost);
964978
}
965979
}
966980

@@ -1345,11 +1359,20 @@ void DictionaryPredictor::AggregateUnigramCandidateForMixedConversion(
13451359
const ConversionRequest &request,
13461360
const Segments &segments,
13471361
vector<Result> *results) const {
1362+
AggregateUnigramCandidateForMixedConversion(*dictionary_, request,
1363+
segments, results);
1364+
}
1365+
1366+
void DictionaryPredictor::AggregateUnigramCandidateForMixedConversion(
1367+
const dictionary::DictionaryInterface &dictionary,
1368+
const ConversionRequest &request,
1369+
const Segments &segments,
1370+
vector<Result> *results) {
13481371
const size_t cutoff_threshold = kPredictionMaxResultsSize;
13491372

13501373
vector<Result> raw_result;
13511374
// No history key
1352-
GetPredictiveResults(*dictionary_, "", request, segments, UNIGRAM,
1375+
GetPredictiveResults(dictionary, "", request, segments, UNIGRAM,
13531376
cutoff_threshold, &raw_result);
13541377

13551378
// Hereafter, we split "Needed Results" and "(maybe) Unneeded Results."
@@ -1383,7 +1406,8 @@ void DictionaryPredictor::AggregateUnigramCandidateForMixedConversion(
13831406

13841407
// Traverse all remaining elements and check if each result is redundant.
13851408
for (Iter iter = min_iter; iter != max_iter; ) {
1386-
if (MaybeRedundant(reference_result.value, iter->value)) {
1409+
if (!iter->IsUserDictionaryResult() &&
1410+
MaybeRedundant(reference_result.value, iter->value)) {
13871411
// Swap out the redundant result.
13881412
--max_iter;
13891413
std::iter_swap(iter, max_iter);
@@ -1564,7 +1588,7 @@ void DictionaryPredictor::GetPredictiveResults(
15641588
const Segments &segments,
15651589
PredictionTypes types,
15661590
size_t lookup_limit,
1567-
vector<Result> *results) const {
1591+
vector<Result> *results) {
15681592
if (!request.has_composer() ||
15691593
!FLAGS_enable_expansion_for_dictionary_predictor) {
15701594
const string &query_key = segments.conversion_segment(0).key();

src/prediction/dictionary_predictor.h

+19-7
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,7 @@ class DictionaryPredictor : public PredictorInterface {
117117
dictionary::Token::AttributesBitfield token_attr);
118118
void SetSourceInfoForZeroQuery(
119119
ZeroQueryType zero_query_type);
120+
bool IsUserDictionaryResult() const;
120121

121122
string key;
122123
string value;
@@ -212,6 +213,8 @@ class DictionaryPredictor : public PredictorInterface {
212213
FRIEND_TEST(DictionaryPredictorTest, AggregateZeroQueryBigramPrediction);
213214
FRIEND_TEST(DictionaryPredictorTest, AggregateSuffixPrediction);
214215
FRIEND_TEST(DictionaryPredictorTest, AggregateZeroQuerySuffixPrediction);
216+
FRIEND_TEST(DictionaryPredictorTest,
217+
AggregateUnigramCandidateForMixedConversion);
215218
FRIEND_TEST(DictionaryPredictorTest, ZeroQuerySuggestionAfterNumbers);
216219
FRIEND_TEST(DictionaryPredictorTest, TriggerNumberZeroQuerySuggestion);
217220
FRIEND_TEST(DictionaryPredictorTest, TriggerZeroQuerySuggestion);
@@ -224,6 +227,7 @@ class DictionaryPredictor : public PredictorInterface {
224227
FRIEND_TEST(DictionaryPredictorTest, RemoveMissSpelledCandidates);
225228
FRIEND_TEST(DictionaryPredictorTest, ConformCharacterWidthToPreference);
226229
FRIEND_TEST(DictionaryPredictorTest, SetLMCost);
230+
FRIEND_TEST(DictionaryPredictorTest, SetLMCostForUserDictionaryWord);
227231
FRIEND_TEST(DictionaryPredictorTest, SetDescription);
228232
FRIEND_TEST(DictionaryPredictorTest, SetDebugDescription);
229233
FRIEND_TEST(DictionaryPredictorTest, GetZeroQueryCandidates);
@@ -278,13 +282,14 @@ class DictionaryPredictor : public PredictorInterface {
278282
const ConversionRequest &request,
279283
Result *result) const;
280284

281-
void GetPredictiveResults(const dictionary::DictionaryInterface &dictionary,
282-
const string &history_key,
283-
const ConversionRequest &request,
284-
const Segments &segments,
285-
PredictionTypes types,
286-
size_t lookup_limit,
287-
vector<Result> *results) const;
285+
static void GetPredictiveResults(
286+
const dictionary::DictionaryInterface &dictionary,
287+
const string &history_key,
288+
const ConversionRequest &request,
289+
const Segments &segments,
290+
PredictionTypes types,
291+
size_t lookup_limit,
292+
vector<Result> *results);
288293

289294
void GetPredictiveResultsForBigram(
290295
const dictionary::DictionaryInterface &dictionary,
@@ -428,6 +433,13 @@ class DictionaryPredictor : public PredictorInterface {
428433

429434
// Aggregates unigram candidate for mixed conversion.
430435
// This reduces redundant candidates.
436+
static void AggregateUnigramCandidateForMixedConversion(
437+
const dictionary::DictionaryInterface &dictionary,
438+
const ConversionRequest &request,
439+
const Segments &segments,
440+
vector<Result> *results);
441+
442+
// The same as the static version of this method above but uses |dictionary_|.
431443
void AggregateUnigramCandidateForMixedConversion(
432444
const ConversionRequest &request,
433445
const Segments &segments,

src/prediction/dictionary_predictor_test.cc

+127-1
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,6 @@ using mozc::dictionary::SuppressionDictionary;
8787
using mozc::dictionary::Token;
8888
using testing::_;
8989

90-
DECLARE_string(test_tmpdir);
9190
DECLARE_bool(enable_expansion_for_dictionary_predictor);
9291

9392
namespace mozc {
@@ -1324,6 +1323,48 @@ TEST_F(DictionaryPredictorTest, AggregateUnigramPrediction) {
13241323
EXPECT_EQ(1, segments.conversion_segments_size());
13251324
}
13261325

1326+
TEST_F(DictionaryPredictorTest, AggregateUnigramCandidateForMixedConversion) {
1327+
const char kHiraganaA[] = "\xE3\x81\x82";
1328+
1329+
DictionaryMock mock_dict;
1330+
// A system dictionary entry "a".
1331+
mock_dict.AddLookupPredictive(kHiraganaA, kHiraganaA, "a", Token::NONE);
1332+
// System dictionary entries "a0", ..., "a9", which are detected as redundant
1333+
// by MaybeRedundant(); see dictionary_predictor.cc.
1334+
for (int i = 0; i < 10; ++i) {
1335+
mock_dict.AddLookupPredictive(kHiraganaA, kHiraganaA,
1336+
Util::StringPrintf("a%d", i), Token::NONE);
1337+
}
1338+
// A user dictionary entry "aaa". MaybeRedundant() detects this entry as
1339+
// redundant but it should not be filtered in prediction.
1340+
mock_dict.AddLookupPredictive(kHiraganaA, kHiraganaA, "aaa",
1341+
Token::USER_DICTIONARY);
1342+
1343+
config_->set_use_dictionary_suggest(true);
1344+
config_->set_use_realtime_conversion(false);
1345+
table_->LoadFromFile("system://12keys-hiragana.tsv");
1346+
composer_->SetTable(table_.get());
1347+
InsertInputSequence(kHiraganaA, composer_.get());
1348+
Segments segments;
1349+
segments.set_request_type(Segments::PREDICTION);
1350+
Segment *segment = segments.add_segment();
1351+
segment->set_key(kHiraganaA);
1352+
1353+
vector<DictionaryPredictor::Result> results;
1354+
DictionaryPredictor::AggregateUnigramCandidateForMixedConversion(
1355+
mock_dict, *convreq_, segments, &results);
1356+
1357+
// Check if "aaa" is not filtered.
1358+
auto iter = results.begin();
1359+
for (; iter != results.end(); ++iter) {
1360+
if (iter->key == kHiraganaA && iter->value == "aaa" &&
1361+
iter->IsUserDictionaryResult()) {
1362+
break;
1363+
}
1364+
}
1365+
EXPECT_NE(results.end(), iter);
1366+
}
1367+
13271368
TEST_F(DictionaryPredictorTest, AggregateBigramPrediction) {
13281369
unique_ptr<MockDataAndPredictor> data_and_predictor(
13291370
CreateDictionaryPredictorWithMockData());
@@ -2842,6 +2883,91 @@ TEST_F(DictionaryPredictorTest, SetLMCost) {
28422883
EXPECT_GT(results[2].cost, results[1].cost);
28432884
}
28442885

2886+
namespace {
2887+
2888+
void AddTestableDictionaryPredictorResult(
2889+
const char *key, const char *value, int wcost,
2890+
TestableDictionaryPredictor::PredictionTypes prediction_types,
2891+
Token::AttributesBitfield attributes,
2892+
vector<TestableDictionaryPredictor::Result> *results) {
2893+
results->push_back(TestableDictionaryPredictor::MakeEmptyResult());
2894+
TestableDictionaryPredictor::Result *result = &results->back();
2895+
result->key = key;
2896+
result->value = value;
2897+
result->wcost = wcost;
2898+
result->SetTypesAndTokenAttributes(prediction_types, attributes);
2899+
}
2900+
2901+
} // namespace
2902+
2903+
TEST_F(DictionaryPredictorTest, SetLMCostForUserDictionaryWord) {
2904+
unique_ptr<MockDataAndPredictor> data_and_predictor(
2905+
CreateDictionaryPredictorWithMockData());
2906+
const TestableDictionaryPredictor *predictor =
2907+
data_and_predictor->dictionary_predictor();
2908+
2909+
// "あいか"
2910+
const char *kAikaHiragana = "\xe3\x81\x82\xe3\x81\x84\xe3\x81\x8b";
2911+
// "愛佳"
2912+
const char *kAikaKanji = "\xe6\x84\x9b\xe4\xbd\xb3";
2913+
2914+
Segments segments;
2915+
segments.set_request_type(Segments::PREDICTION);
2916+
Segment *segment = segments.add_segment();
2917+
ASSERT_NE(nullptr, segment);
2918+
segment->set_key(kAikaHiragana);
2919+
2920+
{
2921+
// Cost of words in user dictionary should be decreased.
2922+
const int kOrigianlWordCost = 10000;
2923+
vector<TestableDictionaryPredictor::Result> results;
2924+
AddTestableDictionaryPredictorResult(
2925+
kAikaHiragana, kAikaKanji, kOrigianlWordCost,
2926+
TestableDictionaryPredictor::UNIGRAM, Token::USER_DICTIONARY,
2927+
&results);
2928+
2929+
predictor->SetLMCost(segments, &results);
2930+
2931+
EXPECT_EQ(1, results.size());
2932+
EXPECT_EQ(kAikaKanji, results[0].value);
2933+
EXPECT_GT(kOrigianlWordCost, results[0].cost);
2934+
EXPECT_LE(1, results[0].cost);
2935+
}
2936+
2937+
{
2938+
// Cost of words in user dictionary should not be decreased to below 1.
2939+
const int kOrigianlWordCost = 10;
2940+
vector<TestableDictionaryPredictor::Result> results;
2941+
AddTestableDictionaryPredictorResult(
2942+
kAikaHiragana, kAikaKanji, kOrigianlWordCost,
2943+
TestableDictionaryPredictor::UNIGRAM, Token::USER_DICTIONARY,
2944+
&results);
2945+
2946+
predictor->SetLMCost(segments, &results);
2947+
2948+
EXPECT_EQ(1, results.size());
2949+
EXPECT_EQ(kAikaKanji, results[0].value);
2950+
EXPECT_GT(kOrigianlWordCost, results[0].cost);
2951+
EXPECT_LE(1, results[0].cost);
2952+
}
2953+
2954+
{
2955+
// Cost of words not in user dictionary should not be decreased.
2956+
const int kOrigianlWordCost = 10000;
2957+
vector<TestableDictionaryPredictor::Result> results;
2958+
AddTestableDictionaryPredictorResult(
2959+
kAikaHiragana, kAikaKanji, kOrigianlWordCost,
2960+
TestableDictionaryPredictor::UNIGRAM, Token::NONE,
2961+
&results);
2962+
2963+
predictor->SetLMCost(segments, &results);
2964+
2965+
EXPECT_EQ(1, results.size());
2966+
EXPECT_EQ(kAikaKanji, results[0].value);
2967+
EXPECT_EQ(kOrigianlWordCost, results[0].cost);
2968+
}
2969+
}
2970+
28452971
TEST_F(DictionaryPredictorTest, SuggestSpellingCorrection) {
28462972
testing::MockDataManager data_manager;
28472973

0 commit comments

Comments
 (0)