Fix #379: 1-char user dictionary item may not work

Noriyuki Takahashi · Noriyuki Takahashi · commit 73a9e378b937 · 2016-08-01T15:56:48.000+09:00
User dictionary entries were sometimes dropped from prediction because their costs are relatively high even after promotion. This CL puts an upper limit on the cost of user dictionary entries. Also, when the value of a user dictionary entry shares a common prefix with that of a system dictionary entry, it is detected as "maybe redundant" and discarded from prediction. This CL stops applying this mechanism to user dictionary entries. BUG=##379 TEST= REF_BUG=19940231,30374836 REF_CL=91386248,128960625,129404563 REF_TIME=2016-08-01T15:56:48+09:00 REF_TIME_RAW=1470034608 +0900
diff --git a/src/data/version/mozc_version_template.bzl b/src/data/version/mozc_version_template.bzl
@@ -1,6 +1,6 @@
 MAJOR=2
 MINOR=18
-BUILD=2582
+BUILD=2583
 REVISION=102
 # CAUTION: NACL_DICTIONARY_VERSION is going to be migrated to ENGINE_VERSION.
 # NACL_DICTIONARY_VERSION is the target version of the system dictionary to be
diff --git a/src/prediction/dictionary_predictor.cc b/src/prediction/dictionary_predictor.cc
@@ -506,7 +506,7 @@ bool DictionaryPredictor::AddPredictionToCandidates(
       continue;
     }
 
-    // don't suggest exactly the same candidate as key.
+    // Don't suggest exactly the same candidate as key.
     // if |mixed_conversion| is true, that's not the case.
     if (!mixed_conversion &&
         !(result.types & REALTIME) &&
@@ -765,6 +765,10 @@ void DictionaryPredictor::Result::SetSourceInfoForZeroQuery(
   }
 }
 
+bool DictionaryPredictor::Result::IsUserDictionaryResult() const {
+  return (candidate_attributes & Segment::Candidate::USER_DICTIONARY) != 0;
+}
+
 bool DictionaryPredictor::GetHistoryKeyAndValue(
     const Segments &segments, string *key, string *value) const {
   DCHECK(key);
@@ -916,9 +920,7 @@ void DictionaryPredictor::SetLMCost(const Segments &segments,
 
   const size_t input_key_len = Util::CharsLen(
       segments.conversion_segment(0).key());
-  for (size_t i = 0; i < results->size(); ++i) {
-    const Result &result = results->at(i);
-
+  for (Result &result : *results) {
     int cost = GetLMCost(result, rid);
     // Demote filtered word here, because they are not filtered for exact match.
     // Even for exact match, we don't want to show aggressive words
@@ -960,7 +962,19 @@ void DictionaryPredictor::SetLMCost(const Segments &segments,
       const int kBigramBonus = 800;  // ~= 500*ln(5)
       cost += (kDefaultTransitionCost - kBigramBonus - prev_cost);
     }
-    results->at(i).cost = cost;
+    if (result.candidate_attributes & Segment::Candidate::USER_DICTIONARY) {
+      // Decrease cost for words from user dictionary in order to promote them.
+      // Currently user dictionary words are evaluated 5 times bigger in
+      // frequency, being capped by 1000 (this number is adhoc, so feel free to
+      // adjust).
+      const int kUserDictionaryPromotionFactor = 804;  // 804 = 500 * log(5)
+      const int kUserDictionaryCostUpperLimit = 1000;
+      cost = min(cost - kUserDictionaryPromotionFactor,
+                 kUserDictionaryCostUpperLimit);
+    }
+    // Note that the cost is defined as -500 * log(prob).
+    // Even after the ad hoc manipulations, cost must remain larger than 0.
+    result.cost = max(1, cost);
   }
 }
 
@@ -1345,11 +1359,20 @@ void DictionaryPredictor::AggregateUnigramCandidateForMixedConversion(
     const ConversionRequest &request,
     const Segments &segments,
     vector<Result> *results) const {
+  AggregateUnigramCandidateForMixedConversion(*dictionary_, request,
+                                              segments, results);
+}
+
+void DictionaryPredictor::AggregateUnigramCandidateForMixedConversion(
+    const dictionary::DictionaryInterface &dictionary,
+    const ConversionRequest &request,
+    const Segments &segments,
+    vector<Result> *results) {
   const size_t cutoff_threshold = kPredictionMaxResultsSize;
 
   vector<Result> raw_result;
   // No history key
-  GetPredictiveResults(*dictionary_, "", request, segments, UNIGRAM,
+  GetPredictiveResults(dictionary, "", request, segments, UNIGRAM,
                        cutoff_threshold, &raw_result);
 
   // Hereafter, we split "Needed Results" and "(maybe) Unneeded Results."
@@ -1383,7 +1406,8 @@ void DictionaryPredictor::AggregateUnigramCandidateForMixedConversion(
 
     // Traverse all remaining elements and check if each result is redundant.
     for (Iter iter = min_iter; iter != max_iter; ) {
-      if (MaybeRedundant(reference_result.value, iter->value)) {
+      if (!iter->IsUserDictionaryResult() &&
+          MaybeRedundant(reference_result.value, iter->value)) {
         // Swap out the redundant result.
         --max_iter;
         std::iter_swap(iter, max_iter);
@@ -1564,7 +1588,7 @@ void DictionaryPredictor::GetPredictiveResults(
     const Segments &segments,
     PredictionTypes types,
     size_t lookup_limit,
-    vector<Result> *results) const {
+    vector<Result> *results) {
   if (!request.has_composer() ||
       !FLAGS_enable_expansion_for_dictionary_predictor) {
     const string &query_key = segments.conversion_segment(0).key();
diff --git a/src/prediction/dictionary_predictor.h b/src/prediction/dictionary_predictor.h
@@ -117,6 +117,7 @@ class DictionaryPredictor : public PredictorInterface {
         dictionary::Token::AttributesBitfield token_attr);
     void SetSourceInfoForZeroQuery(
         ZeroQueryType zero_query_type);
+    bool IsUserDictionaryResult() const;
 
     string key;
     string value;
@@ -212,6 +213,8 @@ class DictionaryPredictor : public PredictorInterface {
   FRIEND_TEST(DictionaryPredictorTest, AggregateZeroQueryBigramPrediction);
   FRIEND_TEST(DictionaryPredictorTest, AggregateSuffixPrediction);
   FRIEND_TEST(DictionaryPredictorTest, AggregateZeroQuerySuffixPrediction);
+  FRIEND_TEST(DictionaryPredictorTest,
+              AggregateUnigramCandidateForMixedConversion);
   FRIEND_TEST(DictionaryPredictorTest, ZeroQuerySuggestionAfterNumbers);
   FRIEND_TEST(DictionaryPredictorTest, TriggerNumberZeroQuerySuggestion);
   FRIEND_TEST(DictionaryPredictorTest, TriggerZeroQuerySuggestion);
@@ -224,6 +227,7 @@ class DictionaryPredictor : public PredictorInterface {
   FRIEND_TEST(DictionaryPredictorTest, RemoveMissSpelledCandidates);
   FRIEND_TEST(DictionaryPredictorTest, ConformCharacterWidthToPreference);
   FRIEND_TEST(DictionaryPredictorTest, SetLMCost);
+  FRIEND_TEST(DictionaryPredictorTest, SetLMCostForUserDictionaryWord);
   FRIEND_TEST(DictionaryPredictorTest, SetDescription);
   FRIEND_TEST(DictionaryPredictorTest, SetDebugDescription);
   FRIEND_TEST(DictionaryPredictorTest, GetZeroQueryCandidates);
@@ -278,13 +282,14 @@ class DictionaryPredictor : public PredictorInterface {
                          const ConversionRequest &request,
                          Result *result) const;
 
-  void GetPredictiveResults(const dictionary::DictionaryInterface &dictionary,
-                            const string &history_key,
-                            const ConversionRequest &request,
-                            const Segments &segments,
-                            PredictionTypes types,
-                            size_t lookup_limit,
-                            vector<Result> *results) const;
+  static void GetPredictiveResults(
+      const dictionary::DictionaryInterface &dictionary,
+      const string &history_key,
+      const ConversionRequest &request,
+      const Segments &segments,
+      PredictionTypes types,
+      size_t lookup_limit,
+      vector<Result> *results);
 
   void GetPredictiveResultsForBigram(
       const dictionary::DictionaryInterface &dictionary,
@@ -428,6 +433,13 @@ class DictionaryPredictor : public PredictorInterface {
 
   // Aggregates unigram candidate for mixed conversion.
   // This reduces redundant candidates.
+  static void AggregateUnigramCandidateForMixedConversion(
+      const dictionary::DictionaryInterface &dictionary,
+      const ConversionRequest &request,
+      const Segments &segments,
+      vector<Result> *results);
+
+  // The same as the static version of this method above but uses |dictionary_|.
   void AggregateUnigramCandidateForMixedConversion(
       const ConversionRequest &request,
       const Segments &segments,
diff --git a/src/prediction/dictionary_predictor_test.cc b/src/prediction/dictionary_predictor_test.cc
@@ -87,7 +87,6 @@ using mozc::dictionary::SuppressionDictionary;
 using mozc::dictionary::Token;
 using testing::_;
 
-DECLARE_string(test_tmpdir);
 DECLARE_bool(enable_expansion_for_dictionary_predictor);
 
 namespace mozc {
@@ -1324,6 +1323,48 @@ TEST_F(DictionaryPredictorTest, AggregateUnigramPrediction) {
   EXPECT_EQ(1, segments.conversion_segments_size());
 }
 
+TEST_F(DictionaryPredictorTest, AggregateUnigramCandidateForMixedConversion) {
+  const char kHiraganaA[] = "\xE3\x81\x82";
+
+  DictionaryMock mock_dict;
+  // A system dictionary entry "a".
+  mock_dict.AddLookupPredictive(kHiraganaA, kHiraganaA, "a", Token::NONE);
+  // System dictionary entries "a0", ..., "a9", which are detected as redundant
+  // by MaybeRedundant(); see dictionary_predictor.cc.
+  for (int i = 0; i < 10; ++i) {
+    mock_dict.AddLookupPredictive(kHiraganaA, kHiraganaA,
+                                  Util::StringPrintf("a%d", i), Token::NONE);
+  }
+  // A user dictionary entry "aaa".  MaybeRedundant() detects this entry as
+  // redundant but it should not be filtered in prediction.
+  mock_dict.AddLookupPredictive(kHiraganaA, kHiraganaA, "aaa",
+                                Token::USER_DICTIONARY);
+
+  config_->set_use_dictionary_suggest(true);
+  config_->set_use_realtime_conversion(false);
+  table_->LoadFromFile("system://12keys-hiragana.tsv");
+  composer_->SetTable(table_.get());
+  InsertInputSequence(kHiraganaA, composer_.get());
+  Segments segments;
+  segments.set_request_type(Segments::PREDICTION);
+  Segment *segment = segments.add_segment();
+  segment->set_key(kHiraganaA);
+
+  vector<DictionaryPredictor::Result> results;
+  DictionaryPredictor::AggregateUnigramCandidateForMixedConversion(
+      mock_dict, *convreq_, segments, &results);
+
+  // Check if "aaa" is not filtered.
+  auto iter = results.begin();
+  for (; iter != results.end(); ++iter) {
+    if (iter->key == kHiraganaA && iter->value == "aaa" &&
+        iter->IsUserDictionaryResult()) {
+      break;
+    }
+  }
+  EXPECT_NE(results.end(), iter);
+}
+
 TEST_F(DictionaryPredictorTest, AggregateBigramPrediction) {
   unique_ptr<MockDataAndPredictor> data_and_predictor(
       CreateDictionaryPredictorWithMockData());
@@ -2842,6 +2883,91 @@ TEST_F(DictionaryPredictorTest, SetLMCost) {
   EXPECT_GT(results[2].cost, results[1].cost);
 }
 
+namespace {
+
+void AddTestableDictionaryPredictorResult(
+    const char *key, const char *value, int wcost,
+    TestableDictionaryPredictor::PredictionTypes prediction_types,
+    Token::AttributesBitfield attributes,
+    vector<TestableDictionaryPredictor::Result> *results) {
+  results->push_back(TestableDictionaryPredictor::MakeEmptyResult());
+  TestableDictionaryPredictor::Result *result = &results->back();
+  result->key = key;
+  result->value = value;
+  result->wcost = wcost;
+  result->SetTypesAndTokenAttributes(prediction_types, attributes);
+}
+
+}  // namespace
+
+TEST_F(DictionaryPredictorTest, SetLMCostForUserDictionaryWord) {
+  unique_ptr<MockDataAndPredictor> data_and_predictor(
+      CreateDictionaryPredictorWithMockData());
+  const TestableDictionaryPredictor *predictor =
+      data_and_predictor->dictionary_predictor();
+
+  // "あいか"
+  const char *kAikaHiragana = "\xe3\x81\x82\xe3\x81\x84\xe3\x81\x8b";
+  // "愛佳"
+  const char *kAikaKanji = "\xe6\x84\x9b\xe4\xbd\xb3";
+
+  Segments segments;
+  segments.set_request_type(Segments::PREDICTION);
+  Segment *segment = segments.add_segment();
+  ASSERT_NE(nullptr, segment);
+  segment->set_key(kAikaHiragana);
+
+  {
+    // Cost of words in user dictionary should be decreased.
+    const int kOrigianlWordCost = 10000;
+    vector<TestableDictionaryPredictor::Result> results;
+    AddTestableDictionaryPredictorResult(
+        kAikaHiragana, kAikaKanji, kOrigianlWordCost,
+        TestableDictionaryPredictor::UNIGRAM, Token::USER_DICTIONARY,
+        &results);
+
+    predictor->SetLMCost(segments, &results);
+
+    EXPECT_EQ(1, results.size());
+    EXPECT_EQ(kAikaKanji, results[0].value);
+    EXPECT_GT(kOrigianlWordCost, results[0].cost);
+    EXPECT_LE(1, results[0].cost);
+  }
+
+  {
+    // Cost of words in user dictionary should not be decreased to below 1.
+    const int kOrigianlWordCost = 10;
+    vector<TestableDictionaryPredictor::Result> results;
+    AddTestableDictionaryPredictorResult(
+        kAikaHiragana, kAikaKanji, kOrigianlWordCost,
+        TestableDictionaryPredictor::UNIGRAM, Token::USER_DICTIONARY,
+        &results);
+
+    predictor->SetLMCost(segments, &results);
+
+    EXPECT_EQ(1, results.size());
+    EXPECT_EQ(kAikaKanji, results[0].value);
+    EXPECT_GT(kOrigianlWordCost, results[0].cost);
+    EXPECT_LE(1, results[0].cost);
+  }
+
+  {
+    // Cost of words not in user dictionary should not be decreased.
+    const int kOrigianlWordCost = 10000;
+    vector<TestableDictionaryPredictor::Result> results;
+    AddTestableDictionaryPredictorResult(
+        kAikaHiragana, kAikaKanji, kOrigianlWordCost,
+        TestableDictionaryPredictor::UNIGRAM, Token::NONE,
+        &results);
+
+    predictor->SetLMCost(segments, &results);
+
+    EXPECT_EQ(1, results.size());
+    EXPECT_EQ(kAikaKanji, results[0].value);
+    EXPECT_EQ(kOrigianlWordCost, results[0].cost);
+  }
+}
+
 TEST_F(DictionaryPredictorTest, SuggestSpellingCorrection) {
   testing::MockDataManager data_manager;