Codebase list mozc / debian/1.4.1033.102-2 prediction / dictionary_predictor.h
debian/1.4.1033.102-2

Tree @debian/1.4.1033.102-2 (Download .tar.gz)

dictionary_predictor.h @debian/1.4.1033.102-2 raw · history · blame

// Copyright 2010-2012, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#ifndef MOZC_PREDICTION_DICTIONARY_PREDICTOR_H_
#define MOZC_PREDICTION_DICTIONARY_PREDICTOR_H_

#include <string>
#include <vector>
#include "base/base.h"
#include "prediction/predictor_interface.h"
// for FRIEND_TEST()
#include "testing/base/public/gunit_prod.h"

namespace mozc {

// Forward declarations. This header refers to the types below only through
// pointers and references, so their full definitions are not needed here.
class ConnectorInterface;
class ConversionRequest;
class DictionaryInterface;
class ImmutableConverterInterface;
class NodeAllocatorInterface;
class Segments;
class SegmenterInterface;
struct Node;

// Dictionary-based predictor.
// Candidates come from several sources, each identified by a PredictionType
// flag (unigram, bigram, realtime conversion, suffix).
class DictionaryPredictor: public PredictorInterface {
 public:
  DictionaryPredictor();
  // Constructs a predictor with an explicitly supplied |segmenter|.
  // NOTE(review): presumably kept in segmenter_ -- confirm in the .cc file.
  explicit DictionaryPredictor(SegmenterInterface *segmenter);
  virtual ~DictionaryPredictor();

  // Prediction entry points.
  // NOTE(review): presumably these implement the PredictorInterface API and
  // fill |segments| with candidates -- confirm against predictor_interface.h.
  bool Predict(Segments *segments) const;
  bool PredictForRequest(const ConversionRequest &request,
                         Segments *segments) const;

 protected:
  // The members below are protected rather than private for unit testing:
  // utility methods of a test subclass (e.g. TestableDictionaryPredictor)
  // need to access them.

  // Source of a prediction result. The non-zero values are distinct bits, so
  // several types can be OR-ed together (see GetPredictionType()).
  enum PredictionType {
    // No suggestion needs to be shown.
    NO_PREDICTION = 0,
    // Suggests from the key the user is currently typing.
    UNIGRAM = 1,
    // Suggests from the previous history key the user typed before.
    BIGRAM = 2,
    // Suggests from immutable_converter.
    REALTIME = 4,
    // Adds suffixes like "さん", "が" which match the previous context.
    SUFFIX =  8,
  };

  // One aggregated prediction candidate: the dictionary node, the type of
  // prediction that produced it, and its cost.
  struct Result {
    // Empty result: no node, NO_PREDICTION, zero cost.
    Result() : node(NULL), type(NO_PREDICTION), cost(0) {}
    // Result for |node_| produced by |type_|; the cost starts at zero.
    Result(const Node *node_, PredictionType type_)
        : node(node_), type(type_), cost(0) {}
    const Node *node;
    PredictionType type;
    int cost;
  };

  // On MSVS2008/2010, TestableDictionaryPredictor::Result(node, type) causes
  // a compile error even if you change the access right of it to public.
  // You can use TestableDictionaryPredictor::MakeResult(node, type) instead.
  static Result MakeResult(const Node *node, PredictionType type) {
    return Result(node, type);
  }

  // Aggregators, one per prediction source.
  // NOTE(review): presumably each one adds the candidates of its source to
  // |results| when |type| enables that source, using |allocator| to create
  // nodes -- confirm in the .cc file.
  void AggregateRealtimeConversion(PredictionType type,
                                   Segments *segments,
                                   NodeAllocatorInterface *allocator,
                                   vector<Result> *results) const;

  void AggregateUnigramPrediction(PredictionType type,
                                  const ConversionRequest &request,
                                  Segments *segments,
                                  NodeAllocatorInterface *allocator,
                                  vector<Result> *results) const;

  void AggregateBigramPrediction(PredictionType type,
                                 const ConversionRequest &request,
                                 Segments *segments,
                                 NodeAllocatorInterface *allocator,
                                 vector<Result> *results) const;

  void AggregateSuffixPrediction(PredictionType type,
                                 const ConversionRequest &request,
                                 Segments *segments,
                                 NodeAllocatorInterface *allocator,
                                 vector<Result> *results) const;

  // NOTE(review): judging by the name, adjusts the cost of |results| that
  // were obtained through key expansion -- confirm in the .cc file.
  void ApplyPenaltyForKeyExpansion(const Segments &segments,
                                   vector<Result> *results) const;

 private:
  // Unit tests that are granted access to the private members below.
  // FRIEND_TEST() comes from gunit_prod.h.
  FRIEND_TEST(DictionaryPredictorTest, GetPredictionType);
  FRIEND_TEST(DictionaryPredictorTest,
              GetPredictionTypeTestWithZeroQuerySuggestion);
  FRIEND_TEST(DictionaryPredictorTest, IsZipCodeRequest);
  FRIEND_TEST(DictionaryPredictorTest, GetRealtimeCandidateMaxSize);
  FRIEND_TEST(DictionaryPredictorTest, GetRealtimeCandidateMaxSizeForMixed);
  FRIEND_TEST(DictionaryPredictorTest, AggregateRealtimeConversion);
  FRIEND_TEST(DictionaryPredictorTest, GetUnigramCandidateCutoffThreshold);
  FRIEND_TEST(DictionaryPredictorTest, AggregateUnigramPrediction);
  FRIEND_TEST(DictionaryPredictorTest, AggregateBigramPrediction);
  FRIEND_TEST(DictionaryPredictorTest, AggregateSuffixPrediction);
  FRIEND_TEST(DictionaryPredictorTest, ZeroQuerySuggestionAfterNumbers);
  FRIEND_TEST(DictionaryPredictorTest, GetHistoryKeyAndValue);
  FRIEND_TEST(DictionaryPredictorTest, RealtimeConversionStartingWithAlphabets);
  FRIEND_TEST(DictionaryPredictorTest, IsAggressiveSuggestion);
  FRIEND_TEST(DictionaryPredictorTest, LookupKeyValueFromDictionary);
  FRIEND_TEST(DictionaryPredictorTest,
              RealtimeConversionWithSpellingCorrection);
  FRIEND_TEST(DictionaryPredictorTest, GetMissSpelledPosition);
  FRIEND_TEST(DictionaryPredictorTest, RemoveMissSpelledCandidates);
  FRIEND_TEST(DictionaryPredictorTest, ConformCharacterWidthToPreference);
  FRIEND_TEST(DictionaryPredictorTest, SetLMCost);

  // Comparison functor over Result::cost: returns true when |a| has a
  // strictly larger cost than |b|.
  class ResultCompare {
   public:
    bool operator() (const Result &a, const Result &b) const {
      return a.cost > b.cost;
    }
  };

  // Returns false if no results were aggregated.
  bool AggregatePrediction(const ConversionRequest &request,
                           Segments *segments,
                           NodeAllocatorInterface *allocator,
                           vector<Result> *results) const;

  // Sets the cost of each entry in |results|.
  // NOTE(review): presumably dispatches to SetPredictionCost() or SetLMCost()
  // -- confirm in the .cc file.
  void SetCost(const Segments &segments, vector<Result> *results) const;

  // Removes predictions by setting NO_PREDICTION to the result type if
  // necessary. Entries are not erased from |results|.
  void RemovePrediction(const Segments &segments,
                        vector<Result> *results) const;

  // NOTE(review): presumably converts |results| into candidates of |segments|
  // and returns whether any candidate was added -- confirm in the .cc file.
  bool AddPredictionToCandidates(Segments *segments,
                                 vector<Result> *results) const;

  // NOTE(review): presumably performs a predictive lookup on |dictionary| and
  // returns the resulting nodes, allocated via |allocator| -- confirm in the
  // .cc file.
  const Node *GetPredictiveNodes(const DictionaryInterface *dictionary,
                                 const string &history_key,
                                 const ConversionRequest &request,
                                 const Segments &segments,
                                 NodeAllocatorInterface *allocator) const;

  // Returns the position of the misspelled character.
  //
  // Example1:
  // key: "れみおめろん"
  // value: "レミオロメン"
  // returns 3
  //
  // Example2:
  // key: "ろっぽんぎ"
  // value: "六本木"
  // returns 5 (charslen("六本木"))
  // NOTE(review): "六本木" is 3 characters long while the key is 5 characters
  // long, so the 5 looks like the character length of the key -- confirm.
  size_t GetMissSpelledPosition(const string &key,
                                const string &value) const;

  // Looks up the |key|/|value| pair in the dictionary.
  // Note that this returns a node pointer, not a bool.
  // NOTE(review): presumably non-NULL if the pair is in the dictionary and
  // NULL otherwise -- confirm in the .cc file.
  const Node *LookupKeyValueFromDictionary(
      const string &key,
      const string &value,
      NodeAllocatorInterface *allocator) const;

  // Returns the language model cost of |node|.
  // |rid| is the right id of the previous word (node).
  // If |rid| is unknown, pass 0 as a default value.
  int GetLMCost(PredictionType type, const Node &node, int rid) const;

  // Given the results collected by the aggregators, removes
  // misspelled results from |results|.
  // Misspelled results are not erased directly; instead
  // result[i].type is set to NO_PREDICTION.
  //
  // Here's the basic step of removal:
  // Case1:
  // result1: "あぼがど" => "アボガド"
  // result2: "あぼがど" => "アボカド" (spelling correction)
  // result3: "あぼかど" => "アボカド"
  // In this case, we can remove result 1 and 2.
  // If there exists the same result2.key in result1,3 and
  // the same result2.value in result1,3, we can remove the
  // 1) spelling correction candidate 2) candidate having
  // the same key as the spelling correction candidate.
  //
  // Case2:
  // result1: "あぼかど" => "アボカド"
  // result2: "あぼがど" => "アボカド" (spelling correction)
  // In this case, remove result2.
  //
  // Case3:
  // result1: "あぼがど" => "アボガド"
  // result2: "あぼがど" => "アボカド" (spelling correction)
  // In this case,
  //   a) user input: あ,あぼ,あぼ => remove result1, result2
  //   b) user input: あぼが,あぼがど  => remove result1
  //
  // Let |same_key_size| and |same_value_size| be the number of
  // non-spelling-correction candidates which have the same key/value as
  // the spelling-correction candidate, respectively.
  //
  // if (same_key_size > 0 && same_value_size > 0) {
  //   remove spelling correction and candidates having the
  //   same key as the spelling correction.
  // } else if (same_key_size == 0 && same_value_size > 0) {
  //   remove spelling correction
  // } else {
  //   do nothing.
  // }
  void RemoveMissSpelledCandidates(size_t request_key_len,
                                   vector<Result> *results) const;

  // Scoring function which takes the prediction bonus into account.
  // It basically reranks the candidates by lang_prob * (1 + remain_len).
  void SetPredictionCost(const Segments &segments,
                         vector<Result> *results) const;

  // Language model-based scoring function.
  void SetLMCost(const Segments &segments,
                 vector<Result> *results) const;

  // Returns true if |key| consists of '0'-'9' or '-'.
  bool IsZipCodeRequest(const string &key) const;

  // Returns true if the suggestion is classified
  // as "aggressive".
  bool IsAggressiveSuggestion(
      size_t query_len, size_t key_len, int32 cost,
      bool is_suggestion, size_t total_candidates_size) const;

  // Stores the history key/value in |key| and |value|.
  // NOTE(review): the bool presumably reports whether a history entry was
  // available -- confirm in the .cc file.
  bool GetHistoryKeyAndValue(const Segments &segments,
                             string *key, string *value) const;

  // Returns the PredictionType to use for |segments|.
  // The return value may be a bitwise OR of several flags, e.g.
  // UNIGRAM | BIGRAM | REALTIME | SUFFIX.
  PredictionType GetPredictionType(const Segments &segments) const;

  // Returns the max size of realtime candidates.
  size_t GetRealtimeCandidateMaxSize(const Segments &segments,
                                     bool mixed_conversion,
                                     size_t max_size) const;

  // Returns the cutoff threshold of unigram candidates.
  // The AggregateUnigramPrediction method does not return any candidates
  // if there are too many (>= cutoff threshold) eligible candidates.
  // This behavior prevents a user from seeing too many prefix-match
  // candidates.
  size_t GetUnigramCandidateCutoffThreshold(const Segments &segments,
                                            bool mixed_conversion) const;

  // Collaborators, held as raw pointers.
  // NOTE(review): ownership and lifetime are not visible from this header --
  // confirm in the .cc file before deleting or replacing any of them.
  DictionaryInterface *dictionary_;
  DictionaryInterface *suffix_dictionary_;
  ConnectorInterface *connector_;
  const SegmenterInterface *segmenter_;
  ImmutableConverterInterface *immutable_converter_;
};
}  // namespace mozc

#endif  // MOZC_PREDICTION_DICTIONARY_PREDICTOR_H_