// Codebase list: mozc / upstream/0.12.410.102 / rewriter/single_kanji_rewriter.cc
// Tree: upstream/0.12.410.102
//
// File: single_kanji_rewriter.cc @ upstream/0.12.410.102

// Copyright 2010, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include "rewriter/single_kanji_rewriter.h"

#include <string>
#include <vector>
#include <set>
#include "base/base.h"
#include "base/singleton.h"
#include "base/util.h"
#include "converter/segments.h"
#include "rewriter/rewriter_interface.h"
#include "rewriter/embedded_dictionary.h"
#include "session/config_handler.h"
#include "session/config.pb.h"

namespace mozc {

namespace {

#include "rewriter/single_kanji_rewriter_data.h"

class SingleKanjiDictionary {
 public:
  SingleKanjiDictionary()
      : dic_(new EmbeddedDictionary(kSingleKanjiData_token_data,
                                    kSingleKanjiData_token_size)) {}

  ~SingleKanjiDictionary() {}

  EmbeddedDictionary *GetDictionary() const {
    return dic_.get();
  }

 private:
  scoped_ptr<EmbeddedDictionary> dic_;
};

// Insert SingleKanji into segment.
void InsertCandidate(Segment *segment,
                     bool is_single_segment,
                     const EmbeddedDictionary::Value *dict_values,
                     size_t dict_values_size) {
  const int kOffsetSize = 50;    // Expand 50 candidates
  segment->GetCandidates(kOffsetSize);
  if (segment->candidates_size() == 0) {
    LOG(WARNING) << "candidates_size is 0";
    return;
  }

  // Adding 3000 to the single kanji cost
  const int kOffsetDiff = 3000;

  const Segment::Candidate &base_candidate = segment->candidate(0);
  size_t idx_j = 0;

  if (is_single_segment) {
    // Merge default candidate and SingleKanji candidate.
    // This procedure makes dup, but we just ignore it, as
    // session layer removes dups
    size_t idx_i = 0;

    // we don't touch the first candidate if it is already fixed
    if (segment->segment_type() == Segment::FIXED_VALUE) {
      idx_i = 1;
    }

    // Find insertion point
    for (size_t i = 0; i < segment->candidates_size(); ++i) {
      const string &value = segment->candidate(i).value;
      // We want to insert under hiragana, katakana,
      // and single kanjis in system dictionary.
      if (Util::IsScriptType(value, Util::HIRAGANA) ||
          Util::IsScriptType(value, Util::KATAKANA) ||
          (Util::IsScriptType(value, Util::KANJI) &&
           Util::CharsLen(value) == 1)) {
        ++idx_i;
        continue;
      }
      break;
    }

    while (idx_i < segment->candidates_size() && idx_j < dict_values_size) {
      const int cost = dict_values[idx_j].cost + kOffsetDiff;
      if (cost >= segment->candidate(idx_i).cost) {
        ++idx_i;
        continue;
      }
      Segment::Candidate *c = segment->insert_candidate(idx_i);
      c->lid = dict_values[idx_j].lid;
      c->rid = dict_values[idx_j].rid;
      c->cost = dict_values[idx_j].cost + kOffsetDiff;
      c->content_value = dict_values[idx_j].value;
      c->content_key = base_candidate.content_key;
      c->value = dict_values[idx_j].value;
      c->learning_type |= Segment::Candidate::CONTEXT_SENSITIVE;
      const string &desc = (dict_values[idx_j].description == NULL)?
          "" : dict_values[idx_j].description;
      c->SetDescription(Segment::Candidate::PLATFORM_DEPENDENT_CHARACTER,
                        desc);
      ++idx_i;
      ++idx_j;
    }
  }

  // append remaining single-kanji
  while (idx_j < dict_values_size) {
    Segment::Candidate *c = segment->push_back_candidate();
    c->lid = dict_values[idx_j].lid;
    c->rid = dict_values[idx_j].rid;
    c->cost = dict_values[idx_j].cost + kOffsetDiff;
    c->content_value = dict_values[idx_j].value;
    c->content_key = base_candidate.content_key;
    c->value = dict_values[idx_j].value;
    c->learning_type |= Segment::Candidate::CONTEXT_SENSITIVE;
    const string &desc = (dict_values[idx_j].description == NULL)?
        "" : dict_values[idx_j].description;
    c->SetDescription(Segment::Candidate::PLATFORM_DEPENDENT_CHARACTER,
                      desc);

    ++idx_j;
  }
}
}  // namespace

// Constructs the rewriter. The constructor body is empty.
SingleKanjiRewriter::SingleKanjiRewriter() {}

// Destroys the rewriter. The destructor body is empty.
SingleKanjiRewriter::~SingleKanjiRewriter() {}

// Looks up the key of every conversion segment in the single-kanji
// dictionary and, on a hit, hands the matching entries to InsertCandidate().
// Returns true when at least one segment had a dictionary hit; returns false
// otherwise, including when use_single_kanji_conversion is disabled in the
// config.
bool SingleKanjiRewriter::Rewrite(Segments *segments) const {
  if (!GET_CONFIG(use_single_kanji_conversion)) {
    VLOG(2) << "no use_single_kanji_conversion";
    return false;
  }

  const size_t num_segments = segments->conversion_segments_size();
  // Cost-based merging is only requested when there is exactly one segment.
  const bool only_one_segment = (num_segments == 1);

  bool rewritten = false;
  for (size_t index = 0; index < num_segments; ++index) {
    const string &reading = segments->conversion_segment(index).key();
    EmbeddedDictionary *dictionary =
        Singleton<SingleKanjiDictionary>::get()->GetDictionary();
    const EmbeddedDictionary::Token *entry = dictionary->Lookup(reading);
    if (entry != NULL) {
      InsertCandidate(segments->mutable_conversion_segment(index),
                      only_one_segment,
                      entry->value, entry->value_size);
      rewritten = true;
    }
  }

  return rewritten;
}
}  // namespace mozc