Codebase list mozc / upstream/1.1.758.102 rewriter / number_rewriter.cc
upstream/1.1.758.102

Tree @upstream/1.1.758.102 (Download .tar.gz)

number_rewriter.cc @upstream/1.1.758.102raw · history · blame

// Copyright 2010-2011, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include "rewriter/number_rewriter.h"

#include <stdio.h>
#include <algorithm>
#include <string>
#include <utility>
#include <vector>

#include "base/util.h"
#include "dictionary/pos_matcher.h"
#include "converter/segments.h"
#include "session/commands.pb.h"
#include "session/config_handler.h"
#include "session/config.pb.h"

namespace mozc {
namespace {

void PushBackCandidate(const string &value, const string &desc, uint16 style,
                       vector<Segment::Candidate> *results) {
  bool found = false;
  for (vector<Segment::Candidate>::const_iterator it = results->begin();
       it != results->end(); ++it) {
    if (it->value == value) {
      found = true;
      break;
    }
  }
  if (!found) {
    Segment::Candidate cand;
    cand.value = value;
    cand.description = desc;
    cand.style = style;
    results->push_back(cand);
  }
}

// Returns true if |lid| is accepted by either POSMatcher::IsNumber or
// POSMatcher::IsKanjiNumber.
bool IsNumber(uint16 lid) {
  if (POSMatcher::IsNumber(lid)) {
    return true;
  }
  return POSMatcher::IsKanjiNumber(lid);
}

// Looks at the first candidate of |seg| whose lid passes IsNumber() and
// decides whether, and where, numerical variants should be inserted.
//
// Returns true if the rewriter should insert numerical variants.
// *base_candidate_pos: candidate index of the base candidate. POS information
// for numerical variants is copied from the base candidate.
// *insert_pos: the candidate index from which numerical variants
// should be inserted.
//
// Side effect: when that candidate's content_value does not have script type
// Util::NUMBER but can be normalized to an Arabic number, a new candidate
// holding the Arabic number is inserted right after it, and that new
// candidate is reported as the base candidate.
//
// Returns false without modifying |seg| when no candidate qualifies, when
// normalization fails, when the normalized Arabic number equals the
// half-width form of content_key, or when value/content_value are
// inconsistent.
bool GetNumericCandidatePositions(Segment *seg, int *base_candidate_pos,
                                  int *insert_pos) {
  CHECK(base_candidate_pos);
  CHECK(insert_pos);
  for (size_t i = 0; i < seg->candidates_size(); ++i) {
    const Segment::Candidate &c = seg->candidate(i);
    if (!IsNumber(c.lid)) {
      continue;
    }

    if (Util::GetScriptType(c.content_value) == Util::NUMBER) {
      *base_candidate_pos = static_cast<int>(i);
      // +2 as the fullwidth (or halfwidth) variant is on position i + 1.
      *insert_pos = static_cast<int>(i) + 2;
      return true;
    }

    string kanji_number, arabic_number, half_width_new_content_value;
    Util::FullWidthToHalfWidth(c.content_key, &half_width_new_content_value);
    // Try to get the normalized kanji_number and arabic_number.
    // If it fails, do nothing.
    if (!Util::NormalizeNumbers(c.content_value, true, &kanji_number,
                                &arabic_number) ||
        arabic_number == half_width_new_content_value) {
      return false;
    }

    // |value| is expected to be |content_value| followed by a suffix.
    // Bail out before touching the segment if that does not hold: substr()
    // with a start position past the end of the string throws
    // std::out_of_range.
    if (c.content_value.size() > c.value.size()) {
      LOG(ERROR) << "Invalid content_value/value: ";
      return false;
    }
    const string suffix = c.value.substr(c.content_value.size());

    // Insert the Arabic number right after the current candidate.
    Segment::Candidate *arabic_c = seg->insert_candidate(i + 1);
    DCHECK(arabic_c);
    arabic_c->Init();
    arabic_c->value = arabic_number + suffix;
    arabic_c->content_value = arabic_number;
    arabic_c->key = c.key;
    arabic_c->content_key = c.content_key;
    arabic_c->cost = c.cost;
    arabic_c->structure_cost = c.structure_cost;
    arabic_c->lid = c.lid;
    arabic_c->rid = c.rid;

    // If top candidate is Kanji numeric, we want to expand at least
    // 5 candidates here.
    // http://b/issue?id=2872048
    const int kArabicNumericOffset = 5;
    *base_candidate_pos = static_cast<int>(i) + 1;
    *insert_pos = static_cast<int>(i) + kArabicNumericOffset;
    return true;
  }

  return false;
}
}  // namespace

// Constructor; its body is empty.
NumberRewriter::NumberRewriter() {}
// Destructor; its body is empty.
NumberRewriter::~NumberRewriter() {}

// Reports the capability of this rewriter: RewriterInterface::CONVERSION.
int NumberRewriter::capability() const {
  const int kCapability = RewriterInterface::CONVERSION;
  return kCapability;
}

// Inserts alternative notations of a number (wide Arabic, separated Arabic,
// Kanji, other forms and, for single-segment conversions, other radixes)
// into every non-history segment that has a suitable numeric candidate.
//
// Does nothing and returns false when the use_number_conversion config is
// off. Otherwise returns true if at least one segment passed the validity
// checks and was expanded.
bool NumberRewriter::Rewrite(Segments *segments) const {
  if (!GET_CONFIG(use_number_conversion)) {
    VLOG(2) << "no use_number_conversion";
    return false;
  }

  bool modified = false;
  for (size_t seg_index = segments->history_segments_size();
       seg_index < segments->segments_size(); ++seg_index) {
    Segment *seg = segments->mutable_segment(seg_index);
    DCHECK(seg);

    int base_candidate_pos = 0;
    int insert_pos = 0;
    const bool has_numeric_base =
        GetNumericCandidatePositions(seg, &base_candidate_pos, &insert_pos);
    if (!has_numeric_base) {
      continue;
    }

    const Segment::Candidate &base_cand = seg->candidate(base_candidate_pos);

    // value must be content_value plus an optional suffix.
    if (base_cand.content_value.size() > base_cand.value.size()) {
      LOG(ERROR) << "Invalid content_value/value: ";
      continue;
    }

    string base_content_value;
    Util::FullWidthToHalfWidth(base_cand.content_value, &base_content_value);

    if (Util::GetScriptType(base_content_value) != Util::NUMBER) {
      LOG(ERROR) << "base_content_value is not number: " << base_content_value;
      continue;
    }

    // Never insert past the end of the current candidate list.
    insert_pos = min(insert_pos, static_cast<int>(seg->candidates_size()));

    modified = true;

    // Collect every notation; the call order determines candidate order.
    vector<Util::NumberString> output;
    Util::ArabicToWideArabic(base_content_value, &output);
    Util::ArabicToSeparatedArabic(base_content_value, &output);
    Util::ArabicToKanji(base_content_value, &output);
    Util::ArabicToOtherForms(base_content_value, &output);
    if (segments->conversion_segments_size() == 1) {
      Util::ArabicToOtherRadixes(base_content_value, &output);
    }

    // Drop notations whose value has already been collected.
    vector<Segment::Candidate> converted_numbers;
    for (size_t n = 0; n < output.size(); ++n) {
      const Util::NumberString &number = output[n];
      PushBackCandidate(number.value, number.description, number.style,
                        &converted_numbers);
    }

    // Whatever follows content_value in value is re-attached to each variant.
    const string suffix =
        base_cand.value.substr(base_cand.content_value.size());

    for (size_t n = 0; n < converted_numbers.size(); ++n) {
      const Segment::Candidate &variant = converted_numbers[n];
      Segment::Candidate *c = seg->insert_candidate(insert_pos++);
      DCHECK(c);
      c->key = base_cand.key;
      c->content_key = base_cand.content_key;
      c->value = variant.value + suffix;
      c->content_value = variant.value;
      c->lid = base_cand.lid;
      c->rid = base_cand.rid;
      c->cost = base_cand.cost;
      c->style = variant.style;
      c->description = variant.description;

      // Don't want to have FULL_WIDTH form for Hex/Oct/BIN..etc.
      const bool is_radix_form =
          c->style == Segment::Candidate::NUMBER_HEX ||
          c->style == Segment::Candidate::NUMBER_OCT ||
          c->style == Segment::Candidate::NUMBER_BIN;
      if (is_radix_form) {
        c->attributes |= Segment::Candidate::NO_VARIANTS_EXPANSION;
      }
    }
  }

  return modified;
}
}  // namespace mozc